diff --git a/src/IOBuf.h b/src/IOBuf.h index a405027b..0804e3d1 100644 --- a/src/IOBuf.h +++ b/src/IOBuf.h @@ -7,10 +7,10 @@ #include #include +#include #include #include #include -#include #include @@ -64,6 +64,7 @@ class IOBuf { } void TrimEnd(size_t amount) { length_ -= amount; } + void SetLength(size_t amount) { length_ = amount; } bool IsChained() const { return Next() != this; } @@ -319,11 +320,34 @@ class MutIOBuf : public IOBuf { template T& GetNoAdvance() { assert(p_->Length() > 0); + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (p_->Length() - offset_ < sizeof(T)) { + // request straddles buffers, allocate a new chunk of memory to copy it + // into (so it is contiguous) + chunk_list.emplace_front(); + auto& chunk = chunk_list.front(); + chunk.reserve(sizeof(T)); + auto p = p_; + auto len = sizeof(T); + auto offset = offset_; + while (len > 0) { + auto remainder = std::min(p->Length() - offset, len); + auto data = p->Data() + offset; + chunk.insert(chunk.end(), data, data + remainder); + p = p->Next(); + offset = 0; + len -= remainder; + } + return *reinterpret_cast<T*>(chunk.data()); // return the contiguous copy; Data() has fewer than sizeof(T) bytes left in this buffer (writes via the reference land in the copy) + } +#else + if (p_->Length() - offset_ < sizeof(T)) { throw std::runtime_error( "MutDataPointer::Get(): request straddles buffer"); } - +#endif return *reinterpret_cast(Data()); } @@ -354,6 +378,7 @@ class MutIOBuf : public IOBuf { private: MutIOBuf* p_{nullptr}; size_t offset_{0}; + std::forward_list> chunk_list; }; MutDataPointer GetMutDataPointer() { return MutDataPointer(this); } diff --git a/src/native/EventManager.cc b/src/native/EventManager.cc index dc1f7f2f..eb8eda93 100644 --- a/src/native/EventManager.cc +++ b/src/native/EventManager.cc @@ -16,6 +16,19 @@ #include "Trace.h" #include "VMem.h" +uint32_t nsleep_states[16]; +uint32_t sleep_state[16]; +/*uint32_t processCnt[16]; +uint32_t swEventCnt[16]; +uint32_t idleEventCnt[16]; +uint32_t processInterruptCntAll[16]; +uint32_t processInterruptCntA[16]; +uint32_t processInterruptCntB[16]; +uint32_t processInterruptCntC[16]; 
+uint32_t passTokenCnt[16]; +uint32_t receiveTokenCnt[16]; +uint32_t genFireCnt[16];*/ + namespace { struct InterruptHandler { ebbrt::RcuHListHook hook; @@ -105,11 +118,13 @@ SwitchStack(uintptr_t first_param, uintptr_t stack, void (*func)(uintptr_t)); void ebbrt::EventManager::StartProcessingEvents() { auto stack_top = (active_event_context_.stack + kStackPages).ToAddr(); + //ebbrt::kprintf_force("StartProcessingEvents()\n"); SwitchStack(reinterpret_cast(this), stack_top, CallProcess); } void ebbrt::EventManager::CallProcess(uintptr_t mgr) { auto pmgr = reinterpret_cast(mgr); + //ebbrt::kprintf_force("CallProcess()\n"); pmgr->Process(); } @@ -129,12 +144,17 @@ template void ebbrt::EventManager::InvokeFunction(F&& f) { void ebbrt::EventManager::Process() { auto stack_top = (active_event_context_.stack + kStackPages).ToAddr(); + uint32_t mycpu = static_cast(Cpu::GetMine()); Cpu::GetMine().SetEventStack(stack_top); + uint32_t ecx, edx, eax; + ecx = edx = eax = 0; + // process an interrupt without halting // the sti instruction starts processing interrupts *after* the next // instruction is executed (to allow for a halt for example). 
The nop gives us // a one instruction window to process an interrupt (before the cli) process: + //processCnt[mycpu]++; asm volatile("sti;" "nop;" "cli;"); @@ -145,17 +165,38 @@ void ebbrt::EventManager::Process() { auto f = std::move(tasks_.front()); tasks_.pop_front(); InvokeFunction(f); + //swEventCnt[mycpu]++; // if we had a task to execute, then we go to the top again goto process; } if (idle_callback_) { + //idleEventCnt[mycpu]++; InvokeFunction(*idle_callback_); goto process; } + nsleep_states[mycpu] ++; + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" ((void*)&flags), "c" (ecx), "d"(edx)); + + // https://elixir.bootlin.com/linux/v4.15.1/source/arch/x86/include/asm/mwait.h#L100 + // https://elixir.bootlin.com/linux/v5.5.1/source/drivers/idle/intel_idle.c + + // sandy bridge + // C1 0x00 + // C1E 0x01 + // C3 0x10 + // C6 0x20 + // C7 0x30 + ecx = 1; /* break on interrupt flag */ + eax = sleep_state[mycpu]; /* we always pick the deepest sleep state */ + + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); + asm volatile("sti;" - "hlt;"); + "hlt;"); kabort("Woke up from halt?!?!"); } @@ -175,7 +216,7 @@ void ebbrt::EventManager::FreeStack(Pfn stack) { free_stacks_.push(stack); } static_assert(ebbrt::Cpu::kMaxCpus <= 256, "adjust event id calculation"); ebbrt::EventManager::EventManager(const RepMap& rm) - : reps_(rm), next_event_id_(Cpu::GetMine() << 24), + : reps_(rm), next_event_id_(Cpu::GetMine() << 24), active_event_context_(next_event_id_++, AllocateStack()) {} void ebbrt::EventManager::Spawn(MovableFunction func, @@ -204,6 +245,7 @@ void ebbrt::EventManager::CallSync(uintptr_t mgr) { // "fresh" event. 
Therefore if the sync_contexts_ stack is empty, we just go // back to the event loop if (unlikely(pmgr->sync_contexts_.empty())) { + //ebbrt::kprintf_force("CallSync, Process()\n"); pmgr->Process(); } else { // save this stack @@ -327,19 +369,27 @@ uint8_t ebbrt::EventManager::AllocateVector(MovableFunction func) { } void ebbrt::EventManager::ProcessInterrupt(int num) { + //uint32_t mycpu = static_cast(Cpu::GetMine()); apic::Eoi(); + //processInterruptCntAll[mycpu]++; + if (num == 32) { // pull all remote tasks onto our queue std::lock_guard l(remote_.lock); tasks_.splice(tasks_.end(), std::move(remote_.tasks)); + //processInterruptCntA[mycpu]++; + } else if (num == 33) { + //processInterruptCntB[mycpu]++; ReceiveToken(); } else { + //processInterruptCntC[mycpu]++; auto ih = vec_data->map.find(num); kassert(ih != nullptr); auto& f = ih->func; InvokeFunction(f); } + //ebbrt::kprintf_force("ProcessInterrupt %d\n", num);OA Process(); } @@ -378,7 +428,9 @@ ebbrt::EventManager::EventContext::EventContext(uint32_t event_id, Pfn stack) void ebbrt::EventManager::PassToken() { size_t my_cpu_index = Cpu::GetMine(); + //uint32_t mycpu = static_cast(Cpu::GetMine()); if (Cpu::Count() > 1) { + //passTokenCnt[mycpu] ++; auto next_cpu_index = (my_cpu_index + 1) % Cpu::Count(); auto next_cpu = Cpu::GetByIndex(next_cpu_index); kassert(next_cpu != nullptr); @@ -389,14 +441,18 @@ void ebbrt::EventManager::PassToken() { } void ebbrt::EventManager::ReceiveToken() { + //uint32_t mycpu = static_cast(Cpu::GetMine()); pending_generation_ = generation_++; - + //receiveTokenCnt[mycpu] ++; StartTimer(); } // Check Generation void ebbrt::EventManager::Fire() { - if (generation_count_[pending_generation_ % 2] == 0) { + //uint32_t mycpu = static_cast(Cpu::GetMine()); + //genFireCnt[mycpu] ++; + + if (generation_count_[pending_generation_ % 2] == 0) { // generation complete PassToken(); // temporarily store tasks that have now lived at least one entire @@ -410,12 +466,15 @@ void 
ebbrt::EventManager::Fire() { tasks.pop(); } } else { + //ebbrt::kprintf("EventManager()::Fire()\n"); StartTimer(); } } void ebbrt::EventManager::StartTimer() { - timer->Start(*this, std::chrono::milliseconds(1), + //timer->Start(*this, std::chrono::milliseconds(2), + // /* repeat = */ false); + timer->Start(*this, std::chrono::microseconds(1500), /* repeat = */ false); } diff --git a/src/native/EventManager.h b/src/native/EventManager.h index 2f40e1fa..21a831f5 100644 --- a/src/native/EventManager.h +++ b/src/native/EventManager.h @@ -23,6 +23,23 @@ #include "Trans.h" #include "VMemAllocator.h" +extern uint32_t nsleep_states[16]; +extern uint32_t sleep_state[16]; +/*extern uint32_t processCnt[16]; +extern uint32_t swEventCnt[16]; +extern uint32_t idleEventCnt[16]; +extern uint32_t processInterruptCntAll[16]; +extern uint32_t processInterruptCntA[16]; +extern uint32_t processInterruptCntB[16]; +extern uint32_t processInterruptCntC[16]; +extern uint32_t passTokenCnt[16]; +extern uint32_t receiveTokenCnt[16]; +extern uint32_t genFireCnt[16]; +extern uint32_t timerCnt[16]; +extern uint32_t fireCntA[16]; +extern uint32_t fireCntB[16]; +*/ + namespace ebbrt { class EventManager : Timer::Hook { @@ -112,7 +129,8 @@ class EventManager : Timer::Hook { size_t pending_generation_ = 0; std::queue> prev_rcu_tasks_; std::queue> curr_rcu_tasks_; - + unsigned long flags; + struct RemoteData : CacheAligned { ebbrt::SpinLock lock; std::list> tasks; diff --git a/src/native/GeneralPurposeAllocator.h b/src/native/GeneralPurposeAllocator.h index 12c91d97..b51869c6 100644 --- a/src/native/GeneralPurposeAllocator.h +++ b/src/native/GeneralPurposeAllocator.h @@ -16,6 +16,26 @@ namespace ebbrt { +// handler used in Pci.cc code to handle faults on multicores when mapping +// device +class MulticorePciFaultHandler : public ebbrt::VMemAllocator::PageFaultHandler { + ebbrt::Pfn vpage_; + ebbrt::Pfn ppage_; + size_t size_; + + public: + void SetMap(ebbrt::Pfn va, ebbrt::Pfn pa, size_t s) { + 
vpage_ = va; + ppage_ = pa; + size_ = s; + } + + void HandleFault(ebbrt::idt::ExceptionFrame* ef, + uintptr_t faulted_address) override { + ebbrt::vmem::MapMemory(vpage_, ppage_, size_); + } +}; + // page fault handler for mapping in physical pages // to virtual pages on all cores class LargeRegionFaultHandler : public ebbrt::VMemAllocator::PageFaultHandler { diff --git a/src/native/Ixgbe.h b/src/native/Ixgbe.h new file mode 100644 index 00000000..8ef9c07a --- /dev/null +++ b/src/native/Ixgbe.h @@ -0,0 +1,400 @@ +#ifndef BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ +#define BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ + +// from https://github.com/cisco-open-source/ethtool/ixgbe.c + +/* Register Bit Masks */ +#define IXGBE_FCTRL_SBP 0x00000002 +#define IXGBE_FCTRL_MPE 0x00000100 +#define IXGBE_FCTRL_UPE 0x00000200 +#define IXGBE_FCTRL_BAM 0x00000400 +#define IXGBE_FCTRL_PMCF 0x00001000 +#define IXGBE_FCTRL_DPF 0x00002000 +#define IXGBE_FCTRL_RPFCE 0x00004000 +#define IXGBE_FCTRL_RFCE 0x00008000 +#define IXGBE_VLNCTRL_VET 0x0000FFFF +#define IXGBE_VLNCTRL_CFI 0x10000000 +#define IXGBE_VLNCTRL_CFIEN 0x20000000 +#define IXGBE_VLNCTRL_VFE 0x40000000 +#define IXGBE_VLNCTRL_VME 0x80000000 +#define IXGBE_LINKS_UP 0x40000000 +#define IXGBE_LINKS_SPEED 0x20000000 +#define IXGBE_SRRCTL_BSIZEPKT_MASK 0x0000007F +#define IXGBE_HLREG0_TXCRCEN 0x00000001 +#define IXGBE_HLREG0_RXCRCSTRP 0x00000002 +#define IXGBE_HLREG0_JUMBOEN 0x00000004 +#define IXGBE_HLREG0_TXPADEN 0x00000400 +#define IXGBE_HLREG0_LPBK 0x00008000 +#define IXGBE_RMCS_TFCE_802_3X 0x00000008 +#define IXGBE_RMCS_TFCE_PRIORITY 0x00000010 +#define IXGBE_FCCFG_TFCE_802_3X 0x00000008 +#define IXGBE_FCCFG_TFCE_PRIORITY 0x00000010 +#define IXGBE_MFLCN_PMCF 0x00000001 /* Pass MAC Control Frames */ +#define IXGBE_MFLCN_DPF 0x00000002 /* Discard Pause Frame */ +#define IXGBE_MFLCN_RPFCE 0x00000004 /* Receive Priority FC Enable */ +#define IXGBE_MFLCN_RFCE 0x00000008 /* Receive FC Enable */ + +#define IXGBE_EITR_CNT_WDIS 0x80000000 + +// 
max transmit sizes +#define IXGBE_MAX_TXD_PWR 14 +#define IXGBE_MAX_DATA_PER_TXD (1u << IXGBE_MAX_TXD_PWR) +#define IXGBE_TSO_LIMIT 262144 + +enum l4_type { l4_type_udp = 0, l4_type_tcp, l4_type_sctp, l4_type_rsv }; + +#define ETHHDR_LEN 14 +#define IPHDR_LEN 20 +#define UDPHDR_LEN 8 + +#define RXFLAG_IPCS (1 << 0) +#define RXFLAG_IPCS_VALID (1 << 1) +#define RXFLAG_L4CS (1 << 2) +#define RXFLAG_L4CS_VALID (1 << 3) + +#define IXGBE_RX_BUFFER_WRITE 15 + +#define mb() asm volatile("mfence" ::: "memory") +#define rmb() asm volatile("lfence" ::: "memory") +#define wmb() asm volatile("sfence" ::: "memory") + +/*********************** + * RX + * Descriptors + **********************/ +// 7.1.5 Legacy Receive Descriptor, Table 7 - 11 +typedef union { + + uint64_t raw[2]; + + struct { + uint64_t buffer_address; + + union { + uint64_t word2_raw; + + struct { + uint64_t length : 16; + uint64_t fragment_checksum : 16; + + // uint64_t status : 8; + uint64_t dd : 1; + uint64_t eop : 1; + uint64_t rsvd1 : 1; + uint64_t vp : 1; + uint64_t udpcs : 1; + uint64_t l4cs : 1; + uint64_t ipcs : 1; + uint64_t pif : 1; + + // uint64_t errors : 8; + uint64_t rxe : 1; + uint64_t rsvd2 : 1; + uint64_t rsvd3 : 1; + uint64_t rsvd4 : 1; + uint64_t rsvd5 : 1; + uint64_t rsvd6 : 1; + uint64_t tcpe : 1; + uint64_t ipe : 1; + + uint64_t vlan_tag : 16; + }; // struct + + }; // union + + } __attribute__((packed)); // struct + +} rdesc_legacy_t; // typedef union + +// 7.1.6.1 Advanced Receive Descriptors Read Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t packet_buffer; + uint64_t header_buffer; + } __attribute__((packed)); // struct +} rdesc_adv_rf_t; + +// 7.1.6.2 Advanced Receive Descriptors — Write-Back Format +typedef union { + uint64_t raw[2]; + struct { + union { + uint32_t raw32_1; + struct { + uint32_t rss_type : 4; + + // packet type + uint32_t pt_ipv4 : 1; + uint32_t pt_ipv4e : 1; + uint32_t pt_ipv6 : 1; + uint32_t pt_ipv6e : 1; + uint32_t pt_tcp : 1; + uint32_t pt_udp : 
1; + uint32_t pt_sctp : 1; + uint32_t pt_nfs : 1; + uint32_t pt_isesp : 1; + uint32_t pt_isah : 1; + uint32_t pt_linksec : 1; + uint32_t pt_l2packet : 1; + uint32_t pt_rsvd : 1; + + uint32_t rsccnt : 4; + uint32_t hdr_len : 10; + uint32_t sph : 1; + }; + }; // union raw32_1 + + union { + uint32_t raw32_2; + uint32_t rss_hash; + uint32_t fragment_checksum; + uint32_t rtt; + uint32_t fcoe_param; + uint32_t flow_directors_filters_id; // may need more, page 317 + }; // union raw32_2 + + union { + uint32_t raw32_3; + + struct { + // extended status + uint32_t dd : 1; + uint32_t eop : 1; + uint32_t flm : 1; + uint32_t vp : 1; + + // fcstat - 2 bits + uint32_t udpcs : 1; + uint32_t l4i : 1; + + uint32_t ipcs : 1; + uint32_t pif : 1; + uint32_t rsvd_1 : 1; + uint32_t vext : 1; + uint32_t udpv : 1; + uint32_t llint : 1; + uint32_t rsvd_2 : 4; + uint32_t ts : 1; + uint32_t secp : 1; + uint32_t lb : 1; + uint32_t rsvd_3 : 1; + + // extended error + uint32_t fdierr : 3; + uint32_t hbo : 1; + uint32_t rsvd : 3; + uint32_t secerr : 2; + uint32_t rxe : 1; + uint32_t l4e : 1; + uint32_t ipe : 1; + }; // status_last_descriptor; + + struct { + // extended status + uint32_t dd2 : 1; + uint32_t eop2 : 1; + uint32_t rsvd_4 : 2; + uint32_t next_descriptor_ptr : 16; + + // extended error + uint32_t error : 12; + }; // status_non_last_descriptor; + }; // union raw32_3 + + union { + uint32_t raw32_4; + struct { + uint32_t pkt_len : 16; + uint32_t vlan_tag : 16; + }; + }; // union raw32_4 + + } __attribute__((packed)); // struct +} rdesc_adv_wb_t; + +/*********************** + * TX + * Descriptors + **********************/ +// 7.2.3.2.2 Legacy Transmit Descriptor Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t buffer_address; + + union { + uint64_t word2_raw; + + struct { + uint64_t length : 16; + uint64_t cso : 8; + + // cmd + uint64_t eop : 1; + uint64_t ifcs : 1; + uint64_t ic : 1; + uint64_t rs : 1; + uint64_t rsvd_1 : 1; + uint64_t dext : 1; + uint64_t vle : 1; + 
uint64_t rsvd_2 : 1; + + // sta + uint64_t dd : 1; + uint64_t rsvd_3 : 3; + + uint64_t rsvd_4 : 4; + uint64_t css : 8; + uint64_t vlan : 16; + }; + }; + + } __attribute__((packed)); +} tdesc_legacy_t; + +// 7.2.3.2.3 Advanced Transmit Context Descriptor +typedef union { + uint64_t raw[2]; + + struct { + union { + uint64_t raw_1; + + struct { + uint64_t iplen : 9; + uint64_t maclen : 7; + uint64_t vlan : 16; + uint64_t ipsec_sa_index : 10; + uint64_t fcoef : 6; + uint64_t rsvd_1 : 16; + }; + }; + + union { + uint64_t raw_2; + + struct { + uint64_t ipsec_esp_len : 9; + + // tucmd + uint64_t snap : 1; + uint64_t ipv4 : 1; + uint64_t l4t : 2; // l4 packet type + uint64_t ipsec_type : 1; + uint64_t encyption : 1; + uint64_t fcoe : 1; + uint64_t rsvd_2 : 4; + + uint64_t dtyp : 4; + uint64_t rsvd_3 : 5; + uint64_t dext : 1; + + uint64_t bcntlen : 6; + uint64_t idx : 1; + uint64_t rsvd_4 : 3; + uint64_t l4len : 8; + uint64_t mss : 16; + }; + }; + + } __attribute__((packed)); + +} tdesc_advance_ctxt_wb_t; + +// 7.2.3.2.4 Advanced Transmit Data Descriptor - Read Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t address; + + union { + uint64_t raw2; + struct { + uint64_t dtalen : 16; + uint64_t rsvd_1 : 2; + + // mac + uint64_t mac_ilsec : 1; + uint64_t mac_1588 : 1; + + uint64_t dtyp : 4; + + // dcmd + uint64_t eop : 1; + uint64_t ifcs : 1; + uint64_t rsvd_2 : 1; + uint64_t rs : 1; + uint64_t rsvd_3 : 1; + uint64_t dext : 1; + uint64_t vle : 1; + uint64_t tse : 1; + + // status + uint64_t dd : 1; + uint64_t rsvd_4 : 3; + + // idx + uint64_t idx : 3; + // uint64_t rsvd_5 : 2; + + uint64_t cc : 1; + + // popts + uint64_t ixsm : 1; + uint64_t txsm : 1; + uint64_t ipsec : 1; + uint64_t rsvd_6 : 3; + + uint64_t paylen : 18; + }; + }; + }; + +} tdesc_advance_tx_rf_t; + +// Advanced Transmit Data Descriptor - Write-back Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t rsvd_1; + + union { + uint64_t raw2; + + struct { + uint64_t rsvd_2 : 32; + + // 
status + uint64_t dd : 1; + uint64_t rsvd_3 : 3; + + uint64_t rsvd_4 : 28; + }; + }; + }; + +} tdesc_advance_tx_wbf_t; + +struct VirtioNetHeader { + static const constexpr uint8_t kNeedsCsum = 1; + static const constexpr uint8_t kGsoNone = 0; + static const constexpr uint8_t kGsoTcpv4 = 1; + static const constexpr uint8_t kGsoUdp = 3; + static const constexpr uint8_t kGsoTcpv6 = 4; + static const constexpr uint8_t kGsoEvn = 0x80; + + uint8_t flags; + uint8_t gso_type; + uint16_t hdr_len; + uint16_t gso_size; + uint16_t csum_start; + uint16_t csum_offset; + uint16_t num_buffers; +}; + +#endif // BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc new file mode 100644 index 00000000..d15938ac --- /dev/null +++ b/src/native/IxgbeDriver.cc @@ -0,0 +1,2708 @@ +// Copyright Boston University SESA Group 2013 - 2018. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +#include "IxgbeDriver.h" + +#include "../Align.h" +#include "../StaticIOBuf.h" +#include "../UniqueIOBuf.h" +#include "Clock.h" +#include "Debug.h" +#include "EventManager.h" +#include "Fls.h" +#include "Ixgbe.h" +#include "Net.h" +#include "Pfn.h" + +#include +#include +#include + +struct IxgbeLog ixgbe_stats[16]; +union IxgbeLogEntry *ixgbe_logs[16]; +std::unique_ptr bsendbufs[16]; +//uint64_t rxPollCnt[16]; + +void ebbrt::IxgbeDriver::Create(pci::Device& dev) { + auto ixgbe_dev = new IxgbeDriver(dev); + + // physical device bringup + ixgbe_dev->Init(); + + ixgbe_dev->ebb_ = + IxgbeDriverRep::Create(ixgbe_dev, ebb_allocator->AllocateLocal()); + + // only even core numbers + if(static_cast(Cpu::Count()) > 1) { + kassert(static_cast(Cpu::Count()) % 2 == 0); + } + + // initialize per core rx and tx queues + for (size_t i = 0; i < Cpu::Count(); i++) { + ixgbe_dev->SetupMultiQueue(i); + } + + ixgbe_dev->FinishSetup(); + + // TODO remove? 
+ ebbrt::clock::SleepMilli(200); + ebbrt::kprintf("82599 initialze complete\n"); + + memset(ixgbe_stats, 0, sizeof(ixgbe_stats)); + memset(ixgbe_logs, 0, sizeof(ixgbe_logs)); + + uint32_t ncores = static_cast(ebbrt::Cpu::Count()); + for (uint32_t i = 0; i < ncores; i++) { + ebbrt::Promise p; + auto f = p.GetFuture(); + ebbrt::event_manager->SpawnRemote( + [i, &p] () mutable { + ixgbe_logs[i] = (union IxgbeLogEntry *)malloc(IXGBE_LOG_SIZE * sizeof(union IxgbeLogEntry)); + memset(ixgbe_logs[i], 0, IXGBE_LOG_SIZE * sizeof(union IxgbeLogEntry)); // zero the whole log buffer; sizeof(<size expression>) would only clear sizeof(size_t) bytes + + bsendbufs[i] = MakeUniqueIOBuf(IXGBE_TSO_LIMIT); + memset(bsendbufs[i]->MutData(), 0, IXGBE_TSO_LIMIT); + ebbrt::kprintf_force("i=%d sizeof=%u ixgbe_addr=%p bsendbufs_addr=%p\n", i, + sizeof(ixgbe_logs[i]), (void*)ixgbe_logs[i], (void*)(bsendbufs[i]->Data())); + p.SetValue(); + }, i); + f.Block(); + } +} + +const ebbrt::EthernetAddress& ebbrt::IxgbeDriver::GetMacAddress() { + return mac_addr_; +} + +std::string ebbrt::IxgbeDriver::ReadNic() { + uint32_t i = static_cast(Cpu::GetMine()); + return ixgmq[i]->str_stats.str(); +} + +void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { + ebb_->Send(std::move(buf), std::move(pinfo)); +} + +// After packet transmission, need to mark bit in +// tx queue so that it can be used again +// TX_HEAD_WB does it automatically +void ebbrt::IxgbeDriverRep::ReclaimTx() { +#ifndef TX_HEAD_WB + // with TX head writeback, shouldn't need to poll anymore (right?) 
+ tdesc_advance_tx_wbf_t* awbfx; + uint32_t free_desc; + + if(ixgmq_.tx_tail_ > ixgmq_.tx_head_) { + free_desc = IxgbeDriver::NTXDESCS - (ixgmq_.tx_tail_ - ixgmq_.tx_head_); + } else if(ixgmq_.tx_tail_ < ixgmq_.tx_head_) { + free_desc = IxgbeDriver::NTXDESCS - ((ixgmq_.tx_tail_+IxgbeDriver::NTXDESCS) - ixgmq_.tx_head_); + } else { + free_desc = IxgbeDriver::NTXDESCS; + } + + // (IxgbeDriver::NTXDESCS - 1): 340 W, 1599820.2, eax=0x60 + if(free_desc < (IxgbeDriver::NTXDESCS - 1)) { + auto head = ixgmq_.tx_head_; + auto tail = ixgmq_.tx_tail_; + + while(head != tail) { + if(ixgmq_.tx_iseop[head] == true) { + awbfx = reinterpret_cast(&(ixgmq_.tx_ring_[head])); + while(awbfx->dd == 0) { + asm volatile("lfence":::"memory"); + } + ixgmq_.tx_iseop[head] = false; + } + head = (head + 1) % ixgmq_.tx_size_; + } + ixgmq_.tx_head_ = head; + } +#endif +} + +void ebbrt::IxgbeDriverRep::SendUdp(std::unique_ptr buf, uint64_t len, PacketInfo pinfo) { + uint64_t data; + std::unique_ptr b; + tdesc_advance_tx_rf_t* arfx; + uint32_t end; + + // coalesce into single packet if no checksum + //ebbrt::kprintf_force("SendUdp len=%d\n", len); + if(buf->IsChained()) { + b = MakeUniqueIOBuf(len); + auto mdata = b->MutData(); + for (auto& buf_it : *buf) { + memcpy(mdata, buf_it.Data(), buf_it.Length()); + mdata += buf_it.Length(); + } + data = reinterpret_cast(b->MutData()); + } else { + data = reinterpret_cast(buf->Data()); + } + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. + // Max length is 15.5 KB + arfx->dtalen = len; + + // In a single-send packet, PAYLEN defines the entire packet size fetched from host memory. 
+ arfx->paylen = len; + + // crc checksum + arfx->ifcs = 1; + + // rs bit should only be set when eop is set + arfx->eop = 1; + arfx->rs = 1; + + // type is advanced + arfx->dtyp = 0x3; + arfx->dext = 1; + + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + end = static_cast(ixgmq_.tx_tail_); + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + +} + +//void ebbrt::IxgbeDriverRep::SendTCPUnchained(std::unique_ptr buf, uint64_t len, PacketInfo pinfo) { +void ebbrt::IxgbeDriverRep::SendTCPUnchained(uint64_t bdata, uint64_t len, PacketInfo pinfo) { + uint64_t data, tsodata, tsolen; + std::unique_ptr b; + tdesc_advance_tx_rf_t* arfx; + tdesc_advance_ctxt_wb_t* actx; + uint32_t end; + + //data = reinterpret_cast(buf->Data()); + data = bdata; + //ebbrt::kprintf_force("SendTCPUnchained len=%llu\n", len); + + if(len > IXGBE_MAX_DATA_PER_TXD) { + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0x2 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux??, ignored when no TSE + actx->mss = 1448; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = pinfo.tcp_hdr_len; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + //first descriptor + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + // Holds length in bytes of data 
buffer at the address pointed to by this specific descriptor. + // Max length is 15.5 KB + arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; + arfx->dtyp = 0x3; + arfx->ifcs = 1; + arfx->dext = 1; + arfx->tse = 1; + arfx->ixsm = 1; + arfx->txsm = 1; + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload length, so no header length + arfx->paylen = pinfo.tcp_len; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + + tsodata = data; + tsolen = len; + + while(tsolen > IXGBE_MAX_DATA_PER_TXD) { + tsodata += IXGBE_MAX_DATA_PER_TXD; + tsolen -= IXGBE_MAX_DATA_PER_TXD; + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->dtyp = 0x3; + arfx->dext = 1; + arfx->tse = 1; + arfx->ifcs = 1; + arfx->address = tsodata; + + if(tsolen > IXGBE_MAX_DATA_PER_TXD) { + arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } else { + // last descriptor + arfx->dtalen = tsolen; + arfx->eop = 1; + arfx->rs = 1; + + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } + } + } + else if(len > 1490 && len <= IXGBE_MAX_DATA_PER_TXD) { // <= so a buffer of exactly IXGBE_MAX_DATA_PER_TXD bytes is still segmented (single-descriptor TSE) instead of falling through to the non-TSE path + + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux??, ignored when no TSE + actx->mss = 1448; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = pinfo.tcp_hdr_len; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + arfx = 
reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. + // Max length is 15.5 KB + arfx->dtalen = len; + arfx->dtyp = 0x3; + arfx->eop = 1; + arfx->rs = 1; + arfx->ifcs = 1; + arfx->dext = 1; + arfx->tse = 1; + + arfx->ixsm = 1; + arfx->txsm = 1; + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload size + arfx->paylen = pinfo.tcp_len; + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } else { /**** NOT IN TSE mode****/ + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux, ignored when no TSE + actx->mss = 0; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = 0; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
+ // Max length is 15.5 KB + arfx->dtalen = len; + arfx->paylen = len; + + arfx->dtyp = 0x3; + arfx->eop = 1; + arfx->rs = 1; + arfx->ifcs = 1; + + arfx->dext = 1; + arfx->tse = 0; + + arfx->ixsm = 1; + arfx->txsm = 1; + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } +} + +void ebbrt::IxgbeDriverRep::SendTCPChained(std::unique_ptr buf, uint64_t len, uint64_t num_chains, PacketInfo pinfo) { + uint64_t data, i; + std::unique_ptr b; + tdesc_advance_tx_rf_t* arfx; + tdesc_advance_ctxt_wb_t* actx; + uint32_t end; + //uint32_t mcore = static_cast(Cpu::GetMine()); + + //ebbrt::kprintf_force("** SendTCPChained num_chains=%llu len=%llu START ** \n", num_chains, len); + if(len <= 1490) { + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux, ignored when no TSE + actx->mss = 0; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = 0; + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + i = 0; + for (auto& buf_it : *buf) { + data = reinterpret_cast(buf_it.Data()); + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by 
this specific descriptor. + // Max length is 15.5 KB + arfx->dtalen = buf_it.Length(); + // only valid on first descriptor of packet + if(i == 0) { + arfx->paylen = len; + } + + arfx->rs = 1; + arfx->dtyp = 0x3; + //only valid on last descriptor making up packet + if(i == (num_chains-1)) { + arfx->eop = 1; + } + + arfx->ifcs = 1; + + arfx->dext = 1; + arfx->tse = 0; + + arfx->ixsm = 1; + + // if need TCP checksum offload + arfx->txsm = 1; + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + i++; + } + } else { + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0x2 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux??, ignored when no TSE + actx->mss = 1448; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = pinfo.tcp_hdr_len; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + i = 0; + for (auto& buf_it : *buf) { + if (buf_it.Length() > IXGBE_MAX_DATA_PER_TXD) { + ebbrt::kprintf("TSE buf_it.Length() = %u > IXGBE_MAX_DATA_PER_TXD num_chains=%llu i=%llu\n", buf_it.Length(), num_chains, i); + return; + } + + data = reinterpret_cast(buf_it.Data()); + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
+ // Max length is 15.5 KB + arfx->dtalen = buf_it.Length(); //GBE_MAX_DATA_PER_TXD; + arfx->dtyp = 0x3; + arfx->ifcs = 1; + arfx->dext = 1; + arfx->tse = 1; + + //ebbrt::kprintf_force("\t SendTCPChained i=%u len=%llu\n", i, buf_it.Length()); + + // first descriptor + if (i == 0) { + arfx->ixsm = 1; + arfx->txsm = 1; + + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload length, so no header length + // only valid on first descriptor + arfx->paylen = pinfo.tcp_len; + } else if(i == (num_chains-1)) { // last descriptor + arfx->eop = 1; + arfx->rs = 1; + arfx->dtalen = buf_it.Length(); + } else { + arfx->dtalen = buf_it.Length(); + } + + i ++; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + //asm volatile("sfence" ::: "memory"); + // WriteTdt_1(mcore, ixgmq_.tx_tail_); + } + } + + //ebbrt::kprintf_force("** SendTcpChained END **\n"); +} + +void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { + uint64_t len, num_chains; + uint32_t mcore = static_cast(Cpu::GetMine()); + //std::unique_ptr b; + + // On TSO, the maximum PAYLEN can be up to 2^18 - 1 + len = buf->ComputeChainDataLength(); + ixgmq_.stat_num_tx_bytes += len; + if (len > IXGBE_TSO_LIMIT) { + ebbrt::kprintf_force("\t kabort Send() len=%lld greater than TSO limit of 262144 bytes\n", len); + return; + } + num_chains = buf->CountChainElements(); + + /*if(mcore == 15) { + ebbrt::kprintf_force("\t Sending on core 15??"); + return; + }*/ + +#ifndef TX_HEAD_WB + ReclaimTx(); +#endif + +/*#ifdef RSC_EN + // if no IP/TCP checksum - likely UDP packet + if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { + SendUdp(std::move(buf), len, pinfo); + } else { // TCP Packet + // hardware limits sending over 40 descriptors per packet, have to manually coalesce here + // hopefully not too often + //ebbrt::kprintf_force("Send() len=%llu num_chains=%llu\n", len, num_chains); + if(num_chains > 38) { + ixgmq_.tx_desc_counts[39] ++; + b = 
MakeUniqueIOBuf(len); + auto mdata = b->MutData(); + for (auto& buf_it : *buf) { + memcpy(mdata, buf_it.Data(), buf_it.Length()); + mdata += buf_it.Length(); + } + SendTCPUnchained(std::move(b), len, pinfo); + + } else if(num_chains > 1 && num_chains <= 38) { + ixgmq_.tx_desc_counts[num_chains] ++; + SendTCPChained(std::move(buf), len, num_chains, pinfo); + } else { //Not Chained + ixgmq_.tx_desc_counts[1] ++; + SendTCPUnchained(std::move(buf), len, pinfo); + } + } +#else*/ + // if no IP/TCP checksum - likely UDP packet + if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { + SendUdp(std::move(buf), len, pinfo); + } else { // TCP Packet + ixgmq_.tx_desc_counts[1] ++; + + if(num_chains > 1) { + //b = MakeUniqueIOBuf(len); + auto mdata = bsendbufs[mcore]->MutData(); + for (auto& buf_it : *buf) { + memcpy(mdata, buf_it.Data(), buf_it.Length()); + mdata += buf_it.Length(); + } + //SendTCPUnchained(std::move(b), len, pinfo); + SendTCPUnchained(reinterpret_cast(bsendbufs[mcore]->Data()), len, pinfo); + ixgmq_.stat_num_tx_desc += num_chains; + } else { + SendTCPUnchained(reinterpret_cast(buf->Data()), len, pinfo); + ixgmq_.stat_num_tx_desc += 1; + //SendTCPUnchained(std::move(buf), len, pinfo); + } + } +//#endif + + asm volatile("sfence" ::: "memory"); + WriteTdt_1(mcore, ixgmq_.tx_tail_); +} + +void ebbrt::IxgbeDriver::WriteRxctrl(uint32_t m) { + // Disable RXCTRL - 8.2.3.8.10 + bar0_.Write32(0x03000, m); +} + +void ebbrt::IxgbeDriver::WriteDmatxctl(uint32_t m) { + uint32_t reg; + + reg = bar0_.Read32(0x04A80); + ebbrt::kprintf("0x04A80: DMATXCTL 0x%08X - reset to 0x%08X\n", reg, reg & m); + + // DMATXCTL - 8.2.3.9.2 + bar0_.Write32(0x04A80, reg & m); +} +void ebbrt::IxgbeDriver::WriteDmatxctl_te(uint32_t m) { + auto reg = bar0_.Read32(0x04A80); + ebbrt::kprintf("DMATXCTL= 0x%X\n", reg | m); + bar0_.Write32(0x04A80, reg | m); +} + +//8.2.3.5.10 Extended Interrupt Auto Mask Enable registers — EIAM[n] (0x00AD0 + 4*(n-1), n=1...2; RW) 
+void ebbrt::IxgbeDriver::WriteEiam(uint32_t n, uint32_t m) { + bar0_.Write32(0x00AD0 + 4*n, m); +} + +// 8.2.3.5.18 - General Purpose Interrupt Enable — GPIE (0x00898; RW) +void ebbrt::IxgbeDriver::WriteGpie(uint32_t m) { + //auto reg = bar0_.Read32(0x00898); + //bar0_.Write32(0x00898, reg | m); + bar0_.Write32(0x00898, m); +} + +// 8.2.3.5.1 Extended Interrupt Cause Register- EICR (0x00800; RW1C) +void ebbrt::IxgbeDriver::ReadEicr() { + /* Note + * The EICR is also cleared on read if GPIE.OCD bit is cleared. When the + * GPIE.OCD bit is set, then only bits 16...29 are cleared on read. + */ + // 8.2.3.5.18 General Purpose Interrupt Enable — GPIE (0x00898;RW) + uint32_t reg; + reg = bar0_.Read32(0x00898); + ebbrt::kbugon((reg & 0x20), "GPIE.OCD not cleared\n"); + + reg = bar0_.Read32(0x00800); + ebbrt::kprintf("First Read - 0x00800: EICR 0x%08X, ", reg); + + reg = bar0_.Read32(0x00800); + ebbrt::kprintf("Second Read - EICR 0x%08X\n", reg); +} +void ebbrt::IxgbeDriver::WriteEicr(uint32_t m) { + auto reg = bar0_.Read32(0x00800); + bar0_.Write32(0x00800, reg | m); +} + +// 8.2.3.5.3 Extended Interrupt Mask Set/Read Register- EIMS (0x00880; RWS) +uint32_t ebbrt::IxgbeDriver::ReadEims() { return bar0_.Read32(0x00880); } +void ebbrt::IxgbeDriver::WriteEims(uint32_t m) { bar0_.Write32(0x00880, m); } + +// 8.2.3.5.4 Extended Interrupt Mask Clear Register- EIMC (0x00888; WO) +void ebbrt::IxgbeDriver::WriteEimc(uint32_t m) { bar0_.Write32(0x00888, m); } + +// 8.2.3.5.5 Extended Interrupt Auto Clear Register — EIAC (0x00810; RW) +void ebbrt::IxgbeDriver::WriteEiac(uint32_t m) { + auto reg = bar0_.Read32(0x00810); + bar0_.Write32(0x00810, reg | m); +} + +// 8.2.3.5.8 Extended Interrupt Mask Set/Read Registers — EIMS[n] (0x00AA0 + +// 4*(n-1), n=1...2; RWS) +void ebbrt::IxgbeDriver::WriteEimsn(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00AA0 + 4 * n); + bar0_.Write32(0x00AA0 + 4 * n, reg | m); +} + +// 8.2.3.5.12 +// Extended Interrupt Throttle Registers — EITR[n] 
+// (0x00820 + 4*n, n=0...23 and 0x012300 + 4*(n-24), +// n=24...128; RW) +void ebbrt::IxgbeDriver::WriteEitr(uint32_t n, uint32_t m) { + ebbrt::kbugon(n > 128, "%s error\n", __FUNCTION__); + + if (n < 24) { + bar0_.Write32(0x00820 + 4 * n, m); + } else { + bar0_.Write32(0x012300 + 4 * (n - 24), m); + } +} + +// 8.2.3.9.10 Transmit Descriptor Control — TXDCTL[n] (0x06028+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTxdctl(uint32_t n, uint32_t m) { + bar0_.Write32(0x06028 + (0x40 * n), m); +} +uint8_t ebbrt::IxgbeDriver::ReadTxdctl_enable(uint32_t n) { + auto reg = bar0_.Read32(0x06028 + 0x40 * n); + return (reg >> 25) & 0x1; +} + +// 8.2.3.8.6 Receive Descriptor Control — RXDCTL[n] (0x01028 + +// 0x40*n, n=0...63 and 0x0D028 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRxdctl_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01028 + (0x40 * n), m); +} +void ebbrt::IxgbeDriver::WriteRxdctl_1_enable(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x01028 + (0x40 * n)); + bar0_.Write32(0x01028 + (0x40 * n), reg | m); +} + +uint8_t ebbrt::IxgbeDriver::ReadRxdctl_1_enable(uint32_t n) { + auto reg = bar0_.Read32(0x01028 + (0x40 * n)); + return (reg >> 25) & 0x1; +} + +void ebbrt::IxgbeDriver::WriteRxdctl_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D028 + (0x40 * n), m); +} + +// 8.2.3.27.14 PF VM L2 Control Register — PFVML2FLT[n] (0x0F000 + 4*n, +// n=0...63; RW) +void ebbrt::IxgbeDriver::WritePfvml2flt(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F000 + 4 * n, m); +} + +// 8.2.3.9.14 Manageability Transmit TC Mapping — MNGTXMAP (0x0CD10; RW) +void ebbrt::IxgbeDriver::WriteMngtxmap(uint32_t m) { + bar0_.Write32(0x0CD10, m); +} + +// 8.2.3.1.1 Device Control Register — CTRL (0x00000 / 0x00004;RW) +void ebbrt::IxgbeDriver::WriteCtrl(uint32_t m) { bar0_.Write32(0x0, m); } +void ebbrt::IxgbeDriver::ReadCtrl() { + uint32_t reg; + reg = bar0_.Read32(0x0); + ebbrt::kprintf("%s = 0x%X\n", __FUNCTION__, reg); +} + +// 8.2.3.1.3 Extended Device 
Control Register — CTRL_EXT (0x00018; RW) +void ebbrt::IxgbeDriver::WriteCtrlExt(uint32_t m) { + //auto reg = bar0_.Read32(0x00018); + //bar0_.Write32(0x00018, reg | m); + bar0_.Write32(0x00018, m); +} + +// 8.2.3.7.1 Filter Control Register — FCTRL (0x05080; RW) +void ebbrt::IxgbeDriver::WriteFctrl(uint32_t m) { bar0_.Write32(0x05080, m); } + +// 8.2.3.24.9 Flexible Host Filter Table Registers — FHFT (0x09000 — 0x093FC and +// 0x09800 — 0x099FC; RW) +void ebbrt::IxgbeDriver::WriteFhft_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x09000, m); +} +void ebbrt::IxgbeDriver::WriteFhft_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x09800, m); +} + +// 8.2.3.1.2 Device Status Register — STATUS (0x00008; RO) +bool ebbrt::IxgbeDriver::ReadStatusPcieMes() { + auto reg = bar0_.Read32(0x8); + return !(reg & 0x80000); +} +uint8_t ebbrt::IxgbeDriver::ReadStatusLanId() { + auto reg = bar0_.Read32(0x8); + return (reg >> 2) & 0x3; +} + +// 8.2.3.3.2 Flow Control Transmit Timer Value n — FCTTVn (0x03200 + 4*n, +// n=0...3; RW) +void ebbrt::IxgbeDriver::WriteFcttv(uint32_t n, uint32_t m) { + bar0_.Write32(0x03200 + (4 * n), m); +} + +// 8.2.3.3.3 Flow Control Receive Threshold Low — FCRTL[n] (0x03220 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteFcrtl(uint32_t n, uint32_t m) { + bar0_.Write32(0x03220 + (4 * n), m); +} + +// 8.2.3.3.4 Flow Control Receive Threshold High — FCRTH[n] (0x03260 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteFcrth(uint32_t n, uint32_t m) { + bar0_.Write32(0x03260 + (4 * n), m); +} + +// 8.2.3.3.5 Flow Control Refresh Threshold Value — FCRTV (0x032A0; RW) +void ebbrt::IxgbeDriver::WriteFcrtv(uint32_t m) { bar0_.Write32(0x032A0, m); } + +// 8.2.3.3.7 Flow Control Configuration — FCCFG (0x03D00; RW) +void ebbrt::IxgbeDriver::WriteFccfg(uint32_t m) { bar0_.Write32(0x03D00, m); } + +// 8.2.3.2.2 EEPROM Read Register — EERD (0x10014; RW) +void ebbrt::IxgbeDriver::WriteEerd(uint32_t m) { bar0_.Write32(0x10014, m); } +bool 
ebbrt::IxgbeDriver::ReadEerdDone() { + auto reg = bar0_.Read32(0x10014); + return !!(reg & 0x2); // return true when Read Done = 1 +} + +uint16_t ebbrt::IxgbeDriver::ReadEerdData() { + auto reg = bar0_.Read32(0x10014); + return (reg >> 16) & 0xFFFF; +} + +uint16_t ebbrt::IxgbeDriver::ReadEeprom(uint16_t offset) { + WriteEerd(offset << 2 | 1); + // TODO: Timeout + while (ReadEerdDone() == 0) + ; + return ReadEerdData(); +} + +// 8.2.3.22.32 - Core Analog Configuration Register — CoreCTL (0x014F00; RW) +void ebbrt::IxgbeDriver::WriteCorectl(uint16_t m) { + bar0_.Write32(0x014F00, 0x0 | m); +} + +// 8.2.3.22.19 Auto Negotiation Control Register — AUTOC (0x042A0; RW) +void ebbrt::IxgbeDriver::WriteAutoc(uint32_t m) { + auto reg = bar0_.Read32(0x042A0); + bar0_.Write32(0x042A0, reg | m); +} +uint8_t ebbrt::IxgbeDriver::ReadAutocRestartAn() { + auto reg = bar0_.Read32(0x042A0); + return (reg >> 12) & 0x1; +} + +// 8.2.3.22.23 Auto Negotiation Link Partner Link Control Word 1 Register — +// ANLP1 (0x042B0; RO) +uint8_t ebbrt::IxgbeDriver::ReadAnlp1() { + auto reg = bar0_.Read32(0x042B0); + return (reg >> 16) & 0xFF; +} + +// 8.2.3.2.1 EEPROM/Flash Control Register — EEC (0x10010; RW) +uint8_t ebbrt::IxgbeDriver::ReadEecAutoRd() { + auto reg = bar0_.Read32(0x10010); + return (reg >> 9) & 0xFF; +} + +// 8.2.3.7.7 Multicast Table Array — MTA[n] (0x05200 + 4*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteMta(uint32_t n, uint32_t m) { + bar0_.Write32(0x05200 + (4 * n), m); +} + +// 8.2.3.7.11 VLAN Filter Table Array — VFTA[n] (0x0A000 + 4*n,n=0...127; RW) +void ebbrt::IxgbeDriver::WriteVfta(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A000 + (4 * n), m); +} + +// 8.2.3.27.15 PF VM VLAN Pool Filter — PFVLVF[n] (0x0F100 + 4*n, n=0...63; RW) +void ebbrt::IxgbeDriver::WritePfvlvf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F100 + 4 * n, m); +} + +// 8.2.3.27.16 PF VM VLAN Pool Filter Bitmap — PFVLVFB[n] (0x0F200 + 4*n, +// n=0...127; RW) +void 
ebbrt::IxgbeDriver::WritePfvlvfb(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F200 + 4 * n, m); +} + +// 8.2.3.7.23 Rx Filter ECC Err Insertion 0 — RXFECCERR0 (0x051B8; RW) +void ebbrt::IxgbeDriver::WriteRxfeccerr0(uint32_t m) { + auto reg = bar0_.Read32(0x051B8); + bar0_.Write32(0x051B8, reg | m); +} + +// Checks the MAC's EEPROM to see if it supports a given SFP+ module type, if +// 1360 +// so it returns the offsets to the phy init sequence block. +// also based on +// http://lxr.free-electrons.com/source/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c?v=3.14#L1395 +// https://github.com/freebsd/freebsd/blob/386ddae58459341ec567604707805814a2128a57/sys/dev/ixgbe/ixgbe_82599.c#L173 +void ebbrt::IxgbeDriver::PhyInit() { + + uint16_t list_offset; + uint16_t data_offset = 0x0; + uint16_t data_value; + uint16_t sfp_id; + uint16_t sfp_type = 0x4; /* SPF_DA_CORE1 */ + + /* IXGBE_PHY_INIT_OFFSET_NL */ + list_offset = ReadEeprom(0x002B); + + if ((list_offset == 0x0) || (list_offset == 0xFFFF)) { + return; + } + + /* Shift offset to first ID word */ + list_offset++; + + sfp_id = ReadEeprom(list_offset); + + while (sfp_id != 0xFFFF) { + if (sfp_id == sfp_type) { + list_offset++; + data_offset = ReadEeprom(list_offset); + if ((data_offset == 0x0) || (data_offset == 0xFFFF)) { + ebbrt::kprintf("sfp init failed\n"); + return; + } else { + break; + } + } else { + list_offset += 2; + sfp_id = ReadEeprom(list_offset); + } + list_offset++; + } + + if (sfp_id == 0xFFFF) { + ebbrt::kprintf("sfp init failed\n"); + return; + } + + ebbrt::kprintf("data offset -> 0x%x\n", data_offset); + + SwfwLockPhy(); + + data_value = ReadEeprom(++data_offset); + while (data_value != 0xFFFF) { + ebbrt::kprintf("data_value -> 0x%x\n", data_value); + WriteCorectl(data_value); + data_value = ReadEeprom(++data_offset); + } + SwfwUnlockPhy(); + + ebbrt::clock::SleepMilli(20); + + WriteAutoc(0x0 << 13 | 0x1 << 12); + while (ReadAnlp1() != 0) + ; // TODO: timeout + + WriteAutoc(0x3 << 13 | 0x1 << 12); + while 
(ReadAutocRestartAn() != 0) + ; // TODO: timeout + + ebbrt::kprintf("PHY init done\n"); +} + +// 8.2.3.7.8 Receive Address Low — RAL[n] (0x0A200 + 8*n, n=0...127; RW) +uint32_t ebbrt::IxgbeDriver::ReadRal(uint32_t n) { + auto reg = bar0_.Read32(0x0A200 + 8 * n); + return reg; +} +void ebbrt::IxgbeDriver::WriteRal(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A200 + (8 * n), m); +} + +// 8.2.3.7.9 Receive Address High — RAH[n] (0x0A204 + 8*n, n=0...127; RW) +uint16_t ebbrt::IxgbeDriver::ReadRah(uint32_t n) { + auto reg = bar0_.Read32(0x0A204 + 8 * n); + return (reg)&0xFFFF; +} +uint8_t ebbrt::IxgbeDriver::ReadRahAv(uint32_t n) { + return (bar0_.Read32(0x0A204 + 8 * n) >> 31) & 0xFF; +} +void ebbrt::IxgbeDriver::WriteRah(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A204 + (8 * n), m); +} + +// 8.2.3.7.10 MAC Pool Select Array — MPSAR[n] (0x0A600 + 4*n, n=0...255; RW) +void ebbrt::IxgbeDriver::WriteMpsar(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A600 + 4 * n, m); +} + +// 8.2.3.7.19 Five tuple Queue Filter — FTQF[n] (0x0E600 + 4*n,n=0...127; RW) +void ebbrt::IxgbeDriver::WriteFtqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E600 + 4 * n, m); +} + +// 8.2.3.7.16 Source Address Queue Filter — SAQF[n] (0x0E000 + 4*n, n=0...127; +// RW) +void ebbrt::IxgbeDriver::WriteSaqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E000 + 4 * n, m); +} + +// 8.2.3.7.17 Destination Address Queue Filter — DAQF[n] (0x0E200 + 4*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteDaqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E200 + 4 * n, m); +} + +// 8.2.3.7.18 Source Destination Port Queue Filter — SDPQF[n] (0x0E400 + 4*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteSdpqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E400 + 4 * n, m); +} + +// 8.2.3.27.17 PF Unicast Table Array — PFUTA[n] (0x0F400 + 4*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WritePfuta(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F400 + 4 * n, m); +} + +// 8.2.3.7.3 Multicast Control Register — MCSTCTRL 
(0x05090; RW) +void ebbrt::IxgbeDriver::WriteMcstctrl(uint32_t m) { + auto reg = bar0_.Read32(0x05090); + bar0_.Write32(0x05090, reg & m); +} + +// 8.2.3.10.13 DCB Transmit Descriptor Plane Queue Select — RTTDQSEL (0x04904; +// RW) +void ebbrt::IxgbeDriver::WriteRttdqsel(uint32_t m) { + auto reg = bar0_.Read32(0x04904); + bar0_.Write32(0x04904, reg | m); +} + +// 8.2.3.10.14 DCB Transmit Descriptor Plane T1 Config — RTTDT1C (0x04908; RW) +void ebbrt::IxgbeDriver::WriteRttdt1c(uint32_t m) { bar0_.Write32(0x04908, m); } + +// 8.2.3.10.16 DCB Transmit Rate-Scheduler Config — RTTBCNRC (0x04984; RW) +void ebbrt::IxgbeDriver::WriteRttbcnrc(uint32_t m) { + bar0_.Write32(0x04984, m); +} + +// 8.2.3.10.9 DCB Transmit Descriptor Plane T2 Config - RTTDT2C[n] (0x04910 + +// 4*n, n=0...7; RW) DMA-Tx +void ebbrt::IxgbeDriver::WriteRttdt2c(uint32_t n, uint32_t m) { + bar0_.Write32(0x04910 + 4 * n, m); +} + +// 8.2.3.10.10 DCB Transmit Packet Plane T2 Config — RTTPT2C[n] (0x0CD20 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteRttpt2c(uint32_t n, uint32_t m) { + bar0_.Write32(0x0CD20 + 4 * n, m); +} + +// 8.2.3.10.6 DCB Receive Packet Plane T4 Config — RTRPT4C[n] (0x02140 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteRtrpt4c(uint32_t n, uint32_t m) { + bar0_.Write32(0x02140 + 4 * n, m); +} + +// 8.2.3.10.1 DCB Receive Packet Plane Control and Status — RTRPCS (0x02430; RW) +void ebbrt::IxgbeDriver::WriteRtrpcs(uint32_t m) { bar0_.Write32(0x02430, m); } + +// 8.2.3.11.2 Tx DCA Control Registers — DCA_TXCTRL[n] (0x0600C + 0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteDcaTxctrlTxdescWbro(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0600C + 0x40 * n); + bar0_.Write32(0x0600C + 0x40 * n, reg & m); +} +void ebbrt::IxgbeDriver::ReadDcaTxctrl(uint32_t n) { + auto reg = bar0_.Read32(0x0600C + 0x40 * n); + ebbrt::kprintf("DCA_TXCTRL=0x%X\n", reg); +} + +// 8.2.3.11.1 Rx DCA Control Register — DCA_RXCTRL[n] (0x0100C + 0x40*n, +// n=0...63 and 0x0D00C + 
0x40*(n-64), +// n=64...127 / 0x02200 + 4*n, [n=0...15]; RW) +void ebbrt::IxgbeDriver::WriteDcaRxctrl_1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + bar0_.Write32(0x0100C + 0x40 * n, reg & m); +} +void ebbrt::IxgbeDriver::ReadDcaRxctrl(uint32_t n) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + ebbrt::kprintf("DCA_RXCTRL=0x%X\n", reg); +} + +void ebbrt::IxgbeDriver::WriteDcaRxctrl_2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0D00C + 0x40 * n); + bar0_.Write32(0x0D00C + 0x40 * n, reg & m); +} + +// 8.2.3.7.5 Receive Checksum Control — RXCSUM (0x05000; RW) +void ebbrt::IxgbeDriver::WriteRxcsum(uint32_t m) { + //auto reg = bar0_.Read32(0x05000); + //bar0_.Write32(0x05000, reg | m); + bar0_.Write32(0x05000, m); +} + +// 8.2.3.8.13 RSC Control — RSCCTL[n] (0x0102C + 0x40*n, n=0...63 +// and 0x0D02C + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRscctl(uint32_t n, uint32_t m) { + //auto reg = bar0_.Read32(0x0102C + 0x40 * n); + //bar0_.Write32(0x0102C + 0x40 * n, reg | m); + bar0_.Write32(0x0102C + 0x40 * n, m); +} + +// 8.2.3.7.4 Packet Split Receive Type Register — PSRTYPE[n] +// (0x0EA00 + 4*n, n=0...63 / 0x05480 + 4*n, n=0...15; RW) +void ebbrt::IxgbeDriver::WritePsrtype(uint32_t n, uint32_t m) { + bar0_.Write32(0x0EA00 + 4 * n, m); +} + +void ebbrt::IxgbeDriver::WritePsrtypeZero(uint32_t n) { + //bar0_.Write32(0x0EA00 + 0x40 * n, 0x0); + bar0_.Write32(0x05480, n); +} + +// 8.2.3.7.15 Redirection Table — RETA[n] (0x0EB00 + 4*n, n=0...31/ 0x05C00 + +// 4*n, n=0...31; RW) +void ebbrt::IxgbeDriver::WriteReta(uint32_t n, uint32_t m) { + //bar0_.Write32(0x0EB00 + 4 * n, m); + bar0_.Write32(0x05C00 + 4 * n, m); + ebbrt::kprintf("WriteReta(n=%d) %X = 0x%08X\n", n, 0x05C00 + 4 * n, m); +} + +// 8.2.3.7.6 Receive Filter Control Register — RFCTL (0x05008; RW) +void ebbrt::IxgbeDriver::WriteRfctl(uint32_t m) { bar0_.Write32(0x05008, m); } + +// 8.2.3.9.16 Tx Packet Buffer Threshold — +// TXPBTHRESH (0x04950 +0x4*n, 
n=0...7; RW) +void ebbrt::IxgbeDriver::WriteTxpbthresh(uint32_t n, uint32_t m) { + bar0_.Write32(0x04950 + 0x4 * n, m); +} + +// 8.2.3.7.12 Multiple Receive Queues Command Register- MRQC (0x0EC80 / 0x05818; +// RW) +void ebbrt::IxgbeDriver::WriteMrqc(uint32_t m) { + //auto reg = bar0_.Read32(0x0EC80); + //bar0_.Write32(0x0EC80, reg | m); + bar0_.Write32(0x05818, m); +} + +// 8.2.3.9.15 Multiple Transmit Queues Command Register — MTQC (0x08120; RW) +void ebbrt::IxgbeDriver::WriteMtqc(uint32_t m) { bar0_.Write32(0x08120, m); } + +// 8.2.3.27.1 VT Control Register — PFVTCTL (0x051B0; RW) +void ebbrt::IxgbeDriver::WritePfvtctl(uint32_t m) { bar0_.Write32(0x051B0, m); } + +// 8.2.3.10.4 DCB Receive User Priority to Traffic Class — RTRUP2TC (0x03020; +// RW) +void ebbrt::IxgbeDriver::WriteRtrup2tc(uint32_t m) { + bar0_.Write32(0x03020, m); +} + +// 8.2.3.10.5 DCB Transmit User Priority to Traffic Class — RTTUP2TC (0x0C800; +// RW) +void ebbrt::IxgbeDriver::WriteRttup2tc(uint32_t m) { + bar0_.Write32(0x0C800, m); +} + +// 8.2.3.9.1 DMA Tx TCP Max Allow Size Requests — DTXMXSZRQ (0x08100; RW) +void ebbrt::IxgbeDriver::WriteDtxmxszrq(uint32_t m) { + auto reg = bar0_.Read32(0x08100); + bar0_.Write32(0x08100, reg | m); +} + +// 8.2.3.27.9 PF PF Queue Drop Enable Register — PFQDE (0x02F04; RW) +void ebbrt::IxgbeDriver::WritePfqde(uint32_t m) { bar0_.Write32(0x02F04, m); } + +// 8.2.3.22.34 MAC Flow Control Register — MFLCN (0x04294; RW) +void ebbrt::IxgbeDriver::WriteMflcn(uint32_t m) { + auto reg = bar0_.Read32(0x04294); + bar0_.Write32(0x04294, reg | m); +} + +// 8.2.3.3.7 Flow Control Configuration — FCCFG (0x03D00; RW) +/*void ebbrt::IxgbeDriver::WriteFccfg(uint32_t m) { + auto reg = bar0_.Read32(0x03D00); + bar0_.Write32(0x03D00, reg | m); + }*/ + +// void ebbrt::IxgbeDriver::WriteDcaRxctrl_2_RxdataWrro(uint32_t n, uint32_t m); + +// 8.2.3.4.9 - Software Semaphore Register — SWSM (0x10140; RW) +bool ebbrt::IxgbeDriver::SwsmSmbiRead() { + return !!(bar0_.Read32(0x10140) & 
0x1); +} +bool ebbrt::IxgbeDriver::SwsmSwesmbiRead() { + return !(bar0_.Read32(0x10140) & 0x2); +} +void ebbrt::IxgbeDriver::SwsmSwesmbiSet() { + auto reg = bar0_.Read32(0x10140); + ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg | 0x2); + bar0_.Write32(0x10140, reg | 0x2); +} +void ebbrt::IxgbeDriver::SwsmSmbiClear() { + auto reg = bar0_.Read32(0x10140); + ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg & 0xFFFFFFFE); + bar0_.Write32(0x10140, reg & 0xFFFFFFFE); +} +void ebbrt::IxgbeDriver::SwsmSwesmbiClear() { + auto reg = bar0_.Read32(0x10140); + ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg & 0xFFFFFFFD); + bar0_.Write32(0x10140, reg & 0xFFFFFFFD); +} + +// 8.2.3.22.20 Link Status Register — LINKS (0x042A4; RO) +bool ebbrt::IxgbeDriver::ReadLinksLinkUp() { + auto reg = bar0_.Read32(0x042A4); + return ((reg >> 30) & 0x1) == 1; +} + +// 8.2.3.4.11 Software-Firmware Synchronization - SW_FW_SYNC (0x10160; RW) +uint32_t ebbrt::IxgbeDriver::ReadSwfwSyncSmBits(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + return (reg & m) & 0x3FF; // masking bits 9:0 +} +void ebbrt::IxgbeDriver::WriteSwfwSyncSmBits(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + bar0_.Write32(0x10160, reg | m); +} +void ebbrt::IxgbeDriver::WriteSwfwSyncSmBits2(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + bar0_.Write32(0x10160, reg & m); +} + +// 8.2.3.11.1 Rx DCA Control Register — DCA_RXCTRL[n] (0x0100C + 0x40*n, +// n=0...63 and 0x0D00C + 0x40*(n-64), // n=0...63 and 0x0D00C + 0x40*(n-64), +// n=64...127 / 0x02200 + 4*n, [n=0...15]; RW) // n=64...127 / 0x02200 + 4*n, +// [n=0...15]; RW) +void ebbrt::IxgbeDriver::WriteDcaRxctrl(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + bar0_.Write32(0x0100C + 0x40 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteDcaRxctrlClear(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + 
bar0_.Write32(0x0100C + 0x40 * n, reg & m); +} + +// 8.2.3.11.4 DCA Control Register — DCA_CTRL (0x11074; RW) +void ebbrt::IxgbeDriver::WriteDcaCtrl(uint32_t m) { + auto reg = bar0_.Read32(0x11074); + bar0_.Write32(0x11074, reg | m); +} + +// 8.2.3.11.2 Tx DCA Control Registers — DCA_TXCTRL[n] (0x0600C + 0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteDcaTxctrl(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0600C + 0x40 * n); + bar0_.Write32(0x0600C + 0x40 * n, reg | m); +} + +// 8.2.3.8.1 Receive Descriptor Base Address Low — RDBAL[n] (0x01000 + 0x40*n, +// n=0...63 and 0x0D000 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdbal_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01000 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdbal_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D000 + 0x40 * n, m); +} + +// 8.2.3.8.2 Receive Descriptor Base Address High — RDBAH[n] (0x01004 + 0x40*n, +// n=0...63 and 0x0D004 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdbah_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01004 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdbah_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D004 + 0x40 * n, m); +} + +// 8.2.3.9.5 Transmit Descriptor Base Address Low — TDBAL[n] (0x06000+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdbal(uint32_t n, uint32_t m) { + bar0_.Write32(0x06000 + 0x40 * n, m); +} + +// 8.2.3.9.6 Transmit Descriptor Base Address High — TDBAH[n] (0x06004+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdbah(uint32_t n, uint32_t m) { + bar0_.Write32(0x06004 + 0x40 * n, m); +} + +// 8.2.3.9.7 Transmit Descriptor Length — TDLEN[n] (0x06008+0x40*n, n=0...127; +// RW) +void ebbrt::IxgbeDriver::WriteTdlen(uint32_t n, uint32_t m) { + bar0_.Write32(0x06008 + 0x40 * n, m); +} + +// 8.2.3.9.8 Transmit Descriptor Head — TDH[n] (0x06010+0x40*n, n=0...127; RO) +void ebbrt::IxgbeDriver::WriteTdh(uint32_t n, uint32_t m) { + bar0_.Write32(0x06010 + 0x40 * n, m); +} 
+uint16_t ebbrt::IxgbeDriver::ReadTdh(uint32_t n) { + auto reg = bar0_.Read32(0x06010 + 0x40 * n); + return reg & 0xFFFF; +} +uint32_t ebbrt::IxgbeDriver::ReadTdt(uint32_t n) { + return bar0_.Read32(0x06018 + 0x40 * n) & 0xFFFF; +} + +// 8.2.3.9.11 Tx Descriptor Completion Write Back Address Low — +// TDWBAL[n] (0x06038+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdwbal(uint32_t n, uint32_t m) { + bar0_.Write32(0x06038 + 0x40 * n, m); +} +// 8.2.3.9.12 Tx Descriptor Completion Write Back Address High — +// TDWBAH[n] (0x0603C+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdwbah(uint32_t n, uint32_t m) { + bar0_.Write32(0x0603C + 0x40 * n, m); +} + +// 8.2.3.9.9 Transmit Descriptor Tail — TDT[n] (0x06018+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdt(uint32_t n, uint32_t m) { + bar0_.Write32(0x06018 + 0x40 * n, m); +} + +// 8.2.3.8.3 Receive Descriptor Length — RDLEN[n] (0x01008 + 0x40*n, n=0...63 +// and 0x0D008 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdlen_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01008 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdlen_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D008 + 0x40 * n, m); +} + +// 8.2.3.8.7 Split Receive Control Registers — SRRCTL[n] (0x01014 + 0x40*n, +// n=0...63 and 0x0D014 + 0x40*(n-64), n=64...127 / 0x02100 + 4*n, [n=0...15]; +// RW) +void ebbrt::IxgbeDriver::WriteSrrctl_1(uint32_t n, uint32_t m) { + //auto reg = bar0_.Read32(0x01014 + 0x40 * n); + bar0_.Write32(0x01014 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteSrrctlZero(uint32_t n) { + bar0_.Write32(0x01014 + 0x40 * n, 0x0); +} + +// 8.2.3.8.12 RSC Data Buffer Control Register — RSCDBU (0x03028; RW) +void ebbrt::IxgbeDriver::WriteRscdbu(uint32_t m) { + //auto reg = bar0_.Read32(0x03028); + bar0_.Write32(0x03028, m); +} + +void ebbrt::IxgbeDriver::WriteSrrctl_1_desctype(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x01014 + 0x40 * n); + bar0_.Write32(0x01014 + 0x40 * n, reg & m); +} + 
+// 8.2.3.8.8 Receive DMA Control Register — RDRXCTL (0x02F00; RW) +void ebbrt::IxgbeDriver::WriteRdrxctl(uint32_t m) { + auto reg = bar0_.Read32(0x02F00); + bar0_.Write32(0x02F00, reg | m); +} + +void ebbrt::IxgbeDriver::WriteRdrxctlRSCFRSTSIZE(uint32_t m) { + auto reg = bar0_.Read32(0x02F00); + bar0_.Write32(0x02F00, reg & m); +} + +uint8_t ebbrt::IxgbeDriver::ReadRdrxctlDmaidone() { + auto reg = bar0_.Read32(0x02F00); + return (reg >> 3) & 0x1; +} + +// 8.2.3.8.9 Receive Packet Buffer Size — RXPBSIZE[n] (0x03C00 + 4*n, n=0...7; +// RW) +void ebbrt::IxgbeDriver::WriteRxpbsize(uint32_t n, uint32_t m) { + bar0_.Write32(0x03C00 + 4 * n, m); +} + +// 8.2.3.9.13 Transmit Packet Buffer Size — TXPBSIZE[n] (0x0CC00 + 0x4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteTxpbsize(uint32_t n, uint32_t m) { + bar0_.Write32(0x0CC00 + 0x4 * n, m); +} + +// 8.2.3.9.16 Tx Packet Buffer Threshold — TXPBTHRESH (0x04950+0x4*n, n=0...7; +// RW) +void ebbrt::IxgbeDriver::WriteTxpbThresh(uint32_t n, uint32_t m) { + bar0_.Write32(0x04950 + 0x4 * n, m); +} + +// 8.2.3.22.8 MAC Core Control 0 Register — HLREG0 (0x04240; RW) +void ebbrt::IxgbeDriver::WriteHlreg0(uint32_t m) { + //auto reg = bar0_.Read32(0x04240); + bar0_.Write32(0x04240, m); +} + +// 8.2.3.22.13 Max Frame Size — MAXFRS (0x04268; RW) +void ebbrt::IxgbeDriver::WriteMaxfrs(uint32_t m) { + bar0_.Write32(0x04268, m); +} + +// 8.2.3.8.5 Receive Descriptor Tail — RDT[n] (0x01018 + 0x40*n, n=0...63 and +// 0x0D018 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdt_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01018 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdt_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D018 + 0x40 * n, m); +} + +// 8.2.3.8.4 Receive Descriptor Head — RDH[n] (0x01010 + 0x40*n, n=0...63 and +// 0x0D010 + 0x40*(n-64), n=64...127; RO) +void ebbrt::IxgbeDriver::WriteRdh_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01010 + 0x40 * n, m); +} +void ebbrt::IxgbeDriverRep::WriteRdh_1(uint32_t 
n, uint32_t m) { + root_.bar0_.Write32(0x01010 + 0x40 * n, m); +} + +uint16_t ebbrt::IxgbeDriver::ReadRdh_1(uint32_t n) { + auto reg = bar0_.Read32(0x01010 + 0x40 * n); + return reg & 0xFFFF; +} + +uint16_t ebbrt::IxgbeDriver::ReadRdt_1(uint32_t n) { + auto reg = bar0_.Read32(0x01018 + 0x40 * n); + return reg & 0xFFFF; +} + +void ebbrt::IxgbeDriver::SwfwSemRelease() { + SwsmSwesmbiClear(); + SwsmSmbiClear(); + ebbrt::kprintf("%s\n", __FUNCTION__); +} + +// 8.2.3.5.16 Interrupt Vector Allocation Registers — IVAR[n] (0x00900 + 4*n, +// n=0...63; RW) +void ebbrt::IxgbeDriver::WriteIvarAlloc0(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); + ebbrt::kprintf("IVAR: 0x%X 0x%X\n", 0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval0(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); + ebbrt::kprintf("IVAR: 0x%X 0x%X\n", 0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); + ebbrt::kprintf("IVAR: 0x%X 0x%X\n", 0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); + ebbrt::kprintf("IVAR: 0x%X 0x%X\n", 0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc3(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval3(uint32_t n, 
uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +// 8.2.3.10.2 DCB Transmit Descriptor Plane Control and Status — RTTDCS +// (0x04900; RW) DMA-Tx +void ebbrt::IxgbeDriver::WriteRttdcs(uint32_t m) { + //auto reg = bar0_.Read32(0x04900); + bar0_.Write32(0x04900, m); +} +void ebbrt::IxgbeDriver::WriteRttdcsArbdisEn(uint32_t m) { + auto reg = bar0_.Read32(0x04900); + bar0_.Write32(0x04900, reg & m); +} + +// 8.2.3.10.3 DCB Transmit Packet Plane Control and Status- RTTPCS (0x0CD00; RW) +void ebbrt::IxgbeDriver::WriteRttpcs(uint32_t m) { bar0_.Write32(0x0CD00, m); } + +// 8.2.3.12.5 Security Rx Control — SECRXCTRL (0x08D00; RW) +void ebbrt::IxgbeDriver::WriteSecrxctrl_Rx_Dis(uint32_t m) { + bar0_.Write32(0x08D00, m); + /*auto reg = bar0_.Read32(0x08D00); + if (m) { + bar0_.Write32(0x08D00, reg | m); + } else { + bar0_.Write32(0x08D00, reg & ~(0x1 << 1)); + }*/ +} + +// 8.2.3.12.6 Security Rx Status — SECRXSTAT (0x08D04; RO) +uint8_t ebbrt::IxgbeDriver::ReadSecrxstat_Sr_Rdy() { + auto reg = bar0_.Read32(0x08D04); + return reg & 0x1; +} + +// 8.2.3.23.59 Total Packets Received — TPR (0x040D0; RC) +uint32_t ebbrt::IxgbeDriver::ReadTpr() { + auto reg = bar0_.Read32(0x040D0); + ebbrt::kprintf("%s %d\n", __FUNCTION__, reg); + return reg; +} + +// 8.2.3.23.26 Good Packets Received Count — GPRC (0x04074; RO) +uint32_t ebbrt::IxgbeDriver::ReadGprc() { + auto reg = bar0_.Read32(0x04074); + ebbrt::kprintf("%s %d\n", __FUNCTION__, reg); + return reg; +} + +bool ebbrt::IxgbeDriver::SwfwSemAcquire() { + // polls SWSM.SMBI until 0b is read or timeout + // TODO: timeout after 10 ms + while (SwsmSmbiRead()) + ; + + // writes 1b to SWSM.SWESMBI bit + SwsmSwesmbiSet(); + + // polls SWSM.SWESMBI bit until read as 1b + // TODO: timeout of 3 secs + while (SwsmSwesmbiRead()) + ; + + return true; +} + +// 10.5.4 Software and Firmware Synchronization +bool ebbrt::IxgbeDriver::SwfwLockPhy() { + bool good = false; + +again: + if 
(!SwfwSemAcquire()) { + ebbrt::kabort("SwfwSemAcquire failed\n"); + } else { + ebbrt::kprintf("SWSM Sem acquired\n"); + } + + if ((ReadStatusLanId() == 0) && (ReadSwfwSyncSmBits(0x2) == 0) // SW_PHY_SM0 + && (ReadSwfwSyncSmBits(0x40) == 0)) // FW_PHY_SM0 + { + WriteSwfwSyncSmBits(0x2); // SW_PHY_SM0 + ebbrt::kprintf("SW_PHY_SMO written\n"); + good = true; + } else if ((ReadSwfwSyncSmBits(0x4) == 0) // SW_PHY_SM1 + && (ReadSwfwSyncSmBits(0x80) == 0)) // FW_PHY_SM1 + { + WriteSwfwSyncSmBits(0x4); // SW_PHY_SM1 + ebbrt::kprintf("SW_PHY_SM1 written\n"); + good = true; + } + + SwfwSemRelease(); + + if (!good) { + ebbrt::kprintf("%s: failed, trying again\n", __FUNCTION__); + ebbrt::clock::SleepMilli(20); + goto again; + } + + return true; +} +void ebbrt::IxgbeDriver::SwfwUnlockPhy() { + if (!SwfwSemAcquire()) { + ebbrt::kabort("SwfwSemAcquire failed\n"); + } else { + ebbrt::kprintf("SWSM Sem acquired\n"); + } + + if (ReadStatusLanId() == 0) { + WriteSwfwSyncSmBits2(~0x2); // SW_PHY_SM0 + } else { + WriteSwfwSyncSmBits2(~0x4); // SW_PHY_SM1 + } + + SwfwSemRelease(); + + ebbrt::clock::SleepMilli(10); +} + +void ebbrt::IxgbeDriver::StopDevice() { + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + + // disable rx + WriteRxctrl(0x0); + + // disable tx + WriteDmatxctl(0xFFFFFFFE); + + // disable interrupts + //WriteEimc(0x7FFFFFFF); + WriteEimc(0xFFFFFFFF); + ReadEicr(); + + // disable each rx and tx queue + for (auto i = 0; i < 128; i++) { + // Bit 26, transmit software flush + WriteTxdctl(i, 0x04000000); + + if (i < 64) { + WriteRxdctl_1(i, 0x0); + } else { + WriteRxdctl_2(i - 64, 0x0); + } + } + + // from arrakis + ebbrt::clock::SleepMilli(2); + + // Master disable procedure + WriteCtrl(0x4); // PCIe Master Disable + while (ReadStatusPcieMes() != 1) + ; + ebbrt::kprintf("Ixgbe 82599 stop done\n"); +} + +void ebbrt::IxgbeDriver::GlobalReset() { + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + + WriteCtrl(0x8); // Link Reset + WriteCtrl(0x4000000); // Device Reset + + // Note: To 
ensure that a global device reset has fully completed and that the + // 82599 responds to subsequent accesses, programmers must wait + // before approximately 1 ms after setting attempting to check + // if the bit has cleared or to access (read or write) any other device + // register. + ebbrt::clock::SleepMilli(2); + ReadCtrl(); +} + +/** + * ixgbe_init_hw_generic - Generic hardware initialization + * @hw: pointer to hardware structure + * + * Initialize the hardware by resetting the hardware, filling the bus info + * structure and media type, clears all on chip counters, initializes receive + * address registers, multicast table, VLAN filter table, calls routine to set + * up link and flow control settings, and leaves transmit and receive units + * disabled and uninitialized + **/ +void ebbrt::IxgbeDriver::Init() { + uint64_t d_mac; + + + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + bar0_.Map(); // allocate virtual memory + ebbrt::clock::SleepMilli(200); + ebbrt::kprintf("Sleep 200 ms\n"); + + StopDevice(); + GlobalReset(); + ebbrt::clock::SleepMilli(50); + GlobalReset(); + ebbrt::clock::SleepMilli(250); + + // disable interrupts + //WriteEimc(0x7FFFFFFF); + WriteEimc(0xFFFFFFFF); + ReadEicr(); + + // Let firmware know we have taken over + //WriteCtrlExt(0x1 << 28); // DRV_LOAD + WriteCtrlExt(0x10010000); // DRV_LOAD and NS_DIS + + //If legacy descriptors are used, this bit should be set to 1b. + // No snoop disable from FreeBSD ?? 
+//#ifndef RSC_EN +// WriteCtrlExt(0x1 << 16); // NS_DIS +//#endif + + // Initialize flow-control registers + for (auto i = 0; i < 8; i++) { + if (i < 4) { + WriteFcttv(i, 0x0); + } + WriteFcrtl(i, 0x0); + WriteFcrth(i, 0x0); + } + + WriteFcrtv(0x0); + WriteFccfg(0x0); + + // Initialize Phy + PhyInit(); + + // Wait for EEPROM auto read + while (ReadEecAutoRd() == 0) { + }; // TODO: Timeout + ebbrt::kprintf("EEPROM auto read done\n"); + + ebbrt::clock::SleepMilli(200); + d_mac = ReadRal(0) | ((uint64_t)ReadRah(0) << 32); + // ebbrt::kprintf("mac %p valid = %x\n", d_mac, ReadRahAv(0)); + for (auto i = 0; i < 6; i++) { + mac_addr_[i] = (d_mac >> (i * 8)) & 0xFF; + } + ebbrt::kprintf( + "Mac Address: %02X:%02X:%02X:%02X:%02X:%02X\n", + static_cast(mac_addr_[0]), static_cast(mac_addr_[1]), + static_cast(mac_addr_[2]), static_cast(mac_addr_[3]), + static_cast(mac_addr_[4]), static_cast(mac_addr_[5])); + + // Wait for DMA initialization + while (ReadRdrxctlDmaidone() == 0) { + }; // TODO: Timeout + + // Wait for link to come up + while (!ReadLinksLinkUp()) { + }; // TODO: timeout + ebbrt::kprintf("Link is up\n"); + ebbrt::clock::SleepMilli(50); + + // clears on read + WriteEicr(0xFFFFFFFF); + + /* + * use EIAM to auto-mask when MSI-X interrupt is asserted + * this saves a register write for every interrupt + */ + //WriteEiam(0, 0xFFFFFFFF); + //WriteEiam(1, 0xFFFFFFFF); + + /* setup msix */ + // switch to msix mode + WriteGpie(0x1 << 4 | 0x1 << 5 | 0x1 << 31 | 0x1 << 30); // Multiple_MSIX + //WriteGpie(0x1 << 5); // OCD + //WriteGpie(0x1 << 31); // PBA_support + // Enable auto masking of interrupt + //WriteGpie(0x1 << 30); // EIAME + + // TODO: Set up management interrupt handler + + //WriteGpie(0xC0000036); + //uint32_t gpie = 0xC0000036 | (0x7 << 11); + //uint32_t gpie = 0xC0000036 | (IxgbeDriver::RSC_DELAY << 11); + //WriteGpie(gpie); + //ebbrt::kprintf_force("RSC enabled, RSC_DELAY = %d us\n", (((gpie >> 11) & 0x7)+1)*4); + +//#ifdef RSC_EN + // TODO: RSC delay value, 
just a guess at (1 + 1) * 4us = 8 us + // Recommended value based on 7.3.2.1.1 + //WriteGpie(IxgbeDriver::RSC_DELAY << 11); + //ebbrt::kprintf_force("RSC enabled, RSC_DELAY = %d\n", (IxgbeDriver::RSC_DELAY + 1) * 4); +//#endif + + /* FreeBSD: + * ixgbe_common.c - s32 ixgbe_init_rx_addrs_generic(struct ixgbe_hw *hw) + * Places the MAC address in receive address register 0 and clears the rest + * of the receive address registers. Clears the multicast table. Assumes + * the receiver is in reset when the routine is called. + */ + // Initialize RX filters + + /* Zero out the other receive addresses. */ + for (auto i = 1; i < 128; i++) { + WriteRal(i, 0x0); + WriteRah(i, 0x0); + } + + // clear mta + for (auto i = 0; i < 128; i++) { + WriteMta(i, 0x0); + } + + // No init uta tables? + + // set vlan filter table + for (auto i = 0; i < 128; i++) { + WriteVfta(i, 0x0); + } + + for (auto i = 0; i < 64; i++) { + // WritePfvlvf(i, 0x1 << 31); // VI_En bit 31 + WritePfvlvf(i, 0x0); + WritePfvlvfb(i, 0x0); + // WritePsrtypeZero(0x0); + } + + // PF Unicast Table Array + for (auto i = 0; i < 128; i++) { + WritePfuta(i, 0x0); + } + + // not sure why initing these tables? + for (auto i = 0; i < 128; i++) { + WriteFhft_1(i, 0x0); + if (i < 64) { + WriteFhft_2(i, 0x0); + } + } + + // enable ECC Reporting TODO - causes interrupts to be broken?? + // WriteRxfeccerr0(0x1 << 9); + + /**** Initialize RX filters ****/ + // FreeBSD if_ix.c - ixgbe_initialize_receive_units - Enable broadcast accept + WriteFctrl(0x1 << 10); // Set BAM = 1 + + // TODO VLNCTRL + WriteMcstctrl(0x0); + +//#ifndef RSC_EN + //WriteRxcsum(0x1 << 12); // IP payload checksum enable + //WriteRxcsum(0x3 << 12); // IP payload checksum enable + WriteRxcsum(0x3000); +//#else + // note: PCSD: The Fragment Checksum and IP Identification fields are mutually exclusive with + // the RSS hash. Only one of the two options is reported in the Rx descriptor. 
+// WriteRxcsum(0x2000); +//#endif +// TODO RQTC + +#ifdef RSC_EN + WriteRfctl(0xC0); +#else + WriteRfctl(0xE0); +#endif + + for (auto i = 0; i < 256; i++) { + WriteMpsar(i, 0x0); + } + + // !! Support for RSS is not provided when legacy receive descriptor format is used. + + // RSSRK - random seeds taken from Linux + WriteRssrk(0, 0xA38DD80F); + WriteRssrk(1, 0xD107C3DC); + WriteRssrk(2, 0x8CEB19C4); + WriteRssrk(3, 0xA41E1B6B); + WriteRssrk(4, 0xB7218638); + WriteRssrk(5, 0x6B8B6155); + WriteRssrk(6, 0xDC8D08B5); + WriteRssrk(7, 0xD2E8684B); + WriteRssrk(8, 0xECEF8417); + WriteRssrk(9, 0xE56C84D5); + + // Fill in RSS redirection table (128 entries), sets which core the lowest 7 bits of hashed output goes to + // hacky atm + // memcached-silo -- avoid firing interrupts on core 15 + /*uint32_t i = 0; + WriteReta(i+0, 0x03020100); + WriteReta(i+1, 0x07060504); + WriteReta(i+2, 0x0B0A0908); + WriteReta(i+3, 0x000E0D0C); + WriteReta(i+4, 0x04030201); + WriteReta(i+5, 0x08070605); + WriteReta(i+6, 0x0C0B0A09); + WriteReta(i+7, 0x01000E0D); + + WriteReta(i+8, 0x05040302); + WriteReta(i+9, 0x09080706); + WriteReta(i+10, 0x0D0C0B0A); + WriteReta(i+11, 0x0201000E); + WriteReta(i+12, 0x06050403); + WriteReta(i+13, 0x0A090807); + WriteReta(i+14, 0x0E0D0C0B); + WriteReta(i+15, 0x03020100); + + WriteReta(i+16, 0x07060504); + WriteReta(i+17, 0x0B0A0908); + WriteReta(i+18, 0x000E0D0C); + WriteReta(i+19, 0x04030201); + WriteReta(i+20, 0x08070605); + WriteReta(i+21, 0x0C0B0A09); + WriteReta(i+22, 0x01000E0D); + WriteReta(i+23, 0x05040302); + + WriteReta(i+24, 0x09080706); + WriteReta(i+25, 0x0D0C0B0A); + WriteReta(i+26, 0x0201000E); + WriteReta(i+27, 0x06050403); + WriteReta(i+28, 0x0A090807); + WriteReta(i+29, 0x0E0D0C0B); + WriteReta(i+30, 0x03020100); + WriteReta(i+31, 0x07060504); + */ + uint32_t ncore = static_cast(Cpu::Count()); + for (auto i = 0; i < 32; i += 4) { + // all route to core 0 + if(ncore == 1) { + WriteReta(i, 0x0000000); + WriteReta(i+1, 0x0000000); + 
WriteReta(i+2, 0x0000000); + WriteReta(i+3, 0x0000000); + } else if(ncore == 2) { + WriteReta(i, 0x1010100); + WriteReta(i+1, 0x1010100); + WriteReta(i+2, 0x1010100); + WriteReta(i+3, 0x1010100); + } else if(ncore == 4) { + WriteReta(i, 0x3020100); + WriteReta(i+1, 0x3020100); + WriteReta(i+2, 0x3020100); + WriteReta(i+3, 0x3020100); + } else if(ncore == 6) { + WriteReta(i, 0x03020100); + WriteReta(i+1, 0x01000504); + WriteReta(i+2, 0x05040302); + WriteReta(i+3, 0x04030201); + } else if(ncore == 8) { + WriteReta(i, 0x3020100); + WriteReta(i+1, 0x7060504); + WriteReta(i+2, 0x3020100); + WriteReta(i+3, 0x7060504); + } else if(ncore == 16){ + // memcached + //WriteReta(i+0, 0x03020100); + //WriteReta(i+1, 0x07060504); + //WriteReta(i+2, 0x0B0A0908); + //WriteReta(i+3, 0x0F0E0D0C); + + // nodejs -- all on core 1 + ebbrt::kprintf_force("*** NodeJS firing all on core 1\n"); + WriteReta(i+0, 0x01010101); + WriteReta(i+1, 0x01010101); + WriteReta(i+2, 0x01010101); + WriteReta(i+3, 0x01010101); + } else { + ebbrt::kabort("%s: Can only redirect interrupts to 16 cores\n", __FUNCTION__); + } + } + + for (auto i = 0; i < 128; i++) { + WriteFtqf(i, 0x0); + WriteSaqf(i, 0x0); + WriteDaqf(i, 0x0); + WriteSdpqf(i, 0x0); + } + + // TODO SYNQF + // TODO ETQF + // TODO ETQS + + // Make sure RX CRC strip enabled in HLREG0 and RDRXCTL + WriteRdrxctlRSCFRSTSIZE(~(0x1F << 17)); // s/w set to 0 + WriteRdrxctl(0x1 << 1); // CRCStrip + //WriteHlreg0(0x1 << 1); // CRCStrip + WriteHlreg0(0x2FFF); // CRCStrip, Enable Jumbo Packets - Linux Default + WriteRdrxctl(0x1 << 25); // RSCACKC s/w set to 1 + WriteRdrxctl(0x1 << 26); // FCOE_WRFIX s/w set to 1 + + // Disable RSC for ACK Packets, disables the coalescing of TCP packets without TCP payload. + // This bit should be set if performance problems are found. + WriteRscdbu(0xa0); + + /***** END RX FILTER *****/ + + // Configure buffers etc. 
according to specification + // Section 4.6.11.3.4 (no DCB, no virtualization) + + /* Transmit Init: Set RTTDCS.ARBDIS to 1b. + * Program DTXMXSZRQ, TXPBSIZE, TXPBTHRESH, MTQC, and MNGTXMAP, according + * to the DCB and virtualization modes (see Section 4.6.11.3). + * Clear RTTDCS.ARBDIS to 0b. + */ + //WriteRttdcs(0x1 << 6); + WriteRttdcs(0xC00040); + WriteDtxmxszrq(MAX_BYTES_NUM_REQ); + WriteTxpbsize(0, 0xA0 << 10); + WriteTxpbThresh(0, 0xA0); + for (auto i = 1; i < 8; i++) { + WriteTxpbsize(i, 0x0); + WriteTxpbThresh(i, 0x0); + } + WriteMtqc(0x0); + WriteMngtxmap(0x0); + WriteRttdcs(0xC00000); + //WriteRttdcsArbdisEn(~(0x1 << 6)); + + /* Receive Init: Program RXPBSIZE, MRQC, PFQDE, RTRUP2TC, MFLCN.RPFCE, + * and MFLCN.RFCE according to the DCB and virtualization modes + */ + WriteRxpbsize(0, 0x200 << 10); + for (auto i = 1; i < 8; i++) { + WriteRxpbsize(i, 0x0); + } + WriteMrqc(0x330001); + + WritePfqde(0x0); + WriteRtrup2tc(0x0); + WriteMflcn(0x0 << 2); + WriteMflcn(0x1 << 3); + // end DCB off, VT off + + // MAXFRS + WriteMaxfrs(1518 << 16); + //WriteMaxfrs(4096 << 16); + + // disable relaxed ordering + /*for (auto i = 0; i < 128; i++) { + WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); // Txdesc_Wbro + + if (i < 64) { + WriteDcaRxctrl_1( + i, ~(0x1 << 15)); // Rx split header relax order enable, bit 15 + WriteDcaRxctrl_1( + i, ~(0x1 << 13)); // Rx data Write Relax Order Enable, bit 13 + } else { + WriteDcaRxctrl_2( + i - 64, ~(0x1 << 15)); // Rx split header relax order enable, bit 15 + WriteDcaRxctrl_2( + i - 64, ~(0x1 << 13)); // Rx data Write Relax Order Enable, bit 13 + } + }*/ + +#ifdef DCA_ENABLE + // DCA_MODE = DCA 1.0 + WriteDcaCtrl(0x1 << 1); +#endif +} + +void ebbrt::IxgbeDriver::FinishSetup() { + // No snoop disable from FreeBSD ?? 
+ WriteCtrlExt(0x10000); + //WriteCtrlExt(0x1 << 16); // NS_DIS + /*for (size_t i = 0; i < Cpu::Count(); i++) { + WriteDcaRxctrlClear(i, ~(0x1 << 12)); // clear bit 12 + }*/ + WriteEims(0xFFFF); +} + +// initializes per core rx/tx queues and interrupts +void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { + if (!rcv_vector) { + rcv_vector = + event_manager->AllocateVector([this]() { ebb_->ReceivePoll(); }); + } + + // allocate memory for descriptor rings + ixgmq[i].reset(new e10Kq(i, Cpu::GetMyNode())); + + // not going to set up receive descripts greater than 63 + ebbrt::kbugon(i >= 64, "can't set up descriptors greater than 63\n"); + + // update register RDBAL, RDBAH with receive descriptor base address + WriteRdbal_1(i, ixgmq[i]->rxaddr_ & 0xFFFFFFFF); + WriteRdbah_1(i, (ixgmq[i]->rxaddr_ >> 32) & 0xFFFFFFFF); + + // set to number of bytes allocated for receive descriptor ring + WriteRdlen_1(i, ixgmq[i]->rx_size_bytes_); + + // program srrctl register + WriteSrrctl_1(i, 0x2000402); // 3KB + //WriteSrrctl_1(i, 0x2000410); // 16KB + //WriteSrrctl_1(i, (0x1 << 25) | (0x4 << 8) | (IxgbeDriver::RXBUFSZ / 1024)); // desctype adv 001b, BSIZEHEADER = 0x7 * 64B, BSIZEPACKET= 0x4 * 1 KB + //WriteSrrctl_1(i, (0x1 << 25) | (0x4 << 8) | (3072 / 1024)); + + /*WriteSrrctlZero(i); + WriteSrrctl_1(i, RXBUFSZ / 1024); // bsizepacket + WriteSrrctl_1(i, (128 / 64) << 8); // bsizeheader + +// TODO headsplit adv +#ifdef RSC_EN + WriteSrrctl_1(i, 0x1 << 25); // desctype adv +#endif +#else + // legacy is default?? + WriteSrrctl_1(i, ~(0x7 << 25)); // desctype legacy + #endif + + WriteSrrctl_1(i, 0x1 << 28); // Drop_En*/ + +#ifdef RSC_EN + // RSC set up + /**** + Maximum descriptors per Large receive as follow: + 00b = Maximum of 1 descriptor per large receive. + 01b = Maximum of 4 descriptors per large receive. + 10b = Maximum of 8 descriptors per large receive. + 11b = Maximum of 16 descriptors per large receive. 
+ + (3:2) MAXDESC * SRRCTL.BSIZEPKT must not exceed 64 KB minus one, which is the + maximum total length in the IP header and must be larger than the expected + received MSS + *****/ + + WriteRscctl(i, 0x1 | (0x11 << 2)); // RSCEN=1, MAXDESC= (0x1) * SRRCTL.BSIZEPACKET < 64KB + WritePsrtype(i, 0x1 << 4); // 4.6.7.2.2 - PSR_type4 in PSRTYPE[n] should be set +#endif + + // In NON-IOV, only psrtype[0] is used + //if (i == 0) { + // WritePsrtypeZero(0x1330); + // } + + // Set head and tail pointers + WriteRdt_1(i, 0x0); + WriteRdh_1(i, 0x0); + + // Set Enable bit in receive queue + WriteRxdctl_1_enable(i, 0x1 << 25); + // TODO: Timeout + while (ReadRxdctl_1_enable(i) == 0) + ; + + // setup RX interrupts for queue i + dev_.SetMsixEntry(i, rcv_vector, ebbrt::Cpu::GetByIndex(i)->apic_id()); + + ebbrt::kprintf_force("Core %d: BSIZEPACKET=%d bytes NTXDESCS=%d NRXDESCS=%d ITR_INTERVAL=%dus RCV_VECTOR=%d APIC_ID=%d \n", i, RXBUFSZ, NTXDESCS, NRXDESCS, (int) (IxgbeDriver::ITR_INTERVAL * 2), (int)rcv_vector, (int)(ebbrt::Cpu::GetByIndex(i)->apic_id())); + + // don't set up interrupts for tx since we have head writeback?? + auto qn = i / 2; // put into correct IVAR + if ((i % 2) == 0) { // check if 2xN or 2xN + 1 + WriteIvarAlloc0(qn, i | 0x1 << 7); // rx interrupt allocation corresponds to index i * + // 2 in MSI-X table + //WriteIvarAllocval0(qn, 0x1 << 7); + WriteIvarAlloc0(qn, i << 8 | 0x1 << 15); + //WriteIvarAllocval0(qn, 0x1 << 15); + } else { + WriteIvarAlloc2(qn, i << 16 | 0x1 << 23); + //WriteIvarAllocval2(qn, 0x1 << 23); + WriteIvarAlloc2(qn, i << 24 | 0x1 << 31); + //WriteIvarAllocval2(qn, 0x1 << 31); + } + + // must be greater than rsc delay + WriteEitr(i, (IxgbeDriver::ITR_INTERVAL << 3) | IXGBE_EITR_CNT_WDIS); + // WriteEitr(i, 0x80 << 3); // 7 * 2us = 14 us + //WriteEitr(i, (32 << 3) | IXGBE_EITR_CNT_WDIS); + + // 7.3.1.4 - Note that there are no EIAC(1)...EIAC(2) registers. + // The hardware setting for interrupts 16...63 is always auto clear. 
+ if (i < 16) { + // enable auto clear + WriteEiac(0x1 << i); + } + + // enable interrupt + WriteEimsn(i / 32, (0x1 << (i % 32))); + + // make sure interupt is cleared + if (i < 16) { + WriteEicr(0x1 << i); + } + + // Enable RX + // disable RX_DIS + //WriteSecrxctrl_Rx_Dis(0x1 << 1); + WriteSecrxctrl_Rx_Dis(0x3); + // TODO Timeout + while (ReadSecrxstat_Sr_Rdy() == 0) + ; + WriteRxctrl(0x1); + // enable RX_DIS, disable aes encryption offload, power savings + WriteSecrxctrl_Rx_Dis(0x1); + + // add buffer to each descriptor + for (size_t j = 0; j < NRXDESCS-1; j++) { + auto rxphys = + reinterpret_cast((ixgmq[i]->circ_buffer_[j])->MutData()); + auto tail = ixgmq[i]->rx_tail_; + +// update buffer address for descriptor +/*#ifdef RSC_EN + rdesc_adv_rf_t* tmp; + tmp = reinterpret_cast(&(ixgmq[i]->rx_ring_[tail])); + + tmp->packet_buffer = rxphys; + // TODO only use this if enabling header splitting? + tmp->header_buffer = 0; +#else*/ + ixgmq[i]->rx_ring_[tail].buffer_address = rxphys; +//#endif + /*if(i == 0) { + ebbrt::kprintf("rx_ring_[tail=%u].buffer_address = 0x%X\n", tail, rxphys); + }*/ + + ixgmq[i]->rx_tail_ = (tail + 1) % ixgmq[i]->rx_size_; + } + + auto rxphys = + reinterpret_cast((ixgmq[i]->circ_buffer_[NRXDESCS-1])->MutData()); + ixgmq[i]->rx_ring_[ixgmq[i]->rx_tail_].buffer_address = rxphys; + + asm volatile("sfence" ::: "memory"); + // bump tail pts via register rdt to enable descriptor fetching by setting to + // length of ring minus one + WriteRdt_1(i, ixgmq[i]->rx_tail_); + +#ifdef DCA_ENABLE + auto myapic = ebbrt::Cpu::GetByIndex(i)->apic_id(); + + WriteDcaRxctrl(i, 0x1 << 5); // Descriptor DCA EN + WriteDcaRxctrl(i, 0x1 << 6); // Rx Header DCA EN + WriteDcaRxctrl(i, 0x1 << 7); // Payload DCA EN + + WriteDcaRxctrl(i, myapic << 24); // CPUID = apic id + + WriteDcaTxctrl(i, 0x1 << 5); // DCA Enable + WriteDcaTxctrl(i, myapic << 24); // CPUID = apic id +//#else +// ReadDcaTxctrl(i); +// ReadDcaRxctrl(i); +#endif + + // program base address registers + 
WriteTdbal(i, ixgmq[i]->txaddr_ & 0xFFFFFFFF); + WriteTdbah(i, (ixgmq[i]->txaddr_ >> 32) & 0xFFFFFFFF); + + // length must also be 128 byte aligned + WriteTdlen(i, ixgmq[i]->tx_size_bytes_); + +#ifdef TX_HEAD_WB + ebbrt::kprintf_force("TX_HEAD_WB Enabled\n"); + WriteTdwbal(i, (ixgmq[i]->txhwbaddr_ & 0xFFFFFFFF) | 0x1); + WriteTdwbah(i, (ixgmq[i]->txhwbaddr_ >> 32) & 0xFFFFFFFF); +#endif + + // enable transmit path: This step should be executed oly for the first enabled transmit queue and does + // not need to be repeated for any following queues. + if(i == 0) { + WriteDmatxctl_te(0x1); + } + + WriteTdh(i, 0x0); + + // transmit queue enable - PTHRESH=32 HTHRESH=1 WTHRESH=1 + WriteTxdctl(i, 0x2010120); + //WriteTxdctl(i, 0x2000000); + + // poll until set, TODO: Timeout + while (ReadTxdctl_enable(i) == 0) { + ebbrt::clock::SleepMilli(1); + } + + WriteTdt(i, 0x0); + ixgmq[i]->tx_tail_=0; + + // TODO: set up dca txctrl FreeBSD? + // clear TXdescWBROen + //WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); +} + +// Packet receive interrupt handler +void ebbrt::IxgbeDriverRep::ReceivePoll() { + uint32_t plen, i; + uint64_t rxflag; + rdesc_adv_wb_t* rx_desc; + uint32_t mcore = static_cast(Cpu::GetMine()); + uint32_t icnt; + uint64_t now = 0, last = 0; + uint64_t cjoules, cins, ccyc, crefcyc, cllc; + uint64_t c3, c6, c7; + //uint32_t eicr; + + //rxPollCnt[mcore]++; + c3 = c6 = c7 = 0; + + // hard coded for this processor to initialize PMC counter + if(ixgmq_.start_perf == false) { + uint32_t index, low, high; + uint64_t data; + + data = 0x333; + index = 0x38D; + low = (uint32_t)(data & 0xFFFFFFFF); + high = (data >> 32) & 0xFFFFFFFF; + asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); + + data = 0x43412E; + index = 0x186; + low = (uint32_t)(data & 0xFFFFFFFF); + high = (data >> 32) & 0xFFFFFFFF; + asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); + + data = 0x700000001; + index = 0x38F; + low = (uint32_t)(data & 0xFFFFFFFF); + high = (data >> 32) & 0xFFFFFFFF; + 
asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); + + ixgmq_.start_perf = true; + } + + if(ixgmq_.collect_stats) { + ccyc = 0; + cllc = 0; + icnt = ixgbe_stats[mcore].itr_cnt; +// ixgbe_stats[mcore].itr_cnt2 ++; + + if (icnt < IXGBE_LOG_SIZE) { + //get current tsc and store it + now = ebbrt::trace::rdtsc(); + kassert(now != 0); + + //eicr = ReadEicr(); + //ixgbe_logs[mcore][icnt].Fields.c3 = eicr; + + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.tsc), now); + //ixgbe_logs[mcore][icnt].Fields.tsc = now; + + // get last tsc + last = ixgbe_stats[mcore].itr_joules_last_tsc; + + // ~ 1 ms has passed + if ((now - last) > TSC_KHZ) { + cjoules = ixgmq_.powerMeter.ReadMsr(); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.joules), cjoules); + //ixgbe_logs[mcore][icnt].Fields.joules = cjoules; + + //c3 = ebbrt::msr::Read(0x3FC); + //c6 = ebbrt::msr::Read(0x3FD); + //c7 = ebbrt::msr::Read(0x3FE); + c7 = nsleep_states[mcore]; + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c3), c3); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c6), c6); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c7), c7); + //ixgbe_logs[mcore][icnt].Fields.c3 = c3; + //ixgbe_logs[mcore][icnt].Fields.c6 = c6; + //ixgbe_logs[mcore][icnt].Fields.c7 = c7; + + if (ixgmq_.start_perf) { + cins = ebbrt::msr::Read(0x309); + ccyc = ebbrt::msr::Read(0x30A); + crefcyc = ebbrt::msr::Read(0x30B); + cllc = ebbrt::msr::Read(0xC1); + + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ninstructions), cins); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ncycles), ccyc); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nref_cycles), crefcyc); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nllc_miss), cllc); + //ixgbe_logs[mcore][icnt].Fields.ninstructions = cins; + //ixgbe_logs[mcore][icnt].Fields.ncycles = ccyc; + //ixgbe_logs[mcore][icnt].Fields.nref_cycles = crefcyc; + //ixgbe_logs[mcore][icnt].Fields.nllc_miss = 
cllc; + } + + ixgbe_stats[mcore].itr_joules_last_tsc = now; + } + + //ixgbe_logs[mcore][icnt].Fields.rx_desc = ixgmq_.stat_num_rx_desc; + //ixgbe_logs[mcore][icnt].Fields.rx_bytes = ixgmq_.stat_num_rx_bytes; + //ixgbe_logs[mcore][icnt].Fields.tx_desc = ixgmq_.stat_num_tx_desc; + //ixgbe_logs[mcore][icnt].Fields.tx_bytes = ixgmq_.stat_num_tx_bytes; + __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.rx_desc), ixgmq_.stat_num_rx_desc); + __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.rx_bytes), ixgmq_.stat_num_rx_bytes); + __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.tx_desc), ixgmq_.stat_num_tx_desc); + __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.tx_bytes), ixgmq_.stat_num_tx_bytes); + + ixgmq_.stat_num_rx_bytes = 0; + ixgmq_.stat_num_rx_desc = 0; + ixgmq_.stat_num_tx_bytes = 0; + ixgmq_.stat_num_tx_desc = 0; + + ixgbe_stats[mcore].itr_cnt++; + nsleep_states[mcore] = 0; + } + } + + i = 0; + while(i < 64) { + rxflag = 0x0; + rx_desc = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); + + // Linux's ixgbe: + // This memory barrier is needed to keep us from reading + // any other fields out of the rx_desc until we know the + // descriptor has been written back + // + asm volatile("" ::: "memory"); + + // if no rx packets ready + if (!(rx_desc->dd)) { + return; + } + + // return buffers to hardware + if(ixgmq_.cleaned_count > IXGBE_RX_BUFFER_WRITE) { + while(ixgmq_.cleaned_count) { + // reset buffer + ixgmq_.rx_ring_[ixgmq_.rx_tail_].raw[0] = 0x0; + ixgmq_.rx_ring_[ixgmq_.rx_tail_].raw[1] = 0x0; + + // allocate new rx buffer + ixgmq_.circ_buffer_[ixgmq_.rx_tail_] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[ixgmq_.rx_tail_])->MutData()); + // update descriptor with new buffer adder + ixgmq_.rx_ring_[ixgmq_.rx_tail_].buffer_address = rxphys; + + ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + ixgmq_.cleaned_count --; + } + + ixgmq_.cleaned_count = 0; + + // 
Force memory writes to complete before letting h/w + //know there are new descriptors to fetch. (Only + // applicable for weak-ordered memory model archs, + // such as IA-64). + // + wmb(); + WriteRdt_1(mcore, ixgmq_.rx_tail_); + } + + // handle a single receive + if(rx_desc->eop) + { + ixgmq_.rx_desc_counts[1] ++; + + plen = rx_desc->pkt_len; + if(!plen) return; // Linux's ixgbe driver checks this case + + // TCP/UDP checksum + if (rx_desc->l4i) { + rxflag |= RXFLAG_L4CS; + if (!(rx_desc->l4e)) { + rxflag |= RXFLAG_L4CS_VALID; + } + } + // Ipv4 checksum + if (rx_desc->ipcs) { + rxflag |= RXFLAG_IPCS; + if (!(rx_desc->ipe)) { + rxflag |= RXFLAG_IPCS_VALID; + } + } + + // setup rx buffers + ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(plen); + auto b = std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_]); + ixgmq_.cleaned_count ++; + ixgmq_.stat_num_rx_desc ++; + i ++; + + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + ixgmq_.total_rx_bytes += b->ComputeChainDataLength(); + ixgmq_.stat_num_rx_bytes += b->ComputeChainDataLength(); + + // TODO add _mm_prefetch from Linux?? 
+ root_.itf_.Receive(std::move(b), rxflag); + } + else + { +#ifndef RSC_EN + ebbrt::kprintf_force("RSC packet in non-RSC mode\n"); + return; +#endif + + uint32_t rsc_count = 0; + + //RSC FIRST PACKET + plen = rx_desc->pkt_len; + if(!plen) { + ebbrt::kabort("**** RSC first packet Abort: core %u pkt_len == 0.\n", mcore); + return; + } + + //ebbrt::kprintf("\nRSC desc=%d next_desc=%d len=%d eop=%d\n", ixgmq_.rx_head_, rx_desc->next_descriptor_ptr, plen, rx_desc->eop); + + // setup rx buffers + ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(plen); + auto b = std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_]); + ixgmq_.cleaned_count ++; + i ++; + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + rsc_count ++; + + while(true) { + rx_desc = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); + retry: + // Linux's ixgbe: + // This memory barrier is needed to keep us from reading + // any other fields out of the rx_desc until we know the + // descriptor has been written back + asm volatile("" ::: "memory"); + + // if no rx packets ready + if (!(rx_desc->dd)) { + goto retry; + ebbrt::kprintf_force("**** RSC Abort: core %u rx_desc->dd == 0\n", mcore); + } + + plen = rx_desc->pkt_len; + if(!plen) { + ebbrt::kabort("***** RSC middle packetsAbort: core %u pkt_len == 0.\n", mcore); + return; + } + + // setup rx buffers + ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(plen); + b->PrependChain(std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_])); + ixgmq_.cleaned_count ++; + ixgmq_.stat_num_rx_desc ++; + i ++; + + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + rsc_count ++; + if(rx_desc->eop) { + rxflag = 0; + // TCP/UDP checksum + if (rx_desc->l4i) { + rxflag |= RXFLAG_L4CS; + if (!(rx_desc->l4e)) { + rxflag |= RXFLAG_L4CS_VALID; + } + } + // Ipv4 checksum + if (rx_desc->ipcs) { + rxflag |= RXFLAG_IPCS; + if (!(rx_desc->ipe)) { + rxflag |= RXFLAG_IPCS_VALID; + } + } + + ixgmq_.rx_desc_counts[rsc_count] ++; + break; + } + } + ixgmq_.total_rx_bytes += 
b->ComputeChainDataLength(); + ixgmq_.stat_num_rx_bytes += b->ComputeChainDataLength(); + root_.itf_.Receive(std::move(b), rxflag); + } + + } +} + +ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) + : root_(root), ixgmq_(root.GetMultiQueue(Cpu::GetMine())), + receive_callback_([this]() { ReceivePoll(); }) { + //this->ReceivePoll(); + //ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::fixed_cycles); + //ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::fixed_instructions); + + //ixgmq_.perfRefCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::fixed_reference_cycles); + //ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); + //ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); + //ixgmq_.perfRefCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::reference_cycles); + //ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); + + ixgmq_.powerMeter = ebbrt::rapl::RaplCounter(); +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StartTimer() { + auto timeout = std::chrono::seconds(1); + timer->Start(*this, timeout, true); +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StopTimer() { + timer->Stop(*this); +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::Fire() { + /*uint32_t mcore = static_cast(Cpu::GetMine()); + + ixgmq_.perfCycles.Stop(); + ixgmq_.perfInst.Stop(); + ixgmq_.perfLLC_miss.Stop(); + if(mcore == 0 || mcore == 1) { + ixgmq_.powerMeter.Stop(); + } + // accumulate counters + ixgmq_.totalCycles += static_cast(ixgmq_.perfCycles.Read()); + ixgmq_.totalIns += static_cast(ixgmq_.perfInst.Read()); + ixgmq_.totalLLCmisses += static_cast(ixgmq_.perfLLC_miss.Read()); + if(mcore == 0 || mcore == 1) { + ixgmq_.totalNrg += ixgmq_.powerMeter.Read(); + //ebbrt::kprintf_force("Core %u: Fire() cycles=%llu ins=%llu llc=%llu energy=%.2lfJ\n", mcore, ixgmq_.totalCycles, ixgmq_.totalIns, ixgmq_.totalLLCmisses, ixgmq_.totalNrg); + } + 
+ ixgmq_.perfCycles.Clear(); + ixgmq_.perfInst.Clear(); + ixgmq_.perfLLC_miss.Clear(); + + ixgmq_.perfCycles.Start(); + ixgmq_.perfInst.Start(); + ixgmq_.perfLLC_miss.Start(); + if(mcore == 0 || mcore == 1) { + ixgmq_.powerMeter.Start(); + } + ixgmq_.fireCount += 1; */ + //ebbrt::kprintf_force("Core %u: Fire() %llu\n", mcore, ixgmq_.fireCount); +} + +uint32_t ebbrt::IxgbeDriverRep::ReadEicr() { + auto reg = root_.bar0_.Read32(0x00800); + return reg & 0xFFFFFFFF; +} + +uint16_t ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x01010 + 0x40 * n); + return reg & 0xFFFF; +} +uint16_t ebbrt::IxgbeDriverRep::ReadRdt_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x01018 + 0x40 * n); + return reg & 0xFFFF; +} + +void ebbrt::IxgbeDriverRep::WriteRdt_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x01018 + 0x40 * n, m); +} + +void ebbrt::IxgbeDriverRep::Run() { + while (1) { + ReceivePoll(); + } +} +void ebbrt::IxgbeDriverRep::WriteTdt_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x06018 + 0x40 * n, m); +} + +// 8.2.3.5.9 Extended Interrupt Mask Clear Registers — EIMC[n] +// (0x00AB0 + 4*(n-1), n=1...2; WO) +void ebbrt::IxgbeDriverRep::WriteEimcn(uint32_t n, uint32_t m) { + auto reg = root_.bar0_.Read32(0x00AB0 + 4 * n); + root_.bar0_.Write32(0x00AB0 + 4 * n, reg | m); +} + +// 8.2.3.5.4 Extended Interrupt Mask Clear Register- EIMC (0x00888; WO) +void ebbrt::IxgbeDriverRep::WriteEimc(uint32_t m) { root_.bar0_.Write32(0x00888, m); } + +// 8.2.3.5.3 Extended Interrupt Mask Set/Read Register- EIMS (0x00880; RWS) +void ebbrt::IxgbeDriverRep::WriteEims(uint32_t m) { root_.bar0_.Write32(0x00880, m); } + +uint32_t ebbrt::IxgbeDriverRep::ReadTdh_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x06010 + 0x40 * n); + return reg & 0xFFFF; +} +uint32_t ebbrt::IxgbeDriverRep::ReadTdt_1(uint32_t n) { + return root_.bar0_.Read32(0x06018 + 0x40 * n) & 0xFFFF; +} + +void dumpPacketContents(uint8_t* p1, uint64_t len) { + uint64_t i, j; + + 
ebbrt::kprintf_force("dumpPacketContents() len=%u\n", len); + for (i = 0; i < len; i+=8) { + if (i+8 < len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } else { + for(j=i;j(Cpu::GetMine()); + if(s == "rx_usecs") { + ixgmq[i]->itr_val = v; + WriteEitr(i, (ixgmq[i]->itr_val << 3) | IXGBE_EITR_CNT_WDIS); + ebbrt::kprintf_force("%u: rx-usecs = %u\n", i, ixgmq[i]->itr_val*2); + } else if(s == "rapl") { + if(i == 0 || i == 1) { + ixgmq[i]->powerMeter.SetLimit(v); + ebbrt::kprintf_force("%u: rapl = %u\n", i, v); + } + } else if(s == "sleep_state") { + sleep_state[i] = v; + ebbrt::kprintf_force("IxgbeDriver sleep_state[%u] = 0x%x\n", i, v); + } else if(s == "start_stats") { + //ebbrt::kprintf_force("start_stats on core %u\n", v); + ixgmq[v]->collect_stats = true; + //ixgmq[v]->perfCycles.Start(); + //ixgmq[v]->perfRefCycles.Start(); + //ixgmq[v]->perfInst.Start(); + //ixgmq[v]->perfLLC_miss.Start(); + ixgmq[v]->powerMeter.Start(); + } + else if(s == "stop_stats") { + //ebbrt::kprintf_force("stop_stats on core %u\n", v); + ixgmq[v]->collect_stats = false; + //ixgmq[v]->perfCycles.Stop(); + //ixgmq[v]->perfRefCycles.Stop(); + //ixgmq[v]->perfInst.Stop(); + //ixgmq[v]->perfLLC_miss.Stop(); + ixgmq[v]->powerMeter.Stop(); + + } else if(s == "clear_stats") { + //ebbrt::kprintf_force("clear_stats on core %u\n", v); + //ixgmq[v]->perfCycles.Clear(); + //ixgmq[v]->perfRefCycles.Clear(); + //ixgmq[v]->perfInst.Clear(); + //ixgmq[v]->perfLLC_miss.Clear(); + ixgmq[v]->powerMeter.Clear(); + + //memset(ixgbe_logs[v], 0, sizeof(IXGBE_LOG_SIZE * sizeof(union IxgbeLogEntry))); + for (uint32_t i = 0; i < ixgbe_stats[v].itr_cnt; i++) { + ixgbe_logs[v][i].Fields.rx_desc=0; + ixgbe_logs[v][i].Fields.rx_bytes=0; + ixgbe_logs[v][i].Fields.tx_desc=0; + ixgbe_logs[v][i].Fields.tx_bytes=0; + ixgbe_logs[v][i].Fields.ninstructions=0; + ixgbe_logs[v][i].Fields.ncycles=0; + ixgbe_logs[v][i].Fields.nref_cycles=0; + 
ixgbe_logs[v][i].Fields.nllc_miss=0; + ixgbe_logs[v][i].Fields.c3=0; + ixgbe_logs[v][i].Fields.c6=0; + ixgbe_logs[v][i].Fields.c7=0; + ixgbe_logs[v][i].Fields.joules=0; + ixgbe_logs[v][i].Fields.tsc=0; + } + ixgbe_stats[v].itr_joules_last_tsc = 0; + ixgbe_stats[v].itr_cnt =0; + //ixgbe_stats[v].itr_cnt2 =0; + ixgbe_stats[v].rdtsc_start = 0; + ixgbe_stats[v].rdtsc_end = 0; + ixgbe_stats[v].repeat =0; + ixgbe_stats[v].dvfs =0; + ixgbe_stats[v].rapl =0; + ixgbe_stats[v].itr =0; + ixgbe_stats[v].iter =0; + + // clear up counters + memset(nsleep_states, 0, sizeof(nsleep_states)); + /*memset(processCnt, 0, sizeof(processCnt)); + memset(swEventCnt, 0, sizeof(swEventCnt)); + memset(idleEventCnt, 0, sizeof(idleEventCnt)); + memset(rxPollCnt, 0, sizeof(rxPollCnt)); + memset(processInterruptCntAll, 0, sizeof(processInterruptCntAll)); + memset(processInterruptCntA, 0, sizeof(processInterruptCntA)); + memset(processInterruptCntB, 0, sizeof(processInterruptCntB)); + memset(processInterruptCntC, 0, sizeof(processInterruptCntC)); + memset(passTokenCnt, 0, sizeof(passTokenCnt)); + memset(receiveTokenCnt, 0, sizeof(receiveTokenCnt)); + memset(genFireCnt, 0, sizeof(genFireCnt)); + memset(fireCntA, 0, sizeof(fireCntA)); + memset(fireCntB, 0, sizeof(fireCntB));*/ + + } else { + ebbrt::kprintf_force("%s Unknown command: %s\n", __FUNCTION__, s.c_str()); + } +} diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h new file mode 100644 index 00000000..6cdbfc5f --- /dev/null +++ b/src/native/IxgbeDriver.h @@ -0,0 +1,602 @@ +// Copyright Boston University SESA Group 2013 - 2017. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +#ifndef BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ +#define BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ + +#include "../Align.h" +#include "../MulticoreEbb.h" +#include "../SpinLock.h" +#include "../StaticIOBuf.h" +#include "../UniqueIOBuf.h" +#include "Debug.h" +#include "Fls.h" +#include "Ixgbe.h" +#include "Net.h" +#include "PageAllocator.h" +#include "Pci.h" +#include "Pfn.h" +#include "SlabAllocator.h" +#include "Perf.h" +#include "Rapl.h" +#include "Trace.h" + +// Receive Side Scaling (RSC) enabled +//#define RSC_EN +// Direct Cache Access (DCA) enabled +//#define DCA_ENABLE +// Transmit Header Writeback enabled +//#define TX_HEAD_WB +//#define JUMBO_EN + +// Collect Statistics Flag +//#define STATS_EN +//#define MAX_DESC + +union IxgbeLogEntry { + long long data[11]; + struct { + long long tsc; + long long ninstructions; + long long ncycles; + long long nref_cycles; + long long nllc_miss; + long long joules; + long long c3; + long long c6; + long long c7; + + int rx_desc; + int rx_bytes; + int tx_desc; + int tx_bytes; + } __attribute((packed)) Fields; +} __attribute((packed)); + +#define IXGBE_CACHE_LINE_SIZE 64 +#define IXGBE_LOG_SIZE 4000000U +#define TSC_KHZ 2899999 + +struct IxgbeLog { + uint64_t itr_joules_last_tsc; + uint64_t rdtsc_start; + uint64_t rdtsc_end; + uint32_t itr_cnt; + uint32_t itr_cnt2; + uint32_t repeat; + uint32_t dvfs; + uint32_t rapl; + uint32_t itr; + uint32_t iter; +} __attribute__((packed, aligned(IXGBE_CACHE_LINE_SIZE))); + +extern struct IxgbeLog ixgbe_stats[16]; +extern union IxgbeLogEntry *ixgbe_logs[16]; +extern std::unique_ptr bsendbufs[16]; +//extern uint64_t rxPollCnt[16]; + +namespace ebbrt { + +class IxgbeDriverRep; + +class IxgbeDriver : public EthernetDevice { + public: + explicit IxgbeDriver(pci::Device& dev) + : itf_(network_manager->NewInterface(*this)), dev_(dev), + bar0_(dev.GetBar(0)) { + dev_.SetBusMaster(true); + + 
// set up interrupts, polling won't work after this + auto msix = dev_.MsixEnable(); + kbugon(!msix, "Ixgbe without msix is unsupported\n"); + + // each core gets a queue struct + ixgmq.resize(Cpu::Count()); + } + + static void Create(pci::Device& dev); + static bool Probe(pci::Device& dev) { + if (dev.GetVendorId() == kIxgbeVendorId && + dev.GetDeviceId() == kIxgbeDeviceId && dev.GetFunc() == 0) { + IxgbeDriver::Create(dev); + return true; + } + return false; + } + + //void Run(); + void Send(std::unique_ptr buf, PacketInfo pinfo) override; + //void SendUdp(std::unique_ptr buf, uint64_t len) override; + //void SendTCPUnchained(std::unique_ptr buf, uint64_t len) override; + //void SendTCPUnchained(std::unique_ptr buf, uint64_t len) override; + + void Config(std::string s, uint32_t v) override; + std::string ReadNic() override; + const EthernetAddress& GetMacAddress() override; + + protected: + + static const constexpr uint16_t kIxgbeVendorId = 0x8086; + static const constexpr uint16_t kIxgbeDeviceId = 0x10FB; + + /* FreeBSD: + * RxDescriptors Valid Range: 64-4096 Default Value: 256 This value is the + * number of receive descriptors allocated for each RX queue. Increasing this + * value allows the driver to buffer more incoming packets. Each descriptor + * is 16 bytes. A receive buffer is also allocated for each descriptor. + * + * Note: with 8 rings and a dual port card, it is possible to bump up + * against the system mbuf pool limit, you can tune nmbclusters + * to adjust for this. 
+ */ + +#ifdef MAX_DESC + static const constexpr uint32_t NTXDESCS = 8192; + static const constexpr uint32_t NRXDESCS = 8192; +#else + static const constexpr uint32_t NTXDESCS = 512; + static const constexpr uint32_t NRXDESCS = 512; + //static const constexpr uint32_t NTXDESCS = 4096; + //static const constexpr uint32_t NRXDESCS = 4096; +#endif + + // Linux Defaults + static const constexpr uint32_t RXBUFSZ = 2048; + //static const constexpr uint32_t RXBUFSZ = 8192; + static const constexpr uint32_t BSIZEHEADER = 256; + + //static const constexpr uint32_t RXBUFSZ = 4096; + //static const constexpr uint32_t RXBUFSZ = 8192; + //static const constexpr uint32_t RXBUFSZ = 16384; + + // 8 bits (3 - 11) in (ITR_INTERVAL * 2 us) + //static const constexpr uint8_t ITR_INTERVAL = 32; + static const constexpr uint8_t ITR_INTERVAL = 8; + + // 3 bits only (0 - 7) in (RSC_DELAY + 1) * 4 us + static const constexpr uint8_t RSC_DELAY = 7; + + // DMA Tx TCP Max Allow Size Requests — DTXMXSZRQ + //static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0x10; + static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0xFFF; + + // Class with per core queue data structures + class e10Kq { + public: + e10Kq(uint32_t idx, Nid nid) + : rx_head_(0), rx_tail_(0), rx_size_(NRXDESCS), tx_tail_(0), + tx_last_tail_(0), tx_size_(NTXDESCS), idx_(idx), rxflag_(0), + rsc_used(false), hanc{0} { + + circ_buffer_.reserve(NRXDESCS+1); + for (uint32_t k = 0; k < NRXDESCS+1; k++) { + circ_buffer_.emplace_back(MakeUniqueIOBuf(RXBUFSZ, true)); + } + + // keep a log of number of idle times + idle_times_.reserve(NRXDESCS); + + // keep track of context descriptors; resize (not reserve) so the indexed writes below hit real elements + tx_iseop.resize(NRXDESCS); + for (uint32_t k = 0; k < NRXDESCS; k++) { + tx_iseop[k] = false; + } + + // RX ring buffer allocation + auto sz = align::Up(sizeof(rdesc_legacy_t) * NRXDESCS, 4096); + auto order = Fls(sz - 1) - pmem::kPageShift + 1; + auto page = page_allocator->Alloc(order, nid); + kbugon(page == Pfn::None(), "ixgbe: page allocation failed 
in %s", + __FUNCTION__); + auto addr = reinterpret_cast(page.ToAddr()); + memset(addr, 0, sz); + rx_ring_ = static_cast(addr); + + // TX ring buffer allocation + sz = align::Up(sizeof(tdesc_legacy_t) * NTXDESCS, 4096); + order = Fls(sz - 1) - pmem::kPageShift + 1; + page = page_allocator->Alloc(order, nid); + kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", + __FUNCTION__); + addr = reinterpret_cast(page.ToAddr()); + memset(addr, 0, sz); + tx_ring_ = static_cast(addr); + +#ifdef TX_HEAD_WB + // TODO: not sure how much exactly to allocate for head wb addr + tx_head_ = (uint32_t*)malloc(4 * sizeof(uint32_t)); + memset(tx_head_, 0, 4 * sizeof(uint32_t)); + txhwbaddr_ = reinterpret_cast(tx_head_); + // txhwbaddr must be byte aligned + ebbrt::kbugon((txhwbaddr_ & 0x3) != 0, "txhwbaddr not byte aligned\n"); + kassert((txhwbaddr_ & 0x3) == 0); +#else + tx_head_ = 0; +#endif + + // get starting address, need to write to device registers + rxaddr_ = reinterpret_cast(rx_ring_); + txaddr_ = reinterpret_cast(tx_ring_); + rx_size_bytes_ = sizeof(rdesc_legacy_t) * NRXDESCS; + tx_size_bytes_ = sizeof(tdesc_legacy_t) * NTXDESCS; + + // must be 128 byte aligned + ebbrt::kbugon((rxaddr_ & 0x7F) != 0, "rx_addr_ not 128 byte aligned\n"); + ebbrt::kbugon((txaddr_ & 0x7F) != 0, "tx_addr_ not 128 byte aligned\n"); + ebbrt::kbugon((rx_size_bytes_ & 0x7F) != 0, + "rx_size_bytes_ not 128 byte aligned\n"); + ebbrt::kbugon((tx_size_bytes_ & 0x7F) != 0, + "tx_size_bytes_ not 128 byte aligned\n"); + + tx_desc_counts.reserve(100); + rx_desc_counts.reserve(100); + for(int i=0;i<100;i++) { + tx_desc_counts.emplace_back(0); + rx_desc_counts.emplace_back(0); + } + } + + uint32_t rx_head_; + uint32_t rx_tail_; + uint32_t rx_size_; + uint32_t tx_tail_; + uint32_t tx_last_tail_; + uint32_t tx_size_; + uint32_t idx_; + uint32_t rx_size_bytes_; + uint32_t tx_size_bytes_; + uint64_t rxaddr_; + uint64_t txaddr_; + uint64_t txhwbaddr_; + uint64_t rxflag_; + uint64_t cleaned_count{0}; + + 
std::vector> circ_buffer_; + std::unordered_map idle_times_; + std::vector tx_iseop; + std::ostringstream str_stats; + //std::vector send_to_watch; + + rdesc_legacy_t* rx_ring_; + tdesc_legacy_t* tx_ring_; + + //std::vector tx_isctx; + //bool* tx_isctx_; + bool rsc_used; + int hanc; +#ifdef TX_HEAD_WB + uint32_t* tx_head_; +#else + uint32_t tx_head_; +#endif + + // stats + int stat_num_rx_desc{0}; + int stat_num_tx_desc{0}; + int stat_num_rx_bytes{0}; + int stat_num_tx_bytes{0}; + uint64_t total_tx_bytes{0}; + uint64_t total_rx_bytes{0}; + + uint64_t time_us{0}; + uint64_t time_send{0}; + uint64_t time_idle_min{999999}; + uint64_t time_idle_max{0}; + uint64_t total_idle_time{0}; + uint64_t totalInterrupts{0}; + uint64_t totalCycles{0}; + uint64_t totalIns{0}; + uint64_t totalLLCmisses{0}; + uint64_t fireCount{0}; + uint32_t rapl_val{666}; + uint32_t itr_val{8}; + std::chrono::nanoseconds itr_joules_last_ts{0}; + bool collect_stats{false}; + bool start_perf{false}; + + std::vector tx_desc_counts; + std::vector rx_desc_counts; + double totalNrg{0.0}; + double totalTime{0.0}; + double totalPower{0.0}; + + bool stat_start{false}; + bool stat_init{false}; + ebbrt::perf::PerfCounter perfCycles; + ebbrt::perf::PerfCounter perfInst; + ebbrt::perf::PerfCounter perfRefCycles; + ebbrt::perf::PerfCounter perfLLC_ref; + ebbrt::perf::PerfCounter perfLLC_miss; + ebbrt::perf::PerfCounter perfTLB_store_miss; + ebbrt::perf::PerfCounter perfTLB_load_miss; + ebbrt::rapl::RaplCounter powerMeter; + + }; + + private: + EbbRef ebb_; + NetworkManager::Interface& itf_; + EthernetAddress mac_addr_; + + void Init(); + void PhyInit(); + void StopDevice(); + void GlobalReset(); + void SetupMultiQueue(uint32_t i); + void FinishSetup(); + + // device register writing code below + bool SwsmSmbiRead(); + void SwsmSmbiClear(); + + void SwsmSwesmbiSet(); + bool SwsmSwesmbiRead(); + void SwsmSwesmbiClear(); + + uint32_t ReadSwfwSyncSmBits(uint32_t m); + void WriteSwfwSyncSmBits(uint32_t m); + void 
WriteSwfwSyncSmBits2(uint32_t m); + + bool SwfwLockPhy(); + void SwfwUnlockPhy(); + bool SwfwSemAcquire(); + void SwfwSemRelease(); + + void WriteRxctrl(uint32_t m); + void WriteDmatxctl(uint32_t m); + void WriteDmatxctl_te(uint32_t m); + + void WriteEimc(uint32_t m); + void WriteEitr(uint32_t n, uint32_t m); + + void WriteTxdctl(uint32_t n, uint32_t m); + + void WriteRxdctl_1(uint32_t n, uint32_t m); + void WriteRxdctl_1_enable(uint32_t n, uint32_t m); + + void WriteRxdctl_2(uint32_t n, uint32_t m); + void WriteCtrl(uint32_t m); + void WriteCtrlExt(uint32_t m); + void WriteFcttv(uint32_t n, uint32_t m); + void WriteFcrtl(uint32_t n, uint32_t m); + void WriteFcrth(uint32_t n, uint32_t m); + void WriteFcrtv(uint32_t m); + void WriteFccfg(uint32_t m); + void WriteEerd(uint32_t m); + + void WriteCorectl(uint16_t m); + + void WriteAutoc(uint32_t m); + + void WriteEicr(uint32_t m); + void WriteGpie(uint32_t m); + void WriteEiam(uint32_t n, uint32_t m); + + void WriteEims(uint32_t m); + + void WriteRal(uint32_t n, uint32_t m); + void WriteRah(uint32_t n, uint32_t m); + + void WriteMta(uint32_t n, uint32_t m); + void WriteVfta(uint32_t n, uint32_t m); + void WritePfvlvf(uint32_t n, uint32_t m); + void WritePfvlvfb(uint32_t n, uint32_t m); + void WriteMpsar(uint32_t n, uint32_t m); + void WriteFtqf(uint32_t n, uint32_t m); + void WriteSaqf(uint32_t n, uint32_t m); + void WriteDaqf(uint32_t n, uint32_t m); + void WriteSdpqf(uint32_t n, uint32_t m); + + void WriteFctrl(uint32_t m); + void WriteFhft_1(uint32_t n, uint32_t m); + void WriteFhft_2(uint32_t n, uint32_t m); + + void WritePfuta(uint32_t n, uint32_t m); + void WriteMcstctrl(uint32_t m); + + void WriteRttdqsel(uint32_t m); + void WriteRttbcnrc(uint32_t m); + + void WriteDcaTxctrlTxdescWbro(uint32_t n, uint32_t m); + void WriteDcaTxctrl(uint32_t n, uint32_t m); + void WriteDcaRxctrl(uint32_t n, uint32_t m); + void WriteDcaRxctrlClear(uint32_t n, uint32_t m); + void WriteDcaRxctrl_1(uint32_t n, uint32_t m); + void 
WriteDcaRxctrl_2(uint32_t n, uint32_t m); + void WriteDcaCtrl(uint32_t m); + void ReadDcaTxctrl(uint32_t n); + void ReadDcaRxctrl(uint32_t n); + + void WriteRdbal_1(uint32_t n, uint32_t m); + void WriteRdbal_2(uint32_t n, uint32_t m); + + void WriteRdbah_1(uint32_t n, uint32_t m); + void WriteRdbah_2(uint32_t n, uint32_t m); + + void WriteRdlen_1(uint32_t n, uint32_t m); + void WriteRdlen_2(uint32_t n, uint32_t m); + + void WriteSrrctl_1(uint32_t n, uint32_t m); + void WriteSrrctlZero(uint32_t n); + void WriteSrrctl_1_desctype(uint32_t n, uint32_t m); + void WriteRscdbu(uint32_t m); + + void WriteRdt_1(uint32_t n, uint32_t m); + void WriteRdh_1(uint32_t n, uint32_t m); + void WriteRdt_2(uint32_t n, uint32_t m); + + void WriteIvarAlloc0(uint32_t n, uint32_t m); + void WriteIvarAllocval0(uint32_t n, uint32_t m); + void WriteIvarAlloc1(uint32_t n, uint32_t m); + void WriteIvarAllocval1(uint32_t n, uint32_t m); + void WriteIvarAlloc2(uint32_t n, uint32_t m); + void WriteIvarAllocval2(uint32_t n, uint32_t m); + void WriteIvarAlloc3(uint32_t n, uint32_t m); + void WriteIvarAllocval3(uint32_t n, uint32_t m); + + void WriteSecrxctrl_Rx_Dis(uint32_t m); + + void WriteTdbal(uint32_t n, uint32_t m); + void WriteTdbah(uint32_t n, uint32_t m); + void WriteTdlen(uint32_t n, uint32_t m); + + void WriteTdh(uint32_t n, uint32_t m); + void WriteTdt(uint32_t n, uint32_t m); + uint32_t ReadTdt(uint32_t n); + + void WriteTdwbal(uint32_t n, uint32_t m); + void WriteTdwbah(uint32_t n, uint32_t m); + + void WriteHlreg0(uint32_t m); + void WriteRdrxctl(uint32_t m); + void WriteRdrxctlRSCFRSTSIZE(uint32_t m); + + void WriteEiac(uint32_t m); + void WriteEimsn(uint32_t n, uint32_t m); + + void WriteRfctl(uint32_t m); + + void WriteRscctl(uint32_t n, uint32_t m); + void WritePsrtype(uint32_t n, uint32_t m); + + void WriteRxcsum(uint32_t m); + void WriteTxpbthresh(uint32_t n, uint32_t m); + void WriteMrqc(uint32_t m); + void WriteDtxmxszrq(uint32_t m); + void WriteMflcn(uint32_t m); + void 
WriteReta(uint32_t n, uint32_t m); + void WriteRssrk(uint32_t n, uint32_t m) { + kassert(n < 10); + bar0_.Write32(0x0EB80 + 4 * n, m); + } + + void WritePsrtypeZero(uint32_t n); + + void WriteRttdcs(uint32_t m); + void WriteRttdcsArbdisEn(uint32_t m); + void WriteRxpbsize(uint32_t n, uint32_t m); + void WriteTxpbsize(uint32_t n, uint32_t m); + void WriteTxpbThresh(uint32_t n, uint32_t m); + void WriteMtqc(uint32_t m); + void WritePfvtctl(uint32_t m); + void WriteRtrup2tc(uint32_t m); + void WriteRttup2tc(uint32_t m); + void WritePfqde(uint32_t m); + void WriteRttdt1c(uint32_t m); + void WriteRttdt2c(uint32_t n, uint32_t m); + void WriteRttpt2c(uint32_t n, uint32_t m); + void WriteRtrpt4c(uint32_t n, uint32_t m); + void WriteRttpcs(uint32_t m); + void WriteRtrpcs(uint32_t m); + void WritePfvml2flt(uint32_t n, uint32_t m); + + void WriteMngtxmap(uint32_t m); + + void WriteRxfeccerr0(uint32_t m); + void WriteMaxfrs(uint32_t m); + + uint8_t ReadRdrxctlDmaidone(); + + void ReadEicr(); + bool ReadStatusPcieMes(); + uint8_t ReadStatusLanId(); + void ReadCtrl(); + bool ReadEerdDone(); + uint16_t ReadEerdData(); + uint16_t ReadEeprom(uint16_t offset); + uint8_t ReadAnlp1(); + uint8_t ReadAutocRestartAn(); + uint8_t ReadEecAutoRd(); + uint32_t ReadEims(); + + uint32_t ReadRal(uint32_t n); + uint16_t ReadRah(uint32_t n); + uint8_t ReadRahAv(uint32_t n); + + uint8_t ReadRxdctl_1_enable(uint32_t n); + uint8_t ReadSecrxstat_Sr_Rdy(); + + uint8_t ReadTxdctl_enable(uint32_t n); + + uint16_t ReadRdh_1(uint32_t n); + uint16_t ReadTdh(uint32_t n); + uint16_t ReadRdt_1(uint32_t n); + + // some statistics + uint32_t ReadTpr(); + uint32_t ReadGprc(); + bool ReadLinksLinkUp(); + + // Process packet functions + void ProcessPacket(uint32_t n); + uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr); + void SendPacket(uint32_t n); + + // dump per core stats if STATS_EN + void DumpStats(); + + e10Kq& GetMultiQueue(uint32_t index) const { return *ixgmq[index]; } + + pci::Device& dev_; + pci::Bar& 
bar0_; + + struct IxgbeRegs { + volatile uint32_t kIxgbeCtrl; + volatile uint32_t kIxgbeCtrlBak; + volatile uint32_t kIxgbeStatus; + }; + + uint8_t rcv_vector{0}; + + std::vector> ixgmq; + + friend class IxgbeDriverRep; +}; // class IxgbeDriver + +class IxgbeDriverRep : public MulticoreEbb, Timer::Hook { + public: + explicit IxgbeDriverRep(const IxgbeDriver& root); + void Run(); + void ReceivePoll(); + void ReclaimTx(); + void ReclaimRx(); + void Send(std::unique_ptr buf, PacketInfo pinfo); + void SendUdp(std::unique_ptr buf, uint64_t len, PacketInfo pinfo); + void SendTCPChained(std::unique_ptr buf, uint64_t len, uint64_t num_chains, PacketInfo pinfo); + //void SendTCPUnchained(std::unique_ptr buf, uint64_t len, PacketInfo pinfo); + void SendTCPUnchained(uint64_t bdata, uint64_t len, PacketInfo pinfo); + + //void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, + // enum l4_type l4type); + //void AddTx(uint64_t pa, uint64_t len, uint64_t totallen, bool first, + // bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum, bool tse, int hdr_len); + void StartTimer(); + void StopTimer(); + + private: + uint16_t ReadRdh_1(uint32_t n); + uint16_t ReadRdt_1(uint32_t n); + void WriteRdt_1(uint32_t n, uint32_t m); + void WriteRdh_1(uint32_t n, uint32_t m); + void WriteTdt_1(uint32_t n, uint32_t m); + void WriteEimcn(uint32_t n, uint32_t m); + void WriteEimc(uint32_t m); + void WriteEims(uint32_t m); + uint32_t ReadEicr(); + uint32_t ReadTdh_1(uint32_t n); + uint32_t ReadTdt_1(uint32_t n); + uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr, uint64_t* rxflag, + bool* process_rsc, uint32_t* rnt, uint32_t* rxhead); + void Fire() override; + + const IxgbeDriver& root_; + IxgbeDriver::e10Kq& ixgmq_; + + EventManager::IdleCallback receive_callback_; + +}; // class IxgbeDriverRep + +} // namespace ebbrt + +#endif // BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ diff --git a/src/native/Main.cc b/src/native/Main.cc index 4e2f0145..dfed9d9a 100644 --- 
a/src/native/Main.cc +++ b/src/native/Main.cc @@ -47,7 +47,11 @@ #include "Trans.h" #include "VMem.h" #include "VMemAllocator.h" +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ +#include "IxgbeDriver.h" +#else #include "VirtioNet.h" +#endif namespace { bool started_once = false; @@ -66,6 +70,40 @@ extern "C" __attribute__((noreturn)) void ebbrt::Main(multiboot::Information* mbi) { console::Init(); + memset(nsleep_states, 0, sizeof(nsleep_states)); + memset(sleep_state, 0, sizeof(sleep_state)); + sleep_state[0]=48; + sleep_state[1]=48; + sleep_state[2]=48; + sleep_state[3]=48; + sleep_state[4]=48; + sleep_state[5]=48; + sleep_state[6]=48; + sleep_state[7]=48; + sleep_state[8]=48; + sleep_state[9]=48; + sleep_state[10]=48; + sleep_state[11]=48; + sleep_state[12]=48; + sleep_state[13]=48; + sleep_state[14]=48; + sleep_state[15]=48; + + /*memset(processCnt, 0, sizeof(processCnt)); + memset(swEventCnt, 0, sizeof(swEventCnt)); + memset(idleEventCnt, 0, sizeof(idleEventCnt)); + memset(rxPollCnt, 0, sizeof(rxPollCnt)); + memset(processInterruptCntAll, 0, sizeof(processInterruptCntAll)); + memset(processInterruptCntA, 0, sizeof(processInterruptCntA)); + memset(processInterruptCntB, 0, sizeof(processInterruptCntB)); + memset(processInterruptCntC, 0, sizeof(processInterruptCntC)); + memset(passTokenCnt, 0, sizeof(passTokenCnt)); + memset(receiveTokenCnt, 0, sizeof(receiveTokenCnt)); + memset(genFireCnt, 0, sizeof(genFireCnt)); + memset(timerCnt, 0, sizeof(timerCnt)); + memset(fireCntA, 0, sizeof(fireCntA)); + memset(fireCntB, 0, sizeof(fireCntB));*/ + #ifdef __EBBRT_ENABLE_TRACE__ trace::Init(); #endif @@ -146,18 +184,29 @@ ebbrt::Main(multiboot::Information* mbi) { Timer::Init(); smp::Init(); event_manager->ReceiveToken(); + #ifdef __EBBRT_ENABLE_NETWORKING__ NetworkManager::Init(); pci::Init(); + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + pci::RegisterProbe(IxgbeDriver::Probe); +#else pci::RegisterProbe(VirtioNetDriver::Probe); +#endif + pci::LoadDrivers(); 
network_manager->StartDhcp().Then([](Future fut) { fut.Get(); // Dhcp completed #ifdef __EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ +// Currently not supported in BMNIC since we don't pass arguments +// via grub +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ Messenger::Init(); runtime::Init(); #endif +#endif #endif // run global ctors for (unsigned i = 0; i < (end_ctors - start_ctors); ++i) { diff --git a/src/native/Msr.h b/src/native/Msr.h index 4e3b7ba6..a5358f4a 100644 --- a/src/native/Msr.h +++ b/src/native/Msr.h @@ -29,7 +29,18 @@ inline uint64_t Read(uint32_t index) { inline void Write(uint32_t index, uint64_t data) { uint32_t low = data; - uint32_t high = data >> 32; + uint32_t high = (data >> 32) & 0xFFFFFFFF; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // TODO - correct fix is here? + // GP fault happens when writing a 1 to bit #3 for kX2apicDcr, + // which is a reserved bit + // only happens in baremetal, VM prob virtualized this issue + if ((((data >> 2) & 0x1) == 1) && index == kX2apicDcr) { + low = (data & 0x3) | ((data & 0x4) << 1); + high = 0x0; + } +#endif asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); } } // namespace msr diff --git a/src/native/Net.cc b/src/native/Net.cc index 581cec54..d0617c0c 100644 --- a/src/native/Net.cc +++ b/src/native/Net.cc @@ -12,27 +12,37 @@ ebbrt::NetworkManager::NewInterface(EthernetDevice& ether_dev) { return *interface_; } -void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Drop packets that are too small - if (packet_len <= sizeof(EthernetHeader)) + if (packet_len <= sizeof(EthernetHeader)) { + ebbrt::kprintf("packet_len=%d too small (less than EthernetHeader)\n", packet_len); return; + } auto dp = buf->GetMutDataPointer(); auto& eth_header = dp.Get(); - buf->Advance(sizeof(EthernetHeader)); switch (ntohs(eth_header.type)) { case kEthTypeIp: { - 
ReceiveIp(eth_header, std::move(buf)); + ReceiveIp(eth_header, std::move(buf), rxflag); break; } case kEthTypeArp: { ReceiveArp(eth_header, std::move(buf)); break; } + default: { + //ebbrt::kprintf("NetworkManager::Interface::Receive(): Unknown eth_header.type=0x%X packet_len=%u\n", ntohs(eth_header.type), packet_len); + /*auto p1 = reinterpret_cast(buf->MutData()); + for (int i = 0; i < 256; i+=8) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } + ebbrt::kabort("NetworkManager::Interface::Receive()\n");*/ + } } } @@ -44,3 +54,19 @@ void ebbrt::NetworkManager::Interface::Send(std::unique_ptr b, PacketInfo pinfo) { ether_dev_.Send(std::move(b), std::move(pinfo)); } + +void ebbrt::NetworkManager::Config(std::string s, uint32_t v) { + interface_->Config(std::move(s), v); +} + +std::string ebbrt::NetworkManager::ReadNic() { + return interface_->ReadNic(); +} + +void ebbrt::NetworkManager::Interface::Config(std::string s, uint32_t v) { + ether_dev_.Config(std::move(s), v); +} + +std::string ebbrt::NetworkManager::Interface::ReadNic() { + return ether_dev_.ReadNic(); +} diff --git a/src/native/Net.h b/src/native/Net.h index 389bd1fb..4dff7ae7 100644 --- a/src/native/Net.h +++ b/src/native/Net.h @@ -25,9 +25,16 @@ #include "RcuTable.h" #include "SharedPoolAllocator.h" +// IP and L4 checksum offload bits +#define RXFLAG_IPCS (1 << 0) +#define RXFLAG_IPCS_VALID (1 << 1) +#define RXFLAG_L4CS (1 << 2) +#define RXFLAG_L4CS_VALID (1 << 3) + namespace ebbrt { struct PacketInfo { static const constexpr uint8_t kNeedsCsum = 1; + static const constexpr uint8_t kNeedsIpCsum = 2; static const constexpr uint8_t kGsoNone = 0; static const constexpr uint8_t kGsoTcpv4 = 1; static const constexpr uint8_t kGsoUdp = 3; @@ -39,12 +46,17 @@ struct PacketInfo { uint16_t gso_size{0}; uint16_t csum_start{0}; uint16_t csum_offset{0}; + uint32_t tcp_len{0}; + uint32_t tcp_hdr_len{0}; + bool get_stats{false}; }; 
class EthernetDevice { public: virtual void Send(std::unique_ptr buf, PacketInfo pinfo = PacketInfo()) = 0; + virtual void Config(std::string s, uint32_t v) = 0; + virtual std::string ReadNic() = 0; virtual const EthernetAddress& GetMacAddress() = 0; virtual ~EthernetDevice() {} }; @@ -230,12 +242,15 @@ class NetworkManager : public StaticSharedEbb { explicit Interface(EthernetDevice& ether_dev) : address_(nullptr), ether_dev_(ether_dev) {} - void Receive(std::unique_ptr buf); + void Receive(std::unique_ptr buf, uint64_t rxflag = 0); void Send(std::unique_ptr buf, PacketInfo pinfo = PacketInfo()); void SendUdp(UdpPcb& pcb, Ipv4Address addr, uint16_t port, std::unique_ptr buf); void SendIp(std::unique_ptr buf, Ipv4Address src, Ipv4Address dst, uint8_t proto, PacketInfo pinfo = PacketInfo()); + void Config(std::string s, uint32_t v); + std::string ReadNic(); + const EthernetAddress& MacAddress(); const ItfAddress* Address() const { return address_.get(); } void SetAddress(std::unique_ptr address) { @@ -246,7 +261,7 @@ class NetworkManager : public StaticSharedEbb { private: struct DhcpPcb : public CacheAligned, public Timer::Hook { void Fire() override; - + UdpPcb udp_pcb; DhcpMessage last_offer; enum State { kInactive, kSelecting, kRequesting, kBound } state; @@ -260,11 +275,14 @@ class NetworkManager : public StaticSharedEbb { }; void ReceiveArp(EthernetHeader& eh, std::unique_ptr buf); - void ReceiveIp(EthernetHeader& eh, std::unique_ptr buf); + void ReceiveIp(EthernetHeader& eh, std::unique_ptr buf, + uint64_t rxflag = 0); void ReceiveIcmp(EthernetHeader& eh, Ipv4Header& ih, std::unique_ptr buf); - void ReceiveUdp(Ipv4Header& ih, std::unique_ptr buf); - void ReceiveTcp(const Ipv4Header& ih, std::unique_ptr buf); + void ReceiveUdp(Ipv4Header& ih, std::unique_ptr buf, + uint64_t rxflag = 0); + void ReceiveTcp(const Ipv4Header& ih, std::unique_ptr buf, + uint64_t rxflag = 0); void ReceiveDhcp(Ipv4Address from_addr, uint16_t from_port, std::unique_ptr buf); void 
EthArpSend(uint16_t proto, const Ipv4Header& ih, @@ -298,9 +316,11 @@ class NetworkManager : public StaticSharedEbb { Interface& NewInterface(EthernetDevice& ether_dev); Ipv4Address IpAddress(); - + void Config(std::string s, uint32_t v); + std::string ReadNic(); + private: - Future StartDhcp(); + Future StartDhcp(); void SendIp(std::unique_ptr buf, Ipv4Address src, Ipv4Address dst, uint8_t proto, PacketInfo = PacketInfo()); void TcpReset(bool ack, uint32_t seqno, uint32_t ackno, diff --git a/src/native/NetChecksum.cc b/src/native/NetChecksum.cc index f92e5bfc..86cb4747 100644 --- a/src/native/NetChecksum.cc +++ b/src/native/NetChecksum.cc @@ -41,7 +41,7 @@ uint32_t Add32WithCarry(uint32_t a, uint32_t b) { } // Compute checksum over a contiguous region of memory -uint32_t Csum(const uint8_t* buf, size_t len, size_t offset = 0) { +uint64_t Csum(const uint8_t* buf, size_t len, size_t offset = 0) { if (unlikely(len == 0)) return 0; @@ -156,6 +156,11 @@ uint16_t ebbrt::OffloadPseudoCsum(const IOBuf& buf, uint8_t proto, return From32To16(PseudoCsum(buf.ComputeChainDataLength(), proto, src, dst)); } +uint16_t ebbrt::OffloadPseudoCsumTso(uint8_t proto, + Ipv4Address src, Ipv4Address dst) { + return From32To16(PseudoCsum(0, proto, src, dst)); +} + // Calculate the Ipv4 pseudo checksum with the provided header information uint16_t ebbrt::IpPseudoCsum(const IOBuf& buf, uint8_t proto, Ipv4Address src, Ipv4Address dst) { @@ -172,3 +177,7 @@ uint16_t ebbrt::IpCsum(const IOBuf& buf) { return CsumFold(IpCsumNoFold(buf)); } uint16_t ebbrt::IpCsum(const uint8_t* buf, size_t len) { return CsumFold(Csum(buf, len)); } + +uint32_t ebbrt::CsumTest(const IOBuf& buf) { + return IpCsumNoFold(buf); +} diff --git a/src/native/NetChecksum.h b/src/native/NetChecksum.h index 8d875f95..55333e16 100644 --- a/src/native/NetChecksum.h +++ b/src/native/NetChecksum.h @@ -14,10 +14,13 @@ namespace ebbrt { uint16_t OffloadPseudoCsum(const IOBuf& buf, uint8_t proto, Ipv4Address src, Ipv4Address dst); 
+uint16_t OffloadPseudoCsumTso(uint8_t proto, Ipv4Address src, + Ipv4Address dst); uint16_t IpPseudoCsum(const IOBuf& buf, uint8_t proto, Ipv4Address src, Ipv4Address dst); uint16_t IpCsum(const IOBuf& buf); uint16_t IpCsum(const uint8_t* buf, size_t len); +uint32_t CsumTest(const IOBuf& buf); } // namespace ebbrt #endif // BAREMETAL_SRC_INCLUDE_EBBRT_NETCHECKSUM_H_ diff --git a/src/native/NetDhcp.cc b/src/native/NetDhcp.cc index cc931ede..d5ce53eb 100644 --- a/src/native/NetDhcp.cc +++ b/src/native/NetDhcp.cc @@ -13,6 +13,7 @@ ebbrt::Future ebbrt::NetworkManager::StartDhcp() { kbugon(Cpu::GetMine() != 0, "Dhcp not started on core 0!"); +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ // Before DHCP, check if a static IP has been specified auto cmdline = std::string(ebbrt::multiboot::CmdLine()); auto loc = cmdline.find("nodhcp"); @@ -20,6 +21,7 @@ ebbrt::Future ebbrt::NetworkManager::StartDhcp() { kprintf("Warning: Skipping DHCP, static IP detected\n"); return MakeReadyFuture(); } +#endif if (interface_) return interface_->StartDhcp(); @@ -194,10 +196,12 @@ void ebbrt::NetworkManager::Interface::DhcpHandleAck( kassert(netmask_opt); addr->netmask = *netmask_opt; +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ // assert fails in baremetal auto gw_opt = DhcpGetOptionLong(message, kDhcpOptionRouter); kassert(gw_opt); addr->gateway = *gw_opt; - +#endif + SetAddress(std::unique_ptr(addr)); DhcpSetState(DhcpPcb::State::kBound); diff --git a/src/native/NetIcmp.cc b/src/native/NetIcmp.cc index e5c06153..623e7684 100644 --- a/src/native/NetIcmp.cc +++ b/src/native/NetIcmp.cc @@ -12,6 +12,7 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( EthernetHeader& eth_header, Ipv4Header& ip_header, std::unique_ptr buf) { auto packet_len = buf->ComputeChainDataLength(); + //ebbrt::kprintf("ReceiveIcmp() packet_len=%u\n", packet_len); if (unlikely(packet_len < sizeof(IcmpHeader))) return; @@ -19,9 +20,12 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( auto dp = buf->GetMutDataPointer(); 
auto& icmp_header = dp.Get(); - // checksum + //ebbrt::kprintf("ReceiveIcmp() packet_len=%u\n", packet_len); +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ + // software checksum if (IpCsum(*buf)) return; +#endif // if echo_request, send reply if (icmp_header.type == kIcmpEchoRequest) { @@ -43,9 +47,20 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( ip_header.ttl = kIpDefaultTtl; ip_header.chksum = 0; + + PacketInfo pinfo; + pinfo.flags = 0; + // hijacking ping to dump ixgbe statistics + pinfo.get_stats = true; +//#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // hardware ip checksum offload +// pinfo.flags |= PacketInfo::kNeedsIpCsum; +//#else ip_header.chksum = ip_header.ComputeChecksum(); +//#endif buf->Retreat(ip_header.HeaderLength()); - EthArpSend(kEthTypeIp, ip_header, std::move(buf)); + + EthArpSend(kEthTypeIp, ip_header, std::move(buf), pinfo); } } diff --git a/src/native/NetIp.cc b/src/native/NetIp.cc index d76ddce7..8409e4ab 100644 --- a/src/native/NetIp.cc +++ b/src/native/NetIp.cc @@ -36,8 +36,9 @@ bool ebbrt::NetworkManager::Interface::ItfAddress::isLocalNetwork( } // Receive an Ipv4 packet -void ebbrt::NetworkManager::Interface::ReceiveIp( - EthernetHeader& eth_header, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveIp(EthernetHeader& eth_header, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); if (unlikely(packet_len < sizeof(Ipv4Header))) @@ -59,8 +60,21 @@ void ebbrt::NetworkManager::Interface::ReceiveIp( buf->TrimEnd(packet_len - tot_len); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // baremetal checksum offload + if (unlikely((rxflag & RXFLAG_IPCS) == 0)) { + ebbrt::kprintf("%s RXFLAG_IPCS failed\n", __FUNCTION__); + return; + } + + if (unlikely((rxflag & RXFLAG_IPCS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_IPCS_VALID failed\n", __FUNCTION__); + return; + } +#else if (unlikely(ip_header.ComputeChecksum() != 0)) return; +#endif auto addr = Address(); // Unless the protocol is UDP or we 
have an address on this interface and the @@ -87,11 +101,11 @@ void ebbrt::NetworkManager::Interface::ReceiveIp( break; } case kIpProtoUDP: { - ReceiveUdp(ip_header, std::move(buf)); + ReceiveUdp(ip_header, std::move(buf), rxflag); break; } case kIpProtoTCP: { - ReceiveTcp(ip_header, std::move(buf)); + ReceiveTcp(ip_header, std::move(buf), rxflag); break; } } @@ -123,13 +137,18 @@ void ebbrt::NetworkManager::Interface::SendIp(std::unique_ptr buf, ih.chksum = 0; ih.src = src; ih.dst = dst; - ih.chksum = ih.ComputeChecksum(); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // baremetal ip checksum offload + pinfo.flags |= PacketInfo::kNeedsIpCsum; +#else + ih.chksum = ih.ComputeChecksum(); kassert(ih.ComputeChecksum() == 0); +#endif pinfo.csum_start += sizeof(Ipv4Header); pinfo.hdr_len += sizeof(Ipv4Header); - + EthArpSend(kEthTypeIp, ih, std::move(buf), pinfo); } diff --git a/src/native/NetTcp.cc b/src/native/NetTcp.cc index e792f34a..358c3bc4 100644 --- a/src/native/NetTcp.cc +++ b/src/native/NetTcp.cc @@ -194,8 +194,9 @@ ebbrt::Ipv4Address ebbrt::NetworkManager::TcpPcb::GetRemoteAddress() { } // Receive a TCP packet on an interface -void ebbrt::NetworkManager::Interface::ReceiveTcp( - const Ipv4Header& ih, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveTcp(const Ipv4Header& ih, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Ensure we have a header @@ -210,10 +211,21 @@ void ebbrt::NetworkManager::Interface::ReceiveTcp( if (unlikely(addr->isBroadcast(ih.dst) || ih.dst.isMulticast())) return; - // XXX: Check if rxcsum is supported - // if (unlikely(IpPseudoCsum(*buf, ih.proto, ih.src, ih.dst))) - // return; +// XXX: Check if rxcsum is supported +// if (unlikely(IpPseudoCsum(*buf, ih.proto, ih.src, ih.dst))) +// return; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (unlikely((rxflag & RXFLAG_L4CS) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS failed\n", __FUNCTION__); + return; + } + + if (unlikely((rxflag & 
RXFLAG_L4CS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS_VALID failed\n", __FUNCTION__); + return; + } +#endif auto hdr_len = tcp_header.HdrLen(); if (unlikely(hdr_len < sizeof(TcpHeader) || hdr_len > packet_len)) return; @@ -351,6 +363,7 @@ void ebbrt::NetworkManager::TcpEntry::Destroy() { std::lock_guard guard(network_manager->tcp_write_lock_); network_manager->tcp_pcbs_.erase(*this); } + kassert(this); event_manager->DoRcu([this]() { delete this; }); } @@ -700,7 +713,6 @@ bool ebbrt::NetworkManager::TcpEntry::Receive( if (TcpSeqGT(rcv_nxt, info.seqno)) { // Trim the front - kprintf(">> rcv_nxt > info.seqno \n"); buf->Advance(rcv_nxt - info.seqno); info.tcplen -= rcv_nxt - info.seqno; } @@ -708,6 +720,9 @@ bool ebbrt::NetworkManager::TcpEntry::Receive( // Second check the RST bit if (unlikely(flags & kTcpRst)) { state = kClosed; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + bool flag = false; +#endif if (state == kSynReceived) { // RFC 793 Page 70: // "If this connection was initiated with a passive OPEN (i.e., came @@ -718,7 +733,6 @@ bool ebbrt::NetworkManager::TcpEntry::Receive( // In either case, all segments on the retransmission queue should be // removed. And in the active OPEN case, enter the CLOSED state and // delete the TCB, and return." - handler->Abort(); } else if (state >= kEstablished && state <= kCloseWait) { // If the RST bit is set then, any outstanding RECEIVEs and SEND @@ -734,10 +748,17 @@ bool ebbrt::NetworkManager::TcpEntry::Receive( // RFC 793 Page 70: // If the RST bit is set then, enter the CLOSED state, delete the TCB, // and return. 
+#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + flag = true; +#endif } Purge(); DisableTimers(); - Destroy(); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (flag == false) Destroy(); +#else + Destroy(); +#endif return false; } else if (unlikely(flags & kTcpSyn)) { // RFC 793 Page 71: @@ -929,7 +950,7 @@ void ebbrt::NetworkManager::TcpEntry::EnqueueSegment( th.SetHdrLenFlags(sizeof(TcpHeader) + optlen, flags); // ackno, wnd, and checksum are set in Output() th.urgp = 0; - + pending_segments.emplace_back(std::move(buf), th, tcp_len); snd_nxt += tcp_len; @@ -996,13 +1017,61 @@ void ebbrt::NetworkManager::TcpEntry::SendEmptyAck() { rcv_last_acked = rcv_nxt; th.ackno = htonl(rcv_nxt); th.wnd = htons(TcpWindow16(rcv_wnd)); - th.checksum = OffloadPseudoCsum(*buf, kIpProtoTCP, address, std::get<0>(key)); - + th.checksum = 0; //OffloadPseudoCsum(*buf, kIpProtoTCP, address, std::get<0>(key)); + + //auto local_ip = htonl(address.toU32()); + //auto remote_ip = htonl((std::get<0>(key)).toU32()); + + //ebbrt::kprintf("SendEmptyAck() src_ip=0x%llX dst_ip=0x%llX kIpProtoTCP=0x%X buf_len=0x%X src_port=0x%X dst_port=0x%X seq_num=0x%X ack_num=0x%X\n", local_ip, remote_ip, kIpProtoTCP, 0, th.src_port, th.dst_port, th.seqno, th.ackno); + //ebbrt::kprintf("\t flags=0x%X windows=0x%X urgent_pointer=0x%X tcp_header.checksum=0x%X \n", th.hdrlen_flags, th.wnd, th.urgp, th.checksum); + + /*auto pl = buf->ComputeChainDataLength(); + //ebbrt::kprintf("\t packet_len=%u\n", pl); + uint8_t* p1 = reinterpret_cast(buf->MutData()); + int i; + uint32_t sum = 0; + uint16_t word16; + for (i = 0; i < (int)pl; i+=2) { + word16 = ((p1[i]<<8)&0xFF00) + (p1[i+1]&0xFF); + sum = sum + (uint32_t)word16; + } + // pseudo header start + //add src addr + word16 = (local_ip & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((local_ip >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + + //add dst addr + word16 = (remote_ip & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((remote_ip >> 16) & 0xFFFF); + sum = sum + 
(uint32_t)word16; + + //add protocol number and length of tcpp packet + sum = sum + kIpProtoTCP + pl; + // pseudo header end + + while(sum >> 16) { + sum = (sum & 0xFFFF) + (sum >> 16); + } + sum = (~sum) & 0xFFFF; + th.checksum = htons((uint16_t) sum); + //ebbrt::kprintf("\t new checksum=0x%X\n\n", th.checksum); + */ + PacketInfo pinfo; pinfo.flags |= PacketInfo::kNeedsCsum; pinfo.csum_start = 0; pinfo.csum_offset = 16; // checksum is 16 bytes into the TCP header - + + th.checksum = + OffloadPseudoCsum(*buf, kIpProtoTCP, address, std::get<0>(key)); + //OffloadPseudoCsumTso(kIpProtoTCP, address, std::get<0>(key)); + pinfo.tcp_len = 0; + pinfo.tcp_hdr_len = th.HdrLen(); + //ebbrt::kprintf("SendEmptyAck() checksum=0x%X\n\n", th.checksum); + network_manager->SendIp(std::move(buf), address, std::get<0>(key), kIpProtoTCP, pinfo); } @@ -1055,26 +1124,38 @@ void ebbrt::NetworkManager::TcpEntry::SendFin() { // Actually send a segment via IP void ebbrt::NetworkManager::TcpEntry::SendSegment(TcpSegment& segment) { + uint32_t len = segment.buf->ComputeChainDataLength(); + uint32_t totallen = len + sizeof(Ipv4Header) + sizeof(EthernetHeader); rcv_last_acked = rcv_nxt; segment.th.ackno = htonl(rcv_nxt); segment.th.wnd = htons(TcpWindow16(rcv_wnd)); segment.th.checksum = 0; - // XXX: check if checksum offloading is supported - segment.th.checksum = + + // 82599 has a different checksum method when greater than a single MTU, the paylen field is set to 0 + if(totallen > 1490) { + segment.th.checksum = + OffloadPseudoCsumTso(kIpProtoTCP, address, std::get<0>(key)); + } else { + // XXX: check if checksum offloading is supported + segment.th.checksum = OffloadPseudoCsum(*(segment.buf), kIpProtoTCP, address, std::get<0>(key)); + } PacketInfo pinfo; pinfo.flags |= PacketInfo::kNeedsCsum; pinfo.csum_start = 0; pinfo.csum_offset = 16; // checksum is 16 bytes into the TCP header - + pinfo.tcp_hdr_len = segment.th.HdrLen(); + pinfo.tcp_len = len - pinfo.tcp_hdr_len; + // XXX: Actually 
store the MSS instead of making this assumption - size_t mss = 1460; + //size_t mss = 1460; + size_t mss = 2048; if (segment.tcp_len > mss) { pinfo.gso_type = PacketInfo::kGsoTcpv4; pinfo.hdr_len = segment.th.HdrLen(); pinfo.gso_size = mss; } - + network_manager->SendIp(CreateRefChain(*(segment.buf)), address, std::get<0>(key), kIpProtoTCP, std::move(pinfo)); } @@ -1085,6 +1166,8 @@ void ebbrt::NetworkManager::TcpReset(bool ack, uint32_t seqno, uint32_t ackno, const Ipv4Address& remote_ip, uint16_t local_port, uint16_t remote_port) { + +// ebbrt::kabort("ebbrt::NetworkManager::TcpReset() - Aborting haven't added checksum offloading\n"); auto buf = MakeUniqueIOBuf(sizeof(TcpHeader) + sizeof(Ipv4Header) + sizeof(EthernetHeader)); @@ -1101,12 +1184,17 @@ void ebbrt::NetworkManager::TcpReset(bool ack, uint32_t seqno, uint32_t ackno, tcp_header.wnd = htons(TcpWindow16(kTcpWnd)); tcp_header.urgp = 0; tcp_header.checksum = - OffloadPseudoCsum(*buf, kIpProtoTCP, local_ip, remote_ip); + OffloadPseudoCsum(*buf, kIpProtoTCP, local_ip, remote_ip); + //ebbrt::kprintf("TcpReset() src_ip=0x%llX dst_ip=0x%llX kIpProtoTCP=0x%X buf_len=0x%X src_port=0x%X dst_port=0x%X seq_num=0x%X ack_num=0x%X\n", local_ip.toU32(), remote_ip.toU32(), kIpProtoTCP, 0, tcp_header.src_port, tcp_header.dst_port, tcp_header.seqno, tcp_header.ackno); + //ebbrt::kprintf("\t flags=0x%X windows=0x%X urgent_pointer=0x%X tcp_header.checksum=0x%X \n\n", tcp_header.hdrlen_flags, tcp_header.wnd, tcp_header.urgp, tcp_header.checksum); + PacketInfo pinfo; pinfo.flags |= PacketInfo::kNeedsCsum; pinfo.csum_start = 0; // 14 byte eth header + 20 byte ip header pinfo.csum_offset = 16; // checksum is 16 bytes into the TCP header - + pinfo.tcp_len = 0; + pinfo.tcp_hdr_len = tcp_header.HdrLen(); + SendIp(std::move(buf), local_ip, remote_ip, kIpProtoTCP, pinfo); } diff --git a/src/native/NetUdp.cc b/src/native/NetUdp.cc index 7da5fdc4..60ba3a92 100644 --- a/src/native/NetUdp.cc +++ b/src/native/NetUdp.cc @@ -57,8 +57,9 @@ 
void ebbrt::NetworkManager::UdpPcb::Receive( } // Receive UDP packet on an interface -void ebbrt::NetworkManager::Interface::ReceiveUdp( - Ipv4Header& ip_header, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveUdp(Ipv4Header& ip_header, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Ensure we have a header @@ -75,10 +76,20 @@ void ebbrt::NetworkManager::Interface::ReceiveUdp( // trim any excess off the packet buf->TrimEnd(packet_len - ntohs(udp_header.length)); - // XXX: Check if rxcsum supported - // if (udp_header.checksum && - // IpPseudoCsum(*buf, ip_header.proto, ip_header.src, ip_header.dst)) - // return; +// XXX: Check if rxcsum supported +// if (udp_header.checksum && +// IpPseudoCsum(*buf, ip_header.proto, ip_header.src, ip_header.dst)) +// return; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (unlikely((rxflag & RXFLAG_L4CS) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS failed\n"); + return; + } + if (unlikely((rxflag & RXFLAG_L4CS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS_VALID failed\n"); + return; + } +#endif auto entry = network_manager->udp_pcbs_.find(ntohs(udp_header.dst_port)); @@ -121,6 +132,10 @@ void ebbrt::NetworkManager::Interface::SendUdp(UdpPcb& pcb, Ipv4Address addr, src_addr = itf_addr->address; } + if(data_size % 2 > 0) { + throw std::runtime_error("SendUdp: data buffer size must be multiple of 2"); + } + // Get source port auto src_port = pcb.entry_->port; if (!src_port) @@ -137,12 +152,58 @@ void ebbrt::NetworkManager::Interface::SendUdp(UdpPcb& pcb, Ipv4Address addr, udp_header.length = htons(data_size + sizeof(UdpHeader)); udp_header.checksum = 0; + //uint64_t buff_addr = reinterpret_cast(buf->Data()); + // Append data header_buf->AppendChain(std::move(buf)); udp_header.checksum = - OffloadPseudoCsum(*header_buf, kIpProtoUDP, src_addr, addr); + OffloadPseudoCsum(*header_buf, kIpProtoUDP, src_addr, addr); + //OffloadPseudoCsumTso(kIpProtoUDP, src_addr, addr); + // 
OffloadPseudoCsum(*header_buf, kIpProtoUDP, src_addr, addr); + + //ebbrt::kprintf("udp_header.checksum=0x%X udp_header.length=%d src_port=%d dst_port=%d kIpProtoUDP=0x%X src_addr=0x%llX dst_addr=0x%llX buf_len=%d\n", udp_header.checksum, data_size + sizeof(UdpHeader), src_port, port, kIpProtoUDP, src_addr.toU32(), addr.toU32(), data_size); + /*uint8_t* p1 = reinterpret_cast(buff_addr); + int i; + uint32_t sum = 0; + uint16_t word16; + for (i = 0; i < (int)data_size; i+=2) { + word16 = ((p1[i]<<8)&0xFF00) + (p1[i+1]&0xFF); + sum = sum + (uint32_t)word16; + } + // pseudo header start + //add src addr + word16 = (src_addr.toU32() & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((src_addr.toU32() >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + + //add dst addr + word16 = (addr.toU32() & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((addr.toU32() >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + //sum = sum + (uint32_t)(addr.toU32()); + + //add protocol number and length of udp packet + sum = sum + kIpProtoUDP + (data_size + sizeof(UdpHeader)); + // pseudo header end + + // udp header start + // port + sum = sum + (uint32_t) src_port; + sum = sum + (uint32_t) port; + sum = sum + (uint32_t) (data_size + sizeof(UdpHeader)); + // udp header end + + while(sum >> 16) { + sum = (sum & 0xFFFF) + (sum >> 16); + } + sum = (~sum) & 0xFFFF; + udp_header.checksum = htons((uint16_t) sum); + //ebbrt::kprintf("real checksum? 
0x%X\n", udp_header.checksum); + */ // XXX: check if checksum offloading is supported PacketInfo pinfo; pinfo.flags |= PacketInfo::kNeedsCsum; diff --git a/src/native/Newlib.cc b/src/native/Newlib.cc index 7110c825..f800e66f 100644 --- a/src/native/Newlib.cc +++ b/src/native/Newlib.cc @@ -88,7 +88,7 @@ extern "C" int ebbrt_newlib_fstat(int file, struct stat* st) { return 0; } -extern "C" int ebbrt_newlib_stat(const char* file, struct stat* st) { +extern "C" int ebbrt_newlib_stat(const char* file, struct stat* st) { EBBRT_UNIMPLEMENTED(); return 0; } @@ -158,6 +158,181 @@ extern "C" int ebbrt_newlib_gettimeofday(struct timeval* p, void* z) { return 0; } +extern "C" int ebbrt_newlib_fcntl(int s, int cmd) { + EBBRT_UNIMPLEMENTED(); + return 0; +} + +extern "C" char* ebbrt_newlib_getcwd(char *buf, size_t size) { + EBBRT_UNIMPLEMENTED(); + return 0; +} + +extern "C" int ebbrt_newlib_dup(int oldfd) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_clock_gettime () +{ + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_clock_settime() +{ + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_clock_getres() +{ + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_closedir(DIR *d) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_opendir(const char* c) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" void ebbrt_newlib_getppid() { + EBBRT_UNIMPLEMENTED(); +} + +extern "C" struct dirent * ebbrt_newlib_readdir(DIR *d) +{ + EBBRT_UNIMPLEMENTED(); + return NULL; +} + +extern "C" int ebbrt_newlib_pipe (int *fd) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_sched_yield() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" void ebbrt_newlib_umask () { + EBBRT_UNIMPLEMENTED(); +} + +extern "C" int ebbrt_newlib_symlink(const char *path1, const char *path2) +{ + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_rmdir(const 
char *path) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_mkdir(const char *path) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_chdir(const char *path) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" char* ebbrt_newlib_ttyname(int) { + EBBRT_UNIMPLEMENTED(); + return NULL; +} + +extern "C" int ebbrt_newlib_fdatasync(int) { + EBBRT_UNIMPLEMENTED(); + return 0; +} + +extern "C" int ebbrt_newlib_getuid() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_getgid() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_geteuid() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_getegid() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_fsync(int) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_execv (const char *path, char *const argv[]) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_chmod() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_access (const char *fn, int flags) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_utime (const char *path, char *times) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_lstat () { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" void ebbrt_newlib_getpwnam () { + EBBRT_UNIMPLEMENTED(); +} +extern "C" void ebbrt_newlib_getpwuid () { + EBBRT_UNIMPLEMENTED(); +} + +extern "C" int ebbrt_newlib_select () { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_getrusage() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_getrlimit() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_setrlimit() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + struct RLock { static const constexpr uint32_t kNoOwner = -1; uint32_t owner; diff --git a/src/native/Newlib.h b/src/native/Newlib.h index 
502d4c61..3149d96f 100644 --- a/src/native/Newlib.h +++ b/src/native/Newlib.h @@ -14,7 +14,8 @@ extern "C" { typedef void* _LOCK_T; typedef void* _LOCK_RECURSIVE_T; - +typedef struct __dirstream DIR; + extern void ebbrt_newlib_lock_init(_LOCK_T*); extern void ebbrt_newlib_lock_init_recursive(_LOCK_RECURSIVE_T*); extern void ebbrt_newlib_lock_close(_LOCK_T*); @@ -48,7 +49,42 @@ extern void* ebbrt_newlib_realloc(void*, size_t); extern void* ebbrt_newlib_calloc(size_t, size_t); extern void* ebbrt_newlib_memalign(size_t, size_t); extern int ebbrt_newlib_gettimeofday(struct timeval *, void *); - +extern int ebbrt_newlib_fcntl(int , int); +extern char* ebbrt_newlib_getcwd(char * , size_t); +extern int ebbrt_newlib_dup(int); +extern int ebbrt_newlib_clock_gettime(); +extern int ebbrt_newlib_clock_settime(); +extern int ebbrt_newlib_clock_getres(); +extern int ebbrt_newlib_closedir (DIR *); +extern int ebbrt_newlib_opendir (const char*); +extern void ebbrt_newlib_getppid(); +extern struct dirent * ebbrt_newlib_readdir(DIR *d); +extern int ebbrt_newlib_pipe (int *); +extern int ebbrt_newlib_sched_yield(); +extern void ebbrt_newlib_umask (); +extern int ebbrt_newlib_symlink(const char *path1, const char *path2); +extern int ebbrt_newlib_rmdir(const char *path); +extern int ebbrt_newlib_mkdir(const char *path); +extern int ebbrt_newlib_chdir(const char *path); +extern char* ebbrt_newlib_ttyname(int); +extern int ebbrt_newlib_fdatasync(int); +extern int ebbrt_newlib_getuid(); +extern int ebbrt_newlib_getgid(); +extern int ebbrt_newlib_geteuid(); +extern int ebbrt_newlib_getegid(); +extern int ebbrt_newlib_fsync(int); +extern int ebbrt_newlib_execv (const char *path, char *const argv[]); +extern int ebbrt_newlib_chmod(); +extern int ebbrt_newlib_access (const char *fn, int flags); +extern int ebbrt_newlib_utime (const char *path, char *times); +extern int lstat (); +extern void ebbrt_newlib_getpwnam (); +extern void ebbrt_newlib_getpwuid (); +extern int ebbrt_newlib_select (); 
+extern int ebbrt_newlib_getrusage(); +extern int ebbrt_newlib_getrlimit(); +extern int ebbrt_newlib_setrlimit(); + #ifdef __cplusplus } #endif diff --git a/src/native/Pci.cc b/src/native/Pci.cc index cdd53dae..83a4674e 100644 --- a/src/native/Pci.cc +++ b/src/native/Pci.cc @@ -9,6 +9,7 @@ #include "../Align.h" #include "../ExplicitlyConstructed.h" #include "Debug.h" +#include "GeneralPurposeAllocator.h" #include "Io.h" #include "VMem.h" #include "VMemAllocator.h" @@ -34,7 +35,11 @@ uint8_t PciRead8(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { } uint16_t PciRead16(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { PciSetAddr(bus, device, func, offset); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + return ebbrt::io::In16(kPciDataPort + (offset & 2)); +#else return ebbrt::io::In16(kPciDataPort); +#endif } uint32_t PciRead32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { @@ -45,7 +50,12 @@ uint32_t PciRead32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { void PciWrite16(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset, uint16_t val) { PciSetAddr(bus, device, func, offset); + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + ebbrt::io::Out16(kPciDataPort + (offset & 2), val); +#else ebbrt::io::Out16(kPciDataPort, val); +#endif } void PciWrite32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset, @@ -71,8 +81,12 @@ void EnumerateBus(uint8_t bus) { if (dev) continue; + dev.DumpAddress(); + dev.DumpInfo(); + if (dev.IsBridge()) { - ebbrt::kabort("Secondary bus unsupported!\n"); + // ebbrt::kabort("Secondary bus unsupported!\n"); + continue; } else { devices->emplace_back(bus, device, func); } @@ -101,6 +115,11 @@ void ebbrt::pci::Init() { devices.construct(); driver_probes.construct(); EnumerateAllBuses(); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // TODO - Kludge to identify where NIC sits in device tree, should incorporate + // Dan's pull request for enumerating bridges + EnumerateBus(0x4); +#endif } void 
ebbrt::pci::RegisterProbe(std::function probe) { @@ -149,6 +168,18 @@ uint16_t ebbrt::pci::Function::GetCommand() const { return Read16(kCommandAddr); } +uint8_t ebbrt::pci::Function::GetClassCode() const { + return Read8(kClassCodeAddr); +} + +uint8_t ebbrt::pci::Function::GetFunc() const { return func_; } + +uint8_t ebbrt::pci::Function::GetSubclass() const { + return Read8(kSubclassAddr); +} + +uint8_t ebbrt::pci::Function::GetProgIf() const { return Read8(kProgIfAddr); } + uint8_t ebbrt::pci::Function::GetHeaderType() const { return Read8(kHeaderTypeAddr) & ~kHeaderMultifuncMask; } @@ -187,6 +218,11 @@ void ebbrt::pci::Function::DumpAddress() const { kprintf("%u:%u:%u\n", bus_, device_, func_); } +void ebbrt::pci::Function::DumpInfo() const { + kprintf("Vendor ID: 0x%x ", GetVendorId()); + kprintf("Device ID: 0x%x\n", GetDeviceId()); +} + ebbrt::pci::Bar::Bar(pci::Device& dev, uint32_t bar_val, uint8_t idx) : vaddr_(nullptr), is_64_(false), prefetchable_(false) { mmio_ = !(bar_val & kIoSpaceFlag); @@ -226,6 +262,8 @@ ebbrt::pci::Bar::~Bar() { kbugon(vaddr_ != nullptr, "pci::Bar: Need to free mapped region\n"); } +void* ebbrt::pci::Bar::GetVaddr() { return vaddr_; } + bool ebbrt::pci::Bar::Is64() const { return is_64_; } void ebbrt::pci::Bar::Map() { @@ -233,10 +271,21 @@ void ebbrt::pci::Bar::Map() { return; auto npages = align::Up(size_, pmem::kPageSize) >> pmem::kPageShift; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + auto pf = std::make_unique(); + auto& ref = *pf; + auto page = vmem_allocator->Alloc(npages, std::move(pf)); + vaddr_ = reinterpret_cast(page.ToAddr()); + kbugon(page == Pfn::None(), "Failed to allocate virtual pages for mmio\n"); + vmem::MapMemory(page, Pfn::Down(addr_), size_); + ref.SetMap(page, Pfn::Down(addr_), size_); +#else auto page = vmem_allocator->Alloc(npages); vaddr_ = reinterpret_cast(page.ToAddr()); kbugon(page == Pfn::None(), "Failed to allocate virtual pages for mmio\n"); vmem::MapMemory(page, Pfn::Down(addr_), size_); +#endif } 
uint8_t ebbrt::pci::Bar::Read8(size_t offset) { @@ -415,7 +464,15 @@ void ebbrt::pci::Device::SetMsixEntry(size_t entry, uint8_t vector, uint8_t dest) { auto& msix_bar = GetBar(msix_bar_idx_); auto offset = msix_table_offset_ + entry * kMsixTableEntrySize; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // more precise + msix_bar.Write32(offset + kMsixTableEntryAddrLow, 0xFEE00000 | dest << 12); + msix_bar.Write32(offset + kMsixTableEntryAddrHigh, 0x0); +#else msix_bar.Write32(offset + kMsixTableEntryAddr, 0xFEE00000 | dest << 12); +#endif + msix_bar.Write32(offset + kMsixTableEntryData, vector); MsixUnmaskEntry(entry); } diff --git a/src/native/Pci.h b/src/native/Pci.h index 15bc1dce..3d40af31 100644 --- a/src/native/Pci.h +++ b/src/native/Pci.h @@ -30,6 +30,7 @@ class Function { uint8_t GetLatencyTimer() const; uint8_t GetHeaderType() const; uint8_t GetBist() const; + uint8_t GetFunc() const; operator bool() const; bool IsMultifunc() const; @@ -40,6 +41,7 @@ class Function { void DisableInt(); void DumpAddress() const; + void DumpInfo() const; protected: static const constexpr uint8_t kVendorIdAddr = 0x00; @@ -87,6 +89,7 @@ class Bar { void Write8(size_t offset, uint8_t val); void Write16(size_t offset, uint16_t val); void Write32(size_t offset, uint32_t val); + void* GetVaddr(); private: static const constexpr uint32_t kIoSpaceFlag = 0x1; @@ -166,6 +169,8 @@ class Device : public Function { static const constexpr size_t kMsixTableEntryAddr = 0; static const constexpr size_t kMsixTableEntryData = 8; static const constexpr size_t kMsixTableEntryControl = 12; + static const constexpr size_t kMsixTableEntryAddrLow = 0; + static const constexpr size_t kMsixTableEntryAddrHigh = 4; static const constexpr uint32_t kMsixTableEntryControlMaskBit = 1; diff --git a/src/native/Perf.cc b/src/native/Perf.cc index c07edd6d..b62ad115 100644 --- a/src/native/Perf.cc +++ b/src/native/Perf.cc @@ -150,14 +150,14 @@ ebbrt::perf::PerfCounter::PerfCounter(ebbrt::perf::PerfEvent evt) : 
evt_{evt} { if (((pmcs >> i) & 0x1) == 0) { pmc_num_ = i; pmcs |= (0x1u << i); - kprintf("DEBUG#%d %x \n", pmc_num_, pmcs); + //kprintf("DEBUG#%d %x \n", pmc_num_, pmcs); perfevtsel.usermode = 1; perfevtsel.osmode = 1; perfevtsel.en = 1; ebbrt::msr::Write(kIa32PerfEvtSelMsr(pmc_num_), perfevtsel.val); counter_offset_ = ebbrt::msr::Read(kIa32Pmc(pmc_num_)); - kprintf("Perf counter #%d initialized to evt=%u\n", pmc_num_, - static_cast(evt_)); + //kprintf("Perf counter #%d initialized to evt=%u\n", pmc_num_, + // static_cast(evt_)); return; } } diff --git a/src/native/Rapl.cc b/src/native/Rapl.cc new file mode 100644 index 00000000..a1438b9d --- /dev/null +++ b/src/native/Rapl.cc @@ -0,0 +1,11 @@ +#include "Debug.h" +//#include "Msr.h" +#include "Rapl.h" + +ebbrt::rapl::RaplCounter::~RaplCounter() { + return; +} + +double ebbrt::rapl::RaplCounter::Read() { + return counter_offset; +} diff --git a/src/native/Rapl.h b/src/native/Rapl.h new file mode 100644 index 00000000..8d92ea79 --- /dev/null +++ b/src/native/Rapl.h @@ -0,0 +1,177 @@ +// Copyright Boston University SESA Group 2013 - 2016. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +#ifndef BAREMETAL_SRC_INCLUDE_EBBRT_RAPL_H_ +#define BAREMETAL_SRC_INCLUDE_EBBRT_RAPL_H_ +#include +#include + +#include "Debug.h" +#include "Msr.h" +#include "Clock.h" + +namespace ebbrt { +namespace rapl { + const constexpr uint32_t kMsrIntelRaplPowerUnit = 0x606; + + /* Package RAPL Domain */ + const constexpr uint32_t kMsrPkgRaplPowerLimit = 0x610; + const constexpr uint32_t kMsrIntelPkgEnergyStatus = 0x611; + const constexpr uint32_t kMsrPkgPerfStatus = 0x613; + const constexpr uint32_t kMsrPkgPowerInfo = 0x614; + + /* PP0 RAPL Domain */ + const constexpr uint32_t kMsrPp0PowerLimit = 0x638; + const constexpr uint32_t kMsrIntelPp0EnergyStatus = 0x639; + const constexpr uint32_t kMsrPp0Policy = 0x63A; + const constexpr uint32_t kMsrPp0PerfStatus = 0x63B; + + /* PP1 RAPL Domain, may reflect to uncore devices */ + const constexpr uint32_t kMsrPp1PowerLimit = 0x640; + const constexpr uint32_t kMsrPp1EnergyStatus = 0x641; + const constexpr uint32_t kMsrPp1Polcy = 0x642; + + /* DRAM RAPL Domain */ + const constexpr uint32_t kMsrDramPowerLimit = 0x618; + const constexpr uint32_t kMsrDramEnergyStatus = 0x619; + const constexpr uint32_t kMsrDramPerfStatus = 0x61B; + const constexpr uint32_t kMsrDramPowerInfo = 0x61C; + + /* PSYS RAPL Domain */ + const constexpr uint32_t kMsrPlatformEnergyStatus = 0x64d; + + /* RAPL UNIT BITMASK */ + const constexpr uint32_t POWER_UNIT_OFFSET = 0; + const constexpr uint32_t POWER_UNIT_MASK = 0x0F; + + const constexpr uint32_t ENERGY_UNIT_OFFSET = 0x08; + const constexpr uint32_t ENERGY_UNIT_MASK = 0x1F00; + + const constexpr uint32_t TIME_UNIT_OFFSET = 0x10; + const constexpr uint32_t TIME_UNIT_MASK = 0xF000; + + class RaplCounter { //: public Timer::Hook { + public: + RaplCounter() { + uint64_t res = ebbrt::msr::Read(kMsrIntelRaplPowerUnit); + rapl_power_units = pow(0.5,(double)(res&0xf)); + rapl_cpu_energy_units = 
pow(0.5,(double)((res>>8)&0x1f)); + rapl_time_units = pow(0.5,(double)((res>>16)&0xf)); + rapl_dram_energy_units = rapl_cpu_energy_units; + }; + // move constructors + RaplCounter(RaplCounter&& other) { + rapl_power_units = other.rapl_power_units; + rapl_cpu_energy_units = other.rapl_cpu_energy_units; + rapl_time_units = other.rapl_time_units; + rapl_dram_energy_units = other.rapl_dram_energy_units; + + other.rapl_power_units = 0.0; + other.rapl_cpu_energy_units = 0.0; + other.rapl_time_units = 0.0; + other.rapl_dram_energy_units = 0.0; + }; + RaplCounter& operator=(RaplCounter&& other) { + rapl_power_units = other.rapl_power_units; + rapl_cpu_energy_units = other.rapl_cpu_energy_units; + rapl_time_units = other.rapl_time_units; + rapl_dram_energy_units = other.rapl_dram_energy_units; + + other.rapl_power_units = 0.0; + other.rapl_cpu_energy_units = 0.0; + other.rapl_time_units = 0.0; + other.rapl_dram_energy_units = 0.0; + + return *this; + }; + + // delete implicitly created copy constructor + RaplCounter(const RaplCounter& other) = delete; + RaplCounter& operator=(const RaplCounter& other) = delete; + + ~RaplCounter(); + + void Clear() { + counter_offset = 0.0; + } + + uint64_t ReadMsr() + { + return ebbrt::msr::Read(kMsrIntelPkgEnergyStatus); + } + + void Start() { + uint64_t res = ebbrt::msr::Read(kMsrIntelPkgEnergyStatus); + counter_offset = (double)res*rapl_cpu_energy_units; + + /*ebbrt::kprintf("\t\tPower units = %.3fW\n",rapl_power_units); + ebbrt::kprintf("\t\tCPU Energy units = %.8fJ\n",rapl_cpu_energy_units); + ebbrt::kprintf("\t\tDRAM Energy units = %.8fJ\n",rapl_dram_energy_units); + ebbrt::kprintf("\t\tTime units = %.8fs\n",rapl_time_units); */ + //ebbrt::kprintf("Package Energy before: %.6fJ\n", counter_offset); + } + + void Stop() { + uint64_t res = ebbrt::msr::Read(kMsrIntelPkgEnergyStatus); + double after = (double)res*rapl_cpu_energy_units; + //ebbrt::kprintf("Package Energy after: %.6fJ\n", after); + counter_offset = after - counter_offset; + 
ebbrt::kprintf_force("Total Package Energy used: %.6fJ\n", counter_offset); + } + + void SetLimit(uint32_t v) { + uint64_t result = ebbrt::msr::Read(kMsrPkgRaplPowerLimit); + uint64_t m = 0x7FFF; + uint32_t npower = (uint32_t)(v / 0.125); + + // resetting values + result = result & (~m); + result = result & (~(m << 32)); + + // new power + result = result | npower; + result = result | ((uint64_t)npower << 32); + + // set clamp + result |= 1LL << 15; + result |= 1LL << 16; + result |= 1LL << 47; + result |= 1LL << 48; + + uint32_t low = result & 0xFFFFFFFF; + uint32_t high = (result >> 32) & 0xFFFFFFFF; + asm volatile("wrmsr" : : "c"(kMsrPkgRaplPowerLimit), "a"(low), "d"(high)); + + /*result=ebbrt::msr::Read(kMsrPkgRaplPowerLimit); + ebbrt::kprintf("%u Package power limits are %s\n", v, (result >> 63) ? "locked" : "unlocked"); + double pkg_power_limit_1 = rapl_power_units*(double)((result>>0)&0x7FFF); + double pkg_time_window_1 = rapl_time_units*(double)((result>>17)&0x007F); + ebbrt::kprintf("Package power limit #1: %.3fW for %.6fs (%s, %s)\n", + pkg_power_limit_1, pkg_time_window_1, + (result & (1LL<<15)) ? "enable power limit" : "disabled", + (result & (1LL<<16)) ? "clamped" : "not_clamped"); + double pkg_power_limit_2 = rapl_power_units*(double)((result>>32)&0x7FFF); + double pkg_time_window_2 = rapl_time_units*(double)((result>>49)&0x007F); + ebbrt::kprintf("Package power limit #2: %.3fW for %.6fs (%s, %s)\n", + pkg_power_limit_2, pkg_time_window_2, + (result & (1LL<<47)) ? "enable power limit" : "disabled", + (result & (1LL<<48)) ? 
"clamped" : "not_clamped"); + */ + } + + double Read(); + private: + double rapl_power_units{0.0}; + double rapl_cpu_energy_units{0.0}; + double rapl_time_units{0.0}; + double rapl_dram_energy_units{0.0}; + double counter_offset{0.0}; + //void Fire() override; + }; + +} // namespace rapl +} // namespace ebbrt + +#endif diff --git a/src/native/Timer.cc b/src/native/Timer.cc index 6069be7d..a019f49a 100644 --- a/src/native/Timer.cc +++ b/src/native/Timer.cc @@ -10,9 +10,19 @@ const constexpr ebbrt::EbbId ebbrt::Timer::static_id; +//uint32_t timerCnt[16]; +//uint32_t fireCntA[16]; +//uint32_t fireCntB[16]; + ebbrt::Timer::Timer() { + //uint32_t mycpu = static_cast(Cpu::GetMine()); + + //timerCnt[mycpu] += 1; +// auto interrupt = event_manager->AllocateVector([this, mycpu]() { auto interrupt = event_manager->AllocateVector([this]() { auto now = clock::Wall::Now().time_since_epoch(); + //fireCntA[mycpu] ++; + while (!timers_.empty() && timers_.begin()->fire_time_ <= now) { auto& hook = *timers_.begin(); @@ -25,6 +35,7 @@ ebbrt::Timer::Timer() { timers_.insert(hook); } + //fireCntB[mycpu] ++; hook.Fire(); now = clock::Wall::Now().time_since_epoch(); diff --git a/src/native/Uart8250.cc b/src/native/Uart8250.cc index eba7cd25..e510e04f 100644 --- a/src/native/Uart8250.cc +++ b/src/native/Uart8250.cc @@ -11,7 +11,7 @@ #include "Io.h" namespace { -const constexpr uint16_t kPort = 0x3f8; +const constexpr uint16_t kPort = 0x2f8; // when DLAB = 0 const constexpr uint16_t kDataReg = 0; const constexpr uint16_t kIntEnable = 1; diff --git a/src/native/VMemAllocator.cc b/src/native/VMemAllocator.cc index 173fdc23..e61df2a5 100644 --- a/src/native/VMemAllocator.cc +++ b/src/native/VMemAllocator.cc @@ -118,6 +118,22 @@ ebbrt::VMemAllocator::Alloc(size_t npages, size_t pages_align, npages); } +// Recursive page table walker +struct Frame { + struct Frame *next; + uint64_t rip; +}; + +void dumpFrames(int n, struct Frame *s) +{ + struct Frame *f; + int i; + for (i=0,f=s; inext,i++) { + 
ebbrt::kprintf("FRAME[%d]: %p RIP: 0x%llx next:%p\n", -1*i, f, + f->rip, f->next); + } +} + void ebbrt::VMemAllocator::HandlePageFault(idt::ExceptionFrame* ef) { std::lock_guard lock(lock_); auto fault_addr = ReadCr2(); diff --git a/src/native/VirtioNet.cc b/src/native/VirtioNet.cc index 61064a2f..85a3ec22 100644 --- a/src/native/VirtioNet.cc +++ b/src/native/VirtioNet.cc @@ -164,6 +164,9 @@ ebbrt::VirtioNetRep::VirtioNetRep(const VirtioNetDriver& root) receive_callback_([this]() { ReceivePoll(); }), circ_buffer_head_(0), circ_buffer_tail_(0) {} +void ebbrt::VirtioNetDriver::Config(std::string s, uint32_t v) {} +std::string ebbrt::VirtioNetDriver::ReadNic() { return ""; } + void ebbrt::VirtioNetDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { ebb_->Send(std::move(buf), std::move(pinfo)); diff --git a/src/native/VirtioNet.h b/src/native/VirtioNet.h index 63b616b5..4387d46e 100644 --- a/src/native/VirtioNet.h +++ b/src/native/VirtioNet.h @@ -24,6 +24,8 @@ class VirtioNetDriver : public VirtioDriver, static void Create(pci::Device& dev); static uint32_t GetDriverFeatures(); void Send(std::unique_ptr buf, PacketInfo pinfo) override; + void Config(std::string s, uint32_t v) override; + std::string ReadNic() override; const EthernetAddress& GetMacAddress() override; private: diff --git a/src/native/config.cmake b/src/native/config.cmake index f3831979..7c477114 100644 --- a/src/native/config.cmake +++ b/src/native/config.cmake @@ -1,6 +1,7 @@ # EbbRT native platform-specific configuration -option(__EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ "Enable Distributed Runtime Support" ON) +option(__EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ "Enable Distributed Runtime Support" OFF) option(__EBBRT_ENABLE_NETWORKING__ "Enable Networking" ON) +option(__EBBRT_ENABLE_BAREMETAL_NIC__ "Enable Baremetal NIC" ON) option(__EBBRT_ENABLE_TRACE__ "Enable Tracing Subsystem" OFF) option(LARGE_WINDOW_HACK "Enable Large TCP Window Hack" OFF) option(PAGE_CHECKER "Enable Page Checker" OFF) @@ -12,7 +13,7 @@ 
configure_file(${PLATFORM_SOURCE_DIR}/config.h.in config.h @ONLY) set(CMAKE_CXX_FLAGS "-Wall -Werror -std=gnu++14 -include ${CMAKE_CURRENT_BINARY_DIR}/config.h") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") -set(CMAKE_CXX_FLAGS_RELEASE "-O4 -flto -DNDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-O4 -flto") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g3") set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) set(CMAKE_ASM_FLAGS "-DASSEMBLY") diff --git a/src/native/config.h.in b/src/native/config.h.in index 5344cd86..f90587bf 100644 --- a/src/native/config.h.in +++ b/src/native/config.h.in @@ -1,6 +1,7 @@ #cmakedefine __EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ #cmakedefine __EBBRT_ENABLE_NETWORKING__ #cmakedefine __EBBRT_ENABLE_TRACE__ +#cmakedefine __EBBRT_ENABLE_BAREMETAL_NIC__ #cmakedefine LARGE_WINDOW_HACK #cmakedefine PAGE_CHECKER #cmakedefine VIRTIO_ZERO_COPY diff --git a/toolchain/patches/newlib-2.0.0.patch b/toolchain/patches/newlib-2.0.0.patch index 51c9e675..71c3194b 100644 --- a/toolchain/patches/newlib-2.0.0.patch +++ b/toolchain/patches/newlib-2.0.0.patch @@ -116,7 +116,7 @@ new file mode 100644 index 0000000..68b7fbe --- /dev/null +++ b/newlib/libc/sys/ebbrt/syscalls.c -@@ -0,0 +1,145 @@ +@@ -0,0 +1,304 @@ +#include +#include +#include @@ -262,3 +262,162 @@ index 0000000..68b7fbe + return ebbrt_newlib_gettimeofday(p, z); +} + ++int fcntl(int s, int cmd, ...) 
++{ ++ return ebbrt_newlib_fcntl(s, cmd); ++} ++ ++char* getcwd(char *buf, size_t size) ++{ ++ return ebbrt_newlib_getcwd(buf, size); ++} ++ ++int dup(int oldfd) ++{ ++ return ebbrt_newlib_dup(oldfd); ++} ++ ++int clock_gettime (clockid_t clock_id, struct timespec *tp) ++{ ++ return ebbrt_newlib_clock_gettime(); ++} ++int clock_settime (clockid_t clock_id, const struct timespec *tp) ++{ ++ return ebbrt_newlib_clock_settime(); ++} ++int clock_getres (clockid_t clock_id, struct timespec *res) ++{ ++ return ebbrt_newlib_clock_getres(); ++} ++ ++int closedir (DIR *d) ++{ ++ return ebbrt_newlib_closedir(d); ++} ++ ++DIR *opendir(const char * c) ++{ ++ return ebbrt_newlib_opendir(c); ++} ++ ++pid_t getppid() ++{ ++ ebbrt_newlib_getppid(); ++ return -1; ++} ++ ++struct dirent * readdir(DIR *d) ++{ ++ return ebbrt_newlib_readdir(d); ++} ++ ++int pipe (int *fd) ++{ ++ return ebbrt_newlib_pipe(fd); ++} ++ ++int sched_yield () ++{ ++ return ebbrt_newlib_sched_yield(); ++} ++ ++mode_t umask (mode_t mask) ++{ ++ ebbrt_newlib_umask(); ++ return -1; ++} ++ ++int symlink(const char *path1, const char *path2) ++{ ++ return ebbrt_newlib_symlink(path1, path2); ++} ++ ++int rmdir(const char *path) ++{ ++ return ebbrt_newlib_rmdir(path); ++} ++int mkdir (const char *path, mode_t mode) ++{ ++ return ebbrt_newlib_mkdir(path); ++} ++ ++int chdir (const char *path) ++{ ++ return ebbrt_newlib_chdir(path); ++} ++ ++char* ttyname(int fd) ++{ ++ return ebbrt_newlib_ttyname(fd); ++} ++int fdatasync(int fd) ++{ ++ return ebbrt_newlib_fdatasync(fd); ++} ++uid_t getuid() { ++ return ebbrt_newlib_getuid(); ++} ++ ++uid_t getgid(void) { ++ return ebbrt_newlib_getgid(); ++} ++ ++uid_t geteuid() { ++ return ebbrt_newlib_geteuid(); ++} ++ ++uid_t getegid(void) { ++ return ebbrt_newlib_getegid(); ++} ++ ++int fsync (int fd) ++{ ++ return ebbrt_newlib_fsync(fd); ++} ++int execv (const char *path, char *const argv[]) ++{ ++ return ebbrt_newlib_execv (path, argv); ++} ++int chmod (const char *path, mode_t 
mode) ++{ ++ return ebbrt_newlib_chmod(); ++} ++int access (const char *fn, int flags) ++{ ++ return ebbrt_newlib_access(fn, flags); ++} ++int utime (const char *path, char *times) ++{ ++ return ebbrt_newlib_utime(path, times); ++} ++int lstat (const char *__restrict pathname, struct stat *__restrict pstat) ++{ ++ return ebbrt_newlib_lstat(); ++} ++struct passwd* getpwnam (const char *name) ++{ ++ ebbrt_newlib_getpwnam(); ++ return NULL; ++} ++struct passwd* getpwuid (uid_t uid) ++{ ++ ebbrt_newlib_getpwuid(); ++ return NULL; ++} ++int select (int n, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout) ++{ ++ return ebbrt_newlib_select(); ++} ++int getrusage(int who, struct rusage *rusage) ++{ ++ return ebbrt_newlib_getrusage(); ++} ++ ++int getrlimit(int resource, struct rlimit* rlim) ++{ ++ return ebbrt_newlib_getrlimit(); ++} ++int setrlimit(int resource, const struct rlimit* rlim) ++{ ++ return ebbrt_newlib_setrlimit(); ++}