# adsim: move to FBGEMM v1.4.0 / PyTorch 2.8.0 and make the build
# architecture-aware (x86_64 and aarch64).
#
#   buildfiles/fbgemm/cmake.txt    stage the asmjit headers next to the fbgemm ones
#   install_fbgemm.sh              bump FBGEMM_VERSION and PYTORCH_VERSION
#   patches/treadmill.patch        pick dnf or apt-get for the NUMA packages; use an
#                                  architecture-specific spin-wait hint in Scheduler.cpp
#   src/cpp2/server/CMakeLists.txt per-architecture flags for folly_memcpy_obj; link its
#                                  objects (and the -fno-builtin-* flags) on x86 only
#   src/cpp2/server/dwarfs/CMakeLists.txt
#                                  per-architecture flags for gemm and embedding
#
# Review fix: the hunk replacing target_compile_options(embedding ...) used to add
# its options to `gemm`, leaving `embedding` without any of its compile options.
# Both calls in that hunk now name `embedding`.
#
# NOTE(review): the generator expression on the adsim_server link lines was reduced
# to a bare `$` in the copy under review; it is restored here as
# $<TARGET_OBJECTS:folly_memcpy_obj> -- confirm against the working tree.
# NOTE(review): line breaks, blank context lines and context indentation were
# reconstructed to satisfy the hunk line counts; if a hunk is rejected, retry with
# `git apply --ignore-whitespace` or regenerate the patch.
# NOTE(review): on aarch64, -march=armv8.5-a+sve2 is passed together with
# -mcpu=native; verify that the toolchain accepts the pair without a conflict
# warning and that every deployment CPU implements SVE2.

diff --git a/packages/adsim/buildfiles/fbgemm/cmake.txt b/packages/adsim/buildfiles/fbgemm/cmake.txt
index baf263f9..720ddebf 100755
--- a/packages/adsim/buildfiles/fbgemm/cmake.txt
+++ b/packages/adsim/buildfiles/fbgemm/cmake.txt
@@ -39,6 +39,9 @@ cp libfbgemm.so* "${ADSIM_STAGING_DIR}/lib/" 2>/dev/null || true
 # Copy headers from source
 cp -r ../include/fbgemm "${ADSIM_STAGING_DIR}/include/" 2>/dev/null || true
 
+# Copy asmjit headers (new dependency in FBGEMM v1.4.0)
+cp -r ../external/asmjit/src/asmjit "${ADSIM_STAGING_DIR}/include/"
+
 echo "[BUILD] FBGEMM files copied to staging directory"
 
 # Try make install but don't fail if it errors
diff --git a/packages/adsim/install_fbgemm.sh b/packages/adsim/install_fbgemm.sh
index 2cae8601..34fdc40c 100755
--- a/packages/adsim/install_fbgemm.sh
+++ b/packages/adsim/install_fbgemm.sh
@@ -16,10 +16,10 @@ MINICONDA_PREFIX=${FBGEMM_STAGING_DIR}/miniconda
 
 
 # Version of FBGEMM to install
-FBGEMM_VERSION=v1.2.0
+FBGEMM_VERSION=v1.4.0
 
 # Version of PyTorch to install
-PYTORCH_VERSION=2.7.0
+PYTORCH_VERSION=2.8.0
 
 MINICONDA_VERSION="5.1-0"
 
diff --git a/packages/adsim/patches/treadmill.patch b/packages/adsim/patches/treadmill.patch
index b8055c93..aa0c0f12 100644
--- a/packages/adsim/patches/treadmill.patch
+++ b/packages/adsim/patches/treadmill.patch
@@ -1,7 +1,7 @@
 diff -wbBdu -ruN '--exclude=.git' '--exclude=*.rej' '--exclude=*.orig' '--exclude=gen-cpp2' '--exclude=build' '--exclude=third_party' treadmill-src/build.sh treadmill/build.sh
 --- treadmill-src/build.sh 1969-12-31 16:00:00.000000000 -0800
 +++ treadmill/build.sh 2025-08-04 15:44:21.731476550 -0700
-@@ -0,0 +1,42 @@
+@@ -0,0 +1,47 @@
 +#!/bin/bash
 +
 +# Exit on error
@@ -15,7 +15,12 @@ diff -wbBdu -ruN '--exclude=.git' '--exclude=*.rej' '--exclude=*.orig' '--exclud
 +INSTALL=false
 +
 +
-+sudo dnf install -y numactl-devel
++if command -v dnf >/dev/null 2>&1; then
++    sudo dnf install -y numactl-devel
++elif command -v apt-get >/dev/null 2>&1; then
++    sudo apt-get update
++    sudo apt-get install -y libnuma-dev numactl
++fi
 +# Create build directory
 +mkdir -p build
 +cd build
@@ -6977,7 +6982,7 @@ diff -wbBdu -ruN '--exclude=.git' '--exclude=*.rej' '--exclude=*.orig' '--exclud
 diff -wbBdu -ruN '--exclude=.git' '--exclude=*.rej' '--exclude=*.orig' '--exclude=gen-cpp2' '--exclude=build' '--exclude=third_party' treadmill-src/src/Scheduler.cpp treadmill/src/Scheduler.cpp
 --- treadmill-src/src/Scheduler.cpp 1969-12-31 16:00:00.000000000 -0800
 +++ treadmill/src/Scheduler.cpp 2025-08-04 15:44:21.785297704 -0700
-@@ -0,0 +1,107 @@
+@@ -0,0 +1,113 @@
 +/*
 + * Copyright (c) 2014, Facebook, Inc.
 + * All rights reserved.
@@ -7039,8 +7044,14 @@ diff -wbBdu -ruN '--exclude=.git' '--exclude=*.rej' '--exclude=*.orig' '--exclud
 +     to avoid memory order violation, which greatly improves its performance.
 +     http://siyobik.info.gf/main/reference/instruction/PAUSE */
 +  for (auto start = nowNs(); nowNs() - start < ns;) {
-+    asm volatile("pause");
-+  }
++#if defined(__x86_64__) || defined(__i386__)
++    asm volatile("pause");
++#elif defined(__aarch64__) || defined(__arm64__)
++    asm volatile("yield" ::: "memory");
++#else
++    asm volatile("" ::: "memory");
++#endif
++}
 +}
 +
 +/**
diff --git a/packages/adsim/src/cpp2/server/CMakeLists.txt b/packages/adsim/src/cpp2/server/CMakeLists.txt
index 621aecc0..31c5355c 100644
--- a/packages/adsim/src/cpp2/server/CMakeLists.txt
+++ b/packages/adsim/src/cpp2/server/CMakeLists.txt
@@ -9,8 +9,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
 endif()
 
 target_compile_definitions(folly_memcpy_obj PRIVATE FOLLY_MEMCPY_IS_MEMCPY)
-target_compile_options(folly_memcpy_obj PRIVATE
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+  target_compile_options(folly_memcpy_obj PRIVATE
   -mavx2 -march=haswell)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+  target_compile_options(folly_memcpy_obj PRIVATE
+    -march=armv8.5-a+sve2
+    -mcpu=native)
+endif()
 
 # Data objects library
 add_library(data_objects DataObjects.cpp)
@@ -61,13 +67,18 @@ target_link_libraries(adsim_server
   Atomic::Atomic
   ${FMT_LIBRARIES}
   ${JEMALLOC_LIB}
-  $<TARGET_OBJECTS:folly_memcpy_obj>
 )
+# Link folly_memcpy_obj only on x86 — on ARM the custom memmove causes
+# R_AARCH64_CALL26 relocation overflows and is a no-op anyway
+# (FOLLY_MEMCPY_IS_MEMCPY is defined).
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+  target_link_libraries(adsim_server
+    PUBLIC $<TARGET_OBJECTS:folly_memcpy_obj>)
+  target_compile_options(adsim_server PRIVATE
+    -fno-builtin-mempcpy
+    -fno-builtin-memmove)
+endif()
 
 # Linker and compiler options
 target_link_options(adsim_server PRIVATE "-Wl,--export-dynamic")
 
-target_compile_options(adsim_server PRIVATE
-  -fno-builtin-mempcpy
-  -fno-builtin-memmove
-)
diff --git a/packages/adsim/src/cpp2/server/dwarfs/CMakeLists.txt b/packages/adsim/src/cpp2/server/dwarfs/CMakeLists.txt
index 5a7c722f..ec6be90c 100644
--- a/packages/adsim/src/cpp2/server/dwarfs/CMakeLists.txt
+++ b/packages/adsim/src/cpp2/server/dwarfs/CMakeLists.txt
@@ -20,12 +20,19 @@ add_library(gemm GEMM.cc GEMM.h
   ${FBGEMM_SRC_DIR}/test/QuantizationHelpers.cc
 )
 
-target_compile_options(gemm PRIVATE
-  ${COROUTINES_FLAG}
-  -m64
-  -mavx2
-  -mfma
-  -masm=intel)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+  target_compile_options(gemm PRIVATE
+    ${COROUTINES_FLAG}
+    -m64
+    -mavx2
+    -mfma
+    -masm=intel)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+  target_compile_options(gemm PRIVATE
+    ${COROUTINES_FLAG}
+    -march=armv8.5-a+sve2
+    -mcpu=native)
+endif()
 
 target_link_directories(gemm PUBLIC
   ${ADSIM_STAGING_DIR}/include
@@ -40,12 +47,19 @@ add_dependencies(gemm fbgemm)
 
 add_library(embedding Embedding.cc Embedding.h)
 
-target_compile_options(embedding PRIVATE
-  ${COROUTINES_FLAG}
-  -m64
-  -mavx2
-  -mfma
-  -masm=intel)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64")
+  target_compile_options(embedding PRIVATE
+    ${COROUTINES_FLAG}
+    -m64
+    -mavx2
+    -mfma
+    -masm=intel)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+  target_compile_options(embedding PRIVATE
+    ${COROUTINES_FLAG}
+    -march=armv8.5-a+sve2
+    -mcpu=native)
+endif()
 
 target_link_directories(embedding PUBLIC
   ${ADSIM_STAGING_DIR}/include