From 96ce069025ba7ce2bbaa30ef73119fbd6af0125f Mon Sep 17 00:00:00 2001 From: wangzhe <734914022@qq.com> Date: Thu, 20 Mar 2025 03:18:51 +0000 Subject: [PATCH 1/2] fix c++ version --- CMakeLists.txt | 77 ++++++++++++++++++++++++++++++++++++++ README.md | 17 +++++++++ dmlc-core | 2 +- example/criteo_sgd.conf | 41 +++++++++++++++----- example/gisette_sgd.conf | 20 ++++++++++ include/difacto/store.h | 1 + src/reader/adfea_parser.h | 8 ++-- src/reader/criteo_parser.h | 2 +- tests/cpp/test.mk | 22 ----------- 9 files changed, 152 insertions(+), 38 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 example/gisette_sgd.conf delete mode 100644 tests/cpp/test.mk diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..3c884b6 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,77 @@ +cmake_minimum_required(VERSION 3.15 FATAL_ERROR) + +project(difacto VERSION 0.1 LANGUAGES C CXX) + +# 设置C++标准 +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +# 定义依赖路径 +set(DEPS_PATH ${CMAKE_CURRENT_SOURCE_DIR}/deps) + +# 配置选项 +option(USE_CITY "Build with CITY HASH support" ON) +option(USE_LZ4 "Build with LZ4 support" ON) +option(NO_REVERSE_ID "Disable reverse feature ID" OFF) + +# 添加路径定义 +include_directories( + ${CMAKE_CURRENT_SOURCE_DIR}/src + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/include + ${CMAKE_CURRENT_SOURCE_DIR}/ps-lite/include + ${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/src + ${DEPS_PATH}/include +) + +# 定义全局编译选项 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fopenmp -fPIC -O3 -ggdb -Wall -finline-functions -DDMLC_LOG_FATAL_THROW=0") + +if(NO_REVERSE_ID) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DREVERSE_FEATURE_ID=0") +endif() + +# 添加dmlc-core子项目 +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core) + +# 添加ps-lite子项目 +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ps-lite) + +# 编译difacto库 +file(GLOB_RECURSE DIFACTO_SRCS + src/loss/loss.cc + src/updater.cc + src/sgd/sgd_updater.cc + src/sgd/sgd_learner.cc + src/learner.cc + src/bcd/bcd_learner.cc + src/lbfgs/lbfgs_learner.cc + src/store/store.cc + src/tracker/tracker.cc + src/reporter/reporter.cc + src/data/localizer.cc + src/reader/batch_reader.cc +) + +add_library(difacto STATIC ${DIFACTO_SRCS}) + +# City Hash 依赖处理 +if(USE_CITY) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDIFACTO_USE_CITY=1") + target_link_libraries(difacto ${DEPS_PATH}/lib/libcityhash.a) +endif() + +# LZ4 依赖处理 +if(USE_LZ4) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDIFACTO_USE_LZ4=1") + target_link_libraries(difacto ${DEPS_PATH}/lib/liblz4.a) +endif() + +# 添加可执行文件 +add_executable(difacto_main src/main.cc) +set_target_properties(difacto_main PROPERTIES OUTPUT_NAME "difacto") +target_link_libraries(difacto_main difacto dmlc pslite ${CMAKE_THREAD_LIBS_INIT}) + +# 如果需要单元测试,在这里添加 +# add_subdirectory(tests) \ No newline at end of file diff --git a/README.md b/README.md index e05a907..61170ab 100644 --- a/README.md +++ b/README.md @@ -35,3 +35,20 @@ Origins from Mu Li, Ziqi Liu, Alex Smola, and Yu-Xiang Wang. DiFacto — Distributed Factorization Machines. In WSDM, 2016 + +## Build + +### 使用CMake构建(推荐) + +```bash +# 创建编译目录 +mkdir -p build +cd build + +# 配置和编译项目 +cmake .. +make -j4 + +# 运行示例 +./difacto task=train learner=sgd batch_size=100 data_in=/path/to/data V_dim=10 +``` diff --git a/dmlc-core b/dmlc-core index 56cab31..1334185 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 56cab3143f73c7dadb77fa0b3cb7079aca618d42 +Subproject commit 13341857549852a9a86b1894b5ba84c6276ab381 diff --git a/example/criteo_sgd.conf b/example/criteo_sgd.conf index 1d45535..d892ea2 100644 --- a/example/criteo_sgd.conf +++ b/example/criteo_sgd.conf @@ -1,19 +1,40 @@ # data -data_in = data/criteo_kaggle/criteo_train.rec -data_val = data/criteo_kaggle/criteo_val.rec -data_format = rec +# data_in = /workspace/public_data/criteo/dac/train.txt +# data_val = /workspace/public_data/criteo/dac/test.txt +# data_in = ./data/criteo.train_sample +# data_val = ./data/criteo.train_sample +data_in = ./data/criteo.train_sample_libsvm +data_val = ./data/criteo.train_sample_libsvm +# data_format = criteo +data_format = libsvm + +# 日志设置 +# log_level = 1 +# report_interval = 60 # 每60秒报告一次进度 # learner task = train learner = sgd -max_num_epochs = 10 -batch_size = 10000 +max_num_epochs = 5 +batch_size = 1000 -# linear term -l1 = 10 -l2 = 10 +# # linear term +# l1 = 10 +# l2 = 10 # embedding term -V_dim = 10 +V_dim = 8 V_threshold = 10 -V_l2 = 10 +# V_l2 = 10 + +# loss +loss = fm + +# learning rate +lr = 0.01 +lr_beta = 1 +V_lr = 0.01 +V_lr_beta = 1 + +# number of jobs per epoch +num_jobs_per_epoch = 10 diff --git a/example/gisette_sgd.conf b/example/gisette_sgd.conf new file mode 100644 index 0000000..8e0052b --- /dev/null +++ b/example/gisette_sgd.conf @@ -0,0 +1,20 @@ +# data +data_in = data/gisette_scale +data_val = data/gisette_scale.t + +# learner +task = train +learner = sgd +max_num_epochs = 2 +batch_size = 512 +lr = 0.02 +V_lr=.001 + +# linear term +l1 = 10 +l2 = 10 + +# embedding term +V_dim = 10 +V_threshold = 10 +V_l2 = 10 diff --git a/include/difacto/store.h b/include/difacto/store.h index 1b2b15e..de16d32 100644 --- a/include/difacto/store.h +++ b/include/difacto/store.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "./base.h" #include "dmlc/io.h" #include "dmlc/parameter.h" diff --git a/src/reader/adfea_parser.h b/src/reader/adfea_parser.h index 05d9d8d..3fd47cd 100644 --- a/src/reader/adfea_parser.h +++ b/src/reader/adfea_parser.h @@ -10,7 +10,7 @@ #include "difacto/base.h" #include "data/row_block.h" #include "data/parser.h" -#include "data/strtonum.h" +#include "dmlc/strtonum.h" namespace difacto { /** @@ -33,9 +33,9 @@ class AdfeaParser : public dmlc::data::ParserImpl { } bool ParseNext( std::vector > *data) override { - using dmlc::data::isspace; - using dmlc::data::isdigit; - using dmlc::data::strtoull; + using dmlc::isspace; + using dmlc::isdigit; + using dmlc::strtoull; dmlc::InputSplit::Blob chunk; diff --git a/src/reader/criteo_parser.h b/src/reader/criteo_parser.h index 87eee9f..8413ddc 100644 --- a/src/reader/criteo_parser.h +++ b/src/reader/criteo_parser.h @@ -13,7 +13,7 @@ #include "difacto/base.h" #include "data/row_block.h" #include "data/parser.h" -#include "data/strtonum.h" +#include "dmlc/strtonum.h" namespace difacto { /** diff --git a/tests/cpp/test.mk b/tests/cpp/test.mk deleted file mode 100644 index 70abe74..0000000 --- a/tests/cpp/test.mk +++ /dev/null @@ -1,22 +0,0 @@ -GTEST_PATH = /usr - -CPPTEST_SRC = $(wildcard tests/cpp/*_test.cc) -CPPTEST_OBJ = $(patsubst tests/cpp/%_test.cc, build/tests/%_test.o, $(CPPTEST_SRC)) - -build/tests/%.o : tests/cpp/%.cc ${DEPS} - @mkdir -p $(@D) - $(CXX) $(INCPATH) -std=c++0x -MM -MT build/tests/$*.o $< >build/tests/$*.d - $(CXX) $(CFLAGS) -c $< -o $@ - -build/difacto_tests: $(CPPTEST_OBJ) build/tests/main.o build/libdifacto.a $(DMLC_DEPS) - $(CXX) $(CFLAGS) -I$(GTEST_PATH)/include -o $@ $^ $(LDFLAGS) -L$(GTEST_PATH)/lib -lgtest - -CPPPERF_SRC = $(wildcard tests/cpp/*_perf.cc) -CPPPERF = $(patsubst tests/cpp/%_perf.cc, build/%_perf, $(CPPTEST_SRC)) - - -build/%_perf : tests/cpp/%_perf.cc build/libdifacto.a $(DMLC_DEPS) ${DEPS} - $(CXX) -std=c++0x $(CFLAGS) -MM -MT $@ $< >$@.d - $(CXX) -std=c++0x $(CFLAGS) -I$(GTEST_PATH)/include -o $@ $(filter %.cc %.a, $^) $(LDFLAGS) - -cpp-perf: $(CPPPERF) From 4acc8b2776c531af3a326955849fc872cffac5ed Mon Sep 17 00:00:00 2001 From: wangzhe <734914022@qq.com> Date: Thu, 20 Mar 2025 10:38:57 +0000 Subject: [PATCH 2/2] fix --- CMakeLists.txt | 24 ++++++++++++ Makefile | 79 -------------------------------------- README.md | 2 +- example/criteo_sgd.conf | 12 +++--- ps-lite | 2 +- src/reader/criteo_parser.h | 14 +++---- 6 files changed, 37 insertions(+), 96 deletions(-) delete mode 100644 Makefile diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c884b6..9de0fe2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,5 +73,29 @@ add_executable(difacto_main src/main.cc) set_target_properties(difacto_main PROPERTIES OUTPUT_NAME "difacto") target_link_libraries(difacto_main difacto dmlc pslite ${CMAKE_THREAD_LIBS_INIT}) +# 添加测试目标 +file(GLOB_RECURSE TEST_SRCS + tests/cpp/main.cc + tests/cpp/spmv_test.cc + tests/cpp/spmm_test.cc + tests/cpp/spmt_test.cc + tests/cpp/compressed_row_block_test.cc + tests/cpp/find_position_test.cc + tests/cpp/localizer_test.cc + tests/cpp/kv_match_test.cc + tests/cpp/kv_union_test.cc + tests/cpp/sgd_learner_test.cc + tests/cpp/bcd_learner_test.cc + tests/cpp/lbfgs_learner_test.cc + tests/cpp/lbfgs_twoloop_test.cc + tests/cpp/batch_reader_test.cc + tests/cpp/fm_loss_test.cc + tests/cpp/logit_loss_delta_test.cc + tests/cpp/data_store_test.cc +) + +add_executable(difacto_tests ${TEST_SRCS}) +target_link_libraries(difacto_tests difacto dmlc pslite gtest gtest_main pthread ${CMAKE_THREAD_LIBS_INIT}) + # 如果需要单元测试,在这里添加 # add_subdirectory(tests) \ No newline at end of file diff --git a/Makefile b/Makefile deleted file mode 100644 index 393ec7f..0000000 --- a/Makefile +++ /dev/null @@ -1,79 +0,0 @@ -# default configures, one can change it by passing new value to make. -# e.g. `make CXX=g++-4.9` -CXX = g++ -DEPS_PATH = $(shell pwd)/deps -USE_CITY=0 -USE_LZ4=1 -NO_REVERSE_ID=0 - -all: build/difacto - -INCPATH = -I./src -I./include -I./dmlc-core/include -I./ps-lite/include -I./dmlc-core/src -I$(DEPS_PATH)/include -PROTOC = ${DEPS_PATH}/bin/protoc -CFLAGS = -std=c++11 -fopenmp -fPIC -O3 -ggdb -Wall -finline-functions $(INCPATH) -DDMLC_LOG_FATAL_THROW=0 $(ADD_CFLAGS) - -ifeq ($(NO_REVERSE_ID), 1) -CFLAGS += -DREVERSE_FEATURE_ID=0 -endif - -include ps-lite/make/deps.mk - -ifeq ($(USE_CITY), 1) -DEPS += ${CITYHASH} -CFLAGS += -DDIFACTO_USE_CITY=1 -LDFLAGS += ${DEPS_PATH}/lib/libcityhash.a -endif - -ifeq ($(USE_LZ4), 1) -DEPS += ${LZ4} -CFLAGS += -DDIFACTO_USE_LZ4=1 -LDFLAGS += ${DEPS_PATH}/lib/liblz4.a -endif - - - -# LDFLAGS += $(addprefix $(DEPS_PATH)/lib/, libprotobuf.a libzmq.a) - -OBJS = $(addprefix build/, loss/loss.o \ -updater.o \ -sgd/sgd_updater.o sgd/sgd_learner.o \ -learner.o \ -bcd/bcd_learner.o \ -lbfgs/lbfgs_learner.o \ -store/store.o \ -tracker/tracker.o \ -reporter/reporter.o \ -data/localizer.o reader/batch_reader.o ) - -DMLC_DEPS = dmlc-core/libdmlc.a - -clean: - rm -rf build/* - make -C dmlc-core clean - make -C ps-lite clean - -lint: - python2 dmlc-core/scripts/lint.py difacto all include src tests/cpp - - -build/%.o: src/%.cc ${DEPS} - @mkdir -p $(@D) - $(CXX) $(INCPATH) -std=c++0x -MM -MT build/$*.o $< >build/$*.d - $(CXX) $(CFLAGS) -c $< -o $@ - -build/libdifacto.a: $(OBJS) - ar crv $@ $(filter %.o, $?) - -build/difacto: build/main.o build/libdifacto.a $(DMLC_DEPS) - $(CXX) $(CFLAGS) -o $@ $^ $(LDFLAGS) - -dmlc-core/libdmlc.a: - $(MAKE) -C dmlc-core libdmlc.a DEPS_PATH=$(DEPS_PATH) CXX=$(CXX) - -include tests/cpp/test.mk - - -test: build/difacto_tests - --include build/*.d --include build/*/*.d diff --git a/README.md b/README.md index 61170ab..afd3ccf 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ and train FM with 2-dimension on it. git clone --recursive https://github.com/dmlc/difacto cd difacto; git submodule update --init; make -j8 ./tools/download.sh gisette -build/difacto data_in=data/gisette_scale val_data=data/gisette_scale.t lr=.02 V_dim=2 V_lr=.001 +build/difacto data_in=data/gisette_scale val_data=data/gisette_scale.t lr=.02 V_dim=2 V_lr=.001 batch_size=256 ``` ### History diff --git a/example/criteo_sgd.conf b/example/criteo_sgd.conf index d892ea2..e4bcce0 100644 --- a/example/criteo_sgd.conf +++ b/example/criteo_sgd.conf @@ -1,12 +1,12 @@ # data # data_in = /workspace/public_data/criteo/dac/train.txt # data_val = /workspace/public_data/criteo/dac/test.txt -# data_in = ./data/criteo.train_sample -# data_val = ./data/criteo.train_sample -data_in = ./data/criteo.train_sample_libsvm -data_val = ./data/criteo.train_sample_libsvm -# data_format = criteo -data_format = libsvm +data_in = ./data/criteo.train_sample +data_val = ./data/criteo.train_sample +# data_in = ./data/criteo.train_sample_libsvm +# data_val = ./data/criteo.train_sample_libsvm +data_format = criteo +# data_format = libsvm # 日志设置 # log_level = 1 diff --git a/ps-lite b/ps-lite index e9de822..97b6e0d 160000 --- a/ps-lite +++ b/ps-lite @@ -1 +1 @@ -Subproject commit e9de8228ccc4756ce9afda8546b31e48a046321a +Subproject commit 97b6e0d5356e098a3e9effb8739eebc342d36f96 diff --git a/src/reader/criteo_parser.h b/src/reader/criteo_parser.h index 8413ddc..3d87c61 100644 --- a/src/reader/criteo_parser.h +++ b/src/reader/criteo_parser.h @@ -6,9 +6,8 @@ #ifndef DIFACTO_READER_CRITEO_PARSER_H_ #define DIFACTO_READER_CRITEO_PARSER_H_ #include -#if DIFACTO_USE_CITY -#include -#endif // DIFACTO_USE_CITY +#include +#include #include #include "difacto/base.h" #include "data/row_block.h" @@ -93,12 +92,9 @@ class CriteoParser : public dmlc::data::ParserImpl { private: inline feaid_t Hash(const char* p, size_t len) { -#if DIFACTO_USE_CITY - return CityHash64(p, len); -#else - LOG(FATAL) << "compile with USE_CITY=1"; - return 0; -#endif // DIFACTO_USE_CITY + std::string str(p, len); + std::hash hasher; + return hasher(str); } // implement strchr