Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 101 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
cmake_minimum_required(VERSION 3.15 FATAL_ERROR)

project(difacto
  VERSION 0.1
  LANGUAGES C CXX
)

# C++ language level: require C++11 and disable compiler-specific extensions.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Root of the third-party dependency tree; this file reads headers from
# <DEPS_PATH>/include and static libraries from <DEPS_PATH>/lib.
set(DEPS_PATH "${CMAKE_CURRENT_SOURCE_DIR}/deps")

# User-facing build switches (override with -D<NAME>=ON|OFF at configure time).
option(USE_CITY      "Build with CITY HASH support" ON)
option(USE_LZ4       "Build with LZ4 support"       ON)
option(NO_REVERSE_ID "Disable reverse feature ID"   OFF)

# Header search paths, listed in lookup order. They are set at directory scope,
# so they also apply to sub-directories added after this point.
include_directories(
  "${CMAKE_CURRENT_SOURCE_DIR}/src"
  "${CMAKE_CURRENT_SOURCE_DIR}/include"
  "${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/include"
  "${CMAKE_CURRENT_SOURCE_DIR}/ps-lite/include"
  "${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/src"
  "${DEPS_PATH}/include"
)

# Global compiler flags, appended to whatever the user or toolchain supplied.
# NOTE(review): -std=c++11 repeats what CMAKE_CXX_STANDARD already requests, and
# -O3/-ggdb are applied to every build configuration -- confirm this is intended.
string(APPEND CMAKE_CXX_FLAGS
  " -std=c++11 -fopenmp -fPIC -O3 -ggdb -Wall -finline-functions -DDMLC_LOG_FATAL_THROW=0")

# NO_REVERSE_ID=ON compiles the sources with REVERSE_FEATURE_ID defined to 0.
if(NO_REVERSE_ID)
  string(APPEND CMAKE_CXX_FLAGS " -DREVERSE_FEATURE_ID=0")
endif()

# Bundled sub-projects, each built from its own CMakeLists.txt. Relative paths
# are resolved against the current source directory.
add_subdirectory(dmlc-core)
add_subdirectory(ps-lite)

# Sources of the difacto static library.
#
# The files are listed explicitly instead of being collected with
# file(GLOB_RECURSE ...): globbing a literal file name silently drops the entry
# when the file is missing or misspelled, and also picks up any identically
# named file in a sub-directory. With an explicit list, CMake fails at
# generate time if a listed source does not exist.
set(DIFACTO_SRCS
  src/loss/loss.cc
  src/updater.cc
  src/sgd/sgd_updater.cc
  src/sgd/sgd_learner.cc
  src/learner.cc
  src/bcd/bcd_learner.cc
  src/lbfgs/lbfgs_learner.cc
  src/store/store.cc
  src/tracker/tracker.cc
  src/reporter/reporter.cc
  src/data/localizer.cc
  src/reader/batch_reader.cc
)

add_library(difacto STATIC ${DIFACTO_SRCS})

# Optional third-party dependencies of the difacto library.
#
# The feature macros and the static archives are attached to the `difacto`
# target as PUBLIC usage requirements rather than appended to CMAKE_CXX_FLAGS
# after the target was created. PUBLIC keeps the macros visible both when
# compiling difacto itself and when compiling every target that links it.

# CityHash: defines DIFACTO_USE_CITY=1 and links the prebuilt static archive.
if(USE_CITY)
  target_compile_definitions(difacto PUBLIC DIFACTO_USE_CITY=1)
  target_link_libraries(difacto PUBLIC "${DEPS_PATH}/lib/libcityhash.a")
endif()

# LZ4: defines DIFACTO_USE_LZ4=1 and links the prebuilt static archive.
if(USE_LZ4)
  target_compile_definitions(difacto PUBLIC DIFACTO_USE_LZ4=1)
  target_link_libraries(difacto PUBLIC "${DEPS_PATH}/lib/liblz4.a")
endif()

# Command-line executable. The target is named difacto_main because `difacto`
# is already the name of the static library target; OUTPUT_NAME makes the
# produced binary file `difacto`.
add_executable(difacto_main src/main.cc)
set_target_properties(difacto_main PROPERTIES OUTPUT_NAME "difacto")

# CMAKE_THREAD_LIBS_INIT is only populated by the FindThreads module, which
# this file never invoked, so it expanded to nothing. Locate the thread library
# explicitly and link it through its imported target.
find_package(Threads REQUIRED)

target_link_libraries(difacto_main
  PRIVATE
    difacto
    dmlc
    pslite
    Threads::Threads
)

# Unit-test executable.
#
# Sources are listed explicitly: file(GLOB_RECURSE ...) on literal file names
# silently drops a missing or misspelled test file, whereas an explicit list
# makes CMake fail at generate time.
set(TEST_SRCS
  tests/cpp/main.cc
  tests/cpp/spmv_test.cc
  tests/cpp/spmm_test.cc
  tests/cpp/spmt_test.cc
  tests/cpp/compressed_row_block_test.cc
  tests/cpp/find_position_test.cc
  tests/cpp/localizer_test.cc
  tests/cpp/kv_match_test.cc
  tests/cpp/kv_union_test.cc
  tests/cpp/sgd_learner_test.cc
  tests/cpp/bcd_learner_test.cc
  tests/cpp/lbfgs_learner_test.cc
  tests/cpp/lbfgs_twoloop_test.cc
  tests/cpp/batch_reader_test.cc
  tests/cpp/fm_loss_test.cc
  tests/cpp/logit_loss_delta_test.cc
  tests/cpp/data_store_test.cc
)

add_executable(difacto_tests ${TEST_SRCS})

# Threads::Threads replaces both the raw `pthread` library name and
# ${CMAKE_THREAD_LIBS_INIT}, which was empty because FindThreads was never run.
find_package(Threads REQUIRED)

# NOTE(review): gtest and gtest_main are linked by bare library name, so they
# must be on the linker's default search path -- consider find_package(GTest).
target_link_libraries(difacto_tests
  PRIVATE
    difacto
    dmlc
    pslite
    gtest
    gtest_main
    Threads::Threads
)

# If a separate unit-test directory is needed, add it here
# add_subdirectory(tests)
79 changes: 0 additions & 79 deletions Makefile

This file was deleted.

19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ and train FM with 2-dimension on it.
git clone --recursive https://github.com/dmlc/difacto
cd difacto; git submodule update --init; make -j8
./tools/download.sh gisette
build/difacto data_in=data/gisette_scale val_data=data/gisette_scale.t lr=.02 V_dim=2 V_lr=.001
build/difacto data_in=data/gisette_scale val_data=data/gisette_scale.t lr=.02 V_dim=2 V_lr=.001 batch_size=256
```

### History
Expand All @@ -35,3 +35,20 @@ Origins from

Mu Li, Ziqi Liu, Alex Smola, and Yu-Xiang Wang.
DiFacto — Distributed Factorization Machines. In WSDM, 2016

## Build

### Building with CMake (recommended)

```bash
# Create the build directory
mkdir -p build
cd build

# Configure and build the project
cmake ..
make -j4

# Run an example
./difacto task=train learner=sgd batch_size=100 data_in=/path/to/data V_dim=10
```
2 changes: 1 addition & 1 deletion dmlc-core
Submodule dmlc-core updated 165 files
41 changes: 31 additions & 10 deletions example/criteo_sgd.conf
Original file line number Diff line number Diff line change
@@ -1,19 +1,40 @@
# data
data_in = data/criteo_kaggle/criteo_train.rec
data_val = data/criteo_kaggle/criteo_val.rec
data_format = rec
# data_in = /workspace/public_data/criteo/dac/train.txt
# data_val = /workspace/public_data/criteo/dac/test.txt
data_in = ./data/criteo.train_sample
data_val = ./data/criteo.train_sample
# data_in = ./data/criteo.train_sample_libsvm
# data_val = ./data/criteo.train_sample_libsvm
data_format = criteo
# data_format = libsvm

# logging settings
# log_level = 1
# report_interval = 60 # report progress every 60 seconds

# learner
task = train
learner = sgd
max_num_epochs = 10
batch_size = 10000
max_num_epochs = 5
batch_size = 1000

# linear term
l1 = 10
l2 = 10
# # linear term
# l1 = 10
# l2 = 10

# embedding term
V_dim = 10
V_dim = 8
V_threshold = 10
V_l2 = 10
# V_l2 = 10

# loss
loss = fm

# learning rate
lr = 0.01
lr_beta = 1
V_lr = 0.01
V_lr_beta = 1

# number of jobs per epoch
num_jobs_per_epoch = 10
20 changes: 20 additions & 0 deletions example/gisette_sgd.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# data
data_in = data/gisette_scale
data_val = data/gisette_scale.t

# learner
task = train
learner = sgd
max_num_epochs = 2
batch_size = 512
lr = 0.02
V_lr=.001

# linear term
l1 = 10
l2 = 10

# embedding term
V_dim = 10
V_threshold = 10
V_l2 = 10
1 change: 1 addition & 0 deletions include/difacto/store.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <memory>
#include <vector>
#include <string>
#include <functional>
#include "./base.h"
#include "dmlc/io.h"
#include "dmlc/parameter.h"
Expand Down
2 changes: 1 addition & 1 deletion ps-lite
Submodule ps-lite updated 59 files
+8 −2 .gitignore
+103 −37 CMakeLists.txt
+15 −4 Makefile
+24 −29 README.md
+173 −0 README_CN.md
+0 −62 cmake/External/zmq.cmake
+110 −0 cmake/Modules/FindProtobuf.cmake
+54 −14 cmake/Modules/FindZMQ.cmake
+0 −89 cmake/ProtoBuf.cmake
+4 −4 docs/Doxyfile
+40 −0 docs/api.md
+315 −0 docs/conf.py
+2 −0 docs/env.md
+1 −0 docs/get_started.md
+25 −0 docs/history.md
+74 −0 docs/how_to.md
+16 −0 docs/index.md
+182 −0 docs/overview.md
+1 −0 docs/requirements.txt
+19 −0 docs/sphinx_util.py
+4 −0 env.sh
+17 −3 include/dmlc/base.h
+72 −3 include/dmlc/logging.h
+0 −5 include/ps/base.h
+20 −8 include/ps/internal/customer.h
+66 −0 include/ps/internal/env.h
+98 −19 include/ps/internal/message.h
+48 −39 include/ps/internal/postoffice.h
+59 −0 include/ps/internal/threadsafe_pqueue.h
+1 −0 include/ps/internal/threadsafe_queue.h
+12 −2 include/ps/internal/utils.h
+115 −63 include/ps/internal/van.h
+139 −26 include/ps/kv_app.h
+22 −22 include/ps/ps.h
+22 −16 include/ps/simple_app.h
+10 −9 make/deps.mk
+2 −2 src/customer.cc
+1,169 −0 src/ibverbs_van.h
+17 −3 src/meta.proto
+291 −0 src/meta_pb2.py
+5 −9 src/network_utils.h
+60 −0 src/p3_van.h
+139 −53 src/postoffice.cc
+141 −0 src/resender.h
+443 −345 src/van.cc
+301 −0 src/zmq_van.h
+24 −0 tests/CMakeLists.txt
+35 −3 tests/README.md
+0 −37 tests/local.sh
+0 −16 tests/repeat.sh
+424 −0 tests/run_tests.sh
+0 −10 tests/test.mk
+2 −2 tests/test_connection.cc
+45 −13 tests/test_kv_app.cc
+85 −0 tests/test_kv_app_benchmark.cc
+80 −0 tests/test_kv_app_multi_workers.cc
+25 −5 tests/test_simple_app.cc
+12 −1 tests/travis/travis_script.sh
+3 −0 tracker/dmlc_mpi.py
8 changes: 4 additions & 4 deletions src/reader/adfea_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include "difacto/base.h"
#include "data/row_block.h"
#include "data/parser.h"
#include "data/strtonum.h"
#include "dmlc/strtonum.h"
namespace difacto {

/**
Expand All @@ -33,9 +33,9 @@ class AdfeaParser : public dmlc::data::ParserImpl<feaid_t> {
}
bool ParseNext(
std::vector<dmlc::data::RowBlockContainer<feaid_t> > *data) override {
using dmlc::data::isspace;
using dmlc::data::isdigit;
using dmlc::data::strtoull;
using dmlc::isspace;
using dmlc::isdigit;
using dmlc::strtoull;

dmlc::InputSplit::Blob chunk;

Expand Down
16 changes: 6 additions & 10 deletions src/reader/criteo_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@
#ifndef DIFACTO_READER_CRITEO_PARSER_H_
#define DIFACTO_READER_CRITEO_PARSER_H_
#include <limits>
#if DIFACTO_USE_CITY
#include <city.h>
#endif // DIFACTO_USE_CITY
#include <string>
#include <functional>
#include <vector>
#include "difacto/base.h"
#include "data/row_block.h"
#include "data/parser.h"
#include "data/strtonum.h"
#include "dmlc/strtonum.h"
namespace difacto {

/**
Expand Down Expand Up @@ -93,12 +92,9 @@ class CriteoParser : public dmlc::data::ParserImpl<feaid_t> {

private:
inline feaid_t Hash(const char* p, size_t len) {
#if DIFACTO_USE_CITY
return CityHash64(p, len);
#else
LOG(FATAL) << "compile with USE_CITY=1";
return 0;
#endif // DIFACTO_USE_CITY
std::string str(p, len);
std::hash<std::string> hasher;
return hasher(str);
}

// implement strchr
Expand Down
22 changes: 0 additions & 22 deletions tests/cpp/test.mk

This file was deleted.