Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e506f86
Clean unused imports and improve type annotations
ShuhaoZhangTony Sep 30, 2025
36a0db6
code quality fix
ShuhaoZhangTony Sep 30, 2025
c19186a
fix: 添加libstdc++版本检查和自动修复
ShuhaoZhangTony Oct 2, 2025
71b6a91
Update SageFlow import error message to recommend CLI install
ShuhaoZhangTony Oct 5, 2025
6374a63
chore: remove python bindings (moved to main SAGE repo)
ShuhaoZhangTony Oct 5, 2025
fa7c6b0
chore: move examples to main SAGE repo
ShuhaoZhangTony Oct 5, 2025
f154ec1
refactor: rename library from candy to sageflow
ShuhaoZhangTony Oct 7, 2025
ebef3ff
chore: completely remove Python bindings build logic
ShuhaoZhangTony Oct 7, 2025
95424c2
refactor: rename namespace from candy to sageFlow
ShuhaoZhangTony Oct 7, 2025
12a2318
fix: replace candy:: scope resolution with sageFlow::
ShuhaoZhangTony Oct 7, 2025
aa8879b
refactor: 改为动态库以匹配 sageDB 架构
ShuhaoZhangTony Oct 7, 2025
81af7dd
fix: 只在 BUILD_TESTING=ON 时构建 gtest
ShuhaoZhangTony Oct 7, 2025
88bf3a7
将例子推送回c++项目
ShuhaoZhangTony Oct 9, 2025
63f3895
update namespaces and add examples
ShuhaoZhangTony Oct 9, 2025
f1c1ae4
fix examples to link against sageflow
ShuhaoZhangTony Oct 9, 2025
af22e8a
updated: refs/heads/main-dev
ShuhaoZhangTony Oct 9, 2025
8f43967
quick fix
ShuhaoZhangTony Oct 9, 2025
a6839ac
清理.gitignore文件,移除不必要的注释和空行
ShuhaoZhangTony Oct 9, 2025
f0827ee
fix streaming examples build bug (#52)
ZeroJustMe Oct 9, 2025
8ac9c6b
Extract data generation as modular framework with dataset support, da…
ZeroJustMe Oct 13, 2025
c77ab52
Update join experiment tools (#54)
ZeroJustMe Oct 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/main-ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ name: CMake - Build and Test

on:
pull_request:
branches: [ main, join_pre_experiment ]
branches: [ main, main-dev ]
push:
branches: [ main ]

Expand All @@ -16,7 +16,7 @@ jobs:
runs-on: ${{ matrix.os }}
env:
# Limit the log level during CI runs so the output is not flooded with DEBUG-level messages
CANDY_LOG_LEVEL: info
SAGEFLOW_LOG_LEVEL: info
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -48,8 +48,8 @@ jobs:
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DBUILD_TESTING=ON \
-DCANDY_ENABLE_METRICS=ON \
-DCANDY_BUILD_PYBIND=OFF \
-DSAGEFLOW_ENABLE_METRICS=ON \
-DSAGEFLOW_BUILD_PYBIND=OFF \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON

- name: Build
Expand Down
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@ cmake-build-*/
dist/
*.egg-info/

# Experiment outputs
*.png
*.tsv
*.otf

# Compiled Object files
*.slo
*.lo
Expand Down Expand Up @@ -91,3 +96,4 @@ docs/_build/
# Uncomment if needed
# data/generated/
# examples/output/
install/*
37 changes: 3 additions & 34 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
cmake_minimum_required(VERSION 3.20)

project(CANDY CXX)
project(sageFlow CXX)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Whether to enable the runtime metrics-collection code (controlled through a compile-time macro).
# Defaults to ON so that the existing behavior is preserved.
option(CANDY_ENABLE_METRICS "Enable metrics instrumentation in join/operator and tests" ON)
message(STATUS "CANDY_ENABLE_METRICS: ${CANDY_ENABLE_METRICS}")
option(SAGEFLOW_ENABLE_METRICS "Enable metrics instrumentation in join/operator and tests" ON)
message(STATUS "SAGEFLOW_ENABLE_METRICS: ${SAGEFLOW_ENABLE_METRICS}")

# Option that enables the tests (required for CLion to recognize gtest).
# enable_testing() is only called when the option is ON.
option(BUILD_TESTING "Build tests" ON)
if(BUILD_TESTING)
enable_testing()
endif()


set(_sage_flow_shared_deps FALSE)
if(DEFINED SAGE_COMMON_DEPS_FILE AND EXISTS "${SAGE_COMMON_DEPS_FILE}")
include("${SAGE_COMMON_DEPS_FILE}")
Expand Down Expand Up @@ -79,33 +78,3 @@ add_subdirectory(src)
add_subdirectory(test)
add_subdirectory(examples)

# Python bindings
if(NOT _sage_flow_shared_deps)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/pybind11_dependency.cmake)
endif()

pybind11_add_module(_sage_flow python/bindings.cpp)
target_link_libraries(_sage_flow PRIVATE
candy
externalRuntimeLibs
)
target_include_directories(_sage_flow PRIVATE include)
if(DEFINED SAGE_COMMON_COMPILE_DEFINITIONS)
target_compile_definitions(_sage_flow PRIVATE ${SAGE_COMMON_COMPILE_DEFINITIONS})
else()
target_compile_definitions(_sage_flow PRIVATE PYBIND11_INTERNALS_ID="sage_pybind11_shared")
endif()

# Reduce exported symbol surface to minimize potential cross-module clashes
if(DEFINED SAGE_COMMON_COMPILE_OPTIONS)
target_compile_options(_sage_flow PRIVATE ${SAGE_COMMON_COMPILE_OPTIONS})
else()
target_compile_options(_sage_flow PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
endif()

if(ENABLE_GPERFTOOLS AND DEFINED SAGE_GPERFTOOLS_LIBS AND SAGE_GPERFTOOLS_LIBS)
target_link_libraries(_sage_flow PRIVATE ${SAGE_GPERFTOOLS_LIBS})
endif()
set_target_properties(_sage_flow PROPERTIES CXX_VISIBILITY_PRESET hidden VISIBILITY_INLINES_HIDDEN YES)


198 changes: 198 additions & 0 deletions batch_translate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
#!/usr/bin/env python3
import os

# Helper that stores one translated document on disk and reports it on stdout.
def write_translation(filename, content):
    """Write ``content`` to ``filename`` as UTF-8 text and print a confirmation.

    Args:
        filename: Path of the file to create or overwrite.
        content: Complete text to store in the file.

    Returns:
        None. The only effects are the written file and one line on stdout.
    """
    target = open(filename, 'w', encoding='utf-8')
    with target:
        target.write(content)
    print(f"Translated {filename}")

# Translate IMPLEMENTATION_SUMMARY.md - keeping existing Chinese, just ensure consistency
implementation_summary = """# 数据源框架实现总结

## 问题描述

原始需求:帮我把test文件夹里面数据生成的部分作为模块抽离出来。数据源除了目前的随机生成,新加上从数据集中直接获取数据(数据集目前在data文件夹下)。在生成给算子用的数据中,可以通过不同的类来区分我想要的数据源。

## 实现方案

### 架构设计

创建了一个模块化的数据源框架,包含三个主要组件:

1. **DataSourceBase** - 抽象基类,定义统一接口
2. **RandomDataSource** - 随机数据生成器(从原TestDataGenerator提取)
3. **DatasetDataSource** - 数据集加载器(读取fvecs格式文件)
4. **VectorListSource** - 内存向量包装器(可复用组件)

### 文件结构

```
test/test_utils/data_source/
├── data_source_base.h # 基类接口
├── random_data_source.h/cpp # 随机数据源实现
├── dataset_data_source.h/cpp # 数据集数据源实现
├── vector_list_source.h # 内存向量包装器
└── README.md # 完整文档

test/UnitTest/
├── test_data_source.cpp # 单元测试
├── test_data_persistence.cpp # 持久化测试
└── test_join_data_source.cpp # Join数据源测试

test/examples/
├── test_data_source_example.cpp # 使用示例
└── data_persistence_example.cpp # 持久化示例
```

### 关键特性

1. **模块化设计** - 数据生成逻辑独立,易于扩展
2. **统一接口** - 所有数据源实现相同的接口
3. **向后兼容** - 现有测试代码无需修改即可运行
4. **灵活配置** - 支持多种数据源和配置选项
5. **易于扩展** - 添加新数据源只需继承基类

## 使用方法

### 1. 使用随机数据源

```cpp
// 配置随机数据源
RandomDataSource::Config config;
config.vector_dim = 128;
config.seed = 42;
auto data_source = std::make_shared<RandomDataSource>(config);

// 与TestDataGenerator一起使用
TestDataGenerator::Config gen_config;
gen_config.positive_pairs = 100;
TestDataGenerator generator(gen_config, data_source);
auto [records, matches] = generator.generateData();
```

### 2. 使用数据集数据源

```cpp
// 配置数据集数据源
DatasetDataSource::Config config;
config.file_path = PROJECT_DIR "/data/siftsmall/siftsmall_query.fvecs";
config.expected_dim = 128;
config.loop = true; // 循环使用
auto data_source = std::make_shared<DatasetDataSource>(config);

// 与TestDataGenerator一起使用
TestDataGenerator generator(gen_config, data_source);
auto [records, matches] = generator.generateData();
```

### 3. 向后兼容用法

```cpp
// 原有代码无需修改,仍然正常工作
TestDataGenerator::Config config;
config.vector_dim = 128;
TestDataGenerator generator(config); // 自动使用随机数据源
auto [records, matches] = generator.generateData();
```

## 测试验证

### 单元测试
- `test_data_source.cpp` - 包含5个测试用例
- RandomDataSourceBasic - 测试随机数据源
- DatasetDataSourceBasic - 测试数据集数据源
- TestDataGeneratorWithRandomDataSource - 测试生成器+随机源
- TestDataGeneratorWithDatasetDataSource - 测试生成器+数据集源
- BackwardCompatibility - 测试向后兼容性

- `test_data_persistence.cpp` - 包含5个测试用例
- 测试保存为FVECS格式
- 测试保存为JSON格式
- 测试FVECS往返(保存后加载)
- 测试JSON往返(保存后加载)
- 测试从保存的数据生成

- `test_join_data_source.cpp` - 包含8个测试用例
- 测试Duplicate模式
- 测试Separate模式
- 测试Generator集成
- 测试向后兼容性

### 测试结果
```bash
cd build
ctest -L UNIT
# 18/18 tests passed (100%)
```

所有现有测试仍然通过,证明完全向后兼容。

### 示例程序
```bash
cd build
./bin/test_data_source_example
# 运行4个示例,展示不同使用场景

./bin/data_persistence_example
# 演示数据持久化功能
```

## 文档

- **test/test_utils/data_source/README.md** - 完整的框架文档
- 架构说明
- 使用指南
- 配置选项
- 扩展方法

- **test/test_utils/data_writer/README.md** - 数据写入器文档
- FvecsWriter使用说明
- JsonWriter使用说明
- 配置选项

- **test/test_utils/JOIN_DATA_SOURCE_GUIDE.md** - Join数据源指南
- Join框架说明
- 使用示例
- 配置选项

## 兼容性

✅ **完全向后兼容** - 所有现有测试无需修改
✅ **现有测试通过** - 18个单元测试全部通过
✅ **性能测试正常** - test_join_perf_scaling等构建正常

## 扩展性

添加新数据源非常简单:

```cpp
class MyCustomDataSource : public DataSourceBase {
public:
// 实现接口方法
std::vector<float> getNextVector() override;
int getDimension() const override;
bool hasMore() const override;
void reset() override;
};
```

## 技术细节

1. **内存管理** - 使用智能指针,自动管理生命周期
2. **异常处理** - 数据集加载失败时抛出异常,带详细错误信息
3. **线程安全** - 基础类不保证线程安全,由使用方控制
4. **性能** - 数据集一次性加载到内存,访问快速

## 未来改进

可能的扩展方向:
1. 添加更多数据格式支持(如HDF5)
2. 支持流式加载大数据集
3. 添加数据预处理功能
4. 支持数据增强
"""
# Write the summary document defined above to IMPLEMENTATION_SUMMARY.md in the current directory.
write_translation('IMPLEMENTATION_SUMMARY.md', implementation_summary)

# "\n" (a real newline) separates the final report from the per-file lines above;
# the previous "\\n" printed a literal backslash followed by 'n'.
print("\nAll key documentation files translated to Chinese!")
# This script only writes IMPLEMENTATION_SUMMARY.md, so that is the only file reported.
print("Files translated: IMPLEMENTATION_SUMMARY.md")
39 changes: 39 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,45 @@ BUILD_TYPE=${BUILD_TYPE:-Debug}

echo "Building sageFlow with CMake (CMAKE_BUILD_TYPE=${BUILD_TYPE})..."

# Check for, and try to repair, an outdated libstdc++ inside the active conda environment.
#
# Steps:
#   1. Do nothing unless CONDA_PREFIX is set and ${CONDA_PREFIX}/lib/libstdc++.so.6 exists.
#   2. Do nothing unless gcc and strings are available and gcc's major version is >= 11.
#   3. If the conda libstdc++ does not contain the GLIBCXX_3.4.30 symbol version, try
#      `conda install -c conda-forge libstdcxx-ng -y`.
#   4. If conda is unavailable or the install fails, prepend the system library directory
#      to LD_LIBRARY_PATH (only when the system libstdc++ exists there).
#
# Arguments: none.
# Side effects: may modify the conda environment; may export LD_LIBRARY_PATH.
# Returns: always 0 — this is a best-effort check and must never abort the build.
#
# NOTE(review): GLIBCXX_3.4.30 appears to be the symbol version introduced with GCC 12,
# while the gate below is GCC >= 11 — confirm that this threshold is the intended one.
check_libstdcxx() {
    # Only relevant inside a conda environment (":-" keeps this safe under `set -u`).
    if [[ -z "${CONDA_PREFIX:-}" ]]; then
        return 0
    fi

    # Nothing to check if the environment does not ship its own libstdc++.
    local conda_libstdcxx="${CONDA_PREFIX}/lib/libstdc++.so.6"
    if [[ ! -f "${conda_libstdcxx}" ]]; then
        return 0
    fi

    # Without gcc or strings the library cannot be inspected; skip instead of
    # misreading a missing tool as "symbol not found".
    if ! command -v gcc &> /dev/null || ! command -v strings &> /dev/null; then
        return 0
    fi

    # Declaration and assignment are separate so `local` does not mask the exit status.
    local gcc_version
    gcc_version=$(gcc -dumpversion 2> /dev/null | cut -d. -f1) || return 0
    if [[ ! "${gcc_version}" =~ ^[0-9]+$ ]] || (( gcc_version < 11 )); then
        return 0
    fi

    # -F matches the version tag literally (the dots are not regex wildcards).
    if strings "${conda_libstdcxx}" | grep -qF "GLIBCXX_3.4.30"; then
        return 0
    fi

    echo "⚠️ 检测到conda环境中的libstdc++版本过低,正在更新..."
    echo " 这是C++20/GCC 11+编译所必需的"

    # Preferred fix: update libstdc++ inside the conda environment itself.
    if command -v conda &> /dev/null && conda install -c conda-forge libstdcxx-ng -y; then
        return 0
    fi

    # Fallback (conda missing or install failed): prefer the system libstdc++.
    echo "⚠️ 无法自动更新libstdc++,将使用系统版本"
    local system_libdir="/usr/lib/x86_64-linux-gnu"
    if [[ -f "${system_libdir}/libstdc++.so.6" ]]; then
        # Append the old value only when it is non-empty: a trailing ':' would add an
        # empty entry, which the dynamic loader treats as the current directory.
        export LD_LIBRARY_PATH="${system_libdir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
        echo " 已设置LD_LIBRARY_PATH优先使用系统libstdc++"
    fi
    return 0
}

# Check libstdc++ before building
check_libstdcxx

# Create build directory if not exists
mkdir -p build

Expand Down
Loading