diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..5e22fcf Binary files /dev/null and b/.DS_Store differ diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..926aa8c --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,5 @@ +coverage: + status: + patch: + default: + target:80% diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..6d76398 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,36 @@ + +dist: focal + +language: c + +os: linux + +compiler: gcc + +install: + - sudo apt-get install valgrind + - sudo apt-get install clang + - sudo apt-get install cppcheck + - sudo pip install cpplint + +script: + - cd project/ + - mkdir build + - cd build + - cmake .. + - make clean && make + - cppcheck --inconclusive --enable=all --language=c ../include/*.h ../src/*.c + - cpplint ../include/*.h ../src/*.c ../tests/*.cpp + - LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom |fold -w 100 | head -n 100000 > bigfile.txt + - echo "2 ab" | valgrind --leak-check=full --track-origins=yes -s ./main.out bigfile.txt + - valgrind --leak-check=full --track-origins=yes ./linear_tests + - valgrind --leak-check=full --track-origins=yes ./parallel_tests + - LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom |fold -w 100 | head -n 1000000 > bigfile2.txt + - echo "these are clean times, no valgrind, on 100MB" + - echo "2 ab" | ./main.out bigfile2.txt + + + + +after_success: + - bash <(curl -s https://codecov.io/bash) diff --git a/CPPLINT.cfg b/CPPLINT.cfg new file mode 100644 index 0000000..e5ce69c --- /dev/null +++ b/CPPLINT.cfg @@ -0,0 +1,4 @@ +filter=-legal/copyright +filter=-build/include_subdir +filter=-build/include +filter=-readability/casting diff --git a/README.md b/README.md index c246e5e..cf86ff5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,24 @@ # cpp_2021 +[![Build Status](https://travis-ci.com/BorisKoz/cpp_2021.svg?branch=HW-2)](https://travis-ci.com/BorisKoz/cpp_2021) 
+[![codecov](https://codecov.io/gh/BorisKoz/cpp_2021/branch/HW-2/graph/badge.svg)](https://codecov.io/gh/BorisKoz/cpp_2021) + +On my machine : parallel is faster : 0.152859 vs 1.536328s +Remote : parallel is faster : 0.588880 vs 0.765509 + +# Вариант #4 +Перед вами поставлена задача подсчета количества вхождений заданных символов в загруженный в оперативную память файл размером 100 Мб. Составьте наивный алгоритм подсчета вхождений символов, а затем реализуйте параллельную обработку текста несколькими процессами с учетом оптимизации работы с кэш-памятью. + +# На что необходимо обратить внимание: +- основная информация описана в https://park.mail.ru/blog/topic/view/14270/ +- параллельная реализация не должна быть осуществлена с помощью процессов, когда требуется реализация с помощью потоков (и наоборот); +- компиляция должна происходить с флагами -Wall -Werror -Wpedantic, то есть необработанных ворнингов быть не должно; +- количество потоков/процессов должно быть не захардкожено, а определяться в зависимости от возможностей системы (например, в зависимости от количества ядер процессора); +- при разработке обеих библиотек стоит делать общий интерфейс, не раскрывая особенностей реализации; +- библиотеки должны быть взаимозаменяемыми - конкретная реализация (последовательная/параллельная) - использоваться в зависимости от конфигурации сборки; +- юнит-тесты должны быть реализованы для обеих реализаций (последовательной/параллельной). Покрытие тестами должно быть максимально возможным; +- должны присутствовать стресс-тесты. Они могут быть реализованы внешним образом, запуская две разные программы - одну со статической библиотекой с последовательной реализацией, вторую - с динамической библиотекой с параллельной реализацией, и сравнивая их выводы друг с другом. 
+- для организации ввода/вывода больших данных полезно работать с файлами, а в программе - предусмотреть работу с универсальными потоками входных/выходных данных (или хотя бы перенаправлять ввод/вывод на уровне их запуска) +- если в задании сказано, что программа должна обрабатывать файлы объёмом 100 Мб – это лишь ориентир, на которых программа точно должна работать, и на котором имеет смысл делать замеры производительности и эффективности алгоритмов. Поэтому тесты на такой объём должны быть. Однако сама программа должна уметь работать с произвольными размерами входных данных +- измерение времени должно осуществляться внешним образом, а не внутри кода библиотек. При этом необходимо делать несколько замеров и усреднять. Стоит помнить о том, что clock() в многопоточных приложениях работает не так, как ожидается. diff --git a/project/.DS_Store b/project/.DS_Store new file mode 100644 index 0000000..a032984 Binary files /dev/null and b/project/.DS_Store differ diff --git a/project/CMakeLists.txt b/project/CMakeLists.txt new file mode 100644 index 0000000..756ba58 --- /dev/null +++ b/project/CMakeLists.txt @@ -0,0 +1,54 @@ +cmake_minimum_required(VERSION 3.15) +project(files) + +configure_file(CMakeLists.txt.in + googletest-download/CMakeLists.txt) +execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) +execute_process(COMMAND ${CMAKE_COMMAND} --build . 
+ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + +add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src + ${CMAKE_BINARY_DIR}/googletest-build) + +set(CMAKE_C_FLAGS "-pedantic -fprofile-arcs -ftest-coverage -Wall -Werror -Wpedantic") +set(CMAKE_CXX_FLAGS "-pedantic -fprofile-arcs -ftest-coverage -Wall -Werror -Wpedantic") + +enable_testing() + + +include_directories("${PROJECT_SOURCE_DIR}/include") + + +set(INCLUDE ${PROJECT_SOURCE_DIR}/include) +set(SOURCE ${PROJECT_SOURCE_DIR}/src) + +add_library(bigfile_linear STATIC + ${INCLUDE}/search.h + ${SOURCE}/linear.c) +# changed to MODULE, otherwise is built different local-remote +add_library(bigfile_parallel MODULE + ${INCLUDE}/search.h + ${SOURCE}/parallel.c) + +file(GLOB prod_sources + "${PROJECT_SOURCE_DIR}/include/*.h" + "${PROJECT_SOURCE_DIR}/src/main.c") + +add_executable(main.out ${PROJECT_SOURCE_DIR}/src/main.c) +target_link_libraries(main.out -ldl bigfile_linear) + + +file(GLOB tests "${PROJECT_SOURCE_DIR}/tests/*.cpp") +list(REMOVE_ITEM tests "${PROJECT_SOURCE_DIR}/tests/main.cpp") + +foreach(file ${tests}) + set(name) + get_filename_component(name ${file} NAME_WE) + add_executable("${name}_tests" + ${PROJECT_SOURCE_DIR}/src/${name}.c + ${file} + "${PROJECT_SOURCE_DIR}/tests/main.cpp") + target_link_libraries("${name}_tests" gtest_main) + add_test(NAME ${name} COMMAND "${name}_tests") +endforeach() \ No newline at end of file diff --git a/project/CMakeLists.txt.in b/project/CMakeLists.txt.in new file mode 100644 index 0000000..e7f1172 --- /dev/null +++ b/project/CMakeLists.txt.in @@ -0,0 +1,15 @@ + + +project(googletest-download NONE) + +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.8.1 + SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) \ No newline at end of file diff --git 
a/project/include/search.h b/project/include/search.h new file mode 100644 index 0000000..4beb2f4 --- /dev/null +++ b/project/include/search.h @@ -0,0 +1,16 @@ +// Copyright 2021 bkz +#ifndef PROJECT_INCLUDE_SEARCH_H_ +#define PROJECT_INCLUDE_SEARCH_H_ +#include + +#define NULL_ENTRY (-1) +#define MMAP_ERR 2 +#define PID_ERR 3 +#define WRONG_OPEN 4 +#define MAX_PROCESS 10 +#define READ_SIZE 20 +#define BUFFER_SIZE 30 +#define FORMAT_STRING "%020d" + +int file_search(FILE** , const char* , size_t* , size_t size_to_find); +#endif // PROJECT_INCLUDE_SEARCH_H_ diff --git a/project/src/linear.c b/project/src/linear.c new file mode 100644 index 0000000..542ff53 --- /dev/null +++ b/project/src/linear.c @@ -0,0 +1,43 @@ +// Copyright 2021 bkz + +#include +#include +#include "../include/search.h" + +int compare_to_array(const char* to_compare, char buffer, size_t size_to_find) { + for (int i = 0; i < size_to_find; i++) { + if (to_compare[i] == buffer) { + return i; + } + } + return -1; +} + +int file_search(FILE** fp, const char* to_find, + size_t* found, size_t size_to_find) { + if (fp == NULL || *fp == NULL || to_find == NULL || found == NULL) { + return NULL_ENTRY; + } + struct stat file_stat; + fstat(fileno(*fp), &file_stat); + if ((file_stat.st_mode & S_IRUSR) == 0) { + return WRONG_OPEN; + } + if (file_stat.st_size == 0) { + return NULL_ENTRY; + } + char* file_in_memory = (char *)mmap(NULL, file_stat.st_size, + PROT_READ, MAP_SHARED, fileno(*fp), 0); + if (file_in_memory == NULL) { + return MMAP_ERR; + } + for (size_t i = 0; i < file_stat.st_size; i++) { + int position = compare_to_array(to_find, + file_in_memory[i], size_to_find); + if (position != -1) { + found[position]++; + } + } + munmap(file_in_memory, file_stat.st_size); + return 0; +} diff --git a/project/src/main.c b/project/src/main.c new file mode 100644 index 0000000..a6662b9 --- /dev/null +++ b/project/src/main.c @@ -0,0 +1,102 @@ +// Copyright 2021 bkz + +#include +#include +#include +#include +#include 
+#include "../include/search.h" +#include +#define TEST_SERIES_SIZE 3 +#define RESULTS_FILE "res.txt" + +int main(int argc, char* argv[]) { + if (argc != 2) { + fprintf(stderr, "No file input"); + return -1; + } + // открытие файла + FILE* p = fopen(argv[1], "r"); + if (!p) { + fprintf(stderr, "No such file"); + return -1; + } + + // считывание символов + int size_to_find = 0; + char to_find[BUFFER_SIZE] = ""; + size_t found[BUFFER_SIZE]; + memset(found, 0, BUFFER_SIZE * sizeof(*found)); + scanf("%d", &size_to_find); + if (size_to_find > 30) { + fprintf(stderr, "Too much symbols"); + fclose(p); + return -1; + } + scanf("%c", &to_find[0]); + for (int i = 0; i < size_to_find; i++) { + scanf("%c", &to_find[i]); + } + + // работа с файлом последовательно + double elapsed[TEST_SERIES_SIZE], average = 0; + memset(found, 0, TEST_SERIES_SIZE * sizeof(*elapsed)); + for (int i = 0; i < TEST_SERIES_SIZE; i++) { + struct timespec start, finish; + + clock_gettime(CLOCK_MONOTONIC, &start); + + file_search(&p, to_find, found, size_to_find); + + clock_gettime(CLOCK_MONOTONIC, &finish); + elapsed[i] = (double)(finish.tv_sec - start.tv_sec); + elapsed[i] += (double)(finish.tv_nsec - start.tv_nsec) / 1000000000.0; + printf("elapsed: %lf\n", elapsed[i]); + average += elapsed[i]; + memset(found, 0, BUFFER_SIZE * sizeof(*found)); + } + average = average / TEST_SERIES_SIZE; + printf("Linear series average: %lf\n", average); + if (p) { + fclose(p); + } + + // parallel run + p = fopen(argv[1], "r"); + void *parallel_lib = dlopen("./libbigfile_parallel.so", RTLD_LAZY); + if (!parallel_lib) { + fprintf(stderr, "LIBRARY NOT FOUND"); + return -1; + } + + int (*pointer)(); + *(void **)(&pointer) = dlsym(parallel_lib, "file_search"); + + double average_parallel = 0; + memset(found, 0, TEST_SERIES_SIZE * sizeof(*elapsed)); + for (int i = 0; i < TEST_SERIES_SIZE; i++) { + struct timespec start, finish; + + clock_gettime(CLOCK_MONOTONIC, &start); + + (*pointer)(&p, to_find, found, size_to_find); + 
+ clock_gettime(CLOCK_MONOTONIC, &finish); + elapsed[i] = (double)(finish.tv_sec - start.tv_sec); + elapsed[i] += (double)(finish.tv_nsec - start.tv_nsec) / 1000000000.0; + printf("elapsed: %lf\n", elapsed[i]); + average_parallel += elapsed[i]; + memset(found, 0, BUFFER_SIZE * sizeof(*found)); + } + average_parallel = average_parallel / TEST_SERIES_SIZE; + printf("Parallel series average: %lf\n", average_parallel); + average > average_parallel ? + printf("parallel is faster : %lf vs %lf\n", average_parallel, average) : + printf("linear is faster : %lf vs %lf\n", average, average_parallel); + dlclose(parallel_lib); + + if (p) { + fclose(p); + } + return 0; +} diff --git a/project/src/parallel.c b/project/src/parallel.c new file mode 100644 index 0000000..368c442 --- /dev/null +++ b/project/src/parallel.c @@ -0,0 +1,154 @@ +// Copyright 2021 bkz +#include +#include +#include +#include "../include/search.h" +#include +#include +#include +#include +#include +#define unlikely(expr) __builtin_expect(!!(expr), 0) + +void free_all_resources(size_t* divisions, int* processes ) { + if (divisions) { + free(divisions); + } + if (processes) { + free(processes); + } +} + +int calculate_process(size_t filesize) { + if (filesize < 1000) { + return 1; + } + if (filesize < 10000) { + return 2; + } + if (filesize < 100000) { + return 3; + } + if (filesize < 1000000) { + return 4; + } + return (MAX_PROCESS > (int)sysconf(_SC_NPROCESSORS_ONLN) ? 
+ (int)sysconf(_SC_NPROCESSORS_ONLN) : MAX_PROCESS); +} + +int create_division( + int count, size_t filesize, size_t *divisions) { + size_t proc_range = filesize/count; + + if (divisions == NULL) { + return NULL_ENTRY; + } + for (int i = 1; i < count; i++) { + divisions[i] = divisions[i-1] + proc_range; + } + divisions[count] = filesize; + return 0; +} + +int fork_calculations(int* processes, int process_count) { + if (processes == NULL) { + return NULL_ENTRY; + } + for (int i=0; i< process_count; i++) { + int current_id = 0; + if ((current_id=fork()) == -1) { + return PID_ERR; + } + if (current_id == 0) { + return i; + } + processes[i] = current_id; + } + return getpid(); +} + +int child_process_run(const char* file_in_memory, const size_t* divisions, + const int current_id, const int size_to_find, + const char* to_find, int pipes[MAX_PROCESS][2]) { + for (int j = 0; j < size_to_find; j++) { + int count = 0; + for (size_t i = divisions[current_id]; + i < divisions[current_id+1]; i++) { + if (unlikely(file_in_memory[i] == to_find[j])) + count++; + } + char str[BUFFER_SIZE] = "0"; + snprintf(str, READ_SIZE + 1, FORMAT_STRING, count); + write(pipes[current_id][1], str, strlen(str)); + } + close(pipes[current_id][0]); + close(pipes[current_id][1]); + return 0; +} + +int file_search(FILE** fp, const char* to_find, + size_t* found, size_t size_to_find) { + // correct checks + if (fp == NULL || *fp == NULL || to_find == NULL || found == NULL) { + return NULL_ENTRY; + } + struct stat file_stat; + fstat(fileno(*fp), &file_stat); + if ((file_stat.st_mode & S_IRUSR) == 0) { + return WRONG_OPEN; + } + if (file_stat.st_size == 0) { + return NULL_ENTRY; + } + char* file_in_memory = (char *)mmap(NULL, file_stat.st_size, + PROT_READ, MAP_SHARED, fileno(*fp), 0); + if (file_in_memory == NULL) { + return MMAP_ERR; + } + // Предполагается работа с большим файлом, поэтому делим сам файл, + // а не массив проверяемых символов. 
+ // calculate processes + int process_count = calculate_process(file_stat.st_size); + + // fragment array for processes rule + size_t* divisions = (size_t *)calloc(process_count + 1, sizeof(size_t)); + create_division(process_count, file_stat.st_size, divisions); + + // setup pipe comms + int* processes = (int *)calloc(process_count, sizeof(pid_t)); + int pipes[MAX_PROCESS][2]; + memset(pipes, 0, MAX_PROCESS*2); + for (int i = 0; i < process_count; i++) { + pipe(pipes[i]); + } + // initialise processes + int current_id = fork_calculations(processes, process_count); + + // this is child process block + if (getpid() != current_id) { + child_process_run(file_in_memory, divisions, current_id, + size_to_find, to_find, pipes); + free_all_resources(divisions, processes); + fclose(*fp); + exit(0); + } + + // this is parent process block + for (int i = 0; i < process_count; i++) { + while (!waitpid(processes[i], NULL, 0)) { + } + for (int j = 0; j < size_to_find; j++) { + char str[BUFFER_SIZE] = "0"; + read(pipes[i][0], str, READ_SIZE); + int count = (int)strtol(str, NULL, 10); + found[j]+= count; + } + close(pipes[i][0]); + close(pipes[i][1]); + } + + // free memory + munmap(file_in_memory, file_stat.st_size); + free_all_resources(divisions, processes); + return 0; +} diff --git a/project/tests/linear.cpp b/project/tests/linear.cpp new file mode 100644 index 0000000..eaee26d --- /dev/null +++ b/project/tests/linear.cpp @@ -0,0 +1,70 @@ +// Copyright 2021 +#include "gtest/gtest.h" + +extern "C" { +#include "../include/search.h" +} + +TEST(file_search, null_params) { + FILE* f1 = fopen("1.txt", "a"); + char* c1 = (char*)calloc(10, sizeof (char)); + size_t* found = (size_t*)calloc(10, sizeof (size_t)); + size_t stf = 0; + ASSERT_EQ(file_search(nullptr, c1, found, stf), NULL_ENTRY); + ASSERT_EQ(file_search(&f1, nullptr, found, stf), NULL_ENTRY); + ASSERT_EQ(file_search(&f1, c1, nullptr, stf), NULL_ENTRY); + free(c1); + free(found); + fclose(f1); +} + + +TEST(file_search, 
empty_file) { + FILE* f1 = fopen("1.txt", "a"); + char* c1 = (char*)calloc(10, sizeof (char)); + size_t* found = (size_t*)calloc(10, sizeof (size_t)); + size_t stf = 0; + ASSERT_EQ(file_search(&f1, c1, found, stf), NULL_ENTRY); + free(c1); + free(found); + fclose(f1); +} + +TEST(file_search, premade_file) { + FILE* f1 = fopen("1.txt", "a"); + fputs("a a a a\n b\n b\n", f1); + char c1[3] = {'a', 'b', '\n'}; + size_t found[3] = {0, 0, 0}; + size_t stf = 3; + fclose(f1); + f1 = fopen("1.txt", "r"); + file_search(&f1, c1, found, stf); + fclose(f1); + remove("1.txt"); + EXPECT_EQ(found[0], 4); + EXPECT_EQ(found[1], 2); + EXPECT_EQ(found[2], 3); +} + +TEST(file_search, grep_random) { + FILE* f1 = fopen("1.txt", "a"); + char c1[2] = {'a', 'b'}; + size_t found[2] = {0, 0}; + size_t stf = 2; + fclose(f1); + system("LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom | " + "fold -w 100 | head -n 10000 > 1.txt "); + f1 = fopen("1.txt", "r"); + file_search(&f1, c1, found, stf); + fclose(f1); + remove("grep_results.txt"); + system("grep -o 'a' 1.txt | wc -l >> grep_results.txt"); + system("grep -o 'b' 1.txt | wc -l >> grep_results.txt"); + remove("1.txt"); + f1 = fopen("grep_results.txt", "r"); + int i1, i2; + fscanf(f1, "%d %d", &i1, &i2); + fclose(f1); + EXPECT_EQ(found[0], i1); + EXPECT_EQ(found[1], i2); +} diff --git a/project/tests/main.cpp b/project/tests/main.cpp new file mode 100644 index 0000000..7b0b7e1 --- /dev/null +++ b/project/tests/main.cpp @@ -0,0 +1,8 @@ +// Copyright 2021 + +#include "gtest/gtest.h" + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/project/tests/parallel.cpp b/project/tests/parallel.cpp new file mode 100644 index 0000000..45dd84a --- /dev/null +++ b/project/tests/parallel.cpp @@ -0,0 +1,73 @@ +// Copyright 2021 +#include "gtest/gtest.h" + +extern "C" { +#include "../include/search.h" +} + +TEST(file_search_parallel, null_params) { + FILE* f1 = fopen("1.txt", "a"); + char* c1 = 
(char*)calloc(10, sizeof (char)); + size_t* found = (size_t*)calloc(10, sizeof (size_t)); + size_t stf = 0; + ASSERT_EQ(file_search(nullptr, c1, found, stf), NULL_ENTRY); + ASSERT_EQ(file_search(&f1, nullptr, found, stf), NULL_ENTRY); + ASSERT_EQ(file_search(&f1, c1, nullptr, stf), NULL_ENTRY); + free(c1); + free(found); + fclose(f1); + remove("1.txt"); +} + + +TEST(file_search_parallel, empty_file) { + FILE* f1 = fopen("2.txt", "a"); + char* c1 = (char*)calloc(10, sizeof (char)); + size_t* found = (size_t*)calloc(10, sizeof (size_t)); + size_t stf = 0; + ASSERT_EQ(file_search(&f1, c1, found, stf), NULL_ENTRY); + free(c1); + free(found); + fclose(f1); + remove("2.txt"); +} + +TEST(file_search_parallel, premade_file) { + FILE* f1 = fopen("3.txt", "a"); + fputs("a a a a\n b\n b\n", f1); + char c1[3] = {'a', 'b', '\n'}; + size_t found[3] = {0, 0, 0}; + size_t stf = 3; + fclose(f1); + f1 = fopen("3.txt", "r"); + file_search(&f1, c1, found, stf); + fclose(f1); + EXPECT_EQ(found[0], 4); + EXPECT_EQ(found[1], 2); + EXPECT_EQ(found[2], 3); + remove("3.txt"); +} + +TEST(file_search_parallel, grep_random) { + FILE* f1 = fopen("4.txt", "a"); + char c1[3] = {'a', 'b', '\n'}; + size_t found[3] = {0, 0, 0}; + size_t stf = 3; + fclose(f1); + system("LC_CTYPE=C tr -dc A-Za-z0-9 < /dev/urandom | " + "fold -w 100 | head -n 10000 > 4.txt "); + f1 = fopen("4.txt", "r"); + file_search(&f1, c1, found, stf); + fclose(f1); + remove("grep_results.txt"); + system("grep -o 'a' 4.txt | wc -l >> grep_results.txt"); + system("grep -o 'b' 4.txt | wc -l >> grep_results.txt"); + remove("4.txt"); + f1 = fopen("grep_results.txt", "r"); + int i1, i2; + fscanf(f1, "%d %d", &i1, &i2); + fclose(f1); + EXPECT_EQ(found[0], i1); + EXPECT_EQ(found[1], i2); + remove("grep_results.txt"); +}