From cc1a765115655bc5cadfdcf3536e0eb11bf7a57a Mon Sep 17 00:00:00 2001 From: JonghyeokPark Date: Wed, 11 May 2022 17:46:36 +0900 Subject: [PATCH 1/8] fix build script --- build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build.sh b/build.sh index 51d75662..3f1579b5 100755 --- a/build.sh +++ b/build.sh @@ -29,15 +29,15 @@ elif [ "$1" = "--nc-monitor" ]; then # Cache hot LB pages with mtr-logging/monitoring enabled BUILD_FLAGS="-DUNIV_NVDIMM_CACHE -DUNIV_LOG_HEADER -DUNIV_FLUSH_MONITOR" else - # Cache hot LB pages (default) - BUILD_FLAGS="-DUNIV_NVDIMM_CACHE -DUNIV_LOG_HEADER" + # Cache NVDIMM pages in TPC-C workloads + BUILD_FLAGS="-DUNIV_NVDIMM_CACHE" fi echo "Start build using $BUILD_FLAGS" -cd $BASE_DIR +cd $BUILD_DIR -cmake -DWITH_DEBUG=0 -DCMAKE_C_FLAGS="$BUILD_FLAGS" -DCMAKE_CXX_FLAGS="$BUILD_FLAGS" \ +cmake .. -DWITH_DEBUG=0 -DCMAKE_C_FLAGS="$BUILD_FLAGS" -DCMAKE_CXX_FLAGS="$BUILD_FLAGS" \ -DDOWNLOAD_BOOST=ON -DWITH_BOOST=$BASE_DIR/boost -DENABLED_LOCAL_INFILE=1 \ -DCMAKE_INSTALL_PREFIX=$BUILD_DIR From a6b2176e75fc8912cd96f40f7c65b2fc3247c7fe Mon Sep 17 00:00:00 2001 From: JonghyeokPark Date: Wed, 11 May 2022 19:31:03 +0900 Subject: [PATCH 2/8] mop code --- storage/innobase/btr/btr0cur.cc | 23 ++--- storage/innobase/buf/buf0flu.cc | 13 +-- storage/innobase/page/page0cur.cc | 27 +++++- storage/innobase/pmem/pmem0mmap.cc | 138 +---------------------------- storage/innobase/srv/srv0start.cc | 27 ------ 5 files changed, 46 insertions(+), 182 deletions(-) diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index ffffc2f6..f1de8c42 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -3924,23 +3924,19 @@ btr_cur_update_in_place( } #ifdef UNIV_NVDIMM_CACHE + // (jhpark): add REDO log for NC pages + btr_cur_update_in_place_log(flags, rec, index, update, + trx_id, roll_ptr, mtr); + /* nvm_block = btr_cur_get_block(cursor); nvm_bpage = &(nvm_block->page); - if (nvm_bpage->cached_in_nvdimm) { - // skip generating REDO logs for NVM-resident pages - // write NC page on NVDIMM - //pm_mmap_buf_write(nvm_bpage->size.physical(), (void*) ((buf_block_t*) nvm_bpage)->frame); - - // persist records - ulint cur_rec_size = rec_offs_size(offsets); - pm_mmap_mtrlogbuf_commit(nvm_block->frame, UNIV_PAGE_SIZE, nvm_bpage->id.space(), nvm_bpage->id.page_no()); - - //pm_mmap_mtrlogbuf_commit(rec, cur_rec_size, nvm_bpage->id.space(), nvm_bpage->id.page_no()); + } else { btr_cur_update_in_place_log(flags, rec, index, update, trx_id, roll_ptr, mtr); } + */ #else btr_cur_update_in_place_log(flags, rec, index, update, trx_id, roll_ptr, mtr); @@ -4937,6 +4933,7 @@ btr_cur_del_mark_set_clust_rec( row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr); #ifdef UNIV_NVDIMM_CACHE + /* if (is_nvm_page) { // skip generating REDO logs for nvm-page // Instead, write commit log in mtr log @@ -4950,6 +4947,12 @@ btr_cur_del_mark_set_clust_rec( btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id, roll_ptr, mtr); } + */ + + // (jhpark): add REDO log for NC pages + btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id, + roll_ptr, mtr); + #else btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id, roll_ptr, mtr); diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 42dc28d9..83afe974 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1116,10 +1116,7 @@ buf_flush_write_block_low( /* Set the oldest LSN of the NVDIMM page to the previous newest LSN. */ buf_flush_note_modification((buf_block_t *)nvdimm_page, bpage->newest_modification, bpage->newest_modification, nvdimm_page->flush_observer); - // TODO: NVDIMM-porting - // 1 flush_cache(((buf_block_t *)nvdimm_page)->frame, UNIV_PAGE_SIZE); - // 2 /* Remove the target page from the original buffer pool. */ buf_page_io_complete(bpage, true); @@ -1141,7 +1138,7 @@ buf_flush_write_block_low( << " with oldest: " << bpage->oldest_modification << " newest: " << bpage->newest_modification << " lsn-gap: " << bpage->newest_modification - bpage->oldest_modification; -*/ + */ if (!srv_use_doublewrite_buf || buf_dblwr == NULL || srv_read_only_mode @@ -1169,10 +1166,7 @@ buf_flush_write_block_low( fil_io(request, sync, bpage->id, bpage->size, 0, bpage->size.physical(), frame, bpage); - - // jhpark: write oldest_modification_lsn of current NVDIMM-caching page - pm_mmap_write_logfile_header_lsn(bpage->oldest_modification); - + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { buf_dblwr_write_single_page(bpage, sync); } else { @@ -1311,7 +1305,8 @@ buf_flush_page( #ifdef UNIV_NVDIMM_CACHE if (bpage->flush_type == BUF_FLUSH_LIST /* Flush list flushing */ - && (bpage->id.space() == 28 || bpage->id.space() == 30 || bpage->id.space() == 32) /* TPC-C tablespaces */ + // (jhpark): modified for 500 wh loading version + && (bpage->id.space() == 27 || bpage->id.space() == 29 || bpage->id.space() == 31) /* TPC-C tablespaces */ && bpage->buf_fix_count == 0 /* Not fixed */ && !bpage->cached_in_nvdimm) { /* Not cached in NVDIMM */ bpage->moved_to_nvdimm = true; diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index 4412ecfb..838d35a0 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1533,6 +1533,7 @@ page_cur_insert_rec_low( /* 9. Write log record of the insert */ if (UNIV_LIKELY(mtr != NULL)) { #ifdef UNIV_NVDIMM_CACHE + /* ulint page_no = page_get_page_no(page); ulint space_id = page_get_space_id(page); buf_block_t* nvm_block = buf_page_get(page_id_t(space_id, page_no), @@ -1540,7 +1541,6 @@ page_cur_insert_rec_low( assert(nvm_block != NULL); buf_page_t* nvm_bpage = &nvm_block->page; - if (nvm_bpage->cached_in_nvdimm) { // skip generating REDO log for nvm-page pm_mmap_mtrlogbuf_commit(nvm_block->frame, UNIV_PAGE_SIZE, space_id, page_no); @@ -1550,6 +1550,11 @@ page_cur_insert_rec_low( page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, index, mtr); } + */ + // (jhpark): add REDO log for NC pages + page_cur_insert_rec_write_log(insert_rec, rec_size, + current_rec, index, mtr); + #else page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, index, mtr); @@ -1944,6 +1949,7 @@ page_cur_insert_rec_zip( page_zip, page, index, level, NULL, NULL)) { #ifdef UNIV_NVDIMM_CACHE + /* buf_block_t* nvm_block = page_cur_get_block(cursor); assert(nvm_block != NULL); @@ -1959,6 +1965,13 @@ page_cur_insert_rec_zip( insert_rec, rec_size, cursor->rec, index, mtr); } + */ + + // (jhpark): add REDO log for NC pages + page_cur_insert_rec_write_log( + insert_rec, rec_size, + cursor->rec, index, mtr); + #else page_cur_insert_rec_write_log( insert_rec, rec_size, @@ -2235,6 +2248,7 @@ page_cur_insert_rec_zip( /* 9. Write log record of the insert */ if (UNIV_LIKELY(mtr != NULL)) { #ifdef UNIV_NVDIMM_CACHE + /* buf_block_t* nvm_block = page_cur_get_block(cursor); assert(nvm_block != NULL); @@ -2250,6 +2264,11 @@ page_cur_insert_rec_zip( page_cur_insert_rec_write_log(insert_rec, rec_size, cursor->rec, index, mtr); } + */ + // (jhpark): add REDO log for NC pages + page_cur_insert_rec_write_log(insert_rec, rec_size, + cursor->rec, index, mtr); + #else page_cur_insert_rec_write_log(insert_rec, rec_size, cursor->rec, index, mtr); @@ -2473,6 +2492,7 @@ page_copy_rec_list_end_to_created_page( rec_offs_make_valid(insert_rec, index, offsets); #ifdef UNIV_NVDIMM_CACHE + /* ulint page_no = page_get_page_no(new_page); ulint space_id = page_get_space_id(new_page); buf_block_t* nvm_block = buf_page_get(page_id_t(space_id, page_no), @@ -2489,6 +2509,11 @@ page_copy_rec_list_end_to_created_page( page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); } + */ + // (jhpark): add REDO log for NC pages + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, + index, mtr); + #else page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); diff --git a/storage/innobase/pmem/pmem0mmap.cc b/storage/innobase/pmem/pmem0mmap.cc index 7b0d8638..2d52092e 100644 --- a/storage/innobase/pmem/pmem0mmap.cc +++ b/storage/innobase/pmem/pmem0mmap.cc @@ -59,53 +59,9 @@ unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { if (gb_pm_mmap == MAP_FAILED) { PMEMMMAP_ERROR_PRINT("pm_mmap mmap() faild recovery failed\n"); } - - // get file construct - PMEM_MMAP_MTRLOGFILE_HDR* recv_mmap_mtrlog_fil_hdr = (PMEM_MMAP_MTRLOGFILE_HDR*) - malloc(PMEM_MMAP_LOGFILE_HEADER_SZ); - pm_mmap_read_logfile_header(recv_mmap_mtrlog_fil_hdr); - - // debug - fprintf(stderr, "[check] size: %lu, lsn: %lu, ckpt_lsn: %lu, ckpt_offset: %lu\n", - recv_mmap_mtrlog_fil_hdr->size, recv_mmap_mtrlog_fil_hdr->flushed_lsn, - recv_mmap_mtrlog_fil_hdr->ckpt_lsn, recv_mmap_mtrlog_fil_hdr->ckpt_offset); - - // recvoery check - PMEM_MMAP_MTRLOG_HDR* recv_mmap_mtrlog_hdr = (PMEM_MMAP_MTRLOG_HDR*) malloc(PMEM_MMAP_MTRLOG_HDR_SIZE); - memcpy(recv_mmap_mtrlog_hdr, gb_pm_mmap+recv_mmap_mtrlog_fil_hdr->ckpt_offset, PMEM_MMAP_MTRLOG_HDR_SIZE); - - if (recv_mmap_mtrlog_fil_hdr->size == PMEM_MMAP_MTR_FIL_HDR_SIZE - || recv_mmap_mtrlog_hdr->need_recv == false) { - PMEMMMAP_INFO_PRINT("Normal Shutdown case, don't need to recveory; Recovery process is terminated\n"); - } else { - // TODO(jhpark): real recovery process - is_pmem_recv = true; - pmem_recv_offset = pm_mmap_recv_check(recv_mmap_mtrlog_fil_hdr); - pmem_recv_size = recv_mmap_mtrlog_fil_hdr->size; - - // jhpark: check buffer!!!!! - // pm_mmap_recv_flush_buffer(); - - PMEMMMAP_INFO_PRINT("recovery offset: %lu\n", pmem_recv_offset); - } - - // step1. allocate mtr_recv_sys - // step2. 1) get header infor mation and 2) get info from mtr log region - // step3. reconstruct undo page - - // Get header information from exsiting nvdimm log file - //size_t recv_prev_offset = recv_mmap_mtrlog_hdr->prev; - //memset(recv_mmap_mtrlog_hdr, 0x00, PMEM_MMAP_MTRLOG_HDR_SIZE); - //memcpy(recv_mmap_mtrlog_hdr, gb_pm_mmap+recv_prev_offset, PMEM_MMAP_MTRLOG_HDR_SIZE); - - // debug - //fprintf(stderr, "size: %lu\n", recv_size); - //fprintf(stderr, "len: %lu\n", recv_mmap_mtrlog_hdr->len); - //fprintf(stderr, "lsn: %lu\n", recv_mmap_mtrlog_hdr->lsn); - //fprintf(stderr, "need_recovery: %d\n", recv_mmap_mtrlog_hdr->need_recv); - - free(recv_mmap_mtrlog_fil_hdr); - free(recv_mmap_mtrlog_hdr); + // TODO(jhpark): fix + memcpy(gb_pm_mmap + (6*1024*1024*1024UL), gb_pm_mmap + (1*1024*1024*1024UL), (8UL*147324928)); + is_pmem_recv = true; } // Force to set NVIMMM @@ -396,91 +352,3 @@ ssize_t pm_mmap_mtrlogbuf_write( return ret; } -// commit mtr log -void pm_mmap_mtrlogbuf_commit(unsigned char* rec, unsigned long cur_rec_size ,ulint space, ulint page_no) { - // TODO(jhaprk): Keep page modification finish log for recovery - // For current mtr logging version, we jsut ignore this function - //return; - flush_cache(rec, cur_rec_size); -/* - if (mmap_mtrlogbuf == NULL) return; - - //fprintf(stderr, "[mtr-commit] space: %lu page_no: %lu\n", space, page_no); - // 1. check current cur_offset - size_t cur_offset = mmap_mtrlogbuf->cur_offset; - // 2. check current ckpt_offset - size_t ckpt_offset = mmap_mtrlogbuf->ckpt_offset; - // 3. remove stale log data - memset(gb_pm_mmap + PMEM_MMAP_MTR_FIL_HDR_SIZE, 0x00, cur_offset - PMEM_MMAP_MTR_FIL_HDR_SIZE); - //fprintf(stderr, "cur_offset: %lu ckpt_offset: %lu\n", cur_offset, ckpt_offset); - mmap_mtrlogbuf->cur_offset = PMEM_MMAP_MTR_FIL_HDR_SIZE; - mmap_mtrlogbuf->prev_offset = PMEM_MMAP_MTR_FIL_HDR_SIZE; - // really needed? - pm_mmap_write_logfile_header_size(PMEM_MMAP_MTR_FIL_HDR_SIZE); -*/ - -} - - -// compare mtr log with given space_id, and page_no -// offset is start offset of "log body" of mtr log -bool pm_mmap_mtrlogbuf_identify(size_t offset, size_t n, ulint space, ulint page_no) { - // mtr log structure: [type(1)] [space_id(4)] [page_no(4)] - // mach_write_compressed used when writing space_id and page_no - // + 1 means jump over MTR_LOG_TYPE - ulint cur_space, cur_page; - const byte *ptr = gb_pm_mmap+offset; - const byte *end_ptr = gb_pm_mmap+offset+n; - ptr++; - - cur_space = mach_parse_compressed(&ptr, end_ptr); - if (ptr != NULL) { - cur_page = mach_parse_compressed(&ptr, end_ptr); - } - - //fprintf(stderr, "[mtr identify] space(%lu) : %lu pange_no(%lu) : %lu\n", space, cur_space, page_no, cur_page); - return ((cur_space == space) && (cur_page == page_no)); -} - -void pm_mmap_mtrlogbuf_unset_recv_flag(size_t offset) { - memcpy(gb_pm_mmap + offset, false, sizeof(bool)); - // need flush? No we can recover by using commit log -} - -void pm_mmap_mtrlogbuf_commit_v1(ulint space, ulint page_no) { - // 1. start to inspect mtr log from latest ckpt_offset - // 2. check specific mtr log with spaced_id and page_no - // 2.1 (yes) check need_recv is set goto 3.1 - // 2.2 (no) check need recv is set goto 3.2 - // 3.1. update ckpt_offset to current offset - // 4. move to next mtr log (until cur_offset) - - if (mmap_mtrlogbuf == NULL) return; - - size_t offset = mmap_mtrlogbuf->ckpt_offset; - while (offset != mmap_mtrlogbuf->cur_offset) { - - fprintf(stderr, "offset : %lu cur_offset: %lu\n", offset, mmap_mtrlogbuf->cur_offset); - PMEM_MMAP_MTRLOG_HDR mmap_mtr_hdr; - memcpy(&mmap_mtr_hdr, gb_pm_mmap + offset, (size_t) PMEM_MMAP_MTRLOG_HDR_SIZE); - uint64_t data_len = mmap_mtr_hdr.len; - bool need_recv = mmap_mtr_hdr.need_recv; - - fprintf(stderr, "[mtr info] data_len : %lu lsn: %lu need_recv : %d\n", - data_len, mmap_mtr_hdr.lsn, need_recv); - - // move next - uint64_t org_offset = offset; - offset += PMEM_MMAP_MTRLOG_HDR_SIZE; - - if (pm_mmap_mtrlogbuf_identify(offset, data_len, space, page_no)) { - pm_mmap_mtrlogbuf_unset_recv_flag(org_offset); - } - if (need_recv) { - mmap_mtrlogbuf->ckpt_offset = org_offset; - } - offset += data_len; - } - fprintf(stderr, "break out ! ckpt_offset: %lu\n", mmap_mtrlogbuf->ckpt_offset); -} - diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 383f3fa1..582bab1f 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1510,8 +1510,6 @@ innobase_start_or_create_for_mysql(void) // buffer retion initialization (2GB) pm_mmap_buf_init(1024*1024*1024*2UL); } - - //pm_mmap_buf_init(1024*1024*1024*3UL); #endif /* UNIV_NVDIMM_CACHE */ @@ -2332,32 +2330,14 @@ innobase_start_or_create_for_mysql(void) return(srv_init_abort(DB_ERROR)); } -#ifdef UNIV_NVDIMM_CACHE - fprintf(stderr, "[JONGQ] ---- pass force recovery!\n"); - -// TODO(jhpark): NC recovery check !!!!! - if (is_pmem_recv) { - PMEMMMAP_INFO_PRINT("YES!!!! recovery!!!! start_offset: %lu end_offset: %lu\n" - ,pmem_recv_offset, pmem_recv_size); -// pm_mmap_recv(pmem_recv_offset, pmem_recv_size); -// PMEMMMAP_INFO_PRINT("UNDO page is recoverd !!!!\n"); -// //pm_mmap_recv_flush_buffer(); - } -#endif /* UNIV_NVDIMM_CACHE */ purge_queue = trx_sys_init_at_db_start(); - fprintf(stderr, "[JONGQ] ---- trx_sys_init_at_db_start finished!\n"); - if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { /* Apply the hashed log records to the respective file pages, for the last batch of recv_group_scan_log_recs(). */ -#ifdef UNIV_NVDIMM_CACHE - PMEMMMAP_INFO_PRINT("JONGQ recovery-4-1\n"); -#endif /* UNIV_NVDIMM_CACHE */ - recv_apply_hashed_log_recs(TRUE); DBUG_PRINT("ib_log", ("apply completed")); @@ -2366,10 +2346,6 @@ innobase_start_or_create_for_mysql(void) } } -#ifdef UNIV_NVDIMM_CACHE - PMEMMMAP_INFO_PRINT("JONGQ recovery-5\n"); -#endif /* UNIV_NVDIMM_CACHE */ - if (recv_sys->found_corrupt_log) { ib::warn() << "The log file may have been corrupt and it" @@ -2579,9 +2555,6 @@ innobase_start_or_create_for_mysql(void) variable srv_available_undo_logs. The number of rsegs to use can be set using the dynamic global variable srv_rollback_segments. */ - // debug - fprintf(stderr, "[JONGQ] initialize undo log lists\n"); - srv_available_undo_logs = trx_sys_create_rsegs( srv_undo_tablespaces, srv_rollback_segments, srv_tmp_undo_logs); From e57b7cab732c7502d9225d83a00a91e414ede7c9 Mon Sep 17 00:00:00 2001 From: JonghyeokPark Date: Sat, 14 May 2022 13:35:24 +0900 Subject: [PATCH 3/8] add recovery --- my.cnf | 22 ++-- storage/innobase/buf/buf0buf.cc | 70 +++++++++- storage/innobase/buf/buf0flu.cc | 60 +++++---- storage/innobase/include/buf0buf.h | 4 + storage/innobase/include/pmem_mmap_obj.h | 9 ++ storage/innobase/log/log0log.cc | 35 ++++- storage/innobase/log/log0recv.cc | 98 +++++++++++++- storage/innobase/mtr/mtr0mtr.cc | 156 +++++++++++++++++++---- storage/innobase/next_checkpoint_lsn) | 7 + storage/innobase/pmem/pmem0mmap.cc | 9 +- storage/innobase/pmem/pmem0recv.cc | 91 +++++++++++++ storage/innobase/srv/srv0start.cc | 18 ++- storage/innobase/trx/trx0rec.cc | 19 +-- storage/innobase/trx/trx0trx.cc | 5 +- 14 files changed, 511 insertions(+), 92 deletions(-) create mode 100644 storage/innobase/next_checkpoint_lsn) diff --git a/my.cnf b/my.cnf index 060d33fb..cbaf48fe 100644 --- a/my.cnf +++ b/my.cnf @@ -14,6 +14,7 @@ prompt=\u:\d>\_ # This was formally known as [safe_mysqld]. Both versions are currently parsed. [mysqld_safe] +user = root socket = /tmp/mysql.sock #nice = 0 @@ -21,19 +22,23 @@ socket = /tmp/mysql.sock # # * Basic Settings # +user = root default-storage-engine = innodb skip-grant-tables -pid-file = /home/mijin/test_data/mysql.pid +pid-file = /home/vldb/test_data/mysql.pid socket = /tmp/mysql.sock port = 3306 -datadir = /home/mijin/test_data/ -log-error = /home/mijin/test_data/mysql_error_nvdimm.log +datadir = /home/vldb/test_data/ +log-error = /home/vldb/test_data/mysql_error_nvdimm.log ################################################## # Need to Modify ################################################## #Log group path (iblog0, iblog1) -innodb_log_group_home_dir=/home/mijin/test_log/ +innodb_log_group_home_dir=/home/vldb/test_log/ +#innodb_log_group_home_dir=/mnt/pmemdir/test_log/ +#innodb_undo_directory=/mnt/pmemdir/ + #innodb page size innodb_page_size=4KB @@ -59,17 +64,17 @@ innodb_use_nvdimm_buffer=true innodb_nvdimm_buffer_pool_size=1G innodb_nvdimm_buffer_pool_instances=1 innodb_nvdimm_pc_threshold_pct=15 -innodb_nvdimm_home_dir=/mnt/ramdisk -#innodb_nvdimm_home_dir=/mnt/pmem +innodb_nvdimm_home_dir=/mnt/pmemdir #transaction log settings -innodb_log_file_size=2G +#innodb_log_file_size=500M innodb_log_files_in_group=3 # 0:every 1 seconds, 1:fsync on commits, 2:writes on commits -innodb_flush_log_at_trx_commit=0 +innodb_flush_log_at_trx_commit=1 innodb_log_buffer_size=32M innodb_flush_neighbors=0 +#innodb_force_recovery = 1 #doublewrite and flush method innodb_doublewrite=ON @@ -95,4 +100,3 @@ open_files_limit = 24000 #performance-schema-instrument='wait/synch/rwlock/innodb/%=ON' #innodb_status_output = ON #innodb_status_output_locks = ON - diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 7c24d104..63f3efda 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -406,6 +406,74 @@ buf_pool_register_chunk( chunk->blocks->frame, chunk)); } +// HOT DEBUG // +lsn_t +nvdimm_buf_pool_get_oldest_modification(void) +/*==================================*/ +{ + lsn_t lsn = 0; + lsn_t oldest_lsn = 0; + lsn_t page_lsn = 0; + + /* When we traverse all the flush lists we don't want another + thread to add a dirty page to any flush list. */ + log_flush_order_mutex_enter(); + + for (ulint i = srv_buf_pool_instances; i < srv_buf_pool_instances+1; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_flush_list_mutex_enter(buf_pool); + + buf_page_t* bpage; + + /* We don't let log-checkpoint halt because pages from system + temporary are not yet flushed to the disk. Anyway, object + residing in system temporary doesn't generate REDO logging. */ + for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + bpage != NULL + && fsp_is_system_temporary(bpage->id.space()); + bpage = UT_LIST_GET_PREV(list, bpage)) { + /* Do nothing. */ + } + + if (bpage != NULL) { + ut_ad(bpage->in_flush_list); + + // HOT DEBUG // + // check page lsn of current NC pages + /* + buf_block_t *block; + block = (buf_block_t*)bpage; + uint64_t cur_lsn_page = mach_read_from_8(block->frame + FIL_PAGE_LSN); + if (cur_lsn_page !=0 + && cur_lsn_page < pmem_lsn) { + lsn = pmem_lsn; + } else { + lsn = cur_lsn_page; + } + */ + lsn = bpage->oldest_modification; + } + + buf_flush_list_mutex_exit(buf_pool); + + if (!oldest_lsn || oldest_lsn > lsn) { + oldest_lsn = lsn; + } + } + + log_flush_order_mutex_exit(); + + /* The returned answer may be out of date: the flush_list can + change after the mutex has been released. */ + + return(oldest_lsn); +} + + + /********************************************************************//** Gets the smallest oldest_modification lsn for any page in the pool. Returns zero if all modified pages have been flushed to disk. @@ -421,7 +489,7 @@ buf_pool_get_oldest_modification(void) thread to add a dirty page to any flush list. */ log_flush_order_mutex_enter(); - for (ulint i = 0; i < srv_buf_pool_instances; i++) { + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; buf_pool = buf_pool_from_array(i); diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 83afe974..7e205a12 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1047,14 +1047,8 @@ buf_flush_write_block_low( /* Force the log to the disk before writing the modified block */ if (!srv_read_only_mode) { -#ifdef UNIV_NVDIMM_CACHE - if (bpage->buf_pool_index < srv_buf_pool_instances) { - log_write_up_to(bpage->newest_modification, true); - } -#else - log_write_up_to(bpage->newest_modification, true); -#endif /* UNIV_NVDIMM_CACHE */ - } + log_write_up_to(bpage->newest_modification, true); + } switch (buf_page_get_state(bpage)) { case BUF_BLOCK_POOL_WATCH: @@ -1078,8 +1072,8 @@ buf_flush_write_block_low( if (!frame) { frame = ((buf_block_t*) bpage)->frame; } - - buf_flush_init_for_writing( + + buf_flush_init_for_writing( reinterpret_cast(bpage), reinterpret_cast(bpage)->frame, bpage->zip.data ? &bpage->zip : NULL, @@ -1107,30 +1101,48 @@ buf_flush_write_block_low( if (nvdimm_page == NULL) goto normal; - /*ib::info() << "page_id = " << bpage->id.space() + /* + ib::info() << "page_id = " << bpage->id.space() << " offset = " << bpage->id.page_no() << " dst = " << &(((buf_block_t *)nvdimm_page)->frame) << " src = " << &(((buf_block_t *)bpage)->frame) - << " flush-type = " << bpage->flush_type;*/ + << " flush-type = " << bpage->flush_type; + */ + memcpy(((buf_block_t *)nvdimm_page)->frame, ((buf_block_t *)bpage)->frame, UNIV_PAGE_SIZE); /* Set the oldest LSN of the NVDIMM page to the previous newest LSN. */ - buf_flush_note_modification((buf_block_t *)nvdimm_page, bpage->newest_modification, bpage->newest_modification, nvdimm_page->flush_observer); + +// nvdimm_page->oldest_modification = bpage->oldest_modification; +// nvdimm_page->newest_modification = bpage->newest_modification; + + buf_flush_note_modification((buf_block_t *)nvdimm_page + , bpage->oldest_modification + , bpage->newest_modification + , nvdimm_page->flush_observer); + +// ib::info() << "oldest_modification: " +// << nvdimm_page->oldest_modification +// << nvdimm_page->id.space() << ":" << nvdimm_page->id.page_no(); flush_cache(((buf_block_t *)nvdimm_page)->frame, UNIV_PAGE_SIZE); /* Remove the target page from the original buffer pool. */ buf_page_io_complete(bpage, true); buf_page_io_complete(nvdimm_page); - - /*buf_pool_t* buf_pool = buf_pool_from_bpage(nvdimm_page); + + /* + buf_pool_t* buf_pool = buf_pool_from_bpage(nvdimm_page); ib::info() << nvdimm_page->id.space() << " " << nvdimm_page->id.page_no() << " is moved to " - << nvdimm_page->buf_pool_index << " from " << bpage->buf_pool_index;*/ + << nvdimm_page->buf_pool_index << " from " << bpage->buf_pool_index; + */ + } else { normal: bpage->moved_to_nvdimm = false; - /*ib::info() << bpage->id.space() << " " << bpage->id.page_no() + /* + ib::info() << bpage->id.space() << " " << bpage->id.page_no() << " is batch written. cached? " << bpage->cached_in_nvdimm << " moved? " << bpage->moved_to_nvdimm << " flush-type: " << flush_type @@ -1139,6 +1151,7 @@ buf_flush_write_block_low( << " newest: " << bpage->newest_modification << " lsn-gap: " << bpage->newest_modification - bpage->oldest_modification; */ + if (!srv_use_doublewrite_buf || buf_dblwr == NULL || srv_read_only_mode @@ -1152,17 +1165,20 @@ buf_flush_write_block_low( IORequest request(type); - /*lsn_t lsn_gap = bpage->newest_modification - bpage->oldest_modification; - - ib::info() << bpage->id.space() << " " << bpage->id.page_no() + lsn_t lsn_gap = bpage->newest_modification - bpage->oldest_modification; + if (bpage->cached_in_nvdimm) { + ib::info() << bpage->id.space() << " " << bpage->id.page_no() << " is batch written. cached? " << bpage->cached_in_nvdimm << " moved? " << bpage->moved_to_nvdimm << " flush-type: " << flush_type << " buf-fix: " << bpage->buf_fix_count << " with oldest: " << bpage->oldest_modification << " newest: " << bpage->newest_modification - << " lsn-gap: " << lsn_gap;*/ - + << " lsn-gap: " << lsn_gap; + + pmem_lsn = bpage->oldest_modification; + } + fil_io(request, sync, bpage->id, bpage->size, 0, bpage->size.physical(), frame, bpage); diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index f2afda23..c72f7f38 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -459,6 +459,10 @@ UNIV_INLINE ulint buf_pool_get_n_pages(void); /*=======================*/ +// HOT DEBUG +lsn_t +nvdimm_buf_pool_get_oldest_modification(void); + /********************************************************************//** Gets the smallest oldest_modification lsn for any page in the pool. Returns zero if all modified pages have been flushed to disk. diff --git a/storage/innobase/include/pmem_mmap_obj.h b/storage/innobase/include/pmem_mmap_obj.h index b0de14e5..a2ed7669 100644 --- a/storage/innobase/include/pmem_mmap_obj.h +++ b/storage/innobase/include/pmem_mmap_obj.h @@ -11,6 +11,8 @@ #include #include +#include +#include //#include "ut0new.h" //#include "log0log.h" @@ -199,10 +201,17 @@ bool pm_mmap_recv(uint64_t start_offset, uint64_t end_offset); uint64_t pm_mmap_recv_check(PMEM_MMAP_MTRLOGFILE_HDR* log_fil_hdr); void pm_mmap_recv_flush_buffer(); +// add +extern std::map , std::vector > pmem_nc_buffer_map; +uint64_t pm_mmap_recv_check_nc_buf(uint64_t space, uint64_t page_no); +void nc_recv_analysis(); + // TODO(jhpark): covert these variables as structure (i.e., recv_sys_t) extern bool is_pmem_recv; extern uint64_t pmem_recv_offset; extern uint64_t pmem_recv_size; +extern uint64_t pmem_lsn; +void nc_save_pmem_lsn(); /** Recovery system data structure */ //struct recv_sys_t{ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 2edb6c9b..69733d18 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -57,6 +57,11 @@ Created 12/9/1995 Heikki Tuuri #include "sync0sync.h" #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_NVDIMM_CACHE +#include "pmem_mmap_obj.h" +extern unsigned char* gb_pm_mmap; +#endif + /* General philosophy of InnoDB redo-logs: @@ -149,7 +154,7 @@ log_buf_pool_get_oldest_modification(void) if (!lsn) { lsn = log_sys->lsn; - } + } return(lsn); } @@ -428,8 +433,12 @@ log_write_low( - (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_TRL_SIZE; } - +#ifdef UNIV_NVDIMM_CACHE + ut_memcpy(log->buf + log->buf_free, str, len); + flush_cache(log->buf+log->buf_free, len); +#else ut_memcpy(log->buf + log->buf_free, str, len); +#endif str_len -= len; str = str + len; @@ -807,10 +816,17 @@ log_init(void) log_sys->buf_size = LOG_BUFFER_SIZE; + /* nc-logging */ +#ifdef UNIV_NVDIMM_CACHE + log_sys->buf_ptr = static_cast(gb_pm_mmap); + log_sys->buf = static_cast( + ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); +#else log_sys->buf_ptr = static_cast( ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE)); log_sys->buf = static_cast( ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); +#endif log_sys->first_in_use = true; @@ -1797,7 +1813,20 @@ log_checkpoint( log_mutex_enter(); ut_ad(!recv_no_log_write); - oldest_lsn = log_buf_pool_get_oldest_modification(); + oldest_lsn = log_buf_pool_get_oldest_modification(); + + // HOT DEBUG + /* + lsn_t nvdimm_lsn = nvdimm_buf_pool_get_oldest_modification(); + if (nvdimm_lsn !=0 + && nvdimm_lsn < oldest_lsn) { + ib::info() << "nvdimm_lsn: " + << nvdimm_lsn << " oldest_lsn: " << oldest_lsn + << " the gap: " << oldest_lsn - nvdimm_lsn; + oldest_lsn = nvdimm_lsn; + } + */ + /* Because log also contains headers and dummy log records, log_buf_pool_get_oldest_modification() will return log_sys->lsn diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index f7030cd9..26c76800 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -100,6 +100,10 @@ number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by recv_recovery_from_checkpoint_start(). */ bool recv_lsn_checks_on; +#ifdef UNIV_NVDIMM_CACHE +#include "pmem_mmap_obj.h" +#endif + /** If the following is TRUE, the buffer pool file pages must be invalidated after recovery and no ibuf operations are allowed; this becomes TRUE if the log record hash table becomes too full, and log records must be merged @@ -2243,7 +2247,7 @@ recv_add_to_hash_table( recv_fold(space, page_no), recv_addr); recv_sys->n_addrs++; // debug -#if 1 +#if 0 fprintf(stderr, "Inserting log rec for space %lu, page %lu\n", space, page_no); #endif @@ -2422,6 +2426,54 @@ recv_recover_page_func( recv = UT_LIST_GET_FIRST(recv_addr->rec_list); + /* nc-logging */ +#ifdef UNIV_NVDIMM_CACHE + extern unsigned char* gb_pm_mmap; + bool nc_page_flag = false; + bool nc_corrupt_flag = false; + uint64_t cur_nc_page_lsn = -1; + + uint64_t cur_nc_buf_offset = pm_mmap_recv_check_nc_buf( + block->page.id.space(), block->page.id.page_no()); + + if (cur_nc_buf_offset != -1) { + // (jhpark): now, we know this page reside in the NVDIMM buffer. + nc_page_flag = true; + unsigned char *nc_frame = reinterpret_cast + ((gb_pm_mmap + (1*1024*1024*1024UL) + cur_nc_buf_offset))->frame; + + uint64_t cur_disk_page_lsn = mach_read_from_8(block->frame + FIL_PAGE_LSN); + cur_nc_page_lsn = mach_read_from_8(nc_frame+FIL_PAGE_LSN); + + // check nc buffer page corruption or not + if (buf_page_is_corrupted(true + , nc_frame + , block->page.size + , fsp_is_checksum_disabled(block->page.id.space()))) { + nc_corrupt_flag = true; + } + + fprintf(stderr, "[DEBUG] offset: %lu current page is NC page and LSN : %lu disk lsn: %lu page_lsn: %lu corupted? %d recv_start_lsn :%lu\n" + , cur_nc_buf_offset, cur_nc_page_lsn, cur_disk_page_lsn, page_lsn, nc_corrupt_flag + , recv->start_lsn); + + // recover from NC buffer + if (!nc_corrupt_flag || cur_disk_page_lsn == 0) { + // check + fprintf(stderr, "[DEBUG] we skip this page: %lu:%lu\n", block->page.id.space(), block->page.id.page_no()); + memcpy(block->frame, nc_frame, UNIV_PAGE_SIZE); + page_lsn = cur_nc_page_lsn; + end_lsn = recv->start_lsn + recv->len; + mach_write_to_8(FIL_PAGE_LSN + (block->frame), end_lsn); + mach_write_to_8(UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + + (block->frame), end_lsn); + goto skip_redo; + } + + } // end-of-if +#endif + while (recv) { end_lsn = recv->end_lsn; @@ -2517,7 +2569,11 @@ recv_recover_page_func( } recv = UT_LIST_GET_NEXT(rec_list, recv); - } + } // end-of-while (recv) + +#ifdef UNIV_NVDIMM_CACHE +skip_redo: +#endif #ifdef UNIV_ZIP_DEBUG if (fil_page_index_page_check(page)) { @@ -4075,6 +4131,20 @@ recv_recovery_from_checkpoint_start( checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN); checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO); +#ifdef UNIV_NVDIMM_CACHE + ib::info() << "Reocvery start from this checkpoint_lsn: " << checkpoint_lsn; + // HOT DEBUG + /* + extern unsigned char* gb_pm_mmap; + if (is_pmem_recv) { + uint64_t cur_pmem_lsn = 0; + memcpy(&cur_pmem_lsn, gb_pm_mmap+6*1024*1024*1024UL ,sizeof(uint64_t)); + checkpoint_lsn = cur_pmem_lsn; + ib::info() << "Reocvery start from this checkpoint_lsn (recv): " << checkpoint_lsn; + } + */ +#endif + /* Read the first log file header to print a note if this is a recovery from a restored InnoDB Hot Backup */ @@ -4130,7 +4200,24 @@ recv_recovery_from_checkpoint_start( group = UT_LIST_GET_FIRST(log_sys->log_groups); ut_ad(recv_sys->n_addrs == 0); + // HOT DEBUG +#ifdef UNIV_NVDIMM_CACHE + if (is_pmem_recv) { + /* + extern unsigned char* gb_pm_mmap; + uint64_t cur_pmem_lsn = 0; + memcpy(&cur_pmem_lsn, gb_pm_mmap+6*1024*1024*1024UL ,sizeof(uint64_t)); + contiguous_lsn = cur_pmem_lsn; + ib::info() << "log chopping we use this lsn for congiguous_lns : " << contiguous_lsn; + */ + contiguous_lsn = checkpoint_lsn; + } else { + contiguous_lsn = checkpoint_lsn; + } +#else contiguous_lsn = checkpoint_lsn; +#endif + switch (group->format) { case 0: log_mutex_exit(); @@ -4148,7 +4235,6 @@ recv_recovery_from_checkpoint_start( the hash table. */ rescan = recv_group_scan_log_recs(group, &contiguous_lsn, false); - if ((recv_sys->found_corrupt_log && !srv_force_recovery) || recv_sys->found_corrupt_fs) { log_mutex_exit(); @@ -4170,7 +4256,8 @@ recv_recovery_from_checkpoint_start( group->scanned_lsn = checkpoint_lsn; rescan = false; - } + } + /* NOTE: we always do a 'recovery' at startup, but only if there is something wrong we will print a message to the @@ -4242,6 +4329,7 @@ recv_recovery_from_checkpoint_start( " database is now corrupt!"; } + if (recv_sys->recovered_lsn < checkpoint_lsn) { log_mutex_exit(); @@ -4253,6 +4341,8 @@ recv_recovery_from_checkpoint_start( return(DB_ERROR); } +skip_2: + /* Synchronize the uncorrupted log groups to the most up-to-date log group; we also copy checkpoint info to groups */ diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index fc79bc40..fddcb269 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -617,8 +617,11 @@ mtr_t::commit() ut_ad(!srv_read_only_mode || m_impl.m_log_mode == MTR_LOG_NO_REDO); - +#ifdef UNIV_NVDIMM_CACHE + cmd.execute_nvm(); +#else cmd.execute(); +#endif } else { cmd.release_all(); cmd.release_resources(); @@ -669,6 +672,10 @@ mtr_t::commit_checkpoint( # error SIZE_OF_MLOG_CHECKPOINT != 9 #endif *ptr = MLOG_CHECKPOINT; + + // HOT DEBUG + // (jhpark): we need to leave the current NC pages + ib::info() << "checkpoint_lsn: " << checkpoint_lsn; mach_write_to_8(ptr + 1, checkpoint_lsn); } @@ -1164,34 +1171,131 @@ mtr_t::Command::execute() void mtr_t::Command::execute_nvm() { ut_ad(m_impl->m_log_mode != MTR_LOG_NONE); - if (const ulint len = prepare_write_nvm()) { - finish_write_nvm(len); - } + // (jhpark): pull prepare_write() fucntion here + ulint len, n_recs; + fil_space_t* space; + + switch (m_impl->m_log_mode) { + case MTR_LOG_SHORT_INSERTS: + ut_ad(0); + /* fall through (write no redo log) */ + case MTR_LOG_NO_REDO: + case MTR_LOG_NONE: + ut_ad(m_impl->m_log.size() == 0); + log_mutex_enter(); + m_end_lsn = m_start_lsn = log_sys->lsn; + len = 0; + break; + case MTR_LOG_ALL: + break; + } - m_impl->m_mtr->m_commit_lsn = m_end_lsn; - release_blocks(); - release_latches(); - release_resources(); + if (m_impl->m_log_mode == MTR_LOG_ALL) { + len = m_impl->m_log.size(); + n_recs = m_impl->m_n_log_recs; + ut_ad(len > 0); + ut_ad(n_recs > 0); + + // (jhpark): call log_buffer_extend here!!! + if (len > log_sys->buf_size / 2) { + log_buffer_extend((len + 1) * 2); + } + + ut_ad(m_impl->m_n_log_recs == n_recs); + space = m_impl->m_user_space; + + if (space != NULL && is_system_or_undo_tablespace(space->id)) { + /* Omit MLOG_FILE_NAME for predefined tablespaces. */ + space = NULL; + } + + log_mutex_enter(); + + if (fil_names_write_if_was_clean(space, m_impl->m_mtr)) { + /* This mini-transaction was the first one to modify + this tablespace since the latest checkpoint, so + some MLOG_FILE_NAME records were appended to m_log. */ + ut_ad(m_impl->m_n_log_recs > n_recs); + mlog_catenate_ulint( + &m_impl->m_log, MLOG_MULTI_REC_END, MLOG_1BYTE); + len = m_impl->m_log.size(); + } else { + /* This was not the first time of dirtying a + tablespace since the latest checkpoint. */ + + ut_ad(n_recs == m_impl->m_n_log_recs); + + if (n_recs <= 1) { + ut_ad(n_recs == 1); + + /* Flag the single log record as the + only record in this mini-transaction. */ + *m_impl->m_log.front()->begin() + |= MLOG_SINGLE_REC_FLAG; + } else { + /* Because this mini-transaction comprises + multiple log records, append MLOG_MULTI_REC_END + at the end. */ + + mlog_catenate_ulint( + &m_impl->m_log, MLOG_MULTI_REC_END, + MLOG_1BYTE); + len++; + } + } + + /* check and attempt a checkpoint if exceeding capacity */ + log_margin_checkpoint_age(len); + } + // (jhpark): end-of-prepare_write() + // (jhpark): pull finish_write() + if (len > 0) { + ut_ad(m_impl->m_log_mode == MTR_LOG_ALL); + ut_ad(log_mutex_own()); + ut_ad(m_impl->m_log.size() == len); + ut_ad(len > 0); + + if (m_impl->m_log.is_small()) { + const mtr_buf_t::block_t* front = m_impl->m_log.front(); + ut_ad(len <= front->used()); + + m_end_lsn = log_reserve_and_write_fast( + front->begin(), len, &m_start_lsn); + + if (m_end_lsn > 0) { + goto skip_redo; + } + } + + /* Open the database log for log_write_low */ + m_start_lsn = log_reserve_and_open(len); + mtr_write_log_t write_log; + m_impl->m_log.for_each_block(write_log); + + m_end_lsn = log_close(); + } + // (jhpark): end-of-finish_write() + +skip_redo: + if (m_impl->m_made_dirty) { + log_flush_order_mutex_enter(); + } + + /* It is now safe to release the log mutex because the + flush_order mutex will ensure that we are the first one + to insert into the flush list. */ + log_mutex_exit(); + + m_impl->m_mtr->m_commit_lsn = m_end_lsn; -// TODO(jhpark): add flush_order mutex when nvdimm caching page is flushed. -// if (m_impl->m_made_dirty) { -// log_flush_order_mutex_enter(); -// } -// /* It is now safe to release the log mutex because the -// flush_order mutex will ensure that we are the first one -// to insert into the flush list. */ -// -// fprintf(stderr, "log_mutex_exit() called! m_end_lsn: %lu\n", m_end_lsn); -// log_mutex_exit(); -// fprintf(stderr, "log_mutex_exit() called! -- finished\n"); -// m_impl->m_mtr->m_commit_lsn = m_end_lsn; -// release_blocks(); -// if (m_impl->m_made_dirty) { -// log_flush_order_mutex_exit(); -// } -// release_latches(); -// release_resources(); + release_blocks(); + + if (m_impl->m_made_dirty) { + log_flush_order_mutex_exit(); + } + release_latches(); + release_resources(); } void mtr_t::Command::execute_no_nvm() { diff --git a/storage/innobase/next_checkpoint_lsn) b/storage/innobase/next_checkpoint_lsn) new file mode 100644 index 00000000..7a6d558d --- /dev/null +++ b/storage/innobase/next_checkpoint_lsn) @@ -0,0 +1,7 @@ +log/log0log.cc 1567 log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn; +log/log0log.cc 1619 log_sys->next_checkpoint_lsn, +log/log0log.cc 1626 mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn); +log/log0log.cc 1628 lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn, +log/log0log.cc 1899 log_sys->next_checkpoint_lsn = oldest_lsn; +log/log0recv.cc 1271 log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn +log/log0recv.cc 4320 log_sys->next_checkpoint_lsn = checkpoint_lsn; diff --git a/storage/innobase/pmem/pmem0mmap.cc b/storage/innobase/pmem/pmem0mmap.cc index 2d52092e..a77a4c35 100644 --- a/storage/innobase/pmem/pmem0mmap.cc +++ b/storage/innobase/pmem/pmem0mmap.cc @@ -18,12 +18,14 @@ unsigned char* gb_pm_mmap; int gb_pm_mmap_fd; PMEM_MMAP_MTRLOG_BUF* mmap_mtrlogbuf = NULL; +// HOT DEBUG +uint64_t pmem_lsn; // recovery bool is_pmem_recv = false; uint64_t pmem_recv_offset = 0; uint64_t pmem_recv_size = 0; - +std::map ,std::vector > pmem_nc_buffer_map; unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { @@ -47,7 +49,7 @@ unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { } else { // TODO(jhaprk) add the recovery logic - PMEMMMAP_INFO_PRINT("Start mtr recvoery process\n"); + PMEMMMAP_INFO_PRINT("Start NC recvoery process\n"); gb_pm_mmap_fd = open(path, O_RDWR, 0777); if (gb_pm_mmap_fd < 0) { @@ -59,8 +61,9 @@ unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { if (gb_pm_mmap == MAP_FAILED) { PMEMMMAP_ERROR_PRINT("pm_mmap mmap() faild recovery failed\n"); } + // TODO(jhpark): fix - memcpy(gb_pm_mmap + (6*1024*1024*1024UL), gb_pm_mmap + (1*1024*1024*1024UL), (8UL*147324928)); + //memcpy(gb_pm_mmap + (6*1024*1024*1024UL), gb_pm_mmap + (1*1024*1024*1024UL), (8UL*147324928)); is_pmem_recv = true; } diff --git a/storage/innobase/pmem/pmem0recv.cc b/storage/innobase/pmem/pmem0recv.cc index 69744416..f6654a29 100644 --- a/storage/innobase/pmem/pmem0recv.cc +++ b/storage/innobase/pmem/pmem0recv.cc @@ -184,3 +184,94 @@ void pm_mmap_recv_flush_buffer() { // note that changes on these pages are not atomic // they might have partial updates } + +uint64_t pm_mmap_recv_check_nc_buf(uint64_t space, uint64_t page_no) { + std::map, std::vector >::iterator ncbuf_iter; + ncbuf_iter = pmem_nc_buffer_map.find(std::make_pair(space,page_no)); + if (ncbuf_iter != pmem_nc_buffer_map.end()) { + std::vector nc_offset_vec = (*ncbuf_iter).second; + uint64_t nc_offset; + for (uint64_t i=0; i + ((gb_pm_mmap + (1*1024*1024*1024UL) + nc_offset))->frame; + + fprintf(stderr, "[DEBUG] NC BUF (%lu:%lu) offset: %lu page_lsn: %lu i: %lu vec:size: %d\n", + space, page_no, nc_offset + , mach_read_from_8(nc_frame + FIL_PAGE_LSN) + , i, nc_offset_vec.size()); + if (space != mach_read_from_4(nc_frame + FIL_PAGE_SPACE_ID) + || page_no != mach_read_from_4(nc_frame + FIL_PAGE_OFFSET)) { + fprintf(stderr, "[DEBUG] wrong buffer page info! %u:%u\n", space, page_no); + } + } + return nc_offset; + } else { + return -1; + } +} +/* nc logging */ + +void nc_recv_analysis() { + uint64_t space, page_no; + unsigned char *addr = gb_pm_mmap + (1*1024*1024*1024UL); + uint64_t page_num_chunks = static_cast( (8*147324928UL)/4096); + + fprintf(stderr, "[DEBUG] NVDIMM Caching page analysis begin! total pages v2: %lu\n", page_num_chunks); + + for (uint64_t i=0; i < page_num_chunks; ++i) { + //for (uint64_t i=0; i < srv_nvdimm_buf_pool_size; i+= UNIV_PAGE_SIZE) { + + space = reinterpret_cast((addr+ i * sizeof(buf_block_t)))->page.id.space(); + page_no = reinterpret_cast((addr+ i * sizeof(buf_block_t)))->page.id.page_no(); + unsigned char *frame = reinterpret_cast((addr+ i * sizeof(buf_block_t)))->frame; + + // HOT DEBUG // + //space = reinterpret_cast((addr+ i ))->page.id.space(); + //page_no = reinterpret_cast((addr+ i ))->page.id.page_no(); + //unsigned char *frame = (unsigned char*)(addr+ i); + + if (space != 27 && space != 29 && space != 31) { + fprintf(stderr, "[DEBUG] we miss the pages %lu:%lu\n", space, page_no); + if (space == 4294967295 + && page_no == 4294967295) { + continue; + } else { + break; + } + } else { + fprintf(stderr, "[DEBUG] we get this page %lu:%lu\n", space, page_no); + } + + // check + if (space != mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + || page_no != mach_read_from_4(frame + FIL_PAGE_OFFSET)) { + fprintf(stderr, "[DEBUG] wrong frame info!\n (%lu:%lu) (%lu:%lu)", space, page_no + , mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + , mach_read_from_4(frame + FIL_PAGE_OFFSET)); + } + +#ifdef PMEM_RECV_DEBUG + fil_space_t* space_t = fil_space_get(space); + const page_id_t page_id(space,page_no); + const page_size_t page_size(space_t->flags); + if (buf_page_is_corrupted(true, frame, page_size, + fsp_is_checksum_disabled(space))) { + fprintf(stderr, "(%lu:%lu) page is corruptted! lsn: %lu\n", space, page_no, mach_read_from_8(frame + FIL_PAGE_LSN)); + } else { + fprintf(stderr, "(%lu:%lu) page is good! lsn: %lu\n", space, page_no, mach_read_from_8(frame + FIL_PAGE_LSN)); + } +#endif + + // we store relative position of nc page + pmem_nc_buffer_map[std::make_pair(space,page_no)].push_back(i*sizeof(buf_block_t)); + + } +} + +void nc_save_pmem_lsn() { + ib::info() << "xxx pmem_lsn save !!!: " << pmem_lsn; + memcpy((gb_pm_mmap + 6*1024*1024*1024UL), &pmem_lsn, sizeof(uint64_t)); + flush_cache((gb_pm_mmap + 6*1024*1024*1024UL), sizeof(uint64_t)); +} + diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 582bab1f..0d6ff80c 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1493,7 +1493,7 @@ innobase_start_or_create_for_mysql(void) #ifdef UNIV_NVDIMM_CACHE sprintf(PMEM_FILE_PATH, "%s/%s", srv_nvdimm_home_dir, NVDIMM_MMAP_FILE_NAME); - size_t srv_pmem_pool_size = 3 * 1024; + size_t srv_pmem_pool_size = 8 * 1024; uint64_t pool_size = srv_pmem_pool_size * 1024 * 1024UL; gb_pm_mmap = pm_mmap_create(PMEM_FILE_PATH, pool_size); if (!gb_pm_mmap) { @@ -1501,7 +1501,7 @@ innobase_start_or_create_for_mysql(void) assert(gb_pm_mmap); } - if (!is_pmem_recv) { + //if (!is_pmem_recv) { // for debugging : chagne the mtr log region size // original : 1024*1024*1024*8UL (8GB) pm_mmap_mtrlogbuf_init(1024*1024*1024*1UL); // 1GB for test @@ -1509,7 +1509,7 @@ innobase_start_or_create_for_mysql(void) // TODO(jhpark): change buffer pool recovery policy // buffer retion initialization (2GB) pm_mmap_buf_init(1024*1024*1024*2UL); - } + //} #endif /* UNIV_NVDIMM_CACHE */ @@ -2299,6 +2299,18 @@ innobase_start_or_create_for_mysql(void) fprintf(stderr, "[JONGQ] ---- scan_and_parse log file finished\n"); +#ifdef UNIV_NVDIMM_CACHE + if (is_pmem_recv) { + nc_recv_analysis(); + } else { + // HOT DEBUG + pmem_lsn = flushed_lsn; + nc_save_pmem_lsn(); + } +#endif + + + /* We always try to do a recovery, even if the database had been shut down normally: this is the normal startup path */ diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index 941bbd0a..d8e9d32f 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -2072,23 +2072,8 @@ trx_undo_report_row_operation( mtr_commit(&mtr); } else { /* Success */ - undo->withdraw_clock = buf_withdraw_clock; - - // FIXME(jhpark): for NVDIMM resident pages, we don't need to flush mtr log to WAL log buffer - // just release the mtr structure. -#ifdef UNIV_NVDIMM_CACHE - if (is_nvm_page) { - //ulint space = index->space; - //ulint page = index->page; - //fprintf(stderr, "[mtr-commit] space : %lu page : %lu\n", space, page); - //mtr_commit_nvm(&mtr, space, page); - mtr_commit_no_nvm(&mtr); - } else { - mtr_commit(&mtr); - } -#else - mtr_commit(&mtr); -#endif /* UNIV_NVDIMM_CACHE */ + undo->withdraw_clock = buf_withdraw_clock; + mtr_commit(&mtr); undo->empty = FALSE; undo->top_page_no = page_no; diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 5c23e749..5eaecc01 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -784,10 +784,7 @@ trx_resurrect_table_locks( trx->mod_tables.insert(table); } lock_table_ix_resurrect(table, trx); - - // debugging - fprintf(stderr, "ib_trx resurrect %d table %s IX lock from %s undo", trx_get_id_for_print(trx), table->name.m_name, undo == undo_ptr->insert_undo ? "insert" : "update"); - + DBUG_PRINT("ib_trx", ("resurrect" TRX_ID_FMT " table '%s' IX lock from %s undo", From 58fd8b02e06e15e783f799f20f7ae5921bd66752 Mon Sep 17 00:00:00 2001 From: JonghyeokPark Date: Mon, 16 May 2022 02:04:47 +0900 Subject: [PATCH 4/8] recovery + dwb added --- storage/innobase/buf/buf0buf.cc | 2 +- storage/innobase/buf/buf0dblwr.cc | 206 ++++++++++++++++++++++- storage/innobase/buf/buf0dump.cc | 10 ++ storage/innobase/buf/buf0flu.cc | 16 +- storage/innobase/fil/fil0fil.cc | 3 +- storage/innobase/handler/ha_innodb.cc | 5 + storage/innobase/include/pmem_mmap_obj.h | 26 +++ storage/innobase/include/srv0srv.h | 2 + storage/innobase/log/log0log.cc | 11 +- storage/innobase/log/log0recv.cc | 40 +++-- storage/innobase/mtr/mtr0mtr.cc | 2 +- storage/innobase/pmem/pmem0mmap.cc | 17 +- storage/innobase/pmem/pmem0recv.cc | 91 ++++++++-- storage/innobase/srv/srv0srv.cc | 2 + storage/innobase/srv/srv0start.cc | 17 +- 15 files changed, 408 insertions(+), 42 deletions(-) diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 63f3efda..8421d60a 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -6399,7 +6399,7 @@ buf_page_io_complete( Asserts that all file pages in the buffer are in a replaceable state. @return TRUE */ static -ibool + ibool buf_all_freed_instance( /*===================*/ buf_pool_t* buf_pool) /*!< in: buffer pool instancce */ diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 057f8e82..3d2ca692 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -39,6 +39,11 @@ Created 2011/12/19 #ifndef UNIV_HOTBACKUP +#ifdef UNIV_NVDIMM_CACHE +extern unsigned char* gb_pm_mmap; +extern pfs_os_file_t gb_pm_dwb_file; +#endif + /** The doublewrite buffer */ buf_dblwr_t* buf_dblwr = NULL; @@ -123,6 +128,10 @@ buf_dblwr_init( byte* doublewrite) /*!< in: pointer to the doublewrite buf header on trx sys page */ { +#ifdef UNIV_NVDIMM_CACHE + dberr_t err; + byte* buf; +#endif ulint buf_size; buf_dblwr = static_cast( @@ -153,12 +162,66 @@ buf_dblwr_init( buf_dblwr->in_use = static_cast( ut_zalloc_nokey(buf_size * sizeof(bool))); +#ifdef UNIV_NVDIMM_CACHE + // HOT DEBUG + // TODO(jhpark): recovery + if (srv_use_nvdimm_dwb) { + + buf_dblwr->write_buf_unaligned = static_cast( + gb_pm_mmap + 5*1024*1024*1024UL); + buf_dblwr->write_buf = static_cast( + ut_align(buf_dblwr->write_buf_unaligned,UNIV_PAGE_SIZE)); + + ib::info() << "we configured the DWB on NVM!"; + + buf = buf_dblwr->write_buf; + IORequest read_request(IORequest::READ); + read_request.disable_compression(); + + err = os_file_read( + read_request, + gb_pm_dwb_file, buf, buf_dblwr->block1 * UNIV_PAGE_SIZE, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE); + + if (err != DB_SUCCESS) { + + ib::error() + << "Failed to read the first double write buffer " + "extent"; + return; + } + + err = os_file_read( + read_request, + gb_pm_dwb_file, + buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf_dblwr->block2 * UNIV_PAGE_SIZE, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE); + + if (err != DB_SUCCESS) { + + ib::error() + << "Failed to read the second double write buffer " + "extent"; + return; + } + } else { + // original + buf_dblwr->write_buf_unaligned = static_cast( + ut_malloc_nokey((1 + buf_size) * UNIV_PAGE_SIZE)); + + buf_dblwr->write_buf = static_cast( + ut_align(buf_dblwr->write_buf_unaligned, + UNIV_PAGE_SIZE)); + } +#else buf_dblwr->write_buf_unaligned = static_cast( ut_malloc_nokey((1 + buf_size) * UNIV_PAGE_SIZE)); buf_dblwr->write_buf = static_cast( ut_align(buf_dblwr->write_buf_unaligned, UNIV_PAGE_SIZE)); +#endif buf_dblwr->buf_block_arr = static_cast( ut_zalloc_nokey(buf_size * sizeof(void*))); @@ -400,6 +463,12 @@ buf_dblwr_init_or_load_pages( == TRX_SYS_DOUBLEWRITE_MAGIC_N) { /* The doublewrite buffer has been created */ +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + gb_pm_dwb_file = file; + } +#endif + buf_dblwr_init(doublewrite); block1 = buf_dblwr->block1; @@ -426,6 +495,11 @@ buf_dblwr_init_or_load_pages( } /* Read the pages from the doublewrite buffer to memory */ +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + goto skip_load_dwb; + } +#endif err = os_file_read( read_request, file, buf, block1 * UNIV_PAGE_SIZE, @@ -460,6 +534,9 @@ buf_dblwr_init_or_load_pages( return(err); } +#ifdef UNIV_NVDIMM_CACHE +skip_load_dwb: +#endif /* Check if any of these pages is half-written in data files, in the intended position */ @@ -702,7 +779,14 @@ buf_dblwr_free(void) os_event_destroy(buf_dblwr->b_event); os_event_destroy(buf_dblwr->s_event); +#ifdef UNIV_NVDIMM_CACHE + // skip free dwb here + if (!srv_use_nvdimm_dwb) { + ut_free(buf_dblwr->write_buf_unaligned); + } +#else ut_free(buf_dblwr->write_buf_unaligned); +#endif buf_dblwr->write_buf_unaligned = NULL; ut_free(buf_dblwr->buf_block_arr); @@ -747,12 +831,19 @@ buf_dblwr_update( ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free); buf_dblwr->b_reserved--; - - if (buf_dblwr->b_reserved == 0) { + + if (buf_dblwr->b_reserved == 0) { mutex_exit(&buf_dblwr->mutex); /* This will finish the batch. Sync data files to the disk. */ +#ifdef UNIV_NVDIMM_CACHE + // we do not need to flush + if (!srv_use_nvdimm_dwb) { + fil_flush_file_spaces(FIL_TYPE_TABLESPACE); + } +#else fil_flush_file_spaces(FIL_TYPE_TABLESPACE); +#endif mutex_enter(&buf_dblwr->mutex); /* We can now reuse the doublewrite memory buffer: */ @@ -773,6 +864,7 @@ buf_dblwr_update( buf_dblwr->s_reserved--; buf_dblwr->buf_block_arr[i] = NULL; buf_dblwr->in_use[i] = false; + // HOT DEBUG (dwb) break; } } @@ -1025,6 +1117,12 @@ buf_dblwr_flush_buffered_writes(void) buf_dblwr_check_page_lsn(write_buf + len2); } +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + goto flush; + } +#endif + /* Write out the first block of the doublewrite buffer */ len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE, buf_dblwr->first_free) * UNIV_PAGE_SIZE; @@ -1055,7 +1153,14 @@ buf_dblwr_flush_buffered_writes(void) srv_stats.dblwr_writes.inc(); /* Now flush the doublewrite buffer data to disk */ +#ifdef UNIV_NVDIMM_CACHE + // do not need to flush + if (!srv_use_nvdimm_dwb) { + fil_flush(TRX_SYS_SPACE); + } +#else fil_flush(TRX_SYS_SPACE); +#endif /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer @@ -1126,7 +1231,46 @@ buf_dblwr_add_to_batch( byte* p = buf_dblwr->write_buf + univ_page_size.physical() * buf_dblwr->first_free; +#ifdef UNIV_NVDIMM_CACHE + + if (srv_use_nvdimm_dwb) { + + if (bpage->size.is_compressed()) { + UNIV_MEM_ASSERT_RW(bpage->zip.data, bpage->size.physical()); + /* Copy the compressed page and clear the rest. */ + + memcpy(p, bpage->zip.data, bpage->size.physical()); + memset(p + bpage->size.physical(), 0x0, + univ_page_size.physical() - bpage->size.physical()); + flush_cache(p, univ_page_size.physical() - bpage->size.physical()); + } else { + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + + UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame, + bpage->size.logical()); + + memcpy(p, ((buf_block_t*) bpage)->frame, bpage->size.logical()); + flush_cache(p, bpage->size.logical()); + } + + } else { + // original + if (bpage->size.is_compressed()) { + UNIV_MEM_ASSERT_RW(bpage->zip.data, bpage->size.physical()); + /* Copy the compressed page and clear the rest. */ + memcpy(p, bpage->zip.data, bpage->size.physical()); + memset(p + bpage->size.physical(), 0x0, + univ_page_size.physical() - bpage->size.physical()); + } else { + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame, + bpage->size.logical()); + + memcpy(p, ((buf_block_t*) bpage)->frame, bpage->size.logical()); + } + } +#else if (bpage->size.is_compressed()) { UNIV_MEM_ASSERT_RW(bpage->zip.data, bpage->size.physical()); /* Copy the compressed page and clear the rest. */ @@ -1143,12 +1287,15 @@ buf_dblwr_add_to_batch( memcpy(p, ((buf_block_t*) bpage)->frame, bpage->size.logical()); } +#endif buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage; buf_dblwr->first_free++; buf_dblwr->b_reserved++; + // HOT DEBUG (dwb) + ut_ad(!buf_dblwr->batch_running); ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved); ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size); @@ -1233,6 +1380,8 @@ buf_dblwr_write_single_page( buf_dblwr->s_reserved++; buf_dblwr->buf_block_arr[i] = bpage; + // HOT DEBUG (dwb) + /* increment the doublewrite flushed pages counter */ srv_stats.dblwr_pages_written.inc(); srv_stats.dblwr_writes.inc(); @@ -1259,6 +1408,58 @@ buf_dblwr_write_single_page( write it. This is so because we want to pad the remaining bytes in the doublewrite page with zeros. */ +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + + if (bpage->size.is_compressed()) { + memcpy(buf_dblwr->write_buf + univ_page_size.physical() * i, + bpage->zip.data, bpage->size.physical()); + + memset(buf_dblwr->write_buf + univ_page_size.physical() * i + + bpage->size.physical(), 0x0, + univ_page_size.physical() - bpage->size.physical()); + + flush_cache(buf_dblwr->write_buf, bpage->size.physical()); + } else { + // (jhpark): we do not need fil_io + memcpy(buf_dblwr->write_buf + univ_page_size.physical() * offset, + (void*) ((buf_block_t*) bpage)->frame, + bpage->size.physical()); + } + } else { + // original + + if (bpage->size.is_compressed()) { + memcpy(buf_dblwr->write_buf + univ_page_size.physical() * i, + bpage->zip.data, bpage->size.physical()); + + memset(buf_dblwr->write_buf + univ_page_size.physical() * i + + bpage->size.physical(), 0x0, + univ_page_size.physical() - bpage->size.physical()); + + fil_io(IORequestWrite, true, + page_id_t(TRX_SYS_SPACE, offset), univ_page_size, 0, + univ_page_size.physical(), + (void*) (buf_dblwr->write_buf + + univ_page_size.physical() * i), + NULL); + } else { + + /* It is a regular page. Write it directly to the + doublewrite buffer */ + + fil_io(IORequestWrite, true, + page_id_t(TRX_SYS_SPACE, offset), univ_page_size, 0, + univ_page_size.physical(), + (void*) ((buf_block_t*) bpage)->frame, + NULL); + } + + /* Now flush the doublewrite buffer data to disk */ + fil_flush(TRX_SYS_SPACE); + } + +#else if (bpage->size.is_compressed()) { memcpy(buf_dblwr->write_buf + univ_page_size.physical() * i, bpage->zip.data, bpage->size.physical()); @@ -1285,6 +1486,7 @@ buf_dblwr_write_single_page( /* Now flush the doublewrite buffer data to disk */ fil_flush(TRX_SYS_SPACE); +#endif /* We know that the write has been flushed to disk now and during recovery we will find it in the doublewrite buffer diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index 03025dc3..6efb6159 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -730,6 +730,16 @@ buf_load() buf_load_status(STATUS_INFO, "Buffer pool(s) load completed at %s", now); + + // RECOVERY + /* +#ifdef UNIV_NVDIMM_CACHE + end_time = getticks(); + recovery_time = (unsigned)((end_time-start_time)/CPU_MHZ); + fprintf(stderr, "[INFO] !!! RECOVERY TIME !!! : %u msec\n", recovery_time); +#endif + */ + /* Make sure that estimated = completed when we end. */ mysql_stage_set_work_completed(pfs_stage_progress, dump_n); /* End the stage progress event. */ diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 7e205a12..714f7221 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1101,12 +1101,12 @@ buf_flush_write_block_low( if (nvdimm_page == NULL) goto normal; - /* + /* ib::info() << "page_id = " << bpage->id.space() << " offset = " << bpage->id.page_no() << " dst = " << &(((buf_block_t *)nvdimm_page)->frame) << " src = " << &(((buf_block_t *)bpage)->frame) << " flush-type = " << bpage->flush_type; - */ + */ memcpy(((buf_block_t *)nvdimm_page)->frame, ((buf_block_t *)bpage)->frame, UNIV_PAGE_SIZE); @@ -1119,6 +1119,8 @@ buf_flush_write_block_low( , bpage->oldest_modification , bpage->newest_modification , nvdimm_page->flush_observer); + + pmem_copy_page(((buf_block_t *)bpage)->frame); // ib::info() << "oldest_modification: " // << nvdimm_page->oldest_modification @@ -1130,11 +1132,13 @@ buf_flush_write_block_low( buf_page_io_complete(bpage, true); buf_page_io_complete(nvdimm_page); - /* + /* buf_pool_t* buf_pool = buf_pool_from_bpage(nvdimm_page); ib::info() << nvdimm_page->id.space() << " " << nvdimm_page->id.page_no() << " is moved to " - << nvdimm_page->buf_pool_index << " from " << bpage->buf_pool_index; + << nvdimm_page->buf_pool_index << " from " << bpage->buf_pool_index + << " oldest_modification: " << nvdimm_page->oldest_modification + << " newest_modification: " << nvdimm_page->newest_modification; */ } else { @@ -1165,6 +1169,7 @@ buf_flush_write_block_low( IORequest request(type); + /* lsn_t lsn_gap = bpage->newest_modification - bpage->oldest_modification; if (bpage->cached_in_nvdimm) { ib::info() << bpage->id.space() << " " << bpage->id.page_no() @@ -1175,9 +1180,8 @@ buf_flush_write_block_low( << " with oldest: " << bpage->oldest_modification << " newest: " << bpage->newest_modification << " lsn-gap: " << lsn_gap; - - pmem_lsn = bpage->oldest_modification; } + */ fil_io(request, sync, bpage->id, bpage->size, 0, bpage->size.physical(), diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index a3cdf4a5..fc8ddff4 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -907,7 +907,8 @@ fil_node_open_file( } /* mijin */ - fprintf(stderr, "%s = %lu\n", node->name, space->id); + // HOT DEBUG 2 + //fprintf(stderr, "%s = %lu\n", node->name, space->id); return(true); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 1a744c4c..a3d10dfb 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -20316,6 +20316,10 @@ static MYSQL_SYSVAR_STR(nvdimm_home_dir, srv_nvdimm_home_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Path to NVDIMM-aware files.", NULL, NULL, NULL); +static MYSQL_SYSVAR_BOOL(use_nvdimm_dwb, srv_use_nvdimm_dwb, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable NVDIMM DWB (disabled by default).", + NULL, NULL, FALSE); #endif /* UNIV_NVDIMM_CACHE */ static struct st_mysql_sys_var* innobase_system_variables[]= { @@ -20496,6 +20500,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(nvdimm_buffer_pool_instances), MYSQL_SYSVAR(nvdimm_pc_threshold_pct), MYSQL_SYSVAR(nvdimm_home_dir), + MYSQL_SYSVAR(use_nvdimm_dwb), #endif /* UNIV_NVDIMM_CACHE */ NULL }; diff --git a/storage/innobase/include/pmem_mmap_obj.h b/storage/innobase/include/pmem_mmap_obj.h index a2ed7669..2a539dc1 100644 --- a/storage/innobase/include/pmem_mmap_obj.h +++ b/storage/innobase/include/pmem_mmap_obj.h @@ -13,6 +13,10 @@ #include #include #include + +#include +#include + //#include "ut0new.h" //#include "log0log.h" @@ -203,6 +207,7 @@ void pm_mmap_recv_flush_buffer(); // add extern std::map , std::vector > pmem_nc_buffer_map; +extern std::map , std::vector > pmem_nc_page_map; uint64_t pm_mmap_recv_check_nc_buf(uint64_t space, uint64_t page_no); void nc_recv_analysis(); @@ -211,7 +216,10 @@ extern bool is_pmem_recv; extern uint64_t pmem_recv_offset; extern uint64_t pmem_recv_size; extern uint64_t pmem_lsn; +extern uint64_t pmem_page_offset; void nc_save_pmem_lsn(); +void pmem_copy_page(unsigned char* frame); +uint64_t pm_mmap_recv_check_nc_page(uint64_t space, uint64_t page_no); /** Recovery system data structure */ //struct recv_sys_t{ @@ -228,4 +236,22 @@ void nc_save_pmem_lsn(); // ulint len; /*!< amount of data in buf */ //}; +// time measurement +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned a, d; + asm("cpuid"); + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + + return (((ticks)a) | (((ticks)d) << 32)); +} + +extern ticks start_time; +extern ticks end_time; +extern unsigned recovery_time; +#define CPU_MHZ 1199703 + + #endif /* __PMEMMAPOBJ_H__ */ diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index d72c1593..a7c3f89d 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -320,6 +320,8 @@ extern ulong srv_nvdimm_buf_pool_instances; extern ulong srv_nvdimm_pc_threshold_pct; /** NVDIMM-aware file resident directory */ extern char* srv_nvdimm_home_dir; +/** NVDIMM DWB enable */ +extern my_bool srv_use_nvdimm_dwb; #endif /* UNIV_NVDIMM_CACHE */ /** Requested size in bytes */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 69733d18..a0517c16 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -1815,8 +1815,13 @@ log_checkpoint( ut_ad(!recv_no_log_write); oldest_lsn = log_buf_pool_get_oldest_modification(); - // HOT DEBUG - /* + // HOT DEBUG 2 // +#ifdef UNIV_NVDIMM_CACHE + if (recovery_time == 0) { + end_time = getticks(); + recovery_time = (unsigned)((end_time-start_time)/CPU_MHZ); + fprintf(stderr, "[INFO] !!! RECOVERY TIME !!! : %u msec\n", recovery_time); + } lsn_t nvdimm_lsn = nvdimm_buf_pool_get_oldest_modification(); if (nvdimm_lsn !=0 && nvdimm_lsn < oldest_lsn) { @@ -1825,7 +1830,7 @@ log_checkpoint( << " the gap: " << oldest_lsn - nvdimm_lsn; oldest_lsn = nvdimm_lsn; } - */ +#endif /* Because log also contains headers and dummy log records, diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 26c76800..30495a36 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2453,14 +2453,17 @@ recv_recover_page_func( nc_corrupt_flag = true; } - fprintf(stderr, "[DEBUG] offset: %lu current page is NC page and LSN : %lu disk lsn: %lu page_lsn: %lu corupted? %d recv_start_lsn :%lu\n" - , cur_nc_buf_offset, cur_nc_page_lsn, cur_disk_page_lsn, page_lsn, nc_corrupt_flag - , recv->start_lsn); +#ifdef UNIV_DEBUG + ib::info << "(recovery) offset: " + << cur_nc_buf_offset + << " current page is NC page and LSN: " + << cur_nc_page_lsn + << " disk lsn : " << cur_disk_page_lsn + << " corrupted? : " << nc_corrupt_flag; +#endif // recover from NC buffer if (!nc_corrupt_flag || cur_disk_page_lsn == 0) { - // check - fprintf(stderr, "[DEBUG] we skip this page: %lu:%lu\n", block->page.id.space(), block->page.id.page_no()); memcpy(block->frame, nc_frame, UNIV_PAGE_SIZE); page_lsn = cur_nc_page_lsn; end_lsn = recv->start_lsn + recv->len; @@ -2469,6 +2472,20 @@ recv_recover_page_func( - FIL_PAGE_END_LSN_OLD_CHKSUM + (block->frame), end_lsn); goto skip_redo; + } else { + uint64_t cur_nc_page_offset = pm_mmap_recv_check_nc_buf( + block->page.id.space(), block->page.id.page_no()); + if (cur_nc_page_offset != -1) { + + memcpy(block->frame, nc_frame, UNIV_PAGE_SIZE); + page_lsn = cur_nc_page_lsn; + end_lsn = recv->start_lsn + recv->len; + mach_write_to_8(FIL_PAGE_LSN + (block->frame), end_lsn); + mach_write_to_8(UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + + (block->frame), end_lsn); + goto skip_redo; + } } } // end-of-if @@ -3780,7 +3797,7 @@ recv_scan_log_recs( = log_block_get_checkpoint_no(log_block); } - if (data_len < OS_FILE_LOG_BLOCK_SIZE) { + if (data_len < OS_FILE_LOG_BLOCK_SIZE){ /* Log data for this group ends here */ finished = true; break; @@ -4134,7 +4151,7 @@ recv_recovery_from_checkpoint_start( #ifdef UNIV_NVDIMM_CACHE ib::info() << "Reocvery start from this checkpoint_lsn: " << checkpoint_lsn; // HOT DEBUG - /* + /* extern unsigned char* gb_pm_mmap; if (is_pmem_recv) { uint64_t cur_pmem_lsn = 0; @@ -4142,7 +4159,7 @@ recv_recovery_from_checkpoint_start( checkpoint_lsn = cur_pmem_lsn; ib::info() << "Reocvery start from this checkpoint_lsn (recv): " << checkpoint_lsn; } - */ + */ #endif /* Read the first log file header to print a note if this is @@ -4203,13 +4220,13 @@ recv_recovery_from_checkpoint_start( // HOT DEBUG #ifdef UNIV_NVDIMM_CACHE if (is_pmem_recv) { - /* +/* extern unsigned char* gb_pm_mmap; uint64_t cur_pmem_lsn = 0; memcpy(&cur_pmem_lsn, gb_pm_mmap+6*1024*1024*1024UL ,sizeof(uint64_t)); contiguous_lsn = cur_pmem_lsn; ib::info() << "log chopping we use this lsn for congiguous_lns : " << contiguous_lsn; - */ +*/ contiguous_lsn = checkpoint_lsn; } else { contiguous_lsn = checkpoint_lsn; @@ -4242,6 +4259,9 @@ recv_recovery_from_checkpoint_start( } if (recv_sys->mlog_checkpoint_lsn == 0) { +#ifdef UNIV_NVDIMM_CACHE + goto skip_2; +#endif if (!srv_read_only_mode && group->scanned_lsn != checkpoint_lsn) { ib::error() << "Ignoring the redo log due to missing" diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index fddcb269..350c67d8 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -675,7 +675,7 @@ mtr_t::commit_checkpoint( // HOT DEBUG // (jhpark): we need to leave the current NC pages - ib::info() << "checkpoint_lsn: " << checkpoint_lsn; + //ib::info() << "checkpoint_lsn: " << checkpoint_lsn; mach_write_to_8(ptr + 1, checkpoint_lsn); } diff --git a/storage/innobase/pmem/pmem0mmap.cc b/storage/innobase/pmem/pmem0mmap.cc index a77a4c35..b69a9ffb 100644 --- a/storage/innobase/pmem/pmem0mmap.cc +++ b/storage/innobase/pmem/pmem0mmap.cc @@ -20,15 +20,21 @@ int gb_pm_mmap_fd; PMEM_MMAP_MTRLOG_BUF* mmap_mtrlogbuf = NULL; // HOT DEBUG uint64_t pmem_lsn; +uint64_t pmem_page_offset; // recovery bool is_pmem_recv = false; uint64_t pmem_recv_offset = 0; uint64_t pmem_recv_size = 0; std::map ,std::vector > pmem_nc_buffer_map; +std::map , std::vector > pmem_nc_page_map; unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { - + + if (srv_use_nvdimm_dwb) { + ib::info() << "INODB DWB ON!"; + } + if (access(path, F_OK) != 0) { gb_pm_mmap_fd = open(path, O_RDWR | O_CREAT, 0777); if (gb_pm_mmap_fd < 0) { @@ -355,3 +361,12 @@ ssize_t pm_mmap_mtrlogbuf_write( return ret; } + +// HOT DEBUG // +void pmem_copy_page(unsigned char* frame) { + // key = page_id + // value = page frame + memcpy(gb_pm_mmap + 10*1024*1024*1024UL + pmem_page_offset, frame, UNIV_PAGE_SIZE); + pmem_page_offset += UNIV_PAGE_SIZE; + flush_cache(gb_pm_mmap + 10*1024*1024*1024UL + pmem_page_offset, UNIV_PAGE_SIZE); +} diff --git a/storage/innobase/pmem/pmem0recv.cc b/storage/innobase/pmem/pmem0recv.cc index f6654a29..1a2301e0 100644 --- a/storage/innobase/pmem/pmem0recv.cc +++ b/storage/innobase/pmem/pmem0recv.cc @@ -196,10 +196,32 @@ uint64_t pm_mmap_recv_check_nc_buf(uint64_t space, uint64_t page_no) { unsigned char *nc_frame = reinterpret_cast ((gb_pm_mmap + (1*1024*1024*1024UL) + nc_offset))->frame; - fprintf(stderr, "[DEBUG] NC BUF (%lu:%lu) offset: %lu page_lsn: %lu i: %lu vec:size: %d\n", - space, page_no, nc_offset - , mach_read_from_8(nc_frame + FIL_PAGE_LSN) - , i, nc_offset_vec.size()); + if (space != mach_read_from_4(nc_frame + FIL_PAGE_SPACE_ID) + || page_no != mach_read_from_4(nc_frame + FIL_PAGE_OFFSET)) { + fprintf(stderr, "[DEBUG] wrong buffer page info! %u:%u\n", space, page_no); + } + } + return nc_offset; + } else { + return -1; + } +} + +uint64_t pm_mmap_recv_check_nc_page(uint64_t space, uint64_t page_no) { + std::map, std::vector >::iterator ncbuf_iter; + ncbuf_iter = pmem_nc_page_map.find(std::make_pair(space,page_no)); + if (ncbuf_iter != pmem_nc_buffer_map.end()) { + std::vector nc_offset_vec = (*ncbuf_iter).second; + uint64_t nc_offset; + for (uint64_t i=0; i( (8*147324928UL)/4096); - fprintf(stderr, "[DEBUG] NVDIMM Caching page analysis begin! total pages v2: %lu\n", page_num_chunks); - for (uint64_t i=0; i < page_num_chunks; ++i) { //for (uint64_t i=0; i < srv_nvdimm_buf_pool_size; i+= UNIV_PAGE_SIZE) { @@ -232,7 +254,6 @@ void nc_recv_analysis() { //unsigned char *frame = (unsigned char*)(addr+ i); if (space != 27 && space != 29 && space != 31) { - fprintf(stderr, "[DEBUG] we miss the pages %lu:%lu\n", space, page_no); if (space == 4294967295 && page_no == 4294967295) { continue; @@ -240,17 +261,21 @@ void nc_recv_analysis() { break; } } else { - fprintf(stderr, "[DEBUG] we get this page %lu:%lu\n", space, page_no); - } - - // check - if (space != mach_read_from_4(frame + FIL_PAGE_SPACE_ID) +#ifdef UNIV_DEBUG + ib::info() << "obtaine NC page: " << space << ":" << page_no; + // check + if (space != mach_read_from_4(frame + FIL_PAGE_SPACE_ID) || page_no != mach_read_from_4(frame + FIL_PAGE_OFFSET)) { - fprintf(stderr, "[DEBUG] wrong frame info!\n (%lu:%lu) (%lu:%lu)", space, page_no - , mach_read_from_4(frame + FIL_PAGE_SPACE_ID) - , mach_read_from_4(frame + FIL_PAGE_OFFSET)); + ib::info() << " wrong NC page frame info expected: " + << space << ":" << page_no + << " current value: " << mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + << ":" << mach_read_from_4(frame + FIL_PAGE_OFFSET); + } +#endif } + + #ifdef PMEM_RECV_DEBUG fil_space_t* space_t = fil_space_get(space); const page_id_t page_id(space,page_no); @@ -265,8 +290,44 @@ void nc_recv_analysis() { // we store relative position of nc page pmem_nc_buffer_map[std::make_pair(space,page_no)].push_back(i*sizeof(buf_block_t)); + } + + // nc_page_map + unsigned char *page_addr = gb_pm_mmap + (10*1024*1024*1024UL); + for (uint64_t i=0; i < page_num_chunks; ++i) { + space = mach_read_from_4( + page_addr + (i*4096UL) + FIL_PAGE_SPACE_ID); + page_no = mach_read_from_4( + page_addr + (i*4096UL) + FIL_PAGE_OFFSET); + + if (space != 27 && space != 29 && space != 31) { + fprintf(stderr, "[DEBUG] we miss the pages %lu:%lu\n", space, page_no); + if (space == 4294967295 + && page_no == 4294967295) { + continue; + } else { + break; + } + } else { + +#ifdef UNIV_DEBUG + ib::info() << "obtaine NC page in buffer: " << space << ":" << page_no; + // check + if (space != mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + || page_no != mach_read_from_4(frame + FIL_PAGE_OFFSET)) { + ib::info() << " wrong NC page frame info expected: " + << space << ":" << page_no + << " current value: " << mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + << ":" << mach_read_from_4(frame + FIL_PAGE_OFFSET); + } +#endif + + } + pmem_nc_page_map[std::make_pair(space,page_no)].push_back(i*4096UL); } + + // } void nc_save_pmem_lsn() { diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 542d7907..20e180c4 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -253,6 +253,8 @@ ulong srv_nvdimm_buf_pool_instances = 1; ulong srv_nvdimm_pc_threshold_pct = 2; /** NVDIMM-aware file resident directory */ char* srv_nvdimm_home_dir = NULL; +/** NVDIMM DWB enable */ +my_bool srv_use_nvdimm_dwb = FALSE; #endif /* UNIV_NVDIMM_CACHE */ /** Requested size in bytes */ diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 0d6ff80c..7a8d8f68 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -109,6 +109,10 @@ Created 2/16/1996 Heikki Tuuri #include "pmem_mmap_obj.h" extern unsigned char* gb_pm_mmap; char PMEM_FILE_PATH [PMEM_MMAP_MAX_FILE_NAME_LENGTH]; +pfs_os_file_t gb_pm_dwb_file; +ticks start_time=0; +ticks end_time=0; +unsigned recovery_time=0; #endif /* UNIV_NVDIMM_CACHE */ #ifdef HAVE_LZO1X @@ -1473,6 +1477,11 @@ innobase_start_or_create_for_mysql(void) size_t dirnamelen; unsigned i = 0; + // HOT_DEBUG RECOVERY +#ifdef UNIV_NVDIMM_CACHE + start_time = getticks(); +#endif + /* Reset the start state. */ srv_start_state = SRV_START_STATE_NONE; @@ -1493,7 +1502,7 @@ innobase_start_or_create_for_mysql(void) #ifdef UNIV_NVDIMM_CACHE sprintf(PMEM_FILE_PATH, "%s/%s", srv_nvdimm_home_dir, NVDIMM_MMAP_FILE_NAME); - size_t srv_pmem_pool_size = 8 * 1024; + size_t srv_pmem_pool_size = 12 * 1024; uint64_t pool_size = srv_pmem_pool_size * 1024 * 1024UL; gb_pm_mmap = pm_mmap_create(PMEM_FILE_PATH, pool_size); if (!gb_pm_mmap) { @@ -2302,11 +2311,15 @@ innobase_start_or_create_for_mysql(void) #ifdef UNIV_NVDIMM_CACHE if (is_pmem_recv) { nc_recv_analysis(); - } else { + } + // (jhpark): ignore now + /* + else { // HOT DEBUG pmem_lsn = flushed_lsn; nc_save_pmem_lsn(); } + */ #endif From 094a92670a0e24734fa99d25f940e60669720fc5 Mon Sep 17 00:00:00 2001 From: JonghyeokPark Date: Mon, 16 May 2022 02:07:34 +0900 Subject: [PATCH 5/8] recovery time measurement --- my.cnf | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/my.cnf b/my.cnf index cbaf48fe..5f4ce190 100644 --- a/my.cnf +++ b/my.cnf @@ -67,7 +67,7 @@ innodb_nvdimm_pc_threshold_pct=15 innodb_nvdimm_home_dir=/mnt/pmemdir #transaction log settings -#innodb_log_file_size=500M +innodb_log_file_size=500M innodb_log_files_in_group=3 # 0:every 1 seconds, 1:fsync on commits, 2:writes on commits @@ -79,6 +79,9 @@ innodb_flush_neighbors=0 #doublewrite and flush method innodb_doublewrite=ON innodb_flush_method=O_DIRECT +# dwb NVM version +innodb_use_nvdimm_dwb=true + #AIO control innodb_use_native_aio=true From 0c0e7302ad8da00acf10a215166c68516b9f519a Mon Sep 17 00:00:00 2001 From: JonghyeokPark Date: Mon, 16 May 2022 16:43:32 +0900 Subject: [PATCH 6/8] dwb+recovery vresion (mop code) --- my.cnf | 2 +- storage/innobase/next_checkpoint_lsn) | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 storage/innobase/next_checkpoint_lsn) diff --git a/my.cnf b/my.cnf index 5f4ce190..385c3de9 100644 --- a/my.cnf +++ b/my.cnf @@ -80,7 +80,7 @@ innodb_flush_neighbors=0 innodb_doublewrite=ON innodb_flush_method=O_DIRECT # dwb NVM version -innodb_use_nvdimm_dwb=true +innodb_use_nvdimm_dwb=false #AIO control diff --git a/storage/innobase/next_checkpoint_lsn) b/storage/innobase/next_checkpoint_lsn) deleted file mode 100644 index 7a6d558d..00000000 --- a/storage/innobase/next_checkpoint_lsn) +++ /dev/null @@ -1,7 +0,0 @@ -log/log0log.cc 1567 log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn; -log/log0log.cc 1619 log_sys->next_checkpoint_lsn, -log/log0log.cc 1626 mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn); -log/log0log.cc 1628 lsn_offset = log_group_calc_lsn_offset(log_sys->next_checkpoint_lsn, -log/log0log.cc 1899 log_sys->next_checkpoint_lsn = oldest_lsn; -log/log0recv.cc 1271 log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn -log/log0recv.cc 4320 log_sys->next_checkpoint_lsn = checkpoint_lsn; From be207cea6f4f24fc4e0a6e3eebe1707eb1f3a751 Mon Sep 17 00:00:00 2001 From: JonghyeokPark Date: Mon, 16 May 2022 16:49:14 +0900 Subject: [PATCH 7/8] modified README file --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index add26841..925b0df8 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ $ ./build.sh --origin | innodb_nvdimm_buffer_pool_instances | The number of regions that the NVDIMM cache is divided into. The default value is 1. | | innodb_nvdimm_pc_threshold_pct | Wakeup the NVDIMM page cleaner when this % of free pages remaining. The default value is 5. | | innodb_nvdimm_home_dir | NVDIMM-aware files resident directory | +| innodb_use_nvdimm_dwb | Specifies whether to use NVDIMM dwb. **true** or **false**. | For example: From 24a9da837530cd93f93542361427bf1def735d94 Mon Sep 17 00:00:00 2001 From: JonghyeokPark Date: Mon, 16 May 2022 20:15:16 +0900 Subject: [PATCH 8/8] shadowing page technique version --- storage/innobase/buf/buf0flu.cc | 6 +++- storage/innobase/include/pmem_mmap_obj.h | 22 +++++--------- storage/innobase/pmem/pmem0mmap.cc | 37 +++++++++++++++++++++--- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 714f7221..247ac2bc 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1120,7 +1120,7 @@ buf_flush_write_block_low( , bpage->newest_modification , nvdimm_page->flush_observer); - pmem_copy_page(((buf_block_t *)bpage)->frame); + pmem_copy_page(((buf_block_t *)bpage)->frame, bpage->id.space(), bpage->id.page_no()); // ib::info() << "oldest_modification: " // << nvdimm_page->oldest_modification @@ -1169,6 +1169,10 @@ buf_flush_write_block_low( IORequest request(type); + if (bpage->cached_in_nvdimm) { + pmem_evict_page(bpage->id.space(), bpage->id.page_no()); + } + /* lsn_t lsn_gap = bpage->newest_modification - bpage->oldest_modification; if (bpage->cached_in_nvdimm) { diff --git a/storage/innobase/include/pmem_mmap_obj.h b/storage/innobase/include/pmem_mmap_obj.h index 2a539dc1..3a86536c 100644 --- a/storage/innobase/include/pmem_mmap_obj.h +++ b/storage/innobase/include/pmem_mmap_obj.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -218,23 +219,14 @@ extern uint64_t pmem_recv_size; extern uint64_t pmem_lsn; extern uint64_t pmem_page_offset; void nc_save_pmem_lsn(); -void pmem_copy_page(unsigned char* frame); +void pmem_copy_page(unsigned char* frame, uint64_t space, uint64_t page_no); +void pmem_evict_page(uint64_t space, uint64_t page_no); uint64_t pm_mmap_recv_check_nc_page(uint64_t space, uint64_t page_no); -/** Recovery system data structure */ -//struct recv_sys_t{ -// ib_mutex_t mutex; - /*!< mutex protecting the fields apply_log_recs, - n_addrs, and the state field in each recv_addr struct */ -// ib_mutex_t writer_mutex; - /*!< mutex coordinating - flushing between recv_writer_thread and the recovery thread. */ -// ibool apply_log_recs; - /*!< this is TRUE when log rec application to pages is allowed; this flag tells the - i/o-handler if it should do log record application */ -// byte* buf; /*!< buffer for parsing log records */ -// ulint len; /*!< amount of data in buf */ -//}; +// page map offset list +// (key : page_id (space, page_no & value : offset in page offset +extern std::map , uint64_t > pmem_nc_page_offset_map; +extern std::queue pmem_nc_page_offset_list; // time measurement typedef unsigned long long ticks; diff --git a/storage/innobase/pmem/pmem0mmap.cc b/storage/innobase/pmem/pmem0mmap.cc index b69a9ffb..49aeec74 100644 --- a/storage/innobase/pmem/pmem0mmap.cc +++ b/storage/innobase/pmem/pmem0mmap.cc @@ -28,12 +28,23 @@ uint64_t pmem_recv_offset = 0; uint64_t pmem_recv_size = 0; std::map ,std::vector > pmem_nc_buffer_map; std::map , std::vector > pmem_nc_page_map; +std::map , uint64_t > pmem_nc_page_offset_map; +std::queue pmem_nc_page_offset_list; unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { +#ifdef UNIV_NVDIMM_CACHE if (srv_use_nvdimm_dwb) { ib::info() << "INODB DWB ON!"; } +#endif + + // (jhpark): initialize the pmem_page_offset_list here + // TODO(xxx): use server variable + for (int i=0; i< (1024*1024*1024/4096); i++) { + pmem_nc_page_offset_list.push(i*4096); + } + // if (access(path, F_OK) != 0) { gb_pm_mmap_fd = open(path, O_RDWR | O_CREAT, 0777); @@ -363,10 +374,28 @@ ssize_t pm_mmap_mtrlogbuf_write( // HOT DEBUG // -void pmem_copy_page(unsigned char* frame) { - // key = page_id - // value = page frame +void pmem_copy_page(unsigned char* frame, uint64_t space, uint64_t page_no) { + pthread_mutex_lock(&mmap_mtrlogbuf->mtrMutex); + uint64_t pmem_page_offset = pmem_nc_page_offset_list.front(); + pmem_nc_page_offset_map[std::make_pair(space, page_no)] = pmem_page_offset; + pmem_nc_page_offset_list.pop(); + pthread_mutex_unlock(&mmap_mtrlogbuf->mtrMutex); + memcpy(gb_pm_mmap + 10*1024*1024*1024UL + pmem_page_offset, frame, UNIV_PAGE_SIZE); - pmem_page_offset += UNIV_PAGE_SIZE; flush_cache(gb_pm_mmap + 10*1024*1024*1024UL + pmem_page_offset, UNIV_PAGE_SIZE); } + +void pmem_evict_page(uint64_t space, uint64_t page_no) { + + uint64_t offset = -1; + std::map,uint64_t>::iterator iter; + pthread_mutex_lock(&mmap_mtrlogbuf->mtrMutex); + iter = pmem_nc_page_offset_map.find(std::make_pair(space, page_no)); + if (iter != pmem_nc_page_offset_map.end()) { + offset = iter->second; + } else { + ib::error() << "error : " << space << ":" << page_no; + } + pmem_nc_page_offset_list.push(offset); + pthread_mutex_unlock(&mmap_mtrlogbuf->mtrMutex); +}