diff --git a/README.md b/README.md index add26841..925b0df8 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ $ ./build.sh --origin | innodb_nvdimm_buffer_pool_instances | The number of regions that the NVDIMM cache is divided into. The default value is 1. | | innodb_nvdimm_pc_threshold_pct | Wakeup the NVDIMM page cleaner when this % of free pages remaining. The default value is 5. | | innodb_nvdimm_home_dir | NVDIMM-aware files resident directory | +| innodb_use_nvdimm_dwb | Specifies whether to use NVDIMM dwb. **true** or **false**. | For example: diff --git a/build.sh b/build.sh index 51d75662..3f1579b5 100755 --- a/build.sh +++ b/build.sh @@ -29,15 +29,15 @@ elif [ "$1" = "--nc-monitor" ]; then # Cache hot LB pages with mtr-logging/monitoring enabled BUILD_FLAGS="-DUNIV_NVDIMM_CACHE -DUNIV_LOG_HEADER -DUNIV_FLUSH_MONITOR" else - # Cache hot LB pages (default) - BUILD_FLAGS="-DUNIV_NVDIMM_CACHE -DUNIV_LOG_HEADER" + # Cache NVDIMM pages in TPC-C workloads + BUILD_FLAGS="-DUNIV_NVDIMM_CACHE" fi echo "Start build using $BUILD_FLAGS" -cd $BASE_DIR +cd $BUILD_DIR -cmake -DWITH_DEBUG=0 -DCMAKE_C_FLAGS="$BUILD_FLAGS" -DCMAKE_CXX_FLAGS="$BUILD_FLAGS" \ +cmake .. -DWITH_DEBUG=0 -DCMAKE_C_FLAGS="$BUILD_FLAGS" -DCMAKE_CXX_FLAGS="$BUILD_FLAGS" \ -DDOWNLOAD_BOOST=ON -DWITH_BOOST=$BASE_DIR/boost -DENABLED_LOCAL_INFILE=1 \ -DCMAKE_INSTALL_PREFIX=$BUILD_DIR diff --git a/my.cnf b/my.cnf index 060d33fb..385c3de9 100644 --- a/my.cnf +++ b/my.cnf @@ -14,6 +14,7 @@ prompt=\u:\d>\_ # This was formally known as [safe_mysqld]. Both versions are currently parsed. [mysqld_safe] +user = root socket = /tmp/mysql.sock #nice = 0 @@ -21,19 +22,23 @@ socket = /tmp/mysql.sock # # * Basic Settings # +user = root default-storage-engine = innodb skip-grant-tables -pid-file = /home/mijin/test_data/mysql.pid +pid-file = /home/vldb/test_data/mysql.pid socket = /tmp/mysql.sock port = 3306 -datadir = /home/mijin/test_data/ -log-error = /home/mijin/test_data/mysql_error_nvdimm.log +datadir = /home/vldb/test_data/ +log-error = /home/vldb/test_data/mysql_error_nvdimm.log ################################################## # Need to Modify ################################################## #Log group path (iblog0, iblog1) -innodb_log_group_home_dir=/home/mijin/test_log/ +innodb_log_group_home_dir=/home/vldb/test_log/ +#innodb_log_group_home_dir=/mnt/pmemdir/test_log/ +#innodb_undo_directory=/mnt/pmemdir/ + #innodb page size innodb_page_size=4KB @@ -59,21 +64,24 @@ innodb_use_nvdimm_buffer=true innodb_nvdimm_buffer_pool_size=1G innodb_nvdimm_buffer_pool_instances=1 innodb_nvdimm_pc_threshold_pct=15 -innodb_nvdimm_home_dir=/mnt/ramdisk -#innodb_nvdimm_home_dir=/mnt/pmem +innodb_nvdimm_home_dir=/mnt/pmemdir #transaction log settings -innodb_log_file_size=2G +innodb_log_file_size=500M innodb_log_files_in_group=3 # 0:every 1 seconds, 1:fsync on commits, 2:writes on commits -innodb_flush_log_at_trx_commit=0 +innodb_flush_log_at_trx_commit=1 innodb_log_buffer_size=32M innodb_flush_neighbors=0 +#innodb_force_recovery = 1 #doublewrite and flush method innodb_doublewrite=ON innodb_flush_method=O_DIRECT +# dwb NVM version +innodb_use_nvdimm_dwb=false + #AIO control innodb_use_native_aio=true @@ -95,4 +103,3 @@ open_files_limit = 24000 #performance-schema-instrument='wait/synch/rwlock/innodb/%=ON' #innodb_status_output = ON #innodb_status_output_locks = ON - diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index ffffc2f6..f1de8c42 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -3924,23 +3924,19 @@ btr_cur_update_in_place( } #ifdef UNIV_NVDIMM_CACHE + // (jhpark): add REDO log for NC pages + btr_cur_update_in_place_log(flags, rec, index, update, + trx_id, roll_ptr, mtr); + /* nvm_block = btr_cur_get_block(cursor); nvm_bpage = &(nvm_block->page); - if (nvm_bpage->cached_in_nvdimm) { - // skip generating REDO logs for NVM-resident pages - // write NC page on NVDIMM - //pm_mmap_buf_write(nvm_bpage->size.physical(), (void*) ((buf_block_t*) nvm_bpage)->frame); - - // persist records - ulint cur_rec_size = rec_offs_size(offsets); - pm_mmap_mtrlogbuf_commit(nvm_block->frame, UNIV_PAGE_SIZE, nvm_bpage->id.space(), nvm_bpage->id.page_no()); - - //pm_mmap_mtrlogbuf_commit(rec, cur_rec_size, nvm_bpage->id.space(), nvm_bpage->id.page_no()); + } else { btr_cur_update_in_place_log(flags, rec, index, update, trx_id, roll_ptr, mtr); } + */ #else btr_cur_update_in_place_log(flags, rec, index, update, trx_id, roll_ptr, mtr); @@ -4937,6 +4933,7 @@ btr_cur_del_mark_set_clust_rec( row_upd_rec_sys_fields(rec, page_zip, index, offsets, trx, roll_ptr); #ifdef UNIV_NVDIMM_CACHE + /* if (is_nvm_page) { // skip generating REDO logs for nvm-page // Instead, write commit log in mtr log @@ -4950,6 +4947,12 @@ btr_cur_del_mark_set_clust_rec( btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id, roll_ptr, mtr); } + */ + + // (jhpark): add REDO log for NC pages + btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id, + roll_ptr, mtr); + #else btr_cur_del_mark_set_clust_rec_log(rec, index, trx->id, roll_ptr, mtr); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 7c24d104..8421d60a 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -406,6 +406,74 @@ buf_pool_register_chunk( chunk->blocks->frame, chunk)); } +// HOT DEBUG // +lsn_t +nvdimm_buf_pool_get_oldest_modification(void) +/*==================================*/ +{ + lsn_t lsn = 0; + lsn_t oldest_lsn = 0; + lsn_t page_lsn = 0; + + /* When we traverse all the flush lists we don't want another + thread to add a dirty page to any flush list. */ + log_flush_order_mutex_enter(); + + for (ulint i = srv_buf_pool_instances; i < srv_buf_pool_instances+1; i++) { + buf_pool_t* buf_pool; + + buf_pool = buf_pool_from_array(i); + + buf_flush_list_mutex_enter(buf_pool); + + buf_page_t* bpage; + + /* We don't let log-checkpoint halt because pages from system + temporary are not yet flushed to the disk. Anyway, object + residing in system temporary doesn't generate REDO logging. */ + for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list); + bpage != NULL + && fsp_is_system_temporary(bpage->id.space()); + bpage = UT_LIST_GET_PREV(list, bpage)) { + /* Do nothing. */ + } + + if (bpage != NULL) { + ut_ad(bpage->in_flush_list); + + // HOT DEBUG // + // check page lsn of current NC pages + /* + buf_block_t *block; + block = (buf_block_t*)bpage; + uint64_t cur_lsn_page = mach_read_from_8(block->frame + FIL_PAGE_LSN); + if (cur_lsn_page !=0 + && cur_lsn_page < pmem_lsn) { + lsn = pmem_lsn; + } else { + lsn = cur_lsn_page; + } + */ + lsn = bpage->oldest_modification; + } + + buf_flush_list_mutex_exit(buf_pool); + + if (!oldest_lsn || oldest_lsn > lsn) { + oldest_lsn = lsn; + } + } + + log_flush_order_mutex_exit(); + + /* The returned answer may be out of date: the flush_list can + change after the mutex has been released. */ + + return(oldest_lsn); +} + + + /********************************************************************//** Gets the smallest oldest_modification lsn for any page in the pool. Returns zero if all modified pages have been flushed to disk. @@ -421,7 +489,7 @@ buf_pool_get_oldest_modification(void) thread to add a dirty page to any flush list. */ log_flush_order_mutex_enter(); - for (ulint i = 0; i < srv_buf_pool_instances; i++) { + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool; buf_pool = buf_pool_from_array(i); @@ -6331,7 +6399,7 @@ buf_page_io_complete( Asserts that all file pages in the buffer are in a replaceable state. @return TRUE */ static -ibool + ibool buf_all_freed_instance( /*===================*/ buf_pool_t* buf_pool) /*!< in: buffer pool instancce */ diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 057f8e82..3d2ca692 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -39,6 +39,11 @@ Created 2011/12/19 #ifndef UNIV_HOTBACKUP +#ifdef UNIV_NVDIMM_CACHE +extern unsigned char* gb_pm_mmap; +extern pfs_os_file_t gb_pm_dwb_file; +#endif + /** The doublewrite buffer */ buf_dblwr_t* buf_dblwr = NULL; @@ -123,6 +128,10 @@ buf_dblwr_init( byte* doublewrite) /*!< in: pointer to the doublewrite buf header on trx sys page */ { +#ifdef UNIV_NVDIMM_CACHE + dberr_t err; + byte* buf; +#endif ulint buf_size; buf_dblwr = static_cast( @@ -153,12 +162,66 @@ buf_dblwr_init( buf_dblwr->in_use = static_cast( ut_zalloc_nokey(buf_size * sizeof(bool))); +#ifdef UNIV_NVDIMM_CACHE + // HOT DEBUG + // TODO(jhpark): recovery + if (srv_use_nvdimm_dwb) { + + buf_dblwr->write_buf_unaligned = static_cast( + gb_pm_mmap + 5*1024*1024*1024UL); + buf_dblwr->write_buf = static_cast( + ut_align(buf_dblwr->write_buf_unaligned,UNIV_PAGE_SIZE)); + + ib::info() << "we configured the DWB on NVM!"; + + buf = buf_dblwr->write_buf; + IORequest read_request(IORequest::READ); + read_request.disable_compression(); + + err = os_file_read( + read_request, + gb_pm_dwb_file, buf, buf_dblwr->block1 * UNIV_PAGE_SIZE, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE); + + if (err != DB_SUCCESS) { + + ib::error() + << "Failed to read the first double write buffer " + "extent"; + return; + } + + err = os_file_read( + read_request, + gb_pm_dwb_file, + buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE, + buf_dblwr->block2 * UNIV_PAGE_SIZE, + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE); + + if (err != DB_SUCCESS) { + + ib::error() + << "Failed to read the second double write buffer " + "extent"; + return; + } + } else { + // original + buf_dblwr->write_buf_unaligned = static_cast( + ut_malloc_nokey((1 + buf_size) * UNIV_PAGE_SIZE)); + + buf_dblwr->write_buf = static_cast( + ut_align(buf_dblwr->write_buf_unaligned, + UNIV_PAGE_SIZE)); + } +#else buf_dblwr->write_buf_unaligned = static_cast( ut_malloc_nokey((1 + buf_size) * UNIV_PAGE_SIZE)); buf_dblwr->write_buf = static_cast( ut_align(buf_dblwr->write_buf_unaligned, UNIV_PAGE_SIZE)); +#endif buf_dblwr->buf_block_arr = static_cast( ut_zalloc_nokey(buf_size * sizeof(void*))); @@ -400,6 +463,12 @@ buf_dblwr_init_or_load_pages( == TRX_SYS_DOUBLEWRITE_MAGIC_N) { /* The doublewrite buffer has been created */ +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + gb_pm_dwb_file = file; + } +#endif + buf_dblwr_init(doublewrite); block1 = buf_dblwr->block1; @@ -426,6 +495,11 @@ buf_dblwr_init_or_load_pages( } /* Read the pages from the doublewrite buffer to memory */ +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + goto skip_load_dwb; + } +#endif err = os_file_read( read_request, file, buf, block1 * UNIV_PAGE_SIZE, @@ -460,6 +534,9 @@ buf_dblwr_init_or_load_pages( return(err); } +#ifdef UNIV_NVDIMM_CACHE +skip_load_dwb: +#endif /* Check if any of these pages is half-written in data files, in the intended position */ @@ -702,7 +779,14 @@ buf_dblwr_free(void) os_event_destroy(buf_dblwr->b_event); os_event_destroy(buf_dblwr->s_event); +#ifdef UNIV_NVDIMM_CACHE + // skip free dwb here + if (!srv_use_nvdimm_dwb) { + ut_free(buf_dblwr->write_buf_unaligned); + } +#else ut_free(buf_dblwr->write_buf_unaligned); +#endif buf_dblwr->write_buf_unaligned = NULL; ut_free(buf_dblwr->buf_block_arr); @@ -747,12 +831,19 @@ buf_dblwr_update( ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free); buf_dblwr->b_reserved--; - - if (buf_dblwr->b_reserved == 0) { + + if (buf_dblwr->b_reserved == 0) { mutex_exit(&buf_dblwr->mutex); /* This will finish the batch. Sync data files to the disk. */ +#ifdef UNIV_NVDIMM_CACHE + // we do not need to flush + if (!srv_use_nvdimm_dwb) { + fil_flush_file_spaces(FIL_TYPE_TABLESPACE); + } +#else fil_flush_file_spaces(FIL_TYPE_TABLESPACE); +#endif mutex_enter(&buf_dblwr->mutex); /* We can now reuse the doublewrite memory buffer: */ @@ -773,6 +864,7 @@ buf_dblwr_update( buf_dblwr->s_reserved--; buf_dblwr->buf_block_arr[i] = NULL; buf_dblwr->in_use[i] = false; + // HOT DEBUG (dwb) break; } } @@ -1025,6 +1117,12 @@ buf_dblwr_flush_buffered_writes(void) buf_dblwr_check_page_lsn(write_buf + len2); } +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + goto flush; + } +#endif + /* Write out the first block of the doublewrite buffer */ len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE, buf_dblwr->first_free) * UNIV_PAGE_SIZE; @@ -1055,7 +1153,14 @@ buf_dblwr_flush_buffered_writes(void) srv_stats.dblwr_writes.inc(); /* Now flush the doublewrite buffer data to disk */ +#ifdef UNIV_NVDIMM_CACHE + // do not need to flush + if (!srv_use_nvdimm_dwb) { + fil_flush(TRX_SYS_SPACE); + } +#else fil_flush(TRX_SYS_SPACE); +#endif /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer @@ -1126,7 +1231,46 @@ buf_dblwr_add_to_batch( byte* p = buf_dblwr->write_buf + univ_page_size.physical() * buf_dblwr->first_free; +#ifdef UNIV_NVDIMM_CACHE + + if (srv_use_nvdimm_dwb) { + + if (bpage->size.is_compressed()) { + UNIV_MEM_ASSERT_RW(bpage->zip.data, bpage->size.physical()); + /* Copy the compressed page and clear the rest. */ + + memcpy(p, bpage->zip.data, bpage->size.physical()); + memset(p + bpage->size.physical(), 0x0, + univ_page_size.physical() - bpage->size.physical()); + flush_cache(p, univ_page_size.physical() - bpage->size.physical()); + } else { + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + + UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame, + bpage->size.logical()); + + memcpy(p, ((buf_block_t*) bpage)->frame, bpage->size.logical()); + flush_cache(p, bpage->size.logical()); + } + + } else { + // original + if (bpage->size.is_compressed()) { + UNIV_MEM_ASSERT_RW(bpage->zip.data, bpage->size.physical()); + /* Copy the compressed page and clear the rest. */ + memcpy(p, bpage->zip.data, bpage->size.physical()); + memset(p + bpage->size.physical(), 0x0, + univ_page_size.physical() - bpage->size.physical()); + } else { + ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame, + bpage->size.logical()); + + memcpy(p, ((buf_block_t*) bpage)->frame, bpage->size.logical()); + } + } +#else if (bpage->size.is_compressed()) { UNIV_MEM_ASSERT_RW(bpage->zip.data, bpage->size.physical()); /* Copy the compressed page and clear the rest. */ @@ -1143,12 +1287,15 @@ buf_dblwr_add_to_batch( memcpy(p, ((buf_block_t*) bpage)->frame, bpage->size.logical()); } +#endif buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage; buf_dblwr->first_free++; buf_dblwr->b_reserved++; + // HOT DEBUG (dwb) + ut_ad(!buf_dblwr->batch_running); ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved); ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size); @@ -1233,6 +1380,8 @@ buf_dblwr_write_single_page( buf_dblwr->s_reserved++; buf_dblwr->buf_block_arr[i] = bpage; + // HOT DEBUG (dwb) + /* increment the doublewrite flushed pages counter */ srv_stats.dblwr_pages_written.inc(); srv_stats.dblwr_writes.inc(); @@ -1259,6 +1408,58 @@ buf_dblwr_write_single_page( write it. This is so because we want to pad the remaining bytes in the doublewrite page with zeros. */ +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + + if (bpage->size.is_compressed()) { + memcpy(buf_dblwr->write_buf + univ_page_size.physical() * i, + bpage->zip.data, bpage->size.physical()); + + memset(buf_dblwr->write_buf + univ_page_size.physical() * i + + bpage->size.physical(), 0x0, + univ_page_size.physical() - bpage->size.physical()); + + flush_cache(buf_dblwr->write_buf, bpage->size.physical()); + } else { + // (jhpark): we do not need fil_io + memcpy(buf_dblwr->write_buf + univ_page_size.physical() * offset, + (void*) ((buf_block_t*) bpage)->frame, + bpage->size.physical()); + } + } else { + // original + + if (bpage->size.is_compressed()) { + memcpy(buf_dblwr->write_buf + univ_page_size.physical() * i, + bpage->zip.data, bpage->size.physical()); + + memset(buf_dblwr->write_buf + univ_page_size.physical() * i + + bpage->size.physical(), 0x0, + univ_page_size.physical() - bpage->size.physical()); + + fil_io(IORequestWrite, true, + page_id_t(TRX_SYS_SPACE, offset), univ_page_size, 0, + univ_page_size.physical(), + (void*) (buf_dblwr->write_buf + + univ_page_size.physical() * i), + NULL); + } else { + + /* It is a regular page. Write it directly to the + doublewrite buffer */ + + fil_io(IORequestWrite, true, + page_id_t(TRX_SYS_SPACE, offset), univ_page_size, 0, + univ_page_size.physical(), + (void*) ((buf_block_t*) bpage)->frame, + NULL); + } + + /* Now flush the doublewrite buffer data to disk */ + fil_flush(TRX_SYS_SPACE); + } + +#else if (bpage->size.is_compressed()) { memcpy(buf_dblwr->write_buf + univ_page_size.physical() * i, bpage->zip.data, bpage->size.physical()); @@ -1285,6 +1486,7 @@ buf_dblwr_write_single_page( /* Now flush the doublewrite buffer data to disk */ fil_flush(TRX_SYS_SPACE); +#endif /* We know that the write has been flushed to disk now and during recovery we will find it in the doublewrite buffer diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index 03025dc3..6efb6159 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -730,6 +730,16 @@ buf_load() buf_load_status(STATUS_INFO, "Buffer pool(s) load completed at %s", now); + + // RECOVERY + /* +#ifdef UNIV_NVDIMM_CACHE + end_time = getticks(); + recovery_time = (unsigned)((end_time-start_time)/CPU_MHZ); + fprintf(stderr, "[INFO] !!! RECOVERY TIME !!! : %u msec\n", recovery_time); +#endif + */ + /* Make sure that estimated = completed when we end. */ mysql_stage_set_work_completed(pfs_stage_progress, dump_n); /* End the stage progress event. */ diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 42dc28d9..247ac2bc 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1047,14 +1047,8 @@ buf_flush_write_block_low( /* Force the log to the disk before writing the modified block */ if (!srv_read_only_mode) { -#ifdef UNIV_NVDIMM_CACHE - if (bpage->buf_pool_index < srv_buf_pool_instances) { - log_write_up_to(bpage->newest_modification, true); - } -#else - log_write_up_to(bpage->newest_modification, true); -#endif /* UNIV_NVDIMM_CACHE */ - } + log_write_up_to(bpage->newest_modification, true); + } switch (buf_page_get_state(bpage)) { case BUF_BLOCK_POOL_WATCH: @@ -1078,8 +1072,8 @@ buf_flush_write_block_low( if (!frame) { frame = ((buf_block_t*) bpage)->frame; } - - buf_flush_init_for_writing( + + buf_flush_init_for_writing( reinterpret_cast(bpage), reinterpret_cast(bpage)->frame, bpage->zip.data ? &bpage->zip : NULL, @@ -1107,33 +1101,52 @@ buf_flush_write_block_low( if (nvdimm_page == NULL) goto normal; - /*ib::info() << "page_id = " << bpage->id.space() + /* + ib::info() << "page_id = " << bpage->id.space() << " offset = " << bpage->id.page_no() << " dst = " << &(((buf_block_t *)nvdimm_page)->frame) << " src = " << &(((buf_block_t *)bpage)->frame) - << " flush-type = " << bpage->flush_type;*/ + << " flush-type = " << bpage->flush_type; + */ + memcpy(((buf_block_t *)nvdimm_page)->frame, ((buf_block_t *)bpage)->frame, UNIV_PAGE_SIZE); /* Set the oldest LSN of the NVDIMM page to the previous newest LSN. */ - buf_flush_note_modification((buf_block_t *)nvdimm_page, bpage->newest_modification, bpage->newest_modification, nvdimm_page->flush_observer); - // TODO: NVDIMM-porting - // 1 +// nvdimm_page->oldest_modification = bpage->oldest_modification; +// nvdimm_page->newest_modification = bpage->newest_modification; + + buf_flush_note_modification((buf_block_t *)nvdimm_page + , bpage->oldest_modification + , bpage->newest_modification + , nvdimm_page->flush_observer); + + pmem_copy_page(((buf_block_t *)bpage)->frame, bpage->id.space(), bpage->id.page_no()); + +// ib::info() << "oldest_modification: " +// << nvdimm_page->oldest_modification +// << nvdimm_page->id.space() << ":" << nvdimm_page->id.page_no(); + flush_cache(((buf_block_t *)nvdimm_page)->frame, UNIV_PAGE_SIZE); - // 2 /* Remove the target page from the original buffer pool. */ buf_page_io_complete(bpage, true); buf_page_io_complete(nvdimm_page); - - /*buf_pool_t* buf_pool = buf_pool_from_bpage(nvdimm_page); + + /* + buf_pool_t* buf_pool = buf_pool_from_bpage(nvdimm_page); ib::info() << nvdimm_page->id.space() << " " << nvdimm_page->id.page_no() << " is moved to " - << nvdimm_page->buf_pool_index << " from " << bpage->buf_pool_index;*/ + << nvdimm_page->buf_pool_index << " from " << bpage->buf_pool_index + << " oldest_modification: " << nvdimm_page->oldest_modification + << " newest_modification: " << nvdimm_page->newest_modification; + */ + } else { normal: bpage->moved_to_nvdimm = false; - /*ib::info() << bpage->id.space() << " " << bpage->id.page_no() + /* + ib::info() << bpage->id.space() << " " << bpage->id.page_no() << " is batch written. cached? " << bpage->cached_in_nvdimm << " moved? " << bpage->moved_to_nvdimm << " flush-type: " << flush_type @@ -1141,7 +1154,8 @@ buf_flush_write_block_low( << " with oldest: " << bpage->oldest_modification << " newest: " << bpage->newest_modification << " lsn-gap: " << bpage->newest_modification - bpage->oldest_modification; -*/ + */ + if (!srv_use_doublewrite_buf || buf_dblwr == NULL || srv_read_only_mode @@ -1155,24 +1169,28 @@ buf_flush_write_block_low( IORequest request(type); - /*lsn_t lsn_gap = bpage->newest_modification - bpage->oldest_modification; + if (bpage->cached_in_nvdimm) { + pmem_evict_page(bpage->id.space(), bpage->id.page_no()); + } - ib::info() << bpage->id.space() << " " << bpage->id.page_no() + /* + lsn_t lsn_gap = bpage->newest_modification - bpage->oldest_modification; + if (bpage->cached_in_nvdimm) { + ib::info() << bpage->id.space() << " " << bpage->id.page_no() << " is batch written. cached? " << bpage->cached_in_nvdimm << " moved? " << bpage->moved_to_nvdimm << " flush-type: " << flush_type << " buf-fix: " << bpage->buf_fix_count << " with oldest: " << bpage->oldest_modification << " newest: " << bpage->newest_modification - << " lsn-gap: " << lsn_gap;*/ - + << " lsn-gap: " << lsn_gap; + } + */ + fil_io(request, sync, bpage->id, bpage->size, 0, bpage->size.physical(), frame, bpage); - - // jhpark: write oldest_modification_lsn of current NVDIMM-caching page - pm_mmap_write_logfile_header_lsn(bpage->oldest_modification); - + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { buf_dblwr_write_single_page(bpage, sync); } else { @@ -1311,7 +1329,8 @@ buf_flush_page( #ifdef UNIV_NVDIMM_CACHE if (bpage->flush_type == BUF_FLUSH_LIST /* Flush list flushing */ - && (bpage->id.space() == 28 || bpage->id.space() == 30 || bpage->id.space() == 32) /* TPC-C tablespaces */ + // (jhpark): modified for 500 wh loading version + && (bpage->id.space() == 27 || bpage->id.space() == 29 || bpage->id.space() == 31) /* TPC-C tablespaces */ && bpage->buf_fix_count == 0 /* Not fixed */ && !bpage->cached_in_nvdimm) { /* Not cached in NVDIMM */ bpage->moved_to_nvdimm = true; diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index a3cdf4a5..fc8ddff4 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -907,7 +907,8 @@ fil_node_open_file( } /* mijin */ - fprintf(stderr, "%s = %lu\n", node->name, space->id); + // HOT DEBUG 2 + //fprintf(stderr, "%s = %lu\n", node->name, space->id); return(true); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 1a744c4c..a3d10dfb 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -20316,6 +20316,10 @@ static MYSQL_SYSVAR_STR(nvdimm_home_dir, srv_nvdimm_home_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, "Path to NVDIMM-aware files.", NULL, NULL, NULL); +static MYSQL_SYSVAR_BOOL(use_nvdimm_dwb, srv_use_nvdimm_dwb, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Enable NVDIMM DWB (disabled by default).", + NULL, NULL, FALSE); #endif /* UNIV_NVDIMM_CACHE */ static struct st_mysql_sys_var* innobase_system_variables[]= { @@ -20496,6 +20500,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(nvdimm_buffer_pool_instances), MYSQL_SYSVAR(nvdimm_pc_threshold_pct), MYSQL_SYSVAR(nvdimm_home_dir), + MYSQL_SYSVAR(use_nvdimm_dwb), #endif /* UNIV_NVDIMM_CACHE */ NULL }; diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index f2afda23..c72f7f38 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -459,6 +459,10 @@ UNIV_INLINE ulint buf_pool_get_n_pages(void); /*=======================*/ +// HOT DEBUG +lsn_t +nvdimm_buf_pool_get_oldest_modification(void); + /********************************************************************//** Gets the smallest oldest_modification lsn for any page in the pool. Returns zero if all modified pages have been flushed to disk. diff --git a/storage/innobase/include/pmem_mmap_obj.h b/storage/innobase/include/pmem_mmap_obj.h index b0de14e5..3a86536c 100644 --- a/storage/innobase/include/pmem_mmap_obj.h +++ b/storage/innobase/include/pmem_mmap_obj.h @@ -11,6 +11,13 @@ #include #include +#include +#include +#include + +#include +#include + //#include "ut0new.h" //#include "log0log.h" @@ -199,24 +206,44 @@ bool pm_mmap_recv(uint64_t start_offset, uint64_t end_offset); uint64_t pm_mmap_recv_check(PMEM_MMAP_MTRLOGFILE_HDR* log_fil_hdr); void pm_mmap_recv_flush_buffer(); +// add +extern std::map , std::vector > pmem_nc_buffer_map; +extern std::map , std::vector > pmem_nc_page_map; +uint64_t pm_mmap_recv_check_nc_buf(uint64_t space, uint64_t page_no); +void nc_recv_analysis(); + // TODO(jhpark): covert these variables as structure (i.e., recv_sys_t) extern bool is_pmem_recv; extern uint64_t pmem_recv_offset; extern uint64_t pmem_recv_size; +extern uint64_t pmem_lsn; +extern uint64_t pmem_page_offset; +void nc_save_pmem_lsn(); +void pmem_copy_page(unsigned char* frame, uint64_t space, uint64_t page_no); +void pmem_evict_page(uint64_t space, uint64_t page_no); +uint64_t pm_mmap_recv_check_nc_page(uint64_t space, uint64_t page_no); + +// page map offset list +// (key : page_id (space, page_no & value : offset in page offset +extern std::map , uint64_t > pmem_nc_page_offset_map; +extern std::queue pmem_nc_page_offset_list; + +// time measurement +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned a, d; + asm("cpuid"); + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + + return (((ticks)a) | (((ticks)d) << 32)); +} + +extern ticks start_time; +extern ticks end_time; +extern unsigned recovery_time; +#define CPU_MHZ 1199703 -/** Recovery system data structure */ -//struct recv_sys_t{ -// ib_mutex_t mutex; - /*!< mutex protecting the fields apply_log_recs, - n_addrs, and the state field in each recv_addr struct */ -// ib_mutex_t writer_mutex; - /*!< mutex coordinating - flushing between recv_writer_thread and the recovery thread. */ -// ibool apply_log_recs; - /*!< this is TRUE when log rec application to pages is allowed; this flag tells the - i/o-handler if it should do log record application */ -// byte* buf; /*!< buffer for parsing log records */ -// ulint len; /*!< amount of data in buf */ -//}; #endif /* __PMEMMAPOBJ_H__ */ diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index d72c1593..a7c3f89d 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -320,6 +320,8 @@ extern ulong srv_nvdimm_buf_pool_instances; extern ulong srv_nvdimm_pc_threshold_pct; /** NVDIMM-aware file resident directory */ extern char* srv_nvdimm_home_dir; +/** NVDIMM DWB enable */ +extern my_bool srv_use_nvdimm_dwb; #endif /* UNIV_NVDIMM_CACHE */ /** Requested size in bytes */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 2edb6c9b..a0517c16 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -57,6 +57,11 @@ Created 12/9/1995 Heikki Tuuri #include "sync0sync.h" #endif /* !UNIV_HOTBACKUP */ +#ifdef UNIV_NVDIMM_CACHE +#include "pmem_mmap_obj.h" +extern unsigned char* gb_pm_mmap; +#endif + /* General philosophy of InnoDB redo-logs: @@ -149,7 +154,7 @@ log_buf_pool_get_oldest_modification(void) if (!lsn) { lsn = log_sys->lsn; - } + } return(lsn); } @@ -428,8 +433,12 @@ log_write_low( - (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_TRL_SIZE; } - +#ifdef UNIV_NVDIMM_CACHE + ut_memcpy(log->buf + log->buf_free, str, len); + flush_cache(log->buf+log->buf_free, len); +#else ut_memcpy(log->buf + log->buf_free, str, len); +#endif str_len -= len; str = str + len; @@ -807,10 +816,17 @@ log_init(void) log_sys->buf_size = LOG_BUFFER_SIZE; + /* nc-logging */ +#ifdef UNIV_NVDIMM_CACHE + log_sys->buf_ptr = static_cast(gb_pm_mmap); + log_sys->buf = static_cast( + ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); +#else log_sys->buf_ptr = static_cast( ut_zalloc_nokey(log_sys->buf_size * 2 + OS_FILE_LOG_BLOCK_SIZE)); log_sys->buf = static_cast( ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE)); +#endif log_sys->first_in_use = true; @@ -1797,7 +1813,25 @@ log_checkpoint( log_mutex_enter(); ut_ad(!recv_no_log_write); - oldest_lsn = log_buf_pool_get_oldest_modification(); + oldest_lsn = log_buf_pool_get_oldest_modification(); + + // HOT DEBUG 2 // +#ifdef UNIV_NVDIMM_CACHE + if (recovery_time == 0) { + end_time = getticks(); + recovery_time = (unsigned)((end_time-start_time)/CPU_MHZ); + fprintf(stderr, "[INFO] !!! RECOVERY TIME !!! : %u msec\n", recovery_time); + } + lsn_t nvdimm_lsn = nvdimm_buf_pool_get_oldest_modification(); + if (nvdimm_lsn !=0 + && nvdimm_lsn < oldest_lsn) { + ib::info() << "nvdimm_lsn: " + << nvdimm_lsn << " oldest_lsn: " << oldest_lsn + << " the gap: " << oldest_lsn - nvdimm_lsn; + oldest_lsn = nvdimm_lsn; + } +#endif + /* Because log also contains headers and dummy log records, log_buf_pool_get_oldest_modification() will return log_sys->lsn diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index f7030cd9..30495a36 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -100,6 +100,10 @@ number (FIL_PAGE_LSN) is in the future. Initially FALSE, and set by recv_recovery_from_checkpoint_start(). */ bool recv_lsn_checks_on; +#ifdef UNIV_NVDIMM_CACHE +#include "pmem_mmap_obj.h" +#endif + /** If the following is TRUE, the buffer pool file pages must be invalidated after recovery and no ibuf operations are allowed; this becomes TRUE if the log record hash table becomes too full, and log records must be merged @@ -2243,7 +2247,7 @@ recv_add_to_hash_table( recv_fold(space, page_no), recv_addr); recv_sys->n_addrs++; // debug -#if 1 +#if 0 fprintf(stderr, "Inserting log rec for space %lu, page %lu\n", space, page_no); #endif @@ -2422,6 +2426,71 @@ recv_recover_page_func( recv = UT_LIST_GET_FIRST(recv_addr->rec_list); + /* nc-logging */ +#ifdef UNIV_NVDIMM_CACHE + extern unsigned char* gb_pm_mmap; + bool nc_page_flag = false; + bool nc_corrupt_flag = false; + uint64_t cur_nc_page_lsn = -1; + + uint64_t cur_nc_buf_offset = pm_mmap_recv_check_nc_buf( + block->page.id.space(), block->page.id.page_no()); + + if (cur_nc_buf_offset != -1) { + // (jhpark): now, we know this page reside in the NVDIMM buffer. + nc_page_flag = true; + unsigned char *nc_frame = reinterpret_cast + ((gb_pm_mmap + (1*1024*1024*1024UL) + cur_nc_buf_offset))->frame; + + uint64_t cur_disk_page_lsn = mach_read_from_8(block->frame + FIL_PAGE_LSN); + cur_nc_page_lsn = mach_read_from_8(nc_frame+FIL_PAGE_LSN); + + // check nc buffer page corruption or not + if (buf_page_is_corrupted(true + , nc_frame + , block->page.size + , fsp_is_checksum_disabled(block->page.id.space()))) { + nc_corrupt_flag = true; + } + +#ifdef UNIV_DEBUG + ib::info << "(recovery) offset: " + << cur_nc_buf_offset + << " current page is NC page and LSN: " + << cur_nc_page_lsn + << " disk lsn : " << cur_disk_page_lsn + << " corrupted? : " << nc_corrupt_flag; +#endif + + // recover from NC buffer + if (!nc_corrupt_flag || cur_disk_page_lsn == 0) { + memcpy(block->frame, nc_frame, UNIV_PAGE_SIZE); + page_lsn = cur_nc_page_lsn; + end_lsn = recv->start_lsn + recv->len; + mach_write_to_8(FIL_PAGE_LSN + (block->frame), end_lsn); + mach_write_to_8(UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + + (block->frame), end_lsn); + goto skip_redo; + } else { + uint64_t cur_nc_page_offset = pm_mmap_recv_check_nc_buf( + block->page.id.space(), block->page.id.page_no()); + if (cur_nc_page_offset != -1) { + + memcpy(block->frame, nc_frame, UNIV_PAGE_SIZE); + page_lsn = cur_nc_page_lsn; + end_lsn = recv->start_lsn + recv->len; + mach_write_to_8(FIL_PAGE_LSN + (block->frame), end_lsn); + mach_write_to_8(UNIV_PAGE_SIZE + - FIL_PAGE_END_LSN_OLD_CHKSUM + + (block->frame), end_lsn); + goto skip_redo; + } + } + + } // end-of-if +#endif + while (recv) { end_lsn = recv->end_lsn; @@ -2517,7 +2586,11 @@ recv_recover_page_func( } recv = UT_LIST_GET_NEXT(rec_list, recv); - } + } // end-of-while (recv) + +#ifdef UNIV_NVDIMM_CACHE +skip_redo: +#endif #ifdef UNIV_ZIP_DEBUG if (fil_page_index_page_check(page)) { @@ -3724,7 +3797,7 @@ recv_scan_log_recs( = log_block_get_checkpoint_no(log_block); } - if (data_len < OS_FILE_LOG_BLOCK_SIZE) { + if (data_len < OS_FILE_LOG_BLOCK_SIZE){ /* Log data for this group ends here */ finished = true; break; @@ -4075,6 +4148,20 @@ recv_recovery_from_checkpoint_start( checkpoint_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_LSN); checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO); +#ifdef UNIV_NVDIMM_CACHE + ib::info() << "Reocvery start from this checkpoint_lsn: " << checkpoint_lsn; + // HOT DEBUG + /* + extern unsigned char* gb_pm_mmap; + if (is_pmem_recv) { + uint64_t cur_pmem_lsn = 0; + memcpy(&cur_pmem_lsn, gb_pm_mmap+6*1024*1024*1024UL ,sizeof(uint64_t)); + checkpoint_lsn = cur_pmem_lsn; + ib::info() << "Reocvery start from this checkpoint_lsn (recv): " << checkpoint_lsn; + } + */ +#endif + /* Read the first log file header to print a note if this is a recovery from a restored InnoDB Hot Backup */ @@ -4130,7 +4217,24 @@ recv_recovery_from_checkpoint_start( group = UT_LIST_GET_FIRST(log_sys->log_groups); ut_ad(recv_sys->n_addrs == 0); + // HOT DEBUG +#ifdef UNIV_NVDIMM_CACHE + if (is_pmem_recv) { +/* + extern unsigned char* gb_pm_mmap; + uint64_t cur_pmem_lsn = 0; + memcpy(&cur_pmem_lsn, gb_pm_mmap+6*1024*1024*1024UL ,sizeof(uint64_t)); + contiguous_lsn = cur_pmem_lsn; + ib::info() << "log chopping we use this lsn for congiguous_lns : " << contiguous_lsn; +*/ + contiguous_lsn = checkpoint_lsn; + } else { + contiguous_lsn = checkpoint_lsn; + } +#else contiguous_lsn = checkpoint_lsn; +#endif + switch (group->format) { case 0: log_mutex_exit(); @@ -4148,7 +4252,6 @@ recv_recovery_from_checkpoint_start( the hash table. */ rescan = recv_group_scan_log_recs(group, &contiguous_lsn, false); - if ((recv_sys->found_corrupt_log && !srv_force_recovery) || recv_sys->found_corrupt_fs) { log_mutex_exit(); @@ -4156,6 +4259,9 @@ recv_recovery_from_checkpoint_start( } if (recv_sys->mlog_checkpoint_lsn == 0) { +#ifdef UNIV_NVDIMM_CACHE + goto skip_2; +#endif if (!srv_read_only_mode && group->scanned_lsn != checkpoint_lsn) { ib::error() << "Ignoring the redo log due to missing" @@ -4170,7 +4276,8 @@ recv_recovery_from_checkpoint_start( group->scanned_lsn = checkpoint_lsn; rescan = false; - } + } + /* NOTE: we always do a 'recovery' at startup, but only if there is something wrong we will print a message to the @@ -4242,6 +4349,7 @@ recv_recovery_from_checkpoint_start( " database is now corrupt!"; } + if (recv_sys->recovered_lsn < checkpoint_lsn) { log_mutex_exit(); @@ -4253,6 +4361,8 @@ recv_recovery_from_checkpoint_start( return(DB_ERROR); } +skip_2: + /* Synchronize the uncorrupted log groups to the most up-to-date log group; we also copy checkpoint info to groups */ diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index fc79bc40..350c67d8 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -617,8 +617,11 @@ mtr_t::commit() ut_ad(!srv_read_only_mode || m_impl.m_log_mode == MTR_LOG_NO_REDO); - +#ifdef UNIV_NVDIMM_CACHE + cmd.execute_nvm(); +#else cmd.execute(); +#endif } else { cmd.release_all(); cmd.release_resources(); @@ -669,6 +672,10 @@ mtr_t::commit_checkpoint( # error SIZE_OF_MLOG_CHECKPOINT != 9 #endif *ptr = MLOG_CHECKPOINT; + + // HOT DEBUG + // (jhpark): we need to leave the current NC pages + //ib::info() << "checkpoint_lsn: " << checkpoint_lsn; mach_write_to_8(ptr + 1, checkpoint_lsn); } @@ -1164,34 +1171,131 @@ mtr_t::Command::execute() void mtr_t::Command::execute_nvm() { ut_ad(m_impl->m_log_mode != MTR_LOG_NONE); - if (const ulint len = prepare_write_nvm()) { - finish_write_nvm(len); - } + // (jhpark): pull prepare_write() fucntion here + ulint len, n_recs; + fil_space_t* space; + + switch (m_impl->m_log_mode) { + case MTR_LOG_SHORT_INSERTS: + ut_ad(0); + /* fall through (write no redo log) */ + case MTR_LOG_NO_REDO: + case MTR_LOG_NONE: + ut_ad(m_impl->m_log.size() == 0); + log_mutex_enter(); + m_end_lsn = m_start_lsn = log_sys->lsn; + len = 0; + break; + case MTR_LOG_ALL: + break; + } - m_impl->m_mtr->m_commit_lsn = m_end_lsn; - release_blocks(); - release_latches(); - release_resources(); + if (m_impl->m_log_mode == MTR_LOG_ALL) { + len = m_impl->m_log.size(); + n_recs = m_impl->m_n_log_recs; + ut_ad(len > 0); + ut_ad(n_recs > 0); + + // (jhpark): call log_buffer_extend here!!! + if (len > log_sys->buf_size / 2) { + log_buffer_extend((len + 1) * 2); + } + + ut_ad(m_impl->m_n_log_recs == n_recs); + space = m_impl->m_user_space; + + if (space != NULL && is_system_or_undo_tablespace(space->id)) { + /* Omit MLOG_FILE_NAME for predefined tablespaces. */ + space = NULL; + } + + log_mutex_enter(); + + if (fil_names_write_if_was_clean(space, m_impl->m_mtr)) { + /* This mini-transaction was the first one to modify + this tablespace since the latest checkpoint, so + some MLOG_FILE_NAME records were appended to m_log. */ + ut_ad(m_impl->m_n_log_recs > n_recs); + mlog_catenate_ulint( + &m_impl->m_log, MLOG_MULTI_REC_END, MLOG_1BYTE); + len = m_impl->m_log.size(); + } else { + /* This was not the first time of dirtying a + tablespace since the latest checkpoint. */ + + ut_ad(n_recs == m_impl->m_n_log_recs); + + if (n_recs <= 1) { + ut_ad(n_recs == 1); + + /* Flag the single log record as the + only record in this mini-transaction. */ + *m_impl->m_log.front()->begin() + |= MLOG_SINGLE_REC_FLAG; + } else { + /* Because this mini-transaction comprises + multiple log records, append MLOG_MULTI_REC_END + at the end. */ + + mlog_catenate_ulint( + &m_impl->m_log, MLOG_MULTI_REC_END, + MLOG_1BYTE); + len++; + } + } + + /* check and attempt a checkpoint if exceeding capacity */ + log_margin_checkpoint_age(len); + } + // (jhpark): end-of-prepare_write() + // (jhpark): pull finish_write() + if (len > 0) { + ut_ad(m_impl->m_log_mode == MTR_LOG_ALL); + ut_ad(log_mutex_own()); + ut_ad(m_impl->m_log.size() == len); + ut_ad(len > 0); + + if (m_impl->m_log.is_small()) { + const mtr_buf_t::block_t* front = m_impl->m_log.front(); + ut_ad(len <= front->used()); + + m_end_lsn = log_reserve_and_write_fast( + front->begin(), len, &m_start_lsn); + + if (m_end_lsn > 0) { + goto skip_redo; + } + } + + /* Open the database log for log_write_low */ + m_start_lsn = log_reserve_and_open(len); + mtr_write_log_t write_log; + m_impl->m_log.for_each_block(write_log); + + m_end_lsn = log_close(); + } + // (jhpark): end-of-finish_write() + +skip_redo: + if (m_impl->m_made_dirty) { + log_flush_order_mutex_enter(); + } + + /* It is now safe to release the log mutex because the + flush_order mutex will ensure that we are the first one + to insert into the flush list. */ + log_mutex_exit(); + + m_impl->m_mtr->m_commit_lsn = m_end_lsn; -// TODO(jhpark): add flush_order mutex when nvdimm caching page is flushed. -// if (m_impl->m_made_dirty) { -// log_flush_order_mutex_enter(); -// } -// /* It is now safe to release the log mutex because the -// flush_order mutex will ensure that we are the first one -// to insert into the flush list. */ -// -// fprintf(stderr, "log_mutex_exit() called! m_end_lsn: %lu\n", m_end_lsn); -// log_mutex_exit(); -// fprintf(stderr, "log_mutex_exit() called! -- finished\n"); -// m_impl->m_mtr->m_commit_lsn = m_end_lsn; -// release_blocks(); -// if (m_impl->m_made_dirty) { -// log_flush_order_mutex_exit(); -// } -// release_latches(); -// release_resources(); + release_blocks(); + + if (m_impl->m_made_dirty) { + log_flush_order_mutex_exit(); + } + release_latches(); + release_resources(); } void mtr_t::Command::execute_no_nvm() { diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index 4412ecfb..838d35a0 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1533,6 +1533,7 @@ page_cur_insert_rec_low( /* 9. Write log record of the insert */ if (UNIV_LIKELY(mtr != NULL)) { #ifdef UNIV_NVDIMM_CACHE + /* ulint page_no = page_get_page_no(page); ulint space_id = page_get_space_id(page); buf_block_t* nvm_block = buf_page_get(page_id_t(space_id, page_no), @@ -1540,7 +1541,6 @@ page_cur_insert_rec_low( assert(nvm_block != NULL); buf_page_t* nvm_bpage = &nvm_block->page; - if (nvm_bpage->cached_in_nvdimm) { // skip generating REDO log for nvm-page pm_mmap_mtrlogbuf_commit(nvm_block->frame, UNIV_PAGE_SIZE, space_id, page_no); @@ -1550,6 +1550,11 @@ page_cur_insert_rec_low( page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, index, mtr); } + */ + // (jhpark): add REDO log for NC pages + page_cur_insert_rec_write_log(insert_rec, rec_size, + current_rec, index, mtr); + #else page_cur_insert_rec_write_log(insert_rec, rec_size, current_rec, index, mtr); @@ -1944,6 +1949,7 @@ page_cur_insert_rec_zip( page_zip, page, index, level, NULL, NULL)) { #ifdef UNIV_NVDIMM_CACHE + /* buf_block_t* nvm_block = page_cur_get_block(cursor); assert(nvm_block != NULL); @@ -1959,6 +1965,13 @@ page_cur_insert_rec_zip( insert_rec, rec_size, cursor->rec, index, mtr); } + */ + + // (jhpark): add REDO log for NC pages + page_cur_insert_rec_write_log( + insert_rec, rec_size, + cursor->rec, index, mtr); + #else page_cur_insert_rec_write_log( insert_rec, rec_size, @@ -2235,6 +2248,7 @@ page_cur_insert_rec_zip( /* 9. Write log record of the insert */ if (UNIV_LIKELY(mtr != NULL)) { #ifdef UNIV_NVDIMM_CACHE + /* buf_block_t* nvm_block = page_cur_get_block(cursor); assert(nvm_block != NULL); @@ -2250,6 +2264,11 @@ page_cur_insert_rec_zip( page_cur_insert_rec_write_log(insert_rec, rec_size, cursor->rec, index, mtr); } + */ + // (jhpark): add REDO log for NC pages + page_cur_insert_rec_write_log(insert_rec, rec_size, + cursor->rec, index, mtr); + #else page_cur_insert_rec_write_log(insert_rec, rec_size, cursor->rec, index, mtr); @@ -2473,6 +2492,7 @@ page_copy_rec_list_end_to_created_page( rec_offs_make_valid(insert_rec, index, offsets); #ifdef UNIV_NVDIMM_CACHE + /* ulint page_no = page_get_page_no(new_page); ulint space_id = page_get_space_id(new_page); buf_block_t* nvm_block = buf_page_get(page_id_t(space_id, page_no), @@ -2489,6 +2509,11 @@ page_copy_rec_list_end_to_created_page( page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); } + */ + // (jhpark): add REDO log for NC pages + page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, + index, mtr); + #else page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec, index, mtr); diff --git a/storage/innobase/pmem/pmem0mmap.cc b/storage/innobase/pmem/pmem0mmap.cc index 7b0d8638..49aeec74 100644 --- a/storage/innobase/pmem/pmem0mmap.cc +++ b/storage/innobase/pmem/pmem0mmap.cc @@ -18,15 +18,34 @@ unsigned char* gb_pm_mmap; int gb_pm_mmap_fd; PMEM_MMAP_MTRLOG_BUF* mmap_mtrlogbuf = NULL; +// HOT DEBUG +uint64_t pmem_lsn; +uint64_t pmem_page_offset; // recovery bool is_pmem_recv = false; uint64_t pmem_recv_offset = 0; uint64_t pmem_recv_size = 0; - +std::map ,std::vector > pmem_nc_buffer_map; +std::map , std::vector > pmem_nc_page_map; +std::map , uint64_t > pmem_nc_page_offset_map; +std::queue pmem_nc_page_offset_list; unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { - + +#ifdef UNIV_NVDIMM_CACHE + if (srv_use_nvdimm_dwb) { + ib::info() << "INODB DWB ON!"; + } +#endif + + // (jhpark): initialize the pmem_page_offset_list here + // TODO(xxx): use server variable + for (int i=0; i< (1024*1024*1024/4096); i++) { + pmem_nc_page_offset_list.push(i*4096); + } + // + if (access(path, F_OK) != 0) { gb_pm_mmap_fd = open(path, O_RDWR | O_CREAT, 0777); if (gb_pm_mmap_fd < 0) { @@ -47,7 +66,7 @@ unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { } else { // TODO(jhaprk) add the recovery logic - PMEMMMAP_INFO_PRINT("Start mtr recvoery process\n"); + PMEMMMAP_INFO_PRINT("Start NC recvoery process\n"); gb_pm_mmap_fd = open(path, O_RDWR, 0777); if (gb_pm_mmap_fd < 0) { @@ -60,52 +79,9 @@ unsigned char* pm_mmap_create(const char* path, const uint64_t pool_size) { PMEMMMAP_ERROR_PRINT("pm_mmap mmap() faild recovery failed\n"); } - // get file construct - PMEM_MMAP_MTRLOGFILE_HDR* recv_mmap_mtrlog_fil_hdr = (PMEM_MMAP_MTRLOGFILE_HDR*) - malloc(PMEM_MMAP_LOGFILE_HEADER_SZ); - pm_mmap_read_logfile_header(recv_mmap_mtrlog_fil_hdr); - - // debug - fprintf(stderr, "[check] size: %lu, lsn: %lu, ckpt_lsn: %lu, ckpt_offset: %lu\n", - recv_mmap_mtrlog_fil_hdr->size, recv_mmap_mtrlog_fil_hdr->flushed_lsn, - recv_mmap_mtrlog_fil_hdr->ckpt_lsn, recv_mmap_mtrlog_fil_hdr->ckpt_offset); - - // recvoery check - PMEM_MMAP_MTRLOG_HDR* recv_mmap_mtrlog_hdr = (PMEM_MMAP_MTRLOG_HDR*) malloc(PMEM_MMAP_MTRLOG_HDR_SIZE); - memcpy(recv_mmap_mtrlog_hdr, gb_pm_mmap+recv_mmap_mtrlog_fil_hdr->ckpt_offset, PMEM_MMAP_MTRLOG_HDR_SIZE); - - if (recv_mmap_mtrlog_fil_hdr->size == PMEM_MMAP_MTR_FIL_HDR_SIZE - || recv_mmap_mtrlog_hdr->need_recv == false) { - PMEMMMAP_INFO_PRINT("Normal Shutdown case, don't need to recveory; Recovery process is terminated\n"); - } else { - // TODO(jhpark): real recovery process - is_pmem_recv = true; - pmem_recv_offset = pm_mmap_recv_check(recv_mmap_mtrlog_fil_hdr); - pmem_recv_size = recv_mmap_mtrlog_fil_hdr->size; - - // jhpark: check buffer!!!!! - // pm_mmap_recv_flush_buffer(); - - PMEMMMAP_INFO_PRINT("recovery offset: %lu\n", pmem_recv_offset); - } - - // step1. allocate mtr_recv_sys - // step2. 1) get header infor mation and 2) get info from mtr log region - // step3. reconstruct undo page - - // Get header information from exsiting nvdimm log file - //size_t recv_prev_offset = recv_mmap_mtrlog_hdr->prev; - //memset(recv_mmap_mtrlog_hdr, 0x00, PMEM_MMAP_MTRLOG_HDR_SIZE); - //memcpy(recv_mmap_mtrlog_hdr, gb_pm_mmap+recv_prev_offset, PMEM_MMAP_MTRLOG_HDR_SIZE); - - // debug - //fprintf(stderr, "size: %lu\n", recv_size); - //fprintf(stderr, "len: %lu\n", recv_mmap_mtrlog_hdr->len); - //fprintf(stderr, "lsn: %lu\n", recv_mmap_mtrlog_hdr->lsn); - //fprintf(stderr, "need_recovery: %d\n", recv_mmap_mtrlog_hdr->need_recv); - - free(recv_mmap_mtrlog_fil_hdr); - free(recv_mmap_mtrlog_hdr); + // TODO(jhpark): fix + //memcpy(gb_pm_mmap + (6*1024*1024*1024UL), gb_pm_mmap + (1*1024*1024*1024UL), (8UL*147324928)); + is_pmem_recv = true; } // Force to set NVIMMM @@ -396,91 +372,30 @@ ssize_t pm_mmap_mtrlogbuf_write( return ret; } -// commit mtr log -void pm_mmap_mtrlogbuf_commit(unsigned char* rec, unsigned long cur_rec_size ,ulint space, ulint page_no) { - // TODO(jhaprk): Keep page modification finish log for recovery - // For current mtr logging version, we jsut ignore this function - //return; - flush_cache(rec, cur_rec_size); -/* - if (mmap_mtrlogbuf == NULL) return; - - //fprintf(stderr, "[mtr-commit] space: %lu page_no: %lu\n", space, page_no); - // 1. check current cur_offset - size_t cur_offset = mmap_mtrlogbuf->cur_offset; - // 2. check current ckpt_offset - size_t ckpt_offset = mmap_mtrlogbuf->ckpt_offset; - // 3. remove stale log data - memset(gb_pm_mmap + PMEM_MMAP_MTR_FIL_HDR_SIZE, 0x00, cur_offset - PMEM_MMAP_MTR_FIL_HDR_SIZE); - //fprintf(stderr, "cur_offset: %lu ckpt_offset: %lu\n", cur_offset, ckpt_offset); - mmap_mtrlogbuf->cur_offset = PMEM_MMAP_MTR_FIL_HDR_SIZE; - mmap_mtrlogbuf->prev_offset = PMEM_MMAP_MTR_FIL_HDR_SIZE; - // really needed? - pm_mmap_write_logfile_header_size(PMEM_MMAP_MTR_FIL_HDR_SIZE); -*/ - -} +// HOT DEBUG // +void pmem_copy_page(unsigned char* frame, uint64_t space, uint64_t page_no) { + pthread_mutex_lock(&mmap_mtrlogbuf->mtrMutex); + uint64_t pmem_page_offset = pmem_nc_page_offset_list.front(); + pmem_nc_page_offset_map[std::make_pair(space, page_no)] = pmem_page_offset; + pmem_nc_page_offset_list.pop(); + pthread_mutex_unlock(&mmap_mtrlogbuf->mtrMutex); -// compare mtr log with given space_id, and page_no -// offset is start offset of "log body" of mtr log -bool pm_mmap_mtrlogbuf_identify(size_t offset, size_t n, ulint space, ulint page_no) { - // mtr log structure: [type(1)] [space_id(4)] [page_no(4)] - // mach_write_compressed used when writing space_id and page_no - // + 1 means jump over MTR_LOG_TYPE - ulint cur_space, cur_page; - const byte *ptr = gb_pm_mmap+offset; - const byte *end_ptr = gb_pm_mmap+offset+n; - ptr++; - - cur_space = mach_parse_compressed(&ptr, end_ptr); - if (ptr != NULL) { - cur_page = mach_parse_compressed(&ptr, end_ptr); - } - - //fprintf(stderr, "[mtr identify] space(%lu) : %lu pange_no(%lu) : %lu\n", space, cur_space, page_no, cur_page); - return ((cur_space == space) && (cur_page == page_no)); + memcpy(gb_pm_mmap + 10*1024*1024*1024UL + pmem_page_offset, frame, UNIV_PAGE_SIZE); + flush_cache(gb_pm_mmap + 10*1024*1024*1024UL + pmem_page_offset, UNIV_PAGE_SIZE); } -void pm_mmap_mtrlogbuf_unset_recv_flag(size_t offset) { - memcpy(gb_pm_mmap + offset, false, sizeof(bool)); - // need flush? No we can recover by using commit log -} +void pmem_evict_page(uint64_t space, uint64_t page_no) { -void pm_mmap_mtrlogbuf_commit_v1(ulint space, ulint page_no) { - // 1. start to inspect mtr log from latest ckpt_offset - // 2. check specific mtr log with spaced_id and page_no - // 2.1 (yes) check need_recv is set goto 3.1 - // 2.2 (no) check need recv is set goto 3.2 - // 3.1. update ckpt_offset to current offset - // 4. move to next mtr log (until cur_offset) - - if (mmap_mtrlogbuf == NULL) return; - - size_t offset = mmap_mtrlogbuf->ckpt_offset; - while (offset != mmap_mtrlogbuf->cur_offset) { - - fprintf(stderr, "offset : %lu cur_offset: %lu\n", offset, mmap_mtrlogbuf->cur_offset); - PMEM_MMAP_MTRLOG_HDR mmap_mtr_hdr; - memcpy(&mmap_mtr_hdr, gb_pm_mmap + offset, (size_t) PMEM_MMAP_MTRLOG_HDR_SIZE); - uint64_t data_len = mmap_mtr_hdr.len; - bool need_recv = mmap_mtr_hdr.need_recv; - - fprintf(stderr, "[mtr info] data_len : %lu lsn: %lu need_recv : %d\n", - data_len, mmap_mtr_hdr.lsn, need_recv); - - // move next - uint64_t org_offset = offset; - offset += PMEM_MMAP_MTRLOG_HDR_SIZE; - - if (pm_mmap_mtrlogbuf_identify(offset, data_len, space, page_no)) { - pm_mmap_mtrlogbuf_unset_recv_flag(org_offset); - } - if (need_recv) { - mmap_mtrlogbuf->ckpt_offset = org_offset; - } - offset += data_len; - } - fprintf(stderr, "break out ! ckpt_offset: %lu\n", mmap_mtrlogbuf->ckpt_offset); + uint64_t offset = -1; + std::map,uint64_t>::iterator iter; + pthread_mutex_lock(&mmap_mtrlogbuf->mtrMutex); + iter = pmem_nc_page_offset_map.find(std::make_pair(space, page_no)); + if (iter != pmem_nc_page_offset_map.end()) { + offset = iter->second; + } else { + ib::error() << "error : " << space << ":" << page_no; + } + pmem_nc_page_offset_list.push(offset); + pthread_mutex_unlock(&mmap_mtrlogbuf->mtrMutex); } - diff --git a/storage/innobase/pmem/pmem0recv.cc b/storage/innobase/pmem/pmem0recv.cc index 69744416..1a2301e0 100644 --- a/storage/innobase/pmem/pmem0recv.cc +++ b/storage/innobase/pmem/pmem0recv.cc @@ -184,3 +184,155 @@ void pm_mmap_recv_flush_buffer() { // note that changes on these pages are not atomic // they might have partial updates } + +uint64_t pm_mmap_recv_check_nc_buf(uint64_t space, uint64_t page_no) { + std::map, std::vector >::iterator ncbuf_iter; + ncbuf_iter = pmem_nc_buffer_map.find(std::make_pair(space,page_no)); + if (ncbuf_iter != pmem_nc_buffer_map.end()) { + std::vector nc_offset_vec = (*ncbuf_iter).second; + uint64_t nc_offset; + for (uint64_t i=0; i + ((gb_pm_mmap + (1*1024*1024*1024UL) + nc_offset))->frame; + + if (space != mach_read_from_4(nc_frame + FIL_PAGE_SPACE_ID) + || page_no != mach_read_from_4(nc_frame + FIL_PAGE_OFFSET)) { + fprintf(stderr, "[DEBUG] wrong buffer page info! %u:%u\n", space, page_no); + } + } + return nc_offset; + } else { + return -1; + } +} + +uint64_t pm_mmap_recv_check_nc_page(uint64_t space, uint64_t page_no) { + std::map, std::vector >::iterator ncbuf_iter; + ncbuf_iter = pmem_nc_page_map.find(std::make_pair(space,page_no)); + if (ncbuf_iter != pmem_nc_buffer_map.end()) { + std::vector nc_offset_vec = (*ncbuf_iter).second; + uint64_t nc_offset; + for (uint64_t i=0; i( (8*147324928UL)/4096); + + for (uint64_t i=0; i < page_num_chunks; ++i) { + //for (uint64_t i=0; i < srv_nvdimm_buf_pool_size; i+= UNIV_PAGE_SIZE) { + + space = reinterpret_cast((addr+ i * sizeof(buf_block_t)))->page.id.space(); + page_no = reinterpret_cast((addr+ i * sizeof(buf_block_t)))->page.id.page_no(); + unsigned char *frame = reinterpret_cast((addr+ i * sizeof(buf_block_t)))->frame; + + // HOT DEBUG // + //space = reinterpret_cast((addr+ i ))->page.id.space(); + //page_no = reinterpret_cast((addr+ i ))->page.id.page_no(); + //unsigned char *frame = (unsigned char*)(addr+ i); + + if (space != 27 && space != 29 && space != 31) { + if (space == 4294967295 + && page_no == 4294967295) { + continue; + } else { + break; + } + } else { +#ifdef UNIV_DEBUG + ib::info() << "obtaine NC page: " << space << ":" << page_no; + // check + if (space != mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + || page_no != mach_read_from_4(frame + FIL_PAGE_OFFSET)) { + ib::info() << " wrong NC page frame info expected: " + << space << ":" << page_no + << " current value: " << mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + << ":" << mach_read_from_4(frame + FIL_PAGE_OFFSET); + } +#endif + } + + + +#ifdef PMEM_RECV_DEBUG + fil_space_t* space_t = fil_space_get(space); + const page_id_t page_id(space,page_no); + const page_size_t page_size(space_t->flags); + if (buf_page_is_corrupted(true, frame, page_size, + fsp_is_checksum_disabled(space))) { + fprintf(stderr, "(%lu:%lu) page is corruptted! lsn: %lu\n", space, page_no, mach_read_from_8(frame + FIL_PAGE_LSN)); + } else { + fprintf(stderr, "(%lu:%lu) page is good! lsn: %lu\n", space, page_no, mach_read_from_8(frame + FIL_PAGE_LSN)); + } +#endif + + // we store relative position of nc page + pmem_nc_buffer_map[std::make_pair(space,page_no)].push_back(i*sizeof(buf_block_t)); + } + + // nc_page_map + unsigned char *page_addr = gb_pm_mmap + (10*1024*1024*1024UL); + for (uint64_t i=0; i < page_num_chunks; ++i) { + + space = mach_read_from_4( + page_addr + (i*4096UL) + FIL_PAGE_SPACE_ID); + page_no = mach_read_from_4( + page_addr + (i*4096UL) + FIL_PAGE_OFFSET); + + if (space != 27 && space != 29 && space != 31) { + fprintf(stderr, "[DEBUG] we miss the pages %lu:%lu\n", space, page_no); + if (space == 4294967295 + && page_no == 4294967295) { + continue; + } else { + break; + } + } else { + +#ifdef UNIV_DEBUG + ib::info() << "obtaine NC page in buffer: " << space << ":" << page_no; + // check + if (space != mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + || page_no != mach_read_from_4(frame + FIL_PAGE_OFFSET)) { + ib::info() << " wrong NC page frame info expected: " + << space << ":" << page_no + << " current value: " << mach_read_from_4(frame + FIL_PAGE_SPACE_ID) + << ":" << mach_read_from_4(frame + FIL_PAGE_OFFSET); + } +#endif + + } + pmem_nc_page_map[std::make_pair(space,page_no)].push_back(i*4096UL); + } + + // +} + +void nc_save_pmem_lsn() { + ib::info() << "xxx pmem_lsn save !!!: " << pmem_lsn; + memcpy((gb_pm_mmap + 6*1024*1024*1024UL), &pmem_lsn, sizeof(uint64_t)); + flush_cache((gb_pm_mmap + 6*1024*1024*1024UL), sizeof(uint64_t)); +} + diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 542d7907..20e180c4 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -253,6 +253,8 @@ ulong srv_nvdimm_buf_pool_instances = 1; ulong srv_nvdimm_pc_threshold_pct = 2; /** NVDIMM-aware file resident directory */ char* srv_nvdimm_home_dir = NULL; +/** NVDIMM DWB enable */ +my_bool srv_use_nvdimm_dwb = FALSE; #endif /* UNIV_NVDIMM_CACHE */ /** Requested size in bytes */ diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 383f3fa1..7a8d8f68 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -109,6 +109,10 @@ Created 2/16/1996 Heikki Tuuri #include "pmem_mmap_obj.h" extern unsigned char* gb_pm_mmap; char PMEM_FILE_PATH [PMEM_MMAP_MAX_FILE_NAME_LENGTH]; +pfs_os_file_t gb_pm_dwb_file; +ticks start_time=0; +ticks end_time=0; +unsigned recovery_time=0; #endif /* UNIV_NVDIMM_CACHE */ #ifdef HAVE_LZO1X @@ -1473,6 +1477,11 @@ innobase_start_or_create_for_mysql(void) size_t dirnamelen; unsigned i = 0; + // HOT_DEBUG RECOVERY +#ifdef UNIV_NVDIMM_CACHE + start_time = getticks(); +#endif + /* Reset the start state. */ srv_start_state = SRV_START_STATE_NONE; @@ -1493,7 +1502,7 @@ innobase_start_or_create_for_mysql(void) #ifdef UNIV_NVDIMM_CACHE sprintf(PMEM_FILE_PATH, "%s/%s", srv_nvdimm_home_dir, NVDIMM_MMAP_FILE_NAME); - size_t srv_pmem_pool_size = 3 * 1024; + size_t srv_pmem_pool_size = 12 * 1024; uint64_t pool_size = srv_pmem_pool_size * 1024 * 1024UL; gb_pm_mmap = pm_mmap_create(PMEM_FILE_PATH, pool_size); if (!gb_pm_mmap) { @@ -1501,7 +1510,7 @@ innobase_start_or_create_for_mysql(void) assert(gb_pm_mmap); } - if (!is_pmem_recv) { + //if (!is_pmem_recv) { // for debugging : chagne the mtr log region size // original : 1024*1024*1024*8UL (8GB) pm_mmap_mtrlogbuf_init(1024*1024*1024*1UL); // 1GB for test @@ -1509,9 +1518,7 @@ innobase_start_or_create_for_mysql(void) // TODO(jhpark): change buffer pool recovery policy // buffer retion initialization (2GB) pm_mmap_buf_init(1024*1024*1024*2UL); - } - - //pm_mmap_buf_init(1024*1024*1024*3UL); + //} #endif /* UNIV_NVDIMM_CACHE */ @@ -2301,6 +2308,22 @@ innobase_start_or_create_for_mysql(void) fprintf(stderr, "[JONGQ] ---- scan_and_parse log file finished\n"); +#ifdef UNIV_NVDIMM_CACHE + if (is_pmem_recv) { + nc_recv_analysis(); + } + // (jhpark): ignore now + /* + else { + // HOT DEBUG + pmem_lsn = flushed_lsn; + nc_save_pmem_lsn(); + } + */ +#endif + + + /* We always try to do a recovery, even if the database had been shut down normally: this is the normal startup path */ @@ -2332,32 +2355,14 @@ innobase_start_or_create_for_mysql(void) return(srv_init_abort(DB_ERROR)); } -#ifdef UNIV_NVDIMM_CACHE - fprintf(stderr, "[JONGQ] ---- pass force recovery!\n"); - -// TODO(jhpark): NC recovery check !!!!! - if (is_pmem_recv) { - PMEMMMAP_INFO_PRINT("YES!!!! recovery!!!! start_offset: %lu end_offset: %lu\n" - ,pmem_recv_offset, pmem_recv_size); -// pm_mmap_recv(pmem_recv_offset, pmem_recv_size); -// PMEMMMAP_INFO_PRINT("UNDO page is recoverd !!!!\n"); -// //pm_mmap_recv_flush_buffer(); - } -#endif /* UNIV_NVDIMM_CACHE */ purge_queue = trx_sys_init_at_db_start(); - fprintf(stderr, "[JONGQ] ---- trx_sys_init_at_db_start finished!\n"); - if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { /* Apply the hashed log records to the respective file pages, for the last batch of recv_group_scan_log_recs(). */ -#ifdef UNIV_NVDIMM_CACHE - PMEMMMAP_INFO_PRINT("JONGQ recovery-4-1\n"); -#endif /* UNIV_NVDIMM_CACHE */ - recv_apply_hashed_log_recs(TRUE); DBUG_PRINT("ib_log", ("apply completed")); @@ -2366,10 +2371,6 @@ innobase_start_or_create_for_mysql(void) } } -#ifdef UNIV_NVDIMM_CACHE - PMEMMMAP_INFO_PRINT("JONGQ recovery-5\n"); -#endif /* UNIV_NVDIMM_CACHE */ - if (recv_sys->found_corrupt_log) { ib::warn() << "The log file may have been corrupt and it" @@ -2579,9 +2580,6 @@ innobase_start_or_create_for_mysql(void) variable srv_available_undo_logs. The number of rsegs to use can be set using the dynamic global variable srv_rollback_segments. */ - // debug - fprintf(stderr, "[JONGQ] initialize undo log lists\n"); - srv_available_undo_logs = trx_sys_create_rsegs( srv_undo_tablespaces, srv_rollback_segments, srv_tmp_undo_logs); diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index 941bbd0a..d8e9d32f 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -2072,23 +2072,8 @@ trx_undo_report_row_operation( mtr_commit(&mtr); } else { /* Success */ - undo->withdraw_clock = buf_withdraw_clock; - - // FIXME(jhpark): for NVDIMM resident pages, we don't need to flush mtr log to WAL log buffer - // just release the mtr structure. -#ifdef UNIV_NVDIMM_CACHE - if (is_nvm_page) { - //ulint space = index->space; - //ulint page = index->page; - //fprintf(stderr, "[mtr-commit] space : %lu page : %lu\n", space, page); - //mtr_commit_nvm(&mtr, space, page); - mtr_commit_no_nvm(&mtr); - } else { - mtr_commit(&mtr); - } -#else - mtr_commit(&mtr); -#endif /* UNIV_NVDIMM_CACHE */ + undo->withdraw_clock = buf_withdraw_clock; + mtr_commit(&mtr); undo->empty = FALSE; undo->top_page_no = page_no; diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 5c23e749..5eaecc01 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -784,10 +784,7 @@ trx_resurrect_table_locks( trx->mod_tables.insert(table); } lock_table_ix_resurrect(table, trx); - - // debugging - fprintf(stderr, "ib_trx resurrect %d table %s IX lock from %s undo", trx_get_id_for_print(trx), table->name.m_name, undo == undo_ptr->insert_undo ? "insert" : "update"); - + DBUG_PRINT("ib_trx", ("resurrect" TRX_ID_FMT " table '%s' IX lock from %s undo",