From 1a4d9a5e436effee930262a0146a4ae712fb8926 Mon Sep 17 00:00:00 2001 From: lbesnard Date: Tue, 27 Jan 2026 15:50:51 +1100 Subject: [PATCH 1/8] Fix: NRS Darwin Yongala - fix http to https - working order --- ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py | 414 +++++++---- lib/python/aims_realtime_util.py | 867 +++++++++++++----------- 2 files changed, 752 insertions(+), 529 deletions(-) diff --git a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py index d7cf77f4..2eb53e75 100755 --- a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py +++ b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py @@ -40,80 +40,102 @@ from itertools import groupby from tendo import singleton -from aims_realtime_util import (convert_time_cf_to_imos, - create_list_of_dates_to_download, download_channel, - fix_data_code_from_filename, - fix_provider_code_from_filename, - has_var_only_fill_value, - is_no_data_found, is_time_monotonic, - is_time_var_empty, logging_aims, md5, - modify_aims_netcdf, parse_aims_xml, - remove_dimension_from_netcdf, - remove_end_date_from_filename, save_channel_info, - set_up, rm_tmp_dir, get_main_netcdf_var, - list_recursively_files_abs_path) +from aims_realtime_util import ( + convert_time_cf_to_imos, + create_list_of_dates_to_download, + download_channel, + fix_data_code_from_filename, + fix_provider_code_from_filename, + has_var_only_fill_value, + is_no_data_found, + is_time_monotonic, + is_time_var_empty, + logging_aims, + md5, + modify_aims_netcdf, + parse_aims_xml, + remove_dimension_from_netcdf, + remove_end_date_from_filename, + save_channel_info, + set_up, + rm_tmp_dir, + get_main_netcdf_var, + list_recursively_files_abs_path, +) from dest_path import get_anmn_nrs_site_name from util import pass_netcdf_checker -DATA_WIP_PATH = os.path.join(os.environ.get('WIP_DIR'), 'ANMN', 'NRS_AIMS_Darwin_Yongala_data_rss_download_temporary') -ANMN_NRS_INCOMING_DIR = os.path.join(os.environ.get('INCOMING_DIR'), 'AODN', 'ANMN_NRS_DAR_YON') -ANMN_NRS_ERROR_DIR = os.path.join(os.environ['ERROR_DIR'], 'ANMN_NRS_DAR_YON') +MD5_EXPECTED_VALUE = "a6207e053f1cc0e00d171701f0cdb186" + +DATA_WIP_PATH = os.path.join( + os.environ.get("WIP_DIR"), + "ANMN", + "NRS_AIMS_Darwin_Yongala_data_rss_download_temporary", +) +ANMN_NRS_INCOMING_DIR = os.path.join( + os.environ.get("INCOMING_DIR"), "AODN", "ANMN_NRS_DAR_YON" +) +ANMN_NRS_ERROR_DIR = os.path.join(os.environ["ERROR_DIR"], "ANMN_NRS_DAR_YON") def modify_anmn_nrs_netcdf(netcdf_file_path, channel_id_info): - """ Modify the downloaded netCDF file so it passes both CF and IMOS checker + """Modify the downloaded netCDF file so it passes both CF and IMOS checker input: netcdf_file_path(str) : path of netcdf file to modify channel_id_index(tupple) : information from xml for the channel """ modify_aims_netcdf(netcdf_file_path, channel_id_info) - netcdf_file_obj = Dataset(netcdf_file_path, 'a', format='NETCDF4') - netcdf_file_obj.aims_channel_id = int(channel_id_info['channel_id']) - - if 'Yongala' in channel_id_info['site_name']: - netcdf_file_obj.site_code = 'NRSYON' - netcdf_file_obj.platform_code = 'Yongala NRS Buoy' - elif 'Darwin' in channel_id_info['site_name']: - netcdf_file_obj.site_code = 'NRSDAR' - netcdf_file_obj.platform_code = 'Darwin NRS Buoy' - elif 'Beagle' in channel_id_info['site_name']: - netcdf_file_obj.site_code = 'DARBGF' - netcdf_file_obj.platform_code = 'Beagle Gulf Mooring' + netcdf_file_obj = Dataset(netcdf_file_path, "a", format="NETCDF4") + netcdf_file_obj.aims_channel_id = int(channel_id_info["channel_id"]) + + if "Yongala" in 
channel_id_info["site_name"]: + netcdf_file_obj.site_code = "NRSYON" + netcdf_file_obj.platform_code = "Yongala NRS Buoy" + elif "Darwin" in channel_id_info["site_name"]: + netcdf_file_obj.site_code = "NRSDAR" + netcdf_file_obj.platform_code = "Darwin NRS Buoy" + elif "Beagle" in channel_id_info["site_name"]: + netcdf_file_obj.site_code = "DARBGF" + netcdf_file_obj.platform_code = "Beagle Gulf Mooring" else: return False - if not (channel_id_info['metadata_uuid'] == 'Not Available'): - netcdf_file_obj.metadata_uuid = channel_id_info['metadata_uuid'] + if not (channel_id_info["metadata_uuid"] == "Not Available"): + netcdf_file_obj.metadata_uuid = channel_id_info["metadata_uuid"] # some weather stations channels don't have a depth variable if sensor above water - if 'depth' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['depth'] - var.long_name = 'nominal depth' - var.positive = 'down' - var.axis = 'Z' - var.reference_datum = 'sea surface' - var.valid_min = -10.0 - var.valid_max = 30.0 - var.units = 'm' # some channels put degrees celcius instead ... - netcdf_file_obj.renameVariable('depth', 'NOMINAL_DEPTH') - - if 'DEPTH' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['DEPTH'] - var.coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" - var.long_name = 'actual depth' - var.reference_datum = 'sea surface' - var.positive = 'down' - var.valid_min = -10.0 - var.valid_max = 30.0 - var.units = 'm' # some channels put degrees celcius instead ... + if "depth" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["depth"] + var.long_name = "nominal depth" + var.positive = "down" + var.axis = "Z" + var.reference_datum = "sea surface" + var.valid_min = -10.0 + var.valid_max = 30.0 + var.units = "m" # some channels put degrees celcius instead ... + netcdf_file_obj.renameVariable("depth", "NOMINAL_DEPTH") + + if "DEPTH" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["DEPTH"] + var.coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" + var.long_name = "actual depth" + var.reference_datum = "sea surface" + var.positive = "down" + var.valid_min = -10.0 + var.valid_max = 30.0 + var.units = "m" # some channels put degrees celcius instead ... netcdf_file_obj.close() - netcdf_file_obj = Dataset(netcdf_file_path, 'a', format='NETCDF4') # need to close to save to file. as we call get_main_var just after - main_var = get_main_netcdf_var(netcdf_file_path) + netcdf_file_obj = Dataset( + netcdf_file_path, "a", format="NETCDF4" + ) # need to close to save to file. as we call get_main_var just after + main_var = get_main_netcdf_var(netcdf_file_path) # DEPTH, LATITUDE and LONGITUDE are not dimensions, so we make them into auxiliary cooordinate variables by adding this attribute - if 'NOMINAL_DEPTH' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables[main_var].coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" + if "NOMINAL_DEPTH" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables[ + main_var + ].coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" else: netcdf_file_obj.variables[main_var].coordinates = "TIME LATITUDE LONGITUDE" @@ -122,20 +144,27 @@ def modify_anmn_nrs_netcdf(netcdf_file_path, channel_id_info): if not convert_time_cf_to_imos(netcdf_file_path): return False - remove_dimension_from_netcdf(netcdf_file_path) # last modification to do in this order! + remove_dimension_from_netcdf( + netcdf_file_path + ) # last modification to do in this order! 
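# Aside: a minimal, self-contained sketch (not part of this patch) illustrating the
# auxiliary-coordinate pattern applied above. LATITUDE, LONGITUDE and NOMINAL_DEPTH
# are not dimensions of the data variable, so CF expects them to be listed in its
# "coordinates" attribute. The temporary file and the "TEMP" variable are
# illustrative only; only the netCDF4 library already used in this patch is assumed.
import tempfile

from netCDF4 import Dataset

sketch_path = tempfile.mkstemp(suffix=".nc")[1]
with Dataset(sketch_path, "w", format="NETCDF4") as nc:
    nc.createDimension("TIME", 2)
    time_var = nc.createVariable("TIME", "f8", ("TIME",))
    time_var[:] = [0.0, 1.0]
    # scalar auxiliary coordinate variables (no dimensions of their own)
    for aux in ("LATITUDE", "LONGITUDE", "NOMINAL_DEPTH"):
        nc.createVariable(aux, "f8").assignValue(0.0)
    temp = nc.createVariable("TEMP", "f8", ("TIME",))
    temp[:] = [25.1, 25.3]
    # the attribute that turns the scalars into auxiliary coordinates of TEMP
    temp.coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH"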
return True def move_to_tmp_incoming(netcdf_path): # [org_filename withouth creation date].[md5].nc to have unique filename in - new_filename = '%s.%s.nc' % (os.path.splitext(os.path.basename(remove_end_date_from_filename(netcdf_path)))[0], md5(netcdf_path)) + new_filename = "%s.%s.nc" % ( + os.path.splitext(os.path.basename(remove_end_date_from_filename(netcdf_path)))[ + 0 + ], + md5(netcdf_path), + ) os.chmod(netcdf_path, 0o0664) # change to 664 for pipeline v2 shutil.move(netcdf_path, os.path.join(TMP_MANIFEST_DIR, new_filename)) def process_monthly_channel(channel_id, aims_xml_info, level_qc): - """ Downloads all the data available for one channel_id and moves the file to a wip_path dir + """Downloads all the data available for one channel_id and moves the file to a wip_path dir channel_id(str) aims_xml_info(tuple) level_qc(int) @@ -145,88 +174,139 @@ def process_monthly_channel(channel_id, aims_xml_info, level_qc): 300 -> NRS DATA for monthly data download, only 1 and 300 should be use """ - logger.info('QC{level_qc} - Processing channel {channel_id}'.format(channel_id=str(channel_id), - level_qc=str(level_qc))) + logger.info( + "QC{level_qc} - Processing channel {channel_id}".format( + channel_id=str(channel_id), level_qc=str(level_qc) + ) + ) channel_id_info = aims_xml_info[channel_id] - from_date = channel_id_info['from_date'] - thru_date = channel_id_info['thru_date'] - [start_dates, end_dates] = create_list_of_dates_to_download(channel_id, level_qc, from_date, thru_date) + from_date = channel_id_info["from_date"] + thru_date = channel_id_info["thru_date"] + [start_dates, end_dates] = create_list_of_dates_to_download( + channel_id, level_qc, from_date, thru_date + ) if len(start_dates) != 0: # download monthly file for start_date, end_date in zip(start_dates, end_dates): - start_date = start_date.strftime("%Y-%m-%dT%H:%M:%SZ") - end_date = end_date.strftime("%Y-%m-%dT%H:%M:%SZ") - netcdf_tmp_file_path = download_channel(channel_id, start_date, end_date, level_qc) - contact_aims_msg = "Process of channel aborted - CONTACT AIMS" + start_date = start_date.strftime("%Y-%m-%dT%H:%M:%SZ") + end_date = end_date.strftime("%Y-%m-%dT%H:%M:%SZ") + netcdf_tmp_file_path = download_channel( + channel_id, start_date, end_date, level_qc + ) + contact_aims_msg = "Process of channel aborted - CONTACT AIMS" if netcdf_tmp_file_path is None: - logger.error(' Channel %s - not valid zip file - %s' % (str(channel_id), contact_aims_msg)) + logger.error( + " Channel %s - not valid zip file - %s" + % (str(channel_id), contact_aims_msg) + ) break # NO_DATA_FOUND file only means there is no data for the selected time period. 
Could be some data afterwards if is_no_data_found(netcdf_tmp_file_path): - logger.info('Channel {channel_id}: No data for the time period:[{start_date} - {end_date}]'.format( - channel_id=str(channel_id), - start_date=start_date, - end_date=end_date)) + logger.info( + "Channel {channel_id}: No data for the time period:[{start_date} - {end_date}]".format( + channel_id=str(channel_id), + start_date=start_date, + end_date=end_date, + ) + ) shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) else: if is_time_var_empty(netcdf_tmp_file_path): - logger.error('Channel {channel_id}: No values in TIME variable - {message}'.format( - channel_id=str(channel_id), - message=contact_aims_msg)) + logger.error( + "Channel {channel_id}: No values in TIME variable - {message}".format( + channel_id=str(channel_id), message=contact_aims_msg + ) + ) shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) break if not modify_anmn_nrs_netcdf(netcdf_tmp_file_path, channel_id_info): - logger.error('Channel{channel_id}: Could not modify the NetCDF file - Process of channel aborted'. - format(channel_id=str(channel_id))) + logger.error( + "Channel{channel_id}: Could not modify the NetCDF file - Process of channel aborted".format( + channel_id=str(channel_id) + ) + ) shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) break main_var = get_main_netcdf_var(netcdf_tmp_file_path) if has_var_only_fill_value(netcdf_tmp_file_path, main_var): - logger.error('Channel {channel_id}: _Fillvalues only in main variable - {message}'.format( - channel_id=str(channel_id), - message=contact_aims_msg)) + logger.error( + "Channel {channel_id}: _Fillvalues only in main variable - {message}".format( + channel_id=str(channel_id), message=contact_aims_msg + ) + ) shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) break if get_anmn_nrs_site_name(netcdf_tmp_file_path) == []: - logger.error('Channel {channel_id}: Unknown site_code gatt value - {message}'.format( - channel_id=str(channel_id), - message=contact_aims_msg)) + logger.error( + "Channel {channel_id}: Unknown site_code gatt value - {message}".format( + channel_id=str(channel_id), message=contact_aims_msg + ) + ) shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) break if not is_time_monotonic(netcdf_tmp_file_path): - logger.error('Channel {channel_id}: TIME value is not strictly monotonic \ - - {message}'.format(channel_id=str(channel_id), - message=contact_aims_msg)) + logger.error( + "Channel {channel_id}: TIME value is not strictly monotonic \ + - {message}".format( + channel_id=str(channel_id), message=contact_aims_msg + ) + ) shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) break # check every single file of the list. We don't assume that if one passes, all pass ... 
past proved this - wip_path = os.environ.get('data_wip_path') - checker_retval = pass_netcdf_checker(netcdf_tmp_file_path, tests=['cf:1.6', 'imos:1.3']) + wip_path = os.environ.get("data_wip_path") + checker_retval = pass_netcdf_checker( + netcdf_tmp_file_path, tests=["cf:1.6", "imos:1.3"] + ) if not checker_retval: - logger.error('Channel {channel_id}: File does not pass CF/IMOS compliance checker - Process of channel aborted' - .format(channel_id=str(channel_id))) - shutil.copy(netcdf_tmp_file_path, os.path.join(wip_path, 'errors')) - - logger.error('File copied to {path} for debugging'.format( - path=os.path.join(wip_path, 'errors', os.path.basename(netcdf_tmp_file_path)))) + logger.error( + "Channel {channel_id}: File does not pass CF/IMOS compliance checker - Process of channel aborted".format( + channel_id=str(channel_id) + ) + ) + shutil.copy(netcdf_tmp_file_path, os.path.join(wip_path, "errors")) + + logger.error( + "File copied to {path} for debugging".format( + path=os.path.join( + wip_path, + "errors", + os.path.basename(netcdf_tmp_file_path), + ) + ) + ) shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) break netcdf_tmp_file_path = fix_data_code_from_filename(netcdf_tmp_file_path) - netcdf_tmp_file_path = fix_provider_code_from_filename(netcdf_tmp_file_path, 'IMOS_ANMN') - - if re.search('IMOS_ANMN_[A-Z]{1}_', netcdf_tmp_file_path) is None: - logger.error(' Channel %s - File name Data code does not pass REGEX - Process of channel aborted' % str(channel_id)) - shutil.copy(netcdf_tmp_file_path, os.path.join(wip_path, 'errors')) - logger.error(' File copied to %s for debugging' % (os.path.join(wip_path, 'errors', os.path.basename(netcdf_tmp_file_path)))) + netcdf_tmp_file_path = fix_provider_code_from_filename( + netcdf_tmp_file_path, "IMOS_ANMN" + ) + + if re.search("IMOS_ANMN_[A-Z]{1}_", netcdf_tmp_file_path) is None: + logger.error( + " Channel %s - File name Data code does not pass REGEX - Process of channel aborted" + % str(channel_id) + ) + shutil.copy(netcdf_tmp_file_path, os.path.join(wip_path, "errors")) + logger.error( + " File copied to %s for debugging" + % ( + os.path.join( + wip_path, + "errors", + os.path.basename(netcdf_tmp_file_path), + ) + ) + ) shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) break @@ -240,70 +320,89 @@ def process_monthly_channel(channel_id, aims_xml_info, level_qc): save_channel_info(channel_id, aims_xml_info, level_qc, end_date) else: - logger.info('QC{level_qc} - Channel {channel_id}: already up to date'.format(channel_id=str(channel_id), - level_qc=str(level_qc))) + logger.info( + "QC{level_qc} - Channel {channel_id}: already up to date".format( + channel_id=str(channel_id), level_qc=str(level_qc) + ) + ) def process_qc_level(level_qc): - """ Downloads all channels for a QC level + """Downloads all channels for a QC level level_qc(int) : 0 or 1 """ - logger.info('Process ANMN NRS download from AIMS web service - QC level {level_qc}'.format(level_qc=level_qc)) - xml_url = 'https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level{level_qc}/300'.format(level_qc=level_qc) + logger.info( + "Process ANMN NRS download from AIMS web service - QC level {level_qc}".format( + level_qc=level_qc + ) + ) + xml_url = "https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level{level_qc}/300".format( + level_qc=level_qc + ) try: aims_xml_info = parse_aims_xml(xml_url) except Exception as err: - logger.critical('RSS feed not available') + logger.critical("RSS feed not available") exit(1) for channel_id in aims_xml_info.keys(): try: 
process_monthly_channel(channel_id, aims_xml_info, level_qc) except Exception as err: - logger.error('QC{qc_level} - Channel {channel_id}: Failed, unknown reason - manual debug required'.format( - channel_id=str(channel_id), - qc_level=str(level_qc))) + logger.error( + "QC{qc_level} - Channel {channel_id}: Failed, unknown reason - manual debug required".format( + channel_id=str(channel_id), qc_level=str(level_qc) + ) + ) logger.error(traceback.print_exc()) class AimsDataValidationTest(data_validation_test.TestCase): - def setUp(self): - """ Check that a the AIMS system or this script hasn't been modified. + """Check that a the AIMS system or this script hasn't been modified. This function checks that a downloaded file still has the same md5. """ - channel_id = '84329' - from_date = '2016-01-01T00:00:00Z' - thru_date = '2016-01-02T00:00:00Z' - level_qc = 1 - aims_rss_val = 300 - xml_url = 'https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level%s/%s' % (str(level_qc), str(aims_rss_val)) - - logger.info('Data validation unittests...') - aims_xml_info = parse_aims_xml(xml_url) + channel_id = "84329" + from_date = "2016-01-01T00:00:00Z" + thru_date = "2016-01-02T00:00:00Z" + level_qc = 1 + aims_rss_val = 300 + xml_url = ( + "https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level%s/%s" + % (str(level_qc), str(aims_rss_val)) + ) + + logger.info("Data validation unittests...") + aims_xml_info = parse_aims_xml(xml_url) channel_id_info = aims_xml_info[channel_id] - self.netcdf_tmp_file_path = download_channel(channel_id, from_date, thru_date, level_qc) + self.netcdf_tmp_file_path = download_channel( + channel_id, from_date, thru_date, level_qc + ) modify_anmn_nrs_netcdf(self.netcdf_tmp_file_path, channel_id_info) # force values of attributes which change all the time - netcdf_file_obj = Dataset(self.netcdf_tmp_file_path, 'a', format='NETCDF4') + netcdf_file_obj = Dataset(self.netcdf_tmp_file_path, "a", format="NETCDF4") netcdf_file_obj.date_created = "1970-01-01T00:00:00Z" # epoch - netcdf_file_obj.history = 'data validation test only' - netcdf_file_obj.NCO = 'NCO_VERSION' + netcdf_file_obj.history = "data validation test only" + netcdf_file_obj.NCO = "NCO_VERSION" netcdf_file_obj.close() def tearDown(self): - shutil.copy(self.netcdf_tmp_file_path, os.path.join(os.environ['data_wip_path'], 'nc_unittest_%s.nc' % self.md5_netcdf_value)) + shutil.copy( + self.netcdf_tmp_file_path, + os.path.join( + os.environ["data_wip_path"], "nc_unittest_%s.nc" % self.md5_netcdf_value + ), + ) shutil.rmtree(os.path.dirname(self.netcdf_tmp_file_path)) def test_aims_validation(self): if sys.version_info[0] < 3: - self.md5_expected_value = '76c9a595264a8173545b6dc0c518a280' + self.md5_expected_value = "76c9a595264a8173545b6dc0c518a280" else: - self.md5_expected_value = '78c6386529faf9dc2272e9bed5ed7fa2' - + self.md5_expected_value = MD5_EXPECTED_VALUE self.md5_netcdf_value = md5(self.netcdf_tmp_file_path) self.assertEqual(self.md5_netcdf_value, self.md5_expected_value) @@ -315,19 +414,24 @@ def args(): :return: vargs """ parser = argparse.ArgumentParser() - parser.add_argument("-t", "--testing", - action='store_true', - help="testing only - downloads the first month of each channel") + parser.add_argument( + "-t", + "--testing", + action="store_true", + help="testing only - downloads the first month of each channel", + ) return parser.parse_args() -if __name__ == '__main__': +if __name__ == "__main__": vargs = args() me = singleton.SingleInstance() - os.environ['data_wip_path'] = 
os.path.join(os.environ.get('WIP_DIR'), - 'ANMN', - 'NRS_AIMS_Darwin_Yongala_data_rss_download_temporary') + os.environ["data_wip_path"] = os.path.join( + os.environ.get("WIP_DIR"), + "ANMN", + "NRS_AIMS_Darwin_Yongala_data_rss_download_temporary", + ) global TMP_MANIFEST_DIR global TESTING @@ -340,11 +444,13 @@ def args(): # data validation test runner = data_validation_test.TextTestRunner() - itersuite = data_validation_test.TestLoader().loadTestsFromTestCase(AimsDataValidationTest) + itersuite = data_validation_test.TestLoader().loadTestsFromTestCase( + AimsDataValidationTest + ) res = runner.run(itersuite) if not DATA_WIP_PATH: - logger.critical('environment variable data_wip_path is not defined.') + logger.critical("environment variable data_wip_path is not defined.") exit(1) # script optional argument for testing only. used in process_monthly_channel @@ -353,18 +459,19 @@ def args(): rm_tmp_dir(DATA_WIP_PATH) if len(os.listdir(ANMN_NRS_INCOMING_DIR)) >= 2: - logger.critical('Operation aborted, too many files in INCOMING_DIR') + logger.critical("Operation aborted, too many files in INCOMING_DIR") exit(1) if len(os.listdir(ANMN_NRS_ERROR_DIR)) >= 2: - logger.critical('Operation aborted, too many files in ERROR_DIR') + logger.critical("Operation aborted, too many files in ERROR_DIR") exit(1) if not res.failures: for level in [0, 1]: - date_str_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S') - TMP_MANIFEST_DIR = os.path.join(DATA_WIP_PATH, 'manifest_dir_tmp_{date}'.format( - date=date_str_now)) + date_str_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + TMP_MANIFEST_DIR = os.path.join( + DATA_WIP_PATH, "manifest_dir_tmp_{date}".format(date=date_str_now) + ) os.makedirs(TMP_MANIFEST_DIR) process_qc_level(level) @@ -372,17 +479,26 @@ def args(): lines_per_file = 2**12 file_list = list_recursively_files_abs_path(TMP_MANIFEST_DIR) if len(file_list) > 0: - for file_number, lines in groupby(enumerate(file_list), key=lambda x: x[0] // lines_per_file): - incoming_file = os.path.join(DATA_WIP_PATH, 'anmn_nrs_aims_FV0{level}_{date}_{file_number}.manifest'.format( - level=str(level), - date=date_str_now, - file_number=file_number)) - with open(incoming_file, 'w') as outfile: + for file_number, lines in groupby( + enumerate(file_list), key=lambda x: x[0] // lines_per_file + ): + incoming_file = os.path.join( + DATA_WIP_PATH, + "anmn_nrs_aims_FV0{level}_{date}_{file_number}.manifest".format( + level=str(level), date=date_str_now, file_number=file_number + ), + ) + with open(incoming_file, "w") as outfile: for item in lines: outfile.write("%s\n" % item[1]) os.chmod(incoming_file, 0o0664) # change to 664 for pipeline v2 - shutil.move(incoming_file, os.path.join(ANMN_NRS_INCOMING_DIR, os.path.basename(incoming_file))) + shutil.move( + incoming_file, + os.path.join( + ANMN_NRS_INCOMING_DIR, os.path.basename(incoming_file) + ), + ) else: - logger.error('Data validation unittests failed') + logger.error("Data validation unittests failed") diff --git a/lib/python/aims_realtime_util.py b/lib/python/aims_realtime_util.py index 4daad1b7..21446e7c 100755 --- a/lib/python/aims_realtime_util.py +++ b/lib/python/aims_realtime_util.py @@ -1,4 +1,4 @@ -""" set of tools to +"""set of tools to - parse AIMS RSS feed web pages - create a list of monthly timestamps to download - generate URL to download (with regards to what has already been downloaded @@ -10,6 +10,7 @@ author Laurent Besnard, laurent.besnard@utas.edu.au """ + import datetime import glob import json @@ -48,34 +49,36 @@ def logging_aims(): 
- """ start logging using logging python library + """start logging using logging python library output: logger - similar to a file handler """ - wip_path = os.environ.get('data_wip_path') + wip_path = os.environ.get("data_wip_path") # this is used for unit testing as data_wip_path env would not be set if wip_path is None: wip_path = tempfile.mkdtemp() - logging_format = "%(asctime)s — %(name)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s" + logging_format = ( + "%(asctime)s — %(name)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s" + ) # set up logging to file - tmp_filename = tempfile.mkstemp('.log', 'aims_data_download_')[1] - log_path = os.path.join(wip_path, 'aims.log') - logging.basicConfig(level=logging.INFO, - format=logging_format, - filename=tmp_filename, - filemode='a+') + tmp_filename = tempfile.mkstemp(".log", "aims_data_download_")[1] + log_path = os.path.join(wip_path, "aims.log") + logging.basicConfig( + level=logging.INFO, format=logging_format, filename=tmp_filename, filemode="a+" + ) # rotate logs every Day, and keep only the last 5 log files - logHandler = TimedRotatingFileHandler(log_path, - when="D", - interval=1, - backupCount=5, # backupCount files will be kept - ) + logHandler = TimedRotatingFileHandler( + log_path, + when="D", + interval=1, + backupCount=5, # backupCount files will be kept + ) logHandler.setFormatter(logging.Formatter(logging_format)) logHandler.setLevel(logging.DEBUG) - logging.getLogger('').addHandler(logHandler) + logging.getLogger("").addHandler(logHandler) # define a Handler which writes DEBUG messages to the sys.stderr logFormatter = logging.Formatter(logging_format) @@ -84,7 +87,7 @@ def logging_aims(): consoleHandler.setFormatter(logFormatter) # add the console handler to the root logger - logging.getLogger('').addHandler(consoleHandler) + logging.getLogger("").addHandler(consoleHandler) #################### @@ -93,33 +96,33 @@ def logging_aims(): def _pickle_filename(level_qc): - """ returns the pickle filepath according to the QC level being processed + """returns the pickle filepath according to the QC level being processed input: level_qc(int) : 0 or 1 output: picleQc_file(str) : pickle file path """ - wip_path = os.environ.get('data_wip_path') + wip_path = os.environ.get("data_wip_path") if wip_path is None: - raise ValueError('data_wip_path enviromnent variable is not set') + raise ValueError("data_wip_path enviromnent variable is not set") if level_qc == 0: - pickle_qc_file = os.path.join(wip_path, 'aims_qc0.pickle') + pickle_qc_file = os.path.join(wip_path, "aims_qc0.pickle") elif level_qc == 1: - pickle_qc_file = os.path.join(wip_path, 'aims_qc1.pickle') + pickle_qc_file = os.path.join(wip_path, "aims_qc1.pickle") return pickle_qc_file def delete_channel_id_from_pickle(level_qc, channel_id): pickle_file = _pickle_filename(level_qc) - with open(pickle_file, 'rb') as p_read: + with open(pickle_file, "rb") as p_read: aims_xml_info = pickle.load(p_read) if channel_id in aims_xml_info.keys(): - del(aims_xml_info[channel_id]) + del aims_xml_info[channel_id] - with open(pickle_file, 'wb') as p_write: + with open(pickle_file, "wb") as p_write: pickle.dump(aims_xml_info, p_write) @@ -139,74 +142,78 @@ def delete_platform_entries_from_pickle(level_qc, platform): In [2]: delete_platform_entries_from_pickle(2, 'Beagle') """ pickle_file = _pickle_filename(level_qc) - with open(pickle_file, 'rb') as p_read: + with open(pickle_file, "rb") as p_read: aims_xml_info = pickle.load(p_read) def 
delete_over_list_platform(aims_xml_info, platform): for index_platform, value in enumerate(aims_xml_info): if platform in value: for index_field in range(0, len(aims_xml_info)): - del(aims_xml_info[index_field][platform_name]) + del aims_xml_info[index_field][platform_name] aims_xml_info = delete_over_list_platform(aims_xml_info, platform) return aims_xml_info aims_xml_info_clean = delete_over_list_platform(aims_xml_info, platform) - with open(pickle_file, 'wb') as p_write: + with open(pickle_file, "wb") as p_write: pickle.dump(aims_xml_info_clean, p_write) @retry(URLError, tries=10, delay=3, backoff=2) def urlopen_with_retry(url): - """ it will retry a maximum of 10 times, with an exponential backoff delay + """it will retry a maximum of 10 times, with an exponential backoff delay doubling each time, e.g. 3 seconds, 6 seconds, 12 seconds """ return urlopen(url) -def save_channel_info(channel_id, aims_xml_info, level_qc, *last_downloaded_date_channel): +def save_channel_info( + channel_id, aims_xml_info, level_qc, *last_downloaded_date_channel +): """ - if channel_id has been successfuly processed, we write about it in a pickle file - we write the last downloaded data date for each channel - input: - channel_id(str) : channel_id to save information - aims_xml_info(dict) : generated by parser_aims_xml - level_qc(int) : 0 or 1 - last_downloaded_date_channel is a variable argument, not used by soop trv + if channel_id has been successfuly processed, we write about it in a pickle file + we write the last downloaded data date for each channel + input: + channel_id(str) : channel_id to save information + aims_xml_info(dict) : generated by parser_aims_xml + level_qc(int) : 0 or 1 + last_downloaded_date_channel is a variable argument, not used by soop trv """ pickle_file = _pickle_filename(level_qc) last_downloaded_date = dict() # condition in case the pickle file already exists or not. 
In the first case, # aims_xml_info comes from the pickle, file, otherwise comes from the function arg if os.path.isfile(pickle_file): - with open(pickle_file, 'rb') as p_read: + with open(pickle_file, "rb") as p_read: aims_xml_info_file = pickle.load(p_read) last_downloaded_date = aims_xml_info_file if not last_downloaded_date_channel: # soop trv specific, vararg - last_downloaded_date[channel_id] = aims_xml_info[channel_id]['thru_date'] + last_downloaded_date[channel_id] = aims_xml_info[channel_id]["thru_date"] else: last_downloaded_date[channel_id] = last_downloaded_date_channel[0] else: if not last_downloaded_date_channel: # soop trv specific, vararg - last_downloaded_date[channel_id] = aims_xml_info[channel_id]['thru_date'] + last_downloaded_date[channel_id] = aims_xml_info[channel_id]["thru_date"] else: last_downloaded_date[channel_id] = last_downloaded_date_channel[0] - with open(pickle_file, 'wb') as p_write: + with open(pickle_file, "wb") as p_write: pickle.dump(last_downloaded_date, p_write) def get_last_downloaded_date_channel(channel_id, level_qc, from_date): - """ Retrieve the last date sucessfully downloaded for a channel """ + """Retrieve the last date sucessfully downloaded for a channel""" pickle_file = _pickle_filename(level_qc) # different pickle per QC if os.path.isfile(pickle_file): - with open(pickle_file, 'rb') as p_read: + with open(pickle_file, "rb") as p_read: last_downloaded_date = pickle.load(p_read) - if channel_id in last_downloaded_date.keys(): # check the channel is in the pickle file + if ( + channel_id in last_downloaded_date.keys() + ): # check the channel is in the pickle file if last_downloaded_date[channel_id] is not None: return last_downloaded_date[channel_id] @@ -216,11 +223,15 @@ def get_last_downloaded_date_channel(channel_id, level_qc, from_date): def has_channel_already_been_downloaded(channel_id, level_qc): pickle_file = _pickle_filename(level_qc) # different pickle per QC if os.path.isfile(pickle_file): - with open(pickle_file, 'rb') as p_read: + with open(pickle_file, "rb") as p_read: last_downloaded_date = pickle.load(p_read) - if channel_id in last_downloaded_date.keys(): # check the channel is in the pickle file - if last_downloaded_date[channel_id] is not None: # check the last downloaded_date field + if ( + channel_id in last_downloaded_date.keys() + ): # check the channel is in the pickle file + if ( + last_downloaded_date[channel_id] is not None + ): # check the last downloaded_date field return True else: return False @@ -232,22 +243,28 @@ def has_channel_already_been_downloaded(channel_id, level_qc): def create_list_of_dates_to_download(channel_id, level_qc, from_date, thru_date): - """ generate a list of monthly start dates and end dates to download FAIMMS and NRS data """ + """generate a list of monthly start dates and end dates to download FAIMMS and NRS data""" from dateutil import rrule from datetime import datetime from dateutil.relativedelta import relativedelta - last_downloaded_date = get_last_downloaded_date_channel(channel_id, level_qc, from_date) - start_dates = [] - end_dates = [] + last_downloaded_date = get_last_downloaded_date_channel( + channel_id, level_qc, from_date + ) + start_dates = [] + end_dates = [] - from_date = datetime.strptime(from_date, "%Y-%m-%dT%H:%M:%SZ") - thru_date = datetime.strptime(thru_date, "%Y-%m-%dT%H:%M:%SZ") + from_date = datetime.strptime(from_date, "%Y-%m-%dT%H:%M:%SZ") + thru_date = datetime.strptime(thru_date, "%Y-%m-%dT%H:%M:%SZ") last_downloaded_date = 
datetime.strptime(last_downloaded_date, "%Y-%m-%dT%H:%M:%SZ") if last_downloaded_date < thru_date: - for dt in rrule.rrule(rrule.MONTHLY, dtstart=datetime(last_downloaded_date.year, last_downloaded_date.month, 1), until=thru_date): + for dt in rrule.rrule( + rrule.MONTHLY, + dtstart=datetime(last_downloaded_date.year, last_downloaded_date.month, 1), + until=thru_date, + ): start_dates.append(dt) end_dates.append(datetime(dt.year, dt.month, 1) + relativedelta(months=1)) @@ -263,14 +280,14 @@ def list_recursively_files_abs_path(path): :return: """ filelist = [] - for filename in glob.glob('{path}/**'.format(path=path), recursive=True): + for filename in glob.glob("{path}/**".format(path=path), recursive=True): if os.path.isfile(filename): filelist.append(os.path.abspath(filename)) return filelist def md5(fname): - """ return a md5 checksum of a file """ + """return a md5 checksum of a file""" import hashlib hash = hashlib.md5() @@ -281,17 +298,17 @@ def md5(fname): def get_main_netcdf_var(netcdf_file_path): - with Dataset(netcdf_file_path, mode='r') as netcdf_file_obj: + with Dataset(netcdf_file_path, mode="r") as netcdf_file_obj: variables = netcdf_file_obj.variables - variables.pop('TIME') - variables.pop('LATITUDE') - variables.pop('LONGITUDE') + variables.pop("TIME") + variables.pop("LATITUDE") + variables.pop("LONGITUDE") - if 'NOMINAL_DEPTH' in variables: - variables.pop('NOMINAL_DEPTH') + if "NOMINAL_DEPTH" in variables: + variables.pop("NOMINAL_DEPTH") - qc_var = [s for s in variables if '_quality_control' in s] + qc_var = [s for s in variables if "_quality_control" in s] if qc_var != []: variables.pop(qc_var[0]) @@ -301,22 +318,29 @@ def get_main_netcdf_var(netcdf_file_path): def is_above_file_limit(json_watchd_name): - """ check if the number of files in INCOMING DIR as set in watch.d/[JSON_WATCHD_NAME.json is above threshold - SOMETHING quite annoying re the pipeline structure : - * the watchd JSON filename maches the ERROR directory - * BUT doesn't match the INCOMING_DIR. the 'path' in the watch.d json file matches the ERROR_DIR""" - - json_fp = os.path.join(os.environ['DATA_SERVICES_DIR'], 'watch.d', '%s.json' % json_watchd_name) + """check if the number of files in INCOMING DIR as set in watch.d/[JSON_WATCHD_NAME.json is above threshold + SOMETHING quite annoying re the pipeline structure : + * the watchd JSON filename maches the ERROR directory + * BUT doesn't match the INCOMING_DIR. 
the 'path' in the watch.d json file matches the ERROR_DIR""" + + json_fp = os.path.join( + os.environ["DATA_SERVICES_DIR"], "watch.d", "%s.json" % json_watchd_name + ) with open(json_fp) as j_data: parsed_json = json.load(j_data) - if len(os.listdir(os.path.join(os.environ['INCOMING_DIR'], parsed_json['path'][0]))) >= int(parsed_json['files_crit']): + if len( + os.listdir(os.path.join(os.environ["INCOMING_DIR"], parsed_json["path"][0])) + ) >= int(parsed_json["files_crit"]): return True - elif len(os.listdir(os.path.join(os.environ['ERROR_DIR'], json_watchd_name))) >= int(parsed_json['files_crit']): + elif len( + os.listdir(os.path.join(os.environ["ERROR_DIR"], json_watchd_name)) + ) >= int(parsed_json["files_crit"]): return True else: return False + ###################### # XML Info Functions # ###################### @@ -324,65 +348,86 @@ def is_above_file_limit(json_watchd_name): @lru_cache(maxsize=100) def parse_aims_xml(xml_url): - """ Download and parse the AIMS XML rss feed """ + """Download and parse the AIMS XML rss feed""" logger = logging.getLogger(__name__) - logger.info('PARSE AIMS xml RSS feed : %s' % (xml_url)) - response = urlopen(xml_url) - html = response.read() - root = ET.fromstring(html) - - n_item_start = 3 # start number for AIMS xml file - - title = [] - link = [] - metadata_uuid = [] - uom = [] - from_date = [] - thru_date = [] - platform_name = [] - site_name = [] - channel_id = [] - parameter = [] - parameter_type = [] - trip_id = [] # soop trv only + logger.info("PARSE AIMS xml RSS feed : %s" % (xml_url)) + response = urlopen(xml_url) + html = response.read() + root = ET.fromstring(html) + + n_item_start = 3 # start number for AIMS xml file + + title = [] + link = [] + metadata_uuid = [] + uom = [] + from_date = [] + thru_date = [] + platform_name = [] + site_name = [] + channel_id = [] + parameter = [] + parameter_type = [] + trip_id = [] # soop trv only for n_item in range(n_item_start, len(root[0])): - title .append(root[0][n_item][0].text) - link .append(root[0][n_item][1].text) - metadata_uuid .append(root[0][n_item][6].text) - uom .append(root[0][n_item][7].text) - from_date .append(root[0][n_item][8].text) - thru_date .append(root[0][n_item][9].text) - platform_name .append(root[0][n_item][10].text) - site_name .append(root[0][n_item][11].text) - channel_id .append(root[0][n_item][12].text) - parameter .append(root[0][n_item][13].text) + title.append(root[0][n_item][0].text) + link.append(root[0][n_item][1].text) + metadata_uuid.append(root[0][n_item][6].text) + uom.append(root[0][n_item][7].text) + from_date.append(root[0][n_item][8].text) + thru_date.append(root[0][n_item][9].text) + platform_name.append(root[0][n_item][10].text) + site_name.append(root[0][n_item][11].text) + channel_id.append(root[0][n_item][12].text) + parameter.append(root[0][n_item][13].text) parameter_type.append(root[0][n_item][14].text) # in case there is no trip id defined by AIMS, we create a fake one, used by SOOP TRV only try: trip_id.append(root[0][n_item][15].text) except IndexError: - dateObject = time.strptime(root[0][n_item][8].text, "%Y-%m-%dT%H:%M:%SZ") - trip_id_fake = str(dateObject.tm_year) + str(dateObject.tm_mon).zfill(2) + str(dateObject.tm_mday).zfill(2) + dateObject = time.strptime(root[0][n_item][8].text, "%Y-%m-%dT%H:%M:%SZ") + trip_id_fake = ( + str(dateObject.tm_year) + + str(dateObject.tm_mon).zfill(2) + + str(dateObject.tm_mday).zfill(2) + ) trip_id.append(trip_id_fake) response.close() - d = [{c: {'title': ttl, - 'channel_id': c, - 'link': lk, - 
'metadata_uuid': muuid, - 'uom': uo, - 'from_date': fro, - 'thru_date': thr, - 'platform_name': pltname, - 'site_name': stname, - 'parameter': para, - 'parameter_type': paratype, - 'trip_id': trid - }} for c, ttl, lk, muuid, uo, fro, thr, pltname, stname, para, paratype, trid in - zip(channel_id, title, link, metadata_uuid, uom, from_date, - thru_date, platform_name, site_name, parameter, parameter_type, trip_id)] + d = [ + { + c: { + "title": ttl, + "channel_id": c, + "link": lk, + "metadata_uuid": muuid, + "uom": uo, + "from_date": fro, + "thru_date": thr, + "platform_name": pltname, + "site_name": stname, + "parameter": para, + "parameter_type": paratype, + "trip_id": trid, + } + } + for c, ttl, lk, muuid, uo, fro, thr, pltname, stname, para, paratype, trid in zip( + channel_id, + title, + link, + metadata_uuid, + uom, + from_date, + thru_date, + platform_name, + site_name, + parameter, + parameter_type, + trip_id, + ) + ] # re-writting the dict to have the channel key as a key value new_dict = {} @@ -392,6 +437,7 @@ def parse_aims_xml(xml_url): return new_dict + ########################################## # Channel Process/Download/Mod Functions # ########################################## @@ -402,9 +448,11 @@ def retry_if_result_none(result): return result is None -@retry(retry_on_result=retry_if_result_none, stop_max_attempt_number=10, wait_fixed=2000) +@retry( + retry_on_result=retry_if_result_none, stop_max_attempt_number=10, wait_fixed=2000 +) def download_channel(channel_id, from_date, thru_date, level_qc): - """ generated the data link to download, and extract the zip file into a temp file + """generated the data link to download, and extract the zip file into a temp file input: channel_id(str) : channel_id to download from_date(str) : str containing the first time to start the download from written in this format 2009-04-21_t10:43:54Z @@ -412,28 +460,38 @@ def download_channel(channel_id, from_date, thru_date, level_qc): level_qc(int) : 0 or 1 """ logger = logging.getLogger(__name__) - tmp_zip_file = tempfile.mkstemp() - netcdf_tmp_path = tempfile.mkdtemp() - url_data_download = 'http://data.aims.gov.au/gbroosdata/services/data/rtds/%s/level%s/raw/raw/%s/%s/netcdf/2' % \ - (channel_id, str(level_qc), from_date, thru_date) + tmp_zip_file = tempfile.mkstemp() + netcdf_tmp_path = tempfile.mkdtemp() + url_data_download = ( + "https://data.aims.gov.au/gbroosdata/services/data/rtds/%s/level%s/raw/raw/%s/%s/netcdf/2" + % (channel_id, str(level_qc), from_date, thru_date) + ) # set the timeout for no data to 120 seconds and enable streaming responses so we don't have to keep the file in memory - headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36" + } request = requests.get(url_data_download, timeout=120, stream=True, headers=headers) if request.status_code == 403: - logger.error('Error 403: access to the requested resource is forbidden - {url}'.format(url=url_data_download)) + logger.error( + "Error 403: access to the requested resource is forbidden - {url}".format( + url=url_data_download + ) + ) return - with open(tmp_zip_file[1], 'wb') as fh: + with open(tmp_zip_file[1], "wb") as fh: # Walk through the request response in chunks of 1024 * 1024 bytes, so 1MiB for chunk in request.iter_content(1024 * 1024): # Write the chunk to the file 
fh.write(chunk) if not zipfile.is_zipfile(tmp_zip_file[1]): - logger.error('%s is not a valid zip file' % url_data_download) + logger.error("%s is not a valid zip file" % url_data_download) os.close(tmp_zip_file[0]) - os.remove(tmp_zip_file[1]) # file object needs to be closed or can end up with too many open files + os.remove( + tmp_zip_file[1] + ) # file object needs to be closed or can end up with too many open files shutil.rmtree(netcdf_tmp_path) return @@ -445,11 +503,14 @@ def download_channel(channel_id, from_date, thru_date, level_qc): zip.close() os.close(tmp_zip_file[0]) - os.remove(tmp_zip_file[1]) # file object needs to be closed or can end up with too many open files + os.remove( + tmp_zip_file[1] + ) # file object needs to be closed or can end up with too many open files - logger.info('%s download: SUCCESS' % url_data_download) + logger.info("%s download: SUCCESS" % url_data_download) return netcdf_file_path + #################################### # Functions to modify NetCDF files # # AIMS NetCDF file specific only # @@ -457,27 +518,27 @@ def download_channel(channel_id, from_date, thru_date, level_qc): def is_no_data_found(netcdf_file_path): - """ Check if the unzipped file is a 'NO_DATA_FOUND' file instead of a netCDF file + """Check if the unzipped file is a 'NO_DATA_FOUND' file instead of a netCDF file this behaviour is correct for FAIMMS and NRS, as it means no data for the selected time period. However it doesn't make sense for SOOP TRV """ - return os.path.basename(netcdf_file_path) == 'NO_DATA_FOUND' + return os.path.basename(netcdf_file_path) == "NO_DATA_FOUND" def rename_netcdf_attribute(object_, old_attribute_name, new_attribute_name): - """ Rename global attribute from netcdf4 dataset object - object = Dataset(netcdf_file, 'a', format='NETCDF4') - old_attribute_name = current gatt name to modify - new_attribute_name = new gatt name + """Rename global attribute from netcdf4 dataset object + object = Dataset(netcdf_file, 'a', format='NETCDF4') + old_attribute_name = current gatt name to modify + new_attribute_name = new gatt name """ setattr(object_, new_attribute_name, getattr(object_, old_attribute_name)) delattr(object_, old_attribute_name) def is_time_var_empty(netcdf_file_path): - """ check if the yet unmodified file (time instead of TIME) has values in its time variable """ - netcdf_file_obj = Dataset(netcdf_file_path, 'r', format='NETCDF4') - var_obj = netcdf_file_obj.variables['time'] + """check if the yet unmodified file (time instead of TIME) has values in its time variable""" + netcdf_file_obj = Dataset(netcdf_file_path, "r", format="NETCDF4") + var_obj = netcdf_file_obj.variables["time"] if var_obj.shape[0] == 0: return True @@ -489,15 +550,19 @@ def is_time_var_empty(netcdf_file_path): def convert_time_cf_to_imos(netcdf_file_path): - """ convert a CF time into an IMOS one forced to be 'days since 1950-01-01 00:00:00' + """convert a CF time into an IMOS one forced to be 'days since 1950-01-01 00:00:00' the variable HAS to be 'TIME' """ try: - netcdf_file_obj = Dataset(netcdf_file_path, 'a', format='NETCDF4') - time = netcdf_file_obj.variables['TIME'] - dtime = num2date(time[:], time.units, time.calendar) # this gives an array of datetime objects - time.units = 'days since 1950-01-01 00:00:00 UTC' - time[:] = date2num(dtime, time.units, time.calendar) # conversion to IMOS recommended time + netcdf_file_obj = Dataset(netcdf_file_path, "a", format="NETCDF4") + time = netcdf_file_obj.variables["TIME"] + dtime = num2date( + time[:], time.units, time.calendar 
+ ) # this gives an array of datetime objects + time.units = "days since 1950-01-01 00:00:00 UTC" + time[:] = date2num( + dtime, time.units, time.calendar + ) # conversion to IMOS recommended time netcdf_file_obj.close() return True except: @@ -507,13 +572,13 @@ def convert_time_cf_to_imos(netcdf_file_path): def strictly_increasing(list): - """ check monotocity of list of values""" + """check monotocity of list of values""" return all(x < y for x, y in zip(list, list[1:])) def is_time_monotonic(netcdf_file_path): - netcdf_file_obj = Dataset(netcdf_file_path, 'r', format='NETCDF4') - time = netcdf_file_obj.variables['TIME'][:] + netcdf_file_obj = Dataset(netcdf_file_path, "r", format="NETCDF4") + time = netcdf_file_obj.variables["TIME"][:] netcdf_file_obj.close() if not strictly_increasing(time): return False @@ -521,77 +586,87 @@ def is_time_monotonic(netcdf_file_path): def modify_aims_netcdf(netcdf_file_path, channel_id_info): - """ Modify the downloaded netCDF file so it passes both CF and IMOS checker + """Modify the downloaded netCDF file so it passes both CF and IMOS checker input: netcdf_file_path(str) : path of netcdf file to modify channel_id_index(dict) : information from xml for the channel """ - imos_env_path = os.path.join(os.environ.get('DATA_SERVICES_DIR'), 'lib', 'netcdf', 'imos_env') + imos_env_path = os.path.join( + os.environ.get("DATA_SERVICES_DIR"), "lib", "netcdf", "imos_env" + ) if not os.path.isfile(imos_env_path): logger = logging.getLogger(__name__) - logger.error('%s is not accessible' % imos_env_path) + logger.error("%s is not accessible" % imos_env_path) sys.exit(1) dotenv.load_dotenv(imos_env_path) - netcdf_file_obj = Dataset(netcdf_file_path, 'a', format='NETCDF4') - netcdf_file_obj.naming_authority = 'IMOS' + netcdf_file_obj = Dataset(netcdf_file_path, "a", format="NETCDF4") + netcdf_file_obj.naming_authority = "IMOS" # add gatts to NetCDF - netcdf_file_obj.aims_channel_id = int(channel_id_info['channel_id']) + netcdf_file_obj.aims_channel_id = int(channel_id_info["channel_id"]) - if not (channel_id_info['metadata_uuid'] == 'Not Available'): - netcdf_file_obj.metadata_uuid = channel_id_info['metadata_uuid'] + if not (channel_id_info["metadata_uuid"] == "Not Available"): + netcdf_file_obj.metadata_uuid = channel_id_info["metadata_uuid"] if not netcdf_file_obj.instrument_serial_number: - del(netcdf_file_obj.instrument_serial_number) + del netcdf_file_obj.instrument_serial_number # add CF gatts, values stored in lib/netcdf/imos_env - netcdf_file_obj.Conventions = os.environ.get('CONVENTIONS') - netcdf_file_obj.data_centre_email = os.environ.get('DATA_CENTRE_EMAIL') - netcdf_file_obj.data_centre = os.environ.get('DATA_CENTRE') - netcdf_file_obj.project = os.environ.get('PROJECT') - netcdf_file_obj.acknowledgement = os.environ.get('ACKNOWLEDGEMENT') - netcdf_file_obj.distribution_statement = os.environ.get('DISTRIBUTION_STATEMENT') - - netcdf_file_obj.date_created = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime()) - netcdf_file_obj.quality_control_set = 1 - imos_qc_convention = 'IMOS standard set using the IODE flags' - netcdf_file_obj.author = 'laurent besnard' - netcdf_file_obj.author_email = 'laurent.besnard@utas.edu.au' - - rename_netcdf_attribute(netcdf_file_obj, 'geospatial_LAT_max', 'geospatial_lat_max') - rename_netcdf_attribute(netcdf_file_obj, 'geospatial_LAT_min', 'geospatial_lat_min') - rename_netcdf_attribute(netcdf_file_obj, 'geospatial_LON_max', 'geospatial_lon_max') - rename_netcdf_attribute(netcdf_file_obj, 'geospatial_LON_min', 'geospatial_lon_min') + 
netcdf_file_obj.Conventions = os.environ.get("CONVENTIONS") + netcdf_file_obj.data_centre_email = os.environ.get("DATA_CENTRE_EMAIL") + netcdf_file_obj.data_centre = os.environ.get("DATA_CENTRE") + netcdf_file_obj.project = os.environ.get("PROJECT") + netcdf_file_obj.acknowledgement = os.environ.get("ACKNOWLEDGEMENT") + netcdf_file_obj.distribution_statement = os.environ.get("DISTRIBUTION_STATEMENT") + + netcdf_file_obj.date_created = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime()) + netcdf_file_obj.quality_control_set = 1 + imos_qc_convention = "IMOS standard set using the IODE flags" + netcdf_file_obj.author = "laurent besnard" + netcdf_file_obj.author_email = "laurent.besnard@utas.edu.au" + + rename_netcdf_attribute(netcdf_file_obj, "geospatial_LAT_max", "geospatial_lat_max") + rename_netcdf_attribute(netcdf_file_obj, "geospatial_LAT_min", "geospatial_lat_min") + rename_netcdf_attribute(netcdf_file_obj, "geospatial_LON_max", "geospatial_lon_max") + rename_netcdf_attribute(netcdf_file_obj, "geospatial_LON_min", "geospatial_lon_min") # variables modifications - time = netcdf_file_obj.variables['time'] - time.calendar = 'gregorian' - time.axis = 'T' + time = netcdf_file_obj.variables["time"] + time.calendar = "gregorian" + time.axis = "T" time.valid_min = 0.0 time.valid_max = 9999999999.0 - netcdf_file_obj.renameDimension('time', 'TIME') - netcdf_file_obj.renameVariable('time', 'TIME') - - netcdf_file_obj.time_coverage_start = num2date(time[:], time.units, time.calendar).min().strftime('%Y-%m-%dT%H:%M:%SZ') - netcdf_file_obj.time_coverage_end = num2date(time[:], time.units, time.calendar).max().strftime('%Y-%m-%dT%H:%M:%SZ') + netcdf_file_obj.renameDimension("time", "TIME") + netcdf_file_obj.renameVariable("time", "TIME") + + netcdf_file_obj.time_coverage_start = ( + num2date(time[:], time.units, time.calendar) + .min() + .strftime("%Y-%m-%dT%H:%M:%SZ") + ) + netcdf_file_obj.time_coverage_end = ( + num2date(time[:], time.units, time.calendar) + .max() + .strftime("%Y-%m-%dT%H:%M:%SZ") + ) # latitude longitude - latitude = netcdf_file_obj.variables['LATITUDE'] - latitude.axis = 'Y' - latitude.valid_min = -90.0 - latitude.valid_max = 90.0 - latitude.reference_datum = 'geographical coordinates, WGS84 projection' - latitude.standard_name = 'latitude' - latitude.long_name = 'latitude' - - longitude = netcdf_file_obj.variables['LONGITUDE'] - longitude.axis = 'X' - longitude.valid_min = -180.0 - longitude.valid_max = 180.0 - longitude.reference_datum = 'geographical coordinates, WGS84 projection' - longitude.standard_name = 'longitude' - longitude.long_name = 'longitude' + latitude = netcdf_file_obj.variables["LATITUDE"] + latitude.axis = "Y" + latitude.valid_min = -90.0 + latitude.valid_max = 90.0 + latitude.reference_datum = "geographical coordinates, WGS84 projection" + latitude.standard_name = "latitude" + latitude.long_name = "latitude" + + longitude = netcdf_file_obj.variables["LONGITUDE"] + longitude.axis = "X" + longitude.valid_min = -180.0 + longitude.valid_max = 180.0 + longitude.reference_datum = "geographical coordinates, WGS84 projection" + longitude.standard_name = "longitude" + longitude.long_name = "longitude" # handle masked arrays lon_array = longitude[:] @@ -612,233 +687,257 @@ def modify_aims_netcdf(netcdf_file_path, channel_id_info): netcdf_file_obj.geospatial_lat_max = numpy.ma.MaskedArray.max(lat_array) # Change variable name, standard name, longname, untis .... 
- if 'Seawater_Intake_Temperature' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['Seawater_Intake_Temperature'] - var.units = 'Celsius' - netcdf_file_obj.renameVariable('Seawater_Intake_Temperature', 'TEMP') - netcdf_file_obj.renameVariable('Seawater_Intake_Temperature_quality_control', 'TEMP_quality_control') - var.ancillary_variables = 'TEMP_quality_control' - - if 'PSAL' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables['PSAL'].units = '1e-3' - - if 'TURB' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['TURB'] - var.units = '1' - var.standard_name = 'sea_water_turbidity' - netcdf_file_obj.variables['TURB_quality_control'].standard_name = 'sea_water_turbidity status_flag' - - if 'DOWN_PHOTOSYNTH_FLUX' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['DOWN_PHOTOSYNTH_FLUX'] - var.units = 'W m-2' - - if 'PEAK_WAVE_DIR' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['PEAK_WAVE_DIR'] - var.units = 'degree' - - if 'CDIR' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['CDIR'] - var.units = 'degree' - var.long_name = 'current_direction' - - if 'CSPD' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['CSPD'] - var.long_name = 'current_magnitude' - - if 'ALBD' in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables['ALBD'] - var.units = '1' + if "Seawater_Intake_Temperature" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["Seawater_Intake_Temperature"] + var.units = "Celsius" + netcdf_file_obj.renameVariable("Seawater_Intake_Temperature", "TEMP") + netcdf_file_obj.renameVariable( + "Seawater_Intake_Temperature_quality_control", "TEMP_quality_control" + ) + var.ancillary_variables = "TEMP_quality_control" + + if "PSAL" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables["PSAL"].units = "1e-3" + + if "TURB" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["TURB"] + var.units = "1" + var.standard_name = "sea_water_turbidity" + netcdf_file_obj.variables[ + "TURB_quality_control" + ].standard_name = "sea_water_turbidity status_flag" + + if "DOWN_PHOTOSYNTH_FLUX" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["DOWN_PHOTOSYNTH_FLUX"] + var.units = "W m-2" + + if "PEAK_WAVE_DIR" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["PEAK_WAVE_DIR"] + var.units = "degree" + + if "CDIR" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["CDIR"] + var.units = "degree" + var.long_name = "current_direction" + + if "CSPD" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["CSPD"] + var.long_name = "current_magnitude" + + if "ALBD" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["ALBD"] + var.units = "1" def clean_no_cf_variables(var, netcdf_file_obj): """ remove standard name of main variable and of its ancillary qc var if exists """ if var in netcdf_file_obj.variables.keys(): - if hasattr(netcdf_file_obj.variables[var], 'standard_name'): - del(netcdf_file_obj.variables[var].standard_name) - var_qc = '%s_quality_control' % var + if hasattr(netcdf_file_obj.variables[var], "standard_name"): + del netcdf_file_obj.variables[var].standard_name + var_qc = "%s_quality_control" % var if var_qc in netcdf_file_obj.variables.keys(): - if hasattr(netcdf_file_obj.variables[var_qc], 'standard_name'): - del(netcdf_file_obj.variables[var_qc].standard_name) - if hasattr(netcdf_file_obj.variables[var], 
'ancillary_variables'): + if hasattr(netcdf_file_obj.variables[var_qc], "standard_name"): + del netcdf_file_obj.variables[var_qc].standard_name + if hasattr(netcdf_file_obj.variables[var], "ancillary_variables"): netcdf_file_obj.variables[var].ancillary_variables = var_qc - if 'Dissolved_Oxygen_Percent' in netcdf_file_obj.variables.keys(): - clean_no_cf_variables('Dissolved_Oxygen_Percent', netcdf_file_obj) - - if 'ErrorVelocity' in netcdf_file_obj.variables.keys(): - clean_no_cf_variables('ErrorVelocity', netcdf_file_obj) - netcdf_file_obj.variables['ErrorVelocity'].long_name = 'error_velocity' - - if 'Average_Compass_Heading' in netcdf_file_obj.variables.keys(): - clean_no_cf_variables('Average_Compass_Heading', netcdf_file_obj) - var = netcdf_file_obj.variables['Average_Compass_Heading'] - var.units = 'degree' - - if 'Upwelling_longwave_radiation' in netcdf_file_obj.variables.keys(): - var_str = 'Upwelling_longwave_radiation' - var_qc_str = '%s_quality_control' % var_str - var = netcdf_file_obj.variables[var_str] - var_qc = netcdf_file_obj.variables[var_qc_str] - var.units = 'W m-2' - var.standard_name = 'upwelling_longwave_flux_in_air' - var_qc.standard_name = 'upwelling_longwave_flux_in_air status_flag' - - if 'Downwelling_longwave_radiation' in netcdf_file_obj.variables.keys(): - var_str = 'Downwelling_longwave_radiation' - var_qc_str = '%s_quality_control' % var_str - var = netcdf_file_obj.variables[var_str] - var_qc = netcdf_file_obj.variables[var_qc_str] - var.units = 'W m-2' - var.standard_name = 'downwelling_longwave_flux_in_air' - var_qc.standard_name = 'downwelling_longwave_flux_in_air status_flag' - - if 'UP_TOT_RADIATION' in netcdf_file_obj.variables.keys(): - var_str = 'UP_TOT_RADIATION' - var_qc_str = '%s_quality_control' % var_str - var = netcdf_file_obj.variables[var_str] - var_qc = netcdf_file_obj.variables[var_qc_str] - var.units = 'W m-2' - var.standard_name = 'upwelling_longwave_flux_in_air' - var_qc.standard_name = 'upwelling_longwave_flux_in_air status_flag' - - if 'DOWN_TOT_RADIATION' in netcdf_file_obj.variables.keys(): - var_str = 'DOWN_TOT_RADIATION' - var_qc_str = '%s_quality_control' % var_str - var = netcdf_file_obj.variables[var_str] - var_qc = netcdf_file_obj.variables[var_qc_str] - var.units = 'W m-2' - var.standard_name = 'downwelling_longwave_flux_in_air' - var_qc.standard_name = 'downwelling_longwave_flux_in_air status_flag' - - if 'RADIATION_DOWN_NET' in netcdf_file_obj.variables.keys(): - clean_no_cf_variables('RADIATION_DOWN_NET', netcdf_file_obj) - - if 'fluorescence' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.renameVariable('fluorescence', 'CPHL') - netcdf_file_obj.variables['CPHL'].long_name = 'mass_concentration_of_inferred_chlorophyll_from_relative_fluorescence_units_in_sea_water_concentration_of_chlorophyll_in_sea_water' - if 'fluorescence_quality_control' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.renameVariable('fluorescence_quality_control', 'CPHL_quality_control') - netcdf_file_obj.variables['CPHL_quality_control'].long_name = 'mass_concentration_of_inferred_chlorophyll_from_relative_fluorescence_units_in_sea_waterconcentration_of_chlorophyll_in_sea_water status_flag' - clean_no_cf_variables('CPHL', netcdf_file_obj) - - if 'WDIR_10min' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables['WDIR_10min'].units = 'degree' - - if 'WDIR_30min' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables['WDIR_30min'].units = 'degree' - - if 'R_sigma_30min' in netcdf_file_obj.variables.keys(): - 
netcdf_file_obj.variables['R_sigma_30min'].units = 'degree' - clean_no_cf_variables('R_sigma_30min', netcdf_file_obj) - - if 'WDIR_sigma_10min' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables['WDIR_sigma_10min'].units = 'degree' - clean_no_cf_variables('WDIR_sigma_10min', netcdf_file_obj) - - if 'WDIR_sigma_30min' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables['WDIR_sigma_30min'].units = 'degree' - clean_no_cf_variables('WDIR_sigma_30min', netcdf_file_obj) - - if 'ATMP' in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables['ATMP'].units = 'hPa' - - if 'RAIN_DURATION' in netcdf_file_obj.variables.keys(): - clean_no_cf_variables('RAIN_DURATION', netcdf_file_obj) - - if 'HAIL_DURATION' in netcdf_file_obj.variables.keys(): - clean_no_cf_variables('HAIL_DURATION', netcdf_file_obj) - - if 'HAIL_HIT' in netcdf_file_obj.variables.keys(): - clean_no_cf_variables('HAIL_HIT', netcdf_file_obj) - netcdf_file_obj.variables['HAIL_HIT'].comment = netcdf_file_obj.variables['HAIL_HIT'].units - netcdf_file_obj.variables['HAIL_HIT'].units = '1' - - if 'HAIL_INTENSITY_10min' in netcdf_file_obj.variables.keys(): - clean_no_cf_variables('HAIL_INTENSITY_10min', netcdf_file_obj) - netcdf_file_obj.variables['HAIL_INTENSITY_10min'].comment = netcdf_file_obj.variables['HAIL_INTENSITY_10min'].units - netcdf_file_obj.variables['HAIL_INTENSITY_10min'].units = '1' + if "Dissolved_Oxygen_Percent" in netcdf_file_obj.variables.keys(): + clean_no_cf_variables("Dissolved_Oxygen_Percent", netcdf_file_obj) + + if "ErrorVelocity" in netcdf_file_obj.variables.keys(): + clean_no_cf_variables("ErrorVelocity", netcdf_file_obj) + netcdf_file_obj.variables["ErrorVelocity"].long_name = "error_velocity" + + if "Average_Compass_Heading" in netcdf_file_obj.variables.keys(): + clean_no_cf_variables("Average_Compass_Heading", netcdf_file_obj) + var = netcdf_file_obj.variables["Average_Compass_Heading"] + var.units = "degree" + + if "Upwelling_longwave_radiation" in netcdf_file_obj.variables.keys(): + var_str = "Upwelling_longwave_radiation" + var_qc_str = "%s_quality_control" % var_str + var = netcdf_file_obj.variables[var_str] + var_qc = netcdf_file_obj.variables[var_qc_str] + var.units = "W m-2" + var.standard_name = "upwelling_longwave_flux_in_air" + var_qc.standard_name = "upwelling_longwave_flux_in_air status_flag" + + if "Downwelling_longwave_radiation" in netcdf_file_obj.variables.keys(): + var_str = "Downwelling_longwave_radiation" + var_qc_str = "%s_quality_control" % var_str + var = netcdf_file_obj.variables[var_str] + var_qc = netcdf_file_obj.variables[var_qc_str] + var.units = "W m-2" + var.standard_name = "downwelling_longwave_flux_in_air" + var_qc.standard_name = "downwelling_longwave_flux_in_air status_flag" + + if "UP_TOT_RADIATION" in netcdf_file_obj.variables.keys(): + var_str = "UP_TOT_RADIATION" + var_qc_str = "%s_quality_control" % var_str + var = netcdf_file_obj.variables[var_str] + var_qc = netcdf_file_obj.variables[var_qc_str] + var.units = "W m-2" + var.standard_name = "upwelling_longwave_flux_in_air" + var_qc.standard_name = "upwelling_longwave_flux_in_air status_flag" + + if "DOWN_TOT_RADIATION" in netcdf_file_obj.variables.keys(): + var_str = "DOWN_TOT_RADIATION" + var_qc_str = "%s_quality_control" % var_str + var = netcdf_file_obj.variables[var_str] + var_qc = netcdf_file_obj.variables[var_qc_str] + var.units = "W m-2" + var.standard_name = "downwelling_longwave_flux_in_air" + var_qc.standard_name = "downwelling_longwave_flux_in_air status_flag" + + if 
"RADIATION_DOWN_NET" in netcdf_file_obj.variables.keys(): + clean_no_cf_variables("RADIATION_DOWN_NET", netcdf_file_obj) + + if "fluorescence" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.renameVariable("fluorescence", "CPHL") + netcdf_file_obj.variables[ + "CPHL" + ].long_name = "mass_concentration_of_inferred_chlorophyll_from_relative_fluorescence_units_in_sea_water_concentration_of_chlorophyll_in_sea_water" + if "fluorescence_quality_control" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.renameVariable( + "fluorescence_quality_control", "CPHL_quality_control" + ) + netcdf_file_obj.variables[ + "CPHL_quality_control" + ].long_name = "mass_concentration_of_inferred_chlorophyll_from_relative_fluorescence_units_in_sea_waterconcentration_of_chlorophyll_in_sea_water status_flag" + clean_no_cf_variables("CPHL", netcdf_file_obj) + + if "WDIR_10min" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables["WDIR_10min"].units = "degree" + + if "WDIR_30min" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables["WDIR_30min"].units = "degree" + + if "R_sigma_30min" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables["R_sigma_30min"].units = "degree" + clean_no_cf_variables("R_sigma_30min", netcdf_file_obj) + + if "WDIR_sigma_10min" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables["WDIR_sigma_10min"].units = "degree" + clean_no_cf_variables("WDIR_sigma_10min", netcdf_file_obj) + + if "WDIR_sigma_30min" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables["WDIR_sigma_30min"].units = "degree" + clean_no_cf_variables("WDIR_sigma_30min", netcdf_file_obj) + + if "ATMP" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables["ATMP"].units = "hPa" + + if "RAIN_DURATION" in netcdf_file_obj.variables.keys(): + clean_no_cf_variables("RAIN_DURATION", netcdf_file_obj) + + if "HAIL_DURATION" in netcdf_file_obj.variables.keys(): + clean_no_cf_variables("HAIL_DURATION", netcdf_file_obj) + + if "HAIL_HIT" in netcdf_file_obj.variables.keys(): + clean_no_cf_variables("HAIL_HIT", netcdf_file_obj) + netcdf_file_obj.variables["HAIL_HIT"].comment = netcdf_file_obj.variables[ + "HAIL_HIT" + ].units + netcdf_file_obj.variables["HAIL_HIT"].units = "1" + + if "HAIL_INTENSITY_10min" in netcdf_file_obj.variables.keys(): + clean_no_cf_variables("HAIL_INTENSITY_10min", netcdf_file_obj) + netcdf_file_obj.variables[ + "HAIL_INTENSITY_10min" + ].comment = netcdf_file_obj.variables["HAIL_INTENSITY_10min"].units + netcdf_file_obj.variables["HAIL_INTENSITY_10min"].units = "1" # add qc conventions to qc vars variables = netcdf_file_obj.variables.keys() - qc_vars = [s for s in variables if '_quality_control' in s] + qc_vars = [s for s in variables if "_quality_control" in s] if qc_vars != []: for var in qc_vars: - netcdf_file_obj.variables[var].quality_control_conventions = imos_qc_convention + netcdf_file_obj.variables[ + var + ].quality_control_conventions = imos_qc_convention # clean longnames, force lower case, remove space, remove double underscore for var in variables: - if hasattr(netcdf_file_obj.variables[var], 'long_name'): - netcdf_file_obj.variables[var].long_name = netcdf_file_obj.variables[var].long_name.replace('__', '_') - netcdf_file_obj.variables[var].long_name = netcdf_file_obj.variables[var].long_name.replace(' _', '_') - netcdf_file_obj.variables[var].long_name = netcdf_file_obj.variables[var].long_name.lower() + if hasattr(netcdf_file_obj.variables[var], "long_name"): + netcdf_file_obj.variables[var].long_name = 
netcdf_file_obj.variables[ + var + ].long_name.replace("__", "_") + netcdf_file_obj.variables[var].long_name = netcdf_file_obj.variables[ + var + ].long_name.replace(" _", "_") + netcdf_file_obj.variables[var].long_name = netcdf_file_obj.variables[ + var + ].long_name.lower() netcdf_file_obj.close() def fix_provider_code_from_filename(netcdf_file_path, imos_facility_code): - new_filename = re.sub('AIMS_', ('%s_' % imos_facility_code), netcdf_file_path) + new_filename = re.sub("AIMS_", ("%s_" % imos_facility_code), netcdf_file_path) shutil.move(netcdf_file_path, new_filename) return new_filename def fix_data_code_from_filename(netcdf_file_path): - """ Some filename are badly written. + """Some filename are badly written. this function has to run after modifying the file to make it CF and IMOS compliant It physically renames the filename if needed """ - netcdf_file_obj = Dataset(netcdf_file_path, 'r', format='NETCDF4') - if 'CDIR' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_CDIR_', '_V_', netcdf_file_path) + netcdf_file_obj = Dataset(netcdf_file_path, "r", format="NETCDF4") + if "CDIR" in netcdf_file_obj.variables.keys(): + new_filename = re.sub("_CDIR_", "_V_", netcdf_file_path) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename - if 'CSPD' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_CSPD_', '_V_', netcdf_file_path) + if "CSPD" in netcdf_file_obj.variables.keys(): + new_filename = re.sub("_CSPD_", "_V_", netcdf_file_path) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename - if 'DOX1' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_Dissolved_O2_\(mole\)_', '_K_', netcdf_file_path) + if "DOX1" in netcdf_file_obj.variables.keys(): + new_filename = re.sub("_Dissolved_O2_\(mole\)_", "_K_", netcdf_file_path) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename - if 'DEPTH' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_DEPTH_', '_Z_', netcdf_file_path) + if "DEPTH" in netcdf_file_obj.variables.keys(): + new_filename = re.sub("_DEPTH_", "_Z_", netcdf_file_path) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename - if 'Dissolved_Oxygen_Percent' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_DO_%_', '_O_', netcdf_file_path) + if "Dissolved_Oxygen_Percent" in netcdf_file_obj.variables.keys(): + new_filename = re.sub("_DO_%_", "_O_", netcdf_file_path) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename - if 'ErrorVelocity' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_ErrorVelocity_', '_V_', netcdf_file_path) + if "ErrorVelocity" in netcdf_file_obj.variables.keys(): + new_filename = re.sub("_ErrorVelocity_", "_V_", netcdf_file_path) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename - if 'Average_Compass_Heading' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_Average_Compass_Heading_', '_E_', netcdf_file_path) + if "Average_Compass_Heading" in netcdf_file_obj.variables.keys(): + new_filename = re.sub("_Average_Compass_Heading_", "_E_", netcdf_file_path) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename - if 'Upwelling_longwave_radiation' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_Upwelling_longwave_radiation_', '_F_', netcdf_file_path) + if "Upwelling_longwave_radiation" in netcdf_file_obj.variables.keys(): + 
new_filename = re.sub("_Upwelling_longwave_radiation_", "_F_", netcdf_file_path) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename - if 'Downwelling_longwave_radiation' in netcdf_file_obj.variables.keys(): - new_filename = re.sub('_Downwelling_longwave_radiation_', '_F_', netcdf_file_path) + if "Downwelling_longwave_radiation" in netcdf_file_obj.variables.keys(): + new_filename = re.sub( + "_Downwelling_longwave_radiation_", "_F_", netcdf_file_path + ) netcdf_file_obj.close() shutil.move(netcdf_file_path, new_filename) return new_filename @@ -848,49 +947,57 @@ def fix_data_code_from_filename(netcdf_file_path): def has_var_only_fill_value(netcdf_file_path, var): - """ some channels have only _Fillvalues in their main variable. This is not correct and need + """some channels have only _Fillvalues in their main variable. This is not correct and need to be tested var is a string of the variable to test """ - netcdf_file_obj = Dataset(netcdf_file_path, 'r', format='NETCDF4') - var_obj = netcdf_file_obj.variables[var] - var_values = var_obj[:] + netcdf_file_obj = Dataset(netcdf_file_path, "r", format="NETCDF4") + var_obj = netcdf_file_obj.variables[var] + var_values = var_obj[:] netcdf_file_obj.close() # if no fill value in variable, no mask attribute - if hasattr(var_values, 'mask'): + if hasattr(var_values, "mask"): return var_values.mask.all() else: return False def remove_dimension_from_netcdf(netcdf_file_path): - """ DIRTY, calling bash. need to write in Python, or part of the NetCDF4 module + """DIRTY, calling bash. need to write in Python, or part of the NetCDF4 module need to remove the 'single' dimension name from DEPTH or other dim. Unfortunately can't seem to find a way to do it easily with netCDF4 module """ fd, tmp_file = tempfile.mkstemp() os.close(fd) - subprocess.check_call(['ncwa', '-O', '-a', 'single', netcdf_file_path, tmp_file]) - subprocess.check_call(['ncatted', '-O', '-a', 'cell_methods,,d,,', tmp_file, tmp_file]) + subprocess.check_call(["ncwa", "-O", "-a", "single", netcdf_file_path, tmp_file]) + subprocess.check_call( + ["ncatted", "-O", "-a", "cell_methods,,d,,", tmp_file, tmp_file] + ) shutil.move(tmp_file, netcdf_file_path) def remove_end_date_from_filename(netcdf_filename): - """ remove the _END-* part of the file, as we download monthly file. This helps + """remove the _END-* part of the file, as we download monthly file. 
This helps to overwrite file with new data for the same month """ - return re.sub('_END-.*$', '.nc', netcdf_filename) + return re.sub("_END-.*$", ".nc", netcdf_filename) def rm_tmp_dir(data_wip_path): - """ remove temporary directories older than 15 days from data_wip path""" + """remove temporary directories older than 15 days from data_wip path""" for dir_path in os.listdir(data_wip_path): - if dir_path.startswith('manifest_dir_tmp_'): - file_date = datetime.datetime.strptime(dir_path.split('_')[-1], '%Y%m%d%H%M%S') + if dir_path.startswith("manifest_dir_tmp_"): + file_date = datetime.datetime.strptime( + dir_path.split("_")[-1], "%Y%m%d%H%M%S" + ) if (datetime.datetime.now() - file_date).days > 15: logger = logging.getLogger(__name__) - logger.info('DELETE old temporary folder {path}'.format(path=os.path.join(data_wip_path, dir_path))) + logger.info( + "DELETE old temporary folder {path}".format( + path=os.path.join(data_wip_path, dir_path) + ) + ) shutil.rmtree(os.path.join(data_wip_path, dir_path)) @@ -898,7 +1005,7 @@ def set_up(): """ set up wip facility directories """ - wip_path = os.environ.get('data_wip_path') + wip_path = os.environ.get("data_wip_path") # this is used for unit testing as data_wip_path env would not be set if wip_path is None: @@ -906,11 +1013,11 @@ def set_up(): if not wip_path: logger = logging.getLogger(__name__) - logger.error('env data_wip_path not defined') + logger.error("env data_wip_path not defined") exit(1) if not os.path.exists(wip_path): os.makedirs(wip_path) - if not os.path.exists(os.path.join(wip_path, 'errors')): - os.makedirs(os.path.join(wip_path, 'errors')) + if not os.path.exists(os.path.join(wip_path, "errors")): + os.makedirs(os.path.join(wip_path, "errors")) From d995da6458fcb66f9a08972bdf6c6a424aa6cb7f Mon Sep 17 00:00:00 2001 From: lbesnard Date: Tue, 27 Jan 2026 16:08:13 +1100 Subject: [PATCH 2/8] Fix: delete properly tmp dirs --- ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py | 178 ++++--- ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new | 497 ++++++++++++++++++++ 2 files changed, 600 insertions(+), 75 deletions(-) create mode 100755 ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new diff --git a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py index 2eb53e75..ba90f2a0 100755 --- a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py +++ b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py @@ -30,15 +30,13 @@ import datetime import logging import os -import sys import re import shutil +import sys import traceback import unittest as data_validation_test - -from netCDF4 import Dataset from itertools import groupby -from tendo import singleton +from pathlib import Path from aims_realtime_util import ( convert_time_cf_to_imos, @@ -46,23 +44,25 @@ download_channel, fix_data_code_from_filename, fix_provider_code_from_filename, + get_main_netcdf_var, has_var_only_fill_value, is_no_data_found, is_time_monotonic, is_time_var_empty, + list_recursively_files_abs_path, logging_aims, md5, modify_aims_netcdf, parse_aims_xml, remove_dimension_from_netcdf, remove_end_date_from_filename, + rm_tmp_dir, save_channel_info, set_up, - rm_tmp_dir, - get_main_netcdf_var, - list_recursively_files_abs_path, ) from dest_path import get_anmn_nrs_site_name +from netCDF4 import Dataset +from tendo import singleton from util import pass_netcdf_checker MD5_EXPECTED_VALUE = "a6207e053f1cc0e00d171701f0cdb186" @@ -79,88 +79,116 @@ def modify_anmn_nrs_netcdf(netcdf_file_path, channel_id_info): - """Modify the downloaded netCDF file so it passes both CF and IMOS checker - input: - 
netcdf_file_path(str) : path of netcdf file to modify - channel_id_index(tupple) : information from xml for the channel """ + Refines ANMN NRS specific metadata and coordinate variables. + """ + # First pass: Generic AIMS modifications modify_aims_netcdf(netcdf_file_path, channel_id_info) - netcdf_file_obj = Dataset(netcdf_file_path, "a", format="NETCDF4") - netcdf_file_obj.aims_channel_id = int(channel_id_info["channel_id"]) - - if "Yongala" in channel_id_info["site_name"]: - netcdf_file_obj.site_code = "NRSYON" - netcdf_file_obj.platform_code = "Yongala NRS Buoy" - elif "Darwin" in channel_id_info["site_name"]: - netcdf_file_obj.site_code = "NRSDAR" - netcdf_file_obj.platform_code = "Darwin NRS Buoy" - elif "Beagle" in channel_id_info["site_name"]: - netcdf_file_obj.site_code = "DARBGF" - netcdf_file_obj.platform_code = "Beagle Gulf Mooring" - else: - return False - - if not (channel_id_info["metadata_uuid"] == "Not Available"): - netcdf_file_obj.metadata_uuid = channel_id_info["metadata_uuid"] - - # some weather stations channels don't have a depth variable if sensor above water - if "depth" in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables["depth"] - var.long_name = "nominal depth" - var.positive = "down" - var.axis = "Z" - var.reference_datum = "sea surface" - var.valid_min = -10.0 - var.valid_max = 30.0 - var.units = "m" # some channels put degrees celcius instead ... - netcdf_file_obj.renameVariable("depth", "NOMINAL_DEPTH") - - if "DEPTH" in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables["DEPTH"] - var.coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" - var.long_name = "actual depth" - var.reference_datum = "sea surface" - var.positive = "down" - var.valid_min = -10.0 - var.valid_max = 30.0 - var.units = "m" # some channels put degrees celcius instead ... - - netcdf_file_obj.close() - netcdf_file_obj = Dataset( - netcdf_file_path, "a", format="NETCDF4" - ) # need to close to save to file. 
as we call get_main_var just after + # Site and Platform Mapping (The Dictionary approach) + site_map = { + "Yongala": ("NRSYON", "Yongala NRS Buoy"), + "Darwin": ("NRSDAR", "Darwin NRS Buoy"), + "Beagle": ("DARBGF", "Beagle Gulf Mooring"), + } + + site_name = channel_id_info.get("site_name", "") + site_data = next((v for k, v in site_map.items() if k in site_name), None) + + if not site_data: + return False # Site not recognised + + with Dataset(netcdf_file_path, "a") as nc: + nc.site_code, nc.platform_code = site_data + nc.aims_channel_id = int(channel_id_info["channel_id"]) + + if channel_id_info.get("metadata_uuid") != "Not Available": + nc.metadata_uuid = channel_id_info["metadata_uuid"] + + # Depth Variable Attributes (Common configurations) + depth_attrs = { + "positive": "down", + "axis": "Z", + "reference_datum": "sea surface", + "valid_min": -10.0, + "valid_max": 30.0, + "units": "m", + } + + # Handle 'depth' + if "depth" in nc.variables: + var = nc.variables["depth"] + for k, v in depth_attrs.items(): + setattr(var, k, v) + var.long_name = "nominal depth" + nc.renameVariable("depth", "NOMINAL_DEPTH") + + # Handle 'DEPTH' (actual depth) + if "DEPTH" in nc.variables: + var = nc.variables["DEPTH"] + # Standard depth attributes plus coordinates + for k, v in depth_attrs.items(): + setattr(var, k, v) + var.long_name = "actual depth" + var.coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" + + # Coordinate String Assignment + # We close the file above so that the next functions see the changes main_var = get_main_netcdf_var(netcdf_file_path) - # DEPTH, LATITUDE and LONGITUDE are not dimensions, so we make them into auxiliary cooordinate variables by adding this attribute - if "NOMINAL_DEPTH" in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables[ - main_var - ].coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" - else: - netcdf_file_obj.variables[main_var].coordinates = "TIME LATITUDE LONGITUDE" - netcdf_file_obj.close() + with Dataset(netcdf_file_path, "a") as nc: + if main_var in nc.variables: + coords = "TIME LATITUDE LONGITUDE" + if "NOMINAL_DEPTH" in nc.variables: + coords += " NOMINAL_DEPTH" + nc.variables[main_var].coordinates = coords + # Final transformations if not convert_time_cf_to_imos(netcdf_file_path): return False - remove_dimension_from_netcdf( - netcdf_file_path - ) # last modification to do in this order! + # This MUST be last as it reshapes the file + remove_dimension_from_netcdf(netcdf_file_path) + return True def move_to_tmp_incoming(netcdf_path): - # [org_filename withouth creation date].[md5].nc to have unique filename in - new_filename = "%s.%s.nc" % ( - os.path.splitext(os.path.basename(remove_end_date_from_filename(netcdf_path)))[ - 0 - ], - md5(netcdf_path), - ) + """ + Renames the NetCDF to include its MD5 hash, moves it to the manifest directory, + and cleans up the now-empty source directory. 
+ """ + logger = logging.getLogger(__name__) + # Convert to Path object for easier manipulation + source_file = Path(netcdf_path) + source_dir = source_file.parent + + # Construct the new filename: [name_without_date].[md5].nc + # remove_end_date_from_filename returns a string, so we wrap it in Path + name_no_date = Path(remove_end_date_from_filename(str(source_file))).stem + file_hash = md5(str(source_file)) + new_filename = f"{name_no_date}.{file_hash}.nc" - os.chmod(netcdf_path, 0o0664) # change to 664 for pipeline v2 - shutil.move(netcdf_path, os.path.join(TMP_MANIFEST_DIR, new_filename)) + destination = Path(TMP_MANIFEST_DIR) / new_filename + + try: + # Apply permissions (664) + source_file.chmod(0o664) + + # Perform the move + shutil.move(str(source_file), str(destination)) + logger.info(f"Moved {source_file.name} to {destination}") + + # Cleanup: Delete the source directory if it is now empty + try: + source_dir.rmdir() + logger.debug(f"Cleaned up empty directory: {source_dir}") + except OSError: + logger.debug(f"Source directory not empty; skipping cleanup: {source_dir}") + + except Exception as e: + logger.error(f"Failed to move {source_file} to incoming: {e}") + raise def process_monthly_channel(channel_id, aims_xml_info, level_qc): diff --git a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new new file mode 100755 index 00000000..24143f80 --- /dev/null +++ b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new @@ -0,0 +1,497 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Download ANMN NRS data from AIMS Web Service for Darwin, Yongala and Beagle +The script reads an XML file provided by AIMS and looks for channels with +new data to download. It compares this list with a pickle file (pythonic +way to store python variables) containing what has already been downloaded +in the previous run of this script. +Some modifications on the files have to be done so they comply with CF and +IMOS conventions. +The IOOS compliance checker is used to check if the first downloaded file of +a channel complies once modified. If not, the download of the rest of the +channel is aborted until some modification on the source code is done so +the channel can pass the checker. +Files which don't pass the checker will land in os.path.join(wip_path, 'errors') +for investigation. No need to reprocess them as they will be redownloaded on +next run until they end up passing the checker. Files in the 'errors' dir can be +removed at anytime + +IMPORTANT: +is it essential to look at the logging os.path.join(wip_path, 'aims.log') +to know which channels have problems and why as most of the time, AIMS will +have to be contacted to sort out issues. 
+ + +author Laurent Besnard, laurent.besnard@utas.edu.au +""" + +import argparse +import datetime +import logging +import os +import re +import shutil +import sys +import traceback +import unittest as data_validation_test +from itertools import groupby +from pathlib import Path + +from aims_realtime_util import ( + convert_time_cf_to_imos, + create_list_of_dates_to_download, + download_channel, + fix_data_code_from_filename, + fix_provider_code_from_filename, + get_main_netcdf_var, + has_var_only_fill_value, + is_no_data_found, + is_time_monotonic, + is_time_var_empty, + list_recursively_files_abs_path, + logging_aims, + md5, + modify_aims_netcdf, + parse_aims_xml, + remove_dimension_from_netcdf, + remove_end_date_from_filename, + rm_tmp_dir, + save_channel_info, + set_up, +) +from dest_path import get_anmn_nrs_site_name +from netCDF4 import Dataset +from tendo import singleton +from util import pass_netcdf_checker + +DATA_WIP_PATH = os.path.join( + os.environ.get("WIP_DIR"), + "ANMN", + "NRS_AIMS_Darwin_Yongala_data_rss_download_temporary", +) +ANMN_NRS_INCOMING_DIR = os.path.join( + os.environ.get("INCOMING_DIR"), "AODN", "ANMN_NRS_DAR_YON" +) +ANMN_NRS_ERROR_DIR = os.path.join(os.environ["ERROR_DIR"], "ANMN_NRS_DAR_YON") + + +def modify_anmn_nrs_netcdf(netcdf_file_path, channel_id_info): + """Modify the downloaded netCDF file so it passes both CF and IMOS checker + input: + netcdf_file_path(str) : path of netcdf file to modify + channel_id_index(tupple) : information from xml for the channel + """ + modify_aims_netcdf(netcdf_file_path, channel_id_info) + + netcdf_file_obj = Dataset(netcdf_file_path, "a", format="NETCDF4") + netcdf_file_obj.aims_channel_id = int(channel_id_info["channel_id"]) + + if "Yongala" in channel_id_info["site_name"]: + netcdf_file_obj.site_code = "NRSYON" + netcdf_file_obj.platform_code = "Yongala NRS Buoy" + elif "Darwin" in channel_id_info["site_name"]: + netcdf_file_obj.site_code = "NRSDAR" + netcdf_file_obj.platform_code = "Darwin NRS Buoy" + elif "Beagle" in channel_id_info["site_name"]: + netcdf_file_obj.site_code = "DARBGF" + netcdf_file_obj.platform_code = "Beagle Gulf Mooring" + else: + return False + + if not (channel_id_info["metadata_uuid"] == "Not Available"): + netcdf_file_obj.metadata_uuid = channel_id_info["metadata_uuid"] + + # some weather stations channels don't have a depth variable if sensor above water + if "depth" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["depth"] + var.long_name = "nominal depth" + var.positive = "down" + var.axis = "Z" + var.reference_datum = "sea surface" + var.valid_min = -10.0 + var.valid_max = 30.0 + var.units = "m" # some channels put degrees celcius instead ... + netcdf_file_obj.renameVariable("depth", "NOMINAL_DEPTH") + + if "DEPTH" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["DEPTH"] + var.coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" + var.long_name = "actual depth" + var.reference_datum = "sea surface" + var.positive = "down" + var.valid_min = -10.0 + var.valid_max = 30.0 + var.units = "m" # some channels put degrees celcius instead ... + + netcdf_file_obj.close() + netcdf_file_obj = Dataset( + netcdf_file_path, "a", format="NETCDF4" + ) # need to close to save to file. 
as we call get_main_var just after + main_var = get_main_netcdf_var(netcdf_file_path) + # DEPTH, LATITUDE and LONGITUDE are not dimensions, so we make them into auxiliary cooordinate variables by adding this attribute + if "NOMINAL_DEPTH" in netcdf_file_obj.variables.keys(): + netcdf_file_obj.variables[ + main_var + ].coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" + else: + netcdf_file_obj.variables[main_var].coordinates = "TIME LATITUDE LONGITUDE" + + netcdf_file_obj.close() + + if not convert_time_cf_to_imos(netcdf_file_path): + return False + + remove_dimension_from_netcdf( + netcdf_file_path + ) # last modification to do in this order! + return True + + +def move_to_tmp_incoming(netcdf_path): + """ + Renames the NetCDF to include its MD5 hash, moves it to the manifest directory, + and cleans up the now-empty source directory. + """ + logger = logging.getLogger(__name__) + # Convert to Path object for easier manipulation + source_file = Path(netcdf_path) + source_dir = source_file.parent + + # Construct the new filename: [name_without_date].[md5].nc + # remove_end_date_from_filename returns a string, so we wrap it in Path + name_no_date = Path(remove_end_date_from_filename(str(source_file))).stem + file_hash = md5(str(source_file)) + new_filename = f"{name_no_date}.{file_hash}.nc" + + destination = Path(TMP_MANIFEST_DIR) / new_filename + + try: + # Apply permissions (664) + source_file.chmod(0o664) + + # Perform the move + shutil.move(str(source_file), str(destination)) + logger.info(f"Moved {source_file.name} to {destination}") + + # Cleanup: Delete the source directory if it is now empty + try: + source_dir.rmdir() + logger.debug(f"Cleaned up empty directory: {source_dir}") + except OSError: + logger.debug(f"Source directory not empty; skipping cleanup: {source_dir}") + + except Exception as e: + logger.error(f"Failed to move {source_file} to incoming: {e}") + raise + + +def process_monthly_channel(channel_id, aims_xml_info, level_qc): + """ + Downloads all the data available for one channel_id and moves the file to a wip_path dir + + aims_service : 1 -> FAIMMS data + 100 -> SOOP TRV data + 300 -> NRS DATA + for monthly data download, only 1 and 300 should be use + """ + contact_aims_msg = "Process of channel aborted - CONTACT AIMS" + wip_path = Path(os.environ.get("data_wip_path", "")) + + logger.info(f"QC{level_qc} - Processing channel {channel_id}") + + channel_id_info = aims_xml_info[channel_id] + from_date = channel_id_info["from_date"] + thru_date = channel_id_info["thru_date"] + + # [start_dates, end_dates] generation + start_dates, end_dates = create_list_of_dates_to_download( + channel_id, level_qc, from_date, thru_date + ) + + if not start_dates: + logger.info(f"QC{level_qc} - Channel {channel_id}: already up to date") + return + + # download monthly file + for start_dt, end_dt in zip(start_dates, end_dates): + start_date = start_dt.strftime("%Y-%m-%dT%H:%M:%SZ") + end_date = end_dt.strftime("%Y-%m-%dT%H:%M:%SZ") + + netcdf_tmp_file_path = download_channel( + channel_id, start_date, end_date, level_qc + ) + + if netcdf_tmp_file_path is None: + logger.error( + f" Channel {channel_id} - not valid zip file - {contact_aims_msg}" + ) + break + + tmp_dir = Path(netcdf_tmp_file_path).parent + + # NO_DATA_FOUND file only means there is no data for the selected time period. 
+ # Could be some data afterwards + if is_no_data_found(netcdf_tmp_file_path): + logger.info( + f"Channel {channel_id}: No data for the time period:[{start_date} - {end_date}]" + ) + shutil.rmtree(tmp_dir) + continue # Move to next month + + # Start of validation sequence + error_occurred = False + + if is_time_var_empty(netcdf_tmp_file_path): + logger.error( + f"Channel {channel_id}: No values in TIME variable - {contact_aims_msg}" + ) + error_occurred = True + + elif not modify_anmn_nrs_netcdf(netcdf_tmp_file_path, channel_id_info): + logger.error( + f"Channel {channel_id}: Could not modify the NetCDF file - Process of channel aborted" + ) + error_occurred = True + + else: + main_var = get_main_netcdf_var(netcdf_tmp_file_path) + if has_var_only_fill_value(netcdf_tmp_file_path, main_var): + logger.error( + f"Channel {channel_id}: _Fillvalues only in main variable - {contact_aims_msg}" + ) + error_occurred = True + elif not get_anmn_nrs_site_name(netcdf_tmp_file_path): + logger.error( + f"Channel {channel_id}: Unknown site_code gatt value - {contact_aims_msg}" + ) + error_occurred = True + elif not is_time_monotonic(netcdf_tmp_file_path): + logger.error( + f"Channel {channel_id}: TIME value is not strictly monotonic - {contact_aims_msg}" + ) + error_occurred = True + + if error_occurred: + shutil.rmtree(tmp_dir) + break + + # check every single file of the list. We don't assume that if one passes, all pass ... past proved this + if not pass_netcdf_checker(netcdf_tmp_file_path, tests=["cf:1.6", "imos:1.3"]): + logger.error( + f"Channel {channel_id}: File does not pass CF/IMOS compliance checker - Process of channel aborted" + ) + + err_dest = wip_path / "errors" / os.path.basename(netcdf_tmp_file_path) + shutil.copy(netcdf_tmp_file_path, err_dest) + + logger.error(f"File copied to {err_dest} for debugging") + shutil.rmtree(tmp_dir) + break + + netcdf_tmp_file_path = fix_data_code_from_filename(netcdf_tmp_file_path) + netcdf_tmp_file_path = fix_provider_code_from_filename( + netcdf_tmp_file_path, "IMOS_ANMN" + ) + + if not re.search(r"IMOS_ANMN_[A-Z]{1}_", netcdf_tmp_file_path): + logger.error( + f" Channel {channel_id} - File name Data code does not pass REGEX - Process of channel aborted" + ) + + err_dest = wip_path / "errors" / os.path.basename(netcdf_tmp_file_path) + shutil.copy(netcdf_tmp_file_path, err_dest) + + logger.error(f" File copied to {err_dest} for debugging") + shutil.rmtree(tmp_dir) + break + + move_to_tmp_incoming(netcdf_tmp_file_path) + + # Update tracking + save_channel_info(channel_id, aims_xml_info, level_qc, end_date) + + if TESTING: + # The 2 next lines download the first month only for every single channel. 
+ # This is only used for testing + # Note: save_channel_info already called above + break + + +def process_qc_level(level_qc): + """Downloads all channels for a QC level + level_qc(int) : 0 or 1 + """ + + logger.info( + "Process ANMN NRS download from AIMS web service - QC level {level_qc}".format( + level_qc=level_qc + ) + ) + xml_url = "https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level{level_qc}/300".format( + level_qc=level_qc + ) + try: + aims_xml_info = parse_aims_xml(xml_url) + except Exception as err: + logger.critical("RSS feed not available") + exit(1) + + for channel_id in aims_xml_info.keys(): + try: + process_monthly_channel(channel_id, aims_xml_info, level_qc) + except Exception as err: + logger.error( + "QC{qc_level} - Channel {channel_id}: Failed, unknown reason - manual debug required".format( + channel_id=str(channel_id), qc_level=str(level_qc) + ) + ) + logger.error(traceback.print_exc()) + + +class AimsDataValidationTest(data_validation_test.TestCase): + def setUp(self): + """Check that a the AIMS system or this script hasn't been modified. + This function checks that a downloaded file still has the same md5. + """ + channel_id = "84329" + from_date = "2016-01-01T00:00:00Z" + thru_date = "2016-01-02T00:00:00Z" + level_qc = 1 + aims_rss_val = 300 + xml_url = ( + "https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level%s/%s" + % (str(level_qc), str(aims_rss_val)) + ) + + logger.info("Data validation unittests...") + aims_xml_info = parse_aims_xml(xml_url) + channel_id_info = aims_xml_info[channel_id] + self.netcdf_tmp_file_path = download_channel( + channel_id, from_date, thru_date, level_qc + ) + modify_anmn_nrs_netcdf(self.netcdf_tmp_file_path, channel_id_info) + EPOCH_ISO = "1970-01-01T00:00:00Z" + + netcdf_path = Path(self.netcdf_tmp_file_path) + + with Dataset(netcdf_path, mode="a", format="NETCDF4") as nc: + # force values of attributes which change all the time + nc.date_created = EPOCH_ISO + nc.history = "data validation test only" + nc.NCO = "NCO_VERSION" + + def tearDown(self): + shutil.copy( + self.netcdf_tmp_file_path, + os.path.join( + os.environ["data_wip_path"], "nc_unittest_%s.nc" % self.md5_netcdf_value + ), + ) + shutil.rmtree(os.path.dirname(self.netcdf_tmp_file_path)) + + def test_aims_validation(self): + if sys.version_info[0] < 3: + self.md5_expected_value = "76c9a595264a8173545b6dc0c518a280" + else: + self.md5_expected_value = "1bb65266f8e526ed2087904ae024e33d" + + self.md5_netcdf_value = md5(self.netcdf_tmp_file_path) + + self.assertEqual(self.md5_netcdf_value, self.md5_expected_value) + + +def args(): + """ + define the script arguments + :return: vargs + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "-t", + "--testing", + action="store_true", + help="testing only - downloads the first month of each channel", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + vargs = args() + me = singleton.SingleInstance() + os.environ["data_wip_path"] = os.path.join( + os.environ.get("WIP_DIR"), + "ANMN", + "NRS_AIMS_Darwin_Yongala_data_rss_download_temporary", + ) + global TMP_MANIFEST_DIR + global TESTING + + set_up() + + # initialise logging + logging_aims() + global logger + logger = logging.getLogger(__name__) + + # data validation test + runner = data_validation_test.TextTestRunner() + itersuite = data_validation_test.TestLoader().loadTestsFromTestCase( + AimsDataValidationTest + ) + res = runner.run(itersuite) + + if not DATA_WIP_PATH: + logger.critical("environment variable data_wip_path is not 
defined.") + exit(1) + + # script optional argument for testing only. used in process_monthly_channel + TESTING = vargs.testing + + rm_tmp_dir(DATA_WIP_PATH) + + if len(os.listdir(ANMN_NRS_INCOMING_DIR)) >= 2: + logger.critical("Operation aborted, too many files in INCOMING_DIR") + exit(1) + + if len(os.listdir(ANMN_NRS_ERROR_DIR)) >= 2: + logger.critical("Operation aborted, too many files in ERROR_DIR") + exit(1) + + if not res.failures: + for level in [0, 1]: + date_str_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + TMP_MANIFEST_DIR = os.path.join( + DATA_WIP_PATH, "manifest_dir_tmp_{date}".format(date=date_str_now) + ) + os.makedirs(TMP_MANIFEST_DIR) + + process_qc_level(level) + + lines_per_file = 2**12 + file_list = list_recursively_files_abs_path(TMP_MANIFEST_DIR) + if len(file_list) > 0: + for file_number, lines in groupby( + enumerate(file_list), key=lambda x: x[0] // lines_per_file + ): + incoming_file = os.path.join( + DATA_WIP_PATH, + "anmn_nrs_aims_FV0{level}_{date}_{file_number}.manifest".format( + level=str(level), date=date_str_now, file_number=file_number + ), + ) + with open(incoming_file, "w") as outfile: + for item in lines: + outfile.write("%s\n" % item[1]) + + os.chmod(incoming_file, 0o0664) # change to 664 for pipeline v2 + shutil.move( + incoming_file, + os.path.join( + ANMN_NRS_INCOMING_DIR, os.path.basename(incoming_file) + ), + ) + + else: + logger.error("Data validation unittests failed") From ee57da535c9c7c8c71e8716c72a6bcadebd0e8d6 Mon Sep 17 00:00:00 2001 From: lbesnard Date: Thu, 29 Jan 2026 14:45:55 +1100 Subject: [PATCH 3/8] Fix: AIMS NRS - various improvments --- ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py | 315 +++++++++++------------- 1 file changed, 142 insertions(+), 173 deletions(-) diff --git a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py index ba90f2a0..971a20f0 100755 --- a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py +++ b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py @@ -33,7 +33,6 @@ import re import shutil import sys -import traceback import unittest as data_validation_test from itertools import groupby from pathlib import Path @@ -85,7 +84,6 @@ def modify_anmn_nrs_netcdf(netcdf_file_path, channel_id_info): # First pass: Generic AIMS modifications modify_aims_netcdf(netcdf_file_path, channel_id_info) - # Site and Platform Mapping (The Dictionary approach) site_map = { "Yongala": ("NRSYON", "Yongala NRS Buoy"), "Darwin": ("NRSDAR", "Darwin NRS Buoy"), @@ -159,7 +157,7 @@ def move_to_tmp_incoming(netcdf_path): and cleans up the now-empty source directory. 
""" logger = logging.getLogger(__name__) - # Convert to Path object for easier manipulation + source_file = Path(netcdf_path) source_dir = source_file.parent @@ -192,198 +190,165 @@ def move_to_tmp_incoming(netcdf_path): def process_monthly_channel(channel_id, aims_xml_info, level_qc): - """Downloads all the data available for one channel_id and moves the file to a wip_path dir - channel_id(str) - aims_xml_info(tuple) - level_qc(int) + """ + Downloads all the data available for one channel_id and moves the file to a wip_path dir aims_service : 1 -> FAIMMS data 100 -> SOOP TRV data 300 -> NRS DATA for monthly data download, only 1 and 300 should be use """ - logger.info( - "QC{level_qc} - Processing channel {channel_id}".format( - channel_id=str(channel_id), level_qc=str(level_qc) - ) - ) + contact_aims_msg = "Process of channel aborted - CONTACT AIMS" + wip_path = Path(os.environ.get("data_wip_path", "")) + + logger.info(f"QC{level_qc} - Processing channel {channel_id}") + channel_id_info = aims_xml_info[channel_id] from_date = channel_id_info["from_date"] thru_date = channel_id_info["thru_date"] - [start_dates, end_dates] = create_list_of_dates_to_download( + + # [start_dates, end_dates] generation + start_dates, end_dates = create_list_of_dates_to_download( channel_id, level_qc, from_date, thru_date ) - if len(start_dates) != 0: - # download monthly file - for start_date, end_date in zip(start_dates, end_dates): - start_date = start_date.strftime("%Y-%m-%dT%H:%M:%SZ") - end_date = end_date.strftime("%Y-%m-%dT%H:%M:%SZ") - netcdf_tmp_file_path = download_channel( - channel_id, start_date, end_date, level_qc + if not start_dates: + logger.info(f"QC{level_qc} - Channel {channel_id}: already up to date") + return + + # download monthly file + for start_dt, end_dt in zip(start_dates, end_dates): + start_date = start_dt.strftime("%Y-%m-%dT%H:%M:%SZ") + end_date = end_dt.strftime("%Y-%m-%dT%H:%M:%SZ") + + netcdf_tmp_file_path = download_channel( + channel_id, start_date, end_date, level_qc + ) + + if netcdf_tmp_file_path is None: + logger.error( + f" Channel {channel_id} - not valid zip file - {contact_aims_msg}" + ) + break + + tmp_dir = Path(netcdf_tmp_file_path).parent + + # NO_DATA_FOUND file only means there is no data for the selected time period. + # Could be some data afterwards + if is_no_data_found(netcdf_tmp_file_path): + logger.info( + f"Channel {channel_id}: No data for the time period:[{start_date} - {end_date}]" + ) + shutil.rmtree(tmp_dir) + continue # Move to next month + + # Start of validation sequence + error_occurred = False + + if is_time_var_empty(netcdf_tmp_file_path): + logger.error( + f"Channel {channel_id}: No values in TIME variable - {contact_aims_msg}" + ) + error_occurred = True + + elif not modify_anmn_nrs_netcdf(netcdf_tmp_file_path, channel_id_info): + logger.error( + f"Channel {channel_id}: Could not modify the NetCDF file - Process of channel aborted" ) - contact_aims_msg = "Process of channel aborted - CONTACT AIMS" + error_occurred = True - if netcdf_tmp_file_path is None: + else: + main_var = get_main_netcdf_var(netcdf_tmp_file_path) + if has_var_only_fill_value(netcdf_tmp_file_path, main_var): logger.error( - " Channel %s - not valid zip file - %s" - % (str(channel_id), contact_aims_msg) + f"Channel {channel_id}: _Fillvalues only in main variable - {contact_aims_msg}" ) - break - - # NO_DATA_FOUND file only means there is no data for the selected time period. 
Could be some data afterwards - if is_no_data_found(netcdf_tmp_file_path): - logger.info( - "Channel {channel_id}: No data for the time period:[{start_date} - {end_date}]".format( - channel_id=str(channel_id), - start_date=start_date, - end_date=end_date, - ) + error_occurred = True + elif not get_anmn_nrs_site_name(netcdf_tmp_file_path): + logger.error( + f"Channel {channel_id}: Unknown site_code gatt value - {contact_aims_msg}" ) - shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) - else: - if is_time_var_empty(netcdf_tmp_file_path): - logger.error( - "Channel {channel_id}: No values in TIME variable - {message}".format( - channel_id=str(channel_id), message=contact_aims_msg - ) - ) - shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) - break - - if not modify_anmn_nrs_netcdf(netcdf_tmp_file_path, channel_id_info): - logger.error( - "Channel{channel_id}: Could not modify the NetCDF file - Process of channel aborted".format( - channel_id=str(channel_id) - ) - ) - shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) - break - - main_var = get_main_netcdf_var(netcdf_tmp_file_path) - if has_var_only_fill_value(netcdf_tmp_file_path, main_var): - logger.error( - "Channel {channel_id}: _Fillvalues only in main variable - {message}".format( - channel_id=str(channel_id), message=contact_aims_msg - ) - ) - shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) - break - - if get_anmn_nrs_site_name(netcdf_tmp_file_path) == []: - logger.error( - "Channel {channel_id}: Unknown site_code gatt value - {message}".format( - channel_id=str(channel_id), message=contact_aims_msg - ) - ) - shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) - break - - if not is_time_monotonic(netcdf_tmp_file_path): - logger.error( - "Channel {channel_id}: TIME value is not strictly monotonic \ - - {message}".format( - channel_id=str(channel_id), message=contact_aims_msg - ) - ) - shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) - break - - # check every single file of the list. We don't assume that if one passes, all pass ... 
past proved this - wip_path = os.environ.get("data_wip_path") - checker_retval = pass_netcdf_checker( - netcdf_tmp_file_path, tests=["cf:1.6", "imos:1.3"] + error_occurred = True + elif not is_time_monotonic(netcdf_tmp_file_path): + logger.error( + f"Channel {channel_id}: TIME value is not strictly monotonic - {contact_aims_msg}" ) - if not checker_retval: - logger.error( - "Channel {channel_id}: File does not pass CF/IMOS compliance checker - Process of channel aborted".format( - channel_id=str(channel_id) - ) - ) - shutil.copy(netcdf_tmp_file_path, os.path.join(wip_path, "errors")) - - logger.error( - "File copied to {path} for debugging".format( - path=os.path.join( - wip_path, - "errors", - os.path.basename(netcdf_tmp_file_path), - ) - ) - ) - shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) - break + error_occurred = True - netcdf_tmp_file_path = fix_data_code_from_filename(netcdf_tmp_file_path) - netcdf_tmp_file_path = fix_provider_code_from_filename( - netcdf_tmp_file_path, "IMOS_ANMN" - ) + if error_occurred: + shutil.rmtree(tmp_dir) + break - if re.search("IMOS_ANMN_[A-Z]{1}_", netcdf_tmp_file_path) is None: - logger.error( - " Channel %s - File name Data code does not pass REGEX - Process of channel aborted" - % str(channel_id) - ) - shutil.copy(netcdf_tmp_file_path, os.path.join(wip_path, "errors")) - logger.error( - " File copied to %s for debugging" - % ( - os.path.join( - wip_path, - "errors", - os.path.basename(netcdf_tmp_file_path), - ) - ) - ) - shutil.rmtree(os.path.dirname(netcdf_tmp_file_path)) - break + # check every single file of the list. We don't assume that if one passes, all pass ... past proved this + if not pass_netcdf_checker(netcdf_tmp_file_path, tests=["cf:1.6", "imos:1.3"]): + logger.error( + f"Channel {channel_id}: File does not pass CF/IMOS compliance checker - Process of channel aborted" + ) - move_to_tmp_incoming(netcdf_tmp_file_path) + err_dest = wip_path / "errors" / os.path.basename(netcdf_tmp_file_path) + shutil.copy(netcdf_tmp_file_path, err_dest) - if TESTING: - # The 2 next lines download the first month only for every single channel. This is only used for testing - save_channel_info(channel_id, aims_xml_info, level_qc, end_date) - break + logger.error(f"File copied to {err_dest} for debugging") + shutil.rmtree(tmp_dir) + break - save_channel_info(channel_id, aims_xml_info, level_qc, end_date) + netcdf_tmp_file_path = fix_data_code_from_filename(netcdf_tmp_file_path) + netcdf_tmp_file_path = fix_provider_code_from_filename( + netcdf_tmp_file_path, "IMOS_ANMN" + ) - else: - logger.info( - "QC{level_qc} - Channel {channel_id}: already up to date".format( - channel_id=str(channel_id), level_qc=str(level_qc) + if not re.search(r"IMOS_ANMN_[A-Z]{1}_", netcdf_tmp_file_path): + logger.error( + f" Channel {channel_id} - File name Data code does not pass REGEX - Process of channel aborted" ) - ) + + err_dest = wip_path / "errors" / os.path.basename(netcdf_tmp_file_path) + shutil.copy(netcdf_tmp_file_path, err_dest) + + logger.error(f" File copied to {err_dest} for debugging") + shutil.rmtree(tmp_dir) + break + + move_to_tmp_incoming(netcdf_tmp_file_path) + + # Update tracking + save_channel_info(channel_id, aims_xml_info, level_qc, end_date) + + if TESTING: + # The 2 next lines download the first month only for every single channel. 
+ # This is only used for testing + # Note: save_channel_info already called above + break def process_qc_level(level_qc): - """Downloads all channels for a QC level - level_qc(int) : 0 or 1 """ - + Downloads all channels for a specific QC level (0 or 1). + """ logger.info( - "Process ANMN NRS download from AIMS web service - QC level {level_qc}".format( - level_qc=level_qc - ) + f"Process ANMN NRS download from AIMS web service - QC level {level_qc}" ) - xml_url = "https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level{level_qc}/300".format( - level_qc=level_qc + + xml_url = ( + f"https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level{level_qc}/300" ) + try: aims_xml_info = parse_aims_xml(xml_url) - except Exception as err: - logger.critical("RSS feed not available") + except Exception: + # Use exc_info=True to automatically attach the stack trace to the log + logger.critical(f"RSS feed not available at {xml_url}", exc_info=True) exit(1) - for channel_id in aims_xml_info.keys(): + # Iterate through channels + for channel_id in aims_xml_info: try: process_monthly_channel(channel_id, aims_xml_info, level_qc) - except Exception as err: - logger.error( - "QC{qc_level} - Channel {channel_id}: Failed, unknown reason - manual debug required".format( - channel_id=str(channel_id), qc_level=str(level_qc) - ) + except Exception: + # logger.exception automatically logs the error AND the traceback + logger.exception( + f"QC{level_qc} - Channel {channel_id}: Failed, unknown reason - manual debug required" ) - logger.error(traceback.print_exc()) class AimsDataValidationTest(data_validation_test.TestCase): @@ -404,34 +369,38 @@ def setUp(self): logger.info("Data validation unittests...") aims_xml_info = parse_aims_xml(xml_url) channel_id_info = aims_xml_info[channel_id] - self.netcdf_tmp_file_path = download_channel( - channel_id, from_date, thru_date, level_qc + self.nc_path = Path( + download_channel(channel_id, from_date, thru_date, level_qc) ) - modify_anmn_nrs_netcdf(self.netcdf_tmp_file_path, channel_id_info) + modify_anmn_nrs_netcdf(str(self.nc_path), channel_id_info) # force values of attributes which change all the time - netcdf_file_obj = Dataset(self.netcdf_tmp_file_path, "a", format="NETCDF4") - netcdf_file_obj.date_created = "1970-01-01T00:00:00Z" # epoch - netcdf_file_obj.history = "data validation test only" - netcdf_file_obj.NCO = "NCO_VERSION" - - netcdf_file_obj.close() + with Dataset(self.nc_path, "a") as nc: + nc.date_created = "1970-01-01T00:00:00Z" + nc.history = "data validation test only" + # Check if NCO attribute exists before forcing it + if hasattr(nc, "NCO"): + nc.NCO = "NCO_VERSION" def tearDown(self): - shutil.copy( - self.netcdf_tmp_file_path, - os.path.join( - os.environ["data_wip_path"], "nc_unittest_%s.nc" % self.md5_netcdf_value - ), - ) - shutil.rmtree(os.path.dirname(self.netcdf_tmp_file_path)) + wip_dir = Path(os.environ.get("data_wip_path", ".")) + + # Preserve the file for debugging before cleanup + # self.md5_netcdf_value needs to be calculated in the test method itself + if hasattr(self, "md5_netcdf_value"): + debug_name = f"nc_unittest_{self.md5_netcdf_value}.nc" + shutil.copy(self.nc_path, wip_dir / debug_name) + + # Cleanup: Remove the parent directory of the temp file + if self.nc_path.parent.exists(): + shutil.rmtree(self.nc_path.parent) def test_aims_validation(self): if sys.version_info[0] < 3: self.md5_expected_value = "76c9a595264a8173545b6dc0c518a280" else: self.md5_expected_value = MD5_EXPECTED_VALUE - self.md5_netcdf_value = 
md5(self.netcdf_tmp_file_path) + self.md5_netcdf_value = md5(str(self.nc_path)) self.assertEqual(self.md5_netcdf_value, self.md5_expected_value) From b905823cf1e793e229af1f7745a653897cacebe3 Mon Sep 17 00:00:00 2001 From: lbesnard Date: Mon, 2 Feb 2026 10:50:41 +1100 Subject: [PATCH 4/8] Fix: aims - DOXY var had wrong CF units --- lib/python/aims_realtime_util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/python/aims_realtime_util.py b/lib/python/aims_realtime_util.py index 21446e7c..f07bae14 100755 --- a/lib/python/aims_realtime_util.py +++ b/lib/python/aims_realtime_util.py @@ -728,6 +728,10 @@ def modify_aims_netcdf(netcdf_file_path, channel_id_info): var = netcdf_file_obj.variables["ALBD"] var.units = "1" + if "DOXY" in netcdf_file_obj.variables.keys(): + var = netcdf_file_obj.variables["DOXY"] + var.units = "kg m-3" # unit was milliliter/Liter which was not CF but equivalent anyway; Example channel 84900 + def clean_no_cf_variables(var, netcdf_file_obj): """ remove standard name of main variable and of its ancillary qc var if exists From 034dcabceabfaed8854115c0cc12f553f9ac8788 Mon Sep 17 00:00:00 2001 From: lbesnard Date: Mon, 2 Feb 2026 13:49:16 +1100 Subject: [PATCH 5/8] Fix: aims_realtime_utils modernisation (2) --- lib/python/aims_realtime_util.py | 394 +++++++++++++++++-------------- 1 file changed, 221 insertions(+), 173 deletions(-) diff --git a/lib/python/aims_realtime_util.py b/lib/python/aims_realtime_util.py index f07bae14..abed4d12 100755 --- a/lib/python/aims_realtime_util.py +++ b/lib/python/aims_realtime_util.py @@ -11,8 +11,8 @@ author Laurent Besnard, laurent.besnard@utas.edu.au """ -import datetime import glob +import hashlib import json import logging import os @@ -25,11 +25,15 @@ import time import xml.etree.ElementTree as ET import zipfile +from datetime import datetime, timedelta +from pathlib import Path from time import gmtime, strftime import dotenv import numpy import requests +from dateutil import rrule +from dateutil.relativedelta import relativedelta from six.moves.urllib.request import urlopen from six.moves.urllib_error import URLError @@ -37,11 +41,10 @@ from functools import lru_cache except ImportError: from functools32 import lru_cache -from netCDF4 import Dataset, date2num, num2date - -from retrying import retry from logging.handlers import TimedRotatingFileHandler +from netCDF4 import Dataset, date2num, num2date +from retrying import retry ##################### # Logging Functions # @@ -49,45 +52,54 @@ def logging_aims(): - """start logging using logging python library - output: - logger - similar to a file handler """ - wip_path = os.environ.get("data_wip_path") - # this is used for unit testing as data_wip_path env would not be set - if wip_path is None: - wip_path = tempfile.mkdtemp() + Starts logging using the standard library. + Returns a configured logger instance. 
+ """ + # Get wip_path from env; fallback to a temp directory for testing + wip_path_env = os.environ.get("data_wip_path") + wip_path = Path(wip_path_env) if wip_path_env else Path(tempfile.mkdtemp()) + + log_path = wip_path / "aims.log" - logging_format = ( + # Centralized Formatting + log_format = ( "%(asctime)s — %(name)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s" ) + formatter = logging.Formatter(log_format) - # set up logging to file - tmp_filename = tempfile.mkstemp(".log", "aims_data_download_")[1] - log_path = os.path.join(wip_path, "aims.log") - logging.basicConfig( - level=logging.INFO, format=logging_format, filename=tmp_filename, filemode="a+" - ) + # Initialize Root Logger + root_logger = logging.getLogger() + root_logger.setLevel(logging.DEBUG) # Capture everything at the root level - # rotate logs every Day, and keep only the last 5 log files - logHandler = TimedRotatingFileHandler( - log_path, - when="D", - interval=1, - backupCount=5, # backupCount files will be kept - ) - logHandler.setFormatter(logging.Formatter(logging_format)) - logHandler.setLevel(logging.DEBUG) - logging.getLogger("").addHandler(logHandler) + # Clear existing handlers to prevent duplicate logs if function is called twice + if root_logger.hasHandlers(): + root_logger.handlers.clear() - # define a Handler which writes DEBUG messages to the sys.stderr - logFormatter = logging.Formatter(logging_format) - consoleHandler = logging.StreamHandler() - consoleHandler.setLevel(logging.INFO) - consoleHandler.setFormatter(logFormatter) + # File Handler (Timed Rotation) + # Logic: Daily rotation, keep 5 backups + file_handler = TimedRotatingFileHandler( + filename=log_path, when="D", interval=1, backupCount=5, encoding="utf-8" + ) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(formatter) + root_logger.addHandler(file_handler) + + # Console Handler + # Logic: High-level INFO messages to stderr + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(formatter) + root_logger.addHandler(console_handler) + + # Debug logs to verify initialization + root_logger.debug("Logging initialized successfully.") + root_logger.debug(f"Log file location: {log_path}") + root_logger.debug( + f"Environment 'data_wip_path' was: {'Set' if wip_path_env else 'Not Set (using temp)'}" + ) - # add the console handler to the root logger - logging.getLogger("").addHandler(consoleHandler) + return root_logger #################### @@ -178,98 +190,145 @@ def save_channel_info( level_qc(int) : 0 or 1 last_downloaded_date_channel is a variable argument, not used by soop trv """ - pickle_file = _pickle_filename(level_qc) - last_downloaded_date = dict() - # condition in case the pickle file already exists or not. 
In the first case, - # aims_xml_info comes from the pickle, file, otherwise comes from the function arg - if os.path.isfile(pickle_file): - with open(pickle_file, "rb") as p_read: - aims_xml_info_file = pickle.load(p_read) - last_downloaded_date = aims_xml_info_file - - if not last_downloaded_date_channel: - # soop trv specific, vararg - last_downloaded_date[channel_id] = aims_xml_info[channel_id]["thru_date"] - else: - last_downloaded_date[channel_id] = last_downloaded_date_channel[0] + logger = logging.getLogger(__name__) + pickle_file = Path(_pickle_filename(level_qc)) + last_downloaded_data = {} + # Load existing data if file exists + if pickle_file.exists(): + try: + with pickle_file.open("rb") as p_read: + last_downloaded_data = pickle.load(p_read) + logger.debug(f"Loaded existing metadata from {pickle_file}") + except (EOFError, pickle.UnpicklingError): + logger.warning(f"Pickle file {pickle_file} was corrupt. Starting fresh.") + + # Determine the date (DRY - Don't Repeat Yourself) + if last_downloaded_date_channel: + new_date = last_downloaded_date_channel[0] + logger.debug(f"Using provided vararg date for {channel_id}: {new_date}") else: - if not last_downloaded_date_channel: - # soop trv specific, vararg - last_downloaded_date[channel_id] = aims_xml_info[channel_id]["thru_date"] - else: - last_downloaded_date[channel_id] = last_downloaded_date_channel[0] + new_date = aims_xml_info[channel_id]["thru_date"] + logger.debug(f"Extracted date from XML info for {channel_id}: {new_date}") - with open(pickle_file, "wb") as p_write: - pickle.dump(last_downloaded_date, p_write) + # Update and Save + last_downloaded_data[channel_id] = new_date + + with pickle_file.open("wb") as p_write: + pickle.dump(last_downloaded_data, p_write) + + logger.info(f"Successfully saved channel info for {channel_id} to {pickle_file}") def get_last_downloaded_date_channel(channel_id, level_qc, from_date): - """Retrieve the last date sucessfully downloaded for a channel""" - pickle_file = _pickle_filename(level_qc) # different pickle per QC - if os.path.isfile(pickle_file): - with open(pickle_file, "rb") as p_read: - last_downloaded_date = pickle.load(p_read) + """ + Retrieve the last date successfully downloaded for a channel. + Falls back to from_date if no record is found or the file is missing/corrupt. + """ + + logger = logging.getLogger(__name__) + pickle_path = Path(_pickle_filename(level_qc)) - if ( - channel_id in last_downloaded_date.keys() - ): # check the channel is in the pickle file - if last_downloaded_date[channel_id] is not None: - return last_downloaded_date[channel_id] + if not pickle_path.is_file(): + return from_date - return from_date + try: + with pickle_path.open("rb") as p_read: + last_downloaded_map = pickle.load(p_read) + + recorded_date = last_downloaded_map.get(channel_id) + return recorded_date if recorded_date is not None else from_date + + except (EOFError, pickle.UnpicklingError, Exception) as e: + # If the pickle is corrupt, we don't want to kill the pipeline. + # Log it and fall back to the provided from_date. + logger.warning( + f"Failed to read tracking file {pickle_path}: {e}. Falling back to {from_date}" + ) + return from_date def has_channel_already_been_downloaded(channel_id, level_qc): - pickle_file = _pickle_filename(level_qc) # different pickle per QC - if os.path.isfile(pickle_file): - with open(pickle_file, "rb") as p_read: - last_downloaded_date = pickle.load(p_read) + """ + Checks if a channel exists in the tracking pickle and has a valid date. 
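+    Returns False when the tracking pickle is missing or unreadable, or when the
+    recorded date for the channel is None.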
+ """ - if ( - channel_id in last_downloaded_date.keys() - ): # check the channel is in the pickle file - if ( - last_downloaded_date[channel_id] is not None - ): # check the last downloaded_date field - return True - else: - return False - else: - return False + logger = logging.getLogger(__name__) + pickle_path = Path(_pickle_filename(level_qc)) + # + # Early exit if file doesn't exist + if not pickle_path.is_file(): + logger.debug(f"No tracking file found at {pickle_path}") + return False - else: + try: + with pickle_path.open("rb") as p_read: + last_downloaded_date = pickle.load(p_read) + except (EOFError, pickle.UnpicklingError): + logger.error(f"Failed to read pickle file: {pickle_path}") return False + # Dictionary .get() returns None if key is missing + download_date = last_downloaded_date.get(channel_id) + exists = download_date is not None -def create_list_of_dates_to_download(channel_id, level_qc, from_date, thru_date): - """generate a list of monthly start dates and end dates to download FAIMMS and NRS data""" + logger.debug( + f"Channel {channel_id} download status: {exists} (Date: {download_date})" + ) - from dateutil import rrule - from datetime import datetime - from dateutil.relativedelta import relativedelta + return exists + + +def create_list_of_dates_to_download( + channel_id, level_qc, from_date_str, thru_date_str +): + """ + Generates lists of monthly start and end dates for data downloads. + Logic: Starts from the 1st of the month of the last download. + """ + + logger = logging.getLogger(__name__) + # date format + iso_format = "%Y-%m-%dT%H:%M:%SZ" + + # Retrieve last download date + last_dl_str = get_last_downloaded_date_channel(channel_id, level_qc, from_date_str) + + # Convert strings to datetime objects + thru_date = datetime.strptime(thru_date_str, iso_format) + last_dl_date = datetime.strptime(last_dl_str, iso_format) - last_downloaded_date = get_last_downloaded_date_channel( - channel_id, level_qc, from_date - ) start_dates = [] end_dates = [] - from_date = datetime.strptime(from_date, "%Y-%m-%dT%H:%M:%SZ") - thru_date = datetime.strptime(thru_date, "%Y-%m-%dT%H:%M:%SZ") - last_downloaded_date = datetime.strptime(last_downloaded_date, "%Y-%m-%dT%H:%M:%SZ") + # Only process if there is new data to get + if last_dl_date >= thru_date: + logger.info( + f"Channel {channel_id}: No new dates to download. 
" + f"Last download ({last_dl_date}) is >= thru_date ({thru_date})" + ) + return start_dates, end_dates + + # Generate Monthly Ranges + # We start at the beginning (1st) of the month of the last download + month_start = datetime(last_dl_date.year, last_dl_date.month, 1) + + logger.debug( + f"Generating monthly ranges for {channel_id} starting from {month_start}" + ) - if last_downloaded_date < thru_date: - for dt in rrule.rrule( - rrule.MONTHLY, - dtstart=datetime(last_downloaded_date.year, last_downloaded_date.month, 1), - until=thru_date, - ): - start_dates.append(dt) - end_dates.append(datetime(dt.year, dt.month, 1) + relativedelta(months=1)) + for dt in rrule.rrule(rrule.MONTHLY, dtstart=month_start, until=thru_date): + start_dates.append(dt) + # End date is exactly one month after the start of the current iteration + end_dates.append(dt + relativedelta(months=1)) + # Ensure the very last end date doesn't overshoot the requested thru_date + if end_dates: + original_end = end_dates[-1] end_dates[-1] = thru_date + logger.debug(f"Snapped final end date from {original_end} to {thru_date}") + logger.info(f"Generated {len(start_dates)} monthly intervals for {channel_id}") return start_dates, end_dates @@ -287,14 +346,15 @@ def list_recursively_files_abs_path(path): def md5(fname): - """return a md5 checksum of a file""" - import hashlib - - hash = hashlib.md5() + """Return an md5 checksum of a file.""" with open(fname, "rb") as f: + if hasattr(hashlib, "file_digest"): + return hashlib.file_digest(f, "md5").hexdigest() + + hash_obj = hashlib.md5() for chunk in iter(lambda: f.read(4096), b""): - hash.update(chunk) - return hash.hexdigest() + hash_obj.update(chunk) + return hash_obj.hexdigest() def get_main_netcdf_var(netcdf_file_path): @@ -889,64 +949,36 @@ def fix_data_code_from_filename(netcdf_file_path): It physically renames the filename if needed """ - netcdf_file_obj = Dataset(netcdf_file_path, "r", format="NETCDF4") - if "CDIR" in netcdf_file_obj.variables.keys(): - new_filename = re.sub("_CDIR_", "_V_", netcdf_file_path) - netcdf_file_obj.close() - shutil.move(netcdf_file_path, new_filename) - return new_filename - - if "CSPD" in netcdf_file_obj.variables.keys(): - new_filename = re.sub("_CSPD_", "_V_", netcdf_file_path) - netcdf_file_obj.close() - shutil.move(netcdf_file_path, new_filename) - return new_filename - - if "DOX1" in netcdf_file_obj.variables.keys(): - new_filename = re.sub("_Dissolved_O2_\(mole\)_", "_K_", netcdf_file_path) - netcdf_file_obj.close() - shutil.move(netcdf_file_path, new_filename) - return new_filename + logger = logging.getLogger(__name__) - if "DEPTH" in netcdf_file_obj.variables.keys(): - new_filename = re.sub("_DEPTH_", "_Z_", netcdf_file_path) - netcdf_file_obj.close() - shutil.move(netcdf_file_path, new_filename) - return new_filename + # Mapping of {Variable_Internal_Name: (Regex_Pattern, Replacement_Code)} + FILENAME_MAPPING = { + "CDIR": ("_CDIR_", "_V_"), + "CSPD": ("_CSPD_", "_V_"), + "DOX1": (r"_Dissolved_O2_\(mole\)_", "_K_"), + "DEPTH": ("_DEPTH_", "_Z_"), + "Dissolved_Oxygen_Percent": ("_DO_%_", "_O_"), + "ErrorVelocity": ("_ErrorVelocity_", "_V_"), + "Average_Compass_Heading": ("_Average_Compass_Heading_", "_E_"), + "Upwelling_longwave_radiation": ("_Upwelling_longwave_radiation_", "_F_"), + "Downwelling_longwave_radiation": ("_Downwelling_longwave_radiation_", "_F_"), + } - if "Dissolved_Oxygen_Percent" in netcdf_file_obj.variables.keys(): - new_filename = re.sub("_DO_%_", "_O_", netcdf_file_path) - netcdf_file_obj.close() - 
shutil.move(netcdf_file_path, new_filename) - return new_filename + with Dataset(netcdf_file_path, "r", format="NETCDF4") as nc: + found_var = next((var for var in FILENAME_MAPPING if var in nc.variables), None) - if "ErrorVelocity" in netcdf_file_obj.variables.keys(): - new_filename = re.sub("_ErrorVelocity_", "_V_", netcdf_file_path) - netcdf_file_obj.close() - shutil.move(netcdf_file_path, new_filename) - return new_filename + if found_var: + pattern, replacement = FILENAME_MAPPING[found_var] + new_filename = re.sub(pattern, replacement, str(netcdf_file_path)) - if "Average_Compass_Heading" in netcdf_file_obj.variables.keys(): - new_filename = re.sub("_Average_Compass_Heading_", "_E_", netcdf_file_path) - netcdf_file_obj.close() - shutil.move(netcdf_file_path, new_filename) - return new_filename + logger.debug(f"Renaming file based on variable '{found_var}': {new_filename}") - if "Upwelling_longwave_radiation" in netcdf_file_obj.variables.keys(): - new_filename = re.sub("_Upwelling_longwave_radiation_", "_F_", netcdf_file_path) - netcdf_file_obj.close() - shutil.move(netcdf_file_path, new_filename) - return new_filename + old_path = Path(netcdf_file_path) + new_path = old_path.with_name(Path(new_filename).name) - if "Downwelling_longwave_radiation" in netcdf_file_obj.variables.keys(): - new_filename = re.sub( - "_Downwelling_longwave_radiation_", "_F_", netcdf_file_path - ) - netcdf_file_obj.close() - shutil.move(netcdf_file_path, new_filename) - return new_filename + shutil.move(str(old_path), str(new_path)) + return str(new_path) - netcdf_file_obj.close() return netcdf_file_path @@ -989,20 +1021,36 @@ def remove_end_date_from_filename(netcdf_filename): def rm_tmp_dir(data_wip_path): - """remove temporary directories older than 15 days from data_wip path""" - for dir_path in os.listdir(data_wip_path): - if dir_path.startswith("manifest_dir_tmp_"): - file_date = datetime.datetime.strptime( - dir_path.split("_")[-1], "%Y%m%d%H%M%S" - ) - if (datetime.datetime.now() - file_date).days > 15: - logger = logging.getLogger(__name__) - logger.info( - "DELETE old temporary folder {path}".format( - path=os.path.join(data_wip_path, dir_path) - ) - ) - shutil.rmtree(os.path.join(data_wip_path, dir_path)) + """ + Remove temporary directories older than 15 days from data_wip path. 
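+    Folders whose name suffix is not a parsable timestamp are logged and skipped.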
+ Expected folder format: manifest_dir_tmp_YYYYMMDDHHMMSS + """ + + logger = logging.getLogger(__name__) + base_path = Path(data_wip_path) + if not base_path.is_dir(): + logger.warning(f"Cleanup skipped: {data_wip_path} is not a valid directory.") + return + + # Set threshold to 15 days ago + expiry_limit = datetime.now() - timedelta(days=15) + + for folder in base_path.glob("manifest_dir_tmp_*"): + try: + # Extract date string from the end of the folder name + date_str = folder.name.split("_")[-1] + folder_date = datetime.strptime(date_str, "%Y%m%d%H%M%S") + + if folder_date < expiry_limit: + logger.info(f"Deleting old temporary folder: {folder}") + shutil.rmtree(folder) + + except ValueError: + # This handles cases where the folder name matches the prefix + # but the suffix isn't a valid date + logger.debug(f"Skipping folder with invalid date format: {folder.name}") + except Exception as e: + logger.error(f"Failed to delete {folder}: {e}") def set_up(): From 1668f164228bde53bc616bec99949a7f2d58352e Mon Sep 17 00:00:00 2001 From: lbesnard Date: Mon, 2 Feb 2026 14:29:22 +1100 Subject: [PATCH 6/8] Fix: aims_realtime_utils modernisation (3) --- lib/python/aims_realtime_util.py | 38 ++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/lib/python/aims_realtime_util.py b/lib/python/aims_realtime_util.py index abed4d12..51b7ba75 100755 --- a/lib/python/aims_realtime_util.py +++ b/lib/python/aims_realtime_util.py @@ -358,23 +358,27 @@ def md5(fname): def get_main_netcdf_var(netcdf_file_path): - with Dataset(netcdf_file_path, mode="r") as netcdf_file_obj: - variables = netcdf_file_obj.variables - - variables.pop("TIME") - variables.pop("LATITUDE") - variables.pop("LONGITUDE") - - if "NOMINAL_DEPTH" in variables: - variables.pop("NOMINAL_DEPTH") - - qc_var = [s for s in variables if "_quality_control" in s] - if qc_var != []: - variables.pop(qc_var[0]) - - return [item for item in variables.keys()][0] - - return variables[0] + """ + Identifies the primary data variable in a NetCDF file by excluding + known coordinate and QC variables. + """ + with Dataset(netcdf_file_path, mode="r") as nc: + # Define the set of variables to ignore + excluded_vars = {"TIME", "LATITUDE", "LONGITUDE", "NOMINAL_DEPTH"} + + # Get all variable names as a list to avoid modifying the 'variables' object + var_names = list(nc.variables.keys()) + + # 1. Filter out the static coordinate names + # 2. 
Filter out any variable containing '_quality_control' + remaining_vars = [ + v + for v in var_names + if v not in excluded_vars and "_quality_control" not in v + ] + + # Return the first remaining variable if one exists, else None + return remaining_vars[0] if remaining_vars else None def is_above_file_limit(json_watchd_name): From e1f694efde03606605359eacdc29b83993e8dd22 Mon Sep 17 00:00:00 2001 From: lbesnard Date: Mon, 2 Feb 2026 17:26:18 +1100 Subject: [PATCH 7/8] Fix: aims_realtime_utils modernisation (4) --- ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py | 20 +- lib/python/aims_realtime_util.py | 253 +++++++++++++----------- 2 files changed, 157 insertions(+), 116 deletions(-) diff --git a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py index 971a20f0..ccaa7d98 100755 --- a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py +++ b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py @@ -64,7 +64,8 @@ from tendo import singleton from util import pass_netcdf_checker -MD5_EXPECTED_VALUE = "a6207e053f1cc0e00d171701f0cdb186" +MD5_EXPECTED_VALUE = "ba3bcf5d61134a338ee62c8f98033d00" +# MD5_EXPECTED_VALUE = "a6207e053f1cc0e00d171701f0cdb186" DATA_WIP_PATH = os.path.join( os.environ.get("WIP_DIR"), @@ -201,7 +202,16 @@ def process_monthly_channel(channel_id, aims_xml_info, level_qc): contact_aims_msg = "Process of channel aborted - CONTACT AIMS" wip_path = Path(os.environ.get("data_wip_path", "")) - logger.info(f"QC{level_qc} - Processing channel {channel_id}") + HL = "\x1b[1;35m" # Bold Magenta + RS = "\x1b[0m" + GREEN = "\033[92m" + ORANGE = "\033[38;5;208m" + RESET = "\033[0m" + YELLOW = "\033[33m" + + logger.info( + f"QC{level_qc} - {YELLOW}Processing channel{YELLOW} {HL}{channel_id}{RS}" + ) channel_id_info = aims_xml_info[channel_id] from_date = channel_id_info["from_date"] @@ -213,7 +223,9 @@ def process_monthly_channel(channel_id, aims_xml_info, level_qc): ) if not start_dates: - logger.info(f"QC{level_qc} - Channel {channel_id}: already up to date") + logger.info( + f"{GREEN}QC{level_qc} - Channel {channel_id}: already up to date{RESET}" + ) return # download monthly file @@ -237,7 +249,7 @@ def process_monthly_channel(channel_id, aims_xml_info, level_qc): # Could be some data afterwards if is_no_data_found(netcdf_tmp_file_path): logger.info( - f"Channel {channel_id}: No data for the time period:[{start_date} - {end_date}]" + f"{ORANGE}Channel {channel_id}: No data for the time period:[{start_date} - {end_date}]{RESET}" ) shutil.rmtree(tmp_dir) continue # Move to next month diff --git a/lib/python/aims_realtime_util.py b/lib/python/aims_realtime_util.py index 51b7ba75..e22d358f 100755 --- a/lib/python/aims_realtime_util.py +++ b/lib/python/aims_realtime_util.py @@ -19,13 +19,13 @@ import pickle import re import shutil -import subprocess import sys import tempfile import time import xml.etree.ElementTree as ET import zipfile from datetime import datetime, timedelta +from logging.handlers import TimedRotatingFileHandler from pathlib import Path from time import gmtime, strftime @@ -51,53 +51,71 @@ ##################### +class AimsColorFormatter(logging.Formatter): + """Custom formatter to add colors to console output only.""" + + # ANSI Codes + GREY = "\x1b[38;20m" + CYAN = "\x1b[36;20m" + YELLOW = "\x1b[33;20m" + RED = "\x1b[31;20m" + BOLD_RED = "\x1b[31;1m" + RESET = "\x1b[0m" + + log_format = ( + "%(asctime)s — %(name)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s" + ) + + LEVEL_COLORS = { + logging.DEBUG: GREY, + logging.INFO: CYAN, + logging.WARNING: YELLOW, + 
logging.ERROR: RED, + logging.CRITICAL: BOLD_RED, + } + + def format(self, record): + color = self.LEVEL_COLORS.get(record.levelno, self.RESET) + formatter = logging.Formatter(f"{color}{self.log_format}{self.RESET}") + return formatter.format(record) + + def logging_aims(): - """ - Starts logging using the standard library. - Returns a configured logger instance. - """ - # Get wip_path from env; fallback to a temp directory for testing + """Starts logging with colored console and plain-text file output.""" + wip_path_env = os.environ.get("data_wip_path") wip_path = Path(wip_path_env) if wip_path_env else Path(tempfile.mkdtemp()) - log_path = wip_path / "aims.log" - # Centralized Formatting - log_format = ( + # Standard plain formatter for the file + file_format = ( "%(asctime)s — %(name)s — %(levelname)s — %(funcName)s:%(lineno)d — %(message)s" ) - formatter = logging.Formatter(log_format) + file_formatter = logging.Formatter(file_format) - # Initialize Root Logger root_logger = logging.getLogger() - root_logger.setLevel(logging.DEBUG) # Capture everything at the root level + root_logger.setLevel(logging.DEBUG) - # Clear existing handlers to prevent duplicate logs if function is called twice if root_logger.hasHandlers(): root_logger.handlers.clear() - # File Handler (Timed Rotation) - # Logic: Daily rotation, keep 5 backups + # 1. File Handler (Plain text) file_handler = TimedRotatingFileHandler( filename=log_path, when="D", interval=1, backupCount=5, encoding="utf-8" ) file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(formatter) + file_handler.setFormatter(file_formatter) root_logger.addHandler(file_handler) - # Console Handler - # Logic: High-level INFO messages to stderr + # 2. Console Handler (Colored) console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) - console_handler.setFormatter(formatter) + + console_handler.setFormatter(AimsColorFormatter()) root_logger.addHandler(console_handler) - # Debug logs to verify initialization root_logger.debug("Logging initialized successfully.") - root_logger.debug(f"Log file location: {log_path}") - root_logger.debug( - f"Environment 'data_wip_path' was: {'Set' if wip_path_env else 'Not Set (using temp)'}" - ) + root_logger.info(f"Log file location: {log_path}") return root_logger @@ -412,92 +430,47 @@ def is_above_file_limit(json_watchd_name): @lru_cache(maxsize=100) def parse_aims_xml(xml_url): - """Download and parse the AIMS XML rss feed""" + """Download and parse the AIMS XML rss feed using a single-pass loop.""" logger = logging.getLogger(__name__) - logger.info("PARSE AIMS xml RSS feed : %s" % (xml_url)) - response = urlopen(xml_url) - html = response.read() - root = ET.fromstring(html) - - n_item_start = 3 # start number for AIMS xml file - - title = [] - link = [] - metadata_uuid = [] - uom = [] - from_date = [] - thru_date = [] - platform_name = [] - site_name = [] - channel_id = [] - parameter = [] - parameter_type = [] - trip_id = [] # soop trv only - - for n_item in range(n_item_start, len(root[0])): - title.append(root[0][n_item][0].text) - link.append(root[0][n_item][1].text) - metadata_uuid.append(root[0][n_item][6].text) - uom.append(root[0][n_item][7].text) - from_date.append(root[0][n_item][8].text) - thru_date.append(root[0][n_item][9].text) - platform_name.append(root[0][n_item][10].text) - site_name.append(root[0][n_item][11].text) - channel_id.append(root[0][n_item][12].text) - parameter.append(root[0][n_item][13].text) - parameter_type.append(root[0][n_item][14].text) - - # in case 
there is no trip id defined by AIMS, we create a fake one, used by SOOP TRV only + logger.info(f"PARSE AIMS xml RSS feed : {xml_url}") + + with urlopen(xml_url) as response: + root = ET.fromstring(response.read()) + + new_dict = {} + items = root[0] + n_item_start = 3 + + for i in range(n_item_start, len(items)): + node = items[i] + + # Extract channel_id first as it's our primary key + c_id = node[12].text + + # Handle the trip_id logic for SOOP TRV only try: - trip_id.append(root[0][n_item][15].text) + t_id = node[15].text except IndexError: - dateObject = time.strptime(root[0][n_item][8].text, "%Y-%m-%dT%H:%M:%SZ") - trip_id_fake = ( - str(dateObject.tm_year) - + str(dateObject.tm_mon).zfill(2) - + str(dateObject.tm_mday).zfill(2) - ) - trip_id.append(trip_id_fake) - - response.close() - d = [ - { - c: { - "title": ttl, - "channel_id": c, - "link": lk, - "metadata_uuid": muuid, - "uom": uo, - "from_date": fro, - "thru_date": thr, - "platform_name": pltname, - "site_name": stname, - "parameter": para, - "parameter_type": paratype, - "trip_id": trid, - } + # Create fake trip_id from from_date (node[8]) + date_str = node[8].text + date_obj = time.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ") + t_id = time.strftime("%Y%m%d", date_obj) + + # Build the entry directly into the final dictionary + new_dict[c_id] = { + "title": node[0].text, + "channel_id": c_id, + "link": node[1].text, + "metadata_uuid": node[6].text, + "uom": node[7].text, + "from_date": node[8].text, + "thru_date": node[9].text, + "platform_name": node[10].text, + "site_name": node[11].text, + "parameter": node[13].text, + "parameter_type": node[14].text, + "trip_id": t_id, } - for c, ttl, lk, muuid, uo, fro, thr, pltname, stname, para, paratype, trid in zip( - channel_id, - title, - link, - metadata_uuid, - uom, - from_date, - thru_date, - platform_name, - site_name, - parameter, - parameter_type, - trip_id, - ) - ] - - # re-writting the dict to have the channel key as a key value - new_dict = {} - for item in d: - for name in item.keys(): - new_dict[name] = item[name] return new_dict @@ -1003,17 +976,73 @@ def has_var_only_fill_value(netcdf_file_path, var): return False +# +# def remove_dimension_from_netcdf(netcdf_file_path): +# """DIRTY, calling bash. need to write in Python, or part of the NetCDF4 module +# need to remove the 'single' dimension name from DEPTH or other dim. Unfortunately can't seem to find a way to do it easily with netCDF4 module +# """ +# fd, tmp_file = tempfile.mkstemp() +# os.close(fd) +# import subprocess +# +# subprocess.check_call(["ncwa", "-O", "-a", "single", netcdf_file_path, tmp_file]) +# subprocess.check_call( +# ["ncatted", "-O", "-a", "cell_methods,,d,,", tmp_file, tmp_file] +# ) +# shutil.move(tmp_file, netcdf_file_path) +# +# def remove_dimension_from_netcdf(netcdf_file_path): - """DIRTY, calling bash. need to write in Python, or part of the NetCDF4 module - need to remove the 'single' dimension name from DEPTH or other dim. Unfortunately can't seem to find a way to do it easily with netCDF4 module + """ + Python replacement for NCO ncwa/ncatted. + Fixes the _FillValue AttributeError by passing it during variable creation. """ fd, tmp_file = tempfile.mkstemp() os.close(fd) - subprocess.check_call(["ncwa", "-O", "-a", "single", netcdf_file_path, tmp_file]) - subprocess.check_call( - ["ncatted", "-O", "-a", "cell_methods,,d,,", tmp_file, tmp_file] - ) + with Dataset(netcdf_file_path, "r") as src, Dataset(tmp_file, "w") as dst: + # 1. 
Copy global attributes + dst.setncatts(src.__dict__) + + hist_msg = "NetCDF file modified by remove_dimension_from_netcdf function" + if hasattr(dst, "history"): + # Append to existing history with a newline for readability + dst.history = f"{hist_msg}\n{dst.history}" + else: + # Create it if it doesn't exist + dst.history = hist_msg + + # 2. Copy dimensions EXCEPT 'single' + for name, dimension in src.dimensions.items(): + if name != "single": + dst.createDimension( + name, (len(dimension) if not dimension.isunlimited() else None) + ) + + # 3. Copy variables + for name, variable in src.variables.items(): + new_dims = tuple(d for d in variable.dimensions if d != "single") + + # --- THE FIX --- + # Check if source has a fill value. + # We use getattr because _FillValue is a reserved attribute name. + fill_val = getattr(variable, "_FillValue", None) + + # Create the variable with the fill_value already set + dst_var = dst.createVariable( + name, variable.datatype, new_dims, fill_value=fill_val + ) + # ---------------- + + # 4. Copy remaining Attributes (Replaces ncatted logic) + # We skip 'cell_methods' AND '_FillValue' (since we just set it) + for attr_name in variable.ncattrs(): + if attr_name not in ["cell_methods", "_FillValue"]: + dst_var.setncattr(attr_name, variable.getncattr(attr_name)) + + # 5. Copy Data + dst_var[:] = variable[:] + shutil.move(tmp_file, netcdf_file_path) From 64991e688b59ba4b34c99c1e26260e1e051f8a11 Mon Sep 17 00:00:00 2001 From: lbesnard Date: Mon, 2 Feb 2026 17:31:01 +1100 Subject: [PATCH 8/8] Fix: removing badly commited file --- ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new | 497 -------------------- 1 file changed, 497 deletions(-) delete mode 100755 ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new diff --git a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new b/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new deleted file mode 100755 index 24143f80..00000000 --- a/ANMN/NRS_AIMS/REALTIME/anmn_nrs_aims.py.new +++ /dev/null @@ -1,497 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Download ANMN NRS data from AIMS Web Service for Darwin, Yongala and Beagle -The script reads an XML file provided by AIMS and looks for channels with -new data to download. It compares this list with a pickle file (pythonic -way to store python variables) containing what has already been downloaded -in the previous run of this script. -Some modifications on the files have to be done so they comply with CF and -IMOS conventions. -The IOOS compliance checker is used to check if the first downloaded file of -a channel complies once modified. If not, the download of the rest of the -channel is aborted until some modification on the source code is done so -the channel can pass the checker. -Files which don't pass the checker will land in os.path.join(wip_path, 'errors') -for investigation. No need to reprocess them as they will be redownloaded on -next run until they end up passing the checker. Files in the 'errors' dir can be -removed at anytime - -IMPORTANT: -is it essential to look at the logging os.path.join(wip_path, 'aims.log') -to know which channels have problems and why as most of the time, AIMS will -have to be contacted to sort out issues. 
- - -author Laurent Besnard, laurent.besnard@utas.edu.au -""" - -import argparse -import datetime -import logging -import os -import re -import shutil -import sys -import traceback -import unittest as data_validation_test -from itertools import groupby -from pathlib import Path - -from aims_realtime_util import ( - convert_time_cf_to_imos, - create_list_of_dates_to_download, - download_channel, - fix_data_code_from_filename, - fix_provider_code_from_filename, - get_main_netcdf_var, - has_var_only_fill_value, - is_no_data_found, - is_time_monotonic, - is_time_var_empty, - list_recursively_files_abs_path, - logging_aims, - md5, - modify_aims_netcdf, - parse_aims_xml, - remove_dimension_from_netcdf, - remove_end_date_from_filename, - rm_tmp_dir, - save_channel_info, - set_up, -) -from dest_path import get_anmn_nrs_site_name -from netCDF4 import Dataset -from tendo import singleton -from util import pass_netcdf_checker - -DATA_WIP_PATH = os.path.join( - os.environ.get("WIP_DIR"), - "ANMN", - "NRS_AIMS_Darwin_Yongala_data_rss_download_temporary", -) -ANMN_NRS_INCOMING_DIR = os.path.join( - os.environ.get("INCOMING_DIR"), "AODN", "ANMN_NRS_DAR_YON" -) -ANMN_NRS_ERROR_DIR = os.path.join(os.environ["ERROR_DIR"], "ANMN_NRS_DAR_YON") - - -def modify_anmn_nrs_netcdf(netcdf_file_path, channel_id_info): - """Modify the downloaded netCDF file so it passes both CF and IMOS checker - input: - netcdf_file_path(str) : path of netcdf file to modify - channel_id_index(tupple) : information from xml for the channel - """ - modify_aims_netcdf(netcdf_file_path, channel_id_info) - - netcdf_file_obj = Dataset(netcdf_file_path, "a", format="NETCDF4") - netcdf_file_obj.aims_channel_id = int(channel_id_info["channel_id"]) - - if "Yongala" in channel_id_info["site_name"]: - netcdf_file_obj.site_code = "NRSYON" - netcdf_file_obj.platform_code = "Yongala NRS Buoy" - elif "Darwin" in channel_id_info["site_name"]: - netcdf_file_obj.site_code = "NRSDAR" - netcdf_file_obj.platform_code = "Darwin NRS Buoy" - elif "Beagle" in channel_id_info["site_name"]: - netcdf_file_obj.site_code = "DARBGF" - netcdf_file_obj.platform_code = "Beagle Gulf Mooring" - else: - return False - - if not (channel_id_info["metadata_uuid"] == "Not Available"): - netcdf_file_obj.metadata_uuid = channel_id_info["metadata_uuid"] - - # some weather stations channels don't have a depth variable if sensor above water - if "depth" in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables["depth"] - var.long_name = "nominal depth" - var.positive = "down" - var.axis = "Z" - var.reference_datum = "sea surface" - var.valid_min = -10.0 - var.valid_max = 30.0 - var.units = "m" # some channels put degrees celcius instead ... - netcdf_file_obj.renameVariable("depth", "NOMINAL_DEPTH") - - if "DEPTH" in netcdf_file_obj.variables.keys(): - var = netcdf_file_obj.variables["DEPTH"] - var.coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" - var.long_name = "actual depth" - var.reference_datum = "sea surface" - var.positive = "down" - var.valid_min = -10.0 - var.valid_max = 30.0 - var.units = "m" # some channels put degrees celcius instead ... - - netcdf_file_obj.close() - netcdf_file_obj = Dataset( - netcdf_file_path, "a", format="NETCDF4" - ) # need to close to save to file. 
as we call get_main_var just after - main_var = get_main_netcdf_var(netcdf_file_path) - # DEPTH, LATITUDE and LONGITUDE are not dimensions, so we make them into auxiliary cooordinate variables by adding this attribute - if "NOMINAL_DEPTH" in netcdf_file_obj.variables.keys(): - netcdf_file_obj.variables[ - main_var - ].coordinates = "TIME LATITUDE LONGITUDE NOMINAL_DEPTH" - else: - netcdf_file_obj.variables[main_var].coordinates = "TIME LATITUDE LONGITUDE" - - netcdf_file_obj.close() - - if not convert_time_cf_to_imos(netcdf_file_path): - return False - - remove_dimension_from_netcdf( - netcdf_file_path - ) # last modification to do in this order! - return True - - -def move_to_tmp_incoming(netcdf_path): - """ - Renames the NetCDF to include its MD5 hash, moves it to the manifest directory, - and cleans up the now-empty source directory. - """ - logger = logging.getLogger(__name__) - # Convert to Path object for easier manipulation - source_file = Path(netcdf_path) - source_dir = source_file.parent - - # Construct the new filename: [name_without_date].[md5].nc - # remove_end_date_from_filename returns a string, so we wrap it in Path - name_no_date = Path(remove_end_date_from_filename(str(source_file))).stem - file_hash = md5(str(source_file)) - new_filename = f"{name_no_date}.{file_hash}.nc" - - destination = Path(TMP_MANIFEST_DIR) / new_filename - - try: - # Apply permissions (664) - source_file.chmod(0o664) - - # Perform the move - shutil.move(str(source_file), str(destination)) - logger.info(f"Moved {source_file.name} to {destination}") - - # Cleanup: Delete the source directory if it is now empty - try: - source_dir.rmdir() - logger.debug(f"Cleaned up empty directory: {source_dir}") - except OSError: - logger.debug(f"Source directory not empty; skipping cleanup: {source_dir}") - - except Exception as e: - logger.error(f"Failed to move {source_file} to incoming: {e}") - raise - - -def process_monthly_channel(channel_id, aims_xml_info, level_qc): - """ - Downloads all the data available for one channel_id and moves the file to a wip_path dir - - aims_service : 1 -> FAIMMS data - 100 -> SOOP TRV data - 300 -> NRS DATA - for monthly data download, only 1 and 300 should be use - """ - contact_aims_msg = "Process of channel aborted - CONTACT AIMS" - wip_path = Path(os.environ.get("data_wip_path", "")) - - logger.info(f"QC{level_qc} - Processing channel {channel_id}") - - channel_id_info = aims_xml_info[channel_id] - from_date = channel_id_info["from_date"] - thru_date = channel_id_info["thru_date"] - - # [start_dates, end_dates] generation - start_dates, end_dates = create_list_of_dates_to_download( - channel_id, level_qc, from_date, thru_date - ) - - if not start_dates: - logger.info(f"QC{level_qc} - Channel {channel_id}: already up to date") - return - - # download monthly file - for start_dt, end_dt in zip(start_dates, end_dates): - start_date = start_dt.strftime("%Y-%m-%dT%H:%M:%SZ") - end_date = end_dt.strftime("%Y-%m-%dT%H:%M:%SZ") - - netcdf_tmp_file_path = download_channel( - channel_id, start_date, end_date, level_qc - ) - - if netcdf_tmp_file_path is None: - logger.error( - f" Channel {channel_id} - not valid zip file - {contact_aims_msg}" - ) - break - - tmp_dir = Path(netcdf_tmp_file_path).parent - - # NO_DATA_FOUND file only means there is no data for the selected time period. 
- # Could be some data afterwards - if is_no_data_found(netcdf_tmp_file_path): - logger.info( - f"Channel {channel_id}: No data for the time period:[{start_date} - {end_date}]" - ) - shutil.rmtree(tmp_dir) - continue # Move to next month - - # Start of validation sequence - error_occurred = False - - if is_time_var_empty(netcdf_tmp_file_path): - logger.error( - f"Channel {channel_id}: No values in TIME variable - {contact_aims_msg}" - ) - error_occurred = True - - elif not modify_anmn_nrs_netcdf(netcdf_tmp_file_path, channel_id_info): - logger.error( - f"Channel {channel_id}: Could not modify the NetCDF file - Process of channel aborted" - ) - error_occurred = True - - else: - main_var = get_main_netcdf_var(netcdf_tmp_file_path) - if has_var_only_fill_value(netcdf_tmp_file_path, main_var): - logger.error( - f"Channel {channel_id}: _Fillvalues only in main variable - {contact_aims_msg}" - ) - error_occurred = True - elif not get_anmn_nrs_site_name(netcdf_tmp_file_path): - logger.error( - f"Channel {channel_id}: Unknown site_code gatt value - {contact_aims_msg}" - ) - error_occurred = True - elif not is_time_monotonic(netcdf_tmp_file_path): - logger.error( - f"Channel {channel_id}: TIME value is not strictly monotonic - {contact_aims_msg}" - ) - error_occurred = True - - if error_occurred: - shutil.rmtree(tmp_dir) - break - - # check every single file of the list. We don't assume that if one passes, all pass ... past proved this - if not pass_netcdf_checker(netcdf_tmp_file_path, tests=["cf:1.6", "imos:1.3"]): - logger.error( - f"Channel {channel_id}: File does not pass CF/IMOS compliance checker - Process of channel aborted" - ) - - err_dest = wip_path / "errors" / os.path.basename(netcdf_tmp_file_path) - shutil.copy(netcdf_tmp_file_path, err_dest) - - logger.error(f"File copied to {err_dest} for debugging") - shutil.rmtree(tmp_dir) - break - - netcdf_tmp_file_path = fix_data_code_from_filename(netcdf_tmp_file_path) - netcdf_tmp_file_path = fix_provider_code_from_filename( - netcdf_tmp_file_path, "IMOS_ANMN" - ) - - if not re.search(r"IMOS_ANMN_[A-Z]{1}_", netcdf_tmp_file_path): - logger.error( - f" Channel {channel_id} - File name Data code does not pass REGEX - Process of channel aborted" - ) - - err_dest = wip_path / "errors" / os.path.basename(netcdf_tmp_file_path) - shutil.copy(netcdf_tmp_file_path, err_dest) - - logger.error(f" File copied to {err_dest} for debugging") - shutil.rmtree(tmp_dir) - break - - move_to_tmp_incoming(netcdf_tmp_file_path) - - # Update tracking - save_channel_info(channel_id, aims_xml_info, level_qc, end_date) - - if TESTING: - # The 2 next lines download the first month only for every single channel. 
- # This is only used for testing - # Note: save_channel_info already called above - break - - -def process_qc_level(level_qc): - """Downloads all channels for a QC level - level_qc(int) : 0 or 1 - """ - - logger.info( - "Process ANMN NRS download from AIMS web service - QC level {level_qc}".format( - level_qc=level_qc - ) - ) - xml_url = "https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level{level_qc}/300".format( - level_qc=level_qc - ) - try: - aims_xml_info = parse_aims_xml(xml_url) - except Exception as err: - logger.critical("RSS feed not available") - exit(1) - - for channel_id in aims_xml_info.keys(): - try: - process_monthly_channel(channel_id, aims_xml_info, level_qc) - except Exception as err: - logger.error( - "QC{qc_level} - Channel {channel_id}: Failed, unknown reason - manual debug required".format( - channel_id=str(channel_id), qc_level=str(level_qc) - ) - ) - logger.error(traceback.print_exc()) - - -class AimsDataValidationTest(data_validation_test.TestCase): - def setUp(self): - """Check that a the AIMS system or this script hasn't been modified. - This function checks that a downloaded file still has the same md5. - """ - channel_id = "84329" - from_date = "2016-01-01T00:00:00Z" - thru_date = "2016-01-02T00:00:00Z" - level_qc = 1 - aims_rss_val = 300 - xml_url = ( - "https://data.aims.gov.au/gbroosdata/services/rss/netcdf/level%s/%s" - % (str(level_qc), str(aims_rss_val)) - ) - - logger.info("Data validation unittests...") - aims_xml_info = parse_aims_xml(xml_url) - channel_id_info = aims_xml_info[channel_id] - self.netcdf_tmp_file_path = download_channel( - channel_id, from_date, thru_date, level_qc - ) - modify_anmn_nrs_netcdf(self.netcdf_tmp_file_path, channel_id_info) - EPOCH_ISO = "1970-01-01T00:00:00Z" - - netcdf_path = Path(self.netcdf_tmp_file_path) - - with Dataset(netcdf_path, mode="a", format="NETCDF4") as nc: - # force values of attributes which change all the time - nc.date_created = EPOCH_ISO - nc.history = "data validation test only" - nc.NCO = "NCO_VERSION" - - def tearDown(self): - shutil.copy( - self.netcdf_tmp_file_path, - os.path.join( - os.environ["data_wip_path"], "nc_unittest_%s.nc" % self.md5_netcdf_value - ), - ) - shutil.rmtree(os.path.dirname(self.netcdf_tmp_file_path)) - - def test_aims_validation(self): - if sys.version_info[0] < 3: - self.md5_expected_value = "76c9a595264a8173545b6dc0c518a280" - else: - self.md5_expected_value = "1bb65266f8e526ed2087904ae024e33d" - - self.md5_netcdf_value = md5(self.netcdf_tmp_file_path) - - self.assertEqual(self.md5_netcdf_value, self.md5_expected_value) - - -def args(): - """ - define the script arguments - :return: vargs - """ - parser = argparse.ArgumentParser() - parser.add_argument( - "-t", - "--testing", - action="store_true", - help="testing only - downloads the first month of each channel", - ) - - return parser.parse_args() - - -if __name__ == "__main__": - vargs = args() - me = singleton.SingleInstance() - os.environ["data_wip_path"] = os.path.join( - os.environ.get("WIP_DIR"), - "ANMN", - "NRS_AIMS_Darwin_Yongala_data_rss_download_temporary", - ) - global TMP_MANIFEST_DIR - global TESTING - - set_up() - - # initialise logging - logging_aims() - global logger - logger = logging.getLogger(__name__) - - # data validation test - runner = data_validation_test.TextTestRunner() - itersuite = data_validation_test.TestLoader().loadTestsFromTestCase( - AimsDataValidationTest - ) - res = runner.run(itersuite) - - if not DATA_WIP_PATH: - logger.critical("environment variable data_wip_path is not 
defined.") - exit(1) - - # script optional argument for testing only. used in process_monthly_channel - TESTING = vargs.testing - - rm_tmp_dir(DATA_WIP_PATH) - - if len(os.listdir(ANMN_NRS_INCOMING_DIR)) >= 2: - logger.critical("Operation aborted, too many files in INCOMING_DIR") - exit(1) - - if len(os.listdir(ANMN_NRS_ERROR_DIR)) >= 2: - logger.critical("Operation aborted, too many files in ERROR_DIR") - exit(1) - - if not res.failures: - for level in [0, 1]: - date_str_now = datetime.datetime.now().strftime("%Y%m%d%H%M%S") - TMP_MANIFEST_DIR = os.path.join( - DATA_WIP_PATH, "manifest_dir_tmp_{date}".format(date=date_str_now) - ) - os.makedirs(TMP_MANIFEST_DIR) - - process_qc_level(level) - - lines_per_file = 2**12 - file_list = list_recursively_files_abs_path(TMP_MANIFEST_DIR) - if len(file_list) > 0: - for file_number, lines in groupby( - enumerate(file_list), key=lambda x: x[0] // lines_per_file - ): - incoming_file = os.path.join( - DATA_WIP_PATH, - "anmn_nrs_aims_FV0{level}_{date}_{file_number}.manifest".format( - level=str(level), date=date_str_now, file_number=file_number - ), - ) - with open(incoming_file, "w") as outfile: - for item in lines: - outfile.write("%s\n" % item[1]) - - os.chmod(incoming_file, 0o0664) # change to 664 for pipeline v2 - shutil.move( - incoming_file, - os.path.join( - ANMN_NRS_INCOMING_DIR, os.path.basename(incoming_file) - ), - ) - - else: - logger.error("Data validation unittests failed")