From 9fea38a323cf974b8c6f11669a5ced3516c794d7 Mon Sep 17 00:00:00 2001 From: William Horn Date: Mon, 1 Dec 2025 14:33:06 -0900 Subject: [PATCH] Update chipping to allow for specific dates --- src/satchip/chip_data.py | 56 ++++++++++++++++++++++++-------- src/satchip/chip_hls.py | 46 +++++++++++++++++++++------ src/satchip/chip_operas1rtc.py | 47 +++++++++++++++++++++------ src/satchip/chip_sentinel2.py | 58 +++++++++++++++++++++++----------- src/satchip/utils.py | 12 +++++-- 5 files changed, 168 insertions(+), 51 deletions(-) diff --git a/src/satchip/chip_data.py b/src/satchip/chip_data.py index 847e82d..d6b3041 100644 --- a/src/satchip/chip_data.py +++ b/src/satchip/chip_data.py @@ -1,4 +1,5 @@ import argparse +import glob from collections import Counter from datetime import datetime from pathlib import Path @@ -74,8 +75,7 @@ def chip_data( def create_chips( label_paths: list[Path], platform: str, - date_start: datetime, - date_end: datetime, + dates: utils.DateRange | list[datetime], strategy: str, max_cloud_pct: int, chip_dir: Path, @@ -84,7 +84,7 @@ def create_chips( platform_dir = chip_dir / platform platform_dir.mkdir(parents=True, exist_ok=True) - opts: utils.ChipDataOpts = {'strategy': strategy, 'date_start': date_start, 'date_end': date_end} + opts: utils.ChipDataOpts = {'strategy': strategy, 'dates': dates} if platform in ['S2L2A', 'HLS']: opts['max_cloud_pct'] = max_cloud_pct @@ -94,9 +94,11 @@ def create_chips( duplicates = [name for name, count in Counter(chip_names).items() if count > 1] msg = f'Duplicate sample locations not supported. Duplicate chips: {", ".join(duplicates)}' raise NotImplementedError(msg) + chip_paths = [ platform_dir / (x.with_suffix('').with_suffix('').name + f'_{platform}.zarr.zip') for x in label_paths ] + if platform == 'HYP3S1RTC': rtc_paths_for_chips = get_rtc_paths_for_chips(chips, image_dir, opts) opts['local_hyp3_paths'] = rtc_paths_for_chips @@ -109,11 +111,26 @@ def create_chips( def main() -> None: parser = argparse.ArgumentParser(description='Chip a label image') - parser.add_argument('labelpath', type=Path, help='Path to the label directory') + + parser.add_argument('labels', type=str, help='Path or Glob pattern for label chips') + parser.add_argument( 'platform', choices=['S1RTC', 'S2L2A', 'HLS', 'HYP3S1RTC'], type=str, help='Dataset to create chips for' ) - parser.add_argument('daterange', type=str, help='Inclusive date range to search for data in the format Ymd-Ymd') + + date_group = parser.add_mutually_exclusive_group(required=True) + + date_group.add_argument( + '--daterange', + nargs=2, + type=str, + metavar=('START', 'END'), + help='Inclusive date range in the format YYYY-mm-dd YYYY-mm-dd', + ) + date_group.add_argument( + '--dates', nargs='+', type=str, help='Space-separated list of specific dates in format YYYY-mm-dd' + ) + parser.add_argument('--maxcloudpct', default=100, type=int, help='Maximum percent cloud cover for a data chip') parser.add_argument('--chipdir', default='.', type=Path, help='Output directory for the chips') parser.add_argument( @@ -122,24 +139,37 @@ def main() -> None: parser.add_argument( '--strategy', default='BEST', - choices=['BEST', 'ALL'], + choices=['BEST', 'ALL', 'SPECIFIC'], type=str, help='Strategy to use when multiple scenes are found (default: BEST)', ) args = parser.parse_args() args.platform = args.platform.upper() + assert 0 <= args.maxcloudpct <= 100, 'maxcloudpct must be between 0 and 100' - date_start, date_end = [datetime.strptime(d, '%Y%m%d') for d in args.daterange.split('-')] - assert date_start < date_end, 'start date must be before end date' - label_paths = list(args.labelpath.glob('*.zarr.zip')) - assert len(label_paths) > 0, f'No label files found in {args.labelpath}' + + dates: utils.DateRange | list[datetime] + if args.daterange: + start_date = datetime.strptime(args.daterange[0], '%Y-%m-%d') + end_date = datetime.strptime(args.daterange[1], '%Y-%m-%d') + assert start_date < end_date, 'start date must be before end date' + + dates = utils.DateRange(start_date, end_date) + else: + dates = [datetime.strptime(d, '%Y-%m-%d') for d in args.dates] + args.strategy = 'SPECIFIC' + + if '*' in args.labels or '?' in args.labels or '[' in args.labels: + label_paths = [Path(p) for p in glob.glob(args.labels)] + else: + label_paths = list(Path(args.labels).glob('*.zarr.zip')) + + assert len(label_paths) > 0, f'No label files found in {args.labels}' if args.imagedir is None: args.imagedir = args.chipdir / 'IMAGES' - create_chips( - label_paths, args.platform, date_start, date_end, args.strategy, args.maxcloudpct, args.chipdir, args.imagedir - ) + create_chips(label_paths, args.platform, dates, args.strategy, args.maxcloudpct, args.chipdir, args.imagedir) if __name__ == '__main__': diff --git a/src/satchip/chip_hls.py b/src/satchip/chip_hls.py index 4452356..76a833f 100644 --- a/src/satchip/chip_hls.py +++ b/src/satchip/chip_hls.py @@ -15,6 +15,8 @@ from satchip.terra_mind_grid import TerraMindChip +earthaccess.login() + HLS_L_BANDS = OrderedDict( { 'B01': 'COASTAL', @@ -78,19 +80,29 @@ def get_scenes( Returns: The best HLS items. """ - assert strategy in ['BEST', 'ALL'], 'Strategy must be either BEST or ALL' + assert strategy in ['BEST', 'ALL', 'SPECIFIC'], 'Strategy must be either BEST or ALL' overlapping_items = [x for x in items if get_pct_intersect(x['umm'], roi) > 95] best_first = sorted(overlapping_items, key=lambda x: (-get_pct_intersect(x['umm'], roi), get_date(x['umm']))) + valid_scenes = [] + for item in best_first: product_id = get_product_id(item['umm']) n_products = len(list(image_dir.glob(f'{product_id}*'))) + if n_products < 15: + print(f'Downloading {product_id}...') + breakpoint() earthaccess.download([item], image_dir, pqdm_kwargs={'disable': True}) + print('Done Downloading...') + fmask_path = image_dir / f'{product_id}.v2.0.Fmask.tif' + assert fmask_path.exists(), f'File not found: {fmask_path}' + qual_da = rioxarray.open_rasterio(fmask_path).rio.clip_box(*roi.bounds, crs='EPSG:4326') # type: ignore bit_masks = np.unpackbits(qual_da.data[0][..., np.newaxis], axis=-1) + # Looks for a 1 in the 4th, 6th and 7th bit of the Fmask (reverse order). See table 9 and appendix A of: # https://lpdaac.usgs.gov/documents/1698/HLS_User_Guide_V2.pdf bad_pixels = (bit_masks[..., 4] == 1) | (bit_masks[..., 6] == 1) | (bit_masks[..., 7] == 1) @@ -104,22 +116,38 @@ def get_scenes( return valid_scenes +def search_for_data(dates: utils.DateRange | list[datetime], bounds: utils.Bounds) -> list: + results = [] + if isinstance(dates, utils.DateRange): + results = earthaccess.search_data( + short_name=['HLSL30', 'HLSS30'], bounding_box=bounds, temporal=(dates.start, dates.end + timedelta(days=1)) + ) + else: + for date in dates: + day_results = earthaccess.search_data( + short_name=['HLSL30', 'HLSS30'], bounding_box=bounds, temporal=(date, date + timedelta(days=1)) + ) + results.extend(day_results) + + return results + + def get_hls_data(chip: TerraMindChip, image_dir: Path, opts: utils.ChipDataOpts) -> xr.Dataset: """Returns XArray DataArray of a Harmonized Landsat Sentinel-2 image for the given bounds and closest collection after date. """ - date_start = opts['date_start'] - date_end = opts['date_end'] + timedelta(days=1) # inclusive end - earthaccess.login() - results = earthaccess.search_data( - short_name=['HLSL30', 'HLSS30'], bounding_box=chip.bounds, temporal=(date_start, date_end) - ) - assert len(results) > 0, f'No HLS scenes found for chip {chip.name} between {date_start} and {date_end}.' + dates = opts['dates'] + + results = search_for_data(dates, utils.Bounds(*chip.bounds)) + assert len(results) > 0, f'No HLS scenes found for chip {chip.name} {utils.dates_error_msg(dates)}.' + roi = shapely.box(*chip.bounds) roi_buffered = roi.buffer(0.01) max_cloud_pct = opts.get('max_cloud_pct', 100) strategy = opts.get('strategy', 'BEST').upper() - timesteps = get_scenes(results, roi, max_cloud_pct, strategy, image_dir) + + timesteps = get_scenes(results, roi_buffered, max_cloud_pct, strategy, image_dir) + template = create_template_da(chip) timestep_arrays = [] for scene in timesteps: diff --git a/src/satchip/chip_operas1rtc.py b/src/satchip/chip_operas1rtc.py index 5811998..aa5522a 100644 --- a/src/satchip/chip_operas1rtc.py +++ b/src/satchip/chip_operas1rtc.py @@ -87,25 +87,54 @@ def get_scenes(groups: list[RTCGroup], roi: shapely.geometry.Polygon, strategy: return intersecting[:1] elif strategy == 'ALL': return intersecting + elif strategy == 'SPECIFIC': + return intersecting else: raise ValueError(f'Strategy must be either BEST or ALL. Got {strategy}') +def search_for_data(dates: utils.DateRange | list[datetime], bounds: utils.Bounds) -> list: + earthaccess.login() + results = [] + + if isinstance(dates, utils.DateRange): + results = earthaccess.search_data( + short_name=['OPERA_L2_RTC-S1_V1'], + bounding_box=bounds, + temporal=(dates.start, dates.end + timedelta(days=1)), + ) + else: + for date in dates: + day_results = earthaccess.search_data( + short_name=['OPERA_L2_RTC-S1_V1'], bounding_box=bounds, temporal=(date, date + timedelta(days=1)) + ) + results.extend(day_results) + + return results + + def get_operartc_data(chip: TerraMindChip, image_dir: Path, opts: utils.ChipDataOpts) -> xr.Dataset: """Returns XArray DataArray of a OPERA S1-RTC for the given chip and selection startegy.""" - date_start = opts['date_start'] - date_end = opts['date_end'] + timedelta(days=1) # inclusive end - earthaccess.login() - results = earthaccess.search_data( - short_name=['OPERA_L2_RTC-S1_V1'], bounding_box=chip.bounds, temporal=(date_start, date_end) - ) - results = filter_to_dualpol(results) - rtc_groups = group_rtcs(results) + + dates = opts['dates'] + roi = shapely.box(*chip.bounds) roi_buffered = roi.buffer(0.01) + + results = search_for_data(dates, utils.Bounds(*roi_buffered.bounds)) + dualpol = filter_to_dualpol(results) + + rtc_groups = group_rtcs(dualpol) strategy = opts.get('strategy', 'BEST').upper() timesteps = get_scenes(rtc_groups, roi_buffered, strategy) - assert len(timesteps) > 0, f'No OPERA RTC scenes found for chip {chip.name} between {date_start} and {date_end}.' + + if len(timesteps) <= 0: + breakpoint() + f'No OPERA RTC scenes found for chip {chip.name} {utils.dates_error_msg(dates)}' + + if isinstance(dates, list): + assert len(timesteps) == len(dates) + vrts = [timestep.download(image_dir) for timestep in timesteps] template = create_template_da(chip) timestep_arrays = [] diff --git a/src/satchip/chip_sentinel2.py b/src/satchip/chip_sentinel2.py index f4725b4..e2484fa 100644 --- a/src/satchip/chip_sentinel2.py +++ b/src/satchip/chip_sentinel2.py @@ -96,6 +96,7 @@ def get_scenes( items = [item for item in items if get_pct_intersect(item.geometry, roi) > 0.95] best_first = sorted(items, key=lambda x: (-get_pct_intersect(x.geometry, roi), x.datetime)) valid_scenes = [] + for item in best_first: scl_href = item.assets['scl'].href local_path = fetch_s3_file(scl_href, image_dir) @@ -131,6 +132,36 @@ def get_s2_version(item: Item) -> int: return latest_items +def search_for_data(dates: utils.DateRange | list[datetime], roi: shapely.Polygon) -> list: + results = [] + client = Client.open('https://earth-search.aws.element84.com/v1') + + if isinstance(dates, utils.DateRange): + date_end = dates.end + timedelta(days=1) + date_range = f'{datetime.strftime(dates.start, "%Y-%m-%d")}/{datetime.strftime(date_end, "%Y-%m-%d")}' + search = client.search( + collections=['sentinel-2-l2a'], + intersects=roi, + datetime=date_range, + max_items=1000, + ) + + results = list(search.item_collection()) + else: + for date in dates: + search = client.search( + collections=['sentinel-2-l2a'], + intersects=roi, + datetime=datetime.strftime(date, '%Y-%m-%d'), + max_items=1000, + ) + results.extend(search.item_collection()) + + results = get_latest_image_versions(results) + + return results + + def get_s2l2a_data(chip: TerraMindChip, image_dir: Path, opts: utils.ChipDataOpts) -> xr.Dataset: """Get XArray DataArray of Sentinel-2 L2A image for the given bounds and best collection parameters. @@ -146,28 +177,19 @@ def get_s2l2a_data(chip: TerraMindChip, image_dir: Path, opts: utils.ChipDataOpt Returns: XArray Dataset containing the Sentinel-2 L2A image data. """ - date_start = opts['date_start'] - date_end = opts['date_end'] + timedelta(days=1) # inclusive end - date_range = f'{datetime.strftime(date_start, "%Y-%m-%d")}/{datetime.strftime(date_end, "%Y-%m-%d")}' + dates = opts['dates'] + roi = shapely.box(*chip.bounds) roi_buffered = roi.buffer(0.01) - client = Client.open('https://earth-search.aws.element84.com/v1') - search = client.search( - collections=['sentinel-2-l2a'], - intersects=roi, - datetime=date_range, - max_items=1000, - ) - assert len(search.item_collection()) > 0, ( - f'No Sentinel-2 L2A scenes found for chip {chip.name} between {date_start} and {date_end}.' - ) - assert len(search.item_collection()) < 1000, ( - 'Too many Sentinel-2 L2A scenes found for chip. Please narrow the date range.' - ) - items = list(search.item_collection()) - items = get_latest_image_versions(items) + + items = search_for_data(dates, roi) max_cloud_pct = opts.get('max_cloud_pct', 100) strategy = opts.get('strategy', 'BEST') + + assert len(items) > 0, ( + f'No Sentinel-2 L2A scenes found for chip {chip.name} between {utils.dates_error_msg(dates)}.' + ) + assert len(items) < 1000, 'Too many Sentinel-2 L2A scenes found for chip. Please narrow the date range.' timesteps = get_scenes(items, roi, strategy, max_cloud_pct, image_dir) urls = [item.assets[band.lower()].href for item in timesteps for band in S2_BANDS.values()] diff --git a/src/satchip/utils.py b/src/satchip/utils.py index a4a54ca..98700c2 100644 --- a/src/satchip/utils.py +++ b/src/satchip/utils.py @@ -20,10 +20,14 @@ class Bounds(NamedTuple): maxy: float +class DateRange(NamedTuple): + start: datetime.datetime + end: datetime.datetime + + class ChipDataRequiredOpts(TypedDict): strategy: str - date_start: datetime.datetime - date_end: datetime.datetime + dates: DateRange | list[datetime.datetime] class ChipDataOpts(ChipDataRequiredOpts, total=False): @@ -31,6 +35,10 @@ class ChipDataOpts(ChipDataRequiredOpts, total=False): local_hyp3_paths: dict[str, list[RtcImageSet]] +def dates_error_msg(dates: DateRange | list[datetime.datetime]) -> str: + return f'between {dates.start} and {dates.end}' if isinstance(dates, DateRange) else f'for dates {dates}' + + def get_overall_bounds(bounds: list) -> Bounds: minx = min([b[0] for b in bounds]) miny = min([b[1] for b in bounds])