From ee53f5dd47aa3be571108f2ec21529a6edd20d44 Mon Sep 17 00:00:00 2001 From: Kendall Smith Date: Thu, 12 May 2022 14:48:40 -0700 Subject: [PATCH 1/2] fixed fork divergence --- ...adiant-mlhub-on-demand-training-data.ipynb | 3532 +++++++++++++++++ 1 file changed, 3532 insertions(+) create mode 100644 tutorials/radiant-mlhub-on-demand-training-data.ipynb diff --git a/tutorials/radiant-mlhub-on-demand-training-data.ipynb b/tutorials/radiant-mlhub-on-demand-training-data.ipynb new file mode 100644 index 00000000..6a826c36 --- /dev/null +++ b/tutorials/radiant-mlhub-on-demand-training-data.ipynb @@ -0,0 +1,3532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a7cf916c-0991-4af6-889f-a1ce86696d46", + "metadata": {}, + "source": [ + "## On Demand Training Data from Radiant MLHub and Planetary Computer\n", + "\n", + "Radiant MLHub Logo" + ] + }, + { + "cell_type": "markdown", + "id": "010b5b89-32c4-4f6b-81fb-f41d782d251f", + "metadata": {}, + "source": [ + "In this tutorial, we will walk through the process of requesting on-demand training data from the [Planetary Computer Data Catalog](https://planetarycomputer.microsoft.com/catalog) to pair with the [BigEarthNet](https://mlhub.earth/data/bigearthnet_v1) dataset downloaded from Radiant MLHub. This is an important workflow for someone in the geospatial community who wants to train an ML model on a datasource outside of a prepackaged dataset, such as those found on MLHub. They can start with any dataset containing source image and label collections in STAC, obtain a random sample to work with, fetch source images from a different collection or satellite product, and then reproject and crop those images to match the spatial and temporal extent of the original dataset.\n", + "\n", + "**NOTE:** because the workflow documented below uses libraries like `pystac_client` and `stackstac`, the datasets queried need to be organized into STAC Collections." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f130b365-6fff-4d2a-86a9-39085ab13886", + "metadata": {}, + "source": [ + "Let's start by importing the Python libraries we'll use in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fe7b447-cf8a-4cc7-aaed-4e8407d0f270", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade wget # not installed on PC by default" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "7e144460-5549-4ab4-ba98-10a1a7ebd236", + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import tempfile\n", + "from pathlib import Path\n", + "import os\n", + "import json\n", + "from glob import glob\n", + "import requests\n", + "from typing import List, Tuple\n", + "from datetime import datetime as dt\n", + "from datetime import timedelta as td\n", + "\n", + "from radiant_mlhub import Collection\n", + "import planetary_computer\n", + "import pystac_client\n", + "from pystac import ItemCollection, Item, Asset\n", + "import dask\n", + "\n", + "import numpy as np\n", + "from stackstac import stack\n", + "from geopandas import GeoDataFrame\n", + "import rasterio as rio\n", + "import rioxarray\n", + "from xarray import DataArray\n", + "from shapely.geometry import shape\n", + "from shapely.geometry import Polygon\n", + "from pyproj import CRS" + ] + }, + { + "cell_type": "markdown", + "id": "3a5bb87b-9d9c-4140-bac8-3e95f146c029", + "metadata": {}, + "source": [ + "### Define global variables" + ] + }, + { + "cell_type": "markdown", + "id": "2de2f657-3229-4037-bf9c-541c503cc269", + "metadata": {}, + "source": [ + "In addition to the API key, we will also need to define some other initial global variables to get our workflow started. e.g. a temporary working directory to download and write data to, the STAC API endpoints, names of Collections, and other variables like the RGB bands for those collections. 
These are pretty flexible depending on your individual needs." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f3bf07a1-3a4a-4207-a449-7be766fa7e36", + "metadata": {}, + "outputs": [], + "source": [ + "# Temporary working directory on local machine or PC instance\n", + "TMP_DIR = tempfile.gettempdir()\n", + "\n", + "# API endpoints for MLHub and Planetary Computer catalogs\n", + "MLHUB_API_URL = \"https://api.radiant.earth/mlhub/v1\"\n", + "MSPC_API_URL = \"https://planetarycomputer.microsoft.com/api/stac/v1\"\n", + "\n", + "# Names of Collections that will be queried against using pystac_client\n", + "BIGEARTHNET_SOURCE_COLLECTION = \"bigearthnet_v1_source\" # sentinel-2 source imagery\n", + "BIGEARTHNET_LABEL_COLLECTION = \"bigearthnet_v1_labels\" # geojson classification labels\n", + "PLANETARY_COMPUTER_LANDSAT_8 = \"landsat-8-c2-l2\" # landsat 8 source imagery on PC\n", + "OUTPUT_DIR = \"landsat_8_source\"\n", + "\n", + "# Default variables that will be used in the API queries\n", + "BIGEARTHNET_TIME_RANGE = \"2017-06-01/2018-05-31\" # full date range for BigEarthNet\n", + "LABEL_CRS = CRS(\"EPSG:4326\")\n", + "DATE_BUFFER = 60 # days searched before and after the source Item datetime\n", + "LANDSAT_8_RGB_BANDS = [\"SR_B4\", \"SR_B3\", \"SR_B2\"] # names of RGB bands from PC Landsat 8\n", + "BIGEARTHNET_RGB_BANDS = [\"B04\", \"B03\", \"B02\"] # names of RGB bands from BigEarthNet\n", + "\n", + "# Bounding box for demonstration fetching Items over Luxembourg\n", + "LUXEMBOURG_AOI = [6.06, 49.58, 6.21, 49.66] # aoi around Luxembourg" + ] + }, + { + "cell_type": "markdown", + "id": "5d5e31b3-5cde-4a7b-af43-dc194b06d0a0", + "metadata": {}, + "source": [ + "### Authentication with Radiant MLHub" + ] + }, + { + "cell_type": "markdown", + "id": "5f11e821-b98b-4df1-a26c-826c9bdbec50", + "metadata": {}, + "source": [ + "Programmatic access to the Radiant MLHub API using the `pystac_client` library requires both the API end-point and an API key. 
You can obtain an API key for free by registering an account on [mlhub.earth](https://mlhub.earth/). This can be found under `Settings & API Key` from the drop-down once logged in." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f4c9dd60-3abc-464d-af25-4b23c0d2783b", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "MLHub API Key: ································································\n" + ] + } + ], + "source": [ + "MLHUB_API_KEY = getpass.getpass(prompt=\"MLHub API Key: \")" + ] + }, + { + "cell_type": "markdown", + "id": "ccad2439-109f-4eef-ac3b-dac65f54e3aa", + "metadata": {}, + "source": [ + "Once you have your API key, you need to update the default profile file in your home directory. You can use the `mlhub configure` command line tool to do this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfd2681d-56ed-440c-a352-b4ecfd125c03", + "metadata": {}, + "outputs": [], + "source": [ + "!mlhub configure --api-key={MLHUB_API_KEY}" + ] + }, + { + "cell_type": "markdown", + "id": "5c77d029-191e-42a5-8250-bc451b80f247", + "metadata": {}, + "source": [ + "### Configure API connection to Radiant MLHub" + ] + }, + { + "cell_type": "markdown", + "id": "b0480635-eef5-4e3f-847a-5060c409ae4f", + "metadata": {}, + "source": [ + "This makes a connection to the Radiant MLHub Data Catalog using the API endpoint URL, and the API key from your account." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "bf79c301-76df-4158-bf97-3da53552e143", + "metadata": {}, + "outputs": [], + "source": [ + "mlhub_catalog = pystac_client.Client.open(\n", + " url=MLHUB_API_URL, parameters={\"key\": MLHUB_API_KEY}, ignore_conformance=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "637734bc-af77-4c81-ab26-98e808de6415", + "metadata": {}, + "source": [ + "### Fetch label items from BigEarthNet over Luxembourg" + ] + }, + { + "cell_type": "markdown", + "id": "e9f19ce4-57fd-43d0-9263-7a5edd106ee8", + "metadata": {}, + "source": [ + "We will now use the `search` function from the API client to get label Items over Luxembourg as a sample use-case." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fc61a3d4-cfc0-4da5-9717-bf3c8e427100", + "metadata": {}, + "outputs": [], + "source": [ + "origin_label_items = mlhub_catalog.search(\n", + " collections=BIGEARTHNET_LABEL_COLLECTION,\n", + " bbox=LUXEMBOURG_AOI,\n", + " datetime=BIGEARTHNET_TIME_RANGE,\n", + ").get_all_items()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "917c392c-9ae5-43e2-b7e0-71dd9f749cc2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "178" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(origin_label_items)" + ] + }, + { + "cell_type": "markdown", + "id": "e9d121a9-e54b-4039-9cef-04277962c2ca", + "metadata": {}, + "source": [ + "This is another helper function that simply displays the geometry for labels from an ItemCollection overlayed on a map of the region." 
def explore_search_extent(items: ItemCollection) -> "folium.Map":
    """Shows the footprints of the Items in an ItemCollection on a map.

    The bounds of every footprint are printed as a side effect before the
    map is returned.

    Args:
        items: ItemCollection of Items retrieved from pystac_client search

    Returns:
        The interactive map produced by GeoDataFrame.explore(), with the
        polygons coloured by their "datetime" property
    """
    # ItemCollection.to_dict() is fed straight to GeoDataFrame.from_features,
    # so it is presumably a GeoJSON FeatureCollection - confirm against pystac.
    item_feature_collection = items.to_dict()
    # The geometries are tagged as EPSG:4326 (lon/lat); nothing is reprojected.
    geom_df = GeoDataFrame.from_features(item_feature_collection).set_crs(4326)
    print(geom_df.bounds)
    return geom_df[["geometry", "datetime"]].explore(
        column="datetime", style_kwds={"fillOpacity": 0.2}, cmap="viridis"
    )
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "explore_search_extent(origin_label_items)" + ] + }, + { + "cell_type": "markdown", + "id": "13c6e607-c79b-4678-a483-9bacb0b3b1df", + "metadata": {}, + "source": [ + "### Download the entire label collection for BigEarthNet from Radiant MLHub" + ] + }, + { + "cell_type": "markdown", + "id": "2e131160-50bb-487f-8fcb-96d37ce80167", + "metadata": {}, + "source": [ + "We could certainly use the method above to query label Items directly from our connection to the Radiant MLHub API endpoint. However, on very large collections, such as in the case with BigEarthNet, pagination becomes a bottleneck issue in obtaining and resolving STAC items, as it only returns 100 items at a time. Querying the entire Collection of nearly 600,000 Items could take hours.\n", + "\n", + "Therefore, downloading the label Collection (which is only 160 MB) directly is preferable to paginating over the entire Collection using the API." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "93f67824-bf8f-4cce-862e-856d9cfed26d", + "metadata": {}, + "outputs": [], + "source": [ + "label_collection_path = os.path.join(\n", + " TMP_DIR, BIGEARTHNET_LABEL_COLLECTION, \"collection.json\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "21bd7e52-6bc7-4a8d-bf56-060e80f4019b", + "metadata": {}, + "source": [ + "Check if collection folder already exists before downloading 173 MB dataset. Otherwise download and uncompress the `.tar.gz` file to extract the label collection files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "81f5a8c4-c604-4f8a-8fb9-1d90607206ee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive file already downloaded from Radiant MLHub, skipping...\n" + ] + } + ], + "source": [ + "if not os.path.exists(label_collection_path):\n", + " collection = Collection.fetch(BIGEARTHNET_LABEL_COLLECTION)\n", + " archive_path = collection.download(TMP_DIR)\n", + " !tar -xf {archive_path.as_posix()} -C {TMP_DIR}\n", + "else:\n", + " print(\"Archive file already downloaded from Radiant MLHub, skipping...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "21e2683b-edde-43e0-8bf2-b791a8e04dc8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['bigearthnet_v1_labels_S2B_MSIL2A_20170914T93030_63_71',\n", + " 'bigearthnet_v1_labels_S2B_MSIL2A_20180506T105029_52_1',\n", + " 'bigearthnet_v1_labels_S2B_MSIL2A_20180509T092029_4_62',\n", + " 'bigearthnet_v1_labels_S2B_MSIL2A_20180525T94030_57_6',\n", + " 'bigearthnet_v1_labels_S2A_MSIL2A_20170717T113321_65_3']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bigearthnet_dir = os.listdir(os.path.join(TMP_DIR, BIGEARTHNET_LABEL_COLLECTION))\n", + "bigearthnet_dir[0:5]" + ] + }, + { + "cell_type": "markdown", + "id": "e1c28041-79f1-4ebd-a09a-14d440ac2757", + "metadata": {}, + "source": [ + "This is the total count of label Item (chip) directories, plus one for the STAC Collection itself." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1f270583-b7a5-4e37-8a50-2f9f4e49fdaf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "590327" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(bigearthnet_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "954d358d-54ff-4300-89e5-43f16e452574", + "metadata": {}, + "source": [ + "### Obtain a random sample of label Items from BigEarthNet" + ] + }, + { + "cell_type": "markdown", + "id": "670ad5d5-458f-4f56-88d9-574e5c37ba26", + "metadata": {}, + "source": [ + "We don't want to work with the entire dataset of nearly 600,000 labels. This would take too long to download, and we likely won't have enough disk space or space in memory, so let's work with a random sample of the dataset that is 1% of the original size." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ff4ad2d8-7cbd-4bc4-a1d2-f6bcd0c9fcf8", + "metadata": {}, + "outputs": [], + "source": [ + "assert os.path.exists(label_collection_path)\n", + "with open(label_collection_path, \"r\") as in_file:\n", + " collection_data = json.load(in_file)" + ] + }, + { + "cell_type": "markdown", + "id": "8c8b0c28-9dee-4f83-b3fe-a49079a530dd", + "metadata": {}, + "source": [ + "This confirms we have all of the label Items STAC objects and image data from the collection" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9ffac3d1-4cf1-4bf8-b066-6de4fa1f4da1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "590326" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "label_item_links = [\n", + " link[\"href\"] for link in collection_data[\"links\"] if link[\"rel\"] == \"item\"\n", + "]\n", + "len(label_item_links)" + ] + }, + { + "cell_type": "markdown", + "id": "b9836f40-4228-4d72-8b84-d94cae1030c6", + "metadata": {}, + "source": [ + "Now we take a 
random sample that is 1/100th the original dataset size" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "97b8f303-08c8-4353-847e-fc4d712edac6", + "metadata": {}, + "outputs": [], + "source": [ + "label_item_sample = np.random.choice(\n", + " a=label_item_links, size=int(len(label_item_links) / 100), replace=False\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a05b5914-ddf8-4332-87f9-4e631e71110a", + "metadata": {}, + "outputs": [], + "source": [ + "first_label_item = Item.from_file(\n", + " os.path.join(\n", + " TMP_DIR,\n", + " BIGEARTHNET_LABEL_COLLECTION,\n", + " label_item_sample[np.random.randint(len(label_item_sample))],\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "65959f74-4d1d-4ce4-8583-bc51d6047ce1", + "metadata": {}, + "source": [ + "Chip ID for the sample label Item pulled:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4420d40b-2cd3-4f74-b271-a137ca71e360", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'bigearthnet_v1_labels_S2A_MSIL2A_20180413T95032_86_10'" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "first_label_item.id" + ] + }, + { + "cell_type": "markdown", + "id": "c0b0734b-ce42-48de-8bea-f4c4f5e9b37f", + "metadata": {}, + "source": [ + "Links for the sample label Item, take special note of the `rel=source` Link listed:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "42d94501-6305-4329-aa4f-ef69802951b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "first_label_item.links" + ] + }, + { + "cell_type": "markdown", + "id": "5d23fb83-400e-487b-940f-fc158b5ae41b", + "metadata": { + "tags": [] + }, + "source": [ + "### Fetch source items for random sample from BigEarthNet" 
def get_source_item_ids(label_item: Item) -> List[str]:
    """Collects the ids of the source Items a label Item links to.

    Args:
        label_item: the STAC label Item whose links are inspected

    Returns:
        One entry per link with rel == "source", in link order: the
        second-to-last "/"-separated segment of the link href
    """
    source_ids = []
    for link in label_item.links:
        if link.rel != "source":
            continue
        # presumably hrefs look like ".../<item id>/stac.json", which makes the
        # parent directory name the item id - verify against the collection
        href_parts = link.href.split("/")
        source_ids.append(href_parts[-2])
    return source_ids
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "2c98a5d6-1a55-466a-b768-4433cea148ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(origin_source_items)" + ] + }, + { + "cell_type": "markdown", + "id": "c0ba4eaf-03d8-4e56-92c8-4e98b26ba983", + "metadata": {}, + "source": [ + "Taking a look at some of the properties of the first source Item found:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "854abe53-e1b7-452a-8aa2-c139aa220348", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bigearthnet_v1_source_S2A_MSIL2A_20180413T95032_86_10\n", + "2018-04-13 09:50:32+00:00\n", + "[25.247600449332744, 60.30639355471977, 25.269882490837322, 60.317447602103464]\n", + "{'gsd': 30, 'datetime': '2018-04-13T09:50:32Z', 'eo:bands': [{'name': 'B01', 'common_name': 'Coastal Aerosol', 'description': 'Coastal Aerosol'}, {'name': 'B02', 'common_name': 'Blue', 'description': 'Blue'}, {'name': 'B03', 'common_name': 'Green', 'description': 'Green'}, {'name': 'B04', 'common_name': 'Red', 'description': 'Red'}, {'name': 'B05', 'common_name': 'Vegetation Red Edge', 'description': 'Vegetation Red Edge (704.1nm)'}, {'name': 'B06', 'common_name': 'Vegetation Red Edge', 'description': 'Vegetation Red Edge (740.1nm)'}, {'name': 'B07', 'common_name': 'Vegetation Red Edge', 'description': 'Vegetation Red Edge (782.8nm)'}, {'name': 'B08', 'common_name': 'NIR', 'description': 'NIR'}, {'name': 'B8A', 'common_name': 'Narrow NIR', 'description': 'Narrow NIR'}, {'name': 'B09', 'common_name': 'Water Vapour', 'description': 'Water Vapour'}, {'name': 'B11', 'common_name': 'SWIR', 'description': 'SWIR (1613.7nm)'}, {'name': 'B12', 'common_name': 'SWIR', 'description': 'SWIR (2202.4nm)'}], 'platform': 'Sentinel-2', 'instruments': ['MSI'], 'constellation': 
'Sentinel-2'}\n" + ] + } + ], + "source": [ + "for source_item in origin_source_items:\n", + " print(source_item.id)\n", + " print(source_item.datetime)\n", + " print(source_item.bbox)\n", + " print(source_item.properties)\n", + " break" + ] + }, + { + "cell_type": "markdown", + "id": "5303475f-d29b-4e84-82ef-d355ef0519de", + "metadata": {}, + "source": [ + "With the properties from this sample source Item, we can observe where the chip is located, the relevant Sentinel-2 bands (assets) and datetime the image was captured." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "01d88ffe-f530-46d6-87d9-4337c1ec0202", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " minx miny maxx maxy\n", + "0 25.2476 60.306394 25.269882 60.317448\n" + ] + }, + { + "data": { + "text/html": [ + "
def temporal_buffer(item_datetime: str, date_delta: int) -> str:
    """Takes a datetime string and returns a buffer around that date

    Args:
        item_datetime: string of the datetime property from an Item, e.g.
            "2018-04-13T09:50:32Z"; fractional seconds such as
            "2018-05-10T09:22:33.464049Z" are accepted as well
        date_delta: integer for days to add before and after a date

    Returns:
        a string range "YYYY-MM-DD/YYYY-MM-DD" representing the full date buffer

    Raises:
        ValueError: if item_datetime matches neither accepted format
    """
    item_dt = None
    # Try the whole-second form first, then the fractional-second form.
    for fmt in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ"):
        try:
            item_dt = dt.strptime(item_datetime, fmt)
            break
        except ValueError:
            continue
    if item_dt is None:
        raise ValueError(
            f"time data {item_datetime!r} does not match "
            "'%Y-%m-%dT%H:%M:%SZ' or '%Y-%m-%dT%H:%M:%S.%fZ'"
        )

    delta = td(days=date_delta)
    dt_start_str = (item_dt - delta).strftime("%Y-%m-%d")
    dt_end_str = (item_dt + delta).strftime("%Y-%m-%d")

    return f"{dt_start_str}/{dt_end_str}"


def min_cloud_cover_scene(label_geom: Polygon, search_items: ItemCollection) -> Item:
    """Finds the Item with minimal cloud cover from an ItemCollection

    Args:
        label_geom: Polygon geometry to ensure label completely within scene
        search_items: ItemCollection of the Items found from pystac_client search

    Returns:
        Item where label completely contained within, and minimal cloud cover;
        None when no Item qualifies. Items without an "eo:cloud_cover"
        property are skipped.
    """
    min_cc = np.inf
    min_cc_item = None
    for item in search_items:
        item_cc = item.properties.get("eo:cloud_cover")
        # Skip items that cannot beat the current best before building their
        # geometry; the containment test is the costlier of the two checks.
        if item_cc is None or not item_cc < min_cc:
            continue
        if label_geom.within(shape(item.geometry)):
            min_cc = item_cc
            min_cc_item = item
    return min_cc_item


def get_landsat_8_match(label_item: Item) -> Tuple[Item, Item]:
    """Finds the best Landsat 8 match using source Item datetime and bounding box.

    Uses the module-level `mlhub_catalog` and `mspc_catalog` clients, so both
    must be defined before this function is called.

    Args:
        label_item: the STAC label Item object

    Returns:
        Tuple of the BigEarthNet source Item and the Landsat 8 match Item.
        Returns (None, None) when no source Item is found, and
        (source Item, None) when no Landsat 8 scene qualifies.
    """
    # get the matching source Item properties
    source_ids = get_source_item_ids(label_item)
    source_items = []
    # NOTE(review): an empty id list is not sent to the API, on the assumption
    # that it would not filter the search - confirm against pystac_client.
    if source_ids:
        source_items = mlhub_catalog.search(
            collections=[BIGEARTHNET_SOURCE_COLLECTION],
            ids=source_ids,
        ).get_all_items()

    if not source_items:
        print(
            "No Sentinel-2 source Item was found in the "
            "BigEarthNet dataset matching that label item!"
        )
        # Return early: the original fell through to the final return with
        # best_l8_match never assigned and raised UnboundLocalError.
        return None, None

    source_item = source_items[0]
    source_bbox = source_item.bbox
    source_datetime = source_item.properties["datetime"]

    # search PC Catalog for L8 Items
    l8_items = mspc_catalog.search(
        collections=PLANETARY_COMPUTER_LANDSAT_8,
        bbox=source_bbox,
        datetime=temporal_buffer(source_datetime, DATE_BUFFER),
    ).get_all_items()

    # filter to best L8 Item match
    signed_l8_items = planetary_computer.sign(l8_items)
    best_l8_match = min_cloud_cover_scene(
        shape(source_item.geometry), signed_l8_items
    )

    if not best_l8_match:
        print(
            "No Landsat 8 Item was found on the Planetary "
            "Computer matching the query parameters:"
        )
        print(
            f"Source Item ID: {source_item.id} "
            f"Bbox: {source_bbox}, "
            f"Datetime: {source_datetime}"
        )
        best_l8_match = None
    return source_item, best_l8_match
+ ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "53d3ab14-86c7-42a8-be96-8156f5e74d64", + "metadata": {}, + "outputs": [], + "source": [ + "source_item, best_l8_match = get_landsat_8_match(first_label_item)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "53556cf0-375b-4044-9262-d16de707a13d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LC08_L2SP_187018_20180510_02_T1\n", + "[24.323999281168234, 58.95366546927769, 28.882648861314085, 61.187374530722316]\n", + "{'datetime': '2018-05-10T09:22:33.464049Z', 'platform': 'landsat-8', 'proj:bbox': [356085.0, 6537585.0, 601215.0, 6785115.0], 'proj:epsg': 32635, 'description': 'Landsat Collection 2 Level-2 Surface Reflectance Product', 'instruments': ['oli', 'tirs'], 'eo:cloud_cover': 0.01, 'view:off_nadir': 0, 'landsat:wrs_row': '018', 'landsat:scene_id': 'LC81870182018130LGN00', 'landsat:wrs_path': '187', 'landsat:wrs_type': '2', 'view:sun_azimuth': 163.43293558, 'view:sun_elevation': 46.70358845, 'landsat:cloud_cover_land': 0.01, 'landsat:processing_level': 'L2SP', 'landsat:collection_number': '02', 'landsat:collection_category': 'T1'}\n" + ] + } + ], + "source": [ + "if best_l8_match:\n", + " print(best_l8_match.id)\n", + " print(best_l8_match.bbox)\n", + " print(best_l8_match.properties)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "3f3b5d88-7f3c-4e74-8e68-2408ba762b0f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " minx miny maxx maxy\n", + "0 24.32561 58.956597 28.877931 61.185413\n" + ] + }, + { + "data": { + "text/html": [ + "
def get_redirect_url(asset: Asset, timeout: float = 30.0) -> str:
    """Returns the direct URL to an asset.

    Args:
        asset: Asset object from an Item
        timeout: seconds to wait for the server before the request is abandoned

    Returns:
        string response URL direct to Asset after following redirects, or
        None when the final response status is not 200
    """
    # stream=True returns once the headers arrive, so the asset body is not
    # downloaded just to read the final URL; the with-block then releases the
    # connection.
    with requests.get(
        asset.href, allow_redirects=True, stream=True, timeout=timeout
    ) as response:
        if response.status_code == 200:
            return response.url
    return None
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'stackstac-fdb94397c7d0c489a40e540385210bc8' (time: 1, band: 3, y: 128, x: 128)>\n",
+       "dask.array<fetch_raster_window, shape=(1, 3, 128, 128), dtype=float64, chunksize=(1, 1, 128, 128), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "  * time                 (time) datetime64[ns] 2018-04-13T09:50:32\n",
+       "    id                   (time) <U53 'bigearthnet_v1_source_S2A_MSIL2A_201804...\n",
+       "  * band                 (band) <U3 'B04' 'B03' 'B02'\n",
+       "  * x                    (x) float64 4.032e+05 4.032e+05 ... 4.044e+05 4.044e+05\n",
+       "  * y                    (y) float64 6.688e+06 6.688e+06 ... 6.687e+06 6.687e+06\n",
+       "    platform             <U10 'Sentinel-2'\n",
+       "    instruments          <U3 'MSI'\n",
+       "    constellation        <U10 'Sentinel-2'\n",
+       "    gsd                  int64 30\n",
+       "    title                (band) <U35 'S2A_MSIL2A_20180413T95032_86_10_B04' .....\n",
+       "    common_name          (band) <U5 'Red' 'Green' 'Blue'\n",
+       "    center_wavelength    object None\n",
+       "    full_width_half_max  object None\n",
+       "    epsg                 int64 32635\n",
+       "Attributes:\n",
+       "    spec:        RasterSpec(epsg=32635, bounds=(403160, 6686780, 404440, 6688...\n",
+       "    crs:         epsg:32635\n",
+       "    transform:   | 10.00, 0.00, 403160.00|\\n| 0.00,-10.00, 6688060.00|\\n| 0.0...\n",
+       "    resolution:  10
" + ], + "text/plain": [ + "\n", + "dask.array\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2018-04-13T09:50:32\n", + " id (time) " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "s2_stack[0].plot(col=\"band\")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "cc2326b7-d1cc-4474-82b6-95f7da05f897", + "metadata": {}, + "outputs": [], + "source": [ + "l8_original = stack(\n", + " items=ItemCollection([best_l8_match]), assets=LANDSAT_8_RGB_BANDS, resolution=10\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "aa276eef-73ea-4834-a6a1-7cbd205f3d46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'stackstac-12352f6ccd95fc24747d98826b4a4e63' (time: 1, band: 3, y: 24754, x: 24514)>\n",
+       "dask.array<fetch_raster_window, shape=(1, 3, 24754, 24514), dtype=float64, chunksize=(1, 1, 1024, 1024), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "  * time                         (time) datetime64[ns] 2018-05-10T09:22:33.46...\n",
+       "    id                           (time) <U31 'LC08_L2SP_187018_20180510_02_T1'\n",
+       "  * band                         (band) <U5 'SR_B4' 'SR_B3' 'SR_B2'\n",
+       "  * x                            (x) float64 3.561e+05 3.561e+05 ... 6.012e+05\n",
+       "  * y                            (y) float64 6.785e+06 6.785e+06 ... 6.538e+06\n",
+       "    view:sun_elevation           float64 46.7\n",
+       "    landsat:processing_level     <U4 'L2SP'\n",
+       "    instruments                  object {'oli', 'tirs'}\n",
+       "    landsat:cloud_cover_land     float64 0.01\n",
+       "    landsat:wrs_path             <U3 '187'\n",
+       "    landsat:collection_number    <U2 '02'\n",
+       "    eo:cloud_cover               float64 0.01\n",
+       "    description                  (band) <U56 'Collection 2 Level-2 Red Band (...\n",
+       "    proj:epsg                    int64 32635\n",
+       "    landsat:wrs_row              <U3 '018'\n",
+       "    view:off_nadir               int64 0\n",
+       "    proj:bbox                    object {6537585.0, 6785115.0, 356085.0, 6012...\n",
+       "    view:sun_azimuth             float64 163.4\n",
+       "    platform                     <U9 'landsat-8'\n",
+       "    landsat:scene_id             <U21 'LC81870182018130LGN00'\n",
+       "    landsat:collection_category  <U2 'T1'\n",
+       "    landsat:wrs_type             <U1 '2'\n",
+       "    gsd                          float64 30.0\n",
+       "    proj:transform               object {0.0, -30.0, 356085.0, 6785115.0, 30.0}\n",
+       "    proj:shape                   object {8251, 8171}\n",
+       "    title                        (band) <U15 'Red Band (B4)' ... 'Blue Band (...\n",
+       "    common_name                  (band) <U5 'red' 'green' 'blue'\n",
+       "    center_wavelength            (band) float64 0.65 0.56 0.48\n",
+       "    full_width_half_max          (band) float64 0.04 0.06 0.06\n",
+       "    epsg                         int64 32635\n",
+       "Attributes:\n",
+       "    spec:        RasterSpec(epsg=32635, bounds=(356080, 6537580, 601220, 6785...\n",
+       "    crs:         epsg:32635\n",
+       "    transform:   | 10.00, 0.00, 356080.00|\\n| 0.00,-10.00, 6785120.00|\\n| 0.0...\n",
+       "    resolution:  10
" + ], + "text/plain": [ + "\n", + "dask.array\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2018-05-10T09:22:33.46...\n", + " id (time) \n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'stackstac-85199aa0104380729f5195e42dc990d3' (time: 1, band: 3, y: 128, x: 128)>\n",
+       "dask.array<fetch_raster_window, shape=(1, 3, 128, 128), dtype=float64, chunksize=(1, 1, 128, 128), chunktype=numpy.ndarray>\n",
+       "Coordinates:\n",
+       "  * time                         (time) datetime64[ns] 2018-05-10T09:22:33.46...\n",
+       "    id                           (time) <U31 'LC08_L2SP_187018_20180510_02_T1'\n",
+       "  * band                         (band) <U5 'SR_B4' 'SR_B3' 'SR_B2'\n",
+       "  * x                            (x) float64 4.032e+05 4.032e+05 ... 4.044e+05\n",
+       "  * y                            (y) float64 6.688e+06 6.688e+06 ... 6.687e+06\n",
+       "    view:sun_elevation           float64 46.7\n",
+       "    landsat:processing_level     <U4 'L2SP'\n",
+       "    instruments                  object {'oli', 'tirs'}\n",
+       "    landsat:cloud_cover_land     float64 0.01\n",
+       "    landsat:wrs_path             <U3 '187'\n",
+       "    landsat:collection_number    <U2 '02'\n",
+       "    eo:cloud_cover               float64 0.01\n",
+       "    description                  (band) <U56 'Collection 2 Level-2 Red Band (...\n",
+       "    proj:epsg                    int64 32635\n",
+       "    landsat:wrs_row              <U3 '018'\n",
+       "    view:off_nadir               int64 0\n",
+       "    proj:bbox                    object {6537585.0, 6785115.0, 356085.0, 6012...\n",
+       "    view:sun_azimuth             float64 163.4\n",
+       "    platform                     <U9 'landsat-8'\n",
+       "    landsat:scene_id             <U21 'LC81870182018130LGN00'\n",
+       "    landsat:collection_category  <U2 'T1'\n",
+       "    landsat:wrs_type             <U1 '2'\n",
+       "    gsd                          float64 30.0\n",
+       "    proj:transform               object {0.0, -30.0, 356085.0, 6785115.0, 30.0}\n",
+       "    proj:shape                   object {8251, 8171}\n",
+       "    title                        (band) <U15 'Red Band (B4)' ... 'Blue Band (...\n",
+       "    common_name                  (band) <U5 'red' 'green' 'blue'\n",
+       "    center_wavelength            (band) float64 0.65 0.56 0.48\n",
+       "    full_width_half_max          (band) float64 0.04 0.06 0.06\n",
+       "    epsg                         int64 32635\n",
+       "Attributes:\n",
+       "    spec:        RasterSpec(epsg=32635, bounds=(403160, 6686780, 404440, 6688...\n",
+       "    crs:         epsg:32635\n",
+       "    transform:   | 10.00, 0.00, 403160.00|\\n| 0.00,-10.00, 6688060.00|\\n| 0.0...\n",
+       "    resolution:  10
" + ], + "text/plain": [ + "\n", + "dask.array\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2018-05-10T09:22:33.46...\n", + " id (time) " + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "l8_cropped[0].plot(col=\"band\")" + ] + }, + { + "cell_type": "markdown", + "id": "bc46c85b-daa9-44df-9be7-62dfa1234b25", + "metadata": {}, + "source": [ + "Now we have a cropped Landsat 8 chip that spatially and temporally matches our Sentinel-2 source imagery and label sample from the BigEarthNet dataset." + ] + }, + { + "cell_type": "markdown", + "id": "3a70b063-d273-4aee-864b-07e318388890", + "metadata": {}, + "source": [ + "### Launch a Dask gateway cluster for parallel processing" + ] + }, + { + "cell_type": "markdown", + "id": "cb650d80-bf1a-4aad-8f8b-08a612e28aae", + "metadata": {}, + "source": [ + "We will use Dask to optimize our data processing of hundreds of Landsat-8 scenes by parallelizing the workflow with a delayed computation graph. The Dask Client schedules, runs the delayed computations, and gathers the results, while the Dask Gateway provides a secure and centralized way of managing the multiple client clusters. This is especially useful for running Dask on Planetary Computer." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "29531759-6d19-4010-8401-eb947a32c515", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + "
\n", + "

Client

\n", + "

Client-da516f30-d223-11ec-8ad1-52879e68a5a2

\n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "
Connection method: Cluster objectCluster type: distributed.LocalCluster
\n", + " Dashboard: http://127.0.0.1:8787/status\n", + "
\n", + "\n", + " \n", + "
\n", + "

Cluster Info

\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

LocalCluster

\n", + "

ac658c6d

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + " \n", + "
\n", + " Dashboard: http://127.0.0.1:8787/status\n", + " \n", + " Workers: 4\n", + "
\n", + " Total threads: 8\n", + " \n", + " Total memory: 16.00 GiB\n", + "
Status: runningUsing processes: True
\n", + "\n", + "
\n", + " \n", + "

Scheduler Info

\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + "

Scheduler

\n", + "

Scheduler-9dd4c443-c650-46ca-a024-f39fdc6ca132

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " Comm: tcp://127.0.0.1:50711\n", + " \n", + " Workers: 4\n", + "
\n", + " Dashboard: http://127.0.0.1:8787/status\n", + " \n", + " Total threads: 8\n", + "
\n", + " Started: Just now\n", + " \n", + " Total memory: 16.00 GiB\n", + "
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "

Workers

\n", + "
\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 0

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:50731\n", + " \n", + " Total threads: 2\n", + "
\n", + " Dashboard: http://127.0.0.1:50732/status\n", + " \n", + " Memory: 4.00 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:50717\n", + "
\n", + " Local directory: /Users/kendallsmith/radiant/repos/PlanetaryComputerExamples/tutorials/dask-worker-space/worker-xtk5cjq6\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 1

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:50723\n", + " \n", + " Total threads: 2\n", + "
\n", + " Dashboard: http://127.0.0.1:50725/status\n", + " \n", + " Memory: 4.00 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:50715\n", + "
\n", + " Local directory: /Users/kendallsmith/radiant/repos/PlanetaryComputerExamples/tutorials/dask-worker-space/worker-n187cgkt\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 2

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:50722\n", + " \n", + " Total threads: 2\n", + "
\n", + " Dashboard: http://127.0.0.1:50724/status\n", + " \n", + " Memory: 4.00 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:50714\n", + "
\n", + " Local directory: /Users/kendallsmith/radiant/repos/PlanetaryComputerExamples/tutorials/dask-worker-space/worker-_01c5u6w\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "

Worker: 3

\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + "\n", + "
\n", + " Comm: tcp://127.0.0.1:50728\n", + " \n", + " Total threads: 2\n", + "
\n", + " Dashboard: http://127.0.0.1:50729/status\n", + " \n", + " Memory: 4.00 GiB\n", + "
\n", + " Nanny: tcp://127.0.0.1:50716\n", + "
\n", + " Local directory: /Users/kendallsmith/radiant/repos/PlanetaryComputerExamples/tutorials/dask-worker-space/worker-387_p8o1\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "\n", + "
\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "
\n", + " \n", + "\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client = dask.distributed.Client() # you can configure Dask client parameters here\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "55d9dc11-d3b8-4edc-a5fd-acb2faba1c18", + "metadata": {}, + "outputs": [], + "source": [ + "# client.close()" + ] + }, + { + "cell_type": "markdown", + "id": "f3b09697-b6ab-4026-bfcb-2f5214b03f5c", + "metadata": {}, + "source": [ + "### Scale the workflow using Dask Delayed" + ] + }, + { + "cell_type": "markdown", + "id": "5368c39f-94a1-41ba-acc0-fd18c8dc1c18", + "metadata": {}, + "source": [ + "These are two helper functions that we will use to encapsulate the process of creating the cropped Landsat 8 chips and write them to disk in parallel using the Dask Delayed decorator." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "c924acb1-092e-4f86-b73f-5b56ccdebe27", + "metadata": {}, + "outputs": [], + "source": [ + "def create_landsat_8_dataarray(item_path: str) -> DataArray:\n", + " \"\"\"Creates a Landsat 8 chip from BigEarthNet label chip.\n", + "\n", + " Args:\n", + " item_path: string path to the label item on disk\n", + "\n", + " Returns:\n", + " Landsat 8 DataArray that has been cropped to label bbox\n", + " \"\"\"\n", + " # read label Item object\n", + " label_item = Item.from_file(\n", + " os.path.join(TMP_DIR, BIGEARTHNET_LABEL_COLLECTION, item_path)\n", + " )\n", + "\n", + " # fetch the Landsat 8 scene that best matches the label\n", + " s2_source, l8_match = get_landsat_8_match(label_item)\n", + "\n", + " if l8_match:\n", + " # crop L8 match to S2 dims and read image data\n", + " l8_stack = stack(\n", + " items=ItemCollection([l8_match]),\n", + " assets=LANDSAT_8_RGB_BANDS,\n", + " bounds_latlon=s2_source.bbox,\n", + " resolution=10,\n", + " )\n", + "\n", + " return l8_stack\n", + " return None" + ] + }, + { + "cell_type": "code", + 
"execution_count": 47, + "id": "02b974fe-0c6f-40a3-ad63-2f4753e0236b", + "metadata": {}, + "outputs": [], + "source": [ + "def write_tifs_bands(l8_array: DataArray, l8_item_id: str) -> None:\n", + " \"\"\"Writes to a GeoTiff for each band in Landsat 8 DataArray\n", + "\n", + " Args:\n", + " l8_array: the DataArray object created from the BigEarthNet label item\n", + " \"\"\"\n", + " # write cropped L8 DataArray to a tiff file for each band\n", + " for _band in LANDSAT_8_RGB_BANDS:\n", + " l8_band_img = l8_array.sel(band=_band)\n", + " l8_band_filename = os.path.join(\n", + " TMP_DIR, OUTPUT_DIR, l8_item_id, f\"{l8_item_id}_{_band}.tiff\"\n", + " )\n", + " Path(os.path.split(l8_band_filename)[0]).mkdir(parents=True, exist_ok=True)\n", + " l8_band_img[0].rio.to_raster(l8_band_filename)" + ] + }, + { + "cell_type": "markdown", + "id": "d5a97950-5b08-4e79-ab15-3736697d0584", + "metadata": {}, + "source": [ + "This sets the stage for the Dask Task Scheduler by mapping all label Items to the `create_landsat_8_dataarray` function. Nothing in the task graph will actually be executed until the `.compute()` command is run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "6bba4435-3029-4e0f-b1c8-6e37f225184f", + "metadata": {}, + "outputs": [], + "source": [ + "item_bag = dask.bag.from_sequence(label_item_sample).map(create_landsat_8_dataarray)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "4420a2eb-8e3f-479b-a055-b5394b25f837", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dask.bag" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_bag" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "604aeb5d-f0d0-4045-963b-0c603e336c24", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%time\n", + "computed_result = item_bag.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "6f95d76c-dc3e-4591-a596-a95e27a3dbde", + "metadata": {}, + "source": [ + "Lastly, we want to write a GeoTIFF to disk for each band of each Landsat 8 DataArray we created." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1436d4d-10ac-4b39-88fb-5150fe9df12b", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "for l8_array in computed_result:\n", + " if isinstance(l8_array, DataArray):\n", + " write_tifs_bands(l8_array, l8_array.id.values[0])" + ] + }, + { + "cell_type": "markdown", + "id": "2bde6ce7-9d7a-4114-88ba-56e4a4bea247", + "metadata": {}, + "source": [ + "This confirms that folders with images were written to disk. If there is a discrepancy between the sample size and the output, it's likely that there wasn't always a matching Landsat 8 scene given the geometry and datetime parameters for a particular Sentinel-2 source Item." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec64a009-9a87-4fb2-bf5b-fbc41a3f8021", + "metadata": {}, + "outputs": [], + "source": [ + "landsat_chip_dir = os.path.join(TMP_DIR, OUTPUT_DIR)\n", + "len(os.listdir(landsat_chip_dir))" + ] + }, + { + "cell_type": "markdown", + "id": "e0884796-bfab-4905-aa75-2f8dd43a5a13", + "metadata": {}, + "source": [ + "Open one of the new Landsat 8 chips to inspect what it looks like." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "beb3d5a8-2677-43dc-867a-153eb1e087f9", + "metadata": {}, + "outputs": [], + "source": [ + "landsat_images = glob(f\"{landsat_chip_dir}/**/*.tiff\", recursive=True)\n", + "first_l8_img = rioxarray.open_rasterio(landsat_images[0])\n", + "first_l8_img.plot()" + ] + }, + { + "cell_type": "markdown", + "id": "37581811-0f21-4838-9541-db84688032f6", + "metadata": {}, + "source": [ + "Shut down the Dask client to clean up cluster resources." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d643476-917b-484b-a041-8f3c94d12c06", + "metadata": {}, + "outputs": [], + "source": [ + "client.shutdown()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From fad5a9e7e7fcf8adf3acfc7832d84a0531915907 Mon Sep 17 00:00:00 2001 From: Kendall Smith Date: Mon, 16 May 2022 15:23:00 -0700 Subject: [PATCH 2/2] optimized dask operations, tested on PC --- ...adiant-mlhub-on-demand-training-data.ipynb | 3039 ++--------------- 1 file changed, 227 insertions(+), 2812 deletions(-) diff --git a/tutorials/radiant-mlhub-on-demand-training-data.ipynb 
b/tutorials/radiant-mlhub-on-demand-training-data.ipynb index 6a826c36..13ab8a02 100644 --- a/tutorials/radiant-mlhub-on-demand-training-data.ipynb +++ b/tutorials/radiant-mlhub-on-demand-training-data.ipynb @@ -31,16 +31,6 @@ { "cell_type": "code", "execution_count": null, - "id": "9fe7b447-cf8a-4cc7-aaed-4e8407d0f270", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install --upgrade wget # not installed on PC by default" - ] - }, - { - "cell_type": "code", - "execution_count": 53, "id": "7e144460-5549-4ab4-ba98-10a1a7ebd236", "metadata": {}, "outputs": [], @@ -52,20 +42,22 @@ "import json\n", "from glob import glob\n", "import requests\n", - "from typing import List, Tuple\n", + "from typing import List, Tuple, Dict, Any\n", "from datetime import datetime as dt\n", "from datetime import timedelta as td\n", "\n", - "from radiant_mlhub import Collection\n", "import planetary_computer\n", "import pystac_client\n", "from pystac import ItemCollection, Item, Asset\n", "import dask\n", "\n", "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", "from stackstac import stack\n", "from geopandas import GeoDataFrame\n", "import rasterio as rio\n", + "from rasterio.plot import show\n", "import rioxarray\n", "from xarray import DataArray\n", "from shapely.geometry import shape\n", @@ -86,12 +78,12 @@ "id": "2de2f657-3229-4037-bf9c-541c503cc269", "metadata": {}, "source": [ - "In addition to the API key, we will also need to define some other initial global variables to get our workflow started. e.g. a temporary working directory to download and write data to, the STAC API endpoints, names of Collections, and other variables like the RGB bands for those collections. These are pretty flexible depending on your individual needs." + "We will also need to define other initial global variables to get our workflow started, e.g. 
a temporary working directory to download and write data to, the STAC API endpoints, names of Collections, and other variables like the RGB bands for those collections. These are pretty flexible depending on your individual needs." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "f3bf07a1-3a4a-4207-a449-7be766fa7e36", "metadata": {}, "outputs": [], @@ -117,7 +109,8 @@ "BIGEARTHNET_RGB_BANDS = [\"B04\", \"B03\", \"B02\"] # names of RGB bands from PC Landsat 8\n", "\n", "# Bounding box for demonstration fetching Items over Luxembourg\n", - "LUXEMBOURG_AOI = [6.06, 49.58, 6.21, 49.66] # aoi around Luxembourg" + "LUXEMBOURG_AOI = [6.06, 49.58, 6.21, 49.66] # aoi around Luxembourg\n", + "SPAIN_AOI = [-9.73, 35.84, 3.43, 43.87]" ] }, { @@ -136,40 +129,14 @@ "Programmatic access to the Radiant MLHub API using the `pystac_client` library requires both the API end-point and an API key. You can obtain an API key for free by registering an account on [mlhub.earth](https://mlhub.earth/). This can be found under `Settings & API Key` from the drop-down once logged in." ] }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f4c9dd60-3abc-464d-af25-4b23c0d2783b", - "metadata": {}, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "MLHub API Key: ································································\n" - ] - } - ], - "source": [ - "MLHUB_API_KEY = getpass.getpass(prompt=\"MLHub API Key: \")" - ] - }, - { - "cell_type": "markdown", - "id": "ccad2439-109f-4eef-ac3b-dac65f54e3aa", - "metadata": {}, - "source": [ - "Once you have your API key, you need to update the default profile file in your home directory. 
You can use the `mlhub configure` command line tool to do this:" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "cfd2681d-56ed-440c-a352-b4ecfd125c03", + "id": "f4c9dd60-3abc-464d-af25-4b23c0d2783b", "metadata": {}, "outputs": [], "source": [ - "!mlhub configure --api-key={MLHUB_API_KEY}" + "MLHUB_API_KEY = getpass.getpass(prompt=\"MLHub API Key: \")" ] }, { @@ -190,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "bf79c301-76df-4158-bf97-3da53552e143", "metadata": {}, "outputs": [], @@ -213,13 +180,13 @@ "id": "e9f19ce4-57fd-43d0-9263-7a5edd106ee8", "metadata": {}, "source": [ - "We will now use the `search` function from the API client to get label Items over Luxembourg as a sample use-case." + "We will now use the `search` function from the API client to get label Items over Luxembourg as a simple use-case." ] }, { "cell_type": "code", - "execution_count": 7, - "id": "fc61a3d4-cfc0-4da5-9717-bf3c8e427100", + "execution_count": null, + "id": "cc6e40fb-9e96-4041-bb21-119539875caa", "metadata": {}, "outputs": [], "source": [ @@ -227,41 +194,21 @@ " collections=BIGEARTHNET_LABEL_COLLECTION,\n", " bbox=LUXEMBOURG_AOI,\n", " datetime=BIGEARTHNET_TIME_RANGE,\n", + " max_items=100\n", ").get_all_items()" ] }, - { - "cell_type": "code", - "execution_count": 8, - "id": "917c392c-9ae5-43e2-b7e0-71dd9f749cc2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "178" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(origin_label_items)" - ] - }, { "cell_type": "markdown", "id": "e9d121a9-e54b-4039-9cef-04277962c2ca", "metadata": {}, "source": [ - "This is another helper function that simply displays the geometry for labels from an ItemCollection overlayed on a map of the region." + "This is a helper function that simply displays the geometry for labels from an ItemCollection overlayed on a map of the region." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "2f203e31-5b09-4f2e-bf27-9a3ef8e3fc4d", "metadata": {}, "outputs": [], @@ -293,46 +240,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "b86bf5d8-8dc8-491d-b3cd-1ea1263774ca", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " minx miny maxx maxy\n", - "0 6.197958 49.579464 6.215240 49.590700\n", - "1 6.198663 49.590240 6.215949 49.601477\n", - "2 6.199368 49.601017 6.216659 49.612254\n", - "3 6.180682 49.569146 6.197958 49.580381\n", - "4 6.181383 49.579923 6.198663 49.591158\n", - ".. ... ... ... ...\n", - "173 6.151709 49.634721 6.169003 49.645951\n", - "174 6.152406 49.645498 6.169703 49.656729\n", - "175 6.153102 49.656275 6.170404 49.667506\n", - "176 6.135808 49.645951 6.153102 49.657180\n", - "177 6.136501 49.656729 6.153800 49.667957\n", - "\n", - "[178 rows x 4 columns]\n" - ] - }, - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "outputs": [], "source": [ "explore_search_extent(origin_label_items)" ] @@ -342,7 +255,7 @@ "id": "13c6e607-c79b-4678-a483-9bacb0b3b1df", "metadata": {}, "source": [ - "### Download the entire label collection for BigEarthNet from Radiant MLHub" + "### Download BigEarthNet Source Items from Radiant MLHub" ] }, { @@ -350,446 +263,124 @@ "id": "2e131160-50bb-487f-8fcb-96d37ce80167", "metadata": {}, "source": [ - "We could certainly use the method above to query label Items directly from our connection to the Radiant MLHub API endpoint. However, on very large collections, such as in the case with BigEarthNet, pagination becomes a bottleneck issue in obtaining and resolving STAC items, as it only returns 100 items at a time. Querying the entire Collection of nearly ~600,000 Items could take hours.\n", + "We could certainly use the method above to query all label and source Items directly from our connection to the Radiant MLHub API endpoint. However, on very large collections, such as in the case with BigEarthNet, pagination becomes a bottleneck issue in obtaining and resolving STAC Items. \n", + "\n", + "Querying the entire Collection of nearly 600,000 Items from a single collection alone would take almost an hour depending on your connection speed. This means it could possibly take a few hours to download all Items in the Catalog. \n", "\n", - "Therefore, downloading the label Collection (which is only 160 MB) directly is preferrable to paginating over the entire Collection using the API." + "One alternative is to download the `.tar.gz` of the collections directly from the Radiant MLHub dataset detail page. The filesize for the labels archive is not large, only 165 MB. 
However because there are over half a million objects, it takes a long time to decompress the entire download.\n", + "\n", + "Therefore, we can showcase this workflow by paginating over the source Item Collection to fetch the first 5,000 Items available (which only represents 1% of the entire collection)." ] }, { "cell_type": "code", - "execution_count": 11, - "id": "93f67824-bf8f-4cce-862e-856d9cfed26d", + "execution_count": null, + "id": "b257cc4f-d77c-422b-b5ca-e81f768e32d5", "metadata": {}, "outputs": [], "source": [ - "label_collection_path = os.path.join(\n", - " TMP_DIR, BIGEARTHNET_LABEL_COLLECTION, \"collection.json\"\n", + "bigearthnet_source_search = mlhub_catalog.search(\n", + " collections=BIGEARTHNET_SOURCE_COLLECTION,\n", + " bbox=SPAIN_AOI,\n", + " # limit=100, # limit of items per page\n", + " max_items=5000 # total Item recall\n", ")" ] }, { "cell_type": "markdown", - "id": "21bd7e52-6bc7-4a8d-bf56-060e80f4019b", - "metadata": {}, - "source": [ - "Check if collection folder already exists before downloading 173 mb dataset. Otherwise download and uncompress the `.tar.gz` file to extract the label collection files." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "81f5a8c4-c604-4f8a-8fb9-1d90607206ee", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Archive file already downloaded from Radiant MLHub, skipping...\n" - ] - } - ], - "source": [ - "if not os.path.exists(label_collection_path):\n", - " collection = Collection.fetch(BIGEARTHNET_LABEL_COLLECTION)\n", - " archive_path = collection.download(TMP_DIR)\n", - " !tar -xf {archive_path.as_posix()} -C {TMP_DIR}\n", - "else:\n", - " print(\"Archive file already downloaded from Radiant MLHub, skipping...\")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "21e2683b-edde-43e0-8bf2-b791a8e04dc8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['bigearthnet_v1_labels_S2B_MSIL2A_20170914T93030_63_71',\n", - " 'bigearthnet_v1_labels_S2B_MSIL2A_20180506T105029_52_1',\n", - " 'bigearthnet_v1_labels_S2B_MSIL2A_20180509T092029_4_62',\n", - " 'bigearthnet_v1_labels_S2B_MSIL2A_20180525T94030_57_6',\n", - " 'bigearthnet_v1_labels_S2A_MSIL2A_20170717T113321_65_3']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bigearthnet_dir = os.listdir(os.path.join(TMP_DIR, BIGEARTHNET_LABEL_COLLECTION))\n", - "bigearthnet_dir[0:5]" - ] - }, - { - "cell_type": "markdown", - "id": "e1c28041-79f1-4ebd-a09a-14d440ac2757", - "metadata": {}, - "source": [ - "This is the total count of label Item (chip) directories, plus one for the STAC Collection itself." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "1f270583-b7a5-4e37-8a50-2f9f4e49fdaf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "590327" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(bigearthnet_dir)" - ] - }, - { - "cell_type": "markdown", - "id": "954d358d-54ff-4300-89e5-43f16e452574", - "metadata": {}, - "source": [ - "### Obtain a random sample of label Items from BigEarthNet" - ] - }, - { - "cell_type": "markdown", - "id": "670ad5d5-458f-4f56-88d9-574e5c37ba26", - "metadata": {}, - "source": [ - "We don't want to work with the entire dataset of nearly 600,000 labels. This would take too long to download, and we likely won't have enough disk space or space in memory, so let's work with a random sample of the dataset that is 10% of the original size." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "ff4ad2d8-7cbd-4bc4-a1d2-f6bcd0c9fcf8", - "metadata": {}, - "outputs": [], - "source": [ - "assert os.path.exists(label_collection_path)\n", - "with open(label_collection_path, \"r\") as in_file:\n", - " collection_data = json.load(in_file)" - ] - }, - { - "cell_type": "markdown", - "id": "8c8b0c28-9dee-4f83-b3fe-a49079a530dd", + "id": "f729e7ea-689f-40ee-9048-4c37f3f2e859", "metadata": {}, "source": [ - "This confirms we have all of the label Items STAC objects and image data from the collection" + "It should take less than a minute to fetch all the STAC Items for the 5000 sample we've queried." 
] }, { "cell_type": "code", - "execution_count": 16, - "id": "9ffac3d1-4cf1-4bf8-b066-6de4fa1f4da1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "590326" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "label_item_links = [\n", - " link[\"href\"] for link in collection_data[\"links\"] if link[\"rel\"] == \"item\"\n", - "]\n", - "len(label_item_links)" - ] - }, - { - "cell_type": "markdown", - "id": "b9836f40-4228-4d72-8b84-d94cae1030c6", - "metadata": {}, - "source": [ - "Now we take a random sample that is 1/100th the original dataset size" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "97b8f303-08c8-4353-847e-fc4d712edac6", - "metadata": {}, - "outputs": [], - "source": [ - "label_item_sample = np.random.choice(\n", - " a=label_item_links, size=int(len(label_item_links) / 100), replace=False\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "a05b5914-ddf8-4332-87f9-4e631e71110a", + "execution_count": null, + "id": "6d026603-2fc8-434b-9701-72aab86a6cd6", "metadata": {}, "outputs": [], "source": [ - "first_label_item = Item.from_file(\n", - " os.path.join(\n", - " TMP_DIR,\n", - " BIGEARTHNET_LABEL_COLLECTION,\n", - " label_item_sample[np.random.randint(len(label_item_sample))],\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "65959f74-4d1d-4ce4-8583-bc51d6047ce1", - "metadata": {}, - "source": [ - "Chip ID for the sample label Item pulled:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "4420d40b-2cd3-4f74-b271-a137ca71e360", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'bigearthnet_v1_labels_S2A_MSIL2A_20180413T95032_86_10'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "first_label_item.id" - ] - }, - { - "cell_type": "markdown", - "id": "c0b0734b-ce42-48de-8bea-f4c4f5e9b37f", - "metadata": {}, 
- "source": [ - "Links for the sample label Item, take special note of the `rel=source` Link listed:" + "%%time\n", + "bigearthnet_source_items = bigearthnet_source_search.get_all_items()" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "42d94501-6305-4329-aa4f-ef69802951b8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "first_label_item.links" - ] - }, - { - "cell_type": "markdown", - "id": "5d23fb83-400e-487b-940f-fc158b5ae41b", + "execution_count": null, + "id": "01d88ffe-f530-46d6-87d9-4337c1ec0202", "metadata": { "tags": [] }, + "outputs": [], "source": [ - "### Fetch source items for random sample from BigEarthNet" + "explore_search_extent(bigearthnet_source_items)" ] }, { "cell_type": "markdown", - "id": "e419d96f-f8b9-4911-b5d1-4a4077767062", - "metadata": {}, - "source": [ - "If we had the source collection archive downloaded and uncompressed in the same parent directory as the labels collection, we could reference the source Items and images directly. However the BigEarthNet source collection is over 60GB when compressed. Therefore to work around the disk size limitations of a Planetary Computer instance, we can query the same source items from the MLHub API endpoint, the same way we got the labels, but filter to the exact source item using IDs." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "fa334c57-9dbe-495e-b040-980adf1192c4", + "id": "77109ba6-f0a1-426e-8884-39a5fac2795f", "metadata": {}, - "outputs": [], "source": [ - "def get_source_item_ids(label_item: Item) -> List[str]:\n", - " return [\n", - " link.href.split(\"/\")[-2] for link in label_item.links if link.rel == \"source\"\n", - " ]" + "We can see from this map that the location of the source items fetched are concentrated in Portugal. 
This is merely a consequence of the fact that we fetched the first 5,000 source Items out of the Catalog API with a bounding box search criterion over Spain. Had we downloaded the entire Catalog locally, or run an unfiltered search, we could fetch a random sample that is more representative of the entire dataset." ] }, { - "cell_type": "code", - "execution_count": 22, - "id": "f4fa6f4b-8463-40a4-8aec-527d868be5e2", + "cell_type": "markdown", + "id": "3f09ed9c-66c5-4f4e-8230-1c62e71e2db3", "metadata": {}, - "outputs": [], "source": [ - "origin_source_items = mlhub_catalog.search(\n", - " collections=[BIGEARTHNET_SOURCE_COLLECTION],\n", - " ids=get_source_item_ids(first_label_item),\n", - ").get_all_items()" + "### Configure API connection to Planetary Computer" ] }, { "cell_type": "markdown", - "id": "03e775fa-dd60-4623-8d44-3d83beafcb2c", + "id": "9a320f3a-f6dd-487d-8cef-34b47f327f1d", "metadata": {}, "source": [ - "This is the number of source items that match the query parameters we sent to the MLHub API using the first label's bounding box and datetime properties." + "This makes a connection to the Planetary Computer Data Catalog using the API endpoint URL." 
] }, { "cell_type": "code", - "execution_count": 23, - "id": "2c98a5d6-1a55-466a-b768-4433cea148ca", + "execution_count": null, + "id": "38280716-e130-4de4-9699-2c0bcbfc056d", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "len(origin_source_items)" + "mspc_catalog = pystac_client.Client.open(MSPC_API_URL)" ] }, { "cell_type": "markdown", - "id": "c0ba4eaf-03d8-4e56-92c8-4e98b26ba983", - "metadata": {}, - "source": [ - "Taking a look at some of the properties of the first source Item found:" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "854abe53-e1b7-452a-8aa2-c139aa220348", + "id": "a5e0cbba-32b8-47ad-9913-e7ba2a939922", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "bigearthnet_v1_source_S2A_MSIL2A_20180413T95032_86_10\n", - "2018-04-13 09:50:32+00:00\n", - "[25.247600449332744, 60.30639355471977, 25.269882490837322, 60.317447602103464]\n", - "{'gsd': 30, 'datetime': '2018-04-13T09:50:32Z', 'eo:bands': [{'name': 'B01', 'common_name': 'Coastal Aerosol', 'description': 'Coastal Aerosol'}, {'name': 'B02', 'common_name': 'Blue', 'description': 'Blue'}, {'name': 'B03', 'common_name': 'Green', 'description': 'Green'}, {'name': 'B04', 'common_name': 'Red', 'description': 'Red'}, {'name': 'B05', 'common_name': 'Vegetation Red Edge', 'description': 'Vegetation Red Edge (704.1nm)'}, {'name': 'B06', 'common_name': 'Vegetation Red Edge', 'description': 'Vegetation Red Edge (740.1nm)'}, {'name': 'B07', 'common_name': 'Vegetation Red Edge', 'description': 'Vegetation Red Edge (782.8nm)'}, {'name': 'B08', 'common_name': 'NIR', 'description': 'NIR'}, {'name': 'B8A', 'common_name': 'Narrow NIR', 'description': 'Narrow NIR'}, {'name': 'B09', 'common_name': 'Water Vapour', 'description': 'Water Vapour'}, {'name': 'B11', 'common_name': 'SWIR', 
'description': 'SWIR (1613.7nm)'}, {'name': 'B12', 'common_name': 'SWIR', 'description': 'SWIR (2202.4nm)'}], 'platform': 'Sentinel-2', 'instruments': ['MSI'], 'constellation': 'Sentinel-2'}\n" - ] - } - ], - "source": [ - "for source_item in origin_source_items:\n", - " print(source_item.id)\n", - " print(source_item.datetime)\n", - " print(source_item.bbox)\n", - " print(source_item.properties)\n", - " break" - ] - }, - { - "cell_type": "markdown", - "id": "5303475f-d29b-4e84-82ef-d355ef0519de", - "metadata": {}, "source": [ - "With the properties from this sample source Item, we can observe where the chip is located, the relevant Sentinel-2 bands (assets) and datetime the image was captured." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "01d88ffe-f530-46d6-87d9-4337c1ec0202", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " minx miny maxx maxy\n", - "0 25.2476 60.306394 25.269882 60.317448\n" - ] - }, - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "explore_search_extent(origin_source_items)" + "### Fetch Landsat 8 scenes based on source Item bbox and datetime" ] }, { "cell_type": "markdown", - "id": "77109ba6-f0a1-426e-8884-39a5fac2795f", + "id": "69ad7ef1-143e-47f0-b6b9-26c73e5cc65d", "metadata": {}, "source": [ - "This is the location of the source items fetched from the label Items sample." - ] - }, - { - "cell_type": "markdown", - "id": "a5e0cbba-32b8-47ad-9913-e7ba2a939922", - "metadata": { - "tags": [] - }, - "source": [ - "### Fetch Landsat 8 scenes based on source Item bbox and datetime" + "Since it is known that the BigEarthNet dataset from MLHub has a 1-to-1 pairing of source and labels, we can safely assume the first source item is the appropriate match for our label." ] }, { "cell_type": "markdown", - "id": "4392b23e-eebb-4a53-88d8-f49fbfacfaf1", + "id": "39f35d02-d2de-4be5-9317-c80214502c88", "metadata": {}, "source": [ - "Configure API connection for the microsoft planetary computer stac endpoint" + "We will now use the API client with the helper function above to fetch the best Landsat 8 match for the sampled label Item. This will find only the scenes where the label is completely within the scene, and there is minimal cloud cover." 
] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "1eddda32-d50d-413d-add8-dedaa2f9a067", "metadata": {}, "outputs": [], @@ -818,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "f77e4ec0-a1b8-490c-b732-4c10d89b06ea", "metadata": {}, "outputs": [], @@ -846,163 +437,87 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "c3b23fa5-d588-42d4-b913-fab7819e7ea3", "metadata": {}, "outputs": [], "source": [ - "def get_landsat_8_match(label_item: Item) -> Tuple[Item, Item]:\n", + "def get_landsat_8_match(bbox: List[float], geometry: Dict[str, Any], datetime: str) -> Item:\n", " \"\"\"Finds the best Landsat 8 match using source Item datetime and bounding box.\n", "\n", " Args:\n", - " label_item: the STAC label Item object\n", + " bbox: bounding box of the STAC source Item\n", + " datetime: datetime of the STAC source Item\n", "\n", " Returns:\n", - " Tuple of the BigEarthNet source Item and the Landsat 8 match Item\n", + " best_l8_match: matching Landsat 8 source Item\n", " \"\"\"\n", - " # get the matching source Item properties\n", - " source_items = mlhub_catalog.search(\n", - " collections=[BIGEARTHNET_SOURCE_COLLECTION],\n", - " ids=get_source_item_ids(label_item),\n", - " ).get_all_items()\n", - "\n", - " if source_items:\n", - " source_item = source_items[0]\n", - " source_bbox = source_item.bbox\n", - " source_datetime = source_item.properties[\"datetime\"]\n", "\n", - " # search PC Catalog for L8 Items\n", - " l8_items = mspc_catalog.search(\n", - " collections=PLANETARY_COMPUTER_LANDSAT_8,\n", - " bbox=source_bbox,\n", - " datetime=temporal_buffer(source_datetime, DATE_BUFFER),\n", - " ).get_all_items()\n", + " # search PC Catalog for L8 Items\n", + " l8_items = mspc_catalog.search(\n", + " collections=PLANETARY_COMPUTER_LANDSAT_8,\n", + " bbox=bbox,\n", + " datetime=temporal_buffer(datetime, DATE_BUFFER),\n", + " ).get_all_items()\n", "\n", - " # 
filter to best L8 Item match\n", - " signed_l8_items = planetary_computer.sign(l8_items)\n", - " best_l8_match = min_cloud_cover_scene(\n", - " shape(source_item.geometry), signed_l8_items\n", - " )\n", + " # filter to best L8 Item match\n", + " signed_l8_items = planetary_computer.sign(l8_items)\n", + " best_l8_match = min_cloud_cover_scene(\n", + " shape(geometry), \n", + " signed_l8_items\n", + " )\n", "\n", - " if not best_l8_match:\n", - " print(\n", - " \"No Landsat 8 Item was found on the Planetary \"\n", - " \"Computer matching the query parameters:\"\n", - " )\n", - " print(\n", - " f\"Source Item ID: {source_item.id} \"\n", - " f\"Bbox: {source_bbox}, \"\n", - " f\"Datetime: {source_datetime}\"\n", - " )\n", - " best_l8_match = None\n", - " else:\n", - " print(\n", - " \"No Sentinel-2 source Item was found in the \"\n", - " \"BigEarthNet dataset matching that label item!\"\n", - " )\n", - " source_item = None\n", - " return source_item, best_l8_match" - ] - }, - { - "cell_type": "markdown", - "id": "69ad7ef1-143e-47f0-b6b9-26c73e5cc65d", - "metadata": {}, - "source": [ - "Since it is known that the BigEarthNet dataset from MLHub has a 1-to-1 pairing of source and labels, we can safely assume the first source item is the appropriate match for our label." - ] - }, - { - "cell_type": "markdown", - "id": "ab77ef46-b54a-42e7-a780-bc8e8094ec6f", - "metadata": {}, - "source": [ - "This makes a connection to the Planetary Computer Data Catalog using the API endpoint URL." 
+ " return best_l8_match" ] }, { "cell_type": "code", - "execution_count": 29, - "id": "38280716-e130-4de4-9699-2c0bcbfc056d", + "execution_count": null, + "id": "784d731a-4e04-41ba-99e1-476d828aa65e", "metadata": {}, "outputs": [], "source": [ - "mspc_catalog = pystac_client.Client.open(MSPC_API_URL)" - ] - }, - { - "cell_type": "markdown", - "id": "39f35d02-d2de-4be5-9317-c80214502c88", - "metadata": {}, - "source": [ - "We will now use the API client with the helper function above to fetch the best Landsat 8 match for the sampled label Item. This will find only the scenes where the label is completely within the scene, and there is minimal cloud cover." + "sample_source_item = bigearthnet_source_items[np.random.randint(0, len(bigearthnet_source_items))]" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "id": "53d3ab14-86c7-42a8-be96-8156f5e74d64", "metadata": {}, "outputs": [], "source": [ - "source_item, best_l8_match = get_landsat_8_match(first_label_item)" + "best_l8_match = get_landsat_8_match(\n", + " sample_source_item.bbox,\n", + " sample_source_item.geometry,\n", + " sample_source_item.properties['datetime']\n", + ")" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "id": "53556cf0-375b-4044-9262-d16de707a13d", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LC08_L2SP_187018_20180510_02_T1\n", - "[24.323999281168234, 58.95366546927769, 28.882648861314085, 61.187374530722316]\n", - "{'datetime': '2018-05-10T09:22:33.464049Z', 'platform': 'landsat-8', 'proj:bbox': [356085.0, 6537585.0, 601215.0, 6785115.0], 'proj:epsg': 32635, 'description': 'Landsat Collection 2 Level-2 Surface Reflectance Product', 'instruments': ['oli', 'tirs'], 'eo:cloud_cover': 0.01, 'view:off_nadir': 0, 'landsat:wrs_row': '018', 'landsat:scene_id': 'LC81870182018130LGN00', 'landsat:wrs_path': '187', 'landsat:wrs_type': '2', 'view:sun_azimuth': 163.43293558, 
'view:sun_elevation': 46.70358845, 'landsat:cloud_cover_land': 0.01, 'landsat:processing_level': 'L2SP', 'landsat:collection_number': '02', 'landsat:collection_category': 'T1'}\n" - ] - } - ], + "outputs": [], "source": [ "if best_l8_match:\n", " print(best_l8_match.id)\n", " print(best_l8_match.bbox)\n", + " print(best_l8_match.geometry)\n", " print(best_l8_match.properties)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "3f3b5d88-7f3c-4e74-8e68-2408ba762b0f", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " minx miny maxx maxy\n", - "0 24.32561 58.956597 28.877931 61.185413\n" - ] - }, - { - "data": { - "text/html": [ - "
Make this Notebook Trusted to load map: File -> Trust Notebook
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "explore_search_extent(ItemCollection([best_l8_match]))" ] @@ -1017,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "id": "258a1d9e-55aa-4781-a0f5-e77ace273240", "metadata": {}, "outputs": [], @@ -1039,610 +554,89 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, + "id": "40a5cf10-0c5b-4e42-856a-51c0987f5523", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_rgb_chip(rgb_stack: DataArray, norm: int) -> None:\n", + " img_arr = rgb_stack[0].to_numpy().squeeze()\n", + " fig, ax = plt.subplots(figsize=(7,7))\n", + " show(img_arr/norm, ax=ax)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "1e4e977d-eb14-4aa9-85df-adbe84b1732d", "metadata": {}, "outputs": [], "source": [ "s2_stack = stack(\n", - " items=ItemCollection([source_item]),\n", + " items=ItemCollection([sample_source_item]),\n", " assets=BIGEARTHNET_RGB_BANDS,\n", - " epsg=rio.open(get_redirect_url(source_item.assets[\"B02\"])).crs.to_epsg(),\n", + " epsg=rio.open(get_redirect_url(sample_source_item.assets[\"B02\"])).crs.to_epsg(),\n", " resolution=10,\n", ")" ] }, + { + "cell_type": "markdown", + "id": "fc36fee6-59ea-4554-82e0-20291d6d3004", + "metadata": {}, + "source": [ + "The `stackstac.stack` method returns a DataArray object with width and height for longitude and latitude, and a third dimension for the RGB bands." + ] + }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "id": "287f9b13-82a3-4a5d-82f2-9ab5066c6be1", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.DataArray 'stackstac-fdb94397c7d0c489a40e540385210bc8' (time: 1, band: 3, y: 128, x: 128)>\n",
-       "dask.array<fetch_raster_window, shape=(1, 3, 128, 128), dtype=float64, chunksize=(1, 1, 128, 128), chunktype=numpy.ndarray>\n",
-       "Coordinates:\n",
-       "  * time                 (time) datetime64[ns] 2018-04-13T09:50:32\n",
-       "    id                   (time) <U53 'bigearthnet_v1_source_S2A_MSIL2A_201804...\n",
-       "  * band                 (band) <U3 'B04' 'B03' 'B02'\n",
-       "  * x                    (x) float64 4.032e+05 4.032e+05 ... 4.044e+05 4.044e+05\n",
-       "  * y                    (y) float64 6.688e+06 6.688e+06 ... 6.687e+06 6.687e+06\n",
-       "    platform             <U10 'Sentinel-2'\n",
-       "    instruments          <U3 'MSI'\n",
-       "    constellation        <U10 'Sentinel-2'\n",
-       "    gsd                  int64 30\n",
-       "    title                (band) <U35 'S2A_MSIL2A_20180413T95032_86_10_B04' .....\n",
-       "    common_name          (band) <U5 'Red' 'Green' 'Blue'\n",
-       "    center_wavelength    object None\n",
-       "    full_width_half_max  object None\n",
-       "    epsg                 int64 32635\n",
-       "Attributes:\n",
-       "    spec:        RasterSpec(epsg=32635, bounds=(403160, 6686780, 404440, 6688...\n",
-       "    crs:         epsg:32635\n",
-       "    transform:   | 10.00, 0.00, 403160.00|\\n| 0.00,-10.00, 6688060.00|\\n| 0.0...\n",
-       "    resolution:  10
" - ], - "text/plain": [ - "\n", - "dask.array\n", - "Coordinates:\n", - " * time (time) datetime64[ns] 2018-04-13T09:50:32\n", - " id (time) " - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "s2_stack[0].plot(col=\"band\")" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, "id": "cc2326b7-d1cc-4474-82b6-95f7da05f897", "metadata": {}, "outputs": [], @@ -1654,648 +648,10 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "id": "aa276eef-73ea-4834-a6a1-7cbd205f3d46", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.DataArray 'stackstac-12352f6ccd95fc24747d98826b4a4e63' (time: 1, band: 3, y: 24754, x: 24514)>\n",
-       "dask.array<fetch_raster_window, shape=(1, 3, 24754, 24514), dtype=float64, chunksize=(1, 1, 1024, 1024), chunktype=numpy.ndarray>\n",
-       "Coordinates:\n",
-       "  * time                         (time) datetime64[ns] 2018-05-10T09:22:33.46...\n",
-       "    id                           (time) <U31 'LC08_L2SP_187018_20180510_02_T1'\n",
-       "  * band                         (band) <U5 'SR_B4' 'SR_B3' 'SR_B2'\n",
-       "  * x                            (x) float64 3.561e+05 3.561e+05 ... 6.012e+05\n",
-       "  * y                            (y) float64 6.785e+06 6.785e+06 ... 6.538e+06\n",
-       "    view:sun_elevation           float64 46.7\n",
-       "    landsat:processing_level     <U4 'L2SP'\n",
-       "    instruments                  object {'oli', 'tirs'}\n",
-       "    landsat:cloud_cover_land     float64 0.01\n",
-       "    landsat:wrs_path             <U3 '187'\n",
-       "    landsat:collection_number    <U2 '02'\n",
-       "    eo:cloud_cover               float64 0.01\n",
-       "    description                  (band) <U56 'Collection 2 Level-2 Red Band (...\n",
-       "    proj:epsg                    int64 32635\n",
-       "    landsat:wrs_row              <U3 '018'\n",
-       "    view:off_nadir               int64 0\n",
-       "    proj:bbox                    object {6537585.0, 6785115.0, 356085.0, 6012...\n",
-       "    view:sun_azimuth             float64 163.4\n",
-       "    platform                     <U9 'landsat-8'\n",
-       "    landsat:scene_id             <U21 'LC81870182018130LGN00'\n",
-       "    landsat:collection_category  <U2 'T1'\n",
-       "    landsat:wrs_type             <U1 '2'\n",
-       "    gsd                          float64 30.0\n",
-       "    proj:transform               object {0.0, -30.0, 356085.0, 6785115.0, 30.0}\n",
-       "    proj:shape                   object {8251, 8171}\n",
-       "    title                        (band) <U15 'Red Band (B4)' ... 'Blue Band (...\n",
-       "    common_name                  (band) <U5 'red' 'green' 'blue'\n",
-       "    center_wavelength            (band) float64 0.65 0.56 0.48\n",
-       "    full_width_half_max          (band) float64 0.04 0.06 0.06\n",
-       "    epsg                         int64 32635\n",
-       "Attributes:\n",
-       "    spec:        RasterSpec(epsg=32635, bounds=(356080, 6537580, 601220, 6785...\n",
-       "    crs:         epsg:32635\n",
-       "    transform:   | 10.00, 0.00, 356080.00|\\n| 0.00,-10.00, 6785120.00|\\n| 0.0...\n",
-       "    resolution:  10
" - ], - "text/plain": [ - "\n", - "dask.array\n", - "Coordinates:\n", - " * time (time) datetime64[ns] 2018-05-10T09:22:33.46...\n", - " id (time) \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.DataArray 'stackstac-85199aa0104380729f5195e42dc990d3' (time: 1, band: 3, y: 128, x: 128)>\n",
-       "dask.array<fetch_raster_window, shape=(1, 3, 128, 128), dtype=float64, chunksize=(1, 1, 128, 128), chunktype=numpy.ndarray>\n",
-       "Coordinates:\n",
-       "  * time                         (time) datetime64[ns] 2018-05-10T09:22:33.46...\n",
-       "    id                           (time) <U31 'LC08_L2SP_187018_20180510_02_T1'\n",
-       "  * band                         (band) <U5 'SR_B4' 'SR_B3' 'SR_B2'\n",
-       "  * x                            (x) float64 4.032e+05 4.032e+05 ... 4.044e+05\n",
-       "  * y                            (y) float64 6.688e+06 6.688e+06 ... 6.687e+06\n",
-       "    view:sun_elevation           float64 46.7\n",
-       "    landsat:processing_level     <U4 'L2SP'\n",
-       "    instruments                  object {'oli', 'tirs'}\n",
-       "    landsat:cloud_cover_land     float64 0.01\n",
-       "    landsat:wrs_path             <U3 '187'\n",
-       "    landsat:collection_number    <U2 '02'\n",
-       "    eo:cloud_cover               float64 0.01\n",
-       "    description                  (band) <U56 'Collection 2 Level-2 Red Band (...\n",
-       "    proj:epsg                    int64 32635\n",
-       "    landsat:wrs_row              <U3 '018'\n",
-       "    view:off_nadir               int64 0\n",
-       "    proj:bbox                    object {6537585.0, 6785115.0, 356085.0, 6012...\n",
-       "    view:sun_azimuth             float64 163.4\n",
-       "    platform                     <U9 'landsat-8'\n",
-       "    landsat:scene_id             <U21 'LC81870182018130LGN00'\n",
-       "    landsat:collection_category  <U2 'T1'\n",
-       "    landsat:wrs_type             <U1 '2'\n",
-       "    gsd                          float64 30.0\n",
-       "    proj:transform               object {0.0, -30.0, 356085.0, 6785115.0, 30.0}\n",
-       "    proj:shape                   object {8251, 8171}\n",
-       "    title                        (band) <U15 'Red Band (B4)' ... 'Blue Band (...\n",
-       "    common_name                  (band) <U5 'red' 'green' 'blue'\n",
-       "    center_wavelength            (band) float64 0.65 0.56 0.48\n",
-       "    full_width_half_max          (band) float64 0.04 0.06 0.06\n",
-       "    epsg                         int64 32635\n",
-       "Attributes:\n",
-       "    spec:        RasterSpec(epsg=32635, bounds=(403160, 6686780, 404440, 6688...\n",
-       "    crs:         epsg:32635\n",
-       "    transform:   | 10.00, 0.00, 403160.00|\\n| 0.00,-10.00, 6688060.00|\\n| 0.0...\n",
-       "    resolution:  10
" - ], - "text/plain": [ - "\n", - "dask.array\n", - "Coordinates:\n", - " * time (time) datetime64[ns] 2018-05-10T09:22:33.46...\n", - " id (time) " - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "l8_cropped[0].plot(col=\"band\")" - ] - }, - { - "cell_type": "markdown", - "id": "bc46c85b-daa9-44df-9be7-62dfa1234b25", + "execution_count": null, + "id": "355ee719-0365-40e6-969f-4193ca2a59ec", "metadata": {}, + "outputs": [], "source": [ - "Now we have a cropped Landsat 8 chip that spatially and temporally matches our Sentinel-2 source imagery and label sample from the BigEarthNet dataset." + "l8_cropped[0].data.compute()" ] }, { - "cell_type": "markdown", - "id": "3a70b063-d273-4aee-864b-07e318388890", + "cell_type": "code", + "execution_count": null, + "id": "95507bc8-521e-43d8-93d6-06ab62b78fa8", "metadata": {}, + "outputs": [], "source": [ - "### Launch a Dask gateway cluster for parallel processing" + "plot_rgb_chip(l8_cropped, 23000)" ] }, { "cell_type": "markdown", - "id": "cb650d80-bf1a-4aad-8f8b-08a612e28aae", - "metadata": {}, - "source": [ - "We will use Dask to optimize our data processing of hundreds of Landsat-8 scenes by parallelizing the workflow with a delayed computation graph. The Dask Client schedules, runs the delayed computations, and gathers the results, while the Dask Gateway provides a secure and centralized way of managing the multiple client clusters. This is especially useful for running Dask on Planetary Computer." - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "29531759-6d19-4010-8401-eb947a32c515", + "id": "bc46c85b-daa9-44df-9be7-62dfa1234b25", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "
\n", - "
\n", - "

Client

\n", - "

Client-da516f30-d223-11ec-8ad1-52879e68a5a2

\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "
Connection method: Cluster objectCluster type: distributed.LocalCluster
\n", - " Dashboard: http://127.0.0.1:8787/status\n", - "
\n", - "\n", - " \n", - "
\n", - "

Cluster Info

\n", - "
\n", - "
\n", - "
\n", - "
\n", - "

LocalCluster

\n", - "

ac658c6d

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - " \n", - "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", - " \n", - " Workers: 4\n", - "
\n", - " Total threads: 8\n", - " \n", - " Total memory: 16.00 GiB\n", - "
Status: runningUsing processes: True
\n", - "\n", - "
\n", - " \n", - "

Scheduler Info

\n", - "
\n", - "\n", - "
\n", - "
\n", - "
\n", - "
\n", - "

Scheduler

\n", - "

Scheduler-9dd4c443-c650-46ca-a024-f39fdc6ca132

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " Comm: tcp://127.0.0.1:50711\n", - " \n", - " Workers: 4\n", - "
\n", - " Dashboard: http://127.0.0.1:8787/status\n", - " \n", - " Total threads: 8\n", - "
\n", - " Started: Just now\n", - " \n", - " Total memory: 16.00 GiB\n", - "
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "

Workers

\n", - "
\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: 0

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tcp://127.0.0.1:50731\n", - " \n", - " Total threads: 2\n", - "
\n", - " Dashboard: http://127.0.0.1:50732/status\n", - " \n", - " Memory: 4.00 GiB\n", - "
\n", - " Nanny: tcp://127.0.0.1:50717\n", - "
\n", - " Local directory: /Users/kendallsmith/radiant/repos/PlanetaryComputerExamples/tutorials/dask-worker-space/worker-xtk5cjq6\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: 1

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tcp://127.0.0.1:50723\n", - " \n", - " Total threads: 2\n", - "
\n", - " Dashboard: http://127.0.0.1:50725/status\n", - " \n", - " Memory: 4.00 GiB\n", - "
\n", - " Nanny: tcp://127.0.0.1:50715\n", - "
\n", - " Local directory: /Users/kendallsmith/radiant/repos/PlanetaryComputerExamples/tutorials/dask-worker-space/worker-n187cgkt\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: 2

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tcp://127.0.0.1:50722\n", - " \n", - " Total threads: 2\n", - "
\n", - " Dashboard: http://127.0.0.1:50724/status\n", - " \n", - " Memory: 4.00 GiB\n", - "
\n", - " Nanny: tcp://127.0.0.1:50714\n", - "
\n", - " Local directory: /Users/kendallsmith/radiant/repos/PlanetaryComputerExamples/tutorials/dask-worker-space/worker-_01c5u6w\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "

Worker: 3

\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - "\n", - " \n", - "\n", - "
\n", - " Comm: tcp://127.0.0.1:50728\n", - " \n", - " Total threads: 2\n", - "
\n", - " Dashboard: http://127.0.0.1:50729/status\n", - " \n", - " Memory: 4.00 GiB\n", - "
\n", - " Nanny: tcp://127.0.0.1:50716\n", - "
\n", - " Local directory: /Users/kendallsmith/radiant/repos/PlanetaryComputerExamples/tutorials/dask-worker-space/worker-387_p8o1\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "\n", - "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "client = dask.distributed.Client() # you can configure Dask client parameters here\n", - "client" + "Now we have a cropped Landsat 8 chip that spatially and temporally matches our Sentinel-2 source imagery and label sample from the BigEarthNet dataset. The first observation is that the Landsat 8 image appears blurry compared to Sentinel-2. This is because Sentinel-2 RGB bands have a 10m resolution, while the same bands for Landsat 8 have a 30m resolution." ] }, { - "cell_type": "code", - "execution_count": 2, - "id": "55d9dc11-d3b8-4edc-a5fd-acb2faba1c18", + "cell_type": "markdown", + "id": "f3b09697-b6ab-4026-bfcb-2f5214b03f5c", "metadata": {}, - "outputs": [], "source": [ - "# client.close()" + "### Scale the workflow using Dask Delayed" ] }, { "cell_type": "markdown", - "id": "f3b09697-b6ab-4026-bfcb-2f5214b03f5c", + "id": "cb650d80-bf1a-4aad-8f8b-08a612e28aae", "metadata": {}, "source": [ - "### Scale the workflow using Dask Delayed" + "We will now use Dask to optimize processing the Landsat-8 scenes by parallelizing the workflow with a delayed computation graph. The Dask Client schedules, runs the delayed computations, and gathers the results. With parallel processing, we can speed up the runtime of our image processing workflow by 10-20x." ] }, { @@ -3314,54 +740,52 @@ "id": "5368c39f-94a1-41ba-acc0-fd18c8dc1c18", "metadata": {}, "source": [ - "These are two helper functions that we will use to encapsulate the process of creating the cropped Landsat 8 chips and write them to disk in parallel using the Dask Delayed decorator." + "These are some helper functions that we will use to encapsulate the process of creating the cropped Landsat 8 chips and write them to disk in parallel using the Dask Delayed decorator." 
] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "id": "c924acb1-092e-4f86-b73f-5b56ccdebe27", "metadata": {}, "outputs": [], "source": [ - "def create_landsat_8_dataarray(item_path: str) -> DataArray:\n", + "def create_landsat_8_chip(source_item: Dict[str, any]) -> DataArray:\n", " \"\"\"Creates a Landsat 8 chip from BigEarthNet label chip.\n", "\n", " Args:\n", - " item_path: string path to the label item on disk\n", + " source_item: JSON/dictionary representation of source Item\n", "\n", " Returns:\n", - " Landsat 8 DataArray that has been cropped to label bbox\n", + " Landsat 8 DataArray that has been cropped to sentinel-2 bbox\n", " \"\"\"\n", - " # read label Item object\n", - " label_item = Item.from_file(\n", - " os.path.join(TMP_DIR, BIGEARTHNET_LABEL_COLLECTION, item_path)\n", - " )\n", "\n", " # fetch the Landsat 8 scene that best matches the label\n", - " s2_source, l8_match = get_landsat_8_match(label_item)\n", + " l8_match = get_landsat_8_match(\n", + " source_item['bbox'],\n", + " source_item['geometry'],\n", + " source_item['properties']['datetime']\n", + " )\n", "\n", - " if l8_match:\n", - " # crop L8 match to S2 dims and read image data\n", - " l8_stack = stack(\n", - " items=ItemCollection([l8_match]),\n", - " assets=LANDSAT_8_RGB_BANDS,\n", - " bounds_latlon=s2_source.bbox,\n", - " resolution=10,\n", - " )\n", + " # crop L8 match to S2 dims and read image data\n", + " l8_stack = stack(\n", + " items=ItemCollection([l8_match]),\n", + " assets=LANDSAT_8_RGB_BANDS,\n", + " bounds_latlon=source_item['bbox'],\n", + " resolution=10,\n", + " )\n", "\n", - " return l8_stack\n", - " return None" + " return l8_stack" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "id": "02b974fe-0c6f-40a3-ad63-2f4753e0236b", "metadata": {}, "outputs": [], "source": [ - "def write_tifs_bands(l8_array: DataArray, l8_item_id: str) -> None:\n", + "def write_tif_bands(l8_array: DataArray, l8_item_id: str) -> 
None:\n", " \"\"\"Writes to a GeoTiff for each band in Landsat 8 DataArray\n", "\n", " Args:\n", @@ -3387,67 +811,56 @@ }, { "cell_type": "code", - "execution_count": 58, - "id": "6bba4435-3029-4e0f-b1c8-6e37f225184f", + "execution_count": null, + "id": "29531759-6d19-4010-8401-eb947a32c515", "metadata": {}, "outputs": [], "source": [ - "item_bag = dask.bag.from_sequence(label_item_sample).map(create_landsat_8_dataarray)" + "client = dask.distributed.Client() # you can configure Dask client parameters here\n", + "client" ] }, { - "cell_type": "code", - "execution_count": 59, - "id": "4420a2eb-8e3f-479b-a055-b5394b25f837", + "cell_type": "markdown", + "id": "6a30a2f9-a176-436b-a132-01903ab72fd3", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dask.bag" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "item_bag" + "One quirk of working with the DataArray objects returned from `stackstac.stack()` is that writing them with `rio.to_raster()` (a method provided by the `rioxarray` library) raises an error saying the DataArrays don't have that method unless `rioxarray` has been imported. Normally we could solve this problem by explicitly importing the `rioxarray` library in the notebook, but because the writes run on Dask workers we also need to import the module on each worker in the client cluster. " ] }, { "cell_type": "code", - "execution_count": 60, - "id": "604aeb5d-f0d0-4045-963b-0c603e336c24", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%time\n", - "computed_result = item_bag.compute()" - ] - }, - { - "cell_type": "markdown", - "id": "6f95d76c-dc3e-4591-a596-a95e27a3dbde", + "execution_count": null, + "id": "16e0b91b-0d73-4335-8ba9-00f75850d409", "metadata": {}, + "outputs": [], "source": [ - "Lastly, we want to write a GeoTIFF to disk for each band of each Landsat 8 DataArray we created." 
+ "import importlib\n", + "client.run(lambda: importlib.import_module(\"rioxarray\"))" ] }, { "cell_type": "code", "execution_count": null, - "id": "e1436d4d-10ac-4b39-88fb-5150fe9df12b", - "metadata": {}, + "id": "f3c67a19-442d-484c-8643-93c4d00c60a0", + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "%%time\n", - "for l8_array in computed_result:\n", - " if isinstance(l8_array, DataArray):\n", - " write_tifs_bands(l8_array, l8_array.id.values[0])" + "chunk_size = 125\n", + "\n", + "for i in range(0, len(bigearthnet_source_items[0:500]), chunk_size):\n", + " future_pool = []\n", + " item_chunk=bigearthnet_source_items[i:i+chunk_size]\n", + " for source_item in item_chunk:\n", + " item_dict = dask.delayed(Item.to_dict)(source_item)\n", + " l8_xarray = dask.delayed(create_landsat_8_chip)(item_dict)\n", + " image_writer = dask.delayed(write_tif_bands)(l8_xarray, item_dict['id'])\n", + " future_pool.append(image_writer)\n", + " future_pool = dask.persist(*future_pool)\n", + " dask.compute(*future_pool)" ] }, { @@ -3455,7 +868,7 @@ "id": "2bde6ce7-9d7a-4114-88ba-56e4a4bea247", "metadata": {}, "source": [ - "This confirms that folders with images were written to disk. If there is a discrepancy between the sample size and the output, it's likely that there wasn't always a matching Landsat 8 scene given the geometry and datetime parameters for a particular Sentinel-2 source Item." + "Now that our parallelized workflow has completed, let's confirm that folders with images were written to disk." ] }, { @@ -3474,7 +887,7 @@ "id": "e0884796-bfab-4905-aa75-2f8dd43a5a13", "metadata": {}, "source": [ - "Open one of the new Landsat 8 chips to inspect what it looks like." + "We can also open one of the new Landsat 8 chips to inspect what it looks like." ] }, { @@ -3494,14 +907,16 @@ "id": "37581811-0f21-4838-9541-db84688032f6", "metadata": {}, "source": [ - "Shutdown the Dask client to cleanup cluster resources." 
+ "Lastly, we will shut down the Dask client to clean up cluster resources." ] }, { "cell_type": "code", "execution_count": null, "id": "1d643476-917b-484b-a041-8f3c94d12c06", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "client.shutdown()"