diff --git a/examples/research/virtual_philadelphia/rasterization_report.ipynb b/examples/research/virtual_philadelphia/rasterization_report.ipynb index fcd166e..d1cd5a1 100644 --- a/examples/research/virtual_philadelphia/rasterization_report.ipynb +++ b/examples/research/virtual_philadelphia/rasterization_report.ipynb @@ -50,12 +50,10 @@ "import geopandas as gpd\n", "import pandas as pd\n", "import numpy as np\n", + "from shapely.geometry import box\n", "\n", "from nomad.city_gen import RasterCity\n", - "\n", - "SANDBOX_PATH = Path(\"sandbox/sandbox_data.gpkg\")\n", - "USE_SUBSET = False # Set to True for faster testing\n", - "SUBSET_SIZE = 5000" + "import nomad.map_utils as nm" ] }, { @@ -63,7 +61,7 @@ "id": "1df10645", "metadata": {}, "source": [ - "## Load Data" + "## Configuration" ] }, { @@ -73,15 +71,30 @@ "metadata": {}, "outputs": [], "source": [ - "buildings = gpd.read_file(SANDBOX_PATH, layer=\"buildings\")\n", - "streets = gpd.read_file(SANDBOX_PATH, layer=\"streets\")\n", - "boundary = gpd.read_file(SANDBOX_PATH, layer=\"boundary\")\n", + "SMALL_BOX = box(-75.1545, 39.946, -75.1425, 39.9535)\n", + "MEDIUM_BOX = box(-75.1665, 39.9385, -75.1425, 39.9535)\n", + "LARGE_BOX = box(-75.1905, 39.9235, -75.1425, 39.9535)\n", + "\n", + "# PHILLY_BOX = SMALL_BOX\n", + "# PHILLY_BOX = MEDIUM_BOX\n", + "PHILLY_BOX = LARGE_BOX\n", + "\n", + "BLOCK_SIDE_LENGTH = 10.0 # meters\n", "\n", - "if USE_SUBSET:\n", - " buildings = buildings.head(SUBSET_SIZE)\n", - " print(f\"Using subset: {len(buildings):,} buildings\")\n", + "OUTPUT_DIR = Path(\"sandbox\")\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Determine which box we're using and set appropriate filename\n", + "if PHILLY_BOX == SMALL_BOX:\n", + " BOX_NAME = \"small\"\n", + "elif PHILLY_BOX == MEDIUM_BOX:\n", + " BOX_NAME = \"medium\"\n", "else:\n", - " print(f\"Full dataset: {len(buildings):,} buildings, {len(streets):,} streets\")" + " BOX_NAME = \"large\"\n", + "\n", + "SANDBOX_GPKG = OUTPUT_DIR / f\"sandbox_data_{BOX_NAME}.gpkg\"\n", + "\n", + "REGENERATE_DATA = False # Set to True to re-download OSM data" ] }, { @@ -89,9 +102,7 @@ "id": "dafce2a2", "metadata": {}, "source": [ - "## Generate City\n", - "\n", - "Converts vector geometries into discrete grid (block_size = 15 meters)" + "## Benchmark: Data Generation (OSM Download + Rotation)" ] }, { @@ -101,14 +112,71 @@ "metadata": {}, "outputs": [], "source": [ - "t0 = time.time()\n", - "city = RasterCity(boundary.geometry.iloc[0], streets, buildings, block_side_length=15.0)\n", - "gen_time = time.time() - t0\n", + "if REGENERATE_DATA or not SANDBOX_GPKG.exists():\n", + " print(\"=\"*50)\n", + " print(\"DATA GENERATION\")\n", + " print(\"=\"*50)\n", + " \n", + " t0 = time.time()\n", + " buildings = nm.download_osm_buildings(\n", + " PHILLY_BOX,\n", + " crs=\"EPSG:3857\",\n", + " schema=\"garden_city\",\n", + " clip=True,\n", + " infer_building_types=True,\n", + " explode=True,\n", + " )\n", + " download_buildings_time = time.time() - t0\n", + " print(f\"Buildings download: {download_buildings_time:>6.2f}s ({len(buildings):,} buildings)\")\n", + " \n", + " boundary_polygon = gpd.GeoDataFrame(geometry=[PHILLY_BOX], crs=\"EPSG:4326\").to_crs(\"EPSG:3857\").geometry.iloc[0]\n", + " outside_mask = ~buildings.geometry.within(boundary_polygon)\n", + " if outside_mask.any():\n", + " buildings = gpd.clip(buildings, gpd.GeoDataFrame(geometry=[boundary_polygon], crs=\"EPSG:3857\"))\n", + " buildings = nm.remove_overlaps(buildings).reset_index(drop=True)\n", + " \n", + " t1 = time.time()\n", + " streets = nm.download_osm_streets(\n", + " PHILLY_BOX,\n", + " crs=\"EPSG:3857\",\n", + " clip=True,\n", + " explode=True,\n", + " graphml_path=OUTPUT_DIR / \"streets_consolidated.graphml\",\n", + " )\n", + " download_streets_time = time.time() - t1\n", + " print(f\"Streets download: {download_streets_time:>6.2f}s ({len(streets):,} streets)\")\n", + " \n", + " streets = streets.reset_index(drop=True)\n", + " \n", + " t2 = time.time()\n", + " rotated_streets, rotation_deg = nm.rotate_streets_to_align(streets, k=200)\n", + " rotation_time = time.time() - t2\n", + " print(f\"Grid rotation: {rotation_time:>6.2f}s ({rotation_deg:.2f}°)\")\n", + " \n", + " rotated_buildings = nm.rotate(buildings, rotation_deg=rotation_deg)\n", + " rotated_boundary = nm.rotate(\n", + " gpd.GeoDataFrame(geometry=[boundary_polygon], crs=\"EPSG:3857\"),\n", + " rotation_deg=rotation_deg\n", + " )\n", + " \n", + " if SANDBOX_GPKG.exists():\n", + " SANDBOX_GPKG.unlink()\n", + " \n", + " rotated_buildings.to_file(SANDBOX_GPKG, layer=\"buildings\", driver=\"GPKG\")\n", + " rotated_streets.to_file(SANDBOX_GPKG, layer=\"streets\", driver=\"GPKG\", mode=\"a\")\n", + " rotated_boundary.to_file(SANDBOX_GPKG, layer=\"boundary\", driver=\"GPKG\", mode=\"a\")\n", + " \n", + " data_gen_time = download_buildings_time + download_streets_time + rotation_time\n", + " print(\"-\"*50)\n", + " print(f\"Data generation: {data_gen_time:>6.2f}s\")\n", + " print(\"=\"*50 + \"\\n\")\n", + "else:\n", + " print(f\"Loading existing data from {SANDBOX_GPKG}\")\n", + " data_gen_time = 0.0\n", "\n", - "print(f\"\\nCity generation: {gen_time:.2f}s\")\n", - "print(f\" Blocks: {len(city.blocks_gdf):,}\")\n", - "print(f\" Streets: {len(city.streets_gdf):,}\")\n", - "print(f\" Buildings: {len(city.buildings_gdf):,}\")" + "buildings = gpd.read_file(SANDBOX_GPKG, layer=\"buildings\")\n", + "streets = gpd.read_file(SANDBOX_GPKG, layer=\"streets\")\n", + "boundary = gpd.read_file(SANDBOX_GPKG, layer=\"boundary\")" ] }, { @@ -116,133 +184,106 @@ "id": "addde79f", "metadata": {}, "source": [ - "## Street Graph\n", - "\n", - "NetworkX graph for pathfinding" + "## Benchmark: Rasterization Pipeline" ] }, { "cell_type": "code", "execution_count": null, "id": "9938250f", - "metadata": {}, + "metadata": { + "lines_to_next_cell": 1 + }, "outputs": [], "source": [ - "t0 = time.time()\n", - "G = city.get_street_graph()\n", - "graph_time = time.time() - t0\n", - "\n", - "print(f\"\\nStreet graph: {graph_time:.2f}s\")\n", - "print(f\" Nodes: {len(G.nodes):,}\")\n", - "print(f\" Edges: {len(G.edges):,}\")" - ] - }, - { - "cell_type": "markdown", - "id": "ae69f7de", - "metadata": {}, - "source": [ - "## Hub Network\n", + "print(\"=\"*50)\n", + "print(\"RASTERIZATION PIPELINE\")\n", + "print(\"=\"*50)\n", "\n", - "Sparse hub-to-hub distance matrix for efficient routing shortcuts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3371ff37-6804-4acb-aad8-86b898467b6c", - "metadata": {}, - "outputs": [], - "source": [ "hub_size = 100\n", - "t0 = time.time()\n", - "city._build_hub_network(hub_size=hub_size)\n", - "hub_time = time.time() - t0\n", - "\n", - "print(f\"\\nHub network ({hub_size} hubs): {hub_time:.2f}s\")\n", - "print(f\" Matrix shape: {city.hub_df.shape}\")\n", - "print(f\"\\nSample (first 5x5):\")\n", - "print(city.hub_df.iloc[:5, :5])" - ] - }, - { - "cell_type": "markdown", - "id": "811bd2d7", - "metadata": {}, - "source": [ - "## Gravity Matrix\n", + "resolve_overlaps = True\n", "\n", - "Building-to-building gravity using vectorized Manhattan distances + hub shortcuts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f984958-4cae-4bd0-9ac4-789455c7ed4f", - "metadata": {}, - "outputs": [], - "source": [ "t0 = time.time()\n", - "city.compute_gravity(exponent=2.0)\n", - "grav_time = time.time() - t0\n", + "city = RasterCity(boundary.geometry.iloc[0], streets, buildings, block_side_length=BLOCK_SIDE_LENGTH, resolve_overlaps=resolve_overlaps)\n", + "gen_time = time.time() - t0\n", + "print(f\"City generation: {gen_time:>6.2f}s\")\n", "\n", - "print(f\"\\nGravity matrix: {grav_time:.2f}s\")\n", - "print(f\" Shape: {city.grav.shape}\")\n", + "t1 = time.time()\n", + "G = city.get_street_graph()\n", + "graph_time = time.time() - t1\n", + "print(f\"Street graph: {graph_time:>6.2f}s\")\n", "\n", - "# Detailed diagnostics\n", - "diag = np.diag(city.grav.values)\n", - "diag_zeros = (diag == 0).all()\n", - "print(f\" Diagonal all zeros: {diag_zeros}\")\n", - "if not diag_zeros:\n", - " nonzero_diag = np.where(diag != 0)[0]\n", - " print(f\" Non-zero diagonal entries: {len(nonzero_diag)}\")\n", - " print(f\" Example diagonal values: {diag[nonzero_diag[:5]]}\")\n", - " print(f\" Corresponding building IDs: {[city.grav.index[i] for i in nonzero_diag[:5]]}\")\n", + "t2 = time.time()\n", + "city._build_hub_network(hub_size=hub_size)\n", + "hub_time = time.time() - t2\n", + "print(f\"Hub network: {hub_time:>6.2f}s\")\n", + "\n", + "t3 = time.time()\n", + "city.compute_gravity(exponent=2.0, callable_only=True)\n", + "grav_time = time.time() - t3\n", + "print(f\"Gravity computation: {grav_time:>6.2f}s\")\n", "\n", - "mask = ~np.eye(len(city.grav), dtype=bool)\n", - "offdiag = city.grav.values[mask]\n", - "offdiag_positive = (offdiag > 0).all()\n", - "print(f\" Off-diagonal all positive: {offdiag_positive}\")\n", - "if not offdiag_positive:\n", - " nonpositive = np.where(offdiag <= 0)[0]\n", - " print(f\" Non-positive off-diagonal entries: {len(nonpositive)} / {len(offdiag)}\")\n", - " print(f\" Min off-diagonal value: {offdiag.min()}\")\n", - " print(f\" Max off-diagonal value: {offdiag.max()}\")\n", - " # Find a specific example\n", - " rows, cols = np.where((city.grav.values <= 0) & ~np.eye(len(city.grav), dtype=bool))\n", - " if len(rows) > 0:\n", - " print(f\" Example: grav[{city.grav.index[rows[0]]}][{city.grav.columns[cols[0]]}] = {city.grav.iloc[rows[0], cols[0]]}\")\n", + "raster_time = gen_time + graph_time + hub_time + grav_time\n", + "print(\"-\"*50)\n", + "print(f\"Rasterization: {raster_time:>6.2f}s\")\n", + "print(\"=\"*50)\n", "\n", - "print(f\"\\nSample (first building to 5 others):\")\n", - "print(city.grav.iloc[0, :5])" + "if data_gen_time > 0:\n", + " total_time = data_gen_time + raster_time\n", + " print(f\"\\nTotal (with data): {total_time:>6.2f}s\")" ] }, { "cell_type": "markdown", - "id": "81e349ec", + "id": "ae69f7de", "metadata": {}, "source": [ - "## Summary" + "## Summary: City Structure" ] }, { "cell_type": "code", "execution_count": null, - "id": "c299b01f-2d47-46bd-bbc4-f48e63f9a83f", + "id": "3371ff37-6804-4acb-aad8-86b898467b6c", "metadata": {}, "outputs": [], "source": [ - "total_time = gen_time + graph_time + hub_time + grav_time\n", - "print(\"\\n\" + \"=\"*50)\n", - "print(\"TIMING SUMMARY\")\n", - "print(\"=\"*50)\n", - "print(f\"City generation: {gen_time:>6.2f}s\")\n", - "print(f\"Street graph: {graph_time:>6.2f}s\")\n", - "print(f\"Hub network: {hub_time:>6.2f}s\")\n", - "print(f\"Gravity matrix: {grav_time:>6.2f}s\")\n", - "print(\"-\"*50)\n", - "print(f\"Total: {total_time:>6.2f}s\")\n", - "print(\"=\"*50)" + "def get_size_mb(obj):\n", + " \"\"\"Estimate memory size in MB for common objects.\"\"\"\n", + " if isinstance(obj, (pd.DataFrame, gpd.GeoDataFrame)):\n", + " return obj.memory_usage(deep=True).sum() / 1024**2\n", + " elif hasattr(obj, 'nodes') and hasattr(obj, 'edges'): # NetworkX graph\n", + " # Approximate: 64 bytes per node + 96 bytes per edge\n", + " return (len(obj.nodes) * 64 + len(obj.edges) * 96) / 1024**2\n", + " else:\n", + " return 0.0\n", + "\n", + "summary_df = pd.DataFrame({\n", + " 'Component': ['Blocks', 'Streets', 'Buildings', 'Graph Nodes', 'Graph Edges', 'Hub Network', 'Hub Info', 'Nearby Doors', 'Gravity (callable)'],\n", + " 'Count/Shape': [\n", + " f\"{len(city.blocks_gdf):,}\",\n", + " f\"{len(city.streets_gdf):,}\",\n", + " f\"{len(city.buildings_gdf):,}\",\n", + " f\"{len(G.nodes):,}\",\n", + " f\"{len(G.edges):,}\",\n", + " f\"{city.hub_df.shape[0]}×{city.hub_df.shape[1]}\",\n", + " f\"{city.grav_hub_info.shape[0]}×{city.grav_hub_info.shape[1]}\",\n", + " f\"{len(city.mh_dist_nearby_doors):,} pairs\",\n", + " \"function\"\n", + " ],\n", + " 'Memory (MB)': [\n", + " f\"{get_size_mb(city.blocks_gdf):.1f}\",\n", + " f\"{get_size_mb(city.streets_gdf):.1f}\",\n", + " f\"{get_size_mb(city.buildings_gdf):.1f}\",\n", + " f\"{get_size_mb(G):.1f}\",\n", + " \"-\",\n", + " f\"{get_size_mb(city.hub_df):.1f}\",\n", + " f\"{get_size_mb(city.grav_hub_info):.1f}\",\n", + " f\"{get_size_mb(city.mh_dist_nearby_doors):.1f}\",\n", + " \"<0.1\"\n", + " ]\n", + "})\n", + "print(\"\\n\" + summary_df.to_string(index=False))" ] } ], @@ -251,9 +292,9 @@ "formats": "ipynb,py:percent" }, "kernelspec": { - "display_name": "Python (nomad repo venv)", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "nomad-repo-venv" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -265,7 +306,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.10.0" } }, "nbformat": 4, diff --git a/examples/research/virtual_philadelphia/rasterization_report.py b/examples/research/virtual_philadelphia/rasterization_report.py index 410e4a0..ced7ca0 100644 --- a/examples/research/virtual_philadelphia/rasterization_report.py +++ b/examples/research/virtual_philadelphia/rasterization_report.py @@ -6,11 +6,11 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.17.3 +# jupytext_version: 1.17.1 # kernelspec: -# display_name: Python (nomad repo venv) +# display_name: Python 3 (ipykernel) # language: python -# name: nomad-repo-venv +# name: python3 # --- # %% [markdown] diff --git a/examples/research/virtual_philadelphia/sample_step_benchmark.ipynb b/examples/research/virtual_philadelphia/sample_step_benchmark.ipynb new file mode 100644 index 0000000..36781d9 --- /dev/null +++ b/examples/research/virtual_philadelphia/sample_step_benchmark.ipynb @@ -0,0 +1,326 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bb9c09b0", + "metadata": {}, + "source": [ + "# Benchmark: `_sample_step` Performance Analysis\n", + "\n", + "## Goals\n", + "1. Benchmark repeated `get_shortest_path()` calls (callable mode)\n", + "2. Profile trajectory generation to identify bottlenecks\n", + "3. Test optimization strategies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abe477f0", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import numpy as np\n", + "import pandas as pd\n", + "import geopandas as gpd\n", + "from pathlib import Path\n", + "\n", + "from nomad.city_gen import RasterCity\n", + "from nomad.traj_gen import Population" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40f4fcb9", + "metadata": {}, + "outputs": [], + "source": [ + "# Configuration\n", + "BOX_SIZE = 'small' # 'small', 'medium', or 'large'\n", + "BLOCK_SIDE_LENGTH = 10.0\n", + "HUB_SIZE = 100\n", + "MAX_MANHATTAN_DIST = 30\n", + "NUM_PATH_QUERIES = 100\n", + "SIMULATION_HOURS = 24\n", + "DT = 0.5 # minutes\n", + "EPR_TIME_RES = 15\n", + "RHO = 0.4\n", + "GAMMA = 0.3\n", + "SEED = 42" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4c9cd03", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"=\"*60)\n", + "print(f\"SETUP: {BOX_SIZE.upper()} BOX\")\n", + "print(\"=\"*60)\n", + "\n", + "# Load OSM data and rasterize\n", + "data_dir = Path(\"sandbox\")\n", + "osm_path = data_dir / f\"sandbox_data_{BOX_SIZE}.gpkg\"\n", + "\n", + "if not osm_path.exists():\n", + " raise FileNotFoundError(f\"OSM data not found at {osm_path}. Run rasterization_report.py first.\")\n", + "\n", + "print(f\"Loading OSM data...\")\n", + "buildings = gpd.read_file(osm_path, layer=\"buildings\")\n", + "streets = gpd.read_file(osm_path, layer=\"streets\")\n", + "boundary = gpd.read_file(osm_path, layer=\"boundary\")\n", + "print(f\" Buildings: {len(buildings):,}\")\n", + "print(f\" Streets: {len(streets):,}\")\n", + "\n", + "print(f\"\\nRasterizing city...\")\n", + "t0 = time.time()\n", + "city = RasterCity(\n", + " boundary.geometry.iloc[0],\n", + " streets,\n", + " buildings,\n", + " block_side_length=BLOCK_SIDE_LENGTH,\n", + " resolve_overlaps=True,\n", + " verbose=False\n", + ")\n", + "print(f\" Rasterization: {time.time()-t0:.2f}s\")\n", + "print(f\" Buildings added: {len(city.buildings_gdf):,}\")\n", + "print(f\" Street blocks: {len(city.streets_gdf):,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23d7576b", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\nBuilding street graph...\")\n", + "t0 = time.time()\n", + "G = city.get_street_graph()\n", + "print(f\" Street graph: {time.time()-t0:.2f}s\")\n", + "print(f\" Nodes: {G.number_of_nodes():,}\")\n", + "print(f\" Edges: {G.number_of_edges():,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87083b3e", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\nBuilding hub network...\")\n", + "t0 = time.time()\n", + "city._build_hub_network(hub_size=HUB_SIZE)\n", + "print(f\" Hub network: {time.time()-t0:.2f}s\")\n", + "print(f\" Hubs: {len(city.hubs):,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44059656", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\nComputing gravity (callable mode)...\")\n", + "t0 = time.time()\n", + "city.compute_gravity(exponent=2.0, callable_only=True)\n", + "print(f\" Gravity computation: {time.time()-t0:.2f}s\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0936f95", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\nComputing shortest paths (callable mode)...\")\n", + "t0 = time.time()\n", + "city.compute_shortest_paths(callable_only=True)\n", + "print(f\" Shortest paths: {time.time()-t0:.2f}s\")" + ] + }, + { + "cell_type": "markdown", + "id": "3fa29dff", + "metadata": {}, + "source": [ + "## Benchmark 1: Repeated `get_shortest_path()` Calls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53adef37", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\" + \"=\"*60)\n", + "print(\"BENCHMARK 1: get_shortest_path() Performance\")\n", + "print(\"=\"*60)\n", + "\n", + "# Find valid street block pairs with manhattan distance < MAX_MANHATTAN_DIST\n", + "streets_list = list(city.streets_gdf.index)\n", + "valid_pairs = []\n", + "\n", + "print(f\"Finding {NUM_PATH_QUERIES} pairs with Manhattan distance < {MAX_MANHATTAN_DIST}...\")\n", + "rng = np.random.default_rng(42)\n", + "\n", + "while len(valid_pairs) < NUM_PATH_QUERIES:\n", + " i = rng.integers(0, len(streets_list))\n", + " j = rng.integers(0, len(streets_list))\n", + " if i == j:\n", + " continue\n", + " \n", + " start = streets_list[i]\n", + " end = streets_list[j]\n", + " manhattan_dist = abs(start[0] - end[0]) + abs(start[1] - end[1])\n", + " \n", + " if manhattan_dist < MAX_MANHATTAN_DIST:\n", + " valid_pairs.append((start, end))\n", + "\n", + "print(f\"Found {len(valid_pairs)} valid pairs\")\n", + "\n", + "# Benchmark\n", + "times = []\n", + "path_lengths = []\n", + "\n", + "print(f\"\\nBenchmarking {NUM_PATH_QUERIES} get_shortest_path() calls...\")\n", + "for start, end in valid_pairs:\n", + " t0 = time.time()\n", + " path = city.get_shortest_path(start, end)\n", + " elapsed = time.time() - t0\n", + " times.append(elapsed)\n", + " path_lengths.append(len(path) if path else 0)\n", + "\n", + "times = np.array(times) * 1000 # Convert to milliseconds\n", + "path_lengths = np.array(path_lengths)\n", + "\n", + "print(\"\\nResults:\")\n", + "print(f\" Total queries: {len(times)}\")\n", + "print(f\" Mean time: {times.mean():.2f} ms\")\n", + "print(f\" Median time: {np.median(times):.2f} ms\")\n", + "print(f\" Std time: {times.std():.2f} ms\")\n", + "print(f\" Min time: {times.min():.2f} ms\")\n", + "print(f\" Max time: {times.max():.2f} ms\")\n", + "print(f\" Mean path length: {path_lengths.mean():.1f} blocks\")\n", + "print(f\" Median path length: {np.median(path_lengths):.1f} blocks\")" + ] + }, + { + "cell_type": "markdown", + "id": "4cd8b5e6", + "metadata": {}, + "source": [ + "## Benchmark 2: Destination Diary Generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0ae1bfa", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\" + \"=\"*60)\n", + "print(\"BENCHMARK 2: Destination Diary Generation\")\n", + "print(\"=\"*60)\n", + "\n", + "population = Population(city)\n", + "population.generate_agents(\n", + " N=1,\n", + " seed=SEED,\n", + " name_count=1,\n", + " datetimes=\"2024-01-01 00:00-05:00\"\n", + ")\n", + "\n", + "agent = list(population.roster.values())[0]\n", + "end_time = pd.Timestamp(\"2024-01-01 00:00-05:00\") + pd.Timedelta(hours=SIMULATION_HOURS)\n", + "\n", + "print(f\"\\nGenerating {SIMULATION_HOURS}-hour destination diary...\")\n", + "print(f\" Start: {agent.last_ping['datetime']}\")\n", + "print(f\" End: {end_time}\")\n", + "\n", + "t0 = time.time()\n", + "agent.generate_dest_diary(\n", + " end_time=end_time,\n", + " epr_time_res=EPR_TIME_RES,\n", + " rho=RHO,\n", + " gamma=GAMMA,\n", + " seed=SEED\n", + ")\n", + "elapsed = time.time() - t0\n", + "\n", + "print(f\"\\nResults:\")\n", + "print(f\" Time: {elapsed:.2f}s\")\n", + "print(f\" Diary entries: {len(agent.destination_diary):,}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7957e9d4", + "metadata": {}, + "source": [ + "## Benchmark 3: Trajectory Generation from Diary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a87214", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\" + \"=\"*60)\n", + "print(\"BENCHMARK 3: Trajectory Generation from Diary\")\n", + "print(\"=\"*60)\n", + "\n", + "print(f\"\\nGenerating trajectory from destination diary...\")\n", + "print(f\" Diary entries: {len(agent.destination_diary):,}\")\n", + "print(f\" Time step (dt): {DT} minutes\")\n", + "\n", + "t0 = time.time()\n", + "agent.generate_trajectory(\n", + " destination_diary=agent.destination_diary,\n", + " dt=DT,\n", + " seed=SEED\n", + ")\n", + "elapsed = time.time() - t0\n", + "\n", + "trajectory = agent.trajectory\n", + "\n", + "print(f\"\\nResults:\")\n", + "print(f\" Time: {elapsed:.2f}s\")\n", + "print(f\" Trajectory points: {len(trajectory):,}\")\n", + "print(f\" Points per second: {len(trajectory)/elapsed:.1f}\")\n", + "print(f\" Time per point: {1000*elapsed/len(trajectory):.2f} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd497726", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/research/virtual_philadelphia/sample_step_benchmark.py b/examples/research/virtual_philadelphia/sample_step_benchmark.py new file mode 100644 index 0000000..a72b684 --- /dev/null +++ b/examples/research/virtual_philadelphia/sample_step_benchmark.py @@ -0,0 +1,228 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.17.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Benchmark: `_sample_step` Performance Analysis +# +# ## Goals +# 1. Benchmark repeated `get_shortest_path()` calls (callable mode) +# 2. Profile trajectory generation to identify bottlenecks +# 3. Test optimization strategies + +# %% +import time +import numpy as np +import pandas as pd +import geopandas as gpd +from pathlib import Path + +from nomad.city_gen import RasterCity +from nomad.traj_gen import Population + +# %% +# Configuration +BOX_SIZE = 'small' # 'small', 'medium', or 'large' +BLOCK_SIDE_LENGTH = 10.0 +HUB_SIZE = 100 +MAX_MANHATTAN_DIST = 30 +NUM_PATH_QUERIES = 100 +SIMULATION_HOURS = 24 +DT = 0.5 # minutes +EPR_TIME_RES = 15 +RHO = 0.4 +GAMMA = 0.3 +SEED = 42 + +# %% +print("="*60) +print(f"SETUP: {BOX_SIZE.upper()} BOX") +print("="*60) + +# Load OSM data and rasterize +data_dir = Path("sandbox") +osm_path = data_dir / f"sandbox_data_{BOX_SIZE}.gpkg" + +if not osm_path.exists(): + raise FileNotFoundError(f"OSM data not found at {osm_path}. Run rasterization_report.py first.") + +print(f"Loading OSM data...") +buildings = gpd.read_file(osm_path, layer="buildings") +streets = gpd.read_file(osm_path, layer="streets") +boundary = gpd.read_file(osm_path, layer="boundary") +print(f" Buildings: {len(buildings):,}") +print(f" Streets: {len(streets):,}") + +print(f"\nRasterizing city...") +t0 = time.time() +city = RasterCity( + boundary.geometry.iloc[0], + streets, + buildings, + block_side_length=BLOCK_SIDE_LENGTH, + resolve_overlaps=True, + verbose=False +) +print(f" Rasterization: {time.time()-t0:.2f}s") +print(f" Buildings added: {len(city.buildings_gdf):,}") +print(f" Street blocks: {len(city.streets_gdf):,}") + +# %% +print("\nBuilding street graph...") +t0 = time.time() +G = city.get_street_graph() +print(f" Street graph: {time.time()-t0:.2f}s") +print(f" Nodes: {G.number_of_nodes():,}") +print(f" Edges: {G.number_of_edges():,}") + +# %% +print("\nBuilding hub network...") +t0 = time.time() +city._build_hub_network(hub_size=HUB_SIZE) +print(f" Hub network: {time.time()-t0:.2f}s") +print(f" Hubs: {len(city.hubs):,}") + +# %% +print("\nComputing gravity (callable mode)...") +t0 = time.time() +city.compute_gravity(exponent=2.0, callable_only=True) +print(f" Gravity computation: {time.time()-t0:.2f}s") + +# %% +print("\nComputing shortest paths (callable mode)...") +t0 = time.time() +city.compute_shortest_paths(callable_only=True) +print(f" Shortest paths: {time.time()-t0:.2f}s") + +# %% [markdown] +# ## Benchmark 1: Repeated `get_shortest_path()` Calls + +# %% +print("\n" + "="*60) +print("BENCHMARK 1: get_shortest_path() Performance") +print("="*60) + +# Find valid street block pairs with manhattan distance < MAX_MANHATTAN_DIST +streets_list = list(city.streets_gdf.index) +valid_pairs = [] + +print(f"Finding {NUM_PATH_QUERIES} pairs with Manhattan distance < {MAX_MANHATTAN_DIST}...") +rng = np.random.default_rng(42) + +while len(valid_pairs) < NUM_PATH_QUERIES: + i = rng.integers(0, len(streets_list)) + j = rng.integers(0, len(streets_list)) + if i == j: + continue + + start = streets_list[i] + end = streets_list[j] + manhattan_dist = abs(start[0] - end[0]) + abs(start[1] - end[1]) + + if manhattan_dist < MAX_MANHATTAN_DIST: + valid_pairs.append((start, end)) + +print(f"Found {len(valid_pairs)} valid pairs") + +# Benchmark +times = [] +path_lengths = [] + +print(f"\nBenchmarking {NUM_PATH_QUERIES} get_shortest_path() calls...") +for start, end in valid_pairs: + t0 = time.time() + path = city.get_shortest_path(start, end) + elapsed = time.time() - t0 + times.append(elapsed) + path_lengths.append(len(path) if path else 0) + +times = np.array(times) * 1000 # Convert to milliseconds +path_lengths = np.array(path_lengths) + +print("\nResults:") +print(f" Total queries: {len(times)}") +print(f" Mean time: {times.mean():.2f} ms") +print(f" Median time: {np.median(times):.2f} ms") +print(f" Std time: {times.std():.2f} ms") +print(f" Min time: {times.min():.2f} ms") +print(f" Max time: {times.max():.2f} ms") +print(f" Mean path length: {path_lengths.mean():.1f} blocks") +print(f" Median path length: {np.median(path_lengths):.1f} blocks") + +# %% [markdown] +# ## Benchmark 2: Destination Diary Generation + +# %% +print("\n" + "="*60) +print("BENCHMARK 2: Destination Diary Generation") +print("="*60) + +population = Population(city) +population.generate_agents( + N=1, + seed=SEED, + name_count=1, + datetimes="2024-01-01 00:00-05:00" +) + +agent = list(population.roster.values())[0] +end_time = pd.Timestamp("2024-01-01 00:00-05:00") + pd.Timedelta(hours=SIMULATION_HOURS) + +print(f"\nGenerating {SIMULATION_HOURS}-hour destination diary...") +print(f" Start: {agent.last_ping['datetime']}") +print(f" End: {end_time}") + +t0 = time.time() +agent.generate_dest_diary( + end_time=end_time, + epr_time_res=EPR_TIME_RES, + rho=RHO, + gamma=GAMMA, + seed=SEED +) +elapsed = time.time() - t0 + +print(f"\nResults:") +print(f" Time: {elapsed:.2f}s") +print(f" Diary entries: {len(agent.destination_diary):,}") + +# %% [markdown] +# ## Benchmark 3: Trajectory Generation from Diary + +# %% +print("\n" + "="*60) +print("BENCHMARK 3: Trajectory Generation from Diary") +print("="*60) + +print(f"\nGenerating trajectory from destination diary...") +print(f" Diary entries: {len(agent.destination_diary):,}") +print(f" Time step (dt): {DT} minutes") + +t0 = time.time() +agent.generate_trajectory( + destination_diary=agent.destination_diary, + dt=DT, + seed=SEED +) +elapsed = time.time() - t0 + +trajectory = agent.trajectory + +print(f"\nResults:") +print(f" Time: {elapsed:.2f}s") +print(f" Trajectory points: {len(trajectory):,}") +print(f" Points per second: {len(trajectory)/elapsed:.1f}") +print(f" Time per point: {1000*elapsed/len(trajectory):.2f} ms") + +# %% \ No newline at end of file diff --git a/examples/research/virtual_philadelphia/sandbox/buildings_SMALL_BOX.parquet b/examples/research/virtual_philadelphia/sandbox/buildings_SMALL_BOX.parquet new file mode 100644 index 0000000..baeb2f7 Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/buildings_SMALL_BOX.parquet differ diff --git a/examples/research/virtual_philadelphia/sandbox/buildings_SMALL_BOX.pkl b/examples/research/virtual_philadelphia/sandbox/buildings_SMALL_BOX.pkl new file mode 100644 index 0000000..bfb1bb4 Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/buildings_SMALL_BOX.pkl differ diff --git a/examples/research/virtual_philadelphia/sandbox/config_medium.json b/examples/research/virtual_philadelphia/sandbox/config_medium.json new file mode 100644 index 0000000..689f585 --- /dev/null +++ b/examples/research/virtual_philadelphia/sandbox/config_medium.json @@ -0,0 +1,27 @@ +{ + "box_name": "medium", + "block_side_length": 10.0, + "hub_size": 100, + "N": 10, + "name_seed": 42, + "name_count": 2, + "epr_params": { + "datetime": "2024-01-01 00:00-05:00", + "end_time": "2024-01-08 00:00-05:00", + "epr_time_res": 15, + "rho": 0.4, + "gamma": 0.3, + "seed_base": 100 + }, + "traj_params": { + "dt": 0.5, + "seed_base": 200 + }, + "sampling_params": { + "beta_ping": 5, + "beta_durations": null, + "beta_start": null, + "ha": 0.7666666666666667, + "seed_base": 300 + } +} \ No newline at end of file diff --git a/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-01/part-0.parquet b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-01/part-0.parquet new file mode 100644 index 0000000..6f9525e Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-01/part-0.parquet differ diff --git a/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-02/part-0.parquet b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-02/part-0.parquet new file mode 100644 index 0000000..98c2e74 Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-02/part-0.parquet differ diff --git a/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-03/part-0.parquet b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-03/part-0.parquet new file mode 100644 index 0000000..a7b15ea Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-03/part-0.parquet differ diff --git a/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-04/part-0.parquet b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-04/part-0.parquet new file mode 100644 index 0000000..3965a04 Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-04/part-0.parquet differ diff --git a/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-05/part-0.parquet b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-05/part-0.parquet new file mode 100644 index 0000000..4064c0e Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-05/part-0.parquet differ diff --git a/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-06/part-0.parquet b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-06/part-0.parquet new file mode 100644 index 0000000..b7b8a78 Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-06/part-0.parquet differ diff --git a/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-07/part-0.parquet b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-07/part-0.parquet new file mode 100644 index 0000000..e40f704 Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/dest_diaries_medium/date=2024-01-07/part-0.parquet differ diff --git a/examples/research/virtual_philadelphia/sandbox/streets_SMALL_BOX.pkl b/examples/research/virtual_philadelphia/sandbox/streets_SMALL_BOX.pkl new file mode 100644 index 0000000..e34e53c Binary files /dev/null and b/examples/research/virtual_philadelphia/sandbox/streets_SMALL_BOX.pkl differ diff --git a/examples/research/virtual_philadelphia/synthetic_philadelphia.py b/examples/research/virtual_philadelphia/synthetic_philadelphia.py index 609dd54..cfd8e20 100644 --- a/examples/research/virtual_philadelphia/synthetic_philadelphia.py +++ b/examples/research/virtual_philadelphia/synthetic_philadelphia.py @@ -25,10 +25,13 @@ import geopandas as gpd import pandas as pd from shapely.geometry import box +import matplotlib.pyplot as plt +import contextily as cx import nomad.map_utils as nm from nomad.city_gen import RasterCity from nomad.traj_gen import Population +from nomad.io.base import from_file from tqdm import tqdm # %% [markdown] @@ -36,6 +39,7 @@ # %% LARGE_BOX = box(-75.1905, 39.9235, -75.1425, 39.9535) +MEDIUM_BOX = box(-75.1665, 39.9385, -75.1425, 39.9535) USE_FULL_CITY = False OUTPUT_DIR = Path("sandbox") @@ -45,26 +49,37 @@ BOX_NAME = "full" POLY = "Philadelphia, Pennsylvania, USA" else: - BOX_NAME = "large" - POLY = LARGE_BOX + BOX_NAME = "medium" + POLY = MEDIUM_BOX SANDBOX_GPKG = OUTPUT_DIR / f"sandbox_data_{BOX_NAME}.gpkg" REGENERATE_DATA = False config = { "box_name": BOX_NAME, - "block_side_length": 15.0, - "hub_size": 600, - "N": 100, + "block_side_length": 10.0, + "hub_size": 100, + "N": 10, "name_seed": 42, "name_count": 2, "epr_params": { "datetime": "2024-01-01 00:00-05:00", - "end_time": "2024-04-01 00:00-05:00", + "end_time": "2024-01-08 00:00-05:00", # 7 days "epr_time_res": 15, "rho": 0.4, "gamma": 0.3, "seed_base": 100 + }, + "traj_params": { + "dt": 0.5, + "seed_base": 200 + }, + "sampling_params": { + "beta_ping": 5, + "beta_durations": None, + "beta_start": None, + "ha": 11.5/15, + "seed_base": 300 } } @@ -179,7 +194,12 @@ grav_time = time.time() - t3 print(f"Gravity computation: {grav_time:>6.2f}s") -raster_time = gen_time + graph_time + hub_time + grav_time +t4 = time.time() +city.compute_shortest_paths(callable_only=True) +paths_time = time.time() - t4 +print(f"Shortest paths: {paths_time:>6.2f}s") + +raster_time = gen_time + graph_time + hub_time + grav_time + paths_time print("-"*50) print(f"Rasterization: {raster_time:>6.2f}s") print("="*50) @@ -283,15 +303,128 @@ def get_size_mb(obj): print(f"\nConfig saved to {config_path}") print(f"Destination diaries saved to {dest_diaries_path}") +# %% [markdown] +# ## Generate Full Trajectories from Destination Diaries + # %% -import geopandas as gpd -import contextily as cx -import matplotlib.pyplot as plt +print("\n" + "="*50) +print("TRAJECTORY GENERATION") +print("="*50) + +t1 = time.time() +failed_agents = [] +for i, agent in tqdm(enumerate(population.roster.values()), total=config["N"], desc="Generating trajectories"): + try: + agent.generate_trajectory( + dt=config["traj_params"]["dt"], + seed=config["traj_params"]["seed_base"] + i + ) + except ValueError as e: + failed_agents.append((agent.identifier, str(e))) + continue + +traj_gen_time = time.time() - t1 +print(f"Trajectory generation: {traj_gen_time:>6.2f}s") +if failed_agents: + print(f"Warning: {len(failed_agents)} agents failed trajectory generation") + +total_points = sum(len(agent.trajectory) for agent in population.roster.values() if agent.trajectory is not None) +print(f"Total trajectory points: {total_points:,}") +print(f"Points per second: {total_points/traj_gen_time:.1f}") +print("-"*50) +print(f"Total trajectory: {traj_gen_time:>6.2f}s") +print("="*50) + +# %% [markdown] +# ## Sample Sparse Trajectories + +# %% +print("\n" + "="*50) +print("SPARSE TRAJECTORY SAMPLING") +print("="*50) + +t1 = time.time() +for i, agent in tqdm(enumerate(population.roster.values()), total=config["N"], desc="Sampling trajectories"): + if agent.trajectory is None: + continue + agent.sample_trajectory( + beta_ping=config["sampling_params"]["beta_ping"], + beta_durations=config["sampling_params"]["beta_durations"], + beta_start=config["sampling_params"]["beta_start"], + ha=config["sampling_params"]["ha"], + seed=config["sampling_params"]["seed_base"] + i, + replace_sparse_traj=True + ) + +sampling_time = time.time() - t1 +print(f"Sparse sampling: {sampling_time:>6.2f}s") + +total_sparse_points = sum(len(agent.sparse_traj) for agent in population.roster.values() if agent.sparse_traj is not None) +print(f"Total sparse points: {total_sparse_points:,}") +print(f"Sparsity ratio: {total_sparse_points/total_points:.2%}") +print("-"*50) +print(f"Total sampling: {sampling_time:>6.2f}s") +print("="*50) + +# %% [markdown] +# ## Reproject to Mercator and Persist + +# %% +print("\n" + "="*50) +print("REPROJECTION AND PERSISTENCE") +print("="*50) + +# Build POI data for diary reprojection +cent = city.buildings_gdf['door_point'] if 'door_point' in city.buildings_gdf.columns else city.buildings_gdf.geometry.centroid +poi_data = pd.DataFrame({ + 'building_id': city.buildings_gdf['id'].values, + 'x': (city.buildings_gdf['door_cell_x'].astype(float) + 0.5).values if 'door_cell_x' in city.buildings_gdf.columns else cent.x.values, + 'y': (city.buildings_gdf['door_cell_y'].astype(float) + 0.5).values if 'door_cell_y' in city.buildings_gdf.columns else cent.y.values +}) + +print("Reprojecting sparse trajectories to Web Mercator...") +population.reproject_to_mercator(sparse_traj=True, full_traj=False, diaries=True, poi_data=poi_data) + +print("Saving sparse trajectories and diaries...") +population.save_pop( + sparse_path=OUTPUT_DIR / f"sparse_traj_{BOX_NAME}", + diaries_path=OUTPUT_DIR / f"diaries_{BOX_NAME}", + partition_cols=["date"], + fmt='parquet' +) +print("-"*50) +print("="*50) + +# %% [markdown] +# ## Visualize Sparse Trajectories + +# %% +print("\n" + "="*50) +print("VISUALIZATION") +print("="*50) + +# Read sparse trajectories +sparse_traj_df = from_file(OUTPUT_DIR / f"sparse_traj_{BOX_NAME}", format="parquet") +print(f"Loaded {len(sparse_traj_df):,} sparse trajectory points for {config['N']} agents") + +# Plot with contextily basemap +fig, ax = plt.subplots(figsize=(12, 10)) + +# Plot each agent with different color +for agent_id in sparse_traj_df['user_id'].unique(): + agent_traj = sparse_traj_df[sparse_traj_df['user_id'] == agent_id] + ax.scatter(agent_traj['x'], agent_traj['y'], s=1, alpha=0.5, label=agent_id) + +# Add basemap +cx.add_basemap(ax, crs="EPSG:3857", source=cx.providers.CartoDB.Positron) -box = gpd.GeoSeries(city.boundary_polygon) -fig, ax = plt.subplots() -box.plot(ax=ax, facecolor="None") -cx.add_basemap(ax, crs=box.crs) +ax.set_xlabel('X (Web Mercator)') +ax.set_ylabel('Y (Web Mercator)') +ax.set_title(f'Sparse Trajectories - {config["N"]} Agents, 7 Days') +ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', markerscale=10) +plt.tight_layout() +plt.savefig(OUTPUT_DIR / f"sparse_trajectories_{BOX_NAME}.png", dpi=150, bbox_inches='tight') +print(f"Saved plot to {OUTPUT_DIR / f'sparse_trajectories_{BOX_NAME}.png'}") plt.show() # %% diff --git a/nomad/city_gen.py b/nomad/city_gen.py index 52f926a..03646ad 100644 --- a/nomad/city_gen.py +++ b/nomad/city_gen.py @@ -188,6 +188,8 @@ def __init__(self, # Precomputed building-to-building gravity (optional, built on demand) self.grav = None self.door_dist = None + # Precomputed shortest paths (optional, built on demand via compute_shortest_paths) + self.shortest_paths = None def _derive_streets_from_blocks(self): """ @@ -509,48 +511,69 @@ def check_adjacent(self, geom1, geom2, graph=None): raise TypeError("Unsupported types: geom1 must be block tuple/list or shapely geometry; geom2 must be block tuple or shapely geometry") - def get_street_graph(self): - """Build the street graph needed for routing. - - Constructs a grid-adjacency graph where each street block is a node - and edges connect cardinally adjacent blocks. - """ - if hasattr(self, 'street_graph') and self.street_graph is not None: - return self.street_graph - - # --------------------------------------------------------------------- - # Build street block adjacency graph - # --------------------------------------------------------------------- - # Nodes - nodes_df = self.streets_gdf[['coord_x', 'coord_y']].copy() - - # Build edge list via vectorized neighbor matches + def _build_adjacency_graph(self, gdf): + """Build grid-adjacency graph from GeoDataFrame with coord_x, coord_y.""" + nodes_df = gdf[['coord_x', 'coord_y']].copy() edge_list = [] - for dx, dy in [(1, 0), (0, 1)]: # undirected; only positive directions + for dx, dy in [(1, 0), (0, 1)]: shifted = nodes_df.copy() shifted['nx'] = nodes_df['coord_x'] + dx shifted['ny'] = nodes_df['coord_y'] + dy - # Merge to find neighbor pairs merged = shifted.merge( nodes_df.rename(columns={'coord_x': 'nx', 'coord_y': 'ny'}), on=['nx', 'ny'], how='inner' ) - edge_list.append( merged[['coord_x', 'coord_y', 'nx', 'ny']].rename(columns={'coord_x': 'x', 'coord_y': 'y'}) ) - edges_df = pd.concat(edge_list, ignore_index=True) - - # Create graph G = nx.Graph() G.add_nodes_from([(int(x), int(y)) for x, y in nodes_df.values]) edge_list = [((int(r.x), int(r.y)), (int(r.nx), int(r.ny))) for r in edges_df.itertuples(index=False)] G.add_edges_from(edge_list, weight=1) - self.street_graph = G - + return G + + def get_street_graph(self): + """Build the street graph needed for routing.""" + if hasattr(self, 'street_graph') and self.street_graph is not None: + return self.street_graph + self.street_graph = self._build_adjacency_graph(self.streets_gdf) return self.street_graph + def compute_shortest_paths(self, callable_only=False): + """ + Compute shortest paths between street blocks. + + Parameters + ---------- + callable_only : bool, default False + If False, precompute all-pairs shortest paths (only for small cities) + If True, store on-demand callable (placeholder for hub-based routing) + + Notes + ----- + Callable mode currently uses nx.shortest_path() on demand—this is a + PLACEHOLDER. Production use with _sample_step requires hub-based routing + for scalability (billions of calls). Current implementation too slow for + large-scale trajectory generation. + + Stores result in self.shortest_paths as dict or callable + """ + G = self.get_street_graph() + + if callable_only: + # TODO: Replace with hub-based routing for production use + # Current on-demand nx.shortest_path is too slow for billion+ calls in _sample_step + def compute_path(start_coord, end_coord): + try: + return nx.shortest_path(G, start_coord, end_coord) + except nx.NetworkXNoPath: + return [] + self.shortest_paths = compute_path + else: + # Dense storage for small cities only + self.shortest_paths = dict(nx.all_pairs_shortest_path(G)) + # --------------------------------------------------------------------- # Shortcut ("highway") network for fast on-demand routing # --------------------------------------------------------------------- @@ -1502,13 +1525,13 @@ def get_shortest_path(self, start_coord: tuple, end_coord: tuple, plot: bool = F Raises ------ ValueError - If either coordinate is not a valid street block. + If either coordinate is not a valid street block or if shortest_paths + has not been computed. Notes ----- - - Uses the hub-based shortcut index for speed; hub-to-hub segments may - be obtained via a local NetworkX shortest path when a direct next-hop - entry is unavailable. + Requires compute_shortest_paths() to be called first. Uses precomputed + paths (dict) or on-demand callable depending on mode. """ if not (isinstance(start_coord, tuple) and isinstance(end_coord, tuple)): raise ValueError("Coordinates must be tuples of (x, y).") @@ -1519,78 +1542,18 @@ def get_shortest_path(self, start_coord: tuple, end_coord: tuple, plot: bool = F if not ((self.streets_gdf['coord_x'] == end_coord[0]) & (self.streets_gdf['coord_y'] == end_coord[1])).any(): raise ValueError(f"End coordinate {end_coord} must be a street block.") - # Ensure graph and shortcuts are built - if not hasattr(self, 'street_graph') or self.street_graph is None: - self.get_street_graph(lazy=True) - if not hasattr(self, '_shortcut_hubs') or self._shortcut_hubs is None: - try: - self._build_hub_network() - except Exception: - pass - - # Prefer shortcut network if available; fall back to NetworkX if not - if hasattr(self, '_shortcut_hubs') and self._shortcut_hubs: - # Map arbitrary node to its hub via precomputed next_to_hub - def path_to_hub(node): - route = [node] - # Prevent infinite loops; cap at city size - for _ in range(max(1, len(self.streets_gdf))): - if node == self._nearest_hub[node]: - break - node = self._next_to_hub[node] - route.append(node) - if node == self._nearest_hub[node]: - break - return route - - u = start_coord - v = end_coord - hu = self._nearest_hub[u] - hv = self._nearest_hub[v] - - # u -> hu - seg_u_hu = path_to_hub(u) - # hubs path hu -> hv using next_hop table - seg_hubs = [hu] - cur = hu - safe_cap = max(2, len(self._shortcut_hubs) * 4) - for _ in range(safe_cap): - if cur == hv: - break - nh = self._hub_next_hop.get(cur, {}).get(hv) - if nh is None: - # fallback to direct nx shortest path if hub-hub mapping missing - try: - nx_path = nx.shortest_path(self.get_street_graph(), cur, hv) - except nx.NetworkXNoPath: - return [] - seg_hubs = nx_path - cur = hv - break - seg_hubs.append(nh) - cur = nh - - # hv -> v: walk from v to hv using next_to_hub, then reverse to get hv->v - seg_v_to_hv = path_to_hub(v) - # ensure last element is hv - if seg_v_to_hv[-1] != hv: - # fallback to nx shortest if something is off - try: - nx_path = nx.shortest_path(self.get_street_graph(), hv, v) - except nx.NetworkXNoPath: - return [] - seg_hv_to_v = nx_path - else: - seg_hv_to_v = list(reversed(seg_v_to_hv)) - - # Concatenate segments, avoiding duplicate junctions - path = seg_u_hu[:-1] + seg_hubs + seg_hv_to_v[1:] + # Check if shortest_paths has been computed + if not hasattr(self, 'shortest_paths') or self.shortest_paths is None: + raise ValueError("Must call compute_shortest_paths() before using get_shortest_path().") + + # Get path from either dict or callable + if callable(self.shortest_paths): + path = self.shortest_paths(start_coord, end_coord) else: - # Fallback: compute on-demand with NetworkX - try: - path = nx.shortest_path(self.get_street_graph(), start_coord, end_coord) - except nx.NetworkXNoPath: + # Dict mode: lookup precomputed path + if start_coord not in self.shortest_paths: return [] + path = self.shortest_paths[start_coord].get(end_coord, []) if plot: if ax is None: @@ -1620,49 +1583,107 @@ def path_to_hub(node): return path # --------------------------------------------------------------------- - # Fast distance with small LRU-style cache (simple and effective) + # DEPRECATED: Cached path retrieval (kept for potential future use) # --------------------------------------------------------------------- - def get_distance_fast(self, start_coord: tuple, end_coord: tuple) -> float: + def get_paths_fast(self, start_coord: tuple, end_coord: tuple): """ - Return an approximate shortest-path distance in number of steps between two - street blocks using the shortcut network for speed. Results are cached. + DEPRECATED: Return shortest path with LRU-style caching. + + This method wraps get_shortest_path() with a simple cache. May be useful + for future optimizations but currently not sufficient for large-scale + trajectory generation needs. - Falls back to on-demand NetworkX shortest_path if shortcuts are unavailable. - Returns np.inf if no path exists. + Parameters + ---------- + start_coord : tuple[int, int] + Starting street block (x, y). + end_coord : tuple[int, int] + Ending street block (x, y). + + Returns + ------- + list[tuple[int, int]] + Sequence of street blocks from start to end, or empty list if no path. """ if not (isinstance(start_coord, tuple) and isinstance(end_coord, tuple)): raise ValueError("Coordinates must be tuples of (x, y).") # Initialize cache - if not hasattr(self, '_distance_cache'): - self._distance_cache = {} + if not hasattr(self, '_paths_cache'): + self._paths_cache = {} MAX_CACHE = 50000 key = (start_coord, end_coord) - if key in self._distance_cache: - return self._distance_cache[key] + if key in self._paths_cache: + return self._paths_cache[key] - # Compute via path length + # Compute path try: path = self.get_shortest_path(start_coord, end_coord) - if not path: - d = float('inf') - else: - d = float(max(0, len(path) - 1)) except Exception: - d = float('inf') + path = [] # Cache with simple capacity control (drop half when full) - try: - self._distance_cache[key] = d - if len(self._distance_cache) > MAX_CACHE: - # drop roughly half oldest by arbitrary order - for k in list(self._distance_cache.keys())[:MAX_CACHE // 2]: - self._distance_cache.pop(k, None) - except Exception: - pass + self._paths_cache[key] = path + if len(self._paths_cache) > MAX_CACHE: + # drop roughly half oldest by arbitrary order + for k in list(self._paths_cache.keys())[:MAX_CACHE // 2]: + self._paths_cache.pop(k, None) - return d + return path + + def _add_to_adjacency_graph(self, G, coord): + """Add node and cardinal edges to existing graph.""" + G.add_node(coord) + x, y = coord + for dx, dy in [(1, 0), (-1, 0), (0, 1), (0, -1)]: + neighbor = (x + dx, y + dy) + if neighbor in G: + G.add_edge(coord, neighbor, weight=1) + + def _connect_building_door_to_streets(self, building_id, max_depth=5): + """Connect building door to street network by converting blocks along shortest path.""" + building_row = self.buildings_gdf.loc[building_id] + door_x = int(building_row['door_cell_x']) + door_y = int(building_row['door_cell_y']) + door_coord = (door_x, door_y) + + # Get blocks within Manhattan distance + candidates = [(x, y) for x in range(door_x - max_depth, door_x + max_depth + 1) + for y in range(door_y - max_depth, door_y + max_depth + 1) + if abs(x - door_x) + abs(y - door_y) <= max_depth] + + non_building = self.blocks_gdf[self.blocks_gdf['kind'] != 'building'] + nearby = non_building[non_building.index.isin(candidates)] + block_graph = self._build_adjacency_graph(nearby) + + paths = nx.single_source_shortest_path(block_graph, source=door_coord, cutoff=max_depth) + + for target, path in paths.items(): + if target != door_coord and target in self.streets_gdf.index: + new_coords = [c for c in path if c not in self.streets_gdf.index] + + for coord in path: + self.blocks_gdf.loc[coord, 'kind'] = 'street' + self.blocks_gdf.loc[coord, 'building_type'] = 'street' + + if new_coords: + new_gdf = gpd.GeoDataFrame([ + {'coord_x': x, 'coord_y': y, 'geometry': box(x, y, x+1, y+1), + 'building_type': 'street', 'building_id': None} + for x, y in new_coords + ], geometry='geometry', crs=self.streets_gdf.crs) + new_gdf.set_index(['coord_x', 'coord_y'], inplace=True, drop=False) + new_gdf.index.names = [None, None] + self.streets_gdf = pd.concat([self.streets_gdf, new_gdf]) + + if hasattr(self, 'street_graph') and self.street_graph is not None: + for coord in new_coords: + self._add_to_adjacency_graph(self.street_graph, coord) + + return True + + return False # ============================================================================= @@ -2027,6 +2048,20 @@ def __init__(self, boundary_polygon, streets_gdf, buildings_gdf, block_side_leng self.resolve_overlaps = resolve_overlaps self._rasterize(verbose=verbose) + + # Connect buildings with non-street doors + non_street_doors = [] + for building_id in self.buildings_gdf['id']: + building_row = self.buildings_gdf.loc[building_id] + door_coord = (int(building_row['door_cell_x']), int(building_row['door_cell_y'])) + if door_coord not in self.streets_gdf.index: + non_street_doors.append(building_id) + + if non_street_doors: + if verbose: + print(f" Connecting {len(non_street_doors)} buildings with non-street doors...") + for building_id in non_street_doors: + self._connect_building_door_to_streets(building_id, max_depth=5) def _rasterize(self, verbose=True): # Assigning block types could arguably be its own method, but keeping it here for now diff --git a/nomad/tests/test_city_gen.py b/nomad/tests/test_city_gen.py index 620ddfd..f5250b7 100644 --- a/nomad/tests/test_city_gen.py +++ b/nomad/tests/test_city_gen.py @@ -81,6 +81,7 @@ def test_from_geodataframes_roundtrip_nonsquare(tmp_path): def test_shortest_path(): rcg = RandomCityGenerator(width=10, height=10, street_spacing=5, seed=1) city = rcg.generate_city() + city.compute_shortest_paths(callable_only=False) # Get two street coordinates streets_temp = city.streets_gdf.reset_index(drop=True) @@ -581,3 +582,115 @@ def test_to_file_reverse_affine_transformation(tmp_path): f"Area scaling incorrect: {wm_area} vs expected {expected_area}" +def test_connect_building_door_success(): + """Test successful connection of building door to streets within max_depth""" + city = City(dimensions=(10, 10)) + + # Add buildings to create an isolated area + # Building 1: door at (5,5), footprint at (5,6) + city.add_building('home', door=(5, 5), blocks=[(5, 6)]) + building_id = city.buildings_gdf.iloc[0]['id'] + + # Add park buildings to isolate the corridor + city.add_building('park', door=(3, 4), blocks=[(4, 4), (4, 5), (4, 6)]) + city.add_building('park', door=(7, 4), blocks=[(6, 4), (6, 5), (6, 6)]) + + # Manually create corridor by setting blocks to None (non-street, non-building) + # Corridor: (5,3), (5,4), (5,5) leading to street at (5,2) + corridor_blocks = [(5, 3), (5, 4), (5, 5)] + for coord in corridor_blocks: + city.blocks_gdf.loc[coord, 'kind'] = None + city.blocks_gdf.loc[coord, 'building_type'] = None + city.streets_gdf = city.streets_gdf.drop(coord, errors='ignore') + + # Call connection function + result = city._connect_building_door_to_streets(building_id, max_depth=5) + + # Assert connection successful + assert result == True + + # Assert corridor blocks are now streets + assert city.blocks_gdf.loc[(5, 3), 'kind'] == 'street' + assert city.blocks_gdf.loc[(5, 4), 'kind'] == 'street' + assert city.blocks_gdf.loc[(5, 5), 'kind'] == 'street' + + # Assert blocks exist in streets_gdf + assert (5, 3) in city.streets_gdf.index + assert (5, 4) in city.streets_gdf.index + assert (5, 5) in city.streets_gdf.index + + +def test_connect_building_door_path_too_long(): + """Test failure when path exceeds max_depth""" + city = City(dimensions=(15, 15)) + + # Add a building with door at (7, 7) + city.add_building('home', door=(7, 7), blocks=[(7, 8)]) + building_id = city.buildings_gdf.iloc[0]['id'] + + # Create a long corridor by adding buildings on both sides + # Force a path length of 7: (7,7) -> (7,6) -> (7,5) -> (7,4) -> (7,3) -> (7,2) -> (7,1) -> (7,0) street + for x in [6, 8]: + for y in range(1, 8): + city.add_building('park', door=(x-1 if x==6 else x+1, y), blocks=[(x, y)]) + + # Set corridor blocks to None + for y in range(1, 8): + city.blocks_gdf.loc[(7, y), 'kind'] = None + city.blocks_gdf.loc[(7, y), 'building_type'] = None + city.streets_gdf = city.streets_gdf.drop((7, y), errors='ignore') + + # Path length is 7, cutoff is 5 - should fail + result = city._connect_building_door_to_streets(building_id, max_depth=5) + + # Assert connection failed + assert result == False + + +def test_connect_building_door_isolated(): + """Test failure when building door is completely isolated""" + city = City(dimensions=(20, 20)) + city.add_building('home', door=(10, 10), blocks=[(10, 11)]) + building_id = city.buildings_gdf.iloc[0]['id'] + + for x in range(3, 18): + for y in range(3, 18): + if not (x == 10 and y == 10): + city.blocks_gdf.loc[(x, y), 'kind'] = 'building' + city.blocks_gdf.loc[(x, y), 'building_type'] = 'park' + + city.streets_gdf = city._derive_streets_from_blocks() + + result = city._connect_building_door_to_streets(building_id, max_depth=5) + assert result == False + + +def test_connect_building_door_already_connected(): + """Test that function returns True if door is already a street""" + city = City(dimensions=(10, 10)) + + # Add a building normally (door should be on street by default) + city.add_building('home', door=(5, 5), blocks=[(5, 6)]) + building_id = city.buildings_gdf.iloc[0]['id'] + + # Call connection function - should succeed immediately + result = city._connect_building_door_to_streets(building_id, max_depth=5) + + # Assert connection successful (door already connected) + assert result == True + + +def test_rastercity_connects_isolated_doors(): + """Test that RasterCity automatically connects buildings with non-street doors""" + buildings, streets, boundary = _load_fixture() + + # Create city with rasterization that will produce some non-street doors + city = RasterCity(boundary, streets, buildings, block_side_length=15.0, verbose=False) + + # All building doors must be streets after initialization + for building_id in city.buildings_gdf['id']: + building_row = city.buildings_gdf.loc[building_id] + door_coord = (int(building_row['door_cell_x']), int(building_row['door_cell_y'])) + assert door_coord in city.streets_gdf.index, f"Building {building_id} door {door_coord} not in streets" + + diff --git a/nomad/tests/test_traj_gen.py b/nomad/tests/test_traj_gen.py index ce4715e..efd166e 100644 --- a/nomad/tests/test_traj_gen.py +++ b/nomad/tests/test_traj_gen.py @@ -31,6 +31,7 @@ def garden_city(): city_path = data_dir / "garden-city.gpkg" # Garden-city fixture (legacy) uses 'type' instead of 'building_type' city = cg.City.from_geopackage(city_path, poi_cols={'building_type':'type'}) + city.compute_shortest_paths(callable_only=False) return city diff --git a/nomad/traj_gen.py b/nomad/traj_gen.py index c7e88f0..a6f3848 100644 --- a/nomad/traj_gen.py +++ b/nomad/traj_gen.py @@ -288,6 +288,10 @@ def __init__(self, self.diary = diary if diary is not None else pd.DataFrame( columns=['datetime', 'timestamp', 'duration', 'location', 'identifier']) self.sparse_traj = None + + # Geometry cache for _sample_step optimization + self._cached_geom = None + self._cached_geom_id = None def reset_trajectory(self, trajectory = True, sparse = True, last_ping = True, diary = True): @@ -363,6 +367,12 @@ def _sample_step(self, start_point, dest_building_id, dt, rng): The building ID if the step is a stay, or `None` if the step is a move. """ city = self.city + + # Check if start_point is in cached geometry (diagnostic only, no effect yet) + if self._cached_geom is not None: + in_cache = self._cached_geom.contains(Point(start_point)) + if in_cache: + pass # Could reuse cached geometry here # Resolve destination building geometry and attributes (strict schema) brow = city.buildings_gdf[city.buildings_gdf['id'] == dest_building_id] @@ -401,6 +411,10 @@ def _sample_step(self, start_point, dest_building_id, dt, rng): coord = rng.normal(loc=start_point_arr, scale=sigma*np.sqrt(dt), size=2) if dest_geom.contains(Point(coord)): break + + # Cache building geometry for potential reuse + self._cached_geom = dest_geom + self._cached_geom_id = dest_building_id return coord, location # Otherwise, move along streets toward destination door cell @@ -451,7 +465,14 @@ def _sample_step(self, start_point, dest_building_id, dt, rng): if bound_poly.contains(Point(coord)): break - + + # Cache path geometry for potential reuse + # For path movements, create ID from start and destination + if start_info['building_id'] is not None: + self._cached_geom_id = (start_info['building_id'], dest_building_id) + else: + self._cached_geom_id = (start_node, dest_building_id) + self._cached_geom = bound_poly return coord, location @@ -472,6 +493,9 @@ def _traj_from_dest_diary(self, dt, seed=0): self.diary = self.diary.iloc[:-1] tick_secs = int(60*dt) + + # Track previous building for cache optimization + previous_building_id = None entry_update = [] for i in range(destination_diary.shape[0]): @@ -508,6 +532,9 @@ def _traj_from_dest_diary(self, dt, seed=0): 'user_id': self.identifier} else: current_entry['duration'] += 1*dt # add one tick to the duration + + # Update previous_building_id after completing this destination segment + previous_building_id = building_id if self.trajectory is None: self.trajectory = pd.DataFrame(trajectory_update) @@ -1275,6 +1302,8 @@ def save_pop(self, """ if full_path: full_df = pd.concat([agent.trajectory for agent in self.roster.values()], ignore_index=True) + if partition_cols and 'date' in partition_cols and 'date' not in full_df.columns: + full_df['date'] = pd.to_datetime(full_df['timestamp']).dt.date.astype(str) full_df = from_df(full_df, traj_cols=traj_cols, mixed_timezone_behavior=mixed_timezone_behavior) to_file(full_df, path=full_path, @@ -1285,6 +1314,8 @@ def save_pop(self, if sparse_path: sparse_df = pd.concat([agent.sparse_traj for agent in self.roster.values()], ignore_index=True) + if partition_cols and 'date' in partition_cols and 'date' not in sparse_df.columns: + sparse_df['date'] = pd.to_datetime(sparse_df['timestamp']).dt.date.astype(str) sparse_df = from_df(sparse_df, traj_cols=traj_cols, mixed_timezone_behavior=mixed_timezone_behavior) to_file(sparse_df, path=sparse_path, @@ -1296,6 +1327,8 @@ def save_pop(self, if diaries_path: diaries_df = pd.concat([agent.diary for agent in self.roster.values()], ignore_index=True) + if partition_cols and 'date' in partition_cols and 'date' not in diaries_df.columns: + diaries_df['date'] = pd.to_datetime(diaries_df['timestamp']).dt.date.astype(str) diaries_df = from_df(diaries_df, traj_cols=traj_cols, mixed_timezone_behavior=mixed_timezone_behavior) to_file(diaries_df, path=diaries_path,