From 6a5fa6d868e39c15ed9ee24d4f07fd67ebd5f1d3 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Thu, 12 Mar 2026 13:09:48 -0400 Subject: [PATCH 1/6] update cpdb version --- products/cpdb/recipe.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/products/cpdb/recipe.yml b/products/cpdb/recipe.yml index 67b3a21c23..5c1b3d33a3 100644 --- a/products/cpdb/recipe.yml +++ b/products/cpdb/recipe.yml @@ -1,6 +1,6 @@ name: CPDB product: db-cpdb -version: 25adopt +version: 26prelim inputs: missing_versions_strategy: find_latest datasets: From e31b629d079e2295b85cde5631a69eca0d653231 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Wed, 15 Apr 2026 15:12:15 -0400 Subject: [PATCH 2/6] fix FISA source data a double-quote character at the beginning of description column for the RDAMPSIP project causes the rest of the file to be inserted into the downstream description for that project, breaking CPDB csvs --- ingest_templates/fisa_capitalcommitments.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest_templates/fisa_capitalcommitments.yml b/ingest_templates/fisa_capitalcommitments.yml index a1bc0b8af1..712a5195ca 100644 --- a/ingest_templates/fisa_capitalcommitments.yml +++ b/ingest_templates/fisa_capitalcommitments.yml @@ -10,7 +10,7 @@ ingestion: source: type: s3 bucket: edm-recipes - key: inbox/fisa_capitalcommitments/AICP_OREQ_CAPPLN_PJCP_20260218121103.asc + key: inbox/fisa_capitalcommitments/AICP_OREQ_CAPPLN_PJCP_20260218121103_fixed.asc file_format: type: csv encoding: ISO-8859-1 From 8fe7bdad06b3862a1675e801ab56fe6bb8eb7751 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 24 Apr 2026 11:33:21 -0400 Subject: [PATCH 3/6] remove manual mapping for capital project 801SANDBCT --- products/cpdb/data/sprints.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/products/cpdb/data/sprints.sql b/products/cpdb/data/sprints.sql index 22dcaec590..9241fa21ce 100644 --- a/products/cpdb/data/sprints.sql +++ 
b/products/cpdb/data/sprints.sql @@ -4592,7 +4592,6 @@ INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SAND1106' INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SAND1108', '1009620100', NULL, 'AD Sprint', '0106000020E6100000010000000103000000010000004D00000015D4E4114C7E52C0249925ADB35E4440FD5E77114C7E52C0353FDFB0B35E4440A114F8DF4B7E52C00673D0EDB45E4440B9129F954B7E52C0BB85501FB65E44409F1F84334B7E52C000CFE140B75E44409F6918BB4A7E52C0619B424EB85E4440AC2E7B624A7E52C043494DE9B95E44403E3E932D4A7E52C04BA95492BB5E44403257441D4A7E52C068453342BD5E4440CA1080FA497E52C0F4E92C62C05E4440A2F707F3497E52C05F26070EC15E4440E715E6EB497E52C0DF4F1AC5C15E444054BEE3EB497E52C02C5E70C5C15E44406E618408487E52C0A8293AEDBF5E4440CBEB3D0F487E52C06C8A9140BF5E4440A9D96019487E52C0B2935657BE5E444034818524487E52C047FE7F74BD5E444016452D31487E52C0381E9E89BC5E444094C48B3D487E52C02DE2E5B5BB5E4440D3A37044487E52C0E8F67A46BB5E44407EB06B51487E52C0DF717A81BA5E444036CF21C5487E52C019227A1CB55E44405DF51306497E52C0B68157B8B25E4440BD4C7F80497E52C01199ECE6AE5E444065DA66AD497E52C087DFDAA8AD5E4440326428D0497E52C063DE03BEAC5E4440D411B0004A7E52C022C86E84AB5E4440E6F6FD3A4A7E52C0BF20871EAA5E4440B946C3754B7E52C0B0CA659DA35E444011D7EB3D4C7E52C0FE10781BA05E44401A3089524D7E52C0D8F6A6CB9B5E444069C1B75A4F7E52C093376FDA945E4440615148F6507E52C0586BB81E905E444061C0B150527E52C0BC48E2848C5E44409D11F676527E52C0F2E83F2F8C5E44401F50B190527E52C0B33AC9EA8B5E4440A09948AE527E52C0A61354878B5E4440A544CFC0527E52C0B68747388B5E4440C0778BCE527E52C0008C5DE48A5E4440012A29DB527E52C03981A77C8A5E44405D068EDF527E52C09D9AE9268A5E444063EA3CDF527E52C0B51660C5895E44400FE5FBE0527E52C02B30096C895E444036526FE7527E52C0ACD921FF885E4440D7F982F9527E52C0C138C178885E44407D325CFD527E52C04FB19E63885E4440E50DC527537E52C05546B3B7875E4440E6E2BF4F537E52C08FF98144875E444065A03BB6537E52C00D853D23865E4440AFC558E85C7E52C09BDC802E6C5E4440B3F1CA72627E52C08858B48A5C5E444056D12DBC627E52C0EA2DBDFC5B5E4440FD89EFCE627E52C01A542AE55B5E4440
385892FD627E52C0EE478EAA5B5E44406C105047637E52C06E3191685B5E44408BF400AC637E52C0C24DC0335B5E444089C7A21F647E52C0DBC68D1F5B5E444049372886647E52C042BA86305B5E4440EC81CAD2647E52C03500B2535B5E444043CEE016657E52C08514957F5B5E44400F276BC7687E52C0265687915E5E4440706E667A8D7E52C05263901A7D5E444079ED8296717E52C095C35DAFC95E44405C1DE6B1697E52C0301D6008C35E4440D08E2BA46C7E52C02A4E5CF1BA5E444061959539597E52C021740294AA5E4440FB0599BF5A7E52C0B5662B65A65E444065F4BC335A7E52C0B30A49EFA55E444090D48CBA517E52C01143FACA9E5E44405B233C80517E52C0BD60166B9F5E4440F9EDFEDC507E52C02A1E80E19E5E44404C1DF47E507E52C0B055B1E39F5E444058F7FB4E4F7E52C017B77CE39E5E4440FE4444AE4E7E52C04757D464A15E444084750C774E7E52C0A0438A5CA25E4440DCFF46804D7E52C006CF95AFA65E444015D4E4114C7E52C0249925ADB35E4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SAND6703', '1013730050', NULL, 'AD Sprint', '0106000020E61000000100000001030000000100000005000000C28A25F72D7C52C0E02BA79EBA624440D50101284D7C52C06269D09467624440691F3B6C657C52C04D29BA6C7D6244404943893B467C52C08C1CC676D0624440C28A25F72D7C52C0E02BA79EBA624440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SAND6706', '1013730050', NULL, 'AD Sprint', '0106000020E61000000100000001030000000100000005000000C28A25F72D7C52C0E02BA79EBA624440D50101284D7C52C06269D09467624440691F3B6C657C52C04D29BA6C7D6244404943893B467C52C08C1CC676D0624440C28A25F72D7C52C0E02BA79EBA624440'); -INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801SANDBCT', '3052570045', NULL, 'AD Sprint', 
'0106000020E61000000100000001030000000100000010000000E8933364567E52C0909BEA62E953444071469547577E52C0E6246F02EC53444091890BB9577E52C05D25BB46ED53444015D08C23587E52C05D0BF77DEE53444099140C81587E52C008D32D8FEF534440CF8182E9587E52C07CD272C0F0534440EC891A46597E52C0B2FB05CFF153444078CB9FB0597E52C03BA04D06F3534440196AB40D5A7E52C077EA4B16F45344406317226D5A7E52C0C82A282DF55344408E65E0D05A7E52C08632A450F65344403CB3852D5B7E52C0CA52585FF753444057C9CAF55B7E52C0039B95A8F953444028690732547E52C0AA9562B5FF534440689F157A517E52C0A1083137ED534440E8933364567E52C0909BEA62E9534440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('841TFD503M24', NULL, NULL, 'DCP Sprint', '0106000020E6100000010000000103000000010000003700000098EB5B8E077F52C02A428843A45E444063EE5C0A227F52C08CAD3C93BA5E4440BBA5640C227F52C0B01AF494BA5E444063888E3F3E7F52C0AD291883D25E444053AFF4413E7F52C0A4932385D25E444082115D08427F52C08D4754C0D55E44402B5758444F7F52C09D1A2DE6E05E4440B7A4F9564F7F52C0A12068F6E05E4440EACCA431527F52C033996888E35E44408DB99C95567F52C0C1E15A2FE75E4440452FB497567F52C02BFC1931E75E4440829CC9818B7F52C084DD4C8A135F4440623D33FA8B7F52C03D861408145F44409DED2A608C7F52C0E45D2DA7145F4440AF86C5AF8C7F52C05E367A61155F444052E4F3E58C7F52C0F641D22F165F444021FDA0008D7F52C02E82470A175F4440D55EC6FE8C7F52C080CA74E8175F4440804476E08C7F52C0BD57D0C1185F44408AE3DAA68C7F52C076CFFF8D195F44403DF62A548C7F52C00B6D2A451A5F44408FF493EB8B7F52C00D3446E01A5F444069D11A718B7F52C03C305D591B5F4440E06F74E98A7F52C0101AC8AB1B5F44408255D7598A7F52C0AD1E5CD41B5F44401B61C8C7897F52C0DB088AD11B5F4440B47EE438897F52C089986DA31B5F4440DD6EA9B2887F52C0BF71CC4B1B5F4440862697C9537F52C046616FF3EE5E444083C0835B4F7F52C04C7A1444EB5E4440282C6D454F7F52C0986EF430EB5E44408CBE0E694C7F52C087C56B9DE85E4440E3A63D343F7F52C04E119B7DDD5E444058A6CF2F3F7F52C04B25D879DD5E4440758563683B7F52C002DDC83DDA5E44404C466C371F7F52C015648051C25E444018684CBC047F52C0DCE98702AC5E44400E7C977CEA7E52C0C3AC62E0955E4440DE506604EA7E52C0C45D2262955E44407743B59EE97E52C04564A
3C2945E444043C36C4FE97E52C075DE0608945E444001CB9819E97E52C0D9AC7839935E44408BE74AFFE87E52C0DCE4E85E925E444062DE8501E97E52C014B9BD80915E444037BC3320E97E52C0B8D780A7905E444038AD265AE97E52C00A6C8BDB8F5E4440B49624ADE97E52C01FFDB3248F5E4440FEFFFC15EA7E52C0F551018A8E5E4440ED72A890EA7E52C06D5165118E5E44404D1F7018EB7E52C0338682BF8D5E4440493B1CA8EB7E52C0D1857E978D5E44408F58273AEC7E52C091FBE29A8D5E444045B5F4C8EC7E52C0B6878EC98D5E44406472074FED7E52C0C507B6218E5E444098EB5B8E077F52C02A428843A45E4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('806BVIEW', '2028610001', NULL, 'AD Sprint', '0104000020E6100000010000000101000000420EAE76817A52C0D6AAC71C426C4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('841TFD503X21', NULL, NULL, 'DCP Sprint', '0106000020E610000001000000010300000001000000260000007A865E0E657A52C05935ED1D2C6A4440947FD2D0567A52C0F1A9F2875F6A4440F800288B567A52C0E903504B606A4440064EBD2D567A52C045D546F6606A4440A96D29BC557A52C0382D4582616A4440FEBEC93A557A52C0CDCAE9E9616A4440A00897AE547A52C07C0A3929626A4440F98EF41C547A52C08915C43D626A4440D4127B8B537A52C093D1C026626A4440B4C1C1FF527A52C0E6A411E5616A4440AA35277F527A52C017C33C7B616A4440CEA19C0E527A52C0A25753ED606A4440213375B2517A52C0AD82C940606A444028843B6E517A52C0D4B1407C5F6A4440E5C58E44517A52C0B66446A75E6A4440E7F40837517A52C01EDF09CA5D6A444020182F46517A52C0CBA30BED5C6A44408B246C71517A52C00AD0C9185C6A44406619A4AF5F7A52C00E4575AC286A44403E47F0B05F7A52C06061CDA7286A4440594E46436B7A52C036502064FF694440BA3082896B7A52C0C6DC3BA1FE69444075A46BE76B7A52C0C9EEE6F6FD694440ECC266596C7A52C0233BAD6BFD6944404C3612DB6C7A52C00571E804FD694440265172676D7A52C04E978BC6FC6944403D1422F96D7A52C05532FCB2FC6944406740888A6E7A52C04CB0FACAFC6944406C6A0E166F7A52C029049B0DFD6944400EF357966F7A52C0D7B74D78FD69444087C67606707A52C07A1CF906FE694440A1DC1B62707A52C0E2A021B4FE694440289CC1A5707A52C086C11F79FF694440EE80CECE707A52C0587F614E006A44403BAFAEDB707A52C06BD9B42B016A44404679E3CB707A52C0E56C9808026A4440C23D08
A0707A52C0E4218FDC026A44407A865E0E657A52C05935ED1D2C6A4440'); From da5dc80fe8740127ed2aed97d56cf2542adac333 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 24 Apr 2026 15:21:14 -0400 Subject: [PATCH 4/6] manually assign a project to a bin --- products/cpdb/seeds/dcp_id_bin_map.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/products/cpdb/seeds/dcp_id_bin_map.csv b/products/cpdb/seeds/dcp_id_bin_map.csv index a1ec62f91c..f74627a181 100644 --- a/products/cpdb/seeds/dcp_id_bin_map.csv +++ b/products/cpdb/seeds/dcp_id_bin_map.csv @@ -102,3 +102,4 @@ maprojid,bin 126PV176-REP,2116695 850PV176-WHC,2116709 846P-312DMRR,1075616 +801SANDBCT,3398756 From d131b46c55bea96423db629329c4f68fed784336 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Thu, 30 Apr 2026 13:43:57 -0400 Subject: [PATCH 5/6] manually assign a project to a bin --- products/cpdb/data/sprints.sql | 1 - products/cpdb/seeds/dcp_id_bin_map.csv | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/products/cpdb/data/sprints.sql b/products/cpdb/data/sprints.sql index 9241fa21ce..8e33059719 100644 --- a/products/cpdb/data/sprints.sql +++ b/products/cpdb/data/sprints.sql @@ -4570,7 +4570,6 @@ INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801RICONNECT INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801ROCKBLVD', NULL, NULL, 'DCP Sprint', '0104000020E61000000100000001010000003DB5856F387052C01D3A803AFC524440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('056PO7990LOC', '3024650100', NULL, 'AD Sprint', '0104000020E610000001000000010100000000000099D77C52C0C8EB567C675A4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('126PV256DIMM', '1010340022', NULL, 'AD Sprint', '0106000020E610000001000000010300000001000000050000004B4D5B65577F52C072765B1B1661444026391C535A7F52C0EEB477140E614440A8B341D8607F52C01D236F91136144405A6B83EA5D7F52C03DBE57981B6144404B4D5B65577F52C072765B1B16614440'); -INSERT INTO sprints 
(maprojid, bbl, bin, geomsource, geom) VALUES ('801SANDRBCT', '3052570045', NULL, 'AD Sprint', '0106000020E61000000100000001030000000100000010000000E8933364567E52C0909BEA62E953444071469547577E52C0E6246F02EC53444091890BB9577E52C05D25BB46ED53444015D08C23587E52C05D0BF77DEE53444099140C81587E52C008D32D8FEF534440CF8182E9587E52C07CD272C0F0534440EC891A46597E52C0B2FB05CFF153444078CB9FB0597E52C03BA04D06F3534440196AB40D5A7E52C077EA4B16F45344406317226D5A7E52C0C82A282DF55344408E65E0D05A7E52C08632A450F65344403CB3852D5B7E52C0CA52585FF753444057C9CAF55B7E52C0039B95A8F953444028690732547E52C0AA9562B5FF534440689F157A517E52C0A1083137ED534440E8933364567E52C0909BEA62E9534440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('801WATERSIDE', '1009910035', NULL, 'AD Sprint', '0106000020E610000001000000010300000001000000200000001B7DC498527E52C0A6E1E8C0D45D444012D82C41537E52C046D3556AD45D4440BDA7CFFD537E52C044B2B0AED95D44404B2DD3B8547E52C0629970E7DE5D44408A599E99547E52C0A5A6EAB7DE5D4440F0799249547E52C02A97E636DE5D444070239D28547E52C05DF2BC04DE5D4440F109C833537E52C0C2595EC9DC5D4440211785A6527E52C056D2010FDC5D4440D45F9185527E52C0E81128EBDB5D44409F6AE65F527E52C05FA953C7DB5D4440D26BE1EE517E52C0868CA07FDB5D444047C0E8CD517E52C0EA414971DB5D44409F9A6F74517E52C063D99054DB5D44404B79ED3B517E52C0EFBC8E29DB5D44404FDFC705517E52C007969FECDA5D44401ECCC1DD507E52C08ACCB6AFDA5D4440DBA638C6507E52C02D55887DDA5D4440B04B7192507E52C09F9737EED95D4440FD2F6163507E52C05BDDE35ED95D4440B1852547507E52C0475792CFD85D44402D4B0B39507E52C043291839D85D4440E2662740507E52C0F86C6482D75D4440231CEF4B507E52C02A2F2737D75D444064A76F5C507E52C09C5E47FAD65D444021DB76A5507E52C0B9853D60D65D4440C2F9B9C1507E52C0E1301C2ED65D44409F4BB6E2507E52C0475D2303D65D444015EE180D517E52C0DAC9FDD0D55D44404BBA9F45517E52C0EC0ADB9ED55D44402C520FE1517E52C04E407733D55D44401B7DC498527E52C0A6E1E8C0D45D4440'); INSERT INTO sprints (maprojid, bbl, bin, geomsource, geom) VALUES ('81948201040', NULL, NULL, 'DCP Sprint', NULL); INSERT INTO sprints 
(maprojid, bbl, bin, geomsource, geom) VALUES ('81948201102', NULL, NULL, 'AD Sprint', NULL); diff --git a/products/cpdb/seeds/dcp_id_bin_map.csv b/products/cpdb/seeds/dcp_id_bin_map.csv index f74627a181..aaf534a69a 100644 --- a/products/cpdb/seeds/dcp_id_bin_map.csv +++ b/products/cpdb/seeds/dcp_id_bin_map.csv @@ -103,3 +103,4 @@ maprojid,bin 850PV176-WHC,2116709 846P-312DMRR,1075616 801SANDBCT,3398756 +801SANDRBCT,3398756 From 1c4cf5dbdf82ad762db70024ebb86ac78f80eb0f Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 24 Apr 2026 15:21:36 -0400 Subject: [PATCH 6/6] add notebook to compare build files in S3 --- .../marimo/lifecycle/build_qa/s3_compare.py | 409 ++++++++++++++++++ 1 file changed, 409 insertions(+) create mode 100644 notebooks/marimo/lifecycle/build_qa/s3_compare.py diff --git a/notebooks/marimo/lifecycle/build_qa/s3_compare.py b/notebooks/marimo/lifecycle/build_qa/s3_compare.py new file mode 100644 index 0000000000..4fe8042197 --- /dev/null +++ b/notebooks/marimo/lifecycle/build_qa/s3_compare.py @@ -0,0 +1,409 @@ +import marimo + +__generated_with = "0.23.1" +app = marimo.App(width="full") + +with app.setup: + import marimo as mo + import pandas as pd + + from dcpy.utils import s3 + + +@app.cell(hide_code=True) +def _(): + mo.md(r""" + # S3 Build QA + """) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(r""" + ## Directory Comparison + + Explore and compare files across two S3 paths (e.g. a published draft vs. a new build). + """) + return + + +@app.cell +def _(): + bucket_input = mo.ui.text(value="edm-publishing", label="Bucket") + path_a_input = mo.ui.text( + value="db-cpdb/draft/26prelim/2/", + label="Path A — baseline (e.g. 
published draft)", + full_width=True, + ) + path_b_input = mo.ui.text( + value="db-cpdb/build/dm-cpdb-26prelim/", + label="Path B — new build", + full_width=True, + ) + mo.vstack( + [ + bucket_input, + mo.hstack([path_a_input, path_b_input], gap=2), + ], + align="center", + ) + return bucket_input, path_a_input, path_b_input + + +@app.cell +def _(bucket_input, path_a_input, path_b_input): + def _fetch(prefix: str) -> pd.DataFrame: + objs = s3.list_objects(bucket_input.value, prefix) + if not objs: + return pd.DataFrame(columns=["filename", "size_bytes", "last_modified"]) + return pd.DataFrame( + [ + { + "filename": o["Key"].removeprefix(prefix), + "size_bytes": o["Size"], + "last_modified": o["LastModified"], + } + for o in objs + if not o["Key"].endswith("/") + ] + ) + + with mo.status.spinner(title="Fetching objects from S3…"): + df_a = _fetch(path_a_input.value) + df_b = _fetch(path_b_input.value) + return df_a, df_b + + +@app.cell +def _(df_a, df_b, path_a_input, path_b_input): + _merged = pd.merge( + df_a[["filename", "size_bytes"]].rename(columns={"size_bytes": "size_a"}), + df_b[["filename", "size_bytes"]].rename(columns={"size_bytes": "size_b"}), + on="filename", + how="outer", + indicator=True, + ) + _merged["in_a"] = _merged["_merge"].isin(["left_only", "both"]) + _merged["in_b"] = _merged["_merge"].isin(["right_only", "both"]) + _merged["size_diff_bytes"] = _merged["size_b"] - _merged["size_a"] + _merged["size_diff_pct"] = ( + _merged["size_diff_bytes"] / _merged["size_a"] * 100 + ).round(1) + comparison = _merged.drop(columns=["_merge"]) + + _n_only_a = comparison[~comparison["in_b"]].shape[0] + _n_only_b = comparison[~comparison["in_a"]].shape[0] + _n_both = comparison[comparison["in_a"] & comparison["in_b"]].shape[0] + + mo.md( + f""" + ## Summary + + | | Count | + |---|---| + | Files in **A** (`{path_a_input.value}`) | {len(df_a)} | + | Files in **B** (`{path_b_input.value}`) | {len(df_b)} | + | Only in A | {_n_only_a} | + | Only in B | {_n_only_b} | 
+ | In both | {_n_both} | + """ + ) + return (comparison,) + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Only in A — missing from new build + """) + return + + +@app.cell +def _(comparison): + _df = comparison[~comparison["in_b"]][["filename", "size_a"]].reset_index(drop=True) + mo.ui.table(_df, selection=None) if len(_df) else mo.md( + "_None — all files from A are present in B._" + ) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Only in B — new files + """) + return + + +@app.cell +def _(comparison): + _df = comparison[~comparison["in_a"]][["filename", "size_b"]].reset_index(drop=True) + mo.ui.table(_df, selection=None) if len(_df) else mo.md( + "_None — no new files in B._" + ) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Files in both — size comparison + """) + return + + +@app.cell +def _(comparison): + _df = ( + comparison[comparison["in_a"] & comparison["in_b"]][ + ["filename", "size_a", "size_b", "size_diff_bytes", "size_diff_pct"] + ] + .sort_values("size_diff_pct", key=abs, ascending=False, na_position="last") + .reset_index(drop=True) + ) + mo.ui.table(_df, selection=None) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Table Comparison + """) + return + + +@app.cell(hide_code=True) +def _(comparison): + _files_in_both = sorted( + comparison[comparison["in_a"] & comparison["in_b"]]["filename"].tolist() + ) + table_selector = mo.ui.dropdown( + options=_files_in_both, + label="Select a file to compare", + searchable=True, + ) + mo.vstack([table_selector], align="center") + return (table_selector,) + + +@app.cell(hide_code=True) +def _(bucket_input, path_a_input, path_b_input, table_selector): + from io import BytesIO + + def _load_file(prefix: str, filename: str) -> pd.DataFrame: + key = prefix + filename + body = s3.get_file(bucket_input.value, key) + data = BytesIO(body.read()) + ext = filename.rsplit(".", 1)[-1].lower() + if ext == "csv": + return pd.read_csv(data) + elif ext == 
"parquet": + return pd.read_parquet(data) + elif ext == "json": + return pd.read_json(data) + else: + raise ValueError(f"Unsupported file extension: .{ext}") + + if table_selector.value: + with mo.status.spinner(title=f"Loading {table_selector.value}…"): + tbl_a = _load_file(path_a_input.value, table_selector.value) + tbl_b = _load_file(path_b_input.value, table_selector.value) + else: + tbl_a = tbl_b = None + return tbl_a, tbl_b + + +@app.cell(hide_code=True) +def _(table_selector, tbl_a, tbl_b): + mo.stop( + tbl_a is None or tbl_b is None, + mo.md("_Select a file above to load and compare._"), + ) + _cols_only_a = sorted(set(tbl_a.columns) - set(tbl_b.columns)) + _cols_only_b = sorted(set(tbl_b.columns) - set(tbl_a.columns)) + _col_status = ( + "Column sets match." + if not _cols_only_a and not _cols_only_b + else f"Only in A: `{'`, `'.join(_cols_only_a) or '—'}` · Only in B: `{'`, `'.join(_cols_only_b) or '—'}`" + ) + mo.vstack( + [ + mo.md(f""" + | | A | B | + |---|---|---| + | Rows | {len(tbl_a):,} | {len(tbl_b):,} | + | Columns | {len(tbl_a.columns)} | {len(tbl_b.columns)} | + + **Columns:** {_col_status} + """), + mo.ui.tabs( + { + f"A — {table_selector.value}": mo.ui.table(tbl_a, selection=None), + f"B — {table_selector.value}": mo.ui.table(tbl_b, selection=None), + } + ), + ] + ) + return + + +@app.cell(hide_code=True) +def _(): + _df = mo.sql( + f""" + select * from + """ + ) + return + + +@app.cell(hide_code=True) +def _(): + mo.md(""" + ## Geospatial Comparison + """) + return + + +@app.cell +def _(): + import openlayers as ol + from openlayers.basemaps import CartoBasemapLayer, Carto + + return Carto, CartoBasemapLayer, ol + + +@app.cell(hide_code=True) +def _(comparison): + _zip_files = sorted( + comparison.loc[ + comparison["in_a"] + & comparison["in_b"] + & comparison["filename"].str.endswith(".zip"), + "filename", + ].tolist() + ) + mo.stop(not _zip_files, mo.md("_No shared `.zip` files found in both paths._")) + geo_selector = mo.ui.dropdown( + 
options=_zip_files, + label="Select a shapefile (.zip) to compare", + searchable=True, + ) + mo.vstack([geo_selector], align="center") + return (geo_selector,) + + +@app.cell(hide_code=True) +def _(bucket_input, geo_selector, path_a_input, path_b_input): + import geopandas as gpd + import tempfile + import os + + mo.stop(geo_selector.value is None, mo.md("_Select a shapefile above._")) + + def _load_shapefile(prefix: str, filename: str) -> gpd.GeoDataFrame: + body = s3.get_file(bucket_input.value, prefix + filename) + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as _tmp: + _tmp.write(body.read()) + _tmp_path = _tmp.name + try: + _gdf = gpd.read_file(f"zip://{_tmp_path}") + finally: + os.unlink(_tmp_path) + return _gdf + + with mo.status.spinner(title=f"Loading {geo_selector.value} from both paths…"): + geo_gdf_a = ( + _load_shapefile(path_a_input.value, geo_selector.value) + .to_crs(4326) + .reset_index(drop=True) + ) + geo_gdf_b = ( + _load_shapefile(path_b_input.value, geo_selector.value) + .to_crs(4326) + .reset_index(drop=True) + ) + return geo_gdf_a, geo_gdf_b + + +@app.cell(hide_code=True) +def _(geo_gdf_a, geo_selector, path_a_input): + _geom_col = geo_gdf_a.geometry.name + _attr_cols = [c for c in geo_gdf_a.columns if c != _geom_col] + geo_table_a = mo.ui.table( + geo_gdf_a[_attr_cols], + selection="multi", + page_size=20, + label=f"A — {path_a_input.value}{geo_selector.value} ({len(geo_gdf_a):,} features) · check rows to highlight on map", + ) + geo_table_a + return (geo_table_a,) + + +@app.cell(hide_code=True) +def _(Carto, CartoBasemapLayer, geo_gdf_a, geo_table_a, ol): + _sel_idx = geo_table_a.value.index.tolist() + _plot_a = geo_gdf_a.loc[_sel_idx] if _sel_idx else geo_gdf_a + _style_a = ol.FlatStyle( + fill_color="rgba(70, 130, 180, 0.35)", + stroke_color="#4682b4", + stroke_width=2, + circle_radius=6, + circle_fill_color="rgba(70, 130, 180, 0.7)", + circle_stroke_color="#4682b4", + circle_stroke_width=1.5, + ) + _layer_a = 
_plot_a.ol.to_layer(style=_style_a, fit_bounds=False) + _map_a = ol.MapWidget( + view=ol.View(center=[0, 0], zoom=1), + layers=[CartoBasemapLayer(Carto.LIGHT_ALL), _layer_a], + ) + _map_a.fit_bounds(tuple(_plot_a.geometry.total_bounds)) + _map_a.add_tooltip() + _map_a + return + + +@app.cell(hide_code=True) +def _(geo_gdf_b, geo_selector, path_b_input): + _geom_col = geo_gdf_b.geometry.name + _attr_cols = [c for c in geo_gdf_b.columns if c != _geom_col] + geo_table_b = mo.ui.table( + geo_gdf_b[_attr_cols], + selection="multi", + page_size=20, + label=f"B — {path_b_input.value}{geo_selector.value} ({len(geo_gdf_b):,} features) · check rows to highlight on map", + ) + geo_table_b + return (geo_table_b,) + + +@app.cell(hide_code=True) +def _(Carto, CartoBasemapLayer, geo_gdf_b, geo_table_b, ol): + _sel_idx = geo_table_b.value.index.tolist() + _plot_b = geo_gdf_b.loc[_sel_idx] if _sel_idx else geo_gdf_b + _style_b = ol.FlatStyle( + fill_color="rgba(230, 57, 70, 0.35)", + stroke_color="#e63946", + stroke_width=2, + circle_radius=6, + circle_fill_color="rgba(230, 57, 70, 0.7)", + circle_stroke_color="#e63946", + circle_stroke_width=1.5, + ) + _layer_b = _plot_b.ol.to_layer(style=_style_b, fit_bounds=False) + _map_b = ol.MapWidget( + view=ol.View(center=[0, 0], zoom=1), + layers=[CartoBasemapLayer(Carto.LIGHT_ALL), _layer_b], + ) + _map_b.fit_bounds(tuple(_plot_b.geometry.total_bounds)) + _map_b.add_tooltip() + _map_b + return + + +if __name__ == "__main__": + app.run()