From 81ec21829cfa9c90c1790832f031d15d2c5a581f Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Sun, 22 Mar 2026 22:11:54 +0000 Subject: [PATCH 01/19] WIP: dvc --- .../plans/dvc_pipeline_setup_872f50fc.plan.md | 280 ++++++++++ .dvc/.gitignore | 3 + .dvc/config | 0 .dvcignore | 3 + .gitignore | 4 + README.markdown | 57 +- data/EOL/.gitignore | 3 +- data/OpenTree/.gitignore | 3 +- data/Wiki/.gitignore | 3 +- data/Wiki/README.markdown | 14 +- data/Wiki/wd_JSON/.gitignore | 1 + data/Wiki/wp_SQL/.gitignore | 1 + data/Wiki/wp_pagecounts/.gitignore | 1 + data/output_files/.gitignore | 1 + dvc.yaml | 167 ++++++ oz_tree_build/README.markdown | 118 ++-- oz_tree_build/utilities/filter_common.py | 29 + oz_tree_build/utilities/filter_eol.py | 79 +++ oz_tree_build/utilities/filter_pageviews.py | 81 +++ oz_tree_build/utilities/filter_wikidata.py | 223 ++++++++ .../utilities/filter_wikipedia_sql.py | 80 +++ .../utilities/generate_filtered_files.py | 508 +++--------------- params.yaml | 6 + pyproject.toml | 5 + 24 files changed, 1157 insertions(+), 513 deletions(-) create mode 100644 .cursor/plans/dvc_pipeline_setup_872f50fc.plan.md create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 dvc.yaml create mode 100644 oz_tree_build/utilities/filter_common.py create mode 100644 oz_tree_build/utilities/filter_eol.py create mode 100644 oz_tree_build/utilities/filter_pageviews.py create mode 100644 oz_tree_build/utilities/filter_wikidata.py create mode 100644 oz_tree_build/utilities/filter_wikipedia_sql.py create mode 100644 params.yaml diff --git a/.cursor/plans/dvc_pipeline_setup_872f50fc.plan.md b/.cursor/plans/dvc_pipeline_setup_872f50fc.plan.md new file mode 100644 index 0000000..14f49f0 --- /dev/null +++ b/.cursor/plans/dvc_pipeline_setup_872f50fc.plan.md @@ -0,0 +1,280 @@ +--- +name: DVC Pipeline Setup +overview: Set up DVC to define a cached, repeatable data pipeline for the OneZoom tree-build project, replacing manual 
download/filter workarounds with a declarative `dvc.yaml` pipeline backed by shared remote cache storage. +todos: + - id: install-dvc + content: Add dvc to pyproject.toml dependencies, run dvc init to create .dvc/ directory + status: completed + - id: params-yaml + content: Create params.yaml with oz_tree, ot_version, ot_taxonomy_version, ot_taxonomy_extra, build_version, exclude_from_popularity + status: completed + - id: split-filters + content: "Split generate_filtered_files.py into 4 separate filter modules: filter_eol.py, filter_wikidata.py, filter_wikipedia_sql.py, filter_pageviews.py. Remove generate_and_cache_filtered_file. Each module gets its own CLI entry point writing to a specified output path." + status: completed + - id: register-scripts + content: Register the 4 new filter scripts as console_scripts in pyproject.toml + status: completed + - id: dvc-yaml + content: Create dvc.yaml with all 11 pipeline stages (tree build + 4 parallel-capable filter stages + tables + JS) using DVC templating from params.yaml + status: completed + - id: gitignore-update + content: "Update .gitignore and data/ .gitignore files: add data/filtered/, data/js_output/, ensure .dvc files are not ignored" + status: completed + - id: update-docs + content: Update README.markdown, oz_tree_build/README.markdown, and data/Wiki/README.markdown with DVC workflow + status: completed +isProject: false +--- + +# DVC Pipeline for OneZoom Tree-Build + +## Current State + +The build process is a sequence of manual shell commands documented in `[oz_tree_build/README.markdown](oz_tree_build/README.markdown)`. 
Key pain points: + +- Massive source files (Wikidata ~100GB, enwiki SQL ~1GB, pageviews multi-GB) must be downloaded by every contributor +- `generate_filtered_files` takes **5-7 hours** to reduce these to usable subsets +- Pre-processed pageviews are distributed as GitHub releases as a workaround (no longer needed with DVC) +- No caching or reproducibility guarantees + +## Target Workflow + +```bash +# First person: downloads data, runs pipeline, pushes cache +dvc repro +dvc push + +# Everyone else: pulls only the cached outputs they need +dvc repro --pull --allow-missing +``` + +If nothing has changed, `dvc repro --pull --allow-missing` pulls pre-built outputs from shared storage -- no multi-GB downloads, no 5-7 hour filtering runs. + +## Pipeline DAG + +The monolithic `filter_files` stage is split into 4 independent filter stages. EOL and wikidata filters can run in parallel (both depend on taxonomy). SQL and pageview filters can run in parallel (both depend on filtered wikidata output). 
+ +```mermaid +graph TD + OT_tre[labelled_supertree.tre.dvc] --> preprocess_opentree + OT_tgz[ott_taxonomy.tgz.dvc] --> unpack_taxonomy + preprocess_opentree --> prepare_open_trees + unpack_taxonomy --> add_ott_numbers + bespoke[BespokeTree in git] --> add_ott_numbers + add_ott_numbers --> prepare_open_trees + OT_req[OT_required in git] --> prepare_open_trees + prepare_open_trees --> build_tree + + unpack_taxonomy --> filter_eol + EOL[provider_ids.csv.gz.dvc] --> filter_eol + + unpack_taxonomy --> filter_wikidata + WD[latest-all.json.bz2.dvc] --> filter_wikidata + + filter_wikidata --> filter_sql + WP_SQL[enwiki-page.sql.gz.dvc] --> filter_sql + + filter_wikidata --> filter_pageviews + WP_PV[wp_pagecounts.dvc] --> filter_pageviews + + build_tree --> create_tables + filter_eol --> create_tables + filter_wikidata --> create_tables + filter_sql --> create_tables + filter_pageviews --> create_tables + unpack_taxonomy --> create_tables + SupTax[SupplementaryTaxonomy in git] --> create_tables + create_tables --> make_js +``` + +## Key Design Decisions + +### 1. Parameters in `params.yaml` (replaces env vars) + +Currently `OT_VERSION`, `OT_TAXONOMY_VERSION`, `OT_TAXONOMY_EXTRA`, and `OZ_TREE` are shell environment variables. These become DVC parameters so that changing a version automatically invalidates the right stages. + +```yaml +# params.yaml +oz_tree: AllLife +ot_version: "15.1" +ot_taxonomy_version: "3.7" +ot_taxonomy_extra: "draft2" +build_version: 28017344 # deterministic version for CSV_base_table_creator (replaces time-based default) +exclude_from_popularity: + - Archosauria_ott335588 + - Dinosauria_ott90215 +``` + +The `build_version` param is important: `CSV_base_table_creator` defaults to `int(time.time()/60)`, which would make outputs non-deterministic. A fixed param ensures DVC caching works correctly. + +### 2. Source data tracked with `dvc add` + +Large downloaded files are tracked via `dvc add`, producing `.dvc` files committed to git. 
The raw data itself lives only in DVC cache/remote, never in git. Files to track: + +- `data/OpenTree/labelled_supertree_simplified_ottnames.tre` +- `data/OpenTree/ott${ot_taxonomy_version}.tgz` +- `data/Wiki/wd_JSON/latest-all.json.bz2` +- `data/Wiki/wp_SQL/enwiki-latest-page.sql.gz` +- `data/Wiki/wp_pagecounts/` (directory -- raw pageview files; pre-processed GitHub releases are no longer needed since DVC caches the filtered outputs) +- `data/EOL/provider_ids.csv.gz` + +With `--allow-missing`, DVC can skip stages whose inputs haven't changed even when the raw files aren't present locally. + +### 3. Split filters into separate modules and remove mtime caching + +The monolithic `[generate_filtered_files.py](oz_tree_build/utilities/generate_filtered_files.py)` will be refactored: + +**Remove `generate_and_cache_filtered_file`** -- this function implements mtime-based caching (comparing filtered file timestamps to source file timestamps). DVC's run cache completely supersedes this. Each filter script simply writes its output; DVC decides whether to run it. + +**Split into 4 separate filter modules**, each with its own CLI entry point: + +- `oz_tree_build/utilities/filter_eol.py` -- filters EOL provider IDs CSV + - Inputs: EOL CSV (gz), taxonomy.tsv + - Output: filtered EOL CSV + - Reads taxonomy to build `source_ids` (NCBI, IF, WoRMS, IRMNG, GBIF sets), then keeps only matching EOL rows +- `oz_tree_build/utilities/filter_wikidata.py` -- filters the massive wikidata JSON dump (~100GB compressed) + - Inputs: wikidata JSON (bz2), taxonomy.tsv + - Outputs: filtered wikidata JSON, **plus a sidecar `wikidata_titles.txt`** (one Wikipedia page title per line) + - The sidecar file replaces the in-memory `context.wikidata_ids` handoff. It's produced by running the equivalent of `read_wikidata_dump()` on the filtered output and writing the titles to a text file. This is the key that enables SQL and pageview filters to run independently. 
+- `oz_tree_build/utilities/filter_wikipedia_sql.py` -- filters enwiki SQL page dump + - Inputs: enwiki SQL (gz), `wikidata_titles.txt` + - Output: filtered SQL file + - Reads the titles file to build the filter set (replaces `context.wikidata_ids`) +- `oz_tree_build/utilities/filter_pageviews.py` -- filters Wikipedia pageview files + - Inputs: one or more pageview files (bz2), `wikidata_titles.txt` + - Output: filtered pageview files in output directory + - Reads the titles file to build the filter set + +**Shared code** stays in `generate_filtered_files.py` (or a new common module): `read_taxonomy_file`, helper imports, and the orchestrating `generate_all_filtered_files` function (simplified to call the individual filter modules directly, useful for non-DVC usage and clade-specific test filtering). + +**New console scripts** registered in `pyproject.toml`: + +``` +filter_eol = "oz_tree_build.utilities.filter_eol:main" +filter_wikidata = "oz_tree_build.utilities.filter_wikidata:main" +filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" +filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" +``` + +The parallelism benefit: `filter_eol` and `filter_wikidata` share no outputs, so they can be run concurrently (`dvc repro` itself executes stages one at a time, so this means launching e.g. `dvc repro filter_eol` and `dvc repro filter_wikidata` in separate terminals). Once `filter_wikidata` finishes and produces `wikidata_titles.txt`, `filter_sql` and `filter_pageviews` can also be run concurrently in the same way. + +### 4. JS output stays in this repo + +`make_js_treefiles` currently defaults to writing into `../OZtree/static/FinalOutputs/data/`. In the DVC pipeline, use `--outdir data/js_output/` to keep outputs within this repo for DVC tracking (a sibling of `data/output_files/` rather than a subdirectory of it, so that the outputs of `create_tables` and `make_js` do not overlap). Users copy to OZtree manually afterward. + +### 5. DVC remote (shared cache) + +A DVC remote must be configured for shared caching. 
This is a one-line config per backend: + +```bash +dvc remote add -d myremote s3://my-bucket/dvc-cache # S3 +dvc remote add -d myremote gs://my-bucket/dvc-cache # GCS +dvc remote add -d myremote ssh://server:/path/to/cache # SSH +dvc remote add -d myremote /mnt/shared/dvc-cache # local/NFS +``` + +The choice of backend can be made later; the pipeline design is independent of it. + +## Pipeline Stages (`dvc.yaml`) + +The `dvc.yaml` at the project root will define these stages (using DVC templating with `vars` from `params.yaml`): + +**preprocess_opentree** -- perl to strip mrca labels and normalize underscores + +- deps: `data/OpenTree/labelled_supertree_simplified_ottnames.tre` +- params: `ot_version` +- outs: `data/OpenTree/draftversion${ot_version}.tre` + +**unpack_taxonomy** -- extract taxonomy.tsv from tarball + +- deps: `data/OpenTree/ott${ot_taxonomy_version}.tgz` +- params: `ot_taxonomy_version` +- outs: `data/OpenTree/ott${ot_taxonomy_version}/` (directory) + +**add_ott_numbers** -- call OpenTree API to annotate bespoke trees with OTT IDs + +- deps: `data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/` +- params: `oz_tree`, `ot_taxonomy_version`, `ot_taxonomy_extra` +- outs: `data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/` +- Note: calls external API; cached unless inputs change. Use `dvc repro -f add_ott_numbers` to force refresh. 
+ +**prepare_open_trees** -- copy supplementary .nwk files and extract OpenTree subtrees + +- deps: `draftversion${ot_version}.tre`, `include_OTT.../`, `OT_required/` +- outs: `data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/` + +**build_tree** -- assemble the full newick tree + +- deps: `include_OTT.../`, `OpenTree_all/` +- outs: `data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy` + +**filter_eol** -- filter EOL provider IDs to relevant sources + +- deps: `data/EOL/provider_ids.csv.gz`, `data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv` +- outs: `data/filtered/OneZoom_provider_ids.csv` +- Parallelizable with `filter_wikidata` + +**filter_wikidata** -- filter massive wikidata JSON to taxon/vernacular items (THE most expensive step, hours) + +- deps: `data/Wiki/wd_JSON/latest-all.json.bz2`, `data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv` +- outs: `data/filtered/OneZoom_latest-all.json`, `data/filtered/wikidata_titles.txt` +- Parallelizable with `filter_eol` + +**filter_sql** -- filter enwiki SQL page dump to matching titles + +- deps: `data/Wiki/wp_SQL/enwiki-latest-page.sql.gz`, `data/filtered/wikidata_titles.txt` +- outs: `data/filtered/OneZoom_enwiki-latest-page.sql` +- Parallelizable with `filter_pageviews` + +**filter_pageviews** -- filter and aggregate Wikipedia pageview counts + +- deps: `data/Wiki/wp_pagecounts/`, `data/filtered/wikidata_titles.txt` +- outs: `data/filtered/pageviews/` (directory of filtered pageview files) +- Parallelizable with `filter_sql` + +**create_tables** -- map taxa, calculate popularity, produce DB-ready CSVs and ordered trees + +- deps: full tree, taxonomy, all `data/filtered/` outputs, `SupplementaryTaxonomy.tsv` +- params: `build_version`, `exclude_from_popularity` +- outs: `data/output_files/` + +**make_js** -- convert ordered trees to JS viewer files + +- deps: `data/output_files/` +- outs: `data/js_output/` + +## Files to Create/Modify + +- **Create** `params.yaml` -- pipeline parameters +- 
**Create** `dvc.yaml` -- pipeline definition (11 stages) +- **Create** `oz_tree_build/utilities/filter_eol.py` -- standalone EOL filter with CLI +- **Create** `oz_tree_build/utilities/filter_wikidata.py` -- standalone wikidata filter with CLI +- **Create** `oz_tree_build/utilities/filter_wikipedia_sql.py` -- standalone SQL filter with CLI +- **Create** `oz_tree_build/utilities/filter_pageviews.py` -- standalone pageviews filter with CLI +- **Modify** `[oz_tree_build/utilities/generate_filtered_files.py](oz_tree_build/utilities/generate_filtered_files.py)` -- remove `generate_and_cache_filtered_file`, simplify to orchestrator that calls the new modules (retains clade-filtering support for tests) +- **Modify** `[pyproject.toml](pyproject.toml)` -- add `dvc` to dependencies, register 4 new console scripts +- **Modify** `[.gitignore](.gitignore)` -- add `/data/filtered/`, DVC internals are handled by `dvc init` +- **Update** `[README.markdown](README.markdown)` -- new DVC-based workflow instructions +- **Update** `[oz_tree_build/README.markdown](oz_tree_build/README.markdown)` -- reference DVC pipeline +- **Update** `[data/Wiki/README.markdown](data/Wiki/README.markdown)` -- remove pre-processed pageview GitHub release instructions (DVC cache replaces this entirely) + +After creating these files, the first pipeline run involves: + +```bash +pip install -e . +dvc init +# download source files, then: +dvc add data/OpenTree/labelled_supertree_simplified_ottnames.tre +dvc add data/OpenTree/ott3.7.tgz +dvc add data/Wiki/wd_JSON/latest-all.json.bz2 +dvc add data/Wiki/wp_SQL/enwiki-latest-page.sql.gz +dvc add data/Wiki/wp_pagecounts/ +dvc add data/EOL/provider_ids.csv.gz +dvc repro +dvc push +git add . && git commit -m "Add DVC pipeline" +``` + +But this should not be run as part of this plan, the user will run it manually after the pipeline is set up. 
+ +Also note that you should not try to run the individual large stages as part of this plan, since the input files are massive and the processing takes a long time, so the user will schedule it for a convenient time. diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore index 2cd3896..cb15a51 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,7 @@ cython_debug/ # sqlite storage.sqlite + +# DVC pipeline outputs (tracked by DVC, not git) +/data/filtered/ +/data/js_output/ diff --git a/README.markdown b/README.markdown index e47eef5..d160461 100644 --- a/README.markdown +++ b/README.markdown @@ -41,22 +41,55 @@ you will need a valid Azure Image cropping key in your appconfig.ini. ## Building the latest tree from OpenTree -### Setup +This project uses [DVC](https://dvc.org/) for a cached, repeatable data pipeline. The build parameters (OpenTree version, taxonomy version, etc.) are defined in `params.yaml` and the pipeline stages are declared in `dvc.yaml`. -We assume that you want to build a OneZoom tree based on the most recent online OpenTree version. -You can check the most recent version of both the synthetic tree (`synth_id`) and the taxonomy (`taxonomy_version`) via the -[API](https://github.com/OpenTreeOfLife/germinator/wiki/Open-Tree-of-Life-Web-APIs) e.g. by running `curl -X POST https://api.opentreeoflife.org/v3/tree_of_life/about`. Later in the build, we use specific environment variables set to these version numbers. 
Assuming you are in a bash shell or similar, you can set them as follows: +### Quick start (using cached outputs) +If someone has already run the pipeline and pushed the results to the DVC remote, you can reproduce the build without downloading any of the massive source files: + +```bash +source .venv/bin/activate +dvc repro --pull --allow-missing ``` -OT_VERSION=14.9 #or whatever your OpenTree version is -OT_TAXONOMY_VERSION=3.6 -OT_TAXONOMY_EXTRA=draft1 #optional - the draft for this version, e.g. `draft1` if the taxonomy_version is 3.6draft1 -``` -### Download +DVC will pull only the cached outputs needed for stages that haven't changed. If all stages are cached, nothing needs to be re-run. + +### Full build (first time / updating source data) + +1. Update `params.yaml` with the desired OpenTree version numbers. You can check the latest version via the [API](https://github.com/OpenTreeOfLife/germinator/wiki/Open-Tree-of-Life-Web-APIs): + + ```bash + curl -s -X POST https://api.opentreeoflife.org/v3/tree_of_life/about | grep -E '"synth_id"|"taxonomy_version"' + ``` + +2. Download the required source files into `data/` as [documented here](data/README.markdown), then register them with DVC: + + ```bash + dvc add data/OpenTree/labelled_supertree_simplified_ottnames.tre + dvc add data/OpenTree/ott3.7.tgz + dvc add data/Wiki/wd_JSON/latest-all.json.bz2 + dvc add data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + dvc add data/Wiki/wp_pagecounts/ + dvc add data/EOL/provider_ids.csv.gz + ``` + +3. Run the pipeline and push results to the shared cache: + + ```bash + dvc repro + dvc push + ``` + +4. Commit the `.dvc` files and `dvc.lock` to git. + +### Pipeline stages -Constructing the full tree of life requires various files downloaded from the internet. They should be placed within the appropriate directories in the `data` directory, as [documented here](data/README.markdown). +The pipeline is defined in `dvc.yaml`. Use `dvc dag` to visualize the DAG. 
Key stages include: -### Building the tree +- **preprocess_opentree**, **unpack_taxonomy** -- prepare OpenTree data +- **add_ott_numbers**, **prepare_open_trees**, **build_tree** -- assemble the full newick tree +- **filter_eol**, **filter_wikidata**, **filter_sql**, **filter_pageviews** -- filter massive source files (parallelizable) +- **create_tables** -- map taxa, calculate popularity, produce DB-ready CSVs +- **make_js** -- generate JS viewer files -Once data files are downloaded, you should be set up to actually build the tree and other backend files, by following [these instructions](oz_tree_build/README.markdown). +For detailed step-by-step documentation, see [oz_tree_build/README.markdown](oz_tree_build/README.markdown). diff --git a/data/EOL/.gitignore b/data/EOL/.gitignore index 6c9d760..47df0e7 100755 --- a/data/EOL/.gitignore +++ b/data/EOL/.gitignore @@ -3,4 +3,5 @@ # But not these files... !.gitignore -!README.markdown \ No newline at end of file +!README.markdown +!*.dvc \ No newline at end of file diff --git a/data/OpenTree/.gitignore b/data/OpenTree/.gitignore index 6c9d760..47df0e7 100755 --- a/data/OpenTree/.gitignore +++ b/data/OpenTree/.gitignore @@ -3,4 +3,5 @@ # But not these files... !.gitignore -!README.markdown \ No newline at end of file +!README.markdown +!*.dvc \ No newline at end of file diff --git a/data/Wiki/.gitignore b/data/Wiki/.gitignore index 6c9d760..47df0e7 100755 --- a/data/Wiki/.gitignore +++ b/data/Wiki/.gitignore @@ -3,4 +3,5 @@ # But not these files... 
!.gitignore -!README.markdown \ No newline at end of file +!README.markdown +!*.dvc \ No newline at end of file diff --git a/data/Wiki/README.markdown b/data/Wiki/README.markdown index dcbd4e5..13bd3cb 100755 --- a/data/Wiki/README.markdown +++ b/data/Wiki/README.markdown @@ -7,14 +7,10 @@ versions on external storage) * The `wp_SQL` directory should contain the en.wikipedia SQL dump file, as `enwiki-latest-page.sql.gz` (download from <https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz>) * The `wp_pagecounts` directory should contain the wikipedia pagevisits dump files: -multiple files such as `wp_pagecounts/pageviews-202403-user.bz2` etc... +multiple files such as `wp_pagecounts/pageviews-202403-user.bz2` etc... (download from <https://dumps.wikimedia.org/other/pageview_complete/monthly/>). -For `wp_pagecounts`, as a much faster alternative, you can download preprocessed pageviews files from a [release](https://github.com/OneZoom/tree-build/releases). - -You can download the gz file and unpack it in one command. e.g. from `data/Wiki/wp_pagecounts`, run: -```bash -wget https://github.com/OneZoom/tree-build/releases/download/pageviews-202306-202403/OneZoom_pageviews-202306-202403.tar.gz -O - | tar -xz -``` - -You will then omit passing pageviews files when you later run `generate_filtered_files` (see [build steps](../../oz_tree_build/README.markdown)). +These files are used as inputs to the DVC pipeline's filtering stages. If someone +has already run the pipeline and pushed results to the DVC remote, you do not need +to download these files yourself -- `dvc repro --pull --allow-missing` will pull +the cached filtered outputs instead. 
diff --git a/data/Wiki/wd_JSON/.gitignore b/data/Wiki/wd_JSON/.gitignore index d6b7ef3..65e7aa0 100644 --- a/data/Wiki/wd_JSON/.gitignore +++ b/data/Wiki/wd_JSON/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!*.dvc diff --git a/data/Wiki/wp_SQL/.gitignore b/data/Wiki/wp_SQL/.gitignore index d6b7ef3..65e7aa0 100644 --- a/data/Wiki/wp_SQL/.gitignore +++ b/data/Wiki/wp_SQL/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!*.dvc diff --git a/data/Wiki/wp_pagecounts/.gitignore b/data/Wiki/wp_pagecounts/.gitignore index d6b7ef3..65e7aa0 100644 --- a/data/Wiki/wp_pagecounts/.gitignore +++ b/data/Wiki/wp_pagecounts/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!*.dvc diff --git a/data/output_files/.gitignore b/data/output_files/.gitignore index d6b7ef3..65e7aa0 100644 --- a/data/output_files/.gitignore +++ b/data/output_files/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!*.dvc diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..3b32111 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,167 @@ +vars: + - params.yaml + +stages: + preprocess_opentree: # strip mrcaott node labels and collapse runs of spaces/underscores to "_" + cmd: >- + perl -pe 's/\)mrcaott\d+ott\d+/\)/g; s/[ _]+/_/g;' + data/OpenTree/labelled_supertree_simplified_ottnames.tre + > data/OpenTree/draftversion${ot_version}.tre + deps: + - data/OpenTree/labelled_supertree_simplified_ottnames.tre + params: + - ot_version + outs: + - data/OpenTree/draftversion${ot_version}.tre + + unpack_taxonomy: # untar the OTT taxonomy archive into data/OpenTree/ + cmd: tar -C data/OpenTree -zxf data/OpenTree/ott${ot_taxonomy_version}.tgz + deps: + - data/OpenTree/ott${ot_taxonomy_version}.tgz + params: + - ot_taxonomy_version + outs: + - data/OpenTree/ott${ot_taxonomy_version}/ + + add_ott_numbers: # recreate include_OTT<version>/ from include_noAutoOTT/ via add_ott_numbers_to_trees + cmd: >- + rm -rf data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra} && + mkdir -p data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra} && + add_ott_numbers_to_trees + --savein data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra} + 
data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] + deps: + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/ + params: + - oz_tree + - ot_taxonomy_version + - ot_taxonomy_extra + outs: + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/ + + prepare_open_trees: # seed OpenTree_all/ with OT_required/*.nwk, then run get_open_trees_from_one_zoom + cmd: >- # DVC deletes a stage's outs before running cmd, so OpenTree_all/ must be recreated before cp + mkdir -p data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all && cp -n data/OZTreeBuild/${oz_tree}/OpenTreeParts/OT_required/*.nwk + data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ 2>/dev/null || true && + cd data/OZTreeBuild/${oz_tree} && + get_open_trees_from_one_zoom + ../../OpenTree/draftversion${ot_version}.tre + OpenTreeParts/OpenTree_all/ + BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/*.PHY + deps: + - data/OpenTree/draftversion${ot_version}.tre + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/ + - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OT_required/ + params: + - oz_tree + - ot_version + - ot_taxonomy_version + - ot_taxonomy_extra + outs: + - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ + + build_tree: # run build_oz_tree on Base.PHY and OpenTree_all/ to write the full tree file + cmd: >- + cd data/OZTreeBuild/${oz_tree} && + build_oz_tree + BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/Base.PHY + OpenTreeParts/OpenTree_all/ + ${oz_tree}_full_tree.phy + deps: + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/ + - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ + params: + - oz_tree + - ot_taxonomy_version + - ot_taxonomy_extra + outs: + - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy + + filter_eol: # run filter_eol on the EOL provider ids and taxonomy.tsv + cmd: >- + filter_eol + data/EOL/provider_ids.csv.gz + data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv + -o data/filtered/OneZoom_provider_ids.csv + deps: + - data/EOL/provider_ids.csv.gz + - data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv + params: + - ot_taxonomy_version + outs: + - data/filtered/OneZoom_provider_ids.csv + + filter_wikidata: # run filter_wikidata; also writes wikidata_titles.txt, a dep of filter_sql and filter_pageviews + cmd: >- + filter_wikidata + 
data/Wiki/wd_JSON/latest-all.json.bz2 + -o data/filtered/OneZoom_latest-all.json + --titles-output data/filtered/wikidata_titles.txt + deps: + - data/Wiki/wd_JSON/latest-all.json.bz2 + outs: + - data/filtered/OneZoom_latest-all.json + - data/filtered/wikidata_titles.txt + + filter_sql: # run filter_wikipedia_sql on the enwiki page dump and wikidata_titles.txt + cmd: >- + filter_wikipedia_sql + data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + data/filtered/wikidata_titles.txt + -o data/filtered/OneZoom_enwiki-latest-page.sql + deps: + - data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + - data/filtered/wikidata_titles.txt + outs: + - data/filtered/OneZoom_enwiki-latest-page.sql + + filter_pageviews: # run filter_pageviews on the pageview dumps and wikidata_titles.txt + cmd: >- + filter_pageviews + data/Wiki/wp_pagecounts/pageviews*.bz2 + --titles-file data/filtered/wikidata_titles.txt + -o data/filtered/pageviews + deps: + - data/Wiki/wp_pagecounts/ + - data/filtered/wikidata_titles.txt + outs: + - data/filtered/pageviews/ + + create_tables: # run CSV_base_table_creator on the full tree, taxonomy and all filtered inputs + cmd: >- # DVC deletes a stage's outs before running cmd; the 2> redirect below needs data/output_files/ to exist + mkdir -p data/output_files && CSV_base_table_creator + data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy + data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv + data/filtered/OneZoom_provider_ids.csv + data/filtered/OneZoom_latest-all.json + data/filtered/OneZoom_enwiki-latest-page.sql + data/filtered/pageviews/OneZoom_pageviews* + -o data/output_files -v + --version ${build_version} + --exclude ${exclude_from_popularity} + --extra_source_file data/OZTreeBuild/${oz_tree}/BespokeTree/SupplementaryTaxonomy.tsv + 2> data/output_files/ordered_output.log + deps: + - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy + - data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv + - data/filtered/OneZoom_provider_ids.csv + - data/filtered/OneZoom_latest-all.json + - data/filtered/OneZoom_enwiki-latest-page.sql + - data/filtered/pageviews/ + - data/OZTreeBuild/${oz_tree}/BespokeTree/SupplementaryTaxonomy.tsv + params: + - oz_tree + - ot_taxonomy_version + - build_version + - exclude_from_popularity + outs: + - data/output_files/ + + make_js: # run make_js_treefiles, writing into data/js_output + cmd: >- + make_js_treefiles + --outdir data/js_output + deps: 
+ - data/output_files/ + outs: + - data/js_output/ diff --git a/oz_tree_build/README.markdown b/oz_tree_build/README.markdown index d3336f5..439edcf 100755 --- a/oz_tree_build/README.markdown +++ b/oz_tree_build/README.markdown @@ -1,18 +1,59 @@ # Introduction + Creating a bespoke OneZoom tree involves a number of steps, as documented below. These take an initial tree, map taxa onto Open Tree identifiers, add subtrees from the OpenTree of Life, resolve polytomies and delete subspecies, and calculate mappings to other databases together with creating wikipedia popularity metrics for all taxa. Finally, the resulting tree and database files are converted to a format usable by the OneZoom viewer. Mapping and popularity calculations require various large files to be downloaded e.g. from wikipedia, as [documented here](../data/README.markdown). The instructions below are primarily intended for creating a full tree of all life on the main OneZoom site. If you are making a bespoke tree, you may need to tweak them slightly. -The output files created by the tree building process (database files and files to feed to the js, -and which can be loaded into the database and for the tree viewer) are saved in `output_files`. +The output files created by the tree building process (database files and files to feed to the js, and which can be loaded into the database and for the tree viewer) are saved in `data/output_files`. + +## Using DVC (recommended) + +The entire build is defined as a [DVC](https://dvc.org/) pipeline in `dvc.yaml`, with parameters in `params.yaml`. This means you can reproduce the full build with a single command: + +```bash +source .venv/bin/activate +dvc repro +``` + +If the pipeline has already been run by someone else and the results pushed to the DVC remote, you can pull cached outputs without downloading any of the large source files: + +```bash +dvc repro --pull --allow-missing +``` + +To run only up to a specific stage (e.g. 
the JS generation; any out-of-date upstream stages are re-run first): + +```bash +dvc repro make_js +``` + +To visualize the pipeline DAG: + +```bash +dvc dag +``` + +After running the pipeline, copy the JS output from `data/js_output/` to the OZtree repo: + +```bash +cp data/js_output/* ../OZtree/static/FinalOutputs/data/ +``` + +### Updating parameters + +Edit `params.yaml` to change the OpenTree version, taxonomy version, build version, etc. DVC will detect the parameter changes and re-run only the affected stages. + +## Manual steps (without DVC) -## Environment +The following manual instructions are preserved for reference. They document the same steps that the DVC pipeline automates. + +### Environment The following environment variables should be set: ``` OZ_TREE=AllLife # a tree directory in data/OZTreeBuild -OZ_DIR=../OZtree # the path to the OneZoom/OZtree github directory (here we assume the `tree-build` repo is a sibling to the `OZtree` repo) +OZ_DIR=../OZtree # the path to the OneZoom/OZtree github directory ``` You also need to select the OpenTree version to build against. @@ -33,7 +74,7 @@ OT_TAXONOMY_VERSION=3.7 OT_TAXONOMY_EXTRA=draft2 #optional - the draft for this version, e.g. `draft1` if the taxonomy_version is 3.6draft1 ``` -## Downloads +### Downloads Follow the [the download instructions](../data/README.markdown) to fetch required files. 
In summary, this should entail: @@ -46,13 +87,12 @@ wget -cP data/OpenTree/ "https://files.opentreeoflife.org/ott/ott${OT_TAXONOMY_V wget -cP data/Wiki/wp_SQL/ https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz wget -cP data/Wiki/wd_JSON/ https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 -## Pre-processed PageViews - see https://github.com/OneZoom/tree-build/releases -curl -L https://github.com/OneZoom/tree-build/releases/download/pageviews-202306-202403/OneZoom_pageviews-202306-202403.tar.gz | tar -zxC data/Wiki/wp_pagecounts/ +## Pageviews +wget -cP data/Wiki/wp_pagecounts/ 'https://dumps.wikimedia.org/other/pageview_complete/monthly/2024/2024-03/pageviews-202403-user.bz2' ## EoL # TODO: In theory fetchable from https://opendata.eol.org/dataset/identifier-map, but currently broken cp provider_ids.csv.gz data/EOL/ - ``` Note that as documented in that readme, @@ -64,13 +104,13 @@ perl -pe 's/\)mrcaott\d+ott\d+/\)/g; s/[ _]+/_/g;' \ > data/OpenTree/draftversion${OT_VERSION}.tre ``` -# Building a tree +### Building a tree -The times given at the start of each of the following steps refer to the time taken to run the commands on the entire tree of life. +The times given at the start of each of the following steps refer to the time taken to run the commands on the entire tree of life. If you already have your own newick tree with open tree ids on it already, and don't want to graft extra clades from the OpenTree, you can skip steps 1-4, and simply save the tree as `${OZ_TREE}_full_tree.phy` in your base directory. If you have a tree but it does not have ott numbers, then you can add them using step 1, and move the resulting tree in `BespokeTree/include_files` to `${OZ_TREE}_full_tree.phy` in your base directory. -## Create the tree +### Create the tree 0. 
The following steps assume the venv has been activated: @@ -80,8 +120,8 @@ If you already have your own newick tree with open tree ids on it already, and d If not created, see installation steps in the [main README](../README.markdown). -1. (20 secs) Use the [OpenTree API](https://github.com/OpenTreeOfLife/germinator/wiki/Synthetic-tree-API-v3) to add OTT ids to any non-opentree taxa in our own bespoke phylogenies (those in `*.phy` or `*.PHY` files). The new `.phy` and `.PHY` files will be created in a new directory within `data/OZTreeBuild/${OZ_TREE}/BespokeTree`, and a symlink to that directory will be created called `include_files` - +1. (20 secs) Use the [OpenTree API](https://github.com/OpenTreeOfLife/germinator/wiki/Synthetic-tree-API-v3) to add OTT ids to any non-opentree taxa in our own bespoke phylogenies (those in `*.phy` or `*.PHY` files). The new `.phy` and `.PHY` files will be created in a new directory within `data/OZTreeBuild/${OZ_TREE}/BespokeTree`, and a symlink to that directory will be created called `include_files` + ``` mkdir -p "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}" touch "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/dir" @@ -107,8 +147,8 @@ If you already have your own newick tree with open tree ids on it already, and d BespokeTree/include_files/*.PHY) ``` If you are not including any OpenTree subtrees in your final tree, you should have no `.PHY` files, and this step will output a warning, which can be ignored. - -1. (1 sec) substitute these subtrees into the main tree, and save the resulting full newick file using the `build_oz_tree` script: + +1. 
(1 sec) substitute these subtrees into the main tree, and save the resulting full newick file using the `build_oz_tree` script: ``` (cd data/OZTreeBuild/${OZ_TREE} && \ @@ -126,29 +166,21 @@ If you already have your own newick tree with open tree ids on it already, and d gzip < data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy > ${OZ_DIR}/static/FinalOutputs/${OZ_TREE}_full_tree.phy.gz ``` - ## Create the base tree and table data - -1. (5 to 7 hours, or a few mins if files are already filtered, see below) This generates filtered versions of the raw input files, which then makes them faster to work with. For example, for the massive wikimedia dump file (`latest-all.json.bz2`), it remove all entries that aren't taxons or vernaculars, and for each remaining entry, in only keeps the small subset of fields that we care about. - - The output files have the same names as the input files, but with a `OneZoom_` prefix, and without using compression (e.g. `OneZoom_latest-all.json` for `latest-all.json`.bz2). They are stored next to their matching input files. +### Create the base tree and table data - Note that by default, it works incrementally and only generates new filtered files if they are missing or old versions. It does this by setting the timestamp of generated files to match their source file. So if for instance it has already filtered `latest-all.json.bz2`, but has not processed the SQL or Page Count files, you can just rerun the same command, and it will not need to reprocess `latest-all.json.bz2`. You can override this behavior and force full regeneration by passing in a `-f` flag. - - From the data folder, run the `generate_filtered_files` script: +5. (5 to 7 hours, or a few mins if files are already filtered) This generates filtered versions of the raw input files, which then makes them faster to work with. 
In the DVC pipeline, this is handled by the `filter_eol`, `filter_wikidata`, `filter_sql`, and `filter_pageviews` stages, which run as separate parallel stages. Without DVC, the `generate_filtered_files` script can still be used to run them all together: ``` tar -C data/OpenTree -zxvf data/OpenTree/ott${OT_TAXONOMY_VERSION}.tgz (cd data && generate_filtered_files OZTreeBuild/AllLife/AllLife_full_tree.phy OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv EOL/provider_ids.csv.gz Wiki/wd_JSON/latest-all.json.bz2 Wiki/wp_SQL/enwiki-latest-page.sql.gz Wiki/wp_pagecounts/pageviews*.bz2) ``` - Alternatively, if you downloaded the preprocessed pageviews file (per [instructions](../data/Wiki/README.markdown)), you should omit the last argument (`Wiki/wp_pagecounts/pageviews*.bz2`) from this `generate_filtered_files` command. - 1. (11 mins) On the basis of the `${OZ_TREE}_full_tree.phy` file, look for ID mappings between different datasets, calculate popularity measures via wikidata/pedia, refine the tree (remove subspecies, randomly break polytomies, remove unifurcations etc), and then create corresponding database tables together with `ordered_tree_XXXXX.nwk`, `ordered_tree_XXXXX.poly` (same file but with polytomies marked with curly braces), and `ordered_dates_XXXXX.js` files (where XXXXX is the version number, usually a timestamp). Additional flags can be given to override the OpenTree taxonomy in specific cases (using `--extra_source_file`), and to exclude certain taxa (e.g. dinosaurs) from the popularity calculations. If you do not have comprehensive tree of a clade, it probably doesn't make sense to calculate popularity measures, and you can run this script with the `-p` flag (or omit the references to the `wp_` wikipedia files). 
- + ``` CSV_base_table_creator \ data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy \ @@ -163,18 +195,17 @@ If you already have your own newick tree with open tree ids on it already, and d 2> data/output_files/ordered_output.log ``` - Since round braces, curly braces, and commas are banned from the `simplified_ottnames` file, we can create minimal topology files by simply removing everything except these characters from the `.nwk` and `.poly` files. If the tree has been ladderised, with polytomies and unifurcations removed, the commas are also redundant, and can be removed. This is done in the next step, which saves these highly shortened strings into .js data files. + Since round braces, curly braces, and commas are banned from the `simplified_ottnames` file, we can create minimal topology files by simply removing everything except these characters from the `.nwk` and `.poly` files. If the tree has been ladderised, with polytomies and unifurcations removed, the commas are also redundant, and can be removed. This is done in the next step, which saves these highly shortened strings into .js data files. + +1. (1 min) Turn the most recently saved tree files (saved in the previous step as `data/output_files/ordered_tree_XXXXXX.poly` and `ordered_dates_XXXXXX.json`) into bracketed newick JS files. In the DVC pipeline, these are output to `data/js_output/` and can be copied to the OZtree repo. Without DVC, you can write directly to the OZtree directory: -1. 
(1 min) turn the most recently saved tree files (saved in the previous step as `data/output_files/ordered_tree_XXXXXX.poly` and `ordered_dates_XXXXXX.json`) into bracketed newick strings in `${OZ_DIR}/static/FinalOutputs/data/basetree_XXXXXX.js`, ``${OZ_DIR}/static/FinalOutputs/data/polytree_XXXXXX.js`, a cutpoints file in ``${OZ_DIR}/static/FinalOutputs/data/cut_position_map_XXXXXX.js`, and a dates file in ``${OZ_DIR}/static/FinalOutputs/data/dates_XXXXXX.json` as well as their gzipped equivalents, using - ``` make_js_treefiles --outdir ${OZ_DIR}/static/FinalOutputs/data ``` - - ## Upload data to the server and check it - -1. If you are running the tree building scripts on a different computer to the one running the web server, you will need to push the `completetree_XXXXXX.js`, `completetree_XXXXXX.js.gz`, `cut_position_map_XXXXXX.js`, `cut_position_map_XXXXXX.js.gz`, `dates_XXXXXX.js` -, `dates_XXXXXX.js.gz` files onto your server, e.g. by pushing to your local Github repo then pulling the latest github changes to the server. + +### Upload data to the server and check it + +8. If you are running the tree building scripts on a different computer to the one running the web server, you will need to push the `completetree_XXXXXX.js`, `completetree_XXXXXX.js.gz`, `cut_position_map_XXXXXX.js`, `cut_position_map_XXXXXX.js.gz`, `dates_XXXXXX.js`, `dates_XXXXXX.js.gz` files onto your server, e.g. by pushing to your local Github repo then pulling the latest github changes to the server. 1. (15 mins) load the CSV tables into the DB, using the SQL commands printed in step 6 (at the end of the `data/output_files/ordered_output.log` file: the lines that start something like `TRUNCATE TABLE ordered_leaves; LOAD DATA LOCAL INFILE ...;` `TRUNCATE TABLE ordered_nodes; LOAD DATA LOCAL INFILE ...;`). Either do so via a GUI utility, or copy the `.csv.mySQL` files to a local directory on the machine running your SQL server (e.g. 
using `scp -C` for compression) and run your `LOAD DATA LOCAL INFILE` commands on the mysql command line (this may require you to start the command line utility using `mysql --local-infile`, e.g.: ``` @@ -186,22 +217,23 @@ If you already have your own newick tree with open tree ids on it already, and d select * from reservations left outer join ordered_leaves on reservations.OTT_ID = ordered_leaves.ott where ordered_leaves.ott is null and reservations.verified_name IS NOT NULL; select group_concat(id), group_concat(parent), group_concat(name), count(ott) from ordered_leaves group by ott having(count(ott) > 1) ``` - - ## Fill in additional server fields -1. (15 mins) create example pictures for each node by percolating up. This requires the most recent `images_by_ott` table, so either do this on the main server, or (if you are doing it locally) update your `images_by_ott` to the most recent server version. +### Fill in additional server fields + +11. (15 mins) create example pictures for each node by percolating up. This requires the most recent `images_by_ott` table, so either do this on the main server, or (if you are doing it locally) update your `images_by_ott` to the most recent server version. ``` ${OZ_DIR}/OZprivate/ServerScripts/Utilities/picProcess.py -v ``` -1. (5 mins) percolate the IUCN data up using - +1. (5 mins) percolate the IUCN data up using + ``` ${OZ_DIR}/OZprivate/ServerScripts/Utilities/IUCNquery.py -v ``` (note that this both updates the IUCN data in the DB and percolates up interior node info) 1. (10 mins) If this is a site with sponsorship (only the main OZ site), set the pricing structure using SET_PRICES.html (accessible from the management pages). -1. (5 mins - this does seem to be necessary for ordered nodes & ordered leaves). Make sure indexes are reset. Look at `OZprivate/ServerScripts/SQL/create_db_indexes.sql` for the SQL to do this - this may involve logging in to the SQL server (e.g. 
via Sequel Pro on Mac) and pasting all the drop index and create index commands. - - ## at last -1. Have a well deserved cup of tea +1. (5 mins - this does seem to be necessary for ordered nodes & ordered leaves). Make sure indexes are reset. Look at `OZprivate/ServerScripts/SQL/create_db_indexes.sql` for the SQL to do this - this may involve logging in to the SQL server (e.g. via Sequel Pro on Mac) and pasting all the drop index and create index commands. + +### At last + +15. Have a well-deserved cup of tea diff --git a/oz_tree_build/utilities/filter_common.py b/oz_tree_build/utilities/filter_common.py new file mode 100644 index 0000000..8abdb87 --- /dev/null +++ b/oz_tree_build/utilities/filter_common.py @@ -0,0 +1,29 @@ +"""Shared utilities for the filter modules.""" + +import csv + +from .file_utils import open_file_based_on_extension + + +def read_taxonomy_source_ids(taxonomy_file): + """ + Read an OpenTree taxonomy.tsv file and return a dict mapping source + names to sets of integer IDs. Used by filter_eol and filter_wikidata + (in clade mode).
+ """ + sources = {"ncbi", "if", "worms", "irmng", "gbif"} + source_ids = {source: set() for source in sources} + + with open_file_based_on_extension(taxonomy_file, "rt") as f: + reader = csv.DictReader(f, delimiter="\t") + for OTTrow in reader: + sourceinfo = OTTrow["sourceinfo"] + for srcs in sourceinfo.split(","): + src, src_id = srcs.split(":", 1) + if src in sources: + try: + source_ids[src].add(int(src_id)) + except ValueError: + pass + + return source_ids diff --git a/oz_tree_build/utilities/filter_eol.py b/oz_tree_build/utilities/filter_eol.py new file mode 100644 index 0000000..99a9209 --- /dev/null +++ b/oz_tree_build/utilities/filter_eol.py @@ -0,0 +1,79 @@ +"""Filter EOL provider IDs CSV to keep only relevant sources.""" + +import argparse +import logging +import sys + +from ..taxon_mapping_and_popularity.CSV_base_table_creator import iucn_num +from .file_utils import open_file_based_on_extension +from .filter_common import read_taxonomy_source_ids + + +def filter_eol_ids(eol_id_file, output_file, source_ids, clade=None): + """ + Filter the EOL identifiers file, keeping only rows from known providers + whose IDs appear in the taxonomy. In non-clade (full-tree) mode, all rows + from known providers are kept. + + Returns nothing; writes filtered output to output_file. 
+ """ + eol_sources = {"676": "ncbi", "459": "worms", "767": "gbif", str(iucn_num): "iucn"} + iucn_lines = [] + known_names = set() + + with open_file_based_on_extension(eol_id_file, "rt") as eol_f: + with open_file_based_on_extension(output_file, "wt") as filtered_eol_f: + for i, line in enumerate(eol_f): + if i == 0: + filtered_eol_f.write(line) + continue + + fields = line.split(",") + + if fields[2] not in eol_sources: + continue + + try: + eol_id = int(fields[1]) + except ValueError: + continue + + if not clade: + filtered_eol_f.write(line) + continue + + if fields[2] == str(iucn_num): + iucn_lines.append(line) + elif eol_id in source_ids[eol_sources[fields[2]]]: + filtered_eol_f.write(line) + known_names.add(fields[4]) + + for line in iucn_lines: + fields = line.split(",") + if fields[4] in known_names: + filtered_eol_f.write(line) + + logging.info( + f"Found {len(source_ids['ncbi'])} NCBI ids, " + f"{len(source_ids['if'])} IF ids, " + f"{len(source_ids['worms'])} WoRMS ids, " + f"{len(source_ids['irmng'])} IRMNG ids, " + f"{len(source_ids['gbif'])} GBIF ids" + ) + + +def main(): + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("eol_file", help="The EOL identifiers CSV file (optionally gzipped)") + parser.add_argument("taxonomy_file", help="The OpenTree taxonomy.tsv file") + parser.add_argument("-o", "--output", required=True, help="Output path for filtered EOL CSV") + args = parser.parse_args() + + source_ids = read_taxonomy_source_ids(args.taxonomy_file) + filter_eol_ids(args.eol_file, args.output, source_ids) + + +if __name__ == "__main__": + main() diff --git a/oz_tree_build/utilities/filter_pageviews.py b/oz_tree_build/utilities/filter_pageviews.py new file mode 100644 index 0000000..8b766ca --- /dev/null +++ b/oz_tree_build/utilities/filter_pageviews.py @@ -0,0 +1,81 @@ +"""Filter Wikipedia pageview files to keep only pages matching wikidata titles.""" + +import 
argparse +import logging +import os +import sys +from collections import defaultdict + +from .file_utils import enumerate_lines_from_file, open_file_based_on_extension +from .filter_wikidata import load_titles_file + + +def unquote_if_quoted(s): + if (s.startswith("'") and s.endswith("'")) or (s.startswith('"') and s.endswith('"')): + s = s[1:-1] + return bytes(s, "utf-8").decode("unicode_escape") + return s + + +def filter_pageviews(pageviews_file, output_file, wikidata_titles, wikilang="en"): + """ + Filter a single pageview file, keeping only entries whose title appears + in the wikidata_titles set. Aggregates views per title and writes output + in the simplified format (``Title viewcount``). + """ + match_project = wikilang + ".wikipedia " + pageviews = defaultdict(int) + simplified_line_format = False + + for i, line in enumerate_lines_from_file(pageviews_file): + if i == 0: + simplified_line_format = line.count(" ") == 1 + + if i > 0 and i % 10000000 == 0: + logging.info(f"Processed {i} lines") + + if not simplified_line_format and not line.startswith(match_project): + continue + + info = line.split(" ") + if simplified_line_format: + title = info[0] + views = info[1] + else: + title = unquote_if_quoted(info[1]) + views = info[4] + + if title in wikidata_titles: + pageviews[title] += int(views) + + with open_file_based_on_extension(output_file, "wt") as filtered_f: + for title, views in pageviews.items(): + filtered_f.write(title + " " + str(views) + "\n") + + +def main(): + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("pageview_files", nargs="+", help="One or more pageview files (optionally bz2-compressed)") + parser.add_argument("--titles-file", required=True, help="wikidata_titles.txt file (one title per line)") + parser.add_argument("-o", "--output-dir", required=True, help="Output directory for filtered pageview files") + parser.add_argument("--wikilang", 
default="en", help="Wikipedia language code") + args = parser.parse_args() + + wikidata_titles = load_titles_file(args.titles_file) + logging.info(f"Loaded {len(wikidata_titles)} wikidata titles") + + os.makedirs(args.output_dir, exist_ok=True) + + for pv_file in args.pageview_files: + basename = os.path.basename(pv_file) + if basename.endswith(".bz2"): + basename = basename[:-4] + output_file = os.path.join(args.output_dir, f"OneZoom_{basename}") + logging.info(f"Filtering {pv_file} -> {output_file}") + filter_pageviews(pv_file, output_file, wikidata_titles, wikilang=args.wikilang) + + +if __name__ == "__main__": + main() diff --git a/oz_tree_build/utilities/filter_wikidata.py b/oz_tree_build/utilities/filter_wikidata.py new file mode 100644 index 0000000..fb294ff --- /dev/null +++ b/oz_tree_build/utilities/filter_wikidata.py @@ -0,0 +1,223 @@ +"""Filter the massive wikidata JSON dump to taxon and vernacular items.""" + +import argparse +import json +import logging +import sys + +from .._OZglobals import wikiflags +from ..taxon_mapping_and_popularity.OTT_popularity_mapping import ( + JSON_contains_known_dbID, + Qid, + label, +) +from .apply_mask_to_object_graph import ANY, KEEP, apply_mask_to_object_graph +from .file_utils import enumerate_lines_from_file, open_file_based_on_extension +from .temp_helpers import ( + find_taxon_and_vernaculars, + get_wikipedia_name, + quick_byte_match, + wikidata_value, +) + +WIKIDATA_MASK = { + "type": KEEP, + "id": KEEP, + "labels": {"en": {"value": KEEP}}, + "claims": { + "P31": [ + { + "mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}, + "qualifiers": { + "P642": [{"datavalue": {"value": {"numeric-id": KEEP}}}] + }, + } + ], + "P685": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + "P846": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + "P850": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + "P1391": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + "P5055": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + 
"P830": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + "P961": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + "P9157": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + "P3151": [{"mainsnak": {"datavalue": {"value": KEEP}}}], + "P141": [{"references": [{"snaks": {"P627": [{"datavalue": {"value": KEEP}}]}}]}], + "P1420": [{"mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}}], + "P18": [ + { + "mainsnak": {"datavalue": {"value": KEEP}}, + "rank": KEEP, + } + ], + "P1843": [ + { + "mainsnak": {"datavalue": {"value": KEEP}}, + "rank": KEEP, + } + ], + }, + "sitelinks": {ANY: {"title": KEEP}}, +} + + +def filter_wikidata( + wikidata_file, + output_file, + source_ids=None, + clade=None, + wikilang="en", + dont_trim_sitelinks=False, +): + """ + Filter the wikidata JSON dump, keeping only taxon and vernacular items, + and trimming each item to only the fields we consume. + + Returns the set of Wikipedia page titles found in the filtered output + (the ``wikidata_ids`` set used by downstream SQL and pageview filters). 
+ """ + sitelinks_key = f"{wikilang}wiki" + + def trim_and_write_json_item(json_item, filtered_wiki_f): + apply_mask_to_object_graph(json_item, WIKIDATA_MASK) + + if dont_trim_sitelinks: + json_item["sitelinks"] = { + k: v for k, v in json_item["sitelinks"].items() if k.endswith("wiki") + } + else: + json_item["sitelinks"] = { + k: v if k == sitelinks_key else {} + for k, v in json_item["sitelinks"].items() + if k.endswith("wiki") and len(k) == 6 and k[:2] in wikiflags + } + + filtered_wiki_f.write(json.dumps(json_item, separators=(",", ":"))) + filtered_wiki_f.write(",\n") + + included_qids = set() + potential_extra_json_items = [] + + with open_file_based_on_extension(output_file, "wt") as filtered_wiki_f: + filtered_wiki_f.write("[\n") + preserved_lines = 0 + + def get_line_message(line_num): + return f"Kept {preserved_lines}/{line_num} lines ({preserved_lines / line_num * 100:.2f}%)" + + for _, line in enumerate_lines_from_file(wikidata_file, 100000, get_line_message): + if not (line.startswith('{"type":') and quick_byte_match.search(line)): + continue + + json_item = json.loads(line.rstrip().rstrip(",")) + + try: + is_taxon, vernaculars_matches = find_taxon_and_vernaculars(json_item) + except KeyError: + continue + + if not is_taxon and not len(vernaculars_matches) > 0: + continue + + if clade and is_taxon and source_ids: + if not len(JSON_contains_known_dbID(json_item, source_ids)) > 0: + if "P1420" in json_item["claims"] and json_item["sitelinks"]: + potential_extra_json_items.append( + ( + "taxon_synonym", + {wikidata_value(i["mainsnak"])["numeric-id"] for i in json_item["claims"]["P1420"]}, + json_item, + ) + ) + if vernaculars_matches: + potential_extra_json_items.append(("instance_of_synonym", vernaculars_matches, json_item)) + continue + + if is_taxon: + trim_and_write_json_item(json_item, filtered_wiki_f) + included_qids.add(Qid(json_item)) + preserved_lines += 1 + else: + potential_extra_json_items.append(("vernacular", vernaculars_matches, 
json_item)) + + logging.info( + "Writing extra lines at the end of the file " + f"(subset of {len(potential_extra_json_items)} lines)" + ) + + for desc, linked_qids, json_item in potential_extra_json_items: + for qid in linked_qids: + if qid in included_qids: + trim_and_write_json_item(json_item, filtered_wiki_f) + logging.info( + f"Including {desc} entry: Q{Qid(json_item)} " + f"('{label(json_item)}','{get_wikipedia_name(json_item)}' => Q{qid}" + ) + break + + filtered_wiki_f.write("]\n") + + # Re-read the filtered output to extract the set of Wikipedia page titles + wikidata_titles = extract_wikidata_titles(output_file) + return wikidata_titles + + +def extract_wikidata_titles(filtered_wikidata_file): + """ + Read a filtered wikidata JSON file and return the set of Wikipedia page + titles (used by downstream SQL and pageview filters). + """ + titles = set() + for _, line in enumerate_lines_from_file(filtered_wikidata_file): + if not line.startswith('{"type":'): + continue + json_item = json.loads(line.rstrip().rstrip(",")) + titles.add(get_wikipedia_name(json_item)) + return titles + + +def write_titles_file(titles, output_path): + """Write the set of titles to a text file, one per line.""" + with open(output_path, "w") as f: + for title in sorted(titles): + f.write(title + "\n") + + +def load_titles_file(titles_path): + """Load a titles text file into a set.""" + with open(titles_path) as f: + return {line.strip() for line in f if line.strip()} + + +def main(): + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("wikidata_file", help="The wikidata JSON dump file (bz2 or plain)") + parser.add_argument("-o", "--output", required=True, help="Output path for filtered wikidata JSON") + parser.add_argument( + "--titles-output", + required=True, + help="Output path for wikidata_titles.txt sidecar (one Wikipedia title per line)", + ) + parser.add_argument("--wikilang", default="en", 
help="Wikipedia language code") + parser.add_argument( + "--dont-trim-sitelinks", + action="store_true", + default=False, + help="Keep the full sitelinks value for all languages", + ) + args = parser.parse_args() + + titles = filter_wikidata( + args.wikidata_file, + args.output, + wikilang=args.wikilang, + dont_trim_sitelinks=args.dont_trim_sitelinks, + ) + write_titles_file(titles, args.titles_output) + logging.info(f"Wrote {len(titles)} titles to {args.titles_output}") + + +if __name__ == "__main__": + main() diff --git a/oz_tree_build/utilities/filter_wikipedia_sql.py b/oz_tree_build/utilities/filter_wikipedia_sql.py new file mode 100644 index 0000000..d8a6e8d --- /dev/null +++ b/oz_tree_build/utilities/filter_wikipedia_sql.py @@ -0,0 +1,80 @@ +"""Filter the enwiki SQL page dump to keep only pages matching wikidata titles.""" + +import argparse +import csv +import logging +import sys + +from .file_utils import open_file_based_on_extension +from .filter_wikidata import load_titles_file + + +def filter_wikipedia_sql(sql_file, output_file, wikidata_titles): + """ + Filter the enwiki page SQL dump, keeping only rows whose title appears + in the wikidata_titles set. 
+ """ + page_table_namespace_column = 2 + page_table_title_column = 3 + page_is_redirect_column = 4 + page_table_pagelen_column = 10 + + with open_file_based_on_extension(output_file, "wt") as filtered_sql_f: + current_output_line_entry_count = 0 + max_entries_per_line = 10 + with open_file_based_on_extension(sql_file, "rt") as sql_f: + pagelen_file = csv.reader(sql_f, quotechar="'", escapechar="\\", doublequote=False) + match_line = "INSERT INTO `page` VALUES " + for fields in filter( + lambda x: False if len(x) == 0 else x[0].startswith(match_line), + pagelen_file, + ): + field_num = 0 + for field in fields: + try: + if field and field.lstrip()[0] == "(": + field_num = 0 + namespace = None + title = None + is_redirect = "0" + except IndexError: + pass + field_num += 1 + if field_num == page_table_namespace_column: + namespace = field + if field_num == page_table_title_column: + title = field + if field_num == page_is_redirect_column: + is_redirect = field + elif field_num == page_table_pagelen_column and namespace == "0": + if title in wikidata_titles: + if current_output_line_entry_count == 0: + filtered_sql_f.write(match_line) + else: + filtered_sql_f.write(",") + + title = title.replace("'", "\\'") + filtered_sql_f.write(f"(,{namespace},'{title}',{is_redirect},,,,,,{field},,)") + + current_output_line_entry_count += 1 + if current_output_line_entry_count == max_entries_per_line: + filtered_sql_f.write(";\n") + current_output_line_entry_count = 0 + + +def main(): + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("sql_file", help="The enwiki SQL page dump file (optionally gzipped)") + parser.add_argument("titles_file", help="wikidata_titles.txt file (one title per line)") + parser.add_argument("-o", "--output", required=True, help="Output path for filtered SQL file") + args = parser.parse_args() + + wikidata_titles = load_titles_file(args.titles_file) + 
logging.info(f"Loaded {len(wikidata_titles)} wikidata titles") + filter_wikipedia_sql(args.sql_file, args.output, wikidata_titles) + + +if __name__ == "__main__": + main() diff --git a/oz_tree_build/utilities/generate_filtered_files.py b/oz_tree_build/utilities/generate_filtered_files.py index 538b84a..cfc9f4e 100644 --- a/oz_tree_build/utilities/generate_filtered_files.py +++ b/oz_tree_build/utilities/generate_filtered_files.py @@ -3,472 +3,75 @@ 1. Filter the input files to remove many irrelevant things in order to make them smaller. 2. Generate test files that are a filtered subset of the full files, targeted at a specific clade/taxon. + +For the DVC pipeline, the individual filter modules (filter_eol, filter_wikidata, +filter_wikipedia_sql, filter_pageviews) are run as separate stages. This module +provides the orchestrating function used for clade-specific test filtering. """ import argparse -import csv -import json import logging import os import sys import time -from collections import defaultdict -from .._OZglobals import wikiflags from ..newick.extract_trees import get_taxon_subtree_from_newick_file from ..newick.newick_parser import parse_tree -from ..taxon_mapping_and_popularity.CSV_base_table_creator import iucn_num -from ..taxon_mapping_and_popularity.OTT_popularity_mapping import ( - JSON_contains_known_dbID, - Qid, - label, -) -from .apply_mask_to_object_graph import ANY, KEEP, apply_mask_to_object_graph from .file_utils import enumerate_lines_from_file, open_file_based_on_extension -from .temp_helpers import ( - find_taxon_and_vernaculars, - get_wikipedia_name, - quick_byte_match, - wikidata_value, -) +from .filter_common import read_taxonomy_source_ids +from .filter_eol import filter_eol_ids +from .filter_pageviews import filter_pageviews +from .filter_wikidata import filter_wikidata +from .filter_wikipedia_sql import filter_wikipedia_sql __author__ = "David Ebbo" one_zoom_file_prefix = "OneZoom" -def generate_and_cache_filtered_file(original_file, 
context, processing_function): - """ - Helper to perform caching of filtered files. - """ - +def _compute_output_path(original_file, prefix, compress=False): + """Compute the output path for a filtered file given a prefix (clade or OneZoom).""" dirname = os.path.dirname(original_file) file_name = os.path.basename(original_file) - filtered_file_prefix = (context.clade or one_zoom_file_prefix) + "_" - if file_name.startswith(filtered_file_prefix): - raise Exception(f"Input and output files are the same, with prefix {filtered_file_prefix}") - - # If original file is a OneZoom file, remove the OneZoom prefix to avoid double prefixes if file_name.startswith(one_zoom_file_prefix): file_name = file_name[len(one_zoom_file_prefix) + 1 :] - # Include clade in new file name, e.g. '/foo/bar.csv.gz' --> '/foo/Mammalia_bar.csv.gz' - # If no clade is specified, use 'OneZoom' instead as the prefix - clade_filtered_file = os.path.join(dirname, f"{filtered_file_prefix}{file_name}") - - # If we're not compressing and it has a .gz or .bz2 extension, remove it - if not context.compress: - if clade_filtered_file.endswith(".gz") or clade_filtered_file.endswith(".bz2"): - clade_filtered_file = os.path.splitext(clade_filtered_file)[0] - - # Unless force is set, check we already have a filtered file with the matching timestamp - if not context.force: - if os.path.exists(clade_filtered_file) and os.path.getmtime(clade_filtered_file) == os.path.getmtime( - original_file - ): - logging.info(f"Using cached file {clade_filtered_file}") - return clade_filtered_file - - # If the filtered file already exists, rename it to include the timestamp, so we don't overwrite it - if os.path.exists(clade_filtered_file): - existing_file_time = os.path.getmtime(clade_filtered_file) - renamed_file_name = ( - os.path.splitext(clade_filtered_file)[0] - + "_" - + time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime(existing_file_time)) - + os.path.splitext(clade_filtered_file)[1] - ) - os.rename(clade_filtered_file, 
renamed_file_name) - logging.info(f"Renamed existing file to {renamed_file_name}") - - logging.info(f"Generating file {clade_filtered_file}") + output_file = os.path.join(dirname, f"{prefix}_{file_name}") - # Call the processing function to generate the filtered file - processing_function(original_file, clade_filtered_file, context) - - # Set the timestamp of the filtered file to match the original file - os.utime( - clade_filtered_file, - (os.path.getatime(original_file), os.path.getmtime(original_file)), - ) + if not compress: + if output_file.endswith(".gz") or output_file.endswith(".bz2"): + output_file = os.path.splitext(output_file)[0] - logging.info(f"Finished generating file {clade_filtered_file}") + return output_file - return clade_filtered_file - - -def generate_filtered_newick(newick_file, filtered_newick_file, context): - tree_string = get_taxon_subtree_from_newick_file(newick_file, context.clade) +def generate_filtered_newick(newick_file, filtered_newick_file, clade): + tree_string = get_taxon_subtree_from_newick_file(newick_file, clade) with open_file_based_on_extension(filtered_newick_file, "wt") as f: f.write(tree_string) -def read_newick_file(newick_file, context): +def read_newick_otts(newick_file): with open_file_based_on_extension(newick_file, "rt") as f: - filtered_tree_string = f.read() + tree_string = f.read() + return {node["ott"] for node in parse_tree(tree_string)} - # Get the set of OTT ids from the filtered tree - context.otts = {node["ott"] for node in parse_tree(filtered_tree_string)} - -def generate_filtered_taxonomy_file(taxonomy_file, filtered_taxonomy_file, context): +def generate_filtered_taxonomy_file(taxonomy_file, filtered_taxonomy_file, otts): with open_file_based_on_extension(filtered_taxonomy_file, "wt") as filtered_taxonomy: for i, line in enumerate_lines_from_file(taxonomy_file): - # Always copy the header if i == 0: filtered_taxonomy.write(line) continue - # The ott id is the first column (known as the "uid" in the tsv 
file) fields = line.split("\t") ott = fields[0] - # Only include lines that have an ott id in the filtered tree - if ott in context.otts: + if ott in otts: filtered_taxonomy.write(line) -def read_taxonomy_file(taxonomy_file, context): - sources = {"ncbi", "if", "worms", "irmng", "gbif"} - context.source_ids = {source: set() for source in sources} - - # Get the sets of source ids we're actually using from the taxonomy file - with open_file_based_on_extension(taxonomy_file, "rt") as f: - reader = csv.DictReader(f, delimiter="\t") - for OTTrow in reader: - sourceinfo = OTTrow["sourceinfo"] - for srcs in sourceinfo.split(","): - src, src_id = srcs.split(":", 1) - if src in sources: - try: - context.source_ids[src].add(int(src_id)) - except ValueError: - # Ignore it if it's not an integer - pass - - -def generate_filtered_eol_id_file(eol_id_file, filtered_eol_id_file, context): - eol_sources = {"676": "ncbi", "459": "worms", "767": "gbif", str(iucn_num): "iucn"} - iucn_lines = [] - known_names = set() - - with open_file_based_on_extension(eol_id_file, "rt") as eol_f: - with open_file_based_on_extension(filtered_eol_id_file, "wt") as filtered_eol_f: - for i, line in enumerate(eol_f): - # Always copy the header - if i == 0: - filtered_eol_f.write(line) - continue - - fields = line.split(",") - - # Ignore it if it's not one of the known sources - if fields[2] not in eol_sources: - continue - - try: - eol_id = int(fields[1]) - except ValueError: - # Some lines have the eol_id set to a weird value, e.g. 
- # "Animalia/Arthropoda/Malacostraca/Cumacea/Pseudocumatidae/Strauchia" - # We ignore these - continue - - if not context.clade: - # If we're not filtering by clade, keep all the lines - filtered_eol_f.write(line) - continue - - # If it's an IUCN line, just save it for now - if fields[2] == str(iucn_num): - iucn_lines.append(line) - # For other providers, only include it if we saw it in the taxonomy file - elif eol_id in context.source_ids[eol_sources[fields[2]]]: - filtered_eol_f.write(line) - known_names.add(fields[4]) - - # Include any IUCN lines that have a name that we encountered - for line in iucn_lines: - fields = line.split(",") - if fields[4] in known_names: - filtered_eol_f.write(line) - - logging.info( - f"Found {len(context.source_ids['ncbi'])} NCBI ids, " - f"{len(context.source_ids['if'])} IF ids, " - f"{len(context.source_ids['worms'])} WoRMS ids, " - f"{len(context.source_ids['irmng'])} IRMNG ids, " - f"{len(context.source_ids['gbif'])} GBIF ids" - ) - - -def generate_filtered_wikidata_dump(wikipedia_dump_file, filtered_wikipedia_dump_file, context): - # This mask defines which fields we want to keep from the wikidata dump - # The goal is to keep it structurally the same as the original, but only - # include the fields we actually consume - mask = { - "type": KEEP, # Only needed for the quick 'startswith()' line check - "id": KEEP, - "labels": {"en": {"value": KEEP}}, - "claims": { - "P31": [ - { - "mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}, - "qualifiers": { - "P642": [{"datavalue": {"value": {"numeric-id": KEEP}}}] - }, # "of" (applies within the scope of a particular item) - } - ], # Instance of - "P685": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # ncbi id - "P846": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # gbif id - "P850": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # worms id - "P1391": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # if id - "P5055": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # irmng 
id - "P830": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # EOL id - "P961": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # IPNI id - "P9157": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # OTT id - "P3151": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # iNaturalist id - "P141": [{"references": [{"snaks": {"P627": [{"datavalue": {"value": KEEP}}]}}]}], # IUCN id - "P1420": [{"mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}}], # taxon synonym - "P18": [ - { - "mainsnak": {"datavalue": {"value": KEEP}}, - "rank": KEEP, - } - ], # image - "P1843": [ - { - "mainsnak": {"datavalue": {"value": KEEP}}, - "rank": KEEP, - } - ], # taxon common name (aka vernaculars) - }, - "sitelinks": {ANY: {"title": KEEP}}, - } - - sitelinks_key = f"{context.wikilang}wiki" - - def trim_and_write_json_item(json_item, filtered_wiki_f): - # Remove everything we don't need from the json - apply_mask_to_object_graph(json_item, mask) - - # Only keep the sitelinks that end in "wiki", e.g. enwiki, dewiki, etc. - # (leave out those ending in "wikiquote", "wikivoyage", "wikinews", "wikibooks", etc.) - if context.dont_trim_sitelinks: - # Keep the full sitelinks value for all languages if flag is passed - json_item["sitelinks"] = {k: v for k, v in json_item["sitelinks"].items() if k.endswith("wiki")} - else: - # Otherwise only keep the original value for the language we want, since the - # rest is just needed to collect the language names into the bit field - # Also, limit the sitelinks to the languages we care about for the bit field - json_item["sitelinks"] = { - k: v if k == sitelinks_key else {} - for k, v in json_item["sitelinks"].items() - if k.endswith("wiki") and len(k) == 6 and k[:2] in wikiflags - } - - # Write out a line. 
We set the separators to avoid spaces - filtered_wiki_f.write(json.dumps(json_item, separators=(",", ":"))) - filtered_wiki_f.write(",\n") - - included_qids = set() - - # Keep track of vernaculars and taxon synonyms that we might want to include at the end - # There are only a few hundred, so memory isn't an issue. - potential_extra_json_items = [] - - with open_file_based_on_extension(filtered_wikipedia_dump_file, "wt") as filtered_wiki_f: - filtered_wiki_f.write("[\n") - preserved_lines = 0 - - def get_line_message(line_num): - return f"Kept {preserved_lines}/{line_num} lines ({preserved_lines / line_num * 100:.2f}%)" - - for _, line in enumerate_lines_from_file(wikipedia_dump_file, 100000, get_line_message): - if not (line.startswith('{"type":') and quick_byte_match.search(line)): - continue - - json_item = json.loads(line.rstrip().rstrip(",")) - - try: - is_taxon, vernaculars_matches = find_taxon_and_vernaculars(json_item) - except KeyError: - continue - - # If it's neither, ignore it - if not is_taxon and not len(vernaculars_matches) > 0: - continue - - # When clade filter, we only want to keep the taxa that map to source ids. - # In addition, when it doesn't map to any, we want to track it if it's - # a synonym, so we may end up including it at the end. 
- if context.clade and is_taxon: - if not len(JSON_contains_known_dbID(json_item, context.source_ids)) > 0: - # Case 1: it could have taxon synonyms via P1420 - if "P1420" in json_item["claims"] and json_item["sitelinks"]: - potential_extra_json_items.append( - ( - "taxon_synonym", - {wikidata_value(i["mainsnak"])["numeric-id"] for i in json_item["claims"]["P1420"]}, - json_item, - ) - ) - # Case 2: it could have synonyms via a P642 in P31 - # Note: as this is a taxon, we're dealing with synonyms, not vernaculars, - # so the variable name is a bit misleading - if vernaculars_matches: - potential_extra_json_items.append(("instance_of_synonym", vernaculars_matches, json_item)) - continue - - if is_taxon: - trim_and_write_json_item(json_item, filtered_wiki_f) - - included_qids.add(Qid(json_item)) - - preserved_lines += 1 - else: - # If it's vernacular, we'll potentially write it out at the end, so save it - potential_extra_json_items.append(("vernacular", vernaculars_matches, json_item)) - - logging.info( - "Writing extra lines at the end of the file " f"(subset of {len(potential_extra_json_items)} lines)" - ) - - for desc, linked_qids, json_item in potential_extra_json_items: - for qid in linked_qids: - # Only write it if it maps to one of the entries we included above - if qid in included_qids: - trim_and_write_json_item(json_item, filtered_wiki_f) - logging.info( - f"Including {desc} entry: Q{Qid(json_item)} " - f"('{label(json_item)}','{get_wikipedia_name(json_item)}' => Q{qid}" - ) - break - - filtered_wiki_f.write("]\n") - - -def read_wikidata_dump(wikidata_dump_file, context): - context.wikidata_ids = set() - - for _, line in enumerate_lines_from_file(wikidata_dump_file): - if not line.startswith('{"type":'): - continue - - json_item = json.loads(line.rstrip().rstrip(",")) - context.wikidata_ids.add(get_wikipedia_name(json_item)) - - -def generate_filtered_wikipedia_sql_dump(wikipedia_sql_dump_file, filtered_wikipedia_sql_dump_file, context): - # the column 
numbers for each datum are specified in the SQL file, and hardcoded here. - page_table_namespace_column = 2 - page_table_title_column = 3 - page_is_redirect_column = 4 - page_table_pagelen_column = 10 - - with open_file_based_on_extension(filtered_wikipedia_sql_dump_file, "wt") as filtered_sql_f: - current_output_line_entry_count = 0 - max_entries_per_line = 10 - with open_file_based_on_extension(wikipedia_sql_dump_file, "rt") as sql_f: - pagelen_file = csv.reader(sql_f, quotechar="'", escapechar="\\", doublequote=False) - match_line = "INSERT INTO `page` VALUES " - for fields in filter( - lambda x: False if len(x) == 0 else x[0].startswith(match_line), - pagelen_file, - ): - field_num = 0 - # the records are all on the same line, separated by '),(', - # so we need to count fields into the line. - for field in fields: - try: - if field and field.lstrip()[0] == "(": - field_num = 0 - namespace = None - title = None - is_redirect = "0" - except IndexError: - pass - field_num += 1 - if field_num == page_table_namespace_column: - namespace = field - if field_num == page_table_title_column: - title = field - if field_num == page_is_redirect_column: - is_redirect = field - elif field_num == page_table_pagelen_column and namespace == "0": - # Only include it if it's one of our wikidata ids - if title in context.wikidata_ids: - if current_output_line_entry_count == 0: - filtered_sql_f.write(match_line) - else: - filtered_sql_f.write(",") - - # Escape the quotes in the title - title = title.replace("'", "\\'") - - # We leave all the other fields empty, as we don't need them - # e.g. 
(,0,'Pan_paniscus',0,,,,,,87,,) - filtered_sql_f.write(f"(,{namespace},'{title}',{is_redirect},,,,,,{field},,)") - - current_output_line_entry_count += 1 - if current_output_line_entry_count == max_entries_per_line: - filtered_sql_f.write(";\n") - current_output_line_entry_count = 0 - - -# If it's quoted, remove the quotes and unescape it -def unquote_if_quoted(s): - if s.startswith("'") and s.endswith("'") or s.startswith('"') and s.endswith('"'): - s = s[1:-1] - return bytes(s, "utf-8").decode("unicode_escape") - return s - - -def generate_filtered_pageviews_file(pageviews_file, filtered_pageviews_file, context): - match_project = context.wikilang + ".wikipedia " - - pageviews = defaultdict(int) - simplified_line_format = False - - for i, line in enumerate_lines_from_file(pageviews_file): - # Check if it's the simplified format based on the first line. - # - Simplified format: - # - Looks like: Chimpanzee 78033 - # - We process all lines - # - Only one line for a given taxon - # - Full format (original format from wikipedia): - # - Looks like: en.wikipedia Chimpanzee 7844 mobile-web 50018 A1581B168[etc...] - # - We ignore lines that don't start with en.wikipedia - # - There can be multiple lines for a given taxon (e.g. 
mobile vs desktop views) - if i == 0: - simplified_line_format = line.count(" ") == 1 - - if i > 0 and i % 10000000 == 0: - logging.info(f"Processed {i} lines") - - if not simplified_line_format and not line.startswith(match_project): - continue - - info = line.split(" ") - if simplified_line_format: - title = info[0] - views = info[1] - else: - title = unquote_if_quoted(info[1]) - views = info[4] - - # Only include it if it's one of our wikidata ids - if title in context.wikidata_ids: - pageviews[title] += int(views) - - # Write out the filtered pageviews in the simplified format - with open_file_based_on_extension(filtered_pageviews_file, "wt") as filtered_f: - for title, views in pageviews.items(): - filtered_f.write(title + " " + str(views) + "\n") - - def generate_all_filtered_files( context, newick_file, @@ -478,40 +81,53 @@ def generate_all_filtered_files( wikipedia_sql_dump_file, wikipedia_pageviews_files, ): + """ + Orchestrate all filtering steps. Used for clade-specific test filtering + and as a convenience wrapper. For the DVC pipeline, the individual filter + modules are invoked as separate stages instead. 
+ """ + prefix = context.clade or one_zoom_file_prefix + if context.clade: - # If we're filtering by clade, we need to generate a filtered newick - filtered_newick_file = generate_and_cache_filtered_file(newick_file, context, generate_filtered_newick) - read_newick_file(filtered_newick_file, context) + filtered_newick_file = _compute_output_path(newick_file, prefix, context.compress) + generate_filtered_newick(newick_file, filtered_newick_file, context.clade) + otts = read_newick_otts(filtered_newick_file) - # We also need to generate a filtered taxonomy file - filtered_taxonomy_file = generate_and_cache_filtered_file( - taxonomy_file, context, generate_filtered_taxonomy_file - ) + filtered_taxonomy_file = _compute_output_path(taxonomy_file, prefix, context.compress) + generate_filtered_taxonomy_file(taxonomy_file, filtered_taxonomy_file, otts) else: - # If we're not filtering by clade, there is really nothing to filter, - # so we just use the original taxonomy file directly. - # Note that we completely ignore the newick file in this case. 
filtered_taxonomy_file = taxonomy_file - read_taxonomy_file(filtered_taxonomy_file, context) - - generate_and_cache_filtered_file(eol_id_file, context, generate_filtered_eol_id_file) - if os.path.basename(wikidata_dump_file).startswith(one_zoom_file_prefix) and not context.clade: - filtered_wikidata_dump_file = wikidata_dump_file - else: - filtered_wikidata_dump_file = generate_and_cache_filtered_file( - wikidata_dump_file, context, generate_filtered_wikidata_dump + source_ids = read_taxonomy_source_ids(filtered_taxonomy_file) + + if eol_id_file: + eol_output = _compute_output_path(eol_id_file, prefix, context.compress) + filter_eol_ids(eol_id_file, eol_output, source_ids, clade=context.clade) + + if wikidata_dump_file: + wikidata_output = _compute_output_path(wikidata_dump_file, prefix, context.compress) + wikidata_titles = filter_wikidata( + wikidata_dump_file, + wikidata_output, + source_ids=source_ids if context.clade else None, + clade=context.clade, + wikilang=context.wikilang, + dont_trim_sitelinks=context.dont_trim_sitelinks, ) - read_wikidata_dump(filtered_wikidata_dump_file, context) + else: + wikidata_titles = set() - generate_and_cache_filtered_file(wikipedia_sql_dump_file, context, generate_filtered_wikipedia_sql_dump) + if wikipedia_sql_dump_file: + sql_output = _compute_output_path(wikipedia_sql_dump_file, prefix, context.compress) + filter_wikipedia_sql(wikipedia_sql_dump_file, sql_output, wikidata_titles) - for wikipedia_pageviews_file in wikipedia_pageviews_files: - generate_and_cache_filtered_file(wikipedia_pageviews_file, context, generate_filtered_pageviews_file) + if wikipedia_pageviews_files: + for pv_file in wikipedia_pageviews_files: + pv_output = _compute_output_path(pv_file, prefix, context.compress) + filter_pageviews(pv_file, pv_output, wikidata_titles, wikilang=context.wikilang) def process_args(args): - # Create a context object to hold various things we need to pass around context = type( "", (object,), @@ -550,8 +166,8 @@ def 
main(): help="The OpenTree taxonomy.tsv file, from http://files.opentreeoflife.org/ott/", ) parser.add_argument( - "EOLidentifiers", - help=("The gzipped EOL identifiers file, from " "https://opendata.eol.org/dataset/identifiers-csv-gz"), + "--EOLidentifiers", + help="The gzipped EOL identifiers file (optional, previously from opendata.eol.org)", ) parser.add_argument( "wikidataDumpFile", @@ -593,7 +209,7 @@ def main(): "-f", action=argparse.BooleanOptionalAction, default=False, - help="If true, forces the regeneration of all files, ignoring caching.", + help="If true, forces the regeneration of all files (ignored, kept for CLI compatibility).", ) parser.add_argument( "--dont_trim_sitelinks", diff --git a/params.yaml b/params.yaml new file mode 100644 index 0000000..d1281e1 --- /dev/null +++ b/params.yaml @@ -0,0 +1,6 @@ +oz_tree: AllLife +ot_version: "15.1" +ot_taxonomy_version: "3.7" +ot_taxonomy_extra: "draft2" +build_version: 28017344 +exclude_from_popularity: Archosauria_ott335588 Dinosauria_ott90215 diff --git a/pyproject.toml b/pyproject.toml index 62dc6b4..c2c927f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "cryptography>=42.0", "mwparserfromhell>=0.6.6", "requests-cache>=1.2.1", + "dvc>=3.0", ] [project.optional-dependencies] @@ -39,6 +40,10 @@ add_ott_numbers_to_trees = "oz_tree_build.tree_build.ott_mapping.add_ott_numbers build_oz_tree = "oz_tree_build.tree_build.build_oz_tree:main" get_open_trees_from_one_zoom = "oz_tree_build.tree_build.get_open_trees_from_one_zoom:main" generate_filtered_files = "oz_tree_build.utilities.generate_filtered_files:main" +filter_eol = "oz_tree_build.utilities.filter_eol:main" +filter_wikidata = "oz_tree_build.utilities.filter_wikidata:main" +filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" +filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" 
get_wiki_images = "oz_tree_build.images_and_vernaculars.get_wiki_images:main" process_image_bits = "oz_tree_build.images_and_vernaculars.process_image_bits:main" From 77fac7922fc9bd20fcf96d55d5127721ed940fa0 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Mon, 23 Mar 2026 19:42:49 +0000 Subject: [PATCH 02/19] Keep the output dirs --- .gitignore | 4 ---- data/filtered/.gitignore | 6 ++++++ data/js_outputs/.gitignore | 6 ++++++ 3 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 data/filtered/.gitignore create mode 100644 data/js_outputs/.gitignore diff --git a/.gitignore b/.gitignore index cb15a51..2cd3896 100644 --- a/.gitignore +++ b/.gitignore @@ -164,7 +164,3 @@ cython_debug/ # sqlite storage.sqlite - -# DVC pipeline outputs (tracked by DVC, not git) -/data/filtered/ -/data/js_output/ diff --git a/data/filtered/.gitignore b/data/filtered/.gitignore new file mode 100644 index 0000000..4ba3b80 --- /dev/null +++ b/data/filtered/.gitignore @@ -0,0 +1,6 @@ +# Ignore everything +* + +# But not these files... +!.gitignore +!*.dvc \ No newline at end of file diff --git a/data/js_outputs/.gitignore b/data/js_outputs/.gitignore new file mode 100644 index 0000000..4ba3b80 --- /dev/null +++ b/data/js_outputs/.gitignore @@ -0,0 +1,6 @@ +# Ignore everything +* + +# But not these files... 
+!.gitignore +!*.dvc \ No newline at end of file From 484320f83f5347e3bfb658ec5f7c2b4be8ec6caa Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Mon, 23 Mar 2026 19:50:50 +0000 Subject: [PATCH 03/19] Split wiki title extraction into separate step --- dvc.yaml | 10 ++++++- oz_tree_build/utilities/filter_wikidata.py | 30 +++++++++---------- .../utilities/generate_filtered_files.py | 5 ++-- pyproject.toml | 1 + 4 files changed, 28 insertions(+), 18 deletions(-) diff --git a/dvc.yaml b/dvc.yaml index 3b32111..bfe9ac3 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -96,11 +96,19 @@ stages: filter_wikidata data/Wiki/wd_JSON/latest-all.json.bz2 -o data/filtered/OneZoom_latest-all.json - --titles-output data/filtered/wikidata_titles.txt deps: - data/Wiki/wd_JSON/latest-all.json.bz2 outs: - data/filtered/OneZoom_latest-all.json + + extract_wikidata_titles: + cmd: >- + extract_wikidata_titles + data/filtered/OneZoom_latest-all.json + -o data/filtered/wikidata_titles.txt + deps: + - data/filtered/OneZoom_latest-all.json + outs: - data/filtered/wikidata_titles.txt filter_sql: diff --git a/oz_tree_build/utilities/filter_wikidata.py b/oz_tree_build/utilities/filter_wikidata.py index fb294ff..2db5565 100644 --- a/oz_tree_build/utilities/filter_wikidata.py +++ b/oz_tree_build/utilities/filter_wikidata.py @@ -72,9 +72,6 @@ def filter_wikidata( """ Filter the wikidata JSON dump, keeping only taxon and vernacular items, and trimming each item to only the fields we consume. - - Returns the set of Wikipedia page titles found in the filtered output - (the ``wikidata_ids`` set used by downstream SQL and pageview filters). 
""" sitelinks_key = f"{wikilang}wiki" @@ -157,10 +154,6 @@ def get_line_message(line_num): filtered_wiki_f.write("]\n") - # Re-read the filtered output to extract the set of Wikipedia page titles - wikidata_titles = extract_wikidata_titles(output_file) - return wikidata_titles - def extract_wikidata_titles(filtered_wikidata_file): """ @@ -195,11 +188,6 @@ def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("wikidata_file", help="The wikidata JSON dump file (bz2 or plain)") parser.add_argument("-o", "--output", required=True, help="Output path for filtered wikidata JSON") - parser.add_argument( - "--titles-output", - required=True, - help="Output path for wikidata_titles.txt sidecar (one Wikipedia title per line)", - ) parser.add_argument("--wikilang", default="en", help="Wikipedia language code") parser.add_argument( "--dont-trim-sitelinks", @@ -209,14 +197,26 @@ def main(): ) args = parser.parse_args() - titles = filter_wikidata( + filter_wikidata( args.wikidata_file, args.output, wikilang=args.wikilang, dont_trim_sitelinks=args.dont_trim_sitelinks, ) - write_titles_file(titles, args.titles_output) - logging.info(f"Wrote {len(titles)} titles to {args.titles_output}") + + +def extract_titles_main(): + """Extract Wikipedia page titles from a filtered wikidata JSON file.""" + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + parser = argparse.ArgumentParser(description=extract_titles_main.__doc__) + parser.add_argument("filtered_wikidata_file", help="The filtered wikidata JSON file") + parser.add_argument("-o", "--output", required=True, help="Output path for wikidata_titles.txt") + args = parser.parse_args() + + titles = extract_wikidata_titles(args.filtered_wikidata_file) + write_titles_file(titles, args.output) + logging.info(f"Wrote {len(titles)} titles to {args.output}") if __name__ == "__main__": diff --git a/oz_tree_build/utilities/generate_filtered_files.py b/oz_tree_build/utilities/generate_filtered_files.py 
index cfc9f4e..8ab0b88 100644 --- a/oz_tree_build/utilities/generate_filtered_files.py +++ b/oz_tree_build/utilities/generate_filtered_files.py @@ -21,7 +21,7 @@ from .filter_common import read_taxonomy_source_ids from .filter_eol import filter_eol_ids from .filter_pageviews import filter_pageviews -from .filter_wikidata import filter_wikidata +from .filter_wikidata import extract_wikidata_titles, filter_wikidata from .filter_wikipedia_sql import filter_wikipedia_sql __author__ = "David Ebbo" @@ -106,7 +106,7 @@ def generate_all_filtered_files( if wikidata_dump_file: wikidata_output = _compute_output_path(wikidata_dump_file, prefix, context.compress) - wikidata_titles = filter_wikidata( + filter_wikidata( wikidata_dump_file, wikidata_output, source_ids=source_ids if context.clade else None, @@ -114,6 +114,7 @@ def generate_all_filtered_files( wikilang=context.wikilang, dont_trim_sitelinks=context.dont_trim_sitelinks, ) + wikidata_titles = extract_wikidata_titles(wikidata_output) else: wikidata_titles = set() diff --git a/pyproject.toml b/pyproject.toml index c2c927f..43b1d3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ get_open_trees_from_one_zoom = "oz_tree_build.tree_build.get_open_trees_from_one generate_filtered_files = "oz_tree_build.utilities.generate_filtered_files:main" filter_eol = "oz_tree_build.utilities.filter_eol:main" filter_wikidata = "oz_tree_build.utilities.filter_wikidata:main" +extract_wikidata_titles = "oz_tree_build.utilities.filter_wikidata:extract_titles_main" filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" From 73d5f6943ca8040621c6fb364fbb5621b701b1fc Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Tue, 24 Mar 2026 07:43:47 +0000 Subject: [PATCH 04/19] Run wikidata extraction in dvc --- dvc.lock | 28 ++++++++++++++++++++++ 
oz_tree_build/utilities/filter_wikidata.py | 4 +++- 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 dvc.lock diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..1772da3 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,28 @@ +schema: '2.0' +stages: + filter_wikidata: + cmd: filter_wikidata data/Wiki/wd_JSON/latest-all.json.bz2 -o + data/filtered/OneZoom_latest-all.json + deps: + - path: data/Wiki/wd_JSON/latest-all.json.bz2 + hash: md5 + md5: 9d2253e0b9e629c8f77eadd8f09c1d88 + size: 100346119273 + outs: + - path: data/filtered/OneZoom_latest-all.json + hash: md5 + md5: 65b22a5c761c78d79b30faf871d1e404 + size: 1542990225 + extract_wikidata_titles: + cmd: extract_wikidata_titles data/filtered/OneZoom_latest-all.json -o + data/filtered/wikidata_titles.txt + deps: + - path: data/filtered/OneZoom_latest-all.json + hash: md5 + md5: 65b22a5c761c78d79b30faf871d1e404 + size: 1542990225 + outs: + - path: data/filtered/wikidata_titles.txt + hash: md5 + md5: e498b85311c8a84a0d5157a0bbbcb23f + size: 9382189 diff --git a/oz_tree_build/utilities/filter_wikidata.py b/oz_tree_build/utilities/filter_wikidata.py index 2db5565..eaf8e6b 100644 --- a/oz_tree_build/utilities/filter_wikidata.py +++ b/oz_tree_build/utilities/filter_wikidata.py @@ -165,7 +165,9 @@ def extract_wikidata_titles(filtered_wikidata_file): if not line.startswith('{"type":'): continue json_item = json.loads(line.rstrip().rstrip(",")) - titles.add(get_wikipedia_name(json_item)) + title = get_wikipedia_name(json_item) + if title is not None: + titles.add(title) return titles From 6ceee5bd1647dc8325dc6b2848104067daa67ac6 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Thu, 26 Mar 2026 08:22:56 +0000 Subject: [PATCH 05/19] Automate download and filter of pageviews --- data/Wiki/README.markdown | 12 +- dvc.lock | 32 +++ dvc.yaml | 10 +- .../download_and_filter_pageviews.py | 260 ++++++++++++++++++ oz_tree_build/utilities/filter_pageviews.py | 26 +- pyproject.toml | 1 + 6 files changed, 327 
insertions(+), 14 deletions(-) create mode 100644 oz_tree_build/utilities/download_and_filter_pageviews.py diff --git a/data/Wiki/README.markdown b/data/Wiki/README.markdown index 13bd3cb..1c6fa6a 100755 --- a/data/Wiki/README.markdown +++ b/data/Wiki/README.markdown @@ -1,4 +1,4 @@ -To allow mappings to wikipedia and popularity calculations, the following three files +To allow mappings to wikipedia and popularity calculations, the following files should be uploaded to their respective directories (NB: these could be symlinks to versions on external storage) @@ -6,9 +6,13 @@ versions on external storage) (download from ) * The `wp_SQL` directory should contain the en.wikipedia SQL dump file, as `enwiki-latest-page.sql.gz` (download from ) -* The `wp_pagecounts` directory should contain the wikipedia pagevisits dump files: -multiple files such as `wp_pagecounts/pageviews-202403-user.bz2` etc... -(download from ). + +Wikipedia pageview files are downloaded and filtered automatically by the +`download_and_filter_pageviews` pipeline stage. It streams monthly `-user` dumps +from <https://dumps.wikimedia.org/other/pageview_complete/monthly/>, filters them +against the wikidata titles, and caches the small filtered outputs. Only the most +recent N months (configured via `--months` in the DVC stage) are processed. To +pick up newly published months, run `dvc repro --force download_and_filter_pageviews`. These files are used as inputs to the DVC pipeline's filtering stages.
If someone has already run the pipeline and pushed results to the DVC remote, you do not need diff --git a/dvc.lock b/dvc.lock index 1772da3..4a855d9 100644 --- a/dvc.lock +++ b/dvc.lock @@ -26,3 +26,35 @@ stages: hash: md5 md5: e498b85311c8a84a0d5157a0bbbcb23f size: 9382189 + download_and_filter_pageviews: + cmd: download_and_filter_pageviews --titles-file + data/filtered/wikidata_titles.txt --months 12 -o data/filtered/pageviews + deps: + - path: data/filtered/wikidata_titles.txt + hash: md5 + md5: e498b85311c8a84a0d5157a0bbbcb23f + size: 9382189 + outs: + - path: data/filtered/pageviews/ + hash: md5 + md5: f021afa12f9d7c893412a0b2980ab187.dir + size: 104535557 + nfiles: 13 + filter_sql: + cmd: filter_wikipedia_sql data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + data/filtered/wikidata_titles.txt -o + data/filtered/OneZoom_enwiki-latest-page.sql + deps: + - path: data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + hash: md5 + md5: 85108d569d644c3d34d8b930597e42dc + size: 2384405614 + - path: data/filtered/wikidata_titles.txt + hash: md5 + md5: e498b85311c8a84a0d5157a0bbbcb23f + size: 9382189 + outs: + - path: data/filtered/OneZoom_enwiki-latest-page.sql + hash: md5 + md5: ac10389b87372eff1d276802dbb345a5 + size: 21411110 diff --git a/dvc.yaml b/dvc.yaml index bfe9ac3..3729698 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -123,17 +123,17 @@ stages: outs: - data/filtered/OneZoom_enwiki-latest-page.sql - filter_pageviews: + download_and_filter_pageviews: cmd: >- - filter_pageviews - data/Wiki/wp_pagecounts/pageviews*.bz2 + download_and_filter_pageviews --titles-file data/filtered/wikidata_titles.txt + --months 12 -o data/filtered/pageviews deps: - - data/Wiki/wp_pagecounts/ - data/filtered/wikidata_titles.txt outs: - - data/filtered/pageviews/ + - data/filtered/pageviews/: + persist: true create_tables: cmd: >- diff --git a/oz_tree_build/utilities/download_and_filter_pageviews.py b/oz_tree_build/utilities/download_and_filter_pageviews.py new file mode 100644 index 0000000..c0ddc95 
--- /dev/null +++ b/oz_tree_build/utilities/download_and_filter_pageviews.py @@ -0,0 +1,260 @@ +"""Download pageview files from Wikimedia and filter to wikidata titles. + +Streams monthly pageview dumps directly from +https://dumps.wikimedia.org/other/pageview_complete/monthly/, +decompresses on the fly, and writes only the filtered results to disk. +Already-filtered months are skipped unless the titles file has changed. +""" + +import argparse +import bz2 +import codecs +import hashlib +import itertools +import logging +import os +import re +import shutil +import subprocess +import sys +import tempfile +import urllib.request + +from .filter_pageviews import filter_pageview_lines, write_filtered_pageviews +from .filter_wikidata import load_titles_file + +BASE_URL = "https://dumps.wikimedia.org/other/pageview_complete/monthly/" +TITLES_HASH_FILE = ".titles_hash" +WGET_READ_TIMEOUT = 120 # seconds of no data before wget gives up + + +def _fetch_index(url): + """Fetch an Apache directory index page and return its HTML.""" + print(f"Fetching index from {url}") + req = urllib.request.Request(url, headers={"User-Agent": "OneZoom-tree-build/1.0"}) + with urllib.request.urlopen(req, timeout=30) as resp: + return resp.read().decode("utf-8") + + +def discover_pageview_months(base_url=BASE_URL): + """ + Crawl the Wikimedia monthly pageview directory listing and yield + (url, filename) tuples for ``*-user.bz2`` files, most recent first. + + Iterates years and months in reverse so callers needing only the N most + recent months can stop early without fetching every index page. 
+ """ + year_pattern = re.compile(r'href="(\d{4}/)"') + month_dir_pattern = re.compile(r'href="(\d{4}-\d{2}/)"') + user_file_pattern = re.compile(r'href="(pageviews-\d{6}-user\.bz2)"') + + years_html = _fetch_index(base_url) + year_dirs = sorted( + (m.group(1) for m in year_pattern.finditer(years_html)), reverse=True + ) + + for year_dir in year_dirs: + year_url = base_url + year_dir + + months_html = _fetch_index(year_url) + month_dirs = sorted( + (m.group(1) for m in month_dir_pattern.finditer(months_html)), + reverse=True, + ) + + for month_dir in month_dirs: + month_url = year_url + month_dir + + files_html = _fetch_index(month_url) + for file_match in user_file_pattern.finditer(files_html): + filename = file_match.group(1) + file_url = month_url + filename + yield file_url, filename + + +def _stream_bz2_lines(url): + """ + Stream a .bz2 file over HTTP via wget and yield decompressed lines. + wget handles timeouts and connection management; Python handles decompression. + """ + if not shutil.which("wget"): + raise RuntimeError("wget is required but not found on PATH") + + wget = subprocess.Popen( + [ + "wget", "-q", "-O", "-", + "--connect-timeout=30", + f"--read-timeout={WGET_READ_TIMEOUT}", + "--header=User-Agent: OneZoom-tree-build/1.0", + url, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + decompressor = bz2.BZ2Decompressor() + decoder = codecs.getincrementaldecoder("utf-8")("replace") + line_buf = "" + bytes_read = 0 + + try: + while True: + chunk = wget.stdout.read(1024 * 1024) + if not chunk: + break + bytes_read += len(chunk) + if bytes_read % (500 * 1024 * 1024) < 1024 * 1024: + logging.info(f" Downloaded {bytes_read / (1024**3):.1f} GB so far...") + try: + raw = decompressor.decompress(chunk) + except EOFError: + break + text = decoder.decode(raw) + line_buf += text + parts = line_buf.split("\n") + line_buf = parts[-1] + for line in parts[:-1]: + yield line + + trailing = decoder.decode(b"", final=True) + line_buf += trailing + if 
line_buf: + yield line_buf + finally: + wget.stdout.close() + if wget.poll() is None: + wget.terminate() + rc = wget.wait() + stderr_out = wget.stderr.read().decode("utf-8", errors="replace").strip() + wget.stderr.close() + if rc != 0: + raise RuntimeError(f"wget failed (exit {rc}): {stderr_out}") + + +def _compute_file_hash(path): + """Return the SHA-256 hex digest of a file's contents.""" + h = hashlib.sha256() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + h.update(chunk) + return h.hexdigest() + + +def _output_filename(pageview_filename): + """Map a raw pageview filename to its filtered output name.""" + basename = pageview_filename + if basename.endswith(".bz2"): + basename = basename[:-4] + return f"OneZoom_{basename}" + + +def _check_and_update_titles_hash(output_dir, titles_file): + """ + Compare the stored titles hash with the current file. Returns True if + the cache is still valid. Clears existing output files and updates the + hash when it changes. + """ + current_hash = _compute_file_hash(titles_file) + hash_path = os.path.join(output_dir, TITLES_HASH_FILE) + + if os.path.exists(hash_path): + with open(hash_path) as f: + stored_hash = f.read().strip() + if stored_hash == current_hash: + return True + logging.info("Titles file changed -- clearing cached pageview outputs") + for name in os.listdir(output_dir): + if name == TITLES_HASH_FILE: + continue + os.remove(os.path.join(output_dir, name)) + + with open(hash_path, "w") as f: + f.write(current_hash) + return False + + +def stream_and_filter(url, output_path, wikidata_titles, wikilang="en"): + """ + Stream a remote .bz2 pageview file, filter it, and write the result. + Uses a temp file + rename for atomicity. 
+ """ + lines = _stream_bz2_lines(url) + pageviews = filter_pageview_lines(lines, wikidata_titles, wikilang) + + dir_name = os.path.dirname(output_path) + fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp") + os.close(fd) + try: + write_filtered_pageviews(pageviews, tmp_path) + os.replace(tmp_path, output_path) + except BaseException: + if os.path.exists(tmp_path): + os.remove(tmp_path) + raise + + +def main(): + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--titles-file", + required=True, + help="wikidata_titles.txt file (one title per line)", + ) + parser.add_argument( + "--months", + type=int, + required=True, + help="Number of most recent months to process", + ) + parser.add_argument( + "-o", + "--output-dir", + required=True, + help="Output directory for filtered pageview files", + ) + parser.add_argument("--wikilang", default="en", help="Wikipedia language code") + parser.add_argument( + "--base-url", + default=BASE_URL, + help="Base URL for the Wikimedia pageview dumps", + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + cache_valid = _check_and_update_titles_hash(args.output_dir, args.titles_file) + if cache_valid: + logging.info("Titles file unchanged -- cached outputs are valid") + else: + logging.info("Titles file is new or changed -- will reprocess all months") + + wikidata_titles = load_titles_file(args.titles_file) + logging.info(f"Loaded {len(wikidata_titles)} wikidata titles") + + logging.info("Discovering available pageview months from Wikimedia...") + selected = list(itertools.islice( + discover_pageview_months(args.base_url), args.months + )) + selected.reverse() + logging.info( + f"Selected {len(selected)} most recent months" + ) + + for i, (url, filename) in enumerate(selected, 1): + output_file = os.path.join(args.output_dir, _output_filename(filename)) + + if os.path.exists(output_file): + 
logging.info(f"[{i}/{len(selected)}] Skipping {filename} (already filtered)") + continue + + logging.info(f"[{i}/{len(selected)}] Streaming and filtering {filename}...") + stream_and_filter(url, output_file, wikidata_titles, wikilang=args.wikilang) + logging.info(f"[{i}/{len(selected)}] Done: {output_file}") + + logging.info("All pageview months up to date") + + +if __name__ == "__main__": + main() diff --git a/oz_tree_build/utilities/filter_pageviews.py b/oz_tree_build/utilities/filter_pageviews.py index 8b766ca..e539cf8 100644 --- a/oz_tree_build/utilities/filter_pageviews.py +++ b/oz_tree_build/utilities/filter_pageviews.py @@ -17,17 +17,17 @@ def unquote_if_quoted(s): return s -def filter_pageviews(pageviews_file, output_file, wikidata_titles, wikilang="en"): +def filter_pageview_lines(lines, wikidata_titles, wikilang="en"): """ - Filter a single pageview file, keeping only entries whose title appears - in the wikidata_titles set. Aggregates views per title and writes output - in the simplified format (``Title viewcount``). + Filter an iterable of pageview lines, keeping only entries whose title + appears in the wikidata_titles set. Returns a dict mapping title to + aggregated view count. 
""" match_project = wikilang + ".wikipedia " pageviews = defaultdict(int) simplified_line_format = False - for i, line in enumerate_lines_from_file(pageviews_file): + for i, line in enumerate(lines): if i == 0: simplified_line_format = line.count(" ") == 1 @@ -48,11 +48,27 @@ def filter_pageviews(pageviews_file, output_file, wikidata_titles, wikilang="en" if title in wikidata_titles: pageviews[title] += int(views) + return pageviews + + +def write_filtered_pageviews(pageviews, output_file): + """Write aggregated pageview counts to file in ``Title viewcount`` format.""" with open_file_based_on_extension(output_file, "wt") as filtered_f: for title, views in pageviews.items(): filtered_f.write(title + " " + str(views) + "\n") +def filter_pageviews(pageviews_file, output_file, wikidata_titles, wikilang="en"): + """ + Filter a single pageview file, keeping only entries whose title appears + in the wikidata_titles set. Aggregates views per title and writes output + in the simplified format (``Title viewcount``). 
+ """ + lines = (line for _, line in enumerate_lines_from_file(pageviews_file)) + pageviews = filter_pageview_lines(lines, wikidata_titles, wikilang) + write_filtered_pageviews(pageviews, output_file) + + def main(): logging.basicConfig(stream=sys.stderr, level=logging.INFO) diff --git a/pyproject.toml b/pyproject.toml index 43b1d3f..9e22965 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ filter_wikidata = "oz_tree_build.utilities.filter_wikidata:main" extract_wikidata_titles = "oz_tree_build.utilities.filter_wikidata:extract_titles_main" filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" +download_and_filter_pageviews = "oz_tree_build.utilities.download_and_filter_pageviews:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" get_wiki_images = "oz_tree_build.images_and_vernaculars.get_wiki_images:main" process_image_bits = "oz_tree_build.images_and_vernaculars.process_image_bits:main" From 783ce339b91f2f2f263e6d17905396224c66a6b2 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Thu, 26 Mar 2026 08:44:24 +0000 Subject: [PATCH 06/19] Script the OpenTree download --- README.markdown | 14 +- .../AllLife/OpenTreeParts/.gitignore | 1 + .../OpenTreeParts/OpenTree_all/.gitignore | 2 - data/OpenTree/.gitignore | 1 - data/OpenTree/README.markdown | 32 ++-- data/README.markdown | 6 +- dvc.lock | 65 ++++++++ dvc.yaml | 61 +++----- oz_tree_build/utilities/download_opentree.py | 146 ++++++++++++++++++ params.yaml | 4 +- pyproject.toml | 1 + 11 files changed, 260 insertions(+), 73 deletions(-) create mode 100644 data/OZTreeBuild/AllLife/OpenTreeParts/.gitignore delete mode 100644 data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/.gitignore create mode 100644 oz_tree_build/utilities/download_opentree.py diff --git a/README.markdown b/README.markdown index d160461..4b5241d 100644 --- a/README.markdown +++ 
b/README.markdown @@ -41,7 +41,7 @@ you will need a valid Azure Image cropping key in your appconfig.ini. ## Building the latest tree from OpenTree -This project uses [DVC](https://dvc.org/) for a cached, repeatable data pipeline. The build parameters (OpenTree version, taxonomy version, etc.) are defined in `params.yaml` and the pipeline stages are declared in `dvc.yaml`. +This project uses [DVC](https://dvc.org/) for a cached, repeatable data pipeline. The build parameters are defined in `params.yaml` and the pipeline stages are declared in `dvc.yaml`. ### Quick start (using cached outputs) @@ -56,17 +56,11 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If ### Full build (first time / updating source data) -1. Update `params.yaml` with the desired OpenTree version numbers. You can check the latest version via the [API](https://github.com/OpenTreeOfLife/germinator/wiki/Open-Tree-of-Life-Web-APIs): +1. Set `ot_version` in `params.yaml` to the desired OpenTree synthesis version (e.g. `"v16.1"`). Available versions can be found in the [synthesis manifest](https://raw.githubusercontent.com/OpenTreeOfLife/opentree/master/webapp/static/statistics/synthesis.json). The OpenTree tree and taxonomy will be downloaded automatically by the `download_opentree` pipeline stage. - ```bash - curl -s -X POST https://api.opentreeoflife.org/v3/tree_of_life/about | grep -E '"synth_id"|"taxonomy_version"' - ``` - -2. Download the required source files into `data/` as [documented here](data/README.markdown), then register them with DVC: +2. 
Download the other required source files into `data/` as [documented here](data/README.markdown), then register them with DVC: ```bash - dvc add data/OpenTree/labelled_supertree_simplified_ottnames.tre - dvc add data/OpenTree/ott3.7.tgz dvc add data/Wiki/wd_JSON/latest-all.json.bz2 dvc add data/Wiki/wp_SQL/enwiki-latest-page.sql.gz dvc add data/Wiki/wp_pagecounts/ @@ -86,7 +80,7 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If The pipeline is defined in `dvc.yaml`. Use `dvc dag` to visualize the DAG. Key stages include: -- **preprocess_opentree**, **unpack_taxonomy** -- prepare OpenTree data +- **download_opentree** -- download OpenTree synthesis tree and taxonomy - **add_ott_numbers**, **prepare_open_trees**, **build_tree** -- assemble the full newick tree - **filter_eol**, **filter_wikidata**, **filter_sql**, **filter_pageviews** -- filter massive source files (parallelizable) - **create_tables** -- map taxa, calculate popularity, produce DB-ready CSVs diff --git a/data/OZTreeBuild/AllLife/OpenTreeParts/.gitignore b/data/OZTreeBuild/AllLife/OpenTreeParts/.gitignore new file mode 100644 index 0000000..6681d35 --- /dev/null +++ b/data/OZTreeBuild/AllLife/OpenTreeParts/.gitignore @@ -0,0 +1 @@ +/OpenTree_all diff --git a/data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/.gitignore b/data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/data/OpenTree/.gitignore b/data/OpenTree/.gitignore index 47df0e7..45e8756 100755 --- a/data/OpenTree/.gitignore +++ b/data/OpenTree/.gitignore @@ -4,4 +4,3 @@ # But not these files... 
!.gitignore !README.markdown -!*.dvc \ No newline at end of file diff --git a/data/OpenTree/README.markdown b/data/OpenTree/README.markdown index 16d5b6b..a371470 100755 --- a/data/OpenTree/README.markdown +++ b/data/OpenTree/README.markdown @@ -1,26 +1,28 @@ ### Directory contents -Files herein are .gitignored. To get the site working, this folder should contain the following files (or symlinks to them) - -* `draftversionXXX.tre` -* `ottYYY/taxonomy.tsv` + +This folder contains versioned subdirectories of Open Tree of Life data, e.g. `v16.1/`. Each subdirectory is created by the `download_opentree` script and contains: + +* `labelled_supertree_simplified_ottnames.tre` -- the raw downloaded tree +* `draftversion.tre` -- the tree with `mrca***` labels removed and whitespace normalised +* `taxonomy.tsv` -- the OTT taxonomy file + +These subdirectories are .gitignored and tracked by DVC as pipeline outputs. ### How to get the files -* `draftversionXXX.tre` should contain an OpenTree newick file with simplified names and `mrca***` labels removed. This can be created from the OpenTree download file `labelled_supertree_simplified_ottnames.tre`. To get this file, you can either download the complete OpenTree distribution, or get the single necessary file by following the link from [https://tree.opentreeoflife.org/about/synthesis-release/](https://tree.opentreeoflife.org/about/synthesis-release/) to 'browse full output' then 'labelled_supertree/index.html' (usually at the end of the "Supertree algorithm" section). Make sure that you *don't* get the `...without_monotypic.tre` version, otherwise you will be missing some intermediate nodes, and the popularity ratings may suffer. - - Removing the `mrca***` labels can be done by using a simple regular expression substitution, as in the following perl command: - ``` - # assumes you have defined OT_VERSION as an environment variable, e.g. 
> OT_VERSION=14.7 - perl -pe 's/\)mrcaott\d+ott\d+/\)/g; s/[ _]+/_/g;' labelled_supertree_simplified_ottnames.tre > draftversion${OT_VERSION}.tre - ``` +Run the download script with the desired synthesis version: -* The OpenTree taxonomy, in a subfolder called ottYYY/ (where YYY is the OT_TAXONOMY_VERSION; the only important file is ottYYY/taxonomy.tsv). Get the `ottYYY.tgz` file (where YYY is the correct taxonomy version for your version XXX of the tree) from [http://files.opentreeoflife.org/ott](http://files.opentreeoflife.org/ott/) and unpack it. Alternatively, the lastest is usually at [https://tree.opentreeoflife.org/about/taxonomy-version](https://tree.opentreeoflife.org/about/taxonomy-version). +``` +download_opentree --version v16.1 --output-dir data/OpenTree +``` -### Use +The script fetches the [synthesis manifest](https://raw.githubusercontent.com/OpenTreeOfLife/opentree/master/webapp/static/statistics/synthesis.json) to look up the correct OTT taxonomy version, then downloads both the labelled supertree and taxonomy automatically. + +This is also available as a DVC pipeline stage (`download_opentree` in `dvc.yaml`), so `dvc repro` will run it when `ot_version` changes in `params.yaml`. -These files are processed by the scripts in ServerScripts/TreeBuild/OpenTreeRefine to create an OpenTree without subspecies, with polytomies resolved, and with all nodes named. +### Use -Note that the `ott/taxonomy.tsv` file is also used by other scripts e.g. for popularity, TaxonMapping, etc. +These files are processed by the pipeline stages in `dvc.yaml` to create the full OneZoom tree. The `taxonomy.tsv` file is also used by other stages (e.g. for popularity mapping, EoL filtering, etc.). 
NB: for the rationale of using `...simplified_ottnames` see [https://github.com/OpenTreeOfLife/treemachine/issues/147#issuecomment-209105659](https://github.com/OpenTreeOfLife/treemachine/issues/147#issuecomment-209105659) and also [here](https://groups.google.com/forum/#!topic/opentreeoflife/EzqctKrJySk) diff --git a/data/README.markdown b/data/README.markdown index 73010b7..8210407 100755 --- a/data/README.markdown +++ b/data/README.markdown @@ -2,9 +2,9 @@ To build a tree, you will first need to download various files from the internet. These are not provided by OneZoom directly as they are (a) very large and (b) regularly updated. The files you will need are: -* Open Tree of Life files, to be downloaded into the `OpenTree` directory (see [OpenTree/README.markdown](OpenTree/README.markdown) - * `labelled_supertree_simplified_ottnames.tre` (subsequently converted to `draftversionXXX.tre`, as detailed in the instructions) - * `ottX.Y/taxonomy.tsv` (where X.Y is the OT_TAXONOMY_VERSION) +* Open Tree of Life files, downloaded automatically by the `download_opentree` pipeline stage into `OpenTree/<version>/` (see [OpenTree/README.markdown](OpenTree/README.markdown)) + * `draftversion.tre` (processed synthesis tree) + * `taxonomy.tsv` (OTT taxonomy) * Wikimedia files, to be downloaded into directories within the `Wiki` directory (see [Wiki/README.markdown](Wiki/README.markdown)) * `wd_JSON/latest-all.json.bz2` * `wp_SQL/enwiki-latest-page.sql.gz` diff --git a/dvc.lock b/dvc.lock index 4a855d9..0615172 100644 --- a/dvc.lock +++ b/dvc.lock @@ -58,3 +58,68 @@ stages: hash: md5 md5: ac10389b87372eff1d276802dbb345a5 size: 21411110 + download_opentree: + cmd: download_opentree --version v16.1 --output-dir data/OpenTree + params: + params.yaml: + ot_version: v16.1 + outs: + - path: data/OpenTree/v16.1/ + hash: md5 + md5: 87ff995e9d5028efc185857f34448746.dir + size: 587064765 + nfiles: 3 + add_ott_numbers: + cmd: rm -rf data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 && mkdir
-p data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 && + add_ott_numbers_to_trees --savein + data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 + data/OZTreeBuild/AllLife/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] + deps: + - path: data/OZTreeBuild/AllLife/BespokeTree/include_noAutoOTT/ + hash: md5 + md5: 8cb57266b725e9893505618bf366af54.dir + size: 1231351 + nfiles: 56 + params: + params.yaml: + ot_version: v16.1 + oz_tree: AllLife + outs: + - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ + hash: md5 + md5: 694f92ea523d0628824b4cf4ed7ef19e.dir + size: 1806192 + nfiles: 56 + prepare_open_trees: + cmd: mkdir -p data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all && cp -n + data/OZTreeBuild/AllLife/OpenTreeParts/OT_required/*.nwk + data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ 2>/dev/null || true + && cd data/OZTreeBuild/AllLife && get_open_trees_from_one_zoom + ../../OpenTree/v16.1/draftversion.tre OpenTreeParts/OpenTree_all/ + BespokeTree/include_OT_v16.1/*.PHY + deps: + - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ + hash: md5 + md5: 694f92ea523d0628824b4cf4ed7ef19e.dir + size: 1806192 + nfiles: 56 + - path: data/OZTreeBuild/AllLife/OpenTreeParts/OT_required/ + hash: md5 + md5: 81be05fde561126fb58b7bb7e8a0fbcd.dir + size: 808 + nfiles: 3 + - path: data/OpenTree/v16.1/draftversion.tre + hash: md5 + md5: f59f21497ceb1b33273a192a63308386 + size: 83619970 + params: + params.yaml: + ot_version: v16.1 + oz_tree: AllLife + outs: + - path: data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ + hash: md5 + md5: 3f8df65320201c2db0ee35b17916c7cb.dir + size: 81928129 + nfiles: 229 diff --git a/dvc.yaml b/dvc.yaml index 3729698..5a43520 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -2,61 +2,45 @@ vars: - params.yaml stages: - preprocess_opentree: - cmd: >- - perl -pe 's/\)mrcaott\d+ott\d+/\)/g; s/[ _]+/_/g;' - data/OpenTree/labelled_supertree_simplified_ottnames.tre - > data/OpenTree/draftversion${ot_version}.tre - deps: - - 
data/OpenTree/labelled_supertree_simplified_ottnames.tre + download_opentree: + cmd: download_opentree --version ${ot_version} --output-dir data/OpenTree params: - ot_version outs: - - data/OpenTree/draftversion${ot_version}.tre - - unpack_taxonomy: - cmd: tar -C data/OpenTree -zxf data/OpenTree/ott${ot_taxonomy_version}.tgz - deps: - - data/OpenTree/ott${ot_taxonomy_version}.tgz - params: - - ot_taxonomy_version - outs: - - data/OpenTree/ott${ot_taxonomy_version}/ + - data/OpenTree/${ot_version}/ add_ott_numbers: cmd: >- - rm -rf data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra} && - mkdir -p data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra} && + rm -rf data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} && + mkdir -p data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} && add_ott_numbers_to_trees - --savein data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra} + --savein data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] deps: - data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/ params: - oz_tree - - ot_taxonomy_version - - ot_taxonomy_extra + - ot_version outs: - - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/ + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version}/ prepare_open_trees: cmd: >- + mkdir -p data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all && cp -n data/OZTreeBuild/${oz_tree}/OpenTreeParts/OT_required/*.nwk data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ 2>/dev/null || true && cd data/OZTreeBuild/${oz_tree} && get_open_trees_from_one_zoom - ../../OpenTree/draftversion${ot_version}.tre + ../../OpenTree/${ot_version}/draftversion.tre OpenTreeParts/OpenTree_all/ - BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/*.PHY 
+ BespokeTree/include_OT_${ot_version}/*.PHY deps: - - data/OpenTree/draftversion${ot_version}.tre - - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/ + - data/OpenTree/${ot_version}/draftversion.tre + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version}/ - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OT_required/ params: - oz_tree - ot_version - - ot_taxonomy_version - - ot_taxonomy_extra outs: - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ @@ -64,16 +48,15 @@ stages: cmd: >- cd data/OZTreeBuild/${oz_tree} && build_oz_tree - BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/Base.PHY + BespokeTree/include_OT_${ot_version}/Base.PHY OpenTreeParts/OpenTree_all/ ${oz_tree}_full_tree.phy deps: - - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/ + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version}/ - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ params: - oz_tree - - ot_taxonomy_version - - ot_taxonomy_extra + - ot_version outs: - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy @@ -81,13 +64,13 @@ stages: cmd: >- filter_eol data/EOL/provider_ids.csv.gz - data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv + data/OpenTree/${ot_version}/taxonomy.tsv -o data/filtered/OneZoom_provider_ids.csv deps: - data/EOL/provider_ids.csv.gz - - data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv + - data/OpenTree/${ot_version}/taxonomy.tsv params: - - ot_taxonomy_version + - ot_version outs: - data/filtered/OneZoom_provider_ids.csv @@ -139,7 +122,7 @@ stages: cmd: >- CSV_base_table_creator data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy - data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv + data/OpenTree/${ot_version}/taxonomy.tsv data/filtered/OneZoom_provider_ids.csv data/filtered/OneZoom_latest-all.json data/filtered/OneZoom_enwiki-latest-page.sql @@ -151,7 +134,7 @@ stages: 2> data/output_files/ordered_output.log 
deps: - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy - data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv + - data/OpenTree/${ot_version}/taxonomy.tsv - data/filtered/OneZoom_provider_ids.csv - data/filtered/OneZoom_latest-all.json - data/filtered/OneZoom_enwiki-latest-page.sql @@ -159,7 +142,7 @@ stages: - data/OZTreeBuild/${oz_tree}/BespokeTree/SupplementaryTaxonomy.tsv params: - oz_tree - - ot_taxonomy_version + - ot_version - build_version - exclude_from_popularity outs: diff --git a/oz_tree_build/utilities/download_opentree.py b/oz_tree_build/utilities/download_opentree.py new file mode 100644 index 0000000..3f3e0c4 --- /dev/null +++ b/oz_tree_build/utilities/download_opentree.py @@ -0,0 +1,146 @@ +"""Download Open Tree of Life synthesis data (tree + taxonomy) into a versioned folder. + +Usage: + download_opentree --version v16.1 --output-dir data/OpenTree + +This fetches the synthesis manifest from the OpenTree GitHub repo, then downloads +the labelled supertree and OTT taxonomy for the requested synthesis version. Files +are placed in ``<output-dir>/<version>/`` with version-agnostic names: + + <output-dir>/<version>/labelled_supertree_simplified_ottnames.tre + <output-dir>/<version>/draftversion.tre + <output-dir>/<version>/taxonomy.tsv +""" + +import argparse +import os +import re +import shutil +import tarfile +import tempfile + +import requests + +SYNTHESIS_JSON_URL = ( + "https://raw.githubusercontent.com/OpenTreeOfLife/opentree" + "/master/webapp/static/statistics/synthesis.json" +) + + +def fetch_synthesis_json(): + response = requests.get(SYNTHESIS_JSON_URL) + response.raise_for_status() + return response.json() + + +def find_synthesis_entry(synthesis_json, version): + """Return the manifest entry whose ``version`` field matches *version*.""" + for entry in synthesis_json.values(): + if entry.get("version") == version: + return entry + available = [e["version"] for e in synthesis_json.values() if "version" in e] + raise SystemExit( + f"Version '{version}' not found in synthesis.json.
" + f"Available versions: {', '.join(available)}" + ) + + +def strip_mrca_prefixes(content: str) -> str: + content = re.sub(r"\)mrcaott\d+ott\d+", ")", content) + content = re.sub(r"[ _]+", "_", content) + return content + + +def download_tree(version, output_dir): + """Download the labelled supertree and produce the processed draftversion.""" + assert version.startswith("v") + version_without_v = version[1:] + tree_url = ( + f"https://files.opentreeoflife.org/synthesis/opentree{version_without_v}" + f"/output/labelled_supertree/labelled_supertree_simplified_ottnames.tre" + ) + print(f"Downloading tree from {tree_url} ...") + response = requests.get(tree_url) + response.raise_for_status() + + raw_path = os.path.join(output_dir, "labelled_supertree_simplified_ottnames.tre") + with open(raw_path, "w") as f: + f.write(response.text) + print(f" Saved raw tree to {raw_path}") + + draft_path = os.path.join(output_dir, "draftversion.tre") + print(" Stripping mrca prefixes ...") + with open(draft_path, "w") as f: + f.write(strip_mrca_prefixes(response.text)) + print(f" Saved processed tree to {draft_path}") + + +def download_taxonomy(ott_version_raw, output_dir): + """Download and extract taxonomy.tsv from the OTT taxonomy tarball.""" + ott_version = ott_version_raw.split("draft")[0] + taxonomy_url = ( + f"https://files.opentreeoflife.org/ott/{ott_version}/{ott_version}.tgz" + ) + print(f"Downloading taxonomy from {taxonomy_url} ...") + response = requests.get(taxonomy_url) + response.raise_for_status() + + with tempfile.TemporaryDirectory() as tmpdir: + tar_path = os.path.join(tmpdir, "taxonomy.tgz") + with open(tar_path, "wb") as f: + f.write(response.content) + + print(" Extracting taxonomy.tsv ...") + with tarfile.open(tar_path, "r:gz") as tar: + taxonomy_member = None + for member in tar.getmembers(): + if member.name.endswith("/taxonomy.tsv"): + taxonomy_member = member + break + if taxonomy_member is None: + raise SystemExit( + "Could not find taxonomy.tsv in the 
taxonomy tarball" + ) + extracted = tar.extractfile(taxonomy_member) + dest_path = os.path.join(output_dir, "taxonomy.tsv") + with open(dest_path, "wb") as f: + shutil.copyfileobj(extracted, f) + print(f" Saved taxonomy to {dest_path}") + + +def main(): + parser = argparse.ArgumentParser( + description="Download Open Tree of Life synthesis data into a versioned folder." + ) + parser.add_argument( + "--version", + required=True, + help='Synthesis version to download (e.g. "v16.1"). ' + "Must match the 'version' field in the OpenTree synthesis.json manifest.", + ) + parser.add_argument( + "--output-dir", + default="data/OpenTree", + help="Parent directory for the versioned output folder (default: data/OpenTree).", + ) + args = parser.parse_args() + + version = args.version + if not version.startswith("v"): + raise SystemExit(f"Version must start with 'v' (got '{version}')") + + print("Fetching synthesis manifest ...") + synthesis_json = fetch_synthesis_json() + entry = find_synthesis_entry(synthesis_json, version) + print(f"Found synthesis {version} (OTT {entry['OTT_version']})") + + output_dir = os.path.join(args.output_dir, version) + os.makedirs(output_dir, exist_ok=True) + + download_tree(version, output_dir) + download_taxonomy(entry["OTT_version"], output_dir) + print(f"Done. 
All files written to {output_dir}/") + + +if __name__ == "__main__": + main() diff --git a/params.yaml b/params.yaml index d1281e1..cad7876 100644 --- a/params.yaml +++ b/params.yaml @@ -1,6 +1,4 @@ oz_tree: AllLife -ot_version: "15.1" -ot_taxonomy_version: "3.7" -ot_taxonomy_extra: "draft2" +ot_version: "v16.1" build_version: 28017344 exclude_from_popularity: Archosauria_ott335588 Dinosauria_ott90215 diff --git a/pyproject.toml b/pyproject.toml index 9e22965..c6ee696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ extract_wikidata_titles = "oz_tree_build.utilities.filter_wikidata:extract_title filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" download_and_filter_pageviews = "oz_tree_build.utilities.download_and_filter_pageviews:main" +download_opentree = "oz_tree_build.utilities.download_opentree:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" get_wiki_images = "oz_tree_build.images_and_vernaculars.get_wiki_images:main" process_image_bits = "oz_tree_build.images_and_vernaculars.process_image_bits:main" From 7b647d97bb0367f9fcdd603afaf4a14290541ce1 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Thu, 26 Mar 2026 23:21:02 +0000 Subject: [PATCH 07/19] Run the rest of the dvc pipeline --- data/.gitignore | 2 + data/output_files/.gitignore | 3 - dvc.lock | 120 +++++++++++++++++++++++++++++++++++ dvc.yaml | 9 +++ 4 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 data/.gitignore delete mode 100644 data/output_files/.gitignore diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..0992b36 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,2 @@ +/js_output +/output_files diff --git a/data/output_files/.gitignore b/data/output_files/.gitignore deleted file mode 100644 index 65e7aa0..0000000 --- a/data/output_files/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -* 
-!.gitignore -!*.dvc diff --git a/dvc.lock b/dvc.lock index 0615172..c13d610 100644 --- a/dvc.lock +++ b/dvc.lock @@ -123,3 +123,123 @@ stages: md5: 3f8df65320201c2db0ee35b17916c7cb.dir size: 81928129 nfiles: 229 + download_eol: + cmd: curl -L -o data/EOL/provider_ids.csv.gz + https://eol.org/data/provider_ids.csv.gz + outs: + - path: data/EOL/provider_ids.csv.gz + hash: md5 + md5: 03c0e857a35695b6b6a5467014bd38d8 + size: 314398683 + filter_eol: + cmd: filter_eol data/EOL/provider_ids.csv.gz + data/OpenTree/v16.1/taxonomy.tsv -o data/filtered/OneZoom_provider_ids.csv + deps: + - path: data/EOL/provider_ids.csv.gz + hash: md5 + md5: 03c0e857a35695b6b6a5467014bd38d8 + size: 314398683 + - path: data/OpenTree/v16.1/taxonomy.tsv + hash: md5 + md5: d7a58eaaf132522b89a506e96ca5098f + size: 417054016 + params: + params.yaml: + ot_version: v16.1 + outs: + - path: data/filtered/OneZoom_provider_ids.csv + hash: md5 + md5: f7c9bb8374957c07168bec36d6591347 + size: 221682224 + build_tree: + cmd: cd data/OZTreeBuild/AllLife && build_oz_tree + BespokeTree/include_OT_v16.1/Base.PHY OpenTreeParts/OpenTree_all/ + AllLife_full_tree.phy + deps: + - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ + hash: md5 + md5: 694f92ea523d0628824b4cf4ed7ef19e.dir + size: 1806192 + nfiles: 56 + - path: data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ + hash: md5 + md5: 3f8df65320201c2db0ee35b17916c7cb.dir + size: 81928129 + nfiles: 229 + params: + params.yaml: + ot_version: v16.1 + oz_tree: AllLife + outs: + - path: data/OZTreeBuild/AllLife/AllLife_full_tree.phy + hash: md5 + md5: 0b17680b0a0a633f8ae50e4a8f68f17a + size: 83061022 + create_tables: + cmd: mkdir -p data/output_files && CSV_base_table_creator + data/OZTreeBuild/AllLife/AllLife_full_tree.phy + data/OpenTree/v16.1/taxonomy.tsv data/filtered/OneZoom_provider_ids.csv + data/filtered/OneZoom_latest-all.json + data/filtered/OneZoom_enwiki-latest-page.sql + data/filtered/pageviews/OneZoom_pageviews* -o data/output_files -v + 
--version 28017344 --exclude Archosauria_ott335588 Dinosauria_ott90215 + --extra_source_file + data/OZTreeBuild/AllLife/BespokeTree/SupplementaryTaxonomy.tsv 2> + data/output_files/ordered_output.log + deps: + - path: data/OZTreeBuild/AllLife/AllLife_full_tree.phy + hash: md5 + md5: 0b17680b0a0a633f8ae50e4a8f68f17a + size: 83061022 + - path: data/OZTreeBuild/AllLife/BespokeTree/SupplementaryTaxonomy.tsv + hash: md5 + md5: 8e861649388bf88595b93c0199f2cc3a + size: 312 + isexec: true + - path: data/OpenTree/v16.1/taxonomy.tsv + hash: md5 + md5: d7a58eaaf132522b89a506e96ca5098f + size: 417054016 + - path: data/filtered/OneZoom_enwiki-latest-page.sql + hash: md5 + md5: ac10389b87372eff1d276802dbb345a5 + size: 21411110 + - path: data/filtered/OneZoom_latest-all.json + hash: md5 + md5: 65b22a5c761c78d79b30faf871d1e404 + size: 1542990225 + - path: data/filtered/OneZoom_provider_ids.csv + hash: md5 + md5: f7c9bb8374957c07168bec36d6591347 + size: 221682224 + - path: data/filtered/pageviews/ + hash: md5 + md5: f021afa12f9d7c893412a0b2980ab187.dir + size: 104535557 + nfiles: 13 + params: + params.yaml: + build_version: 28017344 + exclude_from_popularity: Archosauria_ott335588 Dinosauria_ott90215 + ot_version: v16.1 + oz_tree: AllLife + outs: + - path: data/output_files/ + hash: md5 + md5: babe7302afec6bc868d2889963796b74.dir + size: 1182737679 + nfiles: 8 + make_js: + cmd: mkdir -p data/js_output && make_js_treefiles --outdir data/js_output + deps: + - path: data/output_files/ + hash: md5 + md5: babe7302afec6bc868d2889963796b74.dir + size: 1182737679 + nfiles: 8 + outs: + - path: data/js_output/ + hash: md5 + md5: 77ff7fae56095a0cdcec84e70b895b28.dir + size: 8293094 + nfiles: 6 diff --git a/dvc.yaml b/dvc.yaml index 5a43520..313b116 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -60,6 +60,13 @@ stages: outs: - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy + download_eol: + cmd: >- + curl -L -o data/EOL/provider_ids.csv.gz + https://eol.org/data/provider_ids.csv.gz + outs: + - 
data/EOL/provider_ids.csv.gz + filter_eol: cmd: >- filter_eol @@ -120,6 +127,7 @@ stages: create_tables: cmd: >- + mkdir -p data/output_files && CSV_base_table_creator data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy data/OpenTree/${ot_version}/taxonomy.tsv @@ -150,6 +158,7 @@ stages: make_js: cmd: >- + mkdir -p data/js_output && make_js_treefiles --outdir data/js_output deps: From 810d78b8f6e1c710545ed63439cb9ecfa04b3d1a Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Sat, 28 Mar 2026 00:03:50 +0000 Subject: [PATCH 08/19] wip: don't store the biggun --- README.markdown | 17 ++-- data/README.markdown | 23 ++--- data/Wiki/README.markdown | 36 ++++---- dvc.yaml | 7 +- .../download_and_filter_pageviews.py | 66 +-------------- .../utilities/download_and_filter_wikidata.py | 84 +++++++++++++++++++ oz_tree_build/utilities/file_utils.py | 84 +++++++++++++------ oz_tree_build/utilities/filter_wikidata.py | 16 ++-- .../utilities/generate_filtered_files.py | 3 +- pyproject.toml | 1 + 10 files changed, 202 insertions(+), 135 deletions(-) create mode 100644 oz_tree_build/utilities/download_and_filter_wikidata.py diff --git a/README.markdown b/README.markdown index 4b5241d..bf2c22a 100644 --- a/README.markdown +++ b/README.markdown @@ -41,7 +41,7 @@ you will need a valid Azure Image cropping key in your appconfig.ini. ## Building the latest tree from OpenTree -This project uses [DVC](https://dvc.org/) for a cached, repeatable data pipeline. The build parameters are defined in `params.yaml` and the pipeline stages are declared in `dvc.yaml`. +This project uses [DVC](https://dvc.org/) to manage the pipeline. The build parameters are defined in `params.yaml` and the pipeline stages are declared in `dvc.yaml`. ### Quick start (using cached outputs) @@ -58,13 +58,16 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If 1. Set `ot_version` in `params.yaml` to the desired OpenTree synthesis version (e.g. `"v16.1"`). 
Available versions can be found in the [synthesis manifest](https://raw.githubusercontent.com/OpenTreeOfLife/opentree/master/webapp/static/statistics/synthesis.json). The OpenTree tree and taxonomy will be downloaded automatically by the `download_opentree` pipeline stage. -2. Download the other required source files into `data/` as [documented here](data/README.markdown), then register them with DVC: +2. Download the Wikipedia SQL dump into `data/` as [documented here](data/README.markdown), then register it with DVC: ```bash - dvc add data/Wiki/wd_JSON/latest-all.json.bz2 dvc add data/Wiki/wp_SQL/enwiki-latest-page.sql.gz - dvc add data/Wiki/wp_pagecounts/ - dvc add data/EOL/provider_ids.csv.gz + ``` + + The other source files (OpenTree, EOL, Wikidata dump, pageviews) are downloaded automatically by pipeline stages. To force re-download all of them: + + ```bash + dvc repro --force download_opentree download_eol download_and_filter_wikidata download_and_filter_pageviews ``` 3. Run the pipeline and push results to the shared cache: @@ -80,9 +83,9 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If The pipeline is defined in `dvc.yaml`. Use `dvc dag` to visualize the DAG. 
Key stages include: -- **download_opentree** -- download OpenTree synthesis tree and taxonomy +- **download_opentree**, **download_eol** -- download OpenTree and EOL source files - **add_ott_numbers**, **prepare_open_trees**, **build_tree** -- assemble the full newick tree -- **filter_eol**, **filter_wikidata**, **filter_sql**, **filter_pageviews** -- filter massive source files (parallelizable) +- **filter_eol**, **download_and_filter_wikidata**, **filter_sql**, **download_and_filter_pageviews** -- download/filter massive source files (parallelizable) - **create_tables** -- map taxa, calculate popularity, produce DB-ready CSVs - **make_js** -- generate JS viewer files diff --git a/data/README.markdown b/data/README.markdown index 8210407..e901e95 100755 --- a/data/README.markdown +++ b/data/README.markdown @@ -1,13 +1,14 @@ # Downloading required data files -To build a tree, you will first need to download various files from the internet. These are not provided by OneZoom directly as they are (a) very large and (b) regularly updated. The files you will need are: - -* Open Tree of Life files, downloaded automatically by the `download_opentree` pipeline stage into `OpenTree//` (see [OpenTree/README.markdown](OpenTree/README.markdown)) - * `draftversion.tre` (processed synthesis tree) - * `taxonomy.tsv` (OTT taxonomy) -* Wikimedia files, to be downloaded into directories within the `Wiki` directory (see [Wiki/README.markdown](Wiki/README.markdown)) - * `wd_JSON/latest-all.json.bz2` - * `wp_SQL/enwiki-latest-page.sql.gz` - * `wp_pagecounts/pageviews-YYYYMM-user.bz2` (several files for different months). Or download preprocessed files from a [release](https://github.com/OneZoom/tree-build/releases) -* EoL files, to be downloaded into the `EOL` directory (see [EOL/README.markdown](EOL/README.markdown)) - * `identifiers.csv` +To build a tree, you will first need various data files from the internet. 
These are not provided by OneZoom directly as they are (a) very large and (b) regularly updated. + +Most source files are downloaded automatically by pipeline stages. The only file that must be downloaded manually is: + +* Wikipedia SQL dump: `Wiki/wp_SQL/enwiki-latest-page.sql.gz` (see [Wiki/README.markdown](Wiki/README.markdown)) + +The following are handled by DVC pipeline stages: + +* **Open Tree of Life** files, downloaded by the `download_opentree` stage into `OpenTree//` (see [OpenTree/README.markdown](OpenTree/README.markdown)) +* **EOL provider IDs**, downloaded by the `download_eol` stage into `EOL/provider_ids.csv.gz` +* **Wikidata JSON dump**, streamed and filtered by the `download_and_filter_wikidata` stage (see [Wiki/README.markdown](Wiki/README.markdown)) +* **Wikipedia pageviews**, streamed and filtered by the `download_and_filter_pageviews` stage (see [Wiki/README.markdown](Wiki/README.markdown)) diff --git a/data/Wiki/README.markdown b/data/Wiki/README.markdown index 1c6fa6a..4ad97bb 100755 --- a/data/Wiki/README.markdown +++ b/data/Wiki/README.markdown @@ -1,20 +1,26 @@ -To allow mappings to wikipedia and popularity calculations, the following files -should be uploaded to their respective directories (NB: these could be symlinks to -versions on external storage) +To allow mappings to wikipedia and popularity calculations, the following file +must be downloaded manually into its directory (NB: this could be a symlink to +a version on external storage): -* The `wd_JSON` directory should contain the wikidata JSON dump, as `latest-all.json.bz2` -(download from ) * The `wp_SQL` directory should contain the en.wikipedia SQL dump file, as `enwiki-latest-page.sql.gz` (download from ) -Wikipedia pageview files are downloaded and filtered automatically by the -`download_and_filter_pageviews` pipeline stage. It streams monthly `-user` dumps -from , filters them -against the wikidata titles, and caches the small filtered outputs. 
Only the most -recent N months (configured via `--months` in the DVC stage) are processed. To -pick up newly published months, run `dvc repro --force download_and_filter_pageviews`. +The Wikidata dump and Wikipedia pageview files are downloaded and filtered +automatically by pipeline stages: -These files are used as inputs to the DVC pipeline's filtering stages. If someone -has already run the pipeline and pushed results to the DVC remote, you do not need -to download these files yourself -- `dvc repro --pull --allow-missing` will pull -the cached filtered outputs instead. +* **`download_and_filter_wikidata`** streams the full Wikidata JSON dump + (`latest-all.json.bz2`, ~90 GB) from + , filters it on the fly, + and writes only the small filtered output. To re-download with a fresh dump, + run `dvc repro --force download_and_filter_wikidata`. + +* **`download_and_filter_pageviews`** streams monthly `-user` dumps from + , filters them + against the wikidata titles, and caches the small filtered outputs. Only the + most recent N months (configured via `--months` in the DVC stage) are + processed. To pick up newly published months, run + `dvc repro --force download_and_filter_pageviews`. + +If someone has already run the pipeline and pushed results to the DVC remote, +you do not need to download these files yourself -- +`dvc repro --pull --allow-missing` will pull the cached filtered outputs instead. 
diff --git a/dvc.yaml b/dvc.yaml index 313b116..5a6740d 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -81,13 +81,10 @@ stages: outs: - data/filtered/OneZoom_provider_ids.csv - filter_wikidata: + download_and_filter_wikidata: cmd: >- - filter_wikidata - data/Wiki/wd_JSON/latest-all.json.bz2 + download_and_filter_wikidata -o data/filtered/OneZoom_latest-all.json - deps: - - data/Wiki/wd_JSON/latest-all.json.bz2 outs: - data/filtered/OneZoom_latest-all.json diff --git a/oz_tree_build/utilities/download_and_filter_pageviews.py b/oz_tree_build/utilities/download_and_filter_pageviews.py index c0ddc95..031fa45 100644 --- a/oz_tree_build/utilities/download_and_filter_pageviews.py +++ b/oz_tree_build/utilities/download_and_filter_pageviews.py @@ -7,19 +7,16 @@ """ import argparse -import bz2 -import codecs import hashlib import itertools import logging import os import re -import shutil -import subprocess import sys import tempfile import urllib.request +from .file_utils import stream_bz2_lines_from_url from .filter_pageviews import filter_pageview_lines, write_filtered_pageviews from .filter_wikidata import load_titles_file @@ -72,65 +69,6 @@ def discover_pageview_months(base_url=BASE_URL): yield file_url, filename -def _stream_bz2_lines(url): - """ - Stream a .bz2 file over HTTP via wget and yield decompressed lines. - wget handles timeouts and connection management; Python handles decompression. 
- """ - if not shutil.which("wget"): - raise RuntimeError("wget is required but not found on PATH") - - wget = subprocess.Popen( - [ - "wget", "-q", "-O", "-", - "--connect-timeout=30", - f"--read-timeout={WGET_READ_TIMEOUT}", - "--header=User-Agent: OneZoom-tree-build/1.0", - url, - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - decompressor = bz2.BZ2Decompressor() - decoder = codecs.getincrementaldecoder("utf-8")("replace") - line_buf = "" - bytes_read = 0 - - try: - while True: - chunk = wget.stdout.read(1024 * 1024) - if not chunk: - break - bytes_read += len(chunk) - if bytes_read % (500 * 1024 * 1024) < 1024 * 1024: - logging.info(f" Downloaded {bytes_read / (1024**3):.1f} GB so far...") - try: - raw = decompressor.decompress(chunk) - except EOFError: - break - text = decoder.decode(raw) - line_buf += text - parts = line_buf.split("\n") - line_buf = parts[-1] - for line in parts[:-1]: - yield line - - trailing = decoder.decode(b"", final=True) - line_buf += trailing - if line_buf: - yield line_buf - finally: - wget.stdout.close() - if wget.poll() is None: - wget.terminate() - rc = wget.wait() - stderr_out = wget.stderr.read().decode("utf-8", errors="replace").strip() - wget.stderr.close() - if rc != 0: - raise RuntimeError(f"wget failed (exit {rc}): {stderr_out}") - - def _compute_file_hash(path): """Return the SHA-256 hex digest of a file's contents.""" h = hashlib.sha256() @@ -178,7 +116,7 @@ def stream_and_filter(url, output_path, wikidata_titles, wikilang="en"): Stream a remote .bz2 pageview file, filter it, and write the result. Uses a temp file + rename for atomicity. 
""" - lines = _stream_bz2_lines(url) + lines = stream_bz2_lines_from_url(url, read_timeout=WGET_READ_TIMEOUT) pageviews = filter_pageview_lines(lines, wikidata_titles, wikilang) dir_name = os.path.dirname(output_path) diff --git a/oz_tree_build/utilities/download_and_filter_wikidata.py b/oz_tree_build/utilities/download_and_filter_wikidata.py new file mode 100644 index 0000000..9319072 --- /dev/null +++ b/oz_tree_build/utilities/download_and_filter_wikidata.py @@ -0,0 +1,84 @@ +"""Download the Wikidata JSON dump and filter to taxon/vernacular items. + +Streams the dump directly from Wikimedia, decompresses on the fly, and +writes only the filtered results to disk. Avoids storing the full ~90 GB +dump locally. +""" + +import argparse +import logging +import os +import sys +import tempfile + +from .file_utils import stream_bz2_lines_from_url +from .filter_wikidata import filter_wikidata + +WIKIDATA_DUMP_URL = ( + "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2" +) +WGET_READ_TIMEOUT = 600 + + +def stream_and_filter(url, output_path, wikilang="en", dont_trim_sitelinks=False): + """ + Stream a remote Wikidata .bz2 dump, filter it, and write the result. + Uses a temp file + rename for atomicity. + """ + lines = stream_bz2_lines_from_url(url, read_timeout=WGET_READ_TIMEOUT) + + dir_name = os.path.dirname(output_path) or "." 
+ fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp") + os.close(fd) + try: + filter_wikidata( + lines, + tmp_path, + wikilang=wikilang, + dont_trim_sitelinks=dont_trim_sitelinks, + ) + os.replace(tmp_path, output_path) + except BaseException: + if os.path.exists(tmp_path): + os.remove(tmp_path) + raise + + +def main(): + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "-o", + "--output", + required=True, + help="Output path for filtered wikidata JSON", + ) + parser.add_argument("--wikilang", default="en", help="Wikipedia language code") + parser.add_argument( + "--url", + default=WIKIDATA_DUMP_URL, + help="URL of the Wikidata JSON dump (.bz2)", + ) + parser.add_argument( + "--dont-trim-sitelinks", + action="store_true", + default=False, + help="Keep the full sitelinks value for all languages", + ) + args = parser.parse_args() + + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + + logging.info(f"Streaming Wikidata dump from {args.url}") + stream_and_filter( + args.url, + args.output, + wikilang=args.wikilang, + dont_trim_sitelinks=args.dont_trim_sitelinks, + ) + logging.info(f"Done: {args.output}") + + +if __name__ == "__main__": + main() diff --git a/oz_tree_build/utilities/file_utils.py b/oz_tree_build/utilities/file_utils.py index 0a84627..136a9f0 100644 --- a/oz_tree_build/utilities/file_utils.py +++ b/oz_tree_build/utilities/file_utils.py @@ -3,9 +3,11 @@ """ import bz2 +import codecs import gzip import os -import time +import shutil +import subprocess __author__ = "David Ebbo" @@ -20,36 +22,64 @@ def open_file_based_on_extension(filename, mode): return open(filename, mode, encoding="utf-8") -def enumerate_lines_from_file(filename, print_every=None, print_line_num_func=None): +def stream_bz2_lines_from_url(url, read_timeout=120): """ - Enumerate the lines in a file, whether it's uncompressed, bz2 or gz. 
If print_every - is given as an integer, print a message out every print_every lines. If - print_line_num_func is given, it should be a function that takes in the line number - and returns the string to print out. + Stream a .bz2 file over HTTP via wget and yield decompressed lines. + wget handles timeouts, connection management, and progress display; + Python handles decompression and line splitting. """ - underlying_file_size = os.path.getsize(filename) - start_time = time.time() - with open_file_based_on_extension(filename, "rt") as f: - if print_every is not None: + if not shutil.which("wget"): + raise RuntimeError("wget is required but not found on PATH") + + wget = subprocess.Popen( + [ + "wget", "-q", "--show-progress", "-O", "-", + "--connect-timeout=30", + f"--read-timeout={read_timeout}", + "--header=User-Agent: OneZoom-tree-build/1.0", + url, + ], + stdout=subprocess.PIPE, + stderr=None, + ) + + decompressor = bz2.BZ2Decompressor() + decoder = codecs.getincrementaldecoder("utf-8")("replace") + line_buf = "" + + try: + while True: + chunk = wget.stdout.read(1024 * 1024) + if not chunk: + break try: - underlying_file = f.buffer.fileobj # gzip - except AttributeError: - try: - underlying_file = f.buffer._buffer.raw._fp # b2zip - except AttributeError: - underlying_file = f # plain + raw = decompressor.decompress(chunk) + except EOFError: + break + text = decoder.decode(raw) + line_buf += text + parts = line_buf.split("\n") + line_buf = parts[-1] + for line in parts[:-1]: + yield line + + trailing = decoder.decode(b"", final=True) + line_buf += trailing + if line_buf: + yield line_buf + finally: + wget.stdout.close() + if wget.poll() is None: + wget.terminate() + rc = wget.wait() + if rc != 0: + raise RuntimeError(f"wget failed (exit {rc})") + + +def enumerate_lines_from_file(filename): + """Enumerate the lines in a file, whether it's uncompressed, bz2 or gz.""" + with open_file_based_on_extension(filename, "rt") as f: for line_num, line in 
enumerate(iter(f.readline, "")): - if print_every is not None and line_num != 0 and line_num % print_every == 0: - underlying_file_pos = underlying_file.tell() - percent_done = 100 * underlying_file_pos / underlying_file_size - elapsed_time = time.time() - start_time - time_left = elapsed_time * (100 - percent_done) / percent_done - expected_ETA = time.strftime("%H:%M:%S", time.localtime(time.time() + time_left)) - if print_line_num_func is not None: - line_num_str = print_line_num_func(line_num) - else: - line_num_str = f"Processing line {line_num}" - print(f"{percent_done:.2f}% read. " + line_num_str + f" ETA: {expected_ETA}") yield line_num, line diff --git a/oz_tree_build/utilities/filter_wikidata.py b/oz_tree_build/utilities/filter_wikidata.py index eaf8e6b..5cc7f8b 100644 --- a/oz_tree_build/utilities/filter_wikidata.py +++ b/oz_tree_build/utilities/filter_wikidata.py @@ -62,7 +62,7 @@ def filter_wikidata( - wikidata_file, + lines, output_file, source_ids=None, clade=None, @@ -72,6 +72,9 @@ def filter_wikidata( """ Filter the wikidata JSON dump, keeping only taxon and vernacular items, and trimming each item to only the fields we consume. + + *lines* should be an iterable of raw dump lines (e.g. from + ``stream_bz2_lines_from_url`` or ``enumerate_lines_from_file``). 
""" sitelinks_key = f"{wikilang}wiki" @@ -99,10 +102,12 @@ def trim_and_write_json_item(json_item, filtered_wiki_f): filtered_wiki_f.write("[\n") preserved_lines = 0 - def get_line_message(line_num): - return f"Kept {preserved_lines}/{line_num} lines ({preserved_lines / line_num * 100:.2f}%)" + for line_num, line in enumerate(lines): + if line_num > 0 and line_num % 100_000 == 0: + logging.info( + f"Processed {line_num} lines, kept {preserved_lines}" + ) - for _, line in enumerate_lines_from_file(wikidata_file, 100000, get_line_message): if not (line.startswith('{"type":') and quick_byte_match.search(line)): continue @@ -199,8 +204,9 @@ def main(): ) args = parser.parse_args() + lines = (line for _, line in enumerate_lines_from_file(args.wikidata_file)) filter_wikidata( - args.wikidata_file, + lines, args.output, wikilang=args.wikilang, dont_trim_sitelinks=args.dont_trim_sitelinks, diff --git a/oz_tree_build/utilities/generate_filtered_files.py b/oz_tree_build/utilities/generate_filtered_files.py index 8ab0b88..0f2fdcc 100644 --- a/oz_tree_build/utilities/generate_filtered_files.py +++ b/oz_tree_build/utilities/generate_filtered_files.py @@ -106,8 +106,9 @@ def generate_all_filtered_files( if wikidata_dump_file: wikidata_output = _compute_output_path(wikidata_dump_file, prefix, context.compress) + lines = (line for _, line in enumerate_lines_from_file(wikidata_dump_file)) filter_wikidata( - wikidata_dump_file, + lines, wikidata_output, source_ids=source_ids if context.clade else None, clade=context.clade, diff --git a/pyproject.toml b/pyproject.toml index c6ee696..43143f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ extract_wikidata_titles = "oz_tree_build.utilities.filter_wikidata:extract_title filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" download_and_filter_pageviews = "oz_tree_build.utilities.download_and_filter_pageviews:main" 
+download_and_filter_wikidata = "oz_tree_build.utilities.download_and_filter_wikidata:main" download_opentree = "oz_tree_build.utilities.download_opentree:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" get_wiki_images = "oz_tree_build.images_and_vernaculars.get_wiki_images:main" From 9b890bd4b76af931dd4fb75164dd04fd8768894c Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Sat, 28 Mar 2026 01:01:46 +0000 Subject: [PATCH 09/19] wikipedia sql download --- README.markdown | 23 ++++++----------------- data/README.markdown | 7 ++----- data/Wiki/README.markdown | 14 ++++++-------- dvc.lock | 18 ++++++++++-------- dvc.yaml | 12 ++++++++++++ oz_tree_build/README.markdown | 2 +- 6 files changed, 37 insertions(+), 39 deletions(-) diff --git a/README.markdown b/README.markdown index bf2c22a..a24bcb0 100644 --- a/README.markdown +++ b/README.markdown @@ -21,6 +21,8 @@ If you want to run the test suite, make sure the test requirements are also inst pip install -e '.[test]' +To be able to run the pipeline, you'll also need to install `wget`. + ## Testing Assuming you have installed the test requirements, you should be able to run @@ -58,17 +60,13 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If 1. Set `ot_version` in `params.yaml` to the desired OpenTree synthesis version (e.g. `"v16.1"`). Available versions can be found in the [synthesis manifest](https://raw.githubusercontent.com/OpenTreeOfLife/opentree/master/webapp/static/statistics/synthesis.json). The OpenTree tree and taxonomy will be downloaded automatically by the `download_opentree` pipeline stage. -2. Download the Wikipedia SQL dump into `data/` as [documented here](data/README.markdown), then register it with DVC: +2. Some source files are unversioned so will use cached results unless forced. 
To force re-download them all with the latest upstream data: ```bash - dvc add data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + dvc repro --force download_eol download_wikipedia_sql download_and_filter_wikidata download_and_filter_pageviews ``` - The other source files (OpenTree, EOL, Wikidata dump, pageviews) are downloaded automatically by pipeline stages. To force re-download all of them: - - ```bash - dvc repro --force download_opentree download_eol download_and_filter_wikidata download_and_filter_pageviews - ``` +Note that download_and_filter_wikidata and download_and_filter_pageviews take several hours to run. 3. Run the pipeline and push results to the shared cache: @@ -77,16 +75,7 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If dvc push ``` -4. Commit the `.dvc` files and `dvc.lock` to git. - -### Pipeline stages - -The pipeline is defined in `dvc.yaml`. Use `dvc dag` to visualize the DAG. Key stages include: +4. Commit `dvc.lock` to git. -- **download_opentree**, **download_eol** -- download OpenTree and EOL source files -- **add_ott_numbers**, **prepare_open_trees**, **build_tree** -- assemble the full newick tree -- **filter_eol**, **download_and_filter_wikidata**, **filter_sql**, **download_and_filter_pageviews** -- download/filter massive source files (parallelizable) -- **create_tables** -- map taxa, calculate popularity, produce DB-ready CSVs -- **make_js** -- generate JS viewer files For detailed step-by-step documentation, see [oz_tree_build/README.markdown](oz_tree_build/README.markdown). diff --git a/data/README.markdown b/data/README.markdown index e901e95..bf5af44 100755 --- a/data/README.markdown +++ b/data/README.markdown @@ -2,13 +2,10 @@ To build a tree, you will first need various data files from the internet. These are not provided by OneZoom directly as they are (a) very large and (b) regularly updated. -Most source files are downloaded automatically by pipeline stages. 
The only file that must be downloaded manually is: - -* Wikipedia SQL dump: `Wiki/wp_SQL/enwiki-latest-page.sql.gz` (see [Wiki/README.markdown](Wiki/README.markdown)) - -The following are handled by DVC pipeline stages: +All source files are downloaded automatically by DVC pipeline stages: * **Open Tree of Life** files, downloaded by the `download_opentree` stage into `OpenTree//` (see [OpenTree/README.markdown](OpenTree/README.markdown)) * **EOL provider IDs**, downloaded by the `download_eol` stage into `EOL/provider_ids.csv.gz` +* **Wikipedia SQL dump**, downloaded by the `download_wikipedia_sql` stage into `Wiki/wp_SQL/enwiki-latest-page.sql.gz` (see [Wiki/README.markdown](Wiki/README.markdown)) * **Wikidata JSON dump**, streamed and filtered by the `download_and_filter_wikidata` stage (see [Wiki/README.markdown](Wiki/README.markdown)) * **Wikipedia pageviews**, streamed and filtered by the `download_and_filter_pageviews` stage (see [Wiki/README.markdown](Wiki/README.markdown)) diff --git a/data/Wiki/README.markdown b/data/Wiki/README.markdown index 4ad97bb..1f7ba95 100755 --- a/data/Wiki/README.markdown +++ b/data/Wiki/README.markdown @@ -1,12 +1,10 @@ -To allow mappings to wikipedia and popularity calculations, the following file -must be downloaded manually into its directory (NB: this could be a symlink to -a version on external storage): +To allow mappings to wikipedia and popularity calculations, the following +files are downloaded and filtered automatically by pipeline stages: -* The `wp_SQL` directory should contain the en.wikipedia SQL dump file, as `enwiki-latest-page.sql.gz` -(download from ) - -The Wikidata dump and Wikipedia pageview files are downloaded and filtered -automatically by pipeline stages: +* **`download_wikipedia_sql`** downloads the en.wikipedia SQL dump + (`enwiki-latest-page.sql.gz`, ~2 GB) from + . To re-download the latest + version, run `dvc repro --force download_wikipedia_sql`. 
* **`download_and_filter_wikidata`** streams the full Wikidata JSON dump (`latest-all.json.bz2`, ~90 GB) from diff --git a/dvc.lock b/dvc.lock index c13d610..dd45f6e 100644 --- a/dvc.lock +++ b/dvc.lock @@ -1,13 +1,7 @@ schema: '2.0' stages: - filter_wikidata: - cmd: filter_wikidata data/Wiki/wd_JSON/latest-all.json.bz2 -o - data/filtered/OneZoom_latest-all.json - deps: - - path: data/Wiki/wd_JSON/latest-all.json.bz2 - hash: md5 - md5: 9d2253e0b9e629c8f77eadd8f09c1d88 - size: 100346119273 + download_and_filter_wikidata: + cmd: download_and_filter_wikidata -o data/filtered/OneZoom_latest-all.json outs: - path: data/filtered/OneZoom_latest-all.json hash: md5 @@ -26,6 +20,14 @@ stages: hash: md5 md5: e498b85311c8a84a0d5157a0bbbcb23f size: 9382189 + download_wikipedia_sql: + cmd: wget --progress=bar:force -O data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz + outs: + - path: data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + hash: md5 + md5: 85108d569d644c3d34d8b930597e42dc + size: 2384405614 download_and_filter_pageviews: cmd: download_and_filter_pageviews --titles-file data/filtered/wikidata_titles.txt --months 12 -o data/filtered/pageviews diff --git a/dvc.yaml b/dvc.yaml index 5a6740d..e0320d1 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -9,6 +9,7 @@ stages: outs: - data/OpenTree/${ot_version}/ + # ~20 secs add_ott_numbers: cmd: >- rm -rf data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} && @@ -24,6 +25,7 @@ stages: outs: - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version}/ + # ~a few secs prepare_open_trees: cmd: >- mkdir -p data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all && @@ -81,6 +83,7 @@ stages: outs: - data/filtered/OneZoom_provider_ids.csv + # >6 hours (streams ~90 GB dump from wikidata server) download_and_filter_wikidata: cmd: >- download_and_filter_wikidata @@ -98,6 +101,13 @@ stages: outs: - data/filtered/wikidata_titles.txt + download_wikipedia_sql: + cmd: >- + 
wget --progress=bar:force -O data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz + outs: + - data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + filter_sql: cmd: >- filter_wikipedia_sql @@ -110,6 +120,7 @@ stages: outs: - data/filtered/OneZoom_enwiki-latest-page.sql + # ~several hours (streams 12 ~5GB monthly dumps) download_and_filter_pageviews: cmd: >- download_and_filter_pageviews @@ -122,6 +133,7 @@ stages: - data/filtered/pageviews/: persist: true + # ~10 mins create_tables: cmd: >- mkdir -p data/output_files && diff --git a/oz_tree_build/README.markdown b/oz_tree_build/README.markdown index 439edcf..dc1572d 100755 --- a/oz_tree_build/README.markdown +++ b/oz_tree_build/README.markdown @@ -27,7 +27,7 @@ To run only up to a specific stage (e.g. just the JS generation): dvc repro make_js ``` -To visualize the pipeline DAG: +To visualize the pipeline graph: ```bash dvc dag From f5ba6b4b1f0bb427f78521c0983f31ce593a614f Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Sat, 28 Mar 2026 11:28:20 +0000 Subject: [PATCH 10/19] Remove old pageviews --- .../utilities/download_and_filter_pageviews.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/oz_tree_build/utilities/download_and_filter_pageviews.py b/oz_tree_build/utilities/download_and_filter_pageviews.py index 031fa45..bfb43e0 100644 --- a/oz_tree_build/utilities/download_and_filter_pageviews.py +++ b/oz_tree_build/utilities/download_and_filter_pageviews.py @@ -180,8 +180,11 @@ def main(): f"Selected {len(selected)} most recent months" ) + expected_filenames = set() for i, (url, filename) in enumerate(selected, 1): - output_file = os.path.join(args.output_dir, _output_filename(filename)) + output_name = _output_filename(filename) + expected_filenames.add(output_name) + output_file = os.path.join(args.output_dir, output_name) if os.path.exists(output_file): logging.info(f"[{i}/{len(selected)}] Skipping {filename} (already filtered)") 
@@ -191,6 +194,13 @@ def main(): stream_and_filter(url, output_file, wikidata_titles, wikilang=args.wikilang) logging.info(f"[{i}/{len(selected)}] Done: {output_file}") + expected_filenames.add(TITLES_HASH_FILE) + for name in os.listdir(args.output_dir): + if name not in expected_filenames: + stale_path = os.path.join(args.output_dir, name) + logging.info(f"Removing old pageview file outside window: {name}") + os.remove(stale_path) + logging.info("All pageview months up to date") From 6e08b44dd41f9e50994098e2c5ea917b67926e7e Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Sat, 28 Mar 2026 13:02:37 +0000 Subject: [PATCH 11/19] Add remote storage for dvc --- .dvc/config | 5 +++++ README.markdown | 18 +++++++++--------- pyproject.toml | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.dvc/config b/.dvc/config index e69de29..84e7938 100644 --- a/.dvc/config +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + remote = jared-r2 +['remote "jared-r2"'] + url = s3://onezoom + endpointurl = https://9d168184d3ac384b6a159313dd90a75a.r2.cloudflarestorage.com diff --git a/README.markdown b/README.markdown index a24bcb0..569e7c6 100644 --- a/README.markdown +++ b/README.markdown @@ -47,7 +47,8 @@ This project uses [DVC](https://dvc.org/) to manage the pipeline. The build para ### Quick start (using cached outputs) -If someone has already run the pipeline and pushed the results to the DVC remote, you can reproduce the build without downloading any of the massive source files: +You'll need to ask for the DVC remote credentials on the OneZoom Slack channel in order to pull cached results. +Then, if someone has already run the pipeline and pushed the results to the DVC remote, you can reproduce the build and any of the intermediate stages without downloading any of the massive source files: ```bash source .venv/bin/activate @@ -62,20 +63,19 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If 2. 
Some source files are unversioned so will use cached results unless forced. To force re-download them all with the latest upstream data: - ```bash - dvc repro --force download_eol download_wikipedia_sql download_and_filter_wikidata download_and_filter_pageviews - ``` + ```bash + dvc repro --force download_eol download_wikipedia_sql download_and_filter_wikidata download_and_filter_pageviews + ``` Note that download_and_filter_wikidata and download_and_filter_pageviews take several hours to run. 3. Run the pipeline and push results to the shared cache: - ```bash - dvc repro - dvc push - ``` + ```bash + dvc repro + dvc push + ``` 4. Commit `dvc.lock` to git. - For detailed step-by-step documentation, see [oz_tree_build/README.markdown](oz_tree_build/README.markdown). diff --git a/pyproject.toml b/pyproject.toml index 43143f2..7b94d8b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies = [ "cryptography>=42.0", "mwparserfromhell>=0.6.6", "requests-cache>=1.2.1", - "dvc>=3.0", + "dvc[s3]>=3.0", ] [project.optional-dependencies] From 0f381733a5191bd037da6f9c63cb263285a3a4b7 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Sun, 29 Mar 2026 20:18:37 +0100 Subject: [PATCH 12/19] Remove js_outputs dir --- data/js_outputs/.gitignore | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 data/js_outputs/.gitignore diff --git a/data/js_outputs/.gitignore b/data/js_outputs/.gitignore deleted file mode 100644 index 4ba3b80..0000000 --- a/data/js_outputs/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -# Ignore everything -* - -# But not these files... 
-!.gitignore -!*.dvc \ No newline at end of file From ce09ebade0434a7b3526e7ca8ce849c83da26a04 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Fri, 3 Apr 2026 10:17:03 +0100 Subject: [PATCH 13/19] Capture the wiki data dump version --- README.markdown | 2 +- dvc.lock | 17 +++++- dvc.yaml | 9 +++ .../utilities/download_and_filter_wikidata.py | 57 +++++++++++++++++-- pyproject.toml | 1 + 5 files changed, 79 insertions(+), 7 deletions(-) diff --git a/README.markdown b/README.markdown index 569e7c6..5e5aec6 100644 --- a/README.markdown +++ b/README.markdown @@ -64,7 +64,7 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If 2. Some source files are unversioned so will use cached results unless forced. To force re-download them all with the latest upstream data: ```bash - dvc repro --force download_eol download_wikipedia_sql download_and_filter_wikidata download_and_filter_pageviews + dvc repro --force download_eol download_wikipedia_sql discover_wikidata_url download_and_filter_wikidata download_and_filter_pageviews ``` Note that download_and_filter_wikidata and download_and_filter_pageviews take several hours to run. 
diff --git a/dvc.lock b/dvc.lock index dd45f6e..58e262c 100644 --- a/dvc.lock +++ b/dvc.lock @@ -1,7 +1,14 @@ schema: '2.0' stages: download_and_filter_wikidata: - cmd: download_and_filter_wikidata -o data/filtered/OneZoom_latest-all.json + cmd: download_and_filter_wikidata --url "$(cat + data/Wiki/wd_JSON/latest-all-json-bz2-url.txt)" -o + data/filtered/OneZoom_latest-all.json + deps: + - path: data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + hash: md5 + md5: e094b0f57c0c14e1016842c2dac5482e + size: 90 outs: - path: data/filtered/OneZoom_latest-all.json hash: md5 @@ -245,3 +252,11 @@ stages: md5: 77ff7fae56095a0cdcec84e70b895b28.dir size: 8293094 nfiles: 6 + discover_wikidata_url: + cmd: discover_latest_wikidata_dump_url > + data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + outs: + - path: data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + hash: md5 + md5: e094b0f57c0c14e1016842c2dac5482e + size: 90 diff --git a/dvc.yaml b/dvc.yaml index e0320d1..70b2749 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -83,11 +83,20 @@ stages: outs: - data/filtered/OneZoom_provider_ids.csv + discover_wikidata_url: + cmd: >- + discover_latest_wikidata_dump_url > data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + outs: + - data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + # >6 hours (streams ~90 GB dump from wikidata server) download_and_filter_wikidata: cmd: >- download_and_filter_wikidata + --url "$(cat data/Wiki/wd_JSON/latest-all-json-bz2-url.txt)" -o data/filtered/OneZoom_latest-all.json + deps: + - data/Wiki/wd_JSON/latest-all-json-bz2-url.txt outs: - data/filtered/OneZoom_latest-all.json diff --git a/oz_tree_build/utilities/download_and_filter_wikidata.py b/oz_tree_build/utilities/download_and_filter_wikidata.py index 9319072..510cd4d 100644 --- a/oz_tree_build/utilities/download_and_filter_wikidata.py +++ b/oz_tree_build/utilities/download_and_filter_wikidata.py @@ -7,18 +7,60 @@ import argparse import logging +import re import os import sys import tempfile +import urllib.request from 
.file_utils import stream_bz2_lines_from_url from .filter_wikidata import filter_wikidata -WIKIDATA_DUMP_URL = ( - "https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2" -) +WIKIDATA_ENTITIES_URL = "https://dumps.wikimedia.org/wikidatawiki/entities/" WGET_READ_TIMEOUT = 600 +logger = logging.getLogger(__name__) + + +def discover_latest_wikidata_dump_url( + base_url=WIKIDATA_ENTITIES_URL, timeout=30 +): + """Find the URL of the most recent dated wikidata-YYYYMMDD-all.json.bz2 dump. + We don't use the symlinked latest-all.json.bz2 file because we want to know the date.""" + folder_re = re.compile(r'href="(\d{8})/"') + file_re_template = r'href="(wikidata-{date}-all\.json\.bz2)"' + + index_html = urllib.request.urlopen( + base_url, timeout=timeout + ).read().decode() + + dates = sorted(folder_re.findall(index_html), reverse=True) + if not dates: + raise RuntimeError(f"No dated folders found at {base_url}") + + for date in dates: + folder_url = f"{base_url}{date}/" + logger.info("Checking %s", folder_url) + try: + folder_html = urllib.request.urlopen( + folder_url, timeout=timeout + ).read().decode() + except urllib.error.URLError as exc: + logger.warning("Could not fetch %s: %s", folder_url, exc) + continue + + match = re.search( + file_re_template.format(date=date), folder_html + ) + if match: + url = f"{folder_url}{match.group(1)}" + logger.info("Found latest dump: %s", url) + return url + + raise RuntimeError( + f"No wikidata-YYYYMMDD-all.json.bz2 file found in any folder at {base_url}" + ) + def stream_and_filter(url, output_path, wikilang="en", dont_trim_sitelinks=False): """ @@ -57,8 +99,7 @@ def main(): parser.add_argument("--wikilang", default="en", help="Wikipedia language code") parser.add_argument( "--url", - default=WIKIDATA_DUMP_URL, - help="URL of the Wikidata JSON dump (.bz2)", + required=True, ) parser.add_argument( "--dont-trim-sitelinks", @@ -80,5 +121,11 @@ def main(): logging.info(f"Done: {args.output}") +def discover_main(): + 
"""CLI entry point: discover the latest wikidata dump URL.""" + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + url = discover_latest_wikidata_dump_url() + print(url) + if __name__ == "__main__": main() diff --git a/pyproject.toml b/pyproject.toml index 7b94d8b..a298ea6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" download_and_filter_pageviews = "oz_tree_build.utilities.download_and_filter_pageviews:main" download_and_filter_wikidata = "oz_tree_build.utilities.download_and_filter_wikidata:main" +discover_latest_wikidata_dump_url = "oz_tree_build.utilities.download_and_filter_wikidata:discover_main" download_opentree = "oz_tree_build.utilities.download_opentree:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" get_wiki_images = "oz_tree_build.images_and_vernaculars.get_wiki_images:main" From 0e02006d248159e163e2d6718969b644afd8d139 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Fri, 3 Apr 2026 12:14:54 +0100 Subject: [PATCH 14/19] Discover dated wiki sql URL --- README.markdown | 2 +- data/README.markdown | 12 +- data/Wiki/README.markdown | 14 +- dvc.lock | 49 +++--- dvc.yaml | 18 ++- oz_tree_build/README.markdown | 145 +++++++++--------- .../CSV_base_table_creator.py | 2 +- .../utilities/download_and_filter_wikidata.py | 2 +- .../utilities/filter_wikipedia_sql.py | 62 ++++++++ .../utilities/generate_filtered_files.py | 2 +- pyproject.toml | 1 + 11 files changed, 200 insertions(+), 109 deletions(-) diff --git a/README.markdown b/README.markdown index 5e5aec6..a565ac0 100644 --- a/README.markdown +++ b/README.markdown @@ -64,7 +64,7 @@ DVC will pull only the cached outputs needed for stages that haven't changed. If 2. Some source files are unversioned so will use cached results unless forced. 
To force re-download them all with the latest upstream data: ```bash - dvc repro --force download_eol download_wikipedia_sql discover_wikidata_url download_and_filter_wikidata download_and_filter_pageviews + dvc repro --force download_eol discover_enwiki_sql_url download_wikipedia_sql discover_wikidata_url download_and_filter_wikidata download_and_filter_pageviews ``` Note that download_and_filter_wikidata and download_and_filter_pageviews take several hours to run. diff --git a/data/README.markdown b/data/README.markdown index bf5af44..3479ab1 100755 --- a/data/README.markdown +++ b/data/README.markdown @@ -1,11 +1,11 @@ # Downloading required data files - + To build a tree, you will first need various data files from the internet. These are not provided by OneZoom directly as they are (a) very large and (b) regularly updated. All source files are downloaded automatically by DVC pipeline stages: -* **Open Tree of Life** files, downloaded by the `download_opentree` stage into `OpenTree//` (see [OpenTree/README.markdown](OpenTree/README.markdown)) -* **EOL provider IDs**, downloaded by the `download_eol` stage into `EOL/provider_ids.csv.gz` -* **Wikipedia SQL dump**, downloaded by the `download_wikipedia_sql` stage into `Wiki/wp_SQL/enwiki-latest-page.sql.gz` (see [Wiki/README.markdown](Wiki/README.markdown)) -* **Wikidata JSON dump**, streamed and filtered by the `download_and_filter_wikidata` stage (see [Wiki/README.markdown](Wiki/README.markdown)) -* **Wikipedia pageviews**, streamed and filtered by the `download_and_filter_pageviews` stage (see [Wiki/README.markdown](Wiki/README.markdown)) +- **Open Tree of Life** files, downloaded by the `download_opentree` stage into `OpenTree//` (see [OpenTree/README.markdown](OpenTree/README.markdown)) +- **EOL provider IDs**, downloaded by the `download_eol` stage into `EOL/provider_ids.csv.gz` +- **Wikipedia SQL dump**, downloaded by the `download_wikipedia_sql` stage into `Wiki/wp_SQL/enwiki-page.sql.gz` (see 
[Wiki/README.markdown](Wiki/README.markdown)) +- **Wikidata JSON dump**, streamed and filtered by the `download_and_filter_wikidata` stage (see [Wiki/README.markdown](Wiki/README.markdown)) +- **Wikipedia pageviews**, streamed and filtered by the `download_and_filter_pageviews` stage (see [Wiki/README.markdown](Wiki/README.markdown)) diff --git a/data/Wiki/README.markdown b/data/Wiki/README.markdown index 1f7ba95..3e9d45a 100755 --- a/data/Wiki/README.markdown +++ b/data/Wiki/README.markdown @@ -1,18 +1,18 @@ To allow mappings to wikipedia and popularity calculations, the following files are downloaded and filtered automatically by pipeline stages: -* **`download_wikipedia_sql`** downloads the en.wikipedia SQL dump - (`enwiki-latest-page.sql.gz`, ~2 GB) from - . To re-download the latest - version, run `dvc repro --force download_wikipedia_sql`. +- **`download_wikipedia_sql`** downloads the en.wikipedia SQL dump + (`enwiki-page.sql.gz`, ~2 GB) from + . To re-download the latest + version, run `dvc repro --force discover_enwiki_sql_url download_wikipedia_sql`. -* **`download_and_filter_wikidata`** streams the full Wikidata JSON dump +- **`download_and_filter_wikidata`** streams the full Wikidata JSON dump (`latest-all.json.bz2`, ~90 GB) from , filters it on the fly, and writes only the small filtered output. To re-download with a fresh dump, - run `dvc repro --force download_and_filter_wikidata`. + run `dvc repro --force discover_wikidata_url download_and_filter_wikidata`. -* **`download_and_filter_pageviews`** streams monthly `-user` dumps from +- **`download_and_filter_pageviews`** streams monthly `-user` dumps from , filters them against the wikidata titles, and caches the small filtered outputs. 
Only the most recent N months (configured via `--months` in the DVC stage) are diff --git a/dvc.lock b/dvc.lock index 58e262c..ae835e5 100644 --- a/dvc.lock +++ b/dvc.lock @@ -28,13 +28,18 @@ stages: md5: e498b85311c8a84a0d5157a0bbbcb23f size: 9382189 download_wikipedia_sql: - cmd: wget --progress=bar:force -O data/Wiki/wp_SQL/enwiki-latest-page.sql.gz - https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz + cmd: wget --progress=bar:force -O data/Wiki/wp_SQL/enwiki-page.sql.gz "$(cat + data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt)" + deps: + - path: data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + hash: md5 + md5: 1018a5c664d01747fdd7e218190cb4ac + size: 72 outs: - - path: data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + - path: data/Wiki/wp_SQL/enwiki-page.sql.gz hash: md5 - md5: 85108d569d644c3d34d8b930597e42dc - size: 2384405614 + md5: 541324eaa9f3f1a14bb6ddcf7ea95de6 + size: 2397370114 download_and_filter_pageviews: cmd: download_and_filter_pageviews --titles-file data/filtered/wikidata_titles.txt --months 12 -o data/filtered/pageviews @@ -50,14 +55,14 @@ stages: size: 104535557 nfiles: 13 filter_sql: - cmd: filter_wikipedia_sql data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + cmd: filter_wikipedia_sql data/Wiki/wp_SQL/enwiki-page.sql.gz data/filtered/wikidata_titles.txt -o data/filtered/OneZoom_enwiki-latest-page.sql deps: - - path: data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + - path: data/Wiki/wp_SQL/enwiki-page.sql.gz hash: md5 - md5: 85108d569d644c3d34d8b930597e42dc - size: 2384405614 + md5: 541324eaa9f3f1a14bb6ddcf7ea95de6 + size: 2397370114 - path: data/filtered/wikidata_titles.txt hash: md5 md5: e498b85311c8a84a0d5157a0bbbcb23f @@ -65,8 +70,8 @@ stages: outs: - path: data/filtered/OneZoom_enwiki-latest-page.sql hash: md5 - md5: ac10389b87372eff1d276802dbb345a5 - size: 21411110 + md5: af577077774060f78a752f22e2f1c6d3 + size: 21445833 download_opentree: cmd: download_opentree --version v16.1 --output-dir data/OpenTree params: @@ -211,8 +216,8 @@ stages: 
size: 417054016 - path: data/filtered/OneZoom_enwiki-latest-page.sql hash: md5 - md5: ac10389b87372eff1d276802dbb345a5 - size: 21411110 + md5: af577077774060f78a752f22e2f1c6d3 + size: 21445833 - path: data/filtered/OneZoom_latest-all.json hash: md5 md5: 65b22a5c761c78d79b30faf871d1e404 @@ -235,21 +240,21 @@ stages: outs: - path: data/output_files/ hash: md5 - md5: babe7302afec6bc868d2889963796b74.dir - size: 1182737679 + md5: a9e258bc6560d47e8d4af723eee707b6.dir + size: 1182677609 nfiles: 8 make_js: cmd: mkdir -p data/js_output && make_js_treefiles --outdir data/js_output deps: - path: data/output_files/ hash: md5 - md5: babe7302afec6bc868d2889963796b74.dir - size: 1182737679 + md5: a9e258bc6560d47e8d4af723eee707b6.dir + size: 1182677609 nfiles: 8 outs: - path: data/js_output/ hash: md5 - md5: 77ff7fae56095a0cdcec84e70b895b28.dir + md5: 35cc8189813c4523903d33c9acce3b69.dir size: 8293094 nfiles: 6 discover_wikidata_url: @@ -260,3 +265,11 @@ stages: hash: md5 md5: e094b0f57c0c14e1016842c2dac5482e size: 90 + discover_enwiki_sql_url: + cmd: discover_latest_enwiki_sql_url > + data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + outs: + - path: data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + hash: md5 + md5: 1018a5c664d01747fdd7e218190cb4ac + size: 72 diff --git a/dvc.yaml b/dvc.yaml index 70b2749..aee0a74 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -110,21 +110,29 @@ stages: outs: - data/filtered/wikidata_titles.txt + discover_enwiki_sql_url: + cmd: >- + discover_latest_enwiki_sql_url > data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + outs: + - data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + download_wikipedia_sql: cmd: >- - wget --progress=bar:force -O data/Wiki/wp_SQL/enwiki-latest-page.sql.gz - https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz + wget --progress=bar:force -O data/Wiki/wp_SQL/enwiki-page.sql.gz + "$(cat data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt)" + deps: + - data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt outs: - - 
data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + - data/Wiki/wp_SQL/enwiki-page.sql.gz filter_sql: cmd: >- filter_wikipedia_sql - data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + data/Wiki/wp_SQL/enwiki-page.sql.gz data/filtered/wikidata_titles.txt -o data/filtered/OneZoom_enwiki-latest-page.sql deps: - - data/Wiki/wp_SQL/enwiki-latest-page.sql.gz + - data/Wiki/wp_SQL/enwiki-page.sql.gz - data/filtered/wikidata_titles.txt outs: - data/filtered/OneZoom_enwiki-latest-page.sql diff --git a/oz_tree_build/README.markdown b/oz_tree_build/README.markdown index dc1572d..01fbdb2 100755 --- a/oz_tree_build/README.markdown +++ b/oz_tree_build/README.markdown @@ -114,94 +114,97 @@ If you already have your own newick tree with open tree ids on it already, and d 0. The following steps assume the venv has been activated: - ``` - . .venv/bin/activate - ``` + ``` + . .venv/bin/activate + ``` - If not created, see installation steps in the [main README](../README.markdown). + If not created, see installation steps in the [main README](../README.markdown). 1. (20 secs) Use the [OpenTree API](https://github.com/OpenTreeOfLife/germinator/wiki/Synthetic-tree-API-v3) to add OTT ids to any non-opentree taxa in our own bespoke phylogenies (those in `*.phy` or `*.PHY` files). 
The new `.phy` and `.PHY` files will be created in a new directory within `data/OZTreeBuild/${OZ_TREE}/BespokeTree`, and a symlink to that directory will be created called `include_files` - ``` - mkdir -p "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}" - touch "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/dir" - rm data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/* && \ - add_ott_numbers_to_trees \ - --savein data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA} \ - data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] - ``` + ``` + mkdir -p "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}" + touch "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/dir" + rm data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/* && \ + add_ott_numbers_to_trees \ + --savein data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA} \ + data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] + ``` 1. Copy supplementary OpenTree-like newick files (if any) to the `OpenTree_all` directory. These are clades referenced in the OneZoom phylogeny that are missing from the OpenTree, and whose subtrees thus need to be supplied by hand. If any are required, they should be placed in the `OT_required` directory within `data/OZTreeBuild/${OZ_TREE}`. For tree building, they should be copied into the directory containing OpenTree subtrees using - ``` - (cd data/OZTreeBuild/${OZ_TREE}/OpenTreeParts && \ - cp -n OT_required/*.nwk OpenTree_all/) - ``` - If you do not have any supplementary `.nwk` subtrees in the `OT_required` directory, this step will output a warning, which can be ignored. 
+ ``` + (cd data/OZTreeBuild/${OZ_TREE}/OpenTreeParts && \ + cp -n OT_required/*.nwk OpenTree_all/) + ``` + + If you do not have any supplementary `.nwk` subtrees in the `OT_required` directory, this step will output a warning, which can be ignored. 1. (a few secs) Construct OpenTree subtrees for inclusion from the `draftversion${OT_VERSION}.tre` file. The subtrees to be extracted are specified by inclusion strings in the `.PHY` files created in step 1. The command for this is `getOpenTreesFromOneZoom.py`, and it needs to be run from within the `data/OZTreeBuild/${OZ_TREE}` directory, as follows: - ``` - (cd data/OZTreeBuild/${OZ_TREE} && get_open_trees_from_one_zoom \ - ../../OpenTree/draftversion${OT_VERSION}.tre OpenTreeParts/OpenTree_all/ \ - BespokeTree/include_files/*.PHY) - ``` - If you are not including any OpenTree subtrees in your final tree, you should have no `.PHY` files, and this step will output a warning, which can be ignored. + ``` + (cd data/OZTreeBuild/${OZ_TREE} && get_open_trees_from_one_zoom \ + ../../OpenTree/draftversion${OT_VERSION}.tre OpenTreeParts/OpenTree_all/ \ + BespokeTree/include_files/*.PHY) + ``` + + If you are not including any OpenTree subtrees in your final tree, you should have no `.PHY` files, and this step will output a warning, which can be ignored. 1. 
(1 sec) substitute these subtrees into the main tree, and save the resulting full newick file using the `build_oz_tree` script: - ``` - (cd data/OZTreeBuild/${OZ_TREE} && \ - build_oz_tree BespokeTree/include_files/Base.PHY OpenTreeParts/OpenTree_all/ AllLife_full_tree.phy) - ``` + ``` + (cd data/OZTreeBuild/${OZ_TREE} && \ + build_oz_tree BespokeTree/include_files/Base.PHY OpenTreeParts/OpenTree_all/ AllLife_full_tree.phy) + ``` - Now that we are not having to run this every sponsorship time, we should probably re-write this to actually know what tree structure looks like, maybe using Python/DendroPy (see https://github.com/jrosindell/OneZoomComplete/issues/340) and also to automatically create the list of DOIs at `${OZ_DIR}/static/FinalOutputs/refs.txt`. Note that any '@' signs in the `${OZ_TREE}_full_tree.phy` output file are indicative of OpenTree substitutions that have not been possible: it would be good to check to see if there are other sources (or old OpenTree versions) that have trees for these nodes, and place them as .phy files in `data/OZTreeBuild/${OZ_TREE}/OpenTreeParts/OT_required/`. You can check with + Now that we are not having to run this every sponsorship time, we should probably re-write this to actually know what tree structure looks like, maybe using Python/DendroPy (see https://github.com/jrosindell/OneZoomComplete/issues/340) and also to automatically create the list of DOIs at `${OZ_DIR}/static/FinalOutputs/refs.txt`. Note that any '@' signs in the `${OZ_TREE}_full_tree.phy` output file are indicative of OpenTree substitutions that have not been possible: it would be good to check to see if there are other sources (or old OpenTree versions) that have trees for these nodes, and place them as .phy files in `data/OZTreeBuild/${OZ_TREE}/OpenTreeParts/OT_required/`. 
You can check with - ``` - grep -o '.............@' data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy - ``` - You may also want to save a zipped version of the full tree file in a place where users can download it for reference purposes, in which case you can do + ``` + grep -o '.............@' data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy + ``` - ``` - gzip < data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy > ${OZ_DIR}/static/FinalOutputs/${OZ_TREE}_full_tree.phy.gz - ``` + You may also want to save a zipped version of the full tree file in a place where users can download it for reference purposes, in which case you can do + + ``` + gzip < data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy > ${OZ_DIR}/static/FinalOutputs/${OZ_TREE}_full_tree.phy.gz + ``` ### Create the base tree and table data 5. (5 to 7 hours, or a few mins if files are already filtered) This generates filtered versions of the raw input files, which then makes them faster to work with. In the DVC pipeline, this is handled by the `filter_eol`, `filter_wikidata`, `filter_sql`, and `filter_pageviews` stages, which run as separate parallel stages. Without DVC, the `generate_filtered_files` script can still be used to run them all together: - ``` - tar -C data/OpenTree -zxvf data/OpenTree/ott${OT_TAXONOMY_VERSION}.tgz - (cd data && generate_filtered_files OZTreeBuild/AllLife/AllLife_full_tree.phy OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv EOL/provider_ids.csv.gz Wiki/wd_JSON/latest-all.json.bz2 Wiki/wp_SQL/enwiki-latest-page.sql.gz Wiki/wp_pagecounts/pageviews*.bz2) - ``` + ``` + tar -C data/OpenTree -zxvf data/OpenTree/ott${OT_TAXONOMY_VERSION}.tgz + (cd data && generate_filtered_files OZTreeBuild/AllLife/AllLife_full_tree.phy OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv EOL/provider_ids.csv.gz Wiki/wd_JSON/latest-all.json.bz2 Wiki/wp_SQL/enwiki-page.sql.gz Wiki/wp_pagecounts/pageviews*.bz2) + ``` 1. 
(11 mins) On the basis of the `${OZ_TREE}_full_tree.phy` file, look for ID mappings between different datasets, calculate popularity measures via wikidata/pedia, refine the tree (remove subspecies, randomly break polytomies, remove unifurcations etc), and then create corresponding database tables together with `ordered_tree_XXXXX.nwk`, `ordered_tree_XXXXX.poly` (same file but with polytomies marked with curly braces), and `ordered_dates_XXXXX.js` files (where XXXXX is the version number, usually a timestamp). - Additional flags can be given to override the OpenTree taxonomy in specific cases (using `--extra_source_file`), and to exclude certain taxa (e.g. dinosaurs) from the popularity calculations. + Additional flags can be given to override the OpenTree taxonomy in specific cases (using `--extra_source_file`), and to exclude certain taxa (e.g. dinosaurs) from the popularity calculations. - If you do not have comprehensive tree of a clade, it probably doesn't make sense to calculate popularity measures, and you can run this script with the `-p` flag (or omit the references to the `wp_` wikipedia files). + If you do not have comprehensive tree of a clade, it probably doesn't make sense to calculate popularity measures, and you can run this script with the `-p` flag (or omit the references to the `wp_` wikipedia files). 
- ``` - CSV_base_table_creator \ - data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy \ - data/OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv \ - data/EOL/OneZoom_provider_ids.csv \ - data/Wiki/wd_JSON/OneZoom_latest-all.json \ - data/Wiki/wp_SQL/OneZoom_enwiki-latest-page.sql \ - data/Wiki/wp_pagecounts/OneZoom_pageviews* \ - -o data/output_files -v \ - --exclude Archosauria_ott335588 Dinosauria_ott90215 \ - --extra_source_file data/OZTreeBuild/${OZ_TREE}/BespokeTree/SupplementaryTaxonomy.tsv \ - 2> data/output_files/ordered_output.log - ``` + ``` + CSV_base_table_creator \ + data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy \ + data/OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv \ + data/EOL/OneZoom_provider_ids.csv \ + data/Wiki/wd_JSON/OneZoom_latest-all.json \ + data/Wiki/wp_SQL/OneZoom_enwiki-latest-page.sql \ + data/Wiki/wp_pagecounts/OneZoom_pageviews* \ + -o data/output_files -v \ + --exclude Archosauria_ott335588 Dinosauria_ott90215 \ + --extra_source_file data/OZTreeBuild/${OZ_TREE}/BespokeTree/SupplementaryTaxonomy.tsv \ + 2> data/output_files/ordered_output.log + ``` - Since round braces, curly braces, and commas are banned from the `simplified_ottnames` file, we can create minimal topology files by simply removing everything except these characters from the `.nwk` and `.poly` files. If the tree has been ladderised, with polytomies and unifurcations removed, the commas are also redundant, and can be removed. This is done in the next step, which saves these highly shortened strings into .js data files. + Since round braces, curly braces, and commas are banned from the `simplified_ottnames` file, we can create minimal topology files by simply removing everything except these characters from the `.nwk` and `.poly` files. If the tree has been ladderised, with polytomies and unifurcations removed, the commas are also redundant, and can be removed. This is done in the next step, which saves these highly shortened strings into .js data files. 1. 
(1 min) Turn the most recently saved tree files (saved in the previous step as `data/output_files/ordered_tree_XXXXXX.poly` and `ordered_dates_XXXXXX.json`) into bracketed newick JS files. In the DVC pipeline, these are output to `data/js_output/` and can be copied to the OZtree repo. Without DVC, you can write directly to the OZtree directory: - ``` - make_js_treefiles --outdir ${OZ_DIR}/static/FinalOutputs/data - ``` + ``` + make_js_treefiles --outdir ${OZ_DIR}/static/FinalOutputs/data + ``` ### Upload data to the server and check it @@ -211,26 +214,30 @@ If you already have your own newick tree with open tree ids on it already, and d ``` mysql --local-infile --host db.MYSERVER.net --user onezoom --password --database onezoom_dev ``` + 1. Check for dups, and if any sponsors are no longer on the tree, using something like the following SQL command: - ``` - select * from reservations left outer join ordered_leaves on reservations.OTT_ID = ordered_leaves.ott where ordered_leaves.ott is null and reservations.verified_name IS NOT NULL; - select group_concat(id), group_concat(parent), group_concat(name), count(ott) from ordered_leaves group by ott having(count(ott) > 1) - ``` + ``` + select * from reservations left outer join ordered_leaves on reservations.OTT_ID = ordered_leaves.ott where ordered_leaves.ott is null and reservations.verified_name IS NOT NULL; + select group_concat(id), group_concat(parent), group_concat(name), count(ott) from ordered_leaves group by ott having(count(ott) > 1) + ``` ### Fill in additional server fields 11. (15 mins) create example pictures for each node by percolating up. This requires the most recent `images_by_ott` table, so either do this on the main server, or (if you are doing it locally) update your `images_by_ott` to the most recent server version. - ``` - ${OZ_DIR}/OZprivate/ServerScripts/Utilities/picProcess.py -v - ``` + ``` + ${OZ_DIR}/OZprivate/ServerScripts/Utilities/picProcess.py -v + ``` + 1. 
(5 mins) percolate the IUCN data up using - ``` - ${OZ_DIR}/OZprivate/ServerScripts/Utilities/IUCNquery.py -v - ``` - (note that this both updates the IUCN data in the DB and percolates up interior node info) + ``` + ${OZ_DIR}/OZprivate/ServerScripts/Utilities/IUCNquery.py -v + ``` + + (note that this both updates the IUCN data in the DB and percolates up interior node info) + 1. (10 mins) If this is a site with sponsorship (only the main OZ site), set the pricing structure using SET_PRICES.html (accessible from the management pages). 1. (5 mins - this does seem to be necessary for ordered nodes & ordered leaves). Make sure indexes are reset. Look at `OZprivate/ServerScripts/SQL/create_db_indexes.sql` for the SQL to do this - this may involve logging in to the SQL server (e.g. via Sequel Pro on Mac) and pasting all the drop index and create index commands. diff --git a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py index 81d0dd8..5d7a963 100755 --- a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py +++ b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py @@ -1048,7 +1048,7 @@ def main(): nargs="?", help=( "The gzipped >1GB wikipedia -latest-page.sql.gz dump, " - "from https://dumps.wikimedia.org/enwiki/latest/ (enwiki-latest-page.sql.gz) " + "from https://dumps.wikimedia.org/enwiki/latest/ (enwiki-page.sql.gz) " ), ) parser.add_argument( diff --git a/oz_tree_build/utilities/download_and_filter_wikidata.py b/oz_tree_build/utilities/download_and_filter_wikidata.py index 510cd4d..254d03a 100644 --- a/oz_tree_build/utilities/download_and_filter_wikidata.py +++ b/oz_tree_build/utilities/download_and_filter_wikidata.py @@ -7,8 +7,8 @@ import argparse import logging -import re import os +import re import sys import tempfile import urllib.request diff --git a/oz_tree_build/utilities/filter_wikipedia_sql.py 
b/oz_tree_build/utilities/filter_wikipedia_sql.py index d8a6e8d..775494e 100644 --- a/oz_tree_build/utilities/filter_wikipedia_sql.py +++ b/oz_tree_build/utilities/filter_wikipedia_sql.py @@ -3,11 +3,18 @@ import argparse import csv import logging +import re import sys +import urllib.parse +import urllib.request from .file_utils import open_file_based_on_extension from .filter_wikidata import load_titles_file +ENWIKI_DUMPS_URL = "https://dumps.wikimedia.org/enwiki/" + +logger = logging.getLogger(__name__) + def filter_wikipedia_sql(sql_file, output_file, wikidata_titles): """ @@ -76,5 +83,60 @@ def main(): filter_wikipedia_sql(args.sql_file, args.output, wikidata_titles) +def discover_latest_enwiki_sql_url( + base_url=ENWIKI_DUMPS_URL, timeout=30 +): + """Find the URL of the most recent enwiki-YYYYMMDD-page.sql.gz dump. + + Fetches the directory listing at *base_url*, collects the dated + sub-folders (``YYYYMMDD/``), and walks them in reverse-chronological + order until it finds one whose dump status page contains a link to + the ``page.sql.gz`` file. + + Returns the full URL to that file. + Raises ``RuntimeError`` if no suitable dump can be found. 
+ """ + folder_re = re.compile(r'href="(\d{8})/"') + file_re_template = r'href="([^"]*enwiki-{date}-page\.sql\.gz)"' + + index_html = urllib.request.urlopen( + base_url, timeout=timeout + ).read().decode() + + dates = sorted(folder_re.findall(index_html), reverse=True) + if not dates: + raise RuntimeError(f"No dated folders found at {base_url}") + + for date in dates: + folder_url = f"{base_url}{date}/" + logger.info("Checking %s", folder_url) + try: + folder_html = urllib.request.urlopen( + folder_url, timeout=timeout + ).read().decode() + except urllib.error.URLError as exc: + logger.warning("Could not fetch %s: %s", folder_url, exc) + continue + + match = re.search( + file_re_template.format(date=date), folder_html + ) + if match: + url = urllib.parse.urljoin(folder_url, match.group(1)) + logger.info("Found latest enwiki SQL dump: %s", url) + return url + + raise RuntimeError( + f"No enwiki-YYYYMMDD-page.sql.gz file found in any folder at {base_url}" + ) + + +def discover_main(): + """CLI entry point: discover the latest enwiki SQL dump URL.""" + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + url = discover_latest_enwiki_sql_url() + print(url) + + if __name__ == "__main__": main() diff --git a/oz_tree_build/utilities/generate_filtered_files.py b/oz_tree_build/utilities/generate_filtered_files.py index 0f2fdcc..063f4c4 100644 --- a/oz_tree_build/utilities/generate_filtered_files.py +++ b/oz_tree_build/utilities/generate_filtered_files.py @@ -187,7 +187,7 @@ def main(): nargs="?", help=( "The gzipped >1GB wikipedia -latest-page.sql.gz dump, " - "from https://dumps.wikimedia.org/enwiki/latest/ (enwiki-latest-page.sql.gz) " + "from https://dumps.wikimedia.org/enwiki/latest/ (enwiki-page.sql.gz) " ), ) parser.add_argument( diff --git a/pyproject.toml b/pyproject.toml index a298ea6..42a5ab8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" download_and_filter_pageviews = 
"oz_tree_build.utilities.download_and_filter_pageviews:main" download_and_filter_wikidata = "oz_tree_build.utilities.download_and_filter_wikidata:main" discover_latest_wikidata_dump_url = "oz_tree_build.utilities.download_and_filter_wikidata:discover_main" +discover_latest_enwiki_sql_url = "oz_tree_build.utilities.filter_wikipedia_sql:discover_main" download_opentree = "oz_tree_build.utilities.download_opentree:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" get_wiki_images = "oz_tree_build.images_and_vernaculars.get_wiki_images:main" From 070abb47447defaec8cf228e4270513c66abdb92 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Fri, 3 Apr 2026 12:15:05 +0100 Subject: [PATCH 15/19] Delete plan --- .../plans/dvc_pipeline_setup_872f50fc.plan.md | 280 ------------------ 1 file changed, 280 deletions(-) delete mode 100644 .cursor/plans/dvc_pipeline_setup_872f50fc.plan.md diff --git a/.cursor/plans/dvc_pipeline_setup_872f50fc.plan.md b/.cursor/plans/dvc_pipeline_setup_872f50fc.plan.md deleted file mode 100644 index 14f49f0..0000000 --- a/.cursor/plans/dvc_pipeline_setup_872f50fc.plan.md +++ /dev/null @@ -1,280 +0,0 @@ ---- -name: DVC Pipeline Setup -overview: Set up DVC to define a cached, repeatable data pipeline for the OneZoom tree-build project, replacing manual download/filter workarounds with a declarative `dvc.yaml` pipeline backed by shared remote cache storage. -todos: - - id: install-dvc - content: Add dvc to pyproject.toml dependencies, run dvc init to create .dvc/ directory - status: completed - - id: params-yaml - content: Create params.yaml with oz_tree, ot_version, ot_taxonomy_version, ot_taxonomy_extra, build_version, exclude_from_popularity - status: completed - - id: split-filters - content: "Split generate_filtered_files.py into 4 separate filter modules: filter_eol.py, filter_wikidata.py, filter_wikipedia_sql.py, filter_pageviews.py. Remove generate_and_cache_filtered_file. 
Each module gets its own CLI entry point writing to a specified output path." - status: completed - - id: register-scripts - content: Register the 4 new filter scripts as console_scripts in pyproject.toml - status: completed - - id: dvc-yaml - content: Create dvc.yaml with all 11 pipeline stages (tree build + 4 parallel-capable filter stages + tables + JS) using DVC templating from params.yaml - status: completed - - id: gitignore-update - content: "Update .gitignore and data/ .gitignore files: add data/filtered/, data/output_files/js/, ensure .dvc files are not ignored" - status: completed - - id: update-docs - content: Update README.markdown, oz_tree_build/README.markdown, and data/Wiki/README.markdown with DVC workflow - status: completed -isProject: false ---- - -# DVC Pipeline for OneZoom Tree-Build - -## Current State - -The build process is a sequence of manual shell commands documented in `[oz_tree_build/README.markdown](oz_tree_build/README.markdown)`. Key pain points: - -- Massive source files (Wikidata ~100GB, enwiki SQL ~1GB, pageviews multi-GB) must be downloaded by every contributor -- `generate_filtered_files` takes **5-7 hours** to reduce these to usable subsets -- Pre-processed pageviews are distributed as GitHub releases as a workaround (no longer needed with DVC) -- No caching or reproducibility guarantees - -## Target Workflow - -```bash -# First person: downloads data, runs pipeline, pushes cache -dvc repro -dvc push - -# Everyone else: pulls only the cached outputs they need -dvc repro --pull --allow-missing -``` - -If nothing has changed, `dvc repro --pull --allow-missing` pulls pre-built outputs from shared storage -- no multi-GB downloads, no 5-7 hour filtering runs. - -## Pipeline DAG - -The monolithic `filter_files` stage is split into 4 independent filter stages. EOL and wikidata filters can run in parallel (both depend on taxonomy). SQL and pageview filters can run in parallel (both depend on filtered wikidata output). 
- -```mermaid -graph TD - OT_tre[labelled_supertree.tre.dvc] --> preprocess_opentree - OT_tgz[ott_taxonomy.tgz.dvc] --> unpack_taxonomy - preprocess_opentree --> prepare_open_trees - unpack_taxonomy --> add_ott_numbers - bespoke[BespokeTree in git] --> add_ott_numbers - add_ott_numbers --> prepare_open_trees - OT_req[OT_required in git] --> prepare_open_trees - prepare_open_trees --> build_tree - - unpack_taxonomy --> filter_eol - EOL[provider_ids.csv.gz.dvc] --> filter_eol - - unpack_taxonomy --> filter_wikidata - WD[latest-all.json.bz2.dvc] --> filter_wikidata - - filter_wikidata --> filter_sql - WP_SQL[enwiki-page.sql.gz.dvc] --> filter_sql - - filter_wikidata --> filter_pageviews - WP_PV[wp_pagecounts.dvc] --> filter_pageviews - - build_tree --> create_tables - filter_eol --> create_tables - filter_wikidata --> create_tables - filter_sql --> create_tables - filter_pageviews --> create_tables - unpack_taxonomy --> create_tables - SupTax[SupplementaryTaxonomy in git] --> create_tables - create_tables --> make_js -``` - -## Key Design Decisions - -### 1. Parameters in `params.yaml` (replaces env vars) - -Currently `OT_VERSION`, `OT_TAXONOMY_VERSION`, `OT_TAXONOMY_EXTRA`, and `OZ_TREE` are shell environment variables. These become DVC parameters so that changing a version automatically invalidates the right stages. - -```yaml -# params.yaml -oz_tree: AllLife -ot_version: "15.1" -ot_taxonomy_version: "3.7" -ot_taxonomy_extra: "draft2" -build_version: 28017344 # deterministic version for CSV_base_table_creator (replaces time-based default) -exclude_from_popularity: - - Archosauria_ott335588 - - Dinosauria_ott90215 -``` - -The `build_version` param is important: `CSV_base_table_creator` defaults to `int(time.time()/60)`, which would make outputs non-deterministic. A fixed param ensures DVC caching works correctly. - -### 2. Source data tracked with `dvc add` - -Large downloaded files are tracked via `dvc add`, producing `.dvc` files committed to git. 
The raw data itself lives only in DVC cache/remote, never in git. Files to track: - -- `data/OpenTree/labelled_supertree_simplified_ottnames.tre` -- `data/OpenTree/ott${ot_taxonomy_version}.tgz` -- `data/Wiki/wd_JSON/latest-all.json.bz2` -- `data/Wiki/wp_SQL/enwiki-latest-page.sql.gz` -- `data/Wiki/wp_pagecounts/` (directory -- raw pageview files; pre-processed GitHub releases are no longer needed since DVC caches the filtered outputs) -- `data/EOL/provider_ids.csv.gz` - -With `--allow-missing`, DVC can skip stages whose inputs haven't changed even when the raw files aren't present locally. - -### 3. Split filters into separate modules and remove mtime caching - -The monolithic `[generate_filtered_files.py](oz_tree_build/utilities/generate_filtered_files.py)` will be refactored: - -**Remove `generate_and_cache_filtered_file`** -- this function implements mtime-based caching (comparing filtered file timestamps to source file timestamps). DVC's run cache completely supersedes this. Each filter script simply writes its output; DVC decides whether to run it. - -**Split into 4 separate filter modules**, each with its own CLI entry point: - -- `oz_tree_build/utilities/filter_eol.py` -- filters EOL provider IDs CSV - - Inputs: EOL CSV (gz), taxonomy.tsv - - Output: filtered EOL CSV - - Reads taxonomy to build `source_ids` (NCBI, IF, WoRMS, IRMNG, GBIF sets), then keeps only matching EOL rows -- `oz_tree_build/utilities/filter_wikidata.py` -- filters the massive wikidata JSON dump (~100GB compressed) - - Inputs: wikidata JSON (bz2), taxonomy.tsv - - Outputs: filtered wikidata JSON, **plus a sidecar `wikidata_titles.txt`** (one Wikipedia page title per line) - - The sidecar file replaces the in-memory `context.wikidata_ids` handoff. It's produced by running the equivalent of `read_wikidata_dump()` on the filtered output and writing the titles to a text file. This is the key that enables SQL and pageview filters to run independently. 
-- `oz_tree_build/utilities/filter_wikipedia_sql.py` -- filters enwiki SQL page dump - - Inputs: enwiki SQL (gz), `wikidata_titles.txt` - - Output: filtered SQL file - - Reads the titles file to build the filter set (replaces `context.wikidata_ids`) -- `oz_tree_build/utilities/filter_pageviews.py` -- filters Wikipedia pageview files - - Inputs: one or more pageview files (bz2), `wikidata_titles.txt` - - Output: filtered pageview files in output directory - - Reads the titles file to build the filter set - -**Shared code** stays in `generate_filtered_files.py` (or a new common module): `read_taxonomy_file`, helper imports, and the orchestrating `generate_all_filtered_files` function (simplified to call the individual filter modules directly, useful for non-DVC usage and clade-specific test filtering). - -**New console scripts** registered in `pyproject.toml`: - -``` -filter_eol = "oz_tree_build.utilities.filter_eol:main" -filter_wikidata = "oz_tree_build.utilities.filter_wikidata:main" -filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" -filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" -``` - -The parallelism benefit: `filter_eol` and `filter_wikidata` share no outputs, so DVC can run them concurrently. Once `filter_wikidata` finishes and produces `wikidata_titles.txt`, `filter_sql` and `filter_pageviews` can also run concurrently. - -### 4. JS output stays in this repo - -`make_js_treefiles` currently defaults to writing into `../OZtree/static/FinalOutputs/data/`. In the DVC pipeline, use `--outdir data/output_files/js/` to keep outputs within this repo for DVC tracking. Users copy to OZtree manually afterward. - -### 5. DVC remote (shared cache) - -A DVC remote must be configured for shared caching. 
This is a one-line config per backend: - -```bash -dvc remote add -d myremote s3://my-bucket/dvc-cache # S3 -dvc remote add -d myremote gs://my-bucket/dvc-cache # GCS -dvc remote add -d myremote ssh://server:/path/to/cache # SSH -dvc remote add -d myremote /mnt/shared/dvc-cache # local/NFS -``` - -The choice of backend can be made later; the pipeline design is independent of it. - -## Pipeline Stages (`dvc.yaml`) - -The `dvc.yaml` at the project root will define these stages (using DVC templating with `vars` from `params.yaml`): - -**preprocess_opentree** -- perl to strip mrca labels and normalize underscores - -- deps: `data/OpenTree/labelled_supertree_simplified_ottnames.tre` -- params: `ot_version` -- outs: `data/OpenTree/draftversion${ot_version}.tre` - -**unpack_taxonomy** -- extract taxonomy.tsv from tarball - -- deps: `data/OpenTree/ott${ot_taxonomy_version}.tgz` -- params: `ot_taxonomy_version` -- outs: `data/OpenTree/ott${ot_taxonomy_version}/` (directory) - -**add_ott_numbers** -- call OpenTree API to annotate bespoke trees with OTT IDs - -- deps: `data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/` -- params: `oz_tree`, `ot_taxonomy_version`, `ot_taxonomy_extra` -- outs: `data/OZTreeBuild/${oz_tree}/BespokeTree/include_OTT${ot_taxonomy_version}${ot_taxonomy_extra}/` -- Note: calls external API; cached unless inputs change. Use `dvc repro -f add_ott_numbers` to force refresh. 
- -**prepare_open_trees** -- copy supplementary .nwk files and extract OpenTree subtrees - -- deps: `draftversion${ot_version}.tre`, `include_OTT.../`, `OT_required/` -- outs: `data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/` - -**build_tree** -- assemble the full newick tree - -- deps: `include_OTT.../`, `OpenTree_all/` -- outs: `data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy` - -**filter_eol** -- filter EOL provider IDs to relevant sources - -- deps: `data/EOL/provider_ids.csv.gz`, `data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv` -- outs: `data/filtered/OneZoom_provider_ids.csv` -- Parallelizable with `filter_wikidata` - -**filter_wikidata** -- filter massive wikidata JSON to taxon/vernacular items (THE most expensive step, hours) - -- deps: `data/Wiki/wd_JSON/latest-all.json.bz2`, `data/OpenTree/ott${ot_taxonomy_version}/taxonomy.tsv` -- outs: `data/filtered/OneZoom_latest-all.json`, `data/filtered/wikidata_titles.txt` -- Parallelizable with `filter_eol` - -**filter_sql** -- filter enwiki SQL page dump to matching titles - -- deps: `data/Wiki/wp_SQL/enwiki-latest-page.sql.gz`, `data/filtered/wikidata_titles.txt` -- outs: `data/filtered/OneZoom_enwiki-latest-page.sql` -- Parallelizable with `filter_pageviews` - -**filter_pageviews** -- filter and aggregate Wikipedia pageview counts - -- deps: `data/Wiki/wp_pagecounts/`, `data/filtered/wikidata_titles.txt` -- outs: `data/filtered/pageviews/` (directory of filtered pageview files) -- Parallelizable with `filter_sql` - -**create_tables** -- map taxa, calculate popularity, produce DB-ready CSVs and ordered trees - -- deps: full tree, taxonomy, all `data/filtered/` outputs, `SupplementaryTaxonomy.tsv` -- params: `build_version`, `exclude_from_popularity` -- outs: `data/output_files/` - -**make_js** -- convert ordered trees to JS viewer files - -- deps: `data/output_files/` -- outs: `data/output_files/js/` - -## Files to Create/Modify - -- **Create** `params.yaml` -- pipeline parameters -- 
**Create** `dvc.yaml` -- pipeline definition (11 stages) -- **Create** `oz_tree_build/utilities/filter_eol.py` -- standalone EOL filter with CLI -- **Create** `oz_tree_build/utilities/filter_wikidata.py` -- standalone wikidata filter with CLI -- **Create** `oz_tree_build/utilities/filter_wikipedia_sql.py` -- standalone SQL filter with CLI -- **Create** `oz_tree_build/utilities/filter_pageviews.py` -- standalone pageviews filter with CLI -- **Modify** `[oz_tree_build/utilities/generate_filtered_files.py](oz_tree_build/utilities/generate_filtered_files.py)` -- remove `generate_and_cache_filtered_file`, simplify to orchestrator that calls the new modules (retains clade-filtering support for tests) -- **Modify** `[pyproject.toml](pyproject.toml)` -- add `dvc` to dependencies, register 4 new console scripts -- **Modify** `[.gitignore](.gitignore)` -- add `/data/filtered/`, DVC internals are handled by `dvc init` -- **Update** `[README.markdown](README.markdown)` -- new DVC-based workflow instructions -- **Update** `[oz_tree_build/README.markdown](oz_tree_build/README.markdown)` -- reference DVC pipeline -- **Update** `[data/Wiki/README.markdown](data/Wiki/README.markdown)` -- remove pre-processed pageview GitHub release instructions (DVC cache replaces this entirely) - -After creating these files, the first pipeline run involves: - -```bash -pip install -e . -dvc init -# download source files, then: -dvc add data/OpenTree/labelled_supertree_simplified_ottnames.tre -dvc add data/OpenTree/ott3.7.tgz -dvc add data/Wiki/wd_JSON/latest-all.json.bz2 -dvc add data/Wiki/wp_SQL/enwiki-latest-page.sql.gz -dvc add data/Wiki/wp_pagecounts/ -dvc add data/EOL/provider_ids.csv.gz -dvc repro -dvc push -git add . && git commit -m "Add DVC pipeline" -``` - -But this should not be run as part of this plan, the user will run it manually after the pipeline is set up. 
- -Also note that you should not try to run the individual large stages as part of this plan, since the input files are massive and the processing takes a long time, so the user will schedule it for a convenient time. From e6383fc91fa40644914bbac6ac6ff370863608ae Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Fri, 3 Apr 2026 12:44:25 +0100 Subject: [PATCH 16/19] Track the modification date of the EOL providers file --- dvc.lock | 10 ++++++++-- dvc.yaml | 7 ++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/dvc.lock b/dvc.lock index ae835e5..ab36a80 100644 --- a/dvc.lock +++ b/dvc.lock @@ -138,13 +138,19 @@ stages: size: 81928129 nfiles: 229 download_eol: - cmd: curl -L -o data/EOL/provider_ids.csv.gz - https://eol.org/data/provider_ids.csv.gz + cmd: "HEADER_FILE=$(mktemp) && curl -L -D \"$HEADER_FILE\" -o data/EOL/provider_ids.csv.gz + https://eol.org/data/provider_ids.csv.gz && grep -i last-modified \"$HEADER_FILE\"\ + \ | tail -1 | sed 's/^[^:]*: //' > data/EOL/provider_ids_last_modified.txt && + rm -f \"$HEADER_FILE\"" outs: - path: data/EOL/provider_ids.csv.gz hash: md5 md5: 03c0e857a35695b6b6a5467014bd38d8 size: 314398683 + - path: data/EOL/provider_ids_last_modified.txt + hash: md5 + md5: cb59f793fb021063b7c35e821ee8086a + size: 31 filter_eol: cmd: filter_eol data/EOL/provider_ids.csv.gz data/OpenTree/v16.1/taxonomy.tsv -o data/filtered/OneZoom_provider_ids.csv diff --git a/dvc.yaml b/dvc.yaml index aee0a74..cbe36bf 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -63,11 +63,16 @@ stages: - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy download_eol: + # EOL doesn't version the provider ids file, so capture the last-modified header instead cmd: >- - curl -L -o data/EOL/provider_ids.csv.gz + HEADER_FILE=$(mktemp) + && curl -L -D "$HEADER_FILE" -o data/EOL/provider_ids.csv.gz https://eol.org/data/provider_ids.csv.gz + && grep -i last-modified "$HEADER_FILE" | tail -1 | sed 's/^[^:]*: //' > data/EOL/provider_ids_last_modified.txt + && rm -f 
"$HEADER_FILE" outs: - data/EOL/provider_ids.csv.gz + - data/EOL/provider_ids_last_modified.txt filter_eol: cmd: >- From cf6e98d7398ce75d38629c5c9f587e0e1bfdbf90 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Fri, 3 Apr 2026 13:22:31 +0100 Subject: [PATCH 17/19] Add DVC to pre-commit config --- .github/workflows/tests.yml | 4 ++-- .pre-commit-config.yaml | 26 ++++++++++++++++++++++++-- README.markdown | 11 +++++------ pyproject.toml | 3 ++- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3b32438..df9233a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,7 +17,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: "3.10" - uses: pre-commit/action@v3.0.0 test: @@ -40,7 +40,7 @@ jobs: - name: Install dependencies run: | python3 -m pip install --upgrade pip - python3 -m pip install '.[test]' + python3 -m pip install '.[dev]' - name: Test with pytest run: | python3 -m pytest tests --conf-file tests/appconfig.ini diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3fa2e93..5fab889 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,28 @@ repos: rev: v0.4.5 hooks: - id: ruff - args: [ "--fix", "--config", "ruff.toml" ] + args: [--fix, --config, ruff.toml] - id: ruff-format - args: [ "--config", "ruff.toml" ] + args: [--config, ruff.toml] + - repo: https://github.com/treeverse/dvc + rev: 3.67.0 + hooks: + - id: dvc-pre-commit + additional_dependencies: + - .[all] + language_version: python3 + stages: + - pre-commit + - id: dvc-pre-push + additional_dependencies: + - .[all] + language_version: python3 + stages: + - pre-push + - id: dvc-post-checkout + additional_dependencies: + - .[all] + language_version: python3 + stages: + - post-checkout + always_run: true diff --git a/README.markdown b/README.markdown index a565ac0..5cb2f41 100644 --- 
a/README.markdown +++ b/README.markdown @@ -13,19 +13,18 @@ The first step to using this repo is to create a Python virtual environment and source .venv/bin/activate # Install it - pip install -e . + pip install -e '.[dev]' -After the first time, you just need to run the `source .venv/bin/activate` each time you want to activate it in a new shell. - -If you want to run the test suite, make sure the test requirements are also installed, with: + # Set up git hooks including linting and DVC + pre-commit install --hook-type pre-push --hook-type post-checkout --hook-type pre-commit - pip install -e '.[test]' +After the first time, you just need to run the `source .venv/bin/activate` each time you want to activate it in a new shell. To be able to run the pipeline, you'll also need to install `wget`. ## Testing -Assuming you have installed the test requirements, you should be able to run +Assuming you have installed the 'dev' dependencies, you should be able to run python -m pytest --conf-file tests/appconfig.ini diff --git a/pyproject.toml b/pyproject.toml index 42a5ab8..2d011d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,9 +25,10 @@ dependencies = [ ] [project.optional-dependencies] -test = [ +dev = [ "pytest>=8.1", "ruff>=0.5.1", + "pre-commit>=4.5.1", ] [tool.pytest.ini_options] From 608de4f6da0721dae31ef56669fafecff6e9bcd3 Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Fri, 3 Apr 2026 13:38:35 +0100 Subject: [PATCH 18/19] Run linter --- .../download_and_filter_pageviews.py | 12 +++-------- .../utilities/download_and_filter_wikidata.py | 21 ++++++------------- oz_tree_build/utilities/download_opentree.py | 20 +++++------------- oz_tree_build/utilities/file_utils.py | 12 ++++++----- oz_tree_build/utilities/filter_wikidata.py | 15 ++++--------- .../utilities/filter_wikipedia_sql.py | 20 +++++------------- 6 files changed, 30 insertions(+), 70 deletions(-) diff --git a/oz_tree_build/utilities/download_and_filter_pageviews.py 
b/oz_tree_build/utilities/download_and_filter_pageviews.py index bfb43e0..59b1f47 100644 --- a/oz_tree_build/utilities/download_and_filter_pageviews.py +++ b/oz_tree_build/utilities/download_and_filter_pageviews.py @@ -46,9 +46,7 @@ def discover_pageview_months(base_url=BASE_URL): user_file_pattern = re.compile(r'href="(pageviews-\d{6}-user\.bz2)"') years_html = _fetch_index(base_url) - year_dirs = sorted( - (m.group(1) for m in year_pattern.finditer(years_html)), reverse=True - ) + year_dirs = sorted((m.group(1) for m in year_pattern.finditer(years_html)), reverse=True) for year_dir in year_dirs: year_url = base_url + year_dir @@ -172,13 +170,9 @@ def main(): logging.info(f"Loaded {len(wikidata_titles)} wikidata titles") logging.info("Discovering available pageview months from Wikimedia...") - selected = list(itertools.islice( - discover_pageview_months(args.base_url), args.months - )) + selected = list(itertools.islice(discover_pageview_months(args.base_url), args.months)) selected.reverse() - logging.info( - f"Selected {len(selected)} most recent months" - ) + logging.info(f"Selected {len(selected)} most recent months") expected_filenames = set() for i, (url, filename) in enumerate(selected, 1): diff --git a/oz_tree_build/utilities/download_and_filter_wikidata.py b/oz_tree_build/utilities/download_and_filter_wikidata.py index 254d03a..a907bf9 100644 --- a/oz_tree_build/utilities/download_and_filter_wikidata.py +++ b/oz_tree_build/utilities/download_and_filter_wikidata.py @@ -22,17 +22,13 @@ logger = logging.getLogger(__name__) -def discover_latest_wikidata_dump_url( - base_url=WIKIDATA_ENTITIES_URL, timeout=30 -): +def discover_latest_wikidata_dump_url(base_url=WIKIDATA_ENTITIES_URL, timeout=30): """Find the URL of the most recent dated wikidata-YYYYMMDD-all.json.bz2 dump. 
We don't use the symlinked latest-all.json.bz2 file because we want to know the date.""" folder_re = re.compile(r'href="(\d{8})/"') file_re_template = r'href="(wikidata-{date}-all\.json\.bz2)"' - index_html = urllib.request.urlopen( - base_url, timeout=timeout - ).read().decode() + index_html = urllib.request.urlopen(base_url, timeout=timeout).read().decode() dates = sorted(folder_re.findall(index_html), reverse=True) if not dates: @@ -42,24 +38,18 @@ def discover_latest_wikidata_dump_url( folder_url = f"{base_url}{date}/" logger.info("Checking %s", folder_url) try: - folder_html = urllib.request.urlopen( - folder_url, timeout=timeout - ).read().decode() + folder_html = urllib.request.urlopen(folder_url, timeout=timeout).read().decode() except urllib.error.URLError as exc: logger.warning("Could not fetch %s: %s", folder_url, exc) continue - match = re.search( - file_re_template.format(date=date), folder_html - ) + match = re.search(file_re_template.format(date=date), folder_html) if match: url = f"{folder_url}{match.group(1)}" logger.info("Found latest dump: %s", url) return url - raise RuntimeError( - f"No wikidata-YYYYMMDD-all.json.bz2 file found in any folder at {base_url}" - ) + raise RuntimeError(f"No wikidata-YYYYMMDD-all.json.bz2 file found in any folder at {base_url}") def stream_and_filter(url, output_path, wikilang="en", dont_trim_sitelinks=False): @@ -127,5 +117,6 @@ def discover_main(): url = discover_latest_wikidata_dump_url() print(url) + if __name__ == "__main__": main() diff --git a/oz_tree_build/utilities/download_opentree.py b/oz_tree_build/utilities/download_opentree.py index 3f3e0c4..71c9e37 100644 --- a/oz_tree_build/utilities/download_opentree.py +++ b/oz_tree_build/utilities/download_opentree.py @@ -22,8 +22,7 @@ import requests SYNTHESIS_JSON_URL = ( - "https://raw.githubusercontent.com/OpenTreeOfLife/opentree" - "/master/webapp/static/statistics/synthesis.json" + "https://raw.githubusercontent.com/OpenTreeOfLife/opentree" 
"/master/webapp/static/statistics/synthesis.json" ) @@ -39,10 +38,7 @@ def find_synthesis_entry(synthesis_json, version): if entry.get("version") == version: return entry available = [e["version"] for e in synthesis_json.values() if "version" in e] - raise SystemExit( - f"Version '{version}' not found in synthesis.json. " - f"Available versions: {', '.join(available)}" - ) + raise SystemExit(f"Version '{version}' not found in synthesis.json. " f"Available versions: {', '.join(available)}") def strip_mrca_prefixes(content: str) -> str: @@ -78,9 +74,7 @@ def download_tree(version, output_dir): def download_taxonomy(ott_version_raw, output_dir): """Download and extract taxonomy.tsv from the OTT taxonomy tarball.""" ott_version = ott_version_raw.split("draft")[0] - taxonomy_url = ( - f"https://files.opentreeoflife.org/ott/{ott_version}/{ott_version}.tgz" - ) + taxonomy_url = f"https://files.opentreeoflife.org/ott/{ott_version}/{ott_version}.tgz" print(f"Downloading taxonomy from {taxonomy_url} ...") response = requests.get(taxonomy_url) response.raise_for_status() @@ -98,9 +92,7 @@ def download_taxonomy(ott_version_raw, output_dir): taxonomy_member = member break if taxonomy_member is None: - raise SystemExit( - "Could not find taxonomy.tsv in the taxonomy tarball" - ) + raise SystemExit("Could not find taxonomy.tsv in the taxonomy tarball") extracted = tar.extractfile(taxonomy_member) dest_path = os.path.join(output_dir, "taxonomy.tsv") with open(dest_path, "wb") as f: @@ -109,9 +101,7 @@ def download_taxonomy(ott_version_raw, output_dir): def main(): - parser = argparse.ArgumentParser( - description="Download Open Tree of Life synthesis data into a versioned folder." 
- ) + parser = argparse.ArgumentParser(description="Download Open Tree of Life synthesis data into a versioned folder.") parser.add_argument( "--version", required=True, diff --git a/oz_tree_build/utilities/file_utils.py b/oz_tree_build/utilities/file_utils.py index 136a9f0..1f96715 100644 --- a/oz_tree_build/utilities/file_utils.py +++ b/oz_tree_build/utilities/file_utils.py @@ -33,7 +33,11 @@ def stream_bz2_lines_from_url(url, read_timeout=120): wget = subprocess.Popen( [ - "wget", "-q", "--show-progress", "-O", "-", + "wget", + "-q", + "--show-progress", + "-O", + "-", "--connect-timeout=30", f"--read-timeout={read_timeout}", "--header=User-Agent: OneZoom-tree-build/1.0", @@ -60,8 +64,7 @@ def stream_bz2_lines_from_url(url, read_timeout=120): line_buf += text parts = line_buf.split("\n") line_buf = parts[-1] - for line in parts[:-1]: - yield line + yield from parts[:-1] trailing = decoder.decode(b"", final=True) line_buf += trailing @@ -79,8 +82,7 @@ def stream_bz2_lines_from_url(url, read_timeout=120): def enumerate_lines_from_file(filename): """Enumerate the lines in a file, whether it's uncompressed, bz2 or gz.""" with open_file_based_on_extension(filename, "rt") as f: - for line_num, line in enumerate(iter(f.readline, "")): - yield line_num, line + yield from enumerate(iter(f.readline, "")) def check_identical_files(output_location, expected_output_path): diff --git a/oz_tree_build/utilities/filter_wikidata.py b/oz_tree_build/utilities/filter_wikidata.py index 5cc7f8b..4e1c528 100644 --- a/oz_tree_build/utilities/filter_wikidata.py +++ b/oz_tree_build/utilities/filter_wikidata.py @@ -28,9 +28,7 @@ "P31": [ { "mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}, - "qualifiers": { - "P642": [{"datavalue": {"value": {"numeric-id": KEEP}}}] - }, + "qualifiers": {"P642": [{"datavalue": {"value": {"numeric-id": KEEP}}}]}, } ], "P685": [{"mainsnak": {"datavalue": {"value": KEEP}}}], @@ -82,9 +80,7 @@ def trim_and_write_json_item(json_item, filtered_wiki_f): 
apply_mask_to_object_graph(json_item, WIKIDATA_MASK) if dont_trim_sitelinks: - json_item["sitelinks"] = { - k: v for k, v in json_item["sitelinks"].items() if k.endswith("wiki") - } + json_item["sitelinks"] = {k: v for k, v in json_item["sitelinks"].items() if k.endswith("wiki")} else: json_item["sitelinks"] = { k: v if k == sitelinks_key else {} @@ -104,9 +100,7 @@ def trim_and_write_json_item(json_item, filtered_wiki_f): for line_num, line in enumerate(lines): if line_num > 0 and line_num % 100_000 == 0: - logging.info( - f"Processed {line_num} lines, kept {preserved_lines}" - ) + logging.info(f"Processed {line_num} lines, kept {preserved_lines}") if not (line.startswith('{"type":') and quick_byte_match.search(line)): continue @@ -143,8 +137,7 @@ def trim_and_write_json_item(json_item, filtered_wiki_f): potential_extra_json_items.append(("vernacular", vernaculars_matches, json_item)) logging.info( - "Writing extra lines at the end of the file " - f"(subset of {len(potential_extra_json_items)} lines)" + "Writing extra lines at the end of the file " f"(subset of {len(potential_extra_json_items)} lines)" ) for desc, linked_qids, json_item in potential_extra_json_items: diff --git a/oz_tree_build/utilities/filter_wikipedia_sql.py b/oz_tree_build/utilities/filter_wikipedia_sql.py index 775494e..d1046f0 100644 --- a/oz_tree_build/utilities/filter_wikipedia_sql.py +++ b/oz_tree_build/utilities/filter_wikipedia_sql.py @@ -83,9 +83,7 @@ def main(): filter_wikipedia_sql(args.sql_file, args.output, wikidata_titles) -def discover_latest_enwiki_sql_url( - base_url=ENWIKI_DUMPS_URL, timeout=30 -): +def discover_latest_enwiki_sql_url(base_url=ENWIKI_DUMPS_URL, timeout=30): """Find the URL of the most recent enwiki-YYYYMMDD-page.sql.gz dump. 
Fetches the directory listing at *base_url*, collects the dated @@ -99,9 +97,7 @@ def discover_latest_enwiki_sql_url( folder_re = re.compile(r'href="(\d{8})/"') file_re_template = r'href="([^"]*enwiki-{date}-page\.sql\.gz)"' - index_html = urllib.request.urlopen( - base_url, timeout=timeout - ).read().decode() + index_html = urllib.request.urlopen(base_url, timeout=timeout).read().decode() dates = sorted(folder_re.findall(index_html), reverse=True) if not dates: @@ -111,24 +107,18 @@ def discover_latest_enwiki_sql_url( folder_url = f"{base_url}{date}/" logger.info("Checking %s", folder_url) try: - folder_html = urllib.request.urlopen( - folder_url, timeout=timeout - ).read().decode() + folder_html = urllib.request.urlopen(folder_url, timeout=timeout).read().decode() except urllib.error.URLError as exc: logger.warning("Could not fetch %s: %s", folder_url, exc) continue - match = re.search( - file_re_template.format(date=date), folder_html - ) + match = re.search(file_re_template.format(date=date), folder_html) if match: url = urllib.parse.urljoin(folder_url, match.group(1)) logger.info("Found latest enwiki SQL dump: %s", url) return url - raise RuntimeError( - f"No enwiki-YYYYMMDD-page.sql.gz file found in any folder at {base_url}" - ) + raise RuntimeError(f"No enwiki-YYYYMMDD-page.sql.gz file found in any folder at {base_url}") def discover_main(): From 2a5660f6a83fbad8134d80eade39c7abd48c737a Mon Sep 17 00:00:00 2001 From: Jared Khan Date: Fri, 3 Apr 2026 14:18:15 +0100 Subject: [PATCH 19/19] Add DVC status to CI --- .github/workflows/tests.yml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index df9233a..24ed66d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,6 +20,29 @@ jobs: python-version: "3.10" - uses: pre-commit/action@v3.0.0 + dvc: + name: DVC + runs-on: ubuntu-latest + steps: + - name: Cancel Previous Runs + uses: 
styfle/cancel-workflow-action@0.6.0 + with: + access_token: ${{ github.token }} + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install '.[dev]' + - name: Check DVC status + run: | + dvc remote modify --local jared-r2 access_key_id ${{ secrets.DVC_ACCESS_KEY_ID }} + dvc remote modify --local jared-r2 secret_access_key ${{ secrets.DVC_SECRET_ACCESS_KEY }} + dvc repro --allow-missing --dry | tee /dev/stderr | grep -q "Data and pipelines are up to date." + if dvc data status --not-in-remote | grep -q "Not in remote"; then exit 1; fi test: name: Python runs-on: ubuntu-latest