diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..84e7938 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + remote = jared-r2 +['remote "jared-r2"'] + url = s3://onezoom + endpointurl = https://9d168184d3ac384b6a159313dd90a75a.r2.cloudflarestorage.com diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3b32438..24ed66d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -17,9 +17,32 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: "3.10" - uses: pre-commit/action@v3.0.0 + dvc: + name: DVC + runs-on: ubuntu-latest + steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@0.6.0 + with: + access_token: ${{ github.token }} + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + python3 -m pip install '.[dev]' + - name: Check DVC status + run: | + dvc remote modify --local jared-r2 access_key_id ${{ secrets.DVC_ACCESS_KEY_ID }} + dvc remote modify --local jared-r2 secret_access_key ${{ secrets.DVC_SECRET_ACCESS_KEY }} + dvc repro --allow-missing --dry | tee /dev/stderr | grep -q "Data and pipelines are up to date." 
+ if dvc data status --not-in-remote | grep -q "Not in remote"; then exit 1; fi test: name: Python runs-on: ubuntu-latest @@ -40,7 +63,7 @@ jobs: - name: Install dependencies run: | python3 -m pip install --upgrade pip - python3 -m pip install '.[test]' + python3 -m pip install '.[dev]' - name: Test with pytest run: | python3 -m pytest tests --conf-file tests/appconfig.ini diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3fa2e93..5fab889 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,6 +11,28 @@ repos: rev: v0.4.5 hooks: - id: ruff - args: [ "--fix", "--config", "ruff.toml" ] + args: [--fix, --config, ruff.toml] - id: ruff-format - args: [ "--config", "ruff.toml" ] + args: [--config, ruff.toml] + - repo: https://github.com/treeverse/dvc + rev: 3.67.0 + hooks: + - id: dvc-pre-commit + additional_dependencies: + - .[all] + language_version: python3 + stages: + - pre-commit + - id: dvc-pre-push + additional_dependencies: + - .[all] + language_version: python3 + stages: + - pre-push + - id: dvc-post-checkout + additional_dependencies: + - .[all] + language_version: python3 + stages: + - post-checkout + always_run: true diff --git a/README.markdown b/README.markdown index e47eef5..5cb2f41 100644 --- a/README.markdown +++ b/README.markdown @@ -13,17 +13,18 @@ The first step to using this repo is to create a Python virtual environment and source .venv/bin/activate # Install it - pip install -e . + pip install -e '.[dev]' -After the first time, you just need to run the `source .venv/bin/activate` each time you want to activate it in a new shell. + # Set up git hooks including linting and DVC + pre-commit install --hook-type pre-push --hook-type post-checkout --hook-type pre-commit -If you want to run the test suite, make sure the test requirements are also installed, with: +After the first time, you just need to run the `source .venv/bin/activate` each time you want to activate it in a new shell. 
- pip install -e '.[test]' +To be able to run the pipeline, you'll also need to install `wget`. ## Testing -Assuming you have installed the test requirements, you should be able to run +Assuming you have installed the 'dev' dependencies, you should be able to run python -m pytest --conf-file tests/appconfig.ini @@ -41,22 +42,39 @@ you will need a valid Azure Image cropping key in your appconfig.ini. ## Building the latest tree from OpenTree -### Setup +This project uses [DVC](https://dvc.org/) to manage the pipeline. The build parameters are defined in `params.yaml` and the pipeline stages are declared in `dvc.yaml`. -We assume that you want to build a OneZoom tree based on the most recent online OpenTree version. -You can check the most recent version of both the synthetic tree (`synth_id`) and the taxonomy (`taxonomy_version`) via the -[API](https://github.com/OpenTreeOfLife/germinator/wiki/Open-Tree-of-Life-Web-APIs) e.g. by running `curl -X POST https://api.opentreeoflife.org/v3/tree_of_life/about`. Later in the build, we use specific environment variables set to these version numbers. Assuming you are in a bash shell or similar, you can set them as follows: +### Quick start (using cached outputs) +You'll need to ask for the DVC remote credentials on the OneZoom Slack channel in order to pull cached results. +Then, if someone has already run the pipeline and pushed the results to the DVC remote, you can reproduce the build and any of the intermediate stages without downloading any of the massive source files: + +```bash +source .venv/bin/activate +dvc repro --pull --allow-missing ``` -OT_VERSION=14.9 #or whatever your OpenTree version is -OT_TAXONOMY_VERSION=3.6 -OT_TAXONOMY_EXTRA=draft1 #optional - the draft for this version, e.g. `draft1` if the taxonomy_version is 3.6draft1 -``` -### Download +DVC will pull only the cached outputs needed for stages that haven't changed. If all stages are cached, nothing needs to be re-run. 
+ +### Full build (first time / updating source data) + +1. Set `ot_version` in `params.yaml` to the desired OpenTree synthesis version (e.g. `"v16.1"`). Available versions can be found in the [synthesis manifest](https://raw.githubusercontent.com/OpenTreeOfLife/opentree/master/webapp/static/statistics/synthesis.json). The OpenTree tree and taxonomy will be downloaded automatically by the `download_opentree` pipeline stage. + +2. Some source files are unversioned so will use cached results unless forced. To force re-download them all with the latest upstream data: + + ```bash + dvc repro --force download_eol discover_enwiki_sql_url download_wikipedia_sql discover_wikidata_url download_and_filter_wikidata download_and_filter_pageviews + ``` + +Note that download_and_filter_wikidata and download_and_filter_pageviews take several hours to run. + +3. Run the pipeline and push results to the shared cache: -Constructing the full tree of life requires various files downloaded from the internet. They should be placed within the appropriate directories in the `data` directory, as [documented here](data/README.markdown). + ```bash + dvc repro + dvc push + ``` -### Building the tree +4. Commit `dvc.lock` to git. -Once data files are downloaded, you should be set up to actually build the tree and other backend files, by following [these instructions](oz_tree_build/README.markdown). +For detailed step-by-step documentation, see [oz_tree_build/README.markdown](oz_tree_build/README.markdown). diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..0992b36 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,2 @@ +/js_output +/output_files diff --git a/data/EOL/.gitignore b/data/EOL/.gitignore index 6c9d760..47df0e7 100755 --- a/data/EOL/.gitignore +++ b/data/EOL/.gitignore @@ -3,4 +3,5 @@ # But not these files... 
!.gitignore -!README.markdown \ No newline at end of file +!README.markdown +!*.dvc \ No newline at end of file diff --git a/data/OZTreeBuild/AllLife/OpenTreeParts/.gitignore b/data/OZTreeBuild/AllLife/OpenTreeParts/.gitignore new file mode 100644 index 0000000..6681d35 --- /dev/null +++ b/data/OZTreeBuild/AllLife/OpenTreeParts/.gitignore @@ -0,0 +1 @@ +/OpenTree_all diff --git a/data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/.gitignore b/data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/data/OpenTree/.gitignore b/data/OpenTree/.gitignore index 6c9d760..45e8756 100755 --- a/data/OpenTree/.gitignore +++ b/data/OpenTree/.gitignore @@ -3,4 +3,4 @@ # But not these files... !.gitignore -!README.markdown \ No newline at end of file +!README.markdown diff --git a/data/OpenTree/README.markdown b/data/OpenTree/README.markdown index 16d5b6b..a371470 100755 --- a/data/OpenTree/README.markdown +++ b/data/OpenTree/README.markdown @@ -1,26 +1,28 @@ ### Directory contents -Files herein are .gitignored. To get the site working, this folder should contain the following files (or symlinks to them) - -* `draftversionXXX.tre` -* `ottYYY/taxonomy.tsv` + +This folder contains versioned subdirectories of Open Tree of Life data, e.g. `v16.1/`. Each subdirectory is created by the `download_opentree` script and contains: + +* `labelled_supertree_simplified_ottnames.tre` -- the raw downloaded tree +* `draftversion.tre` -- the tree with `mrca***` labels removed and whitespace normalised +* `taxonomy.tsv` -- the OTT taxonomy file + +These subdirectories are .gitignored and tracked by DVC as pipeline outputs. ### How to get the files -* `draftversionXXX.tre` should contain an OpenTree newick file with simplified names and `mrca***` labels removed. 
This can be created from the OpenTree download file `labelled_supertree_simplified_ottnames.tre`. To get this file, you can either download the complete OpenTree distribution, or get the single necessary file by following the link from [https://tree.opentreeoflife.org/about/synthesis-release/](https://tree.opentreeoflife.org/about/synthesis-release/) to 'browse full output' then 'labelled_supertree/index.html' (usually at the end of the "Supertree algorithm" section). Make sure that you *don't* get the `...without_monotypic.tre` version, otherwise you will be missing some intermediate nodes, and the popularity ratings may suffer. - - Removing the `mrca***` labels can be done by using a simple regular expression substitution, as in the following perl command: - ``` - # assumes you have defined OT_VERSION as an environment variable, e.g. > OT_VERSION=14.7 - perl -pe 's/\)mrcaott\d+ott\d+/\)/g; s/[ _]+/_/g;' labelled_supertree_simplified_ottnames.tre > draftversion${OT_VERSION}.tre - ``` +Run the download script with the desired synthesis version: -* The OpenTree taxonomy, in a subfolder called ottYYY/ (where YYY is the OT_TAXONOMY_VERSION; the only important file is ottYYY/taxonomy.tsv). Get the `ottYYY.tgz` file (where YYY is the correct taxonomy version for your version XXX of the tree) from [http://files.opentreeoflife.org/ott](http://files.opentreeoflife.org/ott/) and unpack it. Alternatively, the lastest is usually at [https://tree.opentreeoflife.org/about/taxonomy-version](https://tree.opentreeoflife.org/about/taxonomy-version). +``` +download_opentree --version v16.1 --output-dir data/OpenTree +``` -### Use +The script fetches the [synthesis manifest](https://raw.githubusercontent.com/OpenTreeOfLife/opentree/master/webapp/static/statistics/synthesis.json) to look up the correct OTT taxonomy version, then downloads both the labelled supertree and taxonomy automatically. 
+ +This is also available as a DVC pipeline stage (`download_opentree` in `dvc.yaml`), so `dvc repro` will run it when `ot_version` changes in `params.yaml`. -These files are processed by the scripts in ServerScripts/TreeBuild/OpenTreeRefine to create an OpenTree without subspecies, with polytomies resolved, and with all nodes named. +### Use -Note that the `ott/taxonomy.tsv` file is also used by other scripts e.g. for popularity, TaxonMapping, etc. +These files are processed by the pipeline stages in `dvc.yaml` to create the full OneZoom tree. The `taxonomy.tsv` file is also used by other stages (e.g. for popularity mapping, EoL filtering, etc.). NB: for the rationale of using `...simplified_ottnames` see [https://github.com/OpenTreeOfLife/treemachine/issues/147#issuecomment-209105659](https://github.com/OpenTreeOfLife/treemachine/issues/147#issuecomment-209105659) and also [here](https://groups.google.com/forum/#!topic/opentreeoflife/EzqctKrJySk) diff --git a/data/README.markdown b/data/README.markdown index 73010b7..3479ab1 100755 --- a/data/README.markdown +++ b/data/README.markdown @@ -1,13 +1,11 @@ # Downloading required data files - -To build a tree, you will first need to download various files from the internet. These are not provided by OneZoom directly as they are (a) very large and (b) regularly updated. The files you will need are: -* Open Tree of Life files, to be downloaded into the `OpenTree` directory (see [OpenTree/README.markdown](OpenTree/README.markdown) - * `labelled_supertree_simplified_ottnames.tre` (subsequently converted to `draftversionXXX.tre`, as detailed in the instructions) - * `ottX.Y/taxonomy.tsv` (where X.Y is the OT_TAXONOMY_VERSION) -* Wikimedia files, to be downloaded into directories within the `Wiki` directory (see [Wiki/README.markdown](Wiki/README.markdown)) - * `wd_JSON/latest-all.json.bz2` - * `wp_SQL/enwiki-latest-page.sql.gz` - * `wp_pagecounts/pageviews-YYYYMM-user.bz2` (several files for different months). 
Or download preprocessed files from a [release](https://github.com/OneZoom/tree-build/releases) -* EoL files, to be downloaded into the `EOL` directory (see [EOL/README.markdown](EOL/README.markdown)) - * `identifiers.csv` +To build a tree, you will first need various data files from the internet. These are not provided by OneZoom directly as they are (a) very large and (b) regularly updated. + +All source files are downloaded automatically by DVC pipeline stages: + +- **Open Tree of Life** files, downloaded by the `download_opentree` stage into `OpenTree/<ot_version>/` (see [OpenTree/README.markdown](OpenTree/README.markdown)) +- **EOL provider IDs**, downloaded by the `download_eol` stage into `EOL/provider_ids.csv.gz` +- **Wikipedia SQL dump**, downloaded by the `download_wikipedia_sql` stage into `Wiki/wp_SQL/enwiki-page.sql.gz` (see [Wiki/README.markdown](Wiki/README.markdown)) +- **Wikidata JSON dump**, streamed and filtered by the `download_and_filter_wikidata` stage (see [Wiki/README.markdown](Wiki/README.markdown)) +- **Wikipedia pageviews**, streamed and filtered by the `download_and_filter_pageviews` stage (see [Wiki/README.markdown](Wiki/README.markdown)) diff --git a/data/Wiki/.gitignore b/data/Wiki/.gitignore index 6c9d760..47df0e7 100755 --- a/data/Wiki/.gitignore +++ b/data/Wiki/.gitignore @@ -3,4 +3,5 @@ # But not these files... 
!.gitignore -!README.markdown \ No newline at end of file +!README.markdown +!*.dvc \ No newline at end of file diff --git a/data/Wiki/README.markdown b/data/Wiki/README.markdown index dcbd4e5..3e9d45a 100755 --- a/data/Wiki/README.markdown +++ b/data/Wiki/README.markdown @@ -1,20 +1,24 @@ -To allow mappings to wikipedia and popularity calculations, the following three files -should be uploaded to their respective directories (NB: these could be symlinks to -versions on external storage) +To allow mappings to wikipedia and popularity calculations, the following +files are downloaded and filtered automatically by pipeline stages: -* The `wd_JSON` directory should contain the wikidata JSON dump, as `latest-all.json.bz2` -(download from ) -* The `wp_SQL` directory should contain the en.wikipedia SQL dump file, as `enwiki-latest-page.sql.gz` -(download from ) -* The `wp_pagecounts` directory should contain the wikipedia pagevisits dump files: -multiple files such as `wp_pagecounts/pageviews-202403-user.bz2` etc... -(download from ). +- **`download_wikipedia_sql`** downloads the en.wikipedia SQL dump + (`enwiki-page.sql.gz`, ~2 GB) from + <https://dumps.wikimedia.org/enwiki/>. To re-download the latest + version, run `dvc repro --force discover_enwiki_sql_url download_wikipedia_sql`. -For `wp_pagecounts`, as a much faster alternative, you can download preprocessed pageviews files from a [release](https://github.com/OneZoom/tree-build/releases). +- **`download_and_filter_wikidata`** streams the full Wikidata JSON dump + (`latest-all.json.bz2`, ~90 GB) from + <https://dumps.wikimedia.org/wikidatawiki/entities/>, filters it on the fly, + and writes only the small filtered output. To re-download with a fresh dump, + run `dvc repro --force discover_wikidata_url download_and_filter_wikidata`. -You can download the gz file and unpack it in one command. e.g. 
from `data/Wiki/wp_pagecounts`, run: -```bash -wget https://github.com/OneZoom/tree-build/releases/download/pageviews-202306-202403/OneZoom_pageviews-202306-202403.tar.gz -O - | tar -xz -``` +- **`download_and_filter_pageviews`** streams monthly `-user` dumps from + <https://dumps.wikimedia.org/other/pageview_complete/monthly/>, filters them + against the wikidata titles, and caches the small filtered outputs. Only the + most recent N months (configured via `--months` in the DVC stage) are + processed. To pick up newly published months, run + `dvc repro --force download_and_filter_pageviews`. -You will then omit passing pageviews files when you later run `generate_filtered_files` (see [build steps](../../oz_tree_build/README.markdown)). +If someone has already run the pipeline and pushed results to the DVC remote, +you do not need to download these files yourself -- +`dvc repro --pull --allow-missing` will pull the cached filtered outputs instead. diff --git a/data/Wiki/wd_JSON/.gitignore b/data/Wiki/wd_JSON/.gitignore index d6b7ef3..65e7aa0 100644 --- a/data/Wiki/wd_JSON/.gitignore +++ b/data/Wiki/wd_JSON/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!*.dvc diff --git a/data/Wiki/wp_SQL/.gitignore b/data/Wiki/wp_SQL/.gitignore index d6b7ef3..65e7aa0 100644 --- a/data/Wiki/wp_SQL/.gitignore +++ b/data/Wiki/wp_SQL/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!*.dvc diff --git a/data/Wiki/wp_pagecounts/.gitignore b/data/Wiki/wp_pagecounts/.gitignore index d6b7ef3..65e7aa0 100644 --- a/data/Wiki/wp_pagecounts/.gitignore +++ b/data/Wiki/wp_pagecounts/.gitignore @@ -1,2 +1,3 @@ * !.gitignore +!*.dvc diff --git a/data/filtered/.gitignore b/data/filtered/.gitignore new file mode 100644 index 0000000..4ba3b80 --- /dev/null +++ b/data/filtered/.gitignore @@ -0,0 +1,6 @@ +# Ignore everything +* + +# But not these files... 
+!.gitignore +!*.dvc \ No newline at end of file diff --git a/data/output_files/.gitignore b/data/output_files/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/data/output_files/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..ab36a80 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,281 @@ +schema: '2.0' +stages: + download_and_filter_wikidata: + cmd: download_and_filter_wikidata --url "$(cat + data/Wiki/wd_JSON/latest-all-json-bz2-url.txt)" -o + data/filtered/OneZoom_latest-all.json + deps: + - path: data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + hash: md5 + md5: e094b0f57c0c14e1016842c2dac5482e + size: 90 + outs: + - path: data/filtered/OneZoom_latest-all.json + hash: md5 + md5: 65b22a5c761c78d79b30faf871d1e404 + size: 1542990225 + extract_wikidata_titles: + cmd: extract_wikidata_titles data/filtered/OneZoom_latest-all.json -o + data/filtered/wikidata_titles.txt + deps: + - path: data/filtered/OneZoom_latest-all.json + hash: md5 + md5: 65b22a5c761c78d79b30faf871d1e404 + size: 1542990225 + outs: + - path: data/filtered/wikidata_titles.txt + hash: md5 + md5: e498b85311c8a84a0d5157a0bbbcb23f + size: 9382189 + download_wikipedia_sql: + cmd: wget --progress=bar:force -O data/Wiki/wp_SQL/enwiki-page.sql.gz "$(cat + data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt)" + deps: + - path: data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + hash: md5 + md5: 1018a5c664d01747fdd7e218190cb4ac + size: 72 + outs: + - path: data/Wiki/wp_SQL/enwiki-page.sql.gz + hash: md5 + md5: 541324eaa9f3f1a14bb6ddcf7ea95de6 + size: 2397370114 + download_and_filter_pageviews: + cmd: download_and_filter_pageviews --titles-file + data/filtered/wikidata_titles.txt --months 12 -o data/filtered/pageviews + deps: + - path: data/filtered/wikidata_titles.txt + hash: md5 + md5: e498b85311c8a84a0d5157a0bbbcb23f + size: 9382189 + outs: + - path: data/filtered/pageviews/ + hash: md5 + md5: f021afa12f9d7c893412a0b2980ab187.dir + size: 
104535557 + nfiles: 13 + filter_sql: + cmd: filter_wikipedia_sql data/Wiki/wp_SQL/enwiki-page.sql.gz + data/filtered/wikidata_titles.txt -o + data/filtered/OneZoom_enwiki-latest-page.sql + deps: + - path: data/Wiki/wp_SQL/enwiki-page.sql.gz + hash: md5 + md5: 541324eaa9f3f1a14bb6ddcf7ea95de6 + size: 2397370114 + - path: data/filtered/wikidata_titles.txt + hash: md5 + md5: e498b85311c8a84a0d5157a0bbbcb23f + size: 9382189 + outs: + - path: data/filtered/OneZoom_enwiki-latest-page.sql + hash: md5 + md5: af577077774060f78a752f22e2f1c6d3 + size: 21445833 + download_opentree: + cmd: download_opentree --version v16.1 --output-dir data/OpenTree + params: + params.yaml: + ot_version: v16.1 + outs: + - path: data/OpenTree/v16.1/ + hash: md5 + md5: 87ff995e9d5028efc185857f34448746.dir + size: 587064765 + nfiles: 3 + add_ott_numbers: + cmd: rm -rf data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 && mkdir + -p data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 && + add_ott_numbers_to_trees --savein + data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 + data/OZTreeBuild/AllLife/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] + deps: + - path: data/OZTreeBuild/AllLife/BespokeTree/include_noAutoOTT/ + hash: md5 + md5: 8cb57266b725e9893505618bf366af54.dir + size: 1231351 + nfiles: 56 + params: + params.yaml: + ot_version: v16.1 + oz_tree: AllLife + outs: + - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ + hash: md5 + md5: 694f92ea523d0628824b4cf4ed7ef19e.dir + size: 1806192 + nfiles: 56 + prepare_open_trees: + cmd: mkdir -p data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all && cp -n + data/OZTreeBuild/AllLife/OpenTreeParts/OT_required/*.nwk + data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ 2>/dev/null || true + && cd data/OZTreeBuild/AllLife && get_open_trees_from_one_zoom + ../../OpenTree/v16.1/draftversion.tre OpenTreeParts/OpenTree_all/ + BespokeTree/include_OT_v16.1/*.PHY + deps: + - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ + 
hash: md5 + md5: 694f92ea523d0628824b4cf4ed7ef19e.dir + size: 1806192 + nfiles: 56 + - path: data/OZTreeBuild/AllLife/OpenTreeParts/OT_required/ + hash: md5 + md5: 81be05fde561126fb58b7bb7e8a0fbcd.dir + size: 808 + nfiles: 3 + - path: data/OpenTree/v16.1/draftversion.tre + hash: md5 + md5: f59f21497ceb1b33273a192a63308386 + size: 83619970 + params: + params.yaml: + ot_version: v16.1 + oz_tree: AllLife + outs: + - path: data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ + hash: md5 + md5: 3f8df65320201c2db0ee35b17916c7cb.dir + size: 81928129 + nfiles: 229 + download_eol: + cmd: "HEADER_FILE=$(mktemp) && curl -L -D \"$HEADER_FILE\" -o data/EOL/provider_ids.csv.gz + https://eol.org/data/provider_ids.csv.gz && grep -i last-modified \"$HEADER_FILE\"\ + \ | tail -1 | sed 's/^[^:]*: //' > data/EOL/provider_ids_last_modified.txt && + rm -f \"$HEADER_FILE\"" + outs: + - path: data/EOL/provider_ids.csv.gz + hash: md5 + md5: 03c0e857a35695b6b6a5467014bd38d8 + size: 314398683 + - path: data/EOL/provider_ids_last_modified.txt + hash: md5 + md5: cb59f793fb021063b7c35e821ee8086a + size: 31 + filter_eol: + cmd: filter_eol data/EOL/provider_ids.csv.gz + data/OpenTree/v16.1/taxonomy.tsv -o data/filtered/OneZoom_provider_ids.csv + deps: + - path: data/EOL/provider_ids.csv.gz + hash: md5 + md5: 03c0e857a35695b6b6a5467014bd38d8 + size: 314398683 + - path: data/OpenTree/v16.1/taxonomy.tsv + hash: md5 + md5: d7a58eaaf132522b89a506e96ca5098f + size: 417054016 + params: + params.yaml: + ot_version: v16.1 + outs: + - path: data/filtered/OneZoom_provider_ids.csv + hash: md5 + md5: f7c9bb8374957c07168bec36d6591347 + size: 221682224 + build_tree: + cmd: cd data/OZTreeBuild/AllLife && build_oz_tree + BespokeTree/include_OT_v16.1/Base.PHY OpenTreeParts/OpenTree_all/ + AllLife_full_tree.phy + deps: + - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ + hash: md5 + md5: 694f92ea523d0628824b4cf4ed7ef19e.dir + size: 1806192 + nfiles: 56 + - path: 
data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ + hash: md5 + md5: 3f8df65320201c2db0ee35b17916c7cb.dir + size: 81928129 + nfiles: 229 + params: + params.yaml: + ot_version: v16.1 + oz_tree: AllLife + outs: + - path: data/OZTreeBuild/AllLife/AllLife_full_tree.phy + hash: md5 + md5: 0b17680b0a0a633f8ae50e4a8f68f17a + size: 83061022 + create_tables: + cmd: mkdir -p data/output_files && CSV_base_table_creator + data/OZTreeBuild/AllLife/AllLife_full_tree.phy + data/OpenTree/v16.1/taxonomy.tsv data/filtered/OneZoom_provider_ids.csv + data/filtered/OneZoom_latest-all.json + data/filtered/OneZoom_enwiki-latest-page.sql + data/filtered/pageviews/OneZoom_pageviews* -o data/output_files -v + --version 28017344 --exclude Archosauria_ott335588 Dinosauria_ott90215 + --extra_source_file + data/OZTreeBuild/AllLife/BespokeTree/SupplementaryTaxonomy.tsv 2> + data/output_files/ordered_output.log + deps: + - path: data/OZTreeBuild/AllLife/AllLife_full_tree.phy + hash: md5 + md5: 0b17680b0a0a633f8ae50e4a8f68f17a + size: 83061022 + - path: data/OZTreeBuild/AllLife/BespokeTree/SupplementaryTaxonomy.tsv + hash: md5 + md5: 8e861649388bf88595b93c0199f2cc3a + size: 312 + isexec: true + - path: data/OpenTree/v16.1/taxonomy.tsv + hash: md5 + md5: d7a58eaaf132522b89a506e96ca5098f + size: 417054016 + - path: data/filtered/OneZoom_enwiki-latest-page.sql + hash: md5 + md5: af577077774060f78a752f22e2f1c6d3 + size: 21445833 + - path: data/filtered/OneZoom_latest-all.json + hash: md5 + md5: 65b22a5c761c78d79b30faf871d1e404 + size: 1542990225 + - path: data/filtered/OneZoom_provider_ids.csv + hash: md5 + md5: f7c9bb8374957c07168bec36d6591347 + size: 221682224 + - path: data/filtered/pageviews/ + hash: md5 + md5: f021afa12f9d7c893412a0b2980ab187.dir + size: 104535557 + nfiles: 13 + params: + params.yaml: + build_version: 28017344 + exclude_from_popularity: Archosauria_ott335588 Dinosauria_ott90215 + ot_version: v16.1 + oz_tree: AllLife + outs: + - path: data/output_files/ + hash: md5 + md5: 
a9e258bc6560d47e8d4af723eee707b6.dir + size: 1182677609 + nfiles: 8 + make_js: + cmd: mkdir -p data/js_output && make_js_treefiles --outdir data/js_output + deps: + - path: data/output_files/ + hash: md5 + md5: a9e258bc6560d47e8d4af723eee707b6.dir + size: 1182677609 + nfiles: 8 + outs: + - path: data/js_output/ + hash: md5 + md5: 35cc8189813c4523903d33c9acce3b69.dir + size: 8293094 + nfiles: 6 + discover_wikidata_url: + cmd: discover_latest_wikidata_dump_url > + data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + outs: + - path: data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + hash: md5 + md5: e094b0f57c0c14e1016842c2dac5482e + size: 90 + discover_enwiki_sql_url: + cmd: discover_latest_enwiki_sql_url > + data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + outs: + - path: data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + hash: md5 + md5: 1018a5c664d01747fdd7e218190cb4ac + size: 72 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..cbe36bf --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,198 @@ +vars: + - params.yaml + +stages: + download_opentree: + cmd: download_opentree --version ${ot_version} --output-dir data/OpenTree + params: + - ot_version + outs: + - data/OpenTree/${ot_version}/ + + # ~20 secs + add_ott_numbers: + cmd: >- + rm -rf data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} && + mkdir -p data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} && + add_ott_numbers_to_trees + --savein data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} + data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] + deps: + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/ + params: + - oz_tree + - ot_version + outs: + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version}/ + + # ~a few secs + prepare_open_trees: + cmd: >- + mkdir -p data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all && + cp -n data/OZTreeBuild/${oz_tree}/OpenTreeParts/OT_required/*.nwk + 
data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ 2>/dev/null || true && + cd data/OZTreeBuild/${oz_tree} && + get_open_trees_from_one_zoom + ../../OpenTree/${ot_version}/draftversion.tre + OpenTreeParts/OpenTree_all/ + BespokeTree/include_OT_${ot_version}/*.PHY + deps: + - data/OpenTree/${ot_version}/draftversion.tre + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version}/ + - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OT_required/ + params: + - oz_tree + - ot_version + outs: + - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ + + build_tree: + cmd: >- + cd data/OZTreeBuild/${oz_tree} && + build_oz_tree + BespokeTree/include_OT_${ot_version}/Base.PHY + OpenTreeParts/OpenTree_all/ + ${oz_tree}_full_tree.phy + deps: + - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version}/ + - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ + params: + - oz_tree + - ot_version + outs: + - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy + + download_eol: + # EOL doesn't version the provider ids file, so capture the last-modified header instead + cmd: >- + HEADER_FILE=$(mktemp) + && curl -L -D "$HEADER_FILE" -o data/EOL/provider_ids.csv.gz + https://eol.org/data/provider_ids.csv.gz + && grep -i last-modified "$HEADER_FILE" | tail -1 | sed 's/^[^:]*: //' > data/EOL/provider_ids_last_modified.txt + && rm -f "$HEADER_FILE" + outs: + - data/EOL/provider_ids.csv.gz + - data/EOL/provider_ids_last_modified.txt + + filter_eol: + cmd: >- + filter_eol + data/EOL/provider_ids.csv.gz + data/OpenTree/${ot_version}/taxonomy.tsv + -o data/filtered/OneZoom_provider_ids.csv + deps: + - data/EOL/provider_ids.csv.gz + - data/OpenTree/${ot_version}/taxonomy.tsv + params: + - ot_version + outs: + - data/filtered/OneZoom_provider_ids.csv + + discover_wikidata_url: + cmd: >- + discover_latest_wikidata_dump_url > data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + outs: + - data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + + # >6 hours (streams ~90 GB dump from 
wikidata server) + download_and_filter_wikidata: + cmd: >- + download_and_filter_wikidata + --url "$(cat data/Wiki/wd_JSON/latest-all-json-bz2-url.txt)" + -o data/filtered/OneZoom_latest-all.json + deps: + - data/Wiki/wd_JSON/latest-all-json-bz2-url.txt + outs: + - data/filtered/OneZoom_latest-all.json + + extract_wikidata_titles: + cmd: >- + extract_wikidata_titles + data/filtered/OneZoom_latest-all.json + -o data/filtered/wikidata_titles.txt + deps: + - data/filtered/OneZoom_latest-all.json + outs: + - data/filtered/wikidata_titles.txt + + discover_enwiki_sql_url: + cmd: >- + discover_latest_enwiki_sql_url > data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + outs: + - data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + + download_wikipedia_sql: + cmd: >- + wget --progress=bar:force -O data/Wiki/wp_SQL/enwiki-page.sql.gz + "$(cat data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt)" + deps: + - data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt + outs: + - data/Wiki/wp_SQL/enwiki-page.sql.gz + + filter_sql: + cmd: >- + filter_wikipedia_sql + data/Wiki/wp_SQL/enwiki-page.sql.gz + data/filtered/wikidata_titles.txt + -o data/filtered/OneZoom_enwiki-latest-page.sql + deps: + - data/Wiki/wp_SQL/enwiki-page.sql.gz + - data/filtered/wikidata_titles.txt + outs: + - data/filtered/OneZoom_enwiki-latest-page.sql + + # ~several hours (streams 12 ~5GB monthly dumps) + download_and_filter_pageviews: + cmd: >- + download_and_filter_pageviews + --titles-file data/filtered/wikidata_titles.txt + --months 12 + -o data/filtered/pageviews + deps: + - data/filtered/wikidata_titles.txt + outs: + - data/filtered/pageviews/: + persist: true + + # ~10 mins + create_tables: + cmd: >- + mkdir -p data/output_files && + CSV_base_table_creator + data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy + data/OpenTree/${ot_version}/taxonomy.tsv + data/filtered/OneZoom_provider_ids.csv + data/filtered/OneZoom_latest-all.json + data/filtered/OneZoom_enwiki-latest-page.sql + data/filtered/pageviews/OneZoom_pageviews* + -o 
data/output_files -v + --version ${build_version} + --exclude ${exclude_from_popularity} + --extra_source_file data/OZTreeBuild/${oz_tree}/BespokeTree/SupplementaryTaxonomy.tsv + 2> data/output_files/ordered_output.log + deps: + - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy + - data/OpenTree/${ot_version}/taxonomy.tsv + - data/filtered/OneZoom_provider_ids.csv + - data/filtered/OneZoom_latest-all.json + - data/filtered/OneZoom_enwiki-latest-page.sql + - data/filtered/pageviews/ + - data/OZTreeBuild/${oz_tree}/BespokeTree/SupplementaryTaxonomy.tsv + params: + - oz_tree + - ot_version + - build_version + - exclude_from_popularity + outs: + - data/output_files/ + + make_js: + cmd: >- + mkdir -p data/js_output && + make_js_treefiles + --outdir data/js_output + deps: + - data/output_files/ + outs: + - data/js_output/ diff --git a/oz_tree_build/README.markdown b/oz_tree_build/README.markdown index d3336f5..01fbdb2 100755 --- a/oz_tree_build/README.markdown +++ b/oz_tree_build/README.markdown @@ -1,18 +1,59 @@ # Introduction + Creating a bespoke OneZoom tree involves a number of steps, as documented below. These take an initial tree, map taxa onto Open Tree identifiers, add subtrees from the OpenTree of Life, resolve polytomies and delete subspecies, and calculate mappings to other databases together with creating wikipedia popularity metrics for all taxa. Finally, the resulting tree and database files are converted to a format usable by the OneZoom viewer. Mapping and popularity calculations require various large files to be downloaded e.g. from wikipedia, as [documented here](../data/README.markdown). The instructions below are primarily intended for creating a full tree of all life on the main OneZoom site. If you are making a bespoke tree, you may need to tweak them slightly. 
-The output files created by the tree building process (database files and files to feed to the js, -and which can be loaded into the database and for the tree viewer) are saved in `output_files`. +The output files created by the tree building process (database files and files to feed to the js, and which can be loaded into the database and for the tree viewer) are saved in `data/output_files`. + +## Using DVC (recommended) + +The entire build is defined as a [DVC](https://dvc.org/) pipeline in `dvc.yaml`, with parameters in `params.yaml`. This means you can reproduce the full build with a single command: + +```bash +source .venv/bin/activate +dvc repro +``` + +If the pipeline has already been run by someone else and the results pushed to the DVC remote, you can pull cached outputs without downloading any of the large source files: + +```bash +dvc repro --pull --allow-missing +``` + +To run only up to a specific stage (e.g. just the JS generation): + +```bash +dvc repro make_js +``` + +To visualize the pipeline graph: + +```bash +dvc dag +``` -## Environment +After running the pipeline, copy the JS output from `data/js_output/` to the OZtree repo: + +```bash +cp data/js_output/* ../OZtree/static/FinalOutputs/data/ +``` + +### Updating parameters + +Edit `params.yaml` to change the OpenTree version, taxonomy version, build version, etc. DVC will detect the parameter changes and re-run only the affected stages. + +## Manual steps (without DVC) + +The following manual instructions are preserved for reference. They document the same steps that the DVC pipeline automates. + +### Environment The following environment variables should be set: ``` OZ_TREE=AllLife # a tree directory in data/OZTreeBuild -OZ_DIR=../OZtree # the path to the OneZoom/OZtree github directory (here we assume the `tree-build` repo is a sibling to the `OZtree` repo) +OZ_DIR=../OZtree # the path to the OneZoom/OZtree github directory ``` You also need to select the OpenTree version to build against. 
@@ -33,7 +74,7 @@ OT_TAXONOMY_VERSION=3.7 OT_TAXONOMY_EXTRA=draft2 #optional - the draft for this version, e.g. `draft1` if the taxonomy_version is 3.6draft1 ``` -## Downloads +### Downloads Follow the [the download instructions](../data/README.markdown) to fetch required files. In summary, this should entail: @@ -46,13 +87,12 @@ wget -cP data/OpenTree/ "https://files.opentreeoflife.org/ott/ott${OT_TAXONOMY_V wget -cP data/Wiki/wp_SQL/ https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz wget -cP data/Wiki/wd_JSON/ https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 -## Pre-processed PageViews - see https://github.com/OneZoom/tree-build/releases -curl -L https://github.com/OneZoom/tree-build/releases/download/pageviews-202306-202403/OneZoom_pageviews-202306-202403.tar.gz | tar -zxC data/Wiki/wp_pagecounts/ +## Pageviews +wget -cP data/Wiki/wp_pagecounts/ 'https://dumps.wikimedia.org/other/pageview_complete/monthly/2024/2024-03/pageviews-202403-user.bz2' ## EoL # TODO: In theory fetchable from https://opendata.eol.org/dataset/identifier-map, but currently broken cp provider_ids.csv.gz data/EOL/ - ``` Note that as documented in that readme, @@ -64,144 +104,143 @@ perl -pe 's/\)mrcaott\d+ott\d+/\)/g; s/[ _]+/_/g;' \ > data/OpenTree/draftversion${OT_VERSION}.tre ``` -# Building a tree +### Building a tree -The times given at the start of each of the following steps refer to the time taken to run the commands on the entire tree of life. +The times given at the start of each of the following steps refer to the time taken to run the commands on the entire tree of life. If you already have your own newick tree with open tree ids on it already, and don't want to graft extra clades from the OpenTree, you can skip steps 1-4, and simply save the tree as `${OZ_TREE}_full_tree.phy` in your base directory. 
If you have a tree but it does not have ott numbers, then you can add them using step 1, and move the resulting tree in `BespokeTree/include_files` to `${OZ_TREE}_full_tree.phy` in your base directory. -## Create the tree +### Create the tree 0. The following steps assume the venv has been activated: - ``` - . .venv/bin/activate - ``` + ``` + . .venv/bin/activate + ``` - If not created, see installation steps in the [main README](../README.markdown). + If not created, see installation steps in the [main README](../README.markdown). -1. (20 secs) Use the [OpenTree API](https://github.com/OpenTreeOfLife/germinator/wiki/Synthetic-tree-API-v3) to add OTT ids to any non-opentree taxa in our own bespoke phylogenies (those in `*.phy` or `*.PHY` files). The new `.phy` and `.PHY` files will be created in a new directory within `data/OZTreeBuild/${OZ_TREE}/BespokeTree`, and a symlink to that directory will be created called `include_files` - - ``` - mkdir -p "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}" - touch "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/dir" - rm data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/* && \ - add_ott_numbers_to_trees \ - --savein data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA} \ - data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] - ``` +1. (20 secs) Use the [OpenTree API](https://github.com/OpenTreeOfLife/germinator/wiki/Synthetic-tree-API-v3) to add OTT ids to any non-opentree taxa in our own bespoke phylogenies (those in `*.phy` or `*.PHY` files). 
The new `.phy` and `.PHY` files will be created in a new directory within `data/OZTreeBuild/${OZ_TREE}/BespokeTree`, and a symlink to that directory will be created called `include_files` + + ``` + mkdir -p "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}" + touch "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/dir" + rm data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/* && \ + add_ott_numbers_to_trees \ + --savein data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA} \ + data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] + ``` 1. Copy supplementary OpenTree-like newick files (if any) to the `OpenTree_all` directory. These are clades referenced in the OneZoom phylogeny that are missing from the OpenTree, and whose subtrees thus need to be supplied by hand. If any are required, they should be placed in the `OT_required` directory within `data/OZTreeBuild/${OZ_TREE}`. For tree building, they should be copied into the directory containing OpenTree subtrees using - ``` - (cd data/OZTreeBuild/${OZ_TREE}/OpenTreeParts && \ - cp -n OT_required/*.nwk OpenTree_all/) - ``` - If you do not have any supplementary `.nwk` subtrees in the `OT_required` directory, this step will output a warning, which can be ignored. + ``` + (cd data/OZTreeBuild/${OZ_TREE}/OpenTreeParts && \ + cp -n OT_required/*.nwk OpenTree_all/) + ``` + + If you do not have any supplementary `.nwk` subtrees in the `OT_required` directory, this step will output a warning, which can be ignored. 1. (a few secs) Construct OpenTree subtrees for inclusion from the `draftversion${OT_VERSION}.tre` file. The subtrees to be extracted are specified by inclusion strings in the `.PHY` files created in step 1. 
The command for this is `getOpenTreesFromOneZoom.py`, and it needs to be run from within the `data/OZTreeBuild/${OZ_TREE}` directory, as follows: - ``` - (cd data/OZTreeBuild/${OZ_TREE} && get_open_trees_from_one_zoom \ - ../../OpenTree/draftversion${OT_VERSION}.tre OpenTreeParts/OpenTree_all/ \ - BespokeTree/include_files/*.PHY) - ``` - If you are not including any OpenTree subtrees in your final tree, you should have no `.PHY` files, and this step will output a warning, which can be ignored. - -1. (1 sec) substitute these subtrees into the main tree, and save the resulting full newick file using the `build_oz_tree` script: + ``` + (cd data/OZTreeBuild/${OZ_TREE} && get_open_trees_from_one_zoom \ + ../../OpenTree/draftversion${OT_VERSION}.tre OpenTreeParts/OpenTree_all/ \ + BespokeTree/include_files/*.PHY) + ``` - ``` - (cd data/OZTreeBuild/${OZ_TREE} && \ - build_oz_tree BespokeTree/include_files/Base.PHY OpenTreeParts/OpenTree_all/ AllLife_full_tree.phy) - ``` + If you are not including any OpenTree subtrees in your final tree, you should have no `.PHY` files, and this step will output a warning, which can be ignored. - Now that we are not having to run this every sponsorship time, we should probably re-write this to actually know what tree structure looks like, maybe using Python/DendroPy (see https://github.com/jrosindell/OneZoomComplete/issues/340) and also to automatically create the list of DOIs at `${OZ_DIR}/static/FinalOutputs/refs.txt`. Note that any '@' signs in the `${OZ_TREE}_full_tree.phy` output file are indicative of OpenTree substitutions that have not been possible: it would be good to check to see if there are other sources (or old OpenTree versions) that have trees for these nodes, and place them as .phy files in `data/OZTreeBuild/${OZ_TREE}/OpenTreeParts/OT_required/`. You can check with +1. 
(1 sec) substitute these subtrees into the main tree, and save the resulting full newick file using the `build_oz_tree` script: - ``` - grep -o '.............@' data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy - ``` - You may also want to save a zipped version of the full tree file in a place where users can download it for reference purposes, in which case you can do + ``` + (cd data/OZTreeBuild/${OZ_TREE} && \ + build_oz_tree BespokeTree/include_files/Base.PHY OpenTreeParts/OpenTree_all/ AllLife_full_tree.phy) + ``` - ``` - gzip < data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy > ${OZ_DIR}/static/FinalOutputs/${OZ_TREE}_full_tree.phy.gz - ``` + Now that we are not having to run this every sponsorship time, we should probably re-write this to actually know what tree structure looks like, maybe using Python/DendroPy (see https://github.com/jrosindell/OneZoomComplete/issues/340) and also to automatically create the list of DOIs at `${OZ_DIR}/static/FinalOutputs/refs.txt`. Note that any '@' signs in the `${OZ_TREE}_full_tree.phy` output file are indicative of OpenTree substitutions that have not been possible: it would be good to check to see if there are other sources (or old OpenTree versions) that have trees for these nodes, and place them as .phy files in `data/OZTreeBuild/${OZ_TREE}/OpenTreeParts/OT_required/`. You can check with - ## Create the base tree and table data - -1. (5 to 7 hours, or a few mins if files are already filtered, see below) This generates filtered versions of the raw input files, which then makes them faster to work with. For example, for the massive wikimedia dump file (`latest-all.json.bz2`), it remove all entries that aren't taxons or vernaculars, and for each remaining entry, in only keeps the small subset of fields that we care about. 
+ ``` + grep -o '.............@' data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy + ``` - The output files have the same names as the input files, but with a `OneZoom_` prefix, and without using compression (e.g. `OneZoom_latest-all.json` for `latest-all.json`.bz2). They are stored next to their matching input files. + You may also want to save a zipped version of the full tree file in a place where users can download it for reference purposes, in which case you can do - Note that by default, it works incrementally and only generates new filtered files if they are missing or old versions. It does this by setting the timestamp of generated files to match their source file. So if for instance it has already filtered `latest-all.json.bz2`, but has not processed the SQL or Page Count files, you can just rerun the same command, and it will not need to reprocess `latest-all.json.bz2`. You can override this behavior and force full regeneration by passing in a `-f` flag. + ``` + gzip < data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy > ${OZ_DIR}/static/FinalOutputs/${OZ_TREE}_full_tree.phy.gz + ``` - From the data folder, run the `generate_filtered_files` script: +### Create the base tree and table data - ``` - tar -C data/OpenTree -zxvf data/OpenTree/ott${OT_TAXONOMY_VERSION}.tgz - (cd data && generate_filtered_files OZTreeBuild/AllLife/AllLife_full_tree.phy OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv EOL/provider_ids.csv.gz Wiki/wd_JSON/latest-all.json.bz2 Wiki/wp_SQL/enwiki-latest-page.sql.gz Wiki/wp_pagecounts/pageviews*.bz2) - ``` +5. (5 to 7 hours, or a few mins if files are already filtered) This generates filtered versions of the raw input files, which then makes them faster to work with. In the DVC pipeline, this is handled by the `filter_eol`, `download_and_filter_wikidata`, `filter_sql`, and `download_and_filter_pageviews` stages, which run as separate stages.
Without DVC, the `generate_filtered_files` script can still be used to run them all together: - Alternatively, if you downloaded the preprocessed pageviews file (per [instructions](../data/Wiki/README.markdown)), you should omit the last argument (`Wiki/wp_pagecounts/pageviews*.bz2`) from this `generate_filtered_files` command. + ``` + tar -C data/OpenTree -zxvf data/OpenTree/ott${OT_TAXONOMY_VERSION}.tgz + (cd data && generate_filtered_files OZTreeBuild/AllLife/AllLife_full_tree.phy OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv EOL/provider_ids.csv.gz Wiki/wd_JSON/latest-all.json.bz2 Wiki/wp_SQL/enwiki-latest-page.sql.gz Wiki/wp_pagecounts/pageviews*.bz2) + ``` 1. (11 mins) On the basis of the `${OZ_TREE}_full_tree.phy` file, look for ID mappings between different datasets, calculate popularity measures via wikidata/pedia, refine the tree (remove subspecies, randomly break polytomies, remove unifurcations etc), and then create corresponding database tables together with `ordered_tree_XXXXX.nwk`, `ordered_tree_XXXXX.poly` (same file but with polytomies marked with curly braces), and `ordered_dates_XXXXX.js` files (where XXXXX is the version number, usually a timestamp). - Additional flags can be given to override the OpenTree taxonomy in specific cases (using `--extra_source_file`), and to exclude certain taxa (e.g. dinosaurs) from the popularity calculations. - - If you do not have comprehensive tree of a clade, it probably doesn't make sense to calculate popularity measures, and you can run this script with the `-p` flag (or omit the references to the `wp_` wikipedia files).
- - ``` - CSV_base_table_creator \ - data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy \ - data/OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv \ - data/EOL/OneZoom_provider_ids.csv \ - data/Wiki/wd_JSON/OneZoom_latest-all.json \ - data/Wiki/wp_SQL/OneZoom_enwiki-latest-page.sql \ - data/Wiki/wp_pagecounts/OneZoom_pageviews* \ - -o data/output_files -v \ - --exclude Archosauria_ott335588 Dinosauria_ott90215 \ - --extra_source_file data/OZTreeBuild/${OZ_TREE}/BespokeTree/SupplementaryTaxonomy.tsv \ - 2> data/output_files/ordered_output.log - ``` - - Since round braces, curly braces, and commas are banned from the `simplified_ottnames` file, we can create minimal topology files by simply removing everything except these characters from the `.nwk` and `.poly` files. If the tree has been ladderised, with polytomies and unifurcations removed, the commas are also redundant, and can be removed. This is done in the next step, which saves these highly shortened strings into .js data files. - -1. (1 min) turn the most recently saved tree files (saved in the previous step as `data/output_files/ordered_tree_XXXXXX.poly` and `ordered_dates_XXXXXX.json`) into bracketed newick strings in `${OZ_DIR}/static/FinalOutputs/data/basetree_XXXXXX.js`, ``${OZ_DIR}/static/FinalOutputs/data/polytree_XXXXXX.js`, a cutpoints file in ``${OZ_DIR}/static/FinalOutputs/data/cut_position_map_XXXXXX.js`, and a dates file in ``${OZ_DIR}/static/FinalOutputs/data/dates_XXXXXX.json` as well as their gzipped equivalents, using - - ``` - make_js_treefiles --outdir ${OZ_DIR}/static/FinalOutputs/data - ``` - - ## Upload data to the server and check it - -1. If you are running the tree building scripts on a different computer to the one running the web server, you will need to push the `completetree_XXXXXX.js`, `completetree_XXXXXX.js.gz`, `cut_position_map_XXXXXX.js`, `cut_position_map_XXXXXX.js.gz`, `dates_XXXXXX.js` -, `dates_XXXXXX.js.gz` files onto your server, e.g. 
by pushing to your local Github repo then pulling the latest github changes to the server. + Additional flags can be given to override the OpenTree taxonomy in specific cases (using `--extra_source_file`), and to exclude certain taxa (e.g. dinosaurs) from the popularity calculations. + + If you do not have comprehensive tree of a clade, it probably doesn't make sense to calculate popularity measures, and you can run this script with the `-p` flag (or omit the references to the `wp_` wikipedia files). + + ``` + CSV_base_table_creator \ + data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy \ + data/OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv \ + data/EOL/OneZoom_provider_ids.csv \ + data/Wiki/wd_JSON/OneZoom_latest-all.json \ + data/Wiki/wp_SQL/OneZoom_enwiki-latest-page.sql \ + data/Wiki/wp_pagecounts/OneZoom_pageviews* \ + -o data/output_files -v \ + --exclude Archosauria_ott335588 Dinosauria_ott90215 \ + --extra_source_file data/OZTreeBuild/${OZ_TREE}/BespokeTree/SupplementaryTaxonomy.tsv \ + 2> data/output_files/ordered_output.log + ``` + + Since round braces, curly braces, and commas are banned from the `simplified_ottnames` file, we can create minimal topology files by simply removing everything except these characters from the `.nwk` and `.poly` files. If the tree has been ladderised, with polytomies and unifurcations removed, the commas are also redundant, and can be removed. This is done in the next step, which saves these highly shortened strings into .js data files. + +1. (1 min) Turn the most recently saved tree files (saved in the previous step as `data/output_files/ordered_tree_XXXXXX.poly` and `ordered_dates_XXXXXX.json`) into bracketed newick JS files. In the DVC pipeline, these are output to `data/js_output/` and can be copied to the OZtree repo. Without DVC, you can write directly to the OZtree directory: + + ``` + make_js_treefiles --outdir ${OZ_DIR}/static/FinalOutputs/data + ``` + +### Upload data to the server and check it + +8. 
If you are running the tree building scripts on a different computer to the one running the web server, you will need to push the `completetree_XXXXXX.js`, `completetree_XXXXXX.js.gz`, `cut_position_map_XXXXXX.js`, `cut_position_map_XXXXXX.js.gz`, `dates_XXXXXX.js`, `dates_XXXXXX.js.gz` files onto your server, e.g. by pushing to your local Github repo then pulling the latest github changes to the server. 1. (15 mins) load the CSV tables into the DB, using the SQL commands printed in step 6 (at the end of the `data/output_files/ordered_output.log` file: the lines that start something like `TRUNCATE TABLE ordered_leaves; LOAD DATA LOCAL INFILE ...;` `TRUNCATE TABLE ordered_nodes; LOAD DATA LOCAL INFILE ...;`). Either do so via a GUI utility, or copy the `.csv.mySQL` files to a local directory on the machine running your SQL server (e.g. using `scp -C` for compression) and run your `LOAD DATA LOCAL INFILE` commands on the mysql command line (this may require you to start the command line utility using `mysql --local-infile`, e.g.: ``` mysql --local-infile --host db.MYSERVER.net --user onezoom --password --database onezoom_dev ``` + 1. Check for dups, and if any sponsors are no longer on the tree, using something like the following SQL command: + ``` + select * from reservations left outer join ordered_leaves on reservations.OTT_ID = ordered_leaves.ott where ordered_leaves.ott is null and reservations.verified_name IS NOT NULL; + select group_concat(id), group_concat(parent), group_concat(name), count(ott) from ordered_leaves group by ott having(count(ott) > 1) + ``` + +### Fill in additional server fields + +11. (15 mins) create example pictures for each node by percolating up. This requires the most recent `images_by_ott` table, so either do this on the main server, or (if you are doing it locally) update your `images_by_ott` to the most recent server version. 
+ ``` - select * from reservations left outer join ordered_leaves on reservations.OTT_ID = ordered_leaves.ott where ordered_leaves.ott is null and reservations.verified_name IS NOT NULL; - select group_concat(id), group_concat(parent), group_concat(name), count(ott) from ordered_leaves group by ott having(count(ott) > 1) + ${OZ_DIR}/OZprivate/ServerScripts/Utilities/picProcess.py -v ``` - - ## Fill in additional server fields - -1. (15 mins) create example pictures for each node by percolating up. This requires the most recent `images_by_ott` table, so either do this on the main server, or (if you are doing it locally) update your `images_by_ott` to the most recent server version. - - ``` - ${OZ_DIR}/OZprivate/ServerScripts/Utilities/picProcess.py -v - ``` -1. (5 mins) percolate the IUCN data up using - - ``` - ${OZ_DIR}/OZprivate/ServerScripts/Utilities/IUCNquery.py -v - ``` - (note that this both updates the IUCN data in the DB and percolates up interior node info) + +1. (5 mins) percolate the IUCN data up using + + ``` + ${OZ_DIR}/OZprivate/ServerScripts/Utilities/IUCNquery.py -v + ``` + + (note that this both updates the IUCN data in the DB and percolates up interior node info) + 1. (10 mins) If this is a site with sponsorship (only the main OZ site), set the pricing structure using SET_PRICES.html (accessible from the management pages). -1. (5 mins - this does seem to be necessary for ordered nodes & ordered leaves). Make sure indexes are reset. Look at `OZprivate/ServerScripts/SQL/create_db_indexes.sql` for the SQL to do this - this may involve logging in to the SQL server (e.g. via Sequel Pro on Mac) and pasting all the drop index and create index commands. - - ## at last -1. Have a well deserved cup of tea +1. (5 mins - this does seem to be necessary for ordered nodes & ordered leaves). Make sure indexes are reset. Look at `OZprivate/ServerScripts/SQL/create_db_indexes.sql` for the SQL to do this - this may involve logging in to the SQL server (e.g. 
via Sequel Pro on Mac) and pasting all the drop index and create index commands. + +### At last + +15. Have a well deserved cup of tea diff --git a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py index 81d0dd8..5d7a963 100755 --- a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py +++ b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py @@ -1048,7 +1048,7 @@ def main(): nargs="?", help=( "The gzipped >1GB wikipedia -latest-page.sql.gz dump, " - "from https://dumps.wikimedia.org/enwiki/latest/ (enwiki-latest-page.sql.gz) " + "from https://dumps.wikimedia.org/enwiki/latest/ (enwiki-page.sql.gz) " ), ) parser.add_argument( diff --git a/oz_tree_build/utilities/download_and_filter_pageviews.py b/oz_tree_build/utilities/download_and_filter_pageviews.py new file mode 100644 index 0000000..59b1f47 --- /dev/null +++ b/oz_tree_build/utilities/download_and_filter_pageviews.py @@ -0,0 +1,202 @@ +"""Download pageview files from Wikimedia and filter to wikidata titles. + +Streams monthly pageview dumps directly from +https://dumps.wikimedia.org/other/pageview_complete/monthly/, +decompresses on the fly, and writes only the filtered results to disk. +Already-filtered months are skipped unless the titles file has changed. 
+""" + +import argparse +import hashlib +import itertools +import logging +import os +import re +import sys +import tempfile +import urllib.request + +from .file_utils import stream_bz2_lines_from_url +from .filter_pageviews import filter_pageview_lines, write_filtered_pageviews +from .filter_wikidata import load_titles_file + +BASE_URL = "https://dumps.wikimedia.org/other/pageview_complete/monthly/" +TITLES_HASH_FILE = ".titles_hash" +WGET_READ_TIMEOUT = 120 # seconds of no data before wget gives up + + +def _fetch_index(url): + """Fetch an Apache directory index page and return its HTML.""" + print(f"Fetching index from {url}") + req = urllib.request.Request(url, headers={"User-Agent": "OneZoom-tree-build/1.0"}) + with urllib.request.urlopen(req, timeout=30) as resp: + return resp.read().decode("utf-8") + + +def discover_pageview_months(base_url=BASE_URL): + """ + Crawl the Wikimedia monthly pageview directory listing and yield + (url, filename) tuples for ``*-user.bz2`` files, most recent first. + + Iterates years and months in reverse so callers needing only the N most + recent months can stop early without fetching every index page. 
+ """ + year_pattern = re.compile(r'href="(\d{4}/)"') + month_dir_pattern = re.compile(r'href="(\d{4}-\d{2}/)"') + user_file_pattern = re.compile(r'href="(pageviews-\d{6}-user\.bz2)"') + + years_html = _fetch_index(base_url) + year_dirs = sorted((m.group(1) for m in year_pattern.finditer(years_html)), reverse=True) + + for year_dir in year_dirs: + year_url = base_url + year_dir + + months_html = _fetch_index(year_url) + month_dirs = sorted( + (m.group(1) for m in month_dir_pattern.finditer(months_html)), + reverse=True, + ) + + for month_dir in month_dirs: + month_url = year_url + month_dir + + files_html = _fetch_index(month_url) + for file_match in user_file_pattern.finditer(files_html): + filename = file_match.group(1) + file_url = month_url + filename + yield file_url, filename + + +def _compute_file_hash(path): + """Return the SHA-256 hex digest of a file's contents.""" + h = hashlib.sha256() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + h.update(chunk) + return h.hexdigest() + + +def _output_filename(pageview_filename): + """Map a raw pageview filename to its filtered output name.""" + basename = pageview_filename + if basename.endswith(".bz2"): + basename = basename[:-4] + return f"OneZoom_{basename}" + + +def _check_and_update_titles_hash(output_dir, titles_file): + """ + Compare the stored titles hash with the current file. Returns True if + the cache is still valid. Clears existing output files and updates the + hash when it changes. 
+ """ + current_hash = _compute_file_hash(titles_file) + hash_path = os.path.join(output_dir, TITLES_HASH_FILE) + + if os.path.exists(hash_path): + with open(hash_path) as f: + stored_hash = f.read().strip() + if stored_hash == current_hash: + return True + logging.info("Titles file changed -- clearing cached pageview outputs") + for name in os.listdir(output_dir): + if name == TITLES_HASH_FILE: + continue + os.remove(os.path.join(output_dir, name)) + + with open(hash_path, "w") as f: + f.write(current_hash) + return False + + +def stream_and_filter(url, output_path, wikidata_titles, wikilang="en"): + """ + Stream a remote .bz2 pageview file, filter it, and write the result. + Uses a temp file + rename for atomicity. + """ + lines = stream_bz2_lines_from_url(url, read_timeout=WGET_READ_TIMEOUT) + pageviews = filter_pageview_lines(lines, wikidata_titles, wikilang) + + dir_name = os.path.dirname(output_path) + fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".tmp") + os.close(fd) + try: + write_filtered_pageviews(pageviews, tmp_path) + os.replace(tmp_path, output_path) + except BaseException: + if os.path.exists(tmp_path): + os.remove(tmp_path) + raise + + +def main(): + logging.basicConfig(stream=sys.stderr, level=logging.INFO) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--titles-file", + required=True, + help="wikidata_titles.txt file (one title per line)", + ) + parser.add_argument( + "--months", + type=int, + required=True, + help="Number of most recent months to process", + ) + parser.add_argument( + "-o", + "--output-dir", + required=True, + help="Output directory for filtered pageview files", + ) + parser.add_argument("--wikilang", default="en", help="Wikipedia language code") + parser.add_argument( + "--base-url", + default=BASE_URL, + help="Base URL for the Wikimedia pageview dumps", + ) + args = parser.parse_args() + + os.makedirs(args.output_dir, exist_ok=True) + + cache_valid = 
_check_and_update_titles_hash(args.output_dir, args.titles_file) + if cache_valid: + logging.info("Titles file unchanged -- cached outputs are valid") + else: + logging.info("Titles file is new or changed -- will reprocess all months") + + wikidata_titles = load_titles_file(args.titles_file) + logging.info(f"Loaded {len(wikidata_titles)} wikidata titles") + + logging.info("Discovering available pageview months from Wikimedia...") + selected = list(itertools.islice(discover_pageview_months(args.base_url), args.months)) + selected.reverse() + logging.info(f"Selected {len(selected)} most recent months") + + expected_filenames = set() + for i, (url, filename) in enumerate(selected, 1): + output_name = _output_filename(filename) + expected_filenames.add(output_name) + output_file = os.path.join(args.output_dir, output_name) + + if os.path.exists(output_file): + logging.info(f"[{i}/{len(selected)}] Skipping {filename} (already filtered)") + continue + + logging.info(f"[{i}/{len(selected)}] Streaming and filtering {filename}...") + stream_and_filter(url, output_file, wikidata_titles, wikilang=args.wikilang) + logging.info(f"[{i}/{len(selected)}] Done: {output_file}") + + expected_filenames.add(TITLES_HASH_FILE) + for name in os.listdir(args.output_dir): + if name not in expected_filenames: + stale_path = os.path.join(args.output_dir, name) + logging.info(f"Removing old pageview file outside window: {name}") + os.remove(stale_path) + + logging.info("All pageview months up to date") + + +if __name__ == "__main__": + main() diff --git a/oz_tree_build/utilities/download_and_filter_wikidata.py b/oz_tree_build/utilities/download_and_filter_wikidata.py new file mode 100644 index 0000000..a907bf9 --- /dev/null +++ b/oz_tree_build/utilities/download_and_filter_wikidata.py @@ -0,0 +1,122 @@ +"""Download the Wikidata JSON dump and filter to taxon/vernacular items. + +Streams the dump directly from Wikimedia, decompresses on the fly, and +writes only the filtered results to disk. 
# ---------------------------------------------------------------------------
# oz_tree_build/utilities/download_and_filter_wikidata.py
#
# Streams the Wikidata JSON dump directly from Wikimedia, decompresses it on
# the fly and writes only the filtered result to disk, so the full (~90 GB)
# dump is never stored locally.
#
# The definitions below rely on the module's imports: argparse, logging, os,
# re, sys, tempfile, urllib.request, stream_bz2_lines_from_url (.file_utils)
# and filter_wikidata (.filter_wikidata).  Script entry point: main().
# ---------------------------------------------------------------------------

WIKIDATA_ENTITIES_URL = "https://dumps.wikimedia.org/wikidatawiki/entities/"
WGET_READ_TIMEOUT = 600  # passed to stream_bz2_lines_from_url as read_timeout

logger = logging.getLogger(__name__)


def _read_url_text(url, timeout):
    """Fetch *url* and return its body decoded as text, closing the response."""
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode()


def discover_latest_wikidata_dump_url(base_url=WIKIDATA_ENTITIES_URL, timeout=30):
    """Find the URL of the most recent dated wikidata-YYYYMMDD-all.json.bz2 dump.

    The symlinked latest-all.json.bz2 file is deliberately not used because we
    want to know the date of the dump.

    Args:
        base_url: Directory listing to scan.  Folder URLs are built by plain
            string concatenation, so it must end with ``/``.
        timeout: Timeout (seconds) passed to every ``urlopen`` call.

    Returns:
        The full URL of the dump file found in the newest dated folder that
        links to one.

    Raises:
        RuntimeError: if the index lists no ``YYYYMMDD/`` folders, or if none
            of the folders links to a matching dump file.
    """
    # Imported explicitly so the ``except`` clause below does not depend on
    # urllib.request having loaded urllib.error as a side effect.
    from urllib.error import URLError

    folder_re = re.compile(r'href="(\d{8})/"')
    file_re_template = r'href="(wikidata-{date}-all\.json\.bz2)"'

    index_html = _read_url_text(base_url, timeout)

    # YYYYMMDD strings sort chronologically, so reverse order is newest first.
    dates = sorted(folder_re.findall(index_html), reverse=True)
    if not dates:
        raise RuntimeError(f"No dated folders found at {base_url}")

    for date in dates:
        folder_url = f"{base_url}{date}/"
        logger.info("Checking %s", folder_url)
        try:
            folder_html = _read_url_text(folder_url, timeout)
        except URLError as exc:
            # An unreachable folder is skipped; an older one may still work.
            logger.warning("Could not fetch %s: %s", folder_url, exc)
            continue

        match = re.search(file_re_template.format(date=date), folder_html)
        if match:
            url = f"{folder_url}{match.group(1)}"
            logger.info("Found latest dump: %s", url)
            return url

    raise RuntimeError(f"No wikidata-YYYYMMDD-all.json.bz2 file found in any folder at {base_url}")


def stream_and_filter(url, output_path, wikilang="en", dont_trim_sitelinks=False):
    """
    Stream a remote Wikidata .bz2 dump, filter it, and write the result.
    Uses a temp file + rename for atomicity.

    Args:
        url: URL of the bz2-compressed dump to stream.
        output_path: Final location of the filtered file.
        wikilang: Forwarded to ``filter_wikidata``.
        dont_trim_sitelinks: Forwarded to ``filter_wikidata``.

    Any exception (including KeyboardInterrupt) removes the temp file and is
    re-raised, so ``output_path`` is only ever replaced by a complete result.
    """
    lines = stream_bz2_lines_from_url(url, read_timeout=WGET_READ_TIMEOUT)

    dir_name = os.path.dirname(output_path) or "."
    # filter_wikidata opens its output via open_file_based_on_extension, which
    # presumably chooses compression from the file extension (TODO confirm).
    # Keep the real extension on the temp name so the temp file is written in
    # the same format the final name promises.
    suffix = ".tmp" + os.path.splitext(output_path)[1]
    fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=suffix)
    os.close(fd)
    try:
        filter_wikidata(
            lines,
            tmp_path,
            wikilang=wikilang,
            dont_trim_sitelinks=dont_trim_sitelinks,
        )
        os.replace(tmp_path, output_path)
    except BaseException:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise


def main():
    """CLI entry point: stream the dump at ``--url`` and write the filtered JSON."""
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="Output path for filtered wikidata JSON",
    )
    parser.add_argument("--wikilang", default="en", help="Wikipedia language code")
    parser.add_argument(
        "--url",
        required=True,
        help="URL of the wikidata-YYYYMMDD-all.json.bz2 dump to stream",
    )
    parser.add_argument(
        "--dont-trim-sitelinks",
        action="store_true",
        default=False,
        help="Keep the full sitelinks value for all languages",
    )
    args = parser.parse_args()

    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

    logging.info(f"Streaming Wikidata dump from {args.url}")
    stream_and_filter(
        args.url,
        args.output,
        wikilang=args.wikilang,
        dont_trim_sitelinks=args.dont_trim_sitelinks,
    )
    logging.info(f"Done: {args.output}")


def discover_main():
    """CLI entry point: discover the latest wikidata dump URL."""
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    url = discover_latest_wikidata_dump_url()
    print(url)
# ---------------------------------------------------------------------------
# oz_tree_build/utilities/download_opentree.py
#
# Downloads Open Tree of Life synthesis data (tree + taxonomy) into a
# versioned folder, e.g.
#     download_opentree --version v16.1 --output-dir data/OpenTree
# main() writes into <output-dir>/<version>/ using version-agnostic names:
#     labelled_supertree_simplified_ottnames.tre, draftversion.tre, taxonomy.tsv
#
# Relies on the module's imports: argparse, os, re, shutil, tarfile, tempfile
# and the third-party ``requests`` package.  Script entry point: main().
# ---------------------------------------------------------------------------

SYNTHESIS_JSON_URL = (
    "https://raw.githubusercontent.com/OpenTreeOfLife/opentree" "/master/webapp/static/statistics/synthesis.json"
)

# requests never times out unless told to; without this a stalled server
# would hang the download forever.
HTTP_TIMEOUT = 60


def fetch_synthesis_json():
    """Download and parse the OpenTree synthesis manifest (raises on HTTP errors)."""
    response = requests.get(SYNTHESIS_JSON_URL, timeout=HTTP_TIMEOUT)
    response.raise_for_status()
    return response.json()


def find_synthesis_entry(synthesis_json, version):
    """Return the manifest entry whose ``version`` field matches *version*.

    Raises:
        SystemExit: if no entry matches; the message lists the versions that
            are present in the manifest.
    """
    for entry in synthesis_json.values():
        if entry.get("version") == version:
            return entry
    available = [e["version"] for e in synthesis_json.values() if "version" in e]
    raise SystemExit(f"Version '{version}' not found in synthesis.json. " f"Available versions: {', '.join(available)}")


def strip_mrca_prefixes(content: str) -> str:
    """Drop ``mrcaott<N>ott<M>`` labels after ``)`` and collapse runs of spaces/underscores to ``_``."""
    content = re.sub(r"\)mrcaott\d+ott\d+", ")", content)
    content = re.sub(r"[ _]+", "_", content)
    return content


def download_tree(version, output_dir):
    """Download the labelled supertree and produce the processed draftversion.

    Writes ``labelled_supertree_simplified_ottnames.tre`` (as downloaded) and
    ``draftversion.tre`` (after ``strip_mrca_prefixes``) into *output_dir*.
    """
    assert version.startswith("v")
    version_without_v = version[1:]
    tree_url = (
        f"https://files.opentreeoflife.org/synthesis/opentree{version_without_v}"
        f"/output/labelled_supertree/labelled_supertree_simplified_ottnames.tre"
    )
    print(f"Downloading tree from {tree_url} ...")
    response = requests.get(tree_url, timeout=HTTP_TIMEOUT)
    response.raise_for_status()

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    raw_path = os.path.join(output_dir, "labelled_supertree_simplified_ottnames.tre")
    with open(raw_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f" Saved raw tree to {raw_path}")

    draft_path = os.path.join(output_dir, "draftversion.tre")
    print(" Stripping mrca prefixes ...")
    with open(draft_path, "w", encoding="utf-8") as f:
        f.write(strip_mrca_prefixes(response.text))
    print(f" Saved processed tree to {draft_path}")


def download_taxonomy(ott_version_raw, output_dir):
    """Download and extract taxonomy.tsv from the OTT taxonomy tarball.

    *ott_version_raw* is cut at the first ``draft`` to build the tarball URL.

    Raises:
        SystemExit: if the tarball contains no regular file ending in
            ``/taxonomy.tsv``.
    """
    ott_version = ott_version_raw.split("draft")[0]
    taxonomy_url = f"https://files.opentreeoflife.org/ott/{ott_version}/{ott_version}.tgz"
    print(f"Downloading taxonomy from {taxonomy_url} ...")
    response = requests.get(taxonomy_url, timeout=HTTP_TIMEOUT)
    response.raise_for_status()

    with tempfile.TemporaryDirectory() as tmpdir:
        tar_path = os.path.join(tmpdir, "taxonomy.tgz")
        with open(tar_path, "wb") as f:
            f.write(response.content)

        print(" Extracting taxonomy.tsv ...")
        with tarfile.open(tar_path, "r:gz") as tar:
            taxonomy_member = None
            for member in tar.getmembers():
                # isfile(): extractfile() returns None for non-regular members.
                if member.isfile() and member.name.endswith("/taxonomy.tsv"):
                    taxonomy_member = member
                    break
            if taxonomy_member is None:
                raise SystemExit("Could not find taxonomy.tsv in the taxonomy tarball")
            extracted = tar.extractfile(taxonomy_member)
            dest_path = os.path.join(output_dir, "taxonomy.tsv")
            with open(dest_path, "wb") as f:
                shutil.copyfileobj(extracted, f)
    print(f" Saved taxonomy to {dest_path}")


def main():
    """CLI entry point: download tree and taxonomy for ``--version`` into ``<output-dir>/<version>/``."""
    parser = argparse.ArgumentParser(description="Download Open Tree of Life synthesis data into a versioned folder.")
    parser.add_argument(
        "--version",
        required=True,
        help='Synthesis version to download (e.g. "v16.1"). '
        "Must match the 'version' field in the OpenTree synthesis.json manifest.",
    )
    parser.add_argument(
        "--output-dir",
        default="data/OpenTree",
        help="Parent directory for the versioned output folder (default: data/OpenTree).",
    )
    args = parser.parse_args()

    version = args.version
    if not version.startswith("v"):
        raise SystemExit(f"Version must start with 'v' (got '{version}')")

    print("Fetching synthesis manifest ...")
    synthesis_json = fetch_synthesis_json()
    entry = find_synthesis_entry(synthesis_json, version)
    print(f"Found synthesis {version} (OTT {entry['OTT_version']})")

    output_dir = os.path.join(args.output_dir, version)
    os.makedirs(output_dir, exist_ok=True)

    download_tree(version, output_dir)
    download_taxonomy(entry["OTT_version"], output_dir)
    print(f"Done. All files written to {output_dir}/")


# ---------------------------------------------------------------------------
# oz_tree_build/utilities/file_utils.py  (functions touched by this patch)
#
# Relies on that module's imports (bz2, codecs, gzip, os, shutil, subprocess)
# and on open_file_based_on_extension, defined earlier in the same module.
# ---------------------------------------------------------------------------


def stream_bz2_lines_from_url(url, read_timeout=120):
    """
    Stream a .bz2 file over HTTP via wget and yield decompressed lines.
    wget handles timeouts, connection management, and progress display;
    Python handles decompression and line splitting.

    Lines are yielded without their trailing newline; undecodable bytes are
    replaced rather than raising.

    Concatenated (multi-stream) bz2 input is handled the way ``bz2.open``
    handles it: every stream is decompressed, and data after a complete stream
    that does not start a valid stream is ignored.

    Raises:
        RuntimeError: if wget is not on PATH, or if wget exits non-zero after
            its output was read to the end.
        EOFError: if the download ends in the middle of a bz2 stream.
        OSError: if the data is not a valid bz2 stream.
    """
    if not shutil.which("wget"):
        raise RuntimeError("wget is required but not found on PATH")

    wget = subprocess.Popen(
        [
            "wget",
            "-q",
            "--show-progress",
            "-O",
            "-",
            "--connect-timeout=30",
            f"--read-timeout={read_timeout}",
            "--header=User-Agent: OneZoom-tree-build/1.0",
            url,
        ],
        stdout=subprocess.PIPE,
        stderr=None,
    )

    decoder = codecs.getincrementaldecoder("utf-8")("replace")
    decompressor = None  # created lazily: one BZ2Decompressor per bz2 stream
    completed_streams = 0
    line_buf = ""
    reached_eof = False  # True once wget's output has been read to the end

    try:
        ignoring_trailing_data = False
        while not ignoring_trailing_data:
            chunk = wget.stdout.read(1024 * 1024)
            if not chunk:
                reached_eof = True
                break

            raw_parts = []
            while chunk:
                starting_stream = decompressor is None
                if starting_stream:
                    decompressor = bz2.BZ2Decompressor()
                try:
                    raw_parts.append(decompressor.decompress(chunk))
                except OSError:
                    if starting_stream and completed_streams:
                        # Bytes after a complete stream that are not bz2 data.
                        ignoring_trailing_data = True
                        decompressor = None
                        break
                    raise
                if decompressor.eof:
                    # BZ2Decompressor stops at the end of one stream; whatever
                    # follows belongs to the next stream.
                    chunk = decompressor.unused_data
                    decompressor = None
                    completed_streams += 1
                else:
                    chunk = b""

            line_buf += decoder.decode(b"".join(raw_parts))
            parts = line_buf.split("\n")
            line_buf = parts[-1]
            yield from parts[:-1]

        if reached_eof and decompressor is not None:
            raise EOFError("Compressed stream ended before the end-of-stream marker was reached")

        line_buf += decoder.decode(b"", final=True)
        if line_buf:
            yield line_buf
    finally:
        wget.stdout.close()
        if reached_eof:
            # wget finished on its own: its exit status is meaningful.
            rc = wget.wait()
            if rc != 0:
                raise RuntimeError(f"wget failed (exit {rc})")
        else:
            # The consumer stopped early or an error is propagating.  Stop
            # wget, but do not report the resulting signal exit as a failure:
            # that would mask the original exception.
            if wget.poll() is None:
                wget.terminate()
            wget.wait()


def enumerate_lines_from_file(filename):
    """Enumerate the lines in a file, whether it's uncompressed, bz2 or gz."""
    with open_file_based_on_extension(filename, "rt") as f:
        yield from enumerate(iter(f.readline, ""))


# ---------------------------------------------------------------------------
# oz_tree_build/utilities/filter_common.py
#
# Shared utilities for the filter modules.  Relies on csv and on
# open_file_based_on_extension (.file_utils).
# ---------------------------------------------------------------------------


def read_taxonomy_source_ids(taxonomy_file):
    """
    Read an OpenTree taxonomy.tsv file and return a dict mapping source
    names to sets of integer IDs.  Used by filter_eol and filter_wikidata
    (in clade mode).

    Only the sources ncbi, if, worms, irmng and gbif are collected.  Entries
    without a ``source:id`` shape, or with a non-integer id, are ignored.
    """
    sources = {"ncbi", "if", "worms", "irmng", "gbif"}
    source_ids = {source: set() for source in sources}

    with open_file_based_on_extension(taxonomy_file, "rt") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for OTTrow in reader:
            # ``or ""`` covers short rows, for which DictReader yields None.
            sourceinfo = OTTrow["sourceinfo"] or ""
            for srcs in sourceinfo.split(","):
                # partition never raises, unlike unpacking split(":", 1) on an
                # empty or colon-less entry.
                src, sep, src_id = srcs.partition(":")
                if not sep or src not in sources:
                    continue
                try:
                    source_ids[src].add(int(src_id))
                except ValueError:
                    pass

    return source_ids


# ---------------------------------------------------------------------------
# oz_tree_build/utilities/filter_eol.py
#
# Filter EOL provider IDs CSV to keep only relevant sources.  Relies on
# argparse, logging, sys, iucn_num
# (..taxon_mapping_and_popularity.CSV_base_table_creator),
# open_file_based_on_extension (.file_utils) and read_taxonomy_source_ids
# (.filter_common).  Script entry point: main().
# ---------------------------------------------------------------------------


def filter_eol_ids(eol_id_file, output_file, source_ids, clade=None):
    """
    Filter the EOL identifiers file, keeping only rows from known providers
    whose IDs appear in the taxonomy. In non-clade (full-tree) mode, all rows
    from known providers are kept.

    The first line is copied through as the header.  Rows are split on plain
    commas; rows too short to hold the columns being read, or whose second
    column is not an integer, are skipped.  In clade mode the IUCN rows are
    held back and written last, and only if their fifth column matches that
    of a row already kept.

    Returns nothing; writes filtered output to output_file.
    """
    iucn_key = str(iucn_num)
    eol_sources = {"676": "ncbi", "459": "worms", "767": "gbif", iucn_key: "iucn"}
    iucn_lines = []
    known_names = set()

    with open_file_based_on_extension(eol_id_file, "rt") as eol_f:
        with open_file_based_on_extension(output_file, "wt") as filtered_eol_f:
            for i, line in enumerate(eol_f):
                if i == 0:
                    filtered_eol_f.write(line)
                    continue

                fields = line.split(",")

                # Blank or truncated lines have no provider column.
                if len(fields) < 3 or fields[2] not in eol_sources:
                    continue

                try:
                    eol_id = int(fields[1])
                except ValueError:
                    continue

                if not clade:
                    filtered_eol_f.write(line)
                    continue

                if len(fields) < 5:
                    continue

                if fields[2] == iucn_key:
                    iucn_lines.append(line)
                elif eol_id in source_ids[eol_sources[fields[2]]]:
                    filtered_eol_f.write(line)
                    known_names.add(fields[4])

            for line in iucn_lines:
                fields = line.split(",")
                if fields[4] in known_names:
                    filtered_eol_f.write(line)

    logging.info(
        f"Found {len(source_ids['ncbi'])} NCBI ids, "
        f"{len(source_ids['if'])} IF ids, "
        f"{len(source_ids['worms'])} WoRMS ids, "
        f"{len(source_ids['irmng'])} IRMNG ids, "
        f"{len(source_ids['gbif'])} GBIF ids"
    )


def main():
    """CLI entry point: filter an EOL identifiers CSV against a taxonomy.tsv (full-tree mode)."""
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("eol_file", help="The EOL identifiers CSV file (optionally gzipped)")
    parser.add_argument("taxonomy_file", help="The OpenTree taxonomy.tsv file")
    parser.add_argument("-o", "--output", required=True, help="Output path for filtered EOL CSV")
    args = parser.parse_args()

    source_ids = read_taxonomy_source_ids(args.taxonomy_file)
    filter_eol_ids(args.eol_file, args.output, source_ids)
# ---------------------------------------------------------------------------
# oz_tree_build/utilities/filter_pageviews.py
#
# Filter Wikipedia pageview files to keep only pages matching wikidata titles.
#
# Relies on the module's imports: argparse, logging, os, sys,
# collections.defaultdict, enumerate_lines_from_file and
# open_file_based_on_extension (.file_utils), load_titles_file
# (.filter_wikidata).  Script entry point: main().
# ---------------------------------------------------------------------------


def unquote_if_quoted(s):
    """Strip one pair of matching quotes from *s* and resolve backslash escapes.

    Strings that are not wrapped in a matching pair of ``'`` or ``"`` are
    returned unchanged (a lone quote character is not a quoted string).
    """
    if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
        inner = s[1:-1]
        # unicode_escape reads its input as Latin-1, so feeding it UTF-8 bytes
        # turns every non-ASCII character into mojibake.  Encoding as Latin-1
        # with backslashreplace keeps characters up to U+00FF as single bytes
        # and turns the rest into \uXXXX escapes, which the decoder restores.
        return inner.encode("latin-1", "backslashreplace").decode("unicode_escape")
    return s


def filter_pageview_lines(lines, wikidata_titles, wikilang="en"):
    """
    Filter an iterable of pageview lines, keeping only entries whose title
    appears in the wikidata_titles set. Returns a dict mapping title to
    aggregated view count.

    The format is detected from the first line: a line with exactly one space
    is the simplified ``Title viewcount`` format, where every line is used.
    Otherwise only lines starting with ``<wikilang>.wikipedia `` are used; the
    title is the second space-separated field (unquoted if quoted) and the
    count the fifth.
    """
    match_project = wikilang + ".wikipedia "
    pageviews = defaultdict(int)
    simplified_line_format = False

    for i, line in enumerate(lines):
        if i == 0:
            simplified_line_format = line.count(" ") == 1

        if i > 0 and i % 10000000 == 0:
            logging.info(f"Processed {i} lines")

        if not simplified_line_format and not line.startswith(match_project):
            continue

        info = line.split(" ")
        if simplified_line_format:
            title = info[0]
            views = info[1]
        else:
            title = unquote_if_quoted(info[1])
            views = info[4]

        if title in wikidata_titles:
            # int() tolerates the trailing newline that file-based lines carry.
            pageviews[title] += int(views)

    return pageviews


def write_filtered_pageviews(pageviews, output_file):
    """Write aggregated pageview counts to file in ``Title viewcount`` format."""
    with open_file_based_on_extension(output_file, "wt") as filtered_f:
        for title, views in pageviews.items():
            filtered_f.write(title + " " + str(views) + "\n")


def filter_pageviews(pageviews_file, output_file, wikidata_titles, wikilang="en"):
    """
    Filter a single pageview file, keeping only entries whose title appears
    in the wikidata_titles set. Aggregates views per title and writes output
    in the simplified format (``Title viewcount``).
    """
    lines = (line for _, line in enumerate_lines_from_file(pageviews_file))
    pageviews = filter_pageview_lines(lines, wikidata_titles, wikilang)
    write_filtered_pageviews(pageviews, output_file)


def main():
    """CLI entry point: filter each given pageview file into ``<output-dir>/OneZoom_<name>``."""
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("pageview_files", nargs="+", help="One or more pageview files (optionally bz2-compressed)")
    parser.add_argument("--titles-file", required=True, help="wikidata_titles.txt file (one title per line)")
    parser.add_argument("-o", "--output-dir", required=True, help="Output directory for filtered pageview files")
    parser.add_argument("--wikilang", default="en", help="Wikipedia language code")
    args = parser.parse_args()

    wikidata_titles = load_titles_file(args.titles_file)
    logging.info(f"Loaded {len(wikidata_titles)} wikidata titles")

    os.makedirs(args.output_dir, exist_ok=True)

    for pv_file in args.pageview_files:
        basename = os.path.basename(pv_file)
        # The output is named after the input without its .bz2 extension.
        if basename.endswith(".bz2"):
            basename = basename[:-4]
        output_file = os.path.join(args.output_dir, f"OneZoom_{basename}")
        logging.info(f"Filtering {pv_file} -> {output_file}")
        filter_pageviews(pv_file, output_file, wikidata_titles, wikilang=args.wikilang)
# ---------------------------------------------------------------------------
# oz_tree_build/utilities/filter_wikidata.py
#
# Filter the massive wikidata JSON dump to taxon and vernacular items.
#
# Relies on the module's imports: argparse, json, logging, sys, wikiflags
# (.._OZglobals), JSON_contains_known_dbID / Qid / label
# (..taxon_mapping_and_popularity.OTT_popularity_mapping), ANY / KEEP /
# apply_mask_to_object_graph (.apply_mask_to_object_graph),
# enumerate_lines_from_file / open_file_based_on_extension (.file_utils) and
# find_taxon_and_vernaculars / get_wikipedia_name / quick_byte_match /
# wikidata_value (.temp_helpers).  Script entry point: main().
# ---------------------------------------------------------------------------

# Passed to apply_mask_to_object_graph for every item that is written out.
WIKIDATA_MASK = {
    "type": KEEP,
    "id": KEEP,
    "labels": {"en": {"value": KEEP}},
    "claims": {
        "P31": [
            {
                "mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}},
                "qualifiers": {"P642": [{"datavalue": {"value": {"numeric-id": KEEP}}}]},
            }
        ],
        "P685": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P846": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P850": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P1391": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P5055": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P830": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P961": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P9157": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P3151": [{"mainsnak": {"datavalue": {"value": KEEP}}}],
        "P141": [{"references": [{"snaks": {"P627": [{"datavalue": {"value": KEEP}}]}}]}],
        "P1420": [{"mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}}],
        "P18": [
            {
                "mainsnak": {"datavalue": {"value": KEEP}},
                "rank": KEEP,
            }
        ],
        "P1843": [
            {
                "mainsnak": {"datavalue": {"value": KEEP}},
                "rank": KEEP,
            }
        ],
    },
    "sitelinks": {ANY: {"title": KEEP}},
}


def filter_wikidata(
    lines,
    output_file,
    source_ids=None,
    clade=None,
    wikilang="en",
    dont_trim_sitelinks=False,
):
    """
    Filter the wikidata JSON dump, keeping only taxon and vernacular items,
    and trimming each item to only the fields we consume.

    *lines* should be an iterable of raw dump lines (e.g. from
    ``stream_bz2_lines_from_url`` or ``enumerate_lines_from_file``).

    Taxon items are written as they are met.  Vernacular items (and, in clade
    mode, taxa that match no known source id) are held in memory and written
    after the main pass, but only if one of the Q-ids they point at was
    written.

    The output is ``[`` on its own line, one JSON object per line each
    followed by a comma, then ``]``.  ``extract_wikidata_titles`` reads it
    line by line rather than as a single JSON document.
    """
    sitelinks_key = f"{wikilang}wiki"

    def trim_and_write_json_item(json_item, filtered_wiki_f):
        """Apply WIKIDATA_MASK, reduce the sitelinks, and write the item as one line."""
        apply_mask_to_object_graph(json_item, WIKIDATA_MASK)

        if dont_trim_sitelinks:
            json_item["sitelinks"] = {k: v for k, v in json_item["sitelinks"].items() if k.endswith("wiki")}
        else:
            # Keep the title only for the requested wiki.  Other two-letter
            # wikis listed in wikiflags are kept as empty entries, recording
            # that a sitelink exists.
            json_item["sitelinks"] = {
                k: v if k == sitelinks_key else {}
                for k, v in json_item["sitelinks"].items()
                if k.endswith("wiki") and len(k) == 6 and k[:2] in wikiflags
            }

        filtered_wiki_f.write(json.dumps(json_item, separators=(",", ":")))
        filtered_wiki_f.write(",\n")

    included_qids = set()
    potential_extra_json_items = []

    with open_file_based_on_extension(output_file, "wt") as filtered_wiki_f:
        filtered_wiki_f.write("[\n")
        preserved_lines = 0

        for line_num, line in enumerate(lines):
            if line_num > 0 and line_num % 100_000 == 0:
                logging.info(f"Processed {line_num} lines, kept {preserved_lines}")

            # Cheap pre-filter so only candidate lines are parsed as JSON.
            if not (line.startswith('{"type":') and quick_byte_match.search(line)):
                continue

            json_item = json.loads(line.rstrip().rstrip(","))

            try:
                is_taxon, vernaculars_matches = find_taxon_and_vernaculars(json_item)
            except KeyError:
                continue

            if not is_taxon and not len(vernaculars_matches) > 0:
                continue

            if clade and is_taxon and source_ids:
                if not len(JSON_contains_known_dbID(json_item, source_ids)) > 0:
                    # Not a known taxon itself, but it may be a synonym of one.
                    if "P1420" in json_item["claims"] and json_item["sitelinks"]:
                        potential_extra_json_items.append(
                            (
                                "taxon_synonym",
                                {wikidata_value(i["mainsnak"])["numeric-id"] for i in json_item["claims"]["P1420"]},
                                json_item,
                            )
                        )
                    if vernaculars_matches:
                        potential_extra_json_items.append(("instance_of_synonym", vernaculars_matches, json_item))
                    continue

            if is_taxon:
                trim_and_write_json_item(json_item, filtered_wiki_f)
                included_qids.add(Qid(json_item))
                preserved_lines += 1
            else:
                potential_extra_json_items.append(("vernacular", vernaculars_matches, json_item))

        logging.info(
            "Writing extra lines at the end of the file " f"(subset of {len(potential_extra_json_items)} lines)"
        )

        for desc, linked_qids, json_item in potential_extra_json_items:
            for qid in linked_qids:
                if qid in included_qids:
                    trim_and_write_json_item(json_item, filtered_wiki_f)
                    logging.info(
                        f"Including {desc} entry: Q{Qid(json_item)} "
                        f"('{label(json_item)}','{get_wikipedia_name(json_item)}') => Q{qid}"
                    )
                    # One matching link is enough; never write an item twice.
                    break

        filtered_wiki_f.write("]\n")


def extract_wikidata_titles(filtered_wikidata_file):
    """
    Read a filtered wikidata JSON file and return the set of Wikipedia page
    titles (used by downstream SQL and pageview filters).
    """
    titles = set()
    for _, line in enumerate_lines_from_file(filtered_wikidata_file):
        # Skips the "[" and "]" lines written by filter_wikidata.
        if not line.startswith('{"type":'):
            continue
        json_item = json.loads(line.rstrip().rstrip(","))
        title = get_wikipedia_name(json_item)
        if title is not None:
            titles.add(title)
    return titles


def write_titles_file(titles, output_path):
    """Write the set of titles to a UTF-8 text file, one per line, sorted."""
    # Explicit encoding: the platform default encoding is not UTF-8
    # everywhere, and titles are not restricted to ASCII.
    with open(output_path, "w", encoding="utf-8") as f:
        for title in sorted(titles):
            f.write(title + "\n")


def load_titles_file(titles_path):
    """Load a UTF-8 titles text file into a set, ignoring blank lines."""
    with open(titles_path, encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}


def main():
    """CLI entry point: filter a local wikidata dump file (full-tree mode)."""
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("wikidata_file", help="The wikidata JSON dump file (bz2 or plain)")
    parser.add_argument("-o", "--output", required=True, help="Output path for filtered wikidata JSON")
    parser.add_argument("--wikilang", default="en", help="Wikipedia language code")
    parser.add_argument(
        "--dont-trim-sitelinks",
        action="store_true",
        default=False,
        help="Keep the full sitelinks value for all languages",
    )
    args = parser.parse_args()

    lines = (line for _, line in enumerate_lines_from_file(args.wikidata_file))
    filter_wikidata(
        lines,
        args.output,
        wikilang=args.wikilang,
        dont_trim_sitelinks=args.dont_trim_sitelinks,
    )
# ---------------------------------------------------------------------------
# oz_tree_build/utilities/filter_wikidata.py  (final entry point)
# ---------------------------------------------------------------------------


def extract_titles_main():
    """Extract Wikipedia page titles from a filtered wikidata JSON file."""
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(description=extract_titles_main.__doc__)
    parser.add_argument("filtered_wikidata_file", help="The filtered wikidata JSON file")
    parser.add_argument("-o", "--output", required=True, help="Output path for wikidata_titles.txt")
    args = parser.parse_args()

    titles = extract_wikidata_titles(args.filtered_wikidata_file)
    write_titles_file(titles, args.output)
    logging.info(f"Wrote {len(titles)} titles to {args.output}")


# ---------------------------------------------------------------------------
# oz_tree_build/utilities/filter_wikipedia_sql.py
#
# Filter the enwiki SQL page dump to keep only pages matching wikidata titles.
#
# Relies on the module's imports: argparse, csv, logging, re, sys,
# urllib.parse, urllib.request, open_file_based_on_extension (.file_utils)
# and load_titles_file (.filter_wikidata).  Script entry point: main().
# ---------------------------------------------------------------------------

ENWIKI_DUMPS_URL = "https://dumps.wikimedia.org/enwiki/"

logger = logging.getLogger(__name__)


def filter_wikipedia_sql(sql_file, output_file, wikidata_titles):
    """
    Filter the enwiki page SQL dump, keeping only rows whose title appears
    in the wikidata_titles set.

    Only namespace ``0`` rows are considered.  The output consists of
    ``INSERT INTO `page` VALUES`` statements with at most ten rows each; every
    row keeps only namespace, title, is_redirect and page length in their
    original column positions and leaves the other columns empty.
    """
    # 1-based positions within one "(...)" row of the page table.
    page_table_namespace_column = 2
    page_table_title_column = 3
    page_is_redirect_column = 4
    page_table_pagelen_column = 10

    with open_file_based_on_extension(output_file, "wt") as filtered_sql_f:
        current_output_line_entry_count = 0
        max_entries_per_line = 10
        with open_file_based_on_extension(sql_file, "rt") as sql_f:
            # The reader splits every INSERT line into a flat list of column
            # values; row boundaries are recovered from the "(" that starts
            # each row.
            pagelen_file = csv.reader(sql_f, quotechar="'", escapechar="\\", doublequote=False)
            match_line = "INSERT INTO `page` VALUES "
            for fields in pagelen_file:
                if len(fields) == 0 or not fields[0].startswith(match_line):
                    continue

                field_num = 0
                namespace = None
                title = None
                is_redirect = "0"
                for field in fields:
                    # A new row can only begin once the previous one has been
                    # read up to the page-length column.  Without this guard a
                    # title that itself starts with "(" would be mistaken for
                    # the start of a row.
                    if field_num >= page_table_pagelen_column and field.lstrip().startswith("("):
                        field_num = 0
                        namespace = None
                        title = None
                        is_redirect = "0"
                    field_num += 1
                    if field_num == page_table_namespace_column:
                        namespace = field
                    if field_num == page_table_title_column:
                        title = field
                    if field_num == page_is_redirect_column:
                        is_redirect = field
                    elif field_num == page_table_pagelen_column and namespace == "0":
                        if title in wikidata_titles:
                            if current_output_line_entry_count == 0:
                                filtered_sql_f.write(match_line)
                            else:
                                filtered_sql_f.write(",")

                            # Backslash first, so the backslashes added for
                            # the quotes are not escaped a second time.
                            title = title.replace("\\", "\\\\").replace("'", "\\'")
                            filtered_sql_f.write(f"(,{namespace},'{title}',{is_redirect},,,,,,{field},,)")

                            current_output_line_entry_count += 1
                            if current_output_line_entry_count == max_entries_per_line:
                                filtered_sql_f.write(";\n")
                                current_output_line_entry_count = 0

        # Terminate the last statement when it holds fewer than ten rows.
        if current_output_line_entry_count:
            filtered_sql_f.write(";\n")


def main():
    """CLI entry point: filter an enwiki page SQL dump against a titles file."""
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("sql_file", help="The enwiki SQL page dump file (optionally gzipped)")
    parser.add_argument("titles_file", help="wikidata_titles.txt file (one title per line)")
    parser.add_argument("-o", "--output", required=True, help="Output path for filtered SQL file")
    args = parser.parse_args()

    wikidata_titles = load_titles_file(args.titles_file)
    logging.info(f"Loaded {len(wikidata_titles)} wikidata titles")
    filter_wikipedia_sql(args.sql_file, args.output, wikidata_titles)


def _read_url_text(url, timeout):
    """Fetch *url* and return its body decoded as text, closing the response."""
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode()


def discover_latest_enwiki_sql_url(base_url=ENWIKI_DUMPS_URL, timeout=30):
    """Find the URL of the most recent enwiki-YYYYMMDD-page.sql.gz dump.

    Fetches the directory listing at *base_url*, collects the dated
    sub-folders (``YYYYMMDD/``), and walks them in reverse-chronological
    order until it finds one whose dump status page contains a link to
    the ``page.sql.gz`` file.

    Returns the full URL to that file.
    Raises ``RuntimeError`` if no suitable dump can be found.
    """
    # Imported explicitly so the ``except`` clause below does not depend on
    # urllib.request having loaded urllib.error as a side effect.
    from urllib.error import URLError

    folder_re = re.compile(r'href="(\d{8})/"')
    file_re_template = r'href="([^"]*enwiki-{date}-page\.sql\.gz)"'

    index_html = _read_url_text(base_url, timeout)

    dates = sorted(folder_re.findall(index_html), reverse=True)
    if not dates:
        raise RuntimeError(f"No dated folders found at {base_url}")

    for date in dates:
        folder_url = f"{base_url}{date}/"
        logger.info("Checking %s", folder_url)
        try:
            folder_html = _read_url_text(folder_url, timeout)
        except URLError as exc:
            logger.warning("Could not fetch %s: %s", folder_url, exc)
            continue

        match = re.search(file_re_template.format(date=date), folder_html)
        if match:
            # The link may be relative or absolute-path; urljoin handles both.
            url = urllib.parse.urljoin(folder_url, match.group(1))
            logger.info("Found latest enwiki SQL dump: %s", url)
            return url

    raise RuntimeError(f"No enwiki-YYYYMMDD-page.sql.gz file found in any folder at {base_url}")


def discover_main():
    """CLI entry point: discover the latest enwiki SQL dump URL."""
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    url = discover_latest_enwiki_sql_url()
    print(url)
a/oz_tree_build/utilities/generate_filtered_files.py +++ b/oz_tree_build/utilities/generate_filtered_files.py @@ -3,472 +3,75 @@ 1. Filter the input files to remove many irrelevant things in order to make them smaller. 2. Generate test files that are a filtered subset of the full files, targeted at a specific clade/taxon. + +For the DVC pipeline, the individual filter modules (filter_eol, filter_wikidata, +filter_wikipedia_sql, filter_pageviews) are run as separate stages. This module +provides the orchestrating function used for clade-specific test filtering. """ import argparse -import csv -import json import logging import os import sys import time -from collections import defaultdict -from .._OZglobals import wikiflags from ..newick.extract_trees import get_taxon_subtree_from_newick_file from ..newick.newick_parser import parse_tree -from ..taxon_mapping_and_popularity.CSV_base_table_creator import iucn_num -from ..taxon_mapping_and_popularity.OTT_popularity_mapping import ( - JSON_contains_known_dbID, - Qid, - label, -) -from .apply_mask_to_object_graph import ANY, KEEP, apply_mask_to_object_graph from .file_utils import enumerate_lines_from_file, open_file_based_on_extension -from .temp_helpers import ( - find_taxon_and_vernaculars, - get_wikipedia_name, - quick_byte_match, - wikidata_value, -) +from .filter_common import read_taxonomy_source_ids +from .filter_eol import filter_eol_ids +from .filter_pageviews import filter_pageviews +from .filter_wikidata import extract_wikidata_titles, filter_wikidata +from .filter_wikipedia_sql import filter_wikipedia_sql __author__ = "David Ebbo" one_zoom_file_prefix = "OneZoom" -def generate_and_cache_filtered_file(original_file, context, processing_function): - """ - Helper to perform caching of filtered files. 
-    """
-
+def _compute_output_path(original_file, prefix, compress=False):
+    """Compute the output path for a filtered file given a prefix (clade or OneZoom).
+
+    The result is placed in the same directory as *original_file* and named
+    ``<prefix>_<name>``, where ``<name>`` is the original base name with a
+    leading OneZoom prefix removed (to avoid double prefixes). Unless
+    *compress* is true, a trailing ``.gz`` / ``.bz2`` extension is dropped.
+
+    Returns the computed path.
+    Raises ``ValueError`` if the computed path is *original_file* itself
+    (e.g. an already OneZoom-prefixed input with no clade), because writing
+    the output there would overwrite the input.
+    """
     dirname = os.path.dirname(original_file)
     file_name = os.path.basename(original_file)
-    filtered_file_prefix = (context.clade or one_zoom_file_prefix) + "_"
-    if file_name.startswith(filtered_file_prefix):
-        raise Exception(f"Input and output files are the same, with prefix {filtered_file_prefix}")
-
-    # If original file is a OneZoom file, remove the OneZoom prefix to avoid double prefixes
     if file_name.startswith(one_zoom_file_prefix):
         file_name = file_name[len(one_zoom_file_prefix) + 1 :]

-    # Include clade in new file name, e.g. '/foo/bar.csv.gz' --> '/foo/Mammalia_bar.csv.gz'
-    # If no clade is specified, use 'OneZoom' instead as the prefix
-    clade_filtered_file = os.path.join(dirname, f"{filtered_file_prefix}{file_name}")
-
-    # If we're not compressing and it has a .gz or .bz2 extension, remove it
-    if not context.compress:
-        if clade_filtered_file.endswith(".gz") or clade_filtered_file.endswith(".bz2"):
-            clade_filtered_file = os.path.splitext(clade_filtered_file)[0]
-
-    # Unless force is set, check we already have a filtered file with the matching timestamp
-    if not context.force:
-        if os.path.exists(clade_filtered_file) and os.path.getmtime(clade_filtered_file) == os.path.getmtime(
-            original_file
-        ):
-            logging.info(f"Using cached file {clade_filtered_file}")
-            return clade_filtered_file
-
-    # If the filtered file already exists, rename it to include the timestamp, so we don't overwrite it
-    if os.path.exists(clade_filtered_file):
-        existing_file_time = os.path.getmtime(clade_filtered_file)
-        renamed_file_name = (
-            os.path.splitext(clade_filtered_file)[0]
-            + "_"
-            + time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime(existing_file_time))
-            + os.path.splitext(clade_filtered_file)[1]
-        )
-        os.rename(clade_filtered_file, renamed_file_name)
-        logging.info(f"Renamed existing file to {renamed_file_name}")
-
-    logging.info(f"Generating file {clade_filtered_file}")
+    output_file = os.path.join(dirname, f"{prefix}_{file_name}")

-    # Call the processing function to generate the filtered file
-    processing_function(original_file, clade_filtered_file, context)
-
-    # Set the timestamp of the filtered file to match the original file
-    os.utime(
-        clade_filtered_file,
-        (os.path.getatime(original_file), os.path.getmtime(original_file)),
-    )
+    if not compress:
+        if output_file.endswith(".gz") or output_file.endswith(".bz2"):
+            output_file = os.path.splitext(output_file)[0]

-    logging.info(f"Finished generating file {clade_filtered_file}")
+    # Stripping and re-adding the same prefix can reproduce the input path;
+    # refuse rather than let a filter truncate the file it is reading.
+    if os.path.normpath(output_file) == os.path.normpath(original_file):
+        raise ValueError(f"Input and output files are the same: {original_file}")
+
+    return output_file

-    return clade_filtered_file
-
-
-def generate_filtered_newick(newick_file, filtered_newick_file, context):
-    tree_string = get_taxon_subtree_from_newick_file(newick_file, context.clade)
+def generate_filtered_newick(newick_file, filtered_newick_file, clade):
+    tree_string = get_taxon_subtree_from_newick_file(newick_file, clade)
     with open_file_based_on_extension(filtered_newick_file, "wt") as f:
         f.write(tree_string)


-def read_newick_file(newick_file, context):
+def read_newick_otts(newick_file):
     with open_file_based_on_extension(newick_file, "rt") as f:
-        filtered_tree_string = f.read()
+        tree_string = f.read()
+    return {node["ott"] for node in parse_tree(tree_string)}

-    # Get the set of OTT ids from the filtered tree
-    context.otts = {node["ott"] for node in parse_tree(filtered_tree_string)}

-
-def generate_filtered_taxonomy_file(taxonomy_file, filtered_taxonomy_file, context):
+def generate_filtered_taxonomy_file(taxonomy_file, filtered_taxonomy_file, otts):
     with open_file_based_on_extension(filtered_taxonomy_file, "wt") as filtered_taxonomy:
         for i, line in enumerate_lines_from_file(taxonomy_file):
-            # Always copy the header
             if i == 0:
                 filtered_taxonomy.write(line)
                 continue

-            # The ott id is the first column (known as the "uid" in the tsv file)
             fields = line.split("\t")
             ott = fields[0]

-            # Only include lines that have an 
ott id in the filtered tree - if ott in context.otts: + if ott in otts: filtered_taxonomy.write(line) -def read_taxonomy_file(taxonomy_file, context): - sources = {"ncbi", "if", "worms", "irmng", "gbif"} - context.source_ids = {source: set() for source in sources} - - # Get the sets of source ids we're actually using from the taxonomy file - with open_file_based_on_extension(taxonomy_file, "rt") as f: - reader = csv.DictReader(f, delimiter="\t") - for OTTrow in reader: - sourceinfo = OTTrow["sourceinfo"] - for srcs in sourceinfo.split(","): - src, src_id = srcs.split(":", 1) - if src in sources: - try: - context.source_ids[src].add(int(src_id)) - except ValueError: - # Ignore it if it's not an integer - pass - - -def generate_filtered_eol_id_file(eol_id_file, filtered_eol_id_file, context): - eol_sources = {"676": "ncbi", "459": "worms", "767": "gbif", str(iucn_num): "iucn"} - iucn_lines = [] - known_names = set() - - with open_file_based_on_extension(eol_id_file, "rt") as eol_f: - with open_file_based_on_extension(filtered_eol_id_file, "wt") as filtered_eol_f: - for i, line in enumerate(eol_f): - # Always copy the header - if i == 0: - filtered_eol_f.write(line) - continue - - fields = line.split(",") - - # Ignore it if it's not one of the known sources - if fields[2] not in eol_sources: - continue - - try: - eol_id = int(fields[1]) - except ValueError: - # Some lines have the eol_id set to a weird value, e.g. 
- # "Animalia/Arthropoda/Malacostraca/Cumacea/Pseudocumatidae/Strauchia" - # We ignore these - continue - - if not context.clade: - # If we're not filtering by clade, keep all the lines - filtered_eol_f.write(line) - continue - - # If it's an IUCN line, just save it for now - if fields[2] == str(iucn_num): - iucn_lines.append(line) - # For other providers, only include it if we saw it in the taxonomy file - elif eol_id in context.source_ids[eol_sources[fields[2]]]: - filtered_eol_f.write(line) - known_names.add(fields[4]) - - # Include any IUCN lines that have a name that we encountered - for line in iucn_lines: - fields = line.split(",") - if fields[4] in known_names: - filtered_eol_f.write(line) - - logging.info( - f"Found {len(context.source_ids['ncbi'])} NCBI ids, " - f"{len(context.source_ids['if'])} IF ids, " - f"{len(context.source_ids['worms'])} WoRMS ids, " - f"{len(context.source_ids['irmng'])} IRMNG ids, " - f"{len(context.source_ids['gbif'])} GBIF ids" - ) - - -def generate_filtered_wikidata_dump(wikipedia_dump_file, filtered_wikipedia_dump_file, context): - # This mask defines which fields we want to keep from the wikidata dump - # The goal is to keep it structurally the same as the original, but only - # include the fields we actually consume - mask = { - "type": KEEP, # Only needed for the quick 'startswith()' line check - "id": KEEP, - "labels": {"en": {"value": KEEP}}, - "claims": { - "P31": [ - { - "mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}, - "qualifiers": { - "P642": [{"datavalue": {"value": {"numeric-id": KEEP}}}] - }, # "of" (applies within the scope of a particular item) - } - ], # Instance of - "P685": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # ncbi id - "P846": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # gbif id - "P850": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # worms id - "P1391": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # if id - "P5055": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # irmng 
id - "P830": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # EOL id - "P961": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # IPNI id - "P9157": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # OTT id - "P3151": [{"mainsnak": {"datavalue": {"value": KEEP}}}], # iNaturalist id - "P141": [{"references": [{"snaks": {"P627": [{"datavalue": {"value": KEEP}}]}}]}], # IUCN id - "P1420": [{"mainsnak": {"datavalue": {"value": {"numeric-id": KEEP}}}}], # taxon synonym - "P18": [ - { - "mainsnak": {"datavalue": {"value": KEEP}}, - "rank": KEEP, - } - ], # image - "P1843": [ - { - "mainsnak": {"datavalue": {"value": KEEP}}, - "rank": KEEP, - } - ], # taxon common name (aka vernaculars) - }, - "sitelinks": {ANY: {"title": KEEP}}, - } - - sitelinks_key = f"{context.wikilang}wiki" - - def trim_and_write_json_item(json_item, filtered_wiki_f): - # Remove everything we don't need from the json - apply_mask_to_object_graph(json_item, mask) - - # Only keep the sitelinks that end in "wiki", e.g. enwiki, dewiki, etc. - # (leave out those ending in "wikiquote", "wikivoyage", "wikinews", "wikibooks", etc.) - if context.dont_trim_sitelinks: - # Keep the full sitelinks value for all languages if flag is passed - json_item["sitelinks"] = {k: v for k, v in json_item["sitelinks"].items() if k.endswith("wiki")} - else: - # Otherwise only keep the original value for the language we want, since the - # rest is just needed to collect the language names into the bit field - # Also, limit the sitelinks to the languages we care about for the bit field - json_item["sitelinks"] = { - k: v if k == sitelinks_key else {} - for k, v in json_item["sitelinks"].items() - if k.endswith("wiki") and len(k) == 6 and k[:2] in wikiflags - } - - # Write out a line. 
We set the separators to avoid spaces - filtered_wiki_f.write(json.dumps(json_item, separators=(",", ":"))) - filtered_wiki_f.write(",\n") - - included_qids = set() - - # Keep track of vernaculars and taxon synonyms that we might want to include at the end - # There are only a few hundred, so memory isn't an issue. - potential_extra_json_items = [] - - with open_file_based_on_extension(filtered_wikipedia_dump_file, "wt") as filtered_wiki_f: - filtered_wiki_f.write("[\n") - preserved_lines = 0 - - def get_line_message(line_num): - return f"Kept {preserved_lines}/{line_num} lines ({preserved_lines / line_num * 100:.2f}%)" - - for _, line in enumerate_lines_from_file(wikipedia_dump_file, 100000, get_line_message): - if not (line.startswith('{"type":') and quick_byte_match.search(line)): - continue - - json_item = json.loads(line.rstrip().rstrip(",")) - - try: - is_taxon, vernaculars_matches = find_taxon_and_vernaculars(json_item) - except KeyError: - continue - - # If it's neither, ignore it - if not is_taxon and not len(vernaculars_matches) > 0: - continue - - # When clade filter, we only want to keep the taxa that map to source ids. - # In addition, when it doesn't map to any, we want to track it if it's - # a synonym, so we may end up including it at the end. 
- if context.clade and is_taxon: - if not len(JSON_contains_known_dbID(json_item, context.source_ids)) > 0: - # Case 1: it could have taxon synonyms via P1420 - if "P1420" in json_item["claims"] and json_item["sitelinks"]: - potential_extra_json_items.append( - ( - "taxon_synonym", - {wikidata_value(i["mainsnak"])["numeric-id"] for i in json_item["claims"]["P1420"]}, - json_item, - ) - ) - # Case 2: it could have synonyms via a P642 in P31 - # Note: as this is a taxon, we're dealing with synonyms, not vernaculars, - # so the variable name is a bit misleading - if vernaculars_matches: - potential_extra_json_items.append(("instance_of_synonym", vernaculars_matches, json_item)) - continue - - if is_taxon: - trim_and_write_json_item(json_item, filtered_wiki_f) - - included_qids.add(Qid(json_item)) - - preserved_lines += 1 - else: - # If it's vernacular, we'll potentially write it out at the end, so save it - potential_extra_json_items.append(("vernacular", vernaculars_matches, json_item)) - - logging.info( - "Writing extra lines at the end of the file " f"(subset of {len(potential_extra_json_items)} lines)" - ) - - for desc, linked_qids, json_item in potential_extra_json_items: - for qid in linked_qids: - # Only write it if it maps to one of the entries we included above - if qid in included_qids: - trim_and_write_json_item(json_item, filtered_wiki_f) - logging.info( - f"Including {desc} entry: Q{Qid(json_item)} " - f"('{label(json_item)}','{get_wikipedia_name(json_item)}' => Q{qid}" - ) - break - - filtered_wiki_f.write("]\n") - - -def read_wikidata_dump(wikidata_dump_file, context): - context.wikidata_ids = set() - - for _, line in enumerate_lines_from_file(wikidata_dump_file): - if not line.startswith('{"type":'): - continue - - json_item = json.loads(line.rstrip().rstrip(",")) - context.wikidata_ids.add(get_wikipedia_name(json_item)) - - -def generate_filtered_wikipedia_sql_dump(wikipedia_sql_dump_file, filtered_wikipedia_sql_dump_file, context): - # the column 
numbers for each datum are specified in the SQL file, and hardcoded here. - page_table_namespace_column = 2 - page_table_title_column = 3 - page_is_redirect_column = 4 - page_table_pagelen_column = 10 - - with open_file_based_on_extension(filtered_wikipedia_sql_dump_file, "wt") as filtered_sql_f: - current_output_line_entry_count = 0 - max_entries_per_line = 10 - with open_file_based_on_extension(wikipedia_sql_dump_file, "rt") as sql_f: - pagelen_file = csv.reader(sql_f, quotechar="'", escapechar="\\", doublequote=False) - match_line = "INSERT INTO `page` VALUES " - for fields in filter( - lambda x: False if len(x) == 0 else x[0].startswith(match_line), - pagelen_file, - ): - field_num = 0 - # the records are all on the same line, separated by '),(', - # so we need to count fields into the line. - for field in fields: - try: - if field and field.lstrip()[0] == "(": - field_num = 0 - namespace = None - title = None - is_redirect = "0" - except IndexError: - pass - field_num += 1 - if field_num == page_table_namespace_column: - namespace = field - if field_num == page_table_title_column: - title = field - if field_num == page_is_redirect_column: - is_redirect = field - elif field_num == page_table_pagelen_column and namespace == "0": - # Only include it if it's one of our wikidata ids - if title in context.wikidata_ids: - if current_output_line_entry_count == 0: - filtered_sql_f.write(match_line) - else: - filtered_sql_f.write(",") - - # Escape the quotes in the title - title = title.replace("'", "\\'") - - # We leave all the other fields empty, as we don't need them - # e.g. 
(,0,'Pan_paniscus',0,,,,,,87,,) - filtered_sql_f.write(f"(,{namespace},'{title}',{is_redirect},,,,,,{field},,)") - - current_output_line_entry_count += 1 - if current_output_line_entry_count == max_entries_per_line: - filtered_sql_f.write(";\n") - current_output_line_entry_count = 0 - - -# If it's quoted, remove the quotes and unescape it -def unquote_if_quoted(s): - if s.startswith("'") and s.endswith("'") or s.startswith('"') and s.endswith('"'): - s = s[1:-1] - return bytes(s, "utf-8").decode("unicode_escape") - return s - - -def generate_filtered_pageviews_file(pageviews_file, filtered_pageviews_file, context): - match_project = context.wikilang + ".wikipedia " - - pageviews = defaultdict(int) - simplified_line_format = False - - for i, line in enumerate_lines_from_file(pageviews_file): - # Check if it's the simplified format based on the first line. - # - Simplified format: - # - Looks like: Chimpanzee 78033 - # - We process all lines - # - Only one line for a given taxon - # - Full format (original format from wikipedia): - # - Looks like: en.wikipedia Chimpanzee 7844 mobile-web 50018 A1581B168[etc...] - # - We ignore lines that don't start with en.wikipedia - # - There can be multiple lines for a given taxon (e.g. 
mobile vs desktop views) - if i == 0: - simplified_line_format = line.count(" ") == 1 - - if i > 0 and i % 10000000 == 0: - logging.info(f"Processed {i} lines") - - if not simplified_line_format and not line.startswith(match_project): - continue - - info = line.split(" ") - if simplified_line_format: - title = info[0] - views = info[1] - else: - title = unquote_if_quoted(info[1]) - views = info[4] - - # Only include it if it's one of our wikidata ids - if title in context.wikidata_ids: - pageviews[title] += int(views) - - # Write out the filtered pageviews in the simplified format - with open_file_based_on_extension(filtered_pageviews_file, "wt") as filtered_f: - for title, views in pageviews.items(): - filtered_f.write(title + " " + str(views) + "\n") - - def generate_all_filtered_files( context, newick_file, @@ -478,40 +81,55 @@ def generate_all_filtered_files( wikipedia_sql_dump_file, wikipedia_pageviews_files, ): + """ + Orchestrate all filtering steps. Used for clade-specific test filtering + and as a convenience wrapper. For the DVC pipeline, the individual filter + modules are invoked as separate stages instead. 
+ """ + prefix = context.clade or one_zoom_file_prefix + if context.clade: - # If we're filtering by clade, we need to generate a filtered newick - filtered_newick_file = generate_and_cache_filtered_file(newick_file, context, generate_filtered_newick) - read_newick_file(filtered_newick_file, context) + filtered_newick_file = _compute_output_path(newick_file, prefix, context.compress) + generate_filtered_newick(newick_file, filtered_newick_file, context.clade) + otts = read_newick_otts(filtered_newick_file) - # We also need to generate a filtered taxonomy file - filtered_taxonomy_file = generate_and_cache_filtered_file( - taxonomy_file, context, generate_filtered_taxonomy_file - ) + filtered_taxonomy_file = _compute_output_path(taxonomy_file, prefix, context.compress) + generate_filtered_taxonomy_file(taxonomy_file, filtered_taxonomy_file, otts) else: - # If we're not filtering by clade, there is really nothing to filter, - # so we just use the original taxonomy file directly. - # Note that we completely ignore the newick file in this case. 
filtered_taxonomy_file = taxonomy_file - read_taxonomy_file(filtered_taxonomy_file, context) - - generate_and_cache_filtered_file(eol_id_file, context, generate_filtered_eol_id_file) - if os.path.basename(wikidata_dump_file).startswith(one_zoom_file_prefix) and not context.clade: - filtered_wikidata_dump_file = wikidata_dump_file - else: - filtered_wikidata_dump_file = generate_and_cache_filtered_file( - wikidata_dump_file, context, generate_filtered_wikidata_dump + source_ids = read_taxonomy_source_ids(filtered_taxonomy_file) + + if eol_id_file: + eol_output = _compute_output_path(eol_id_file, prefix, context.compress) + filter_eol_ids(eol_id_file, eol_output, source_ids, clade=context.clade) + + if wikidata_dump_file: + wikidata_output = _compute_output_path(wikidata_dump_file, prefix, context.compress) + lines = (line for _, line in enumerate_lines_from_file(wikidata_dump_file)) + filter_wikidata( + lines, + wikidata_output, + source_ids=source_ids if context.clade else None, + clade=context.clade, + wikilang=context.wikilang, + dont_trim_sitelinks=context.dont_trim_sitelinks, ) - read_wikidata_dump(filtered_wikidata_dump_file, context) + wikidata_titles = extract_wikidata_titles(wikidata_output) + else: + wikidata_titles = set() - generate_and_cache_filtered_file(wikipedia_sql_dump_file, context, generate_filtered_wikipedia_sql_dump) + if wikipedia_sql_dump_file: + sql_output = _compute_output_path(wikipedia_sql_dump_file, prefix, context.compress) + filter_wikipedia_sql(wikipedia_sql_dump_file, sql_output, wikidata_titles) - for wikipedia_pageviews_file in wikipedia_pageviews_files: - generate_and_cache_filtered_file(wikipedia_pageviews_file, context, generate_filtered_pageviews_file) + if wikipedia_pageviews_files: + for pv_file in wikipedia_pageviews_files: + pv_output = _compute_output_path(pv_file, prefix, context.compress) + filter_pageviews(pv_file, pv_output, wikidata_titles, wikilang=context.wikilang) def process_args(args): - # Create a context object 
to hold various things we need to pass around context = type( "", (object,), @@ -550,8 +168,8 @@ def main(): help="The OpenTree taxonomy.tsv file, from http://files.opentreeoflife.org/ott/", ) parser.add_argument( - "EOLidentifiers", - help=("The gzipped EOL identifiers file, from " "https://opendata.eol.org/dataset/identifiers-csv-gz"), + "--EOLidentifiers", + help="The gzipped EOL identifiers file (optional, previously from opendata.eol.org)", ) parser.add_argument( "wikidataDumpFile", @@ -569,7 +187,7 @@ def main(): nargs="?", help=( "The gzipped >1GB wikipedia -latest-page.sql.gz dump, " - "from https://dumps.wikimedia.org/enwiki/latest/ (enwiki-latest-page.sql.gz) " + "from https://dumps.wikimedia.org/enwiki/latest/ (enwiki-page.sql.gz) " ), ) parser.add_argument( @@ -593,7 +211,7 @@ def main(): "-f", action=argparse.BooleanOptionalAction, default=False, - help="If true, forces the regeneration of all files, ignoring caching.", + help="If true, forces the regeneration of all files (ignored, kept for CLI compatibility).", ) parser.add_argument( "--dont_trim_sitelinks", diff --git a/params.yaml b/params.yaml new file mode 100644 index 0000000..cad7876 --- /dev/null +++ b/params.yaml @@ -0,0 +1,4 @@ +oz_tree: AllLife +ot_version: "v16.1" +build_version: 28017344 +exclude_from_popularity: Archosauria_ott335588 Dinosauria_ott90215 diff --git a/pyproject.toml b/pyproject.toml index 62dc6b4..2d011d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,12 +21,14 @@ dependencies = [ "cryptography>=42.0", "mwparserfromhell>=0.6.6", "requests-cache>=1.2.1", + "dvc[s3]>=3.0", ] [project.optional-dependencies] -test = [ +dev = [ "pytest>=8.1", "ruff>=0.5.1", + "pre-commit>=4.5.1", ] [tool.pytest.ini_options] @@ -39,6 +41,16 @@ add_ott_numbers_to_trees = "oz_tree_build.tree_build.ott_mapping.add_ott_numbers build_oz_tree = "oz_tree_build.tree_build.build_oz_tree:main" get_open_trees_from_one_zoom = "oz_tree_build.tree_build.get_open_trees_from_one_zoom:main" 
generate_filtered_files = "oz_tree_build.utilities.generate_filtered_files:main" +filter_eol = "oz_tree_build.utilities.filter_eol:main" +filter_wikidata = "oz_tree_build.utilities.filter_wikidata:main" +extract_wikidata_titles = "oz_tree_build.utilities.filter_wikidata:extract_titles_main" +filter_wikipedia_sql = "oz_tree_build.utilities.filter_wikipedia_sql:main" +filter_pageviews = "oz_tree_build.utilities.filter_pageviews:main" +download_and_filter_pageviews = "oz_tree_build.utilities.download_and_filter_pageviews:main" +download_and_filter_wikidata = "oz_tree_build.utilities.download_and_filter_wikidata:main" +discover_latest_wikidata_dump_url = "oz_tree_build.utilities.download_and_filter_wikidata:discover_main" +discover_latest_enwiki_sql_url = "oz_tree_build.utilities.filter_wikipedia_sql:discover_main" +download_opentree = "oz_tree_build.utilities.download_opentree:main" CSV_base_table_creator = "oz_tree_build.taxon_mapping_and_popularity.CSV_base_table_creator:main" get_wiki_images = "oz_tree_build.images_and_vernaculars.get_wiki_images:main" process_image_bits = "oz_tree_build.images_and_vernaculars.process_image_bits:main"