From 1e2eb0a04721216b5e474d9dd3fae50f9d2eeefa Mon Sep 17 00:00:00 2001 From: lubwama Date: Wed, 12 Jul 2023 19:30:24 +0300 Subject: [PATCH 1/6] Feat: play on paste . Fixes #36 --- .../react_app/src/components/FileExplorer.tsx | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx index 54ba208..504de22 100644 --- a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx +++ b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx @@ -121,14 +121,14 @@ function useFileNavigate() { } const Files = (): JSX.Element => { - const [displayHelper, setDisplayHelper] = useState(false); + const [displayHelper, setDisplayHelper] = useState(false); const navigate = useFileNavigate(); let { filename, pageNumber, numberLines, files, audioBlob, error } = useLoaderData() as LoaderReturn; const [newFilename, setNewFilename] = useState( filename || config.default_path ); - + // if we have a location, we are in a transition between two urls const navigation = useNavigation(); const locationParams = parseLocation(navigation.location); @@ -168,6 +168,17 @@ const Files = (): JSX.Element => { [setFilename] ); + // Add new function to handle paste events + const fileInputHandlePaste = useCallback( + (evt) => { + const pastedData = evt.clipboardData.getData('text'); + setNewFilename(pastedData); + navigate(pastedData, pageNumber, numberLines); + }, + [navigate, pageNumber, numberLines] + ); + + return (
@@ -183,6 +194,7 @@ const Files = (): JSX.Element => { onChange={setFilenameEventHandler} value={newFilename} onKeyDown={fileInputHandleChange} + onPaste={fileInputHandlePaste} size="sm" /> From de63794c4571e94b761941d755fad528221d524d Mon Sep 17 00:00:00 2001 From: lubwama Date: Fri, 21 Jul 2023 12:46:15 +0300 Subject: [PATCH 2/6] Fix: Trailing whitespace fixed with pre-commit run --all-files --- .github/workflows/lint_and_tests.yaml | 48 +-- CHANGELOG.md | 6 +- CODE_OF_CONDUCT.md | 26 +- CONTRIBUTING.md | 6 +- README.md | 21 +- demo/alti/README.md | 26 +- demo/alti/detecting_hallucinations/README.md | 49 +-- demo/alti/minimal_example/download_nllb.sh | 4 +- .../minimal_example/preset/nllb_demo.yaml | 2 +- demo/iwslt_blaser_eval/README.md | 4 +- demo/iwslt_blaser_eval/conf/eval_blaser.yaml | 2 - demo/iwslt_blaser_eval/conf/launcher | 2 +- demo/toxicity-alti-hb/ETOX/README.md | 20 +- demo/toxicity-alti-hb/README.md | 15 +- demo/toxicity-alti-hb/alti/README.md | 5 +- demo/toxicity-alti-hb/analysis/README.md | 2 + demo/toxicity-alti-hb/annotation/README.md | 51 +-- stopes/eval/alti/LICENSE.md | 366 +++++++++--------- stopes/eval/alti/README.md | 2 +- stopes/eval/blaser/README.md | 2 +- stopes/eval/blaser/conf/score.yaml | 2 +- stopes/pipelines/bitext/README.md | 2 +- .../embed_text/config/encoder/hf_encoder.yaml | 4 +- .../conf/eval/generate_multi_bleu_detok.yaml | 2 +- .../bitext/conf/generate/standard_conf.yaml | 1 - .../bitext/conf/mine_indexes/base.yaml | 40 +- .../bitext/conf/mine_sentences/base.yaml | 26 +- .../conf/moses_filter/standard_conf.yaml | 2 +- .../bitext/conf/nmt_bitext_eval.yaml | 1 - stopes/pipelines/bitext/conf/preset/demo.yaml | 2 +- .../bitext/conf/spm/train/standard_conf.yaml | 2 +- .../bitext/conf/train_spm/standard_conf.yaml | 2 +- .../conf/bitext_clean/default.yaml | 6 +- .../distillation/conf/dedup/default.yaml | 2 +- .../distillation/conf/distillation.yaml | 40 +- stopes/pipelines/distillation/conf/launcher | 2 +- 
.../conf/mono_pipeline/default.yaml | 2 +- .../params/model/transformer.yaml | 2 +- stopes/pipelines/eval/conf/launcher | 2 +- stopes/pipelines/filtering/README.md | 5 +- stopes/pipelines/monolingual/README.md | 1 + .../monolingual/conf/dedup/dedup_files.yaml | 2 +- stopes/pipelines/monolingual/conf/launcher | 2 +- .../monolingual/conf/monolingual.yaml | 35 +- stopes/pipelines/prepare_data/README.md | 33 +- stopes/pipelines/prepare_data/conf/launcher | 2 +- .../prepare_data/conf/prepare_data.yaml | 2 +- stopes/pipelines/speech/README.md | 18 +- .../speech/conf/compute_laser_embeddings.yaml | 2 +- stopes/pipelines/speech/conf/launcher | 2 +- .../conf/embed_text/test_numbers_encoder.yaml | 2 +- stopes/pipelines/translate/conf/example.yaml | 16 +- stopes/pipelines/translate/conf/launcher | 2 +- .../react_app/src/components/FileExplorer.tsx | 7 +- .../ui/seamlisten/react_app/src/e2e.test.js | 30 +- website/docs/eval/alti.md | 31 +- website/docs/eval/blaser.md | 4 +- website/docs/pipelines/distillation.md | 31 +- website/docs/pipelines/global_mining.md | 69 ++-- website/docs/pipelines/monolingual.md | 1 + website/docs/quickstart.md | 54 +-- website/docs/stopes/advanced/checkpointing.md | 7 +- website/docs/stopes/advanced/debugging.md | 30 +- website/docs/stopes/advanced/dynamic.md | 42 +- website/docs/stopes/cache.md | 43 +- website/docs/stopes/configuration.md | 61 +-- website/docs/stopes/index.md | 119 ++---- website/docs/stopes/module.md | 72 +--- website/docs/stopes/pipelining.md | 36 +- website/sidebars.js | 2 +- website/src/css/custom.css | 5 +- website/src/pages/index.js | 188 ++++----- 72 files changed, 776 insertions(+), 981 deletions(-) diff --git a/.github/workflows/lint_and_tests.yaml b/.github/workflows/lint_and_tests.yaml index 07f7292..604eff8 100644 --- a/.github/workflows/lint_and_tests.yaml +++ b/.github/workflows/lint_and_tests.yaml @@ -18,30 +18,30 @@ jobs: runs-on: ${{ matrix.platform }} steps: - - uses: actions/checkout@v2 + - uses: 
actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - cache: 'pip' + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" - - name: Install - # Fairseq doesn't install with pip==22.1 we need to upgrade past it. - # Also the version on pypi is from before Oct 2020. - run: | - python --version - python -m pip install --upgrade 'pip>=22.1.2' - python -m pip show pip - python -m pip install 'git+https://github.com/facebookresearch/fairseq.git@v0.12.1' - python -m pip install -e '.[dev,mono,mining]' + - name: Install + # Fairseq doesn't install with pip==22.1 we need to upgrade past it. + # Also the version on pypi is from before Oct 2020. + run: | + python --version + python -m pip install --upgrade 'pip>=22.1.2' + python -m pip show pip + python -m pip install 'git+https://github.com/facebookresearch/fairseq.git@v0.12.1' + python -m pip install -e '.[dev,mono,mining]' - - name: isort - run: isort --check --diff . - - name: black - run: black --check --diff . - - name: pytest - run: pytest - # TODO: fix type issues - - name: mypy - run: 'mypy || echo "Warning: mypy still does not pass"' + - name: isort + run: isort --check --diff . + - name: black + run: black --check --diff . 
+ - name: pytest + run: pytest + # TODO: fix type issues + - name: mypy + run: 'mypy || echo "Warning: mypy still does not pass"' diff --git a/CHANGELOG.md b/CHANGELOG.md index eb03757..16b333f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,24 +20,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - when mining, keep metadata about what pairs come from the forward and backward pass - when mining, choose if you want to do only forward, backward or both passes - - ### Changed - embeddings for mining are now stored in real npy files with headers - `StopesModule` is not `async` anymore, just the APIs of `Launcher`. You should write your `run` function as -a normal non-async function + a normal non-async function - mining neighbours is now optimized to have a smaller memory load - progress bar of pipelines is simplified to avoid overly busy logs - do not rely on existing line count files and compute them as part of the pipeline in the mining - ### Fixed - many improvements in the mining code - many fixes in the NMT eval pipeline - ## 1.0.0 Initial release diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 83f431e..c4a3c1d 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -14,22 +14,22 @@ appearance, race, religion, or sexual identity and orientation. 
Examples of behavior that contributes to creating a positive environment include: -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members +- Using welcoming and inclusive language +- Being respectful of differing viewpoints and experiences +- Gracefully accepting constructive criticism +- Focusing on what is best for the community +- Showing empathy towards other community members Examples of unacceptable behavior by participants include: -* The use of sexualized language or imagery and unwelcome sexual attention or -advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic -address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a -professional setting +- The use of sexualized language or imagery and unwelcome sexual attention or + advances +- Trolling, insulting/derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or electronic + address, without explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting ## Our Responsibilities diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index fad26ca..703968e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,5 @@ # Contributing to `stopes` + We want to make contributing to this project as easy and transparent as possible. @@ -9,6 +10,7 @@ and push our changes to the open source community when we have a stable version or interesting results. ## Pull Requests + We actively welcome your pull requests. 1. Fork the repo and create your branch from `main`. 
@@ -19,12 +21,14 @@ We actively welcome your pull requests. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). ## Contributor License Agreement ("CLA") + In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Meta's open source projects. Complete your CLA here: ## Issues + We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. @@ -32,7 +36,7 @@ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. - ## License + By contributing to `stopes`, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. diff --git a/README.md b/README.md index ddc051d..0625092 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ ![stopes](/website/static/img/banner.png?raw=true "stopes by NLLB.") - # `stopes`: A library for preparing data for machine translation research As part of the FAIR No Language Left Behind (NLLB) ([Paper](https://research.facebook.com/publications/no-language-left-behind/), [Website](https://ai.facebook.com/research/no-language-left-behind/), [Blog](https://ai.facebook.com/blog/nllb-200-high-quality-machine-translation/)) @@ -18,12 +17,14 @@ checkout the `demo` directory for an example usage with the [WMT22 Shared Task: Languages](https://statmt.org/wmt22/large-scale-multilingual-translation-task.html) data. 
## Requirements + `stopes` relies on: -* submitit to schedule jobs when ran on clusters -* hydra-core version >= 1.2.0 for configuration -* fairseq to use LASER encoders -* PyTorch version >= 1.5.0 -* Python version >= 3.8 + +- submitit to schedule jobs when ran on clusters +- hydra-core version >= 1.2.0 for configuration +- fairseq to use LASER encoders +- PyTorch version >= 1.5.0 +- Python version >= 3.8 ## Installing stopes @@ -32,11 +33,13 @@ pip for the install to work. We recommend that you first upgrade pip: `python -m pip install --upgrade pip` The mining pipeline relies on fairseq to run LASER encoders, because of competing dependency version, you'll have to first install fairseq with pip separately: + ``` pip install fairseq==0.12.1 ``` You can then install stopes with pip: + ``` git clone https://github.com/facebookresearch/stopes.git cd stopes @@ -48,6 +51,7 @@ You can choose what to install. If you are only interested in `mining`, you do n Currently `fairseq` and `stopes` require different version of hydra, so `pip` might output some warnings, do not worry about them, we want hydra>=1.1. If you plan to train a lot of NMT model you will also want to setup apex to get a faster training. + ``` git clone https://github.com/NVIDIA/apex cd apex @@ -59,16 +63,19 @@ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cud ## How `stopes` works `stopes` is made of a few different parts: + 1. `core` provides a library to write readable piplines 2. `modules` provides a set of modules using the core library and implementing common steps in our mining and evaluation pipelines 3. `pipelines` provides pipeline implementation for the data pipelines we use in NLLB: + - `monolingual` to preprocess and clean single language data - `bitext` to run the "global mining" pipeline and extract aligned sentences from two monolingual datasets. 
(inspired by [CCMatrix](https://ai.facebook.com/blog/ccmatrix-a-billion-scale-bitext-data-set-for-training-translation-models/)) - `distilation` to run our sequence-level knowledge distillation pipeline which trains a small student model from a pre-trained large teacher model (approach based on https://arxiv.org/abs/1606.07947) + 4. `eval` provides a set of evaluation tools, including ALTI+ and BLASER for text-free speech translation evaluation. 5. `demo` contains applications of stopes, including a quickstart demo that you can run at home of mining as well as a example usage of ALTI+ for toxicity and hallucination analysis. @@ -99,6 +106,7 @@ See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out. (in alphabetical order) ## Citation + If you use `stopes` in your work, please cite: ```bibtex @@ -125,4 +133,5 @@ Some of the tools in stopes, like BLASER and ALTI have their own publications, p ``` ## License + `stopes` is MIT licensed, as found in the LICENSE file. diff --git a/demo/alti/README.md b/demo/alti/README.md index 0bf0d69..6d6cc34 100644 --- a/demo/alti/README.md +++ b/demo/alti/README.md @@ -6,32 +6,37 @@ Our implementation is based on the code from the paper [Ferrando et al., 2022](h The code and readme for it are located at `stopes/eval/alti`. # Installation + To use the core ALTI+ code, you need to install Stopes with the `alti` dependencies: + ``` git clone https://github.com/facebookresearch/stopes.git cd stopes && pip install -e '.[alti]' && cd .. ``` -You will also need Fairseq. To work with [NLLB models](https://github.com/facebookresearch/fairseq/tree/nllb), +You will also need Fairseq. To work with [NLLB models](https://github.com/facebookresearch/fairseq/tree/nllb), you have to checkout the corresponding branch: + ``` git clone https://github.com/pytorch/fairseq cd fairseq && git checkout nllb && pip install -e . && python setup.py build_ext --inplace && cd .. 
``` # The minimal example (CLI with an NLLB model) + The code, configs and toy data for this example is in the `minimal_example` directory. To download the official 600M checkpoint of the [NLLB-200 model](https://github.com/facebookresearch/fairseq/tree/nllb), -run the script `download_nllb.sh` from the `minimal_example` directory; +run the script `download_nllb.sh` from the `minimal_example` directory; it will create the `nllb` directory there and download the model and the dictiory into it. The following command will read sentence pairs `test_input.tsv` and compute ALTI and ALTI-based metrics for them. -It will output various text-level scores into `test_output.tsv`, +It will output various text-level scores into `test_output.tsv`, and token-level contributions and alignments will be stored in `test_output_alignments.jsonl`. The file `test_input.tsv` contains columns `src` and `mt` with a few French-English translation pairs with different pathologies: + ``` src mt Traduction normale. A normal translation. @@ -54,10 +59,11 @@ python compute_nllb_alti.py \ tgt_lang=eng_Latn \ +preset=nllb_demo +demo_dir=$(pwd)/nllb ``` + Some arguments to the command are stored in the configuration file: `preset/nllb_demo.yaml`. You can edit this file or create another preset, if you want. -The file `test_output.tsv` contains multiple columns with various ALTI-based metrics +The file `test_output.tsv` contains multiple columns with various ALTI-based metrics (here we show only a few columns for better readability): ``` @@ -70,22 +76,24 @@ The file `test_output.tsv` contains multiple columns with various ALTI-based met 5 A transfer in which I have accessed an error. 0.61 0.40 0.14 0.00 6 A translation with with with with with with a cyclical hallucination. 0.61 0.28 0.13 0.00 ``` -One can see that the metrics `avg_sc`, `min_sc` and `top_sc_mean` may help to detect hallucinations, -whereas `src_sum_contr_below_01` indicates incomplete translations. 
+One can see that the metrics `avg_sc`, `min_sc` and `top_sc_mean` may help to detect hallucinations, +whereas `src_sum_contr_below_01` indicates incomplete translations. -The file `test_output_alignments.jsonl` contains individual subword tokens of the source and target sentences, +The file `test_output_alignments.jsonl` contains individual subword tokens of the source and target sentences, the raw ALTI+ contribution matrices produced for these tokens, and the alignments computed from these matrices: + ``` {"contributions": [[0.52, 0.06, ...]], "alignment": [[0, 0], [0, 4], [0, 6], [1, 0], ...], "src_toks": ["__fra_Latn__", "▁Trad", "uction", "▁normale", ".", ""], "tgt_toks": ["", "__eng_Latn__", "▁A", "▁normal", "▁translation", "."], "pred_toks": ["__eng_Latn__", "▁A", "▁normal", "▁translation", ".", ""]} {"contributions": ... ... ``` + `src_toks` are the encoder inputs, `tgt_toks` are the inputs to the decoder, whereas `pred_toks` are its outputs. In fact, `tgt_toks` are `pred_toks` shifted by one position. - # Reproducing the hallucination detection experiments + The folder `detecting_hallucinations` contains the code for reproducing the experiments on hallucination detection from the paper [Detecting and Mitigating Hallucinations in Machine Translation: Model Internal Workings Alone Do Well, Sentence Similarity Even Better](https://arxiv.org/abs/2212.08597). @@ -93,7 +101,6 @@ The detailed instructions for reproduction are in that folder. 
To refer to these results, please cite: - ```bibtex @article{dale2022detecting, title={Detecting and Mitigating Hallucinations in Machine Translation: Model Internal Workings Alone Do Well, Sentence Similarity Even Better}, @@ -103,4 +110,3 @@ To refer to these results, please cite: year={2022} } ``` - diff --git a/demo/alti/detecting_hallucinations/README.md b/demo/alti/detecting_hallucinations/README.md index 665e3bb..fb65821 100644 --- a/demo/alti/detecting_hallucinations/README.md +++ b/demo/alti/detecting_hallucinations/README.md @@ -1,26 +1,27 @@ # Token contributions for hallucination detection -This folder contains the code for reproducing the experiments from the paper +This folder contains the code for reproducing the experiments from the paper [Detecting and Mitigating Hallucinations in Machine Translation: Model Internal Workings Alone Do Well, Sentence Similarity Even Better](https://arxiv.org/abs/2212.08597). The structure is following: -- `annotated_data`: - - `guerreiro2022_corpus_w_annotations.csv`: the corpus from the [Guerreiro, 2022 repository](https://github.com/deep-spin/hallucinations-in-nmt) with German-English translations annotated by pathology type. - - `annotate_hallucination_mitigation_v7_stacked.tsv`: a subset of this corpus translated by 3 improved systems and re-annottated. + +- `annotated_data`: + - `guerreiro2022_corpus_w_annotations.csv`: the corpus from the [Guerreiro, 2022 repository](https://github.com/deep-spin/hallucinations-in-nmt) with German-English translations annotated by pathology type. + - `annotate_hallucination_mitigation_v7_stacked.tsv`: a subset of this corpus translated by 3 improved systems and re-annottated. - `computed_data`: should contain the files created by our code. They can either be downloaded from [this link](https://dl.fbaipublicfiles.com/nllb/hallucination_detection_data.zip) or re-computed from scratch by the code below. 
- `experiments`: the code for the experiments, organized in 5 Jupyter notebooks: - - `01_Detection.ipynb`: computing various metrics of translation quality on the aforementioned corpus. - - `02_Detection_analysis.ipynb`: evaluating the metrics computed above. - - `03_Mitigation.ipynb`: translating German sentences to English by generating multiple hypotheses with various methods and reranking them with various scores. - - `04_Mitigation_more_hypotheses.ipynb`: the same experiments as above, with fewer generation methods and a larger pool of hypotheses. - - `05_Mitigation_analysis.ipynb`: evaluating the translations computed above. + - `01_Detection.ipynb`: computing various metrics of translation quality on the aforementioned corpus. + - `02_Detection_analysis.ipynb`: evaluating the metrics computed above. + - `03_Mitigation.ipynb`: translating German sentences to English by generating multiple hypotheses with various methods and reranking them with various scores. + - `04_Mitigation_more_hypotheses.ipynb`: the same experiments as above, with fewer generation methods and a larger pool of hypotheses. + - `05_Mitigation_analysis.ipynb`: evaluating the translations computed above. The notebooks `02_Detection_analysis.ipynb` and `05_Mitigation_analysis.ipynb` reproduce most of the figures and tables mentioned in the paper. - # Setup 1. Prepare the environment by installing Fairseq, Stopes, and some extra libraries: + ``` pip install fairseq==0.12.1 git clone https://github.com/facebookresearch/stopes.git @@ -32,21 +33,24 @@ pip install -r requirements.txt 2. Set up the translation model - 2.1. Download the translation [model](https://www.mediafire.com/file/mp5oim9hqgcy8fb/checkpoint_best.tar.xz/file) and [data](https://www.mediafire.com/file/jfl7y6yu7jqwwhv/wmt18_de-en.tar.xz/file) from https://github.com/deep-spin/hallucinations-in-nmt and put them in the `model` directory. + 2.1. 
Download the translation [model](https://www.mediafire.com/file/mp5oim9hqgcy8fb/checkpoint_best.tar.xz/file) and [data](https://www.mediafire.com/file/jfl7y6yu7jqwwhv/wmt18_de-en.tar.xz/file) from https://github.com/deep-spin/hallucinations-in-nmt and put them in the `model` directory. + + 2.2. Run the following commands to unpack the data: + + ``` + tar -xvf model/checkpoint_best.tar.xz + tar -xvf model/wmt18_de-en.tar.xz + ``` - 2.2. Run the following commands to unpack the data: - ``` - tar -xvf model/checkpoint_best.tar.xz - tar -xvf model/wmt18_de-en.tar.xz - ``` + 2.3. Run the following command to download the tokenizers: - 2.3. Run the following command to download the tokenizers: - ``` - wget -P model https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.model - wget -P model https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.vocab - ``` + ``` + wget -P model https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.model + wget -P model https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.vocab + ``` 3. Download LASER2 encoder (see https://github.com/facebookresearch/LASER/tree/main/nllb) by running: + ``` mkdir laser cd laser @@ -55,6 +59,7 @@ cd .. ``` 4. Optionally, download the computed translations and scores (instead of re-computing it by notebooks 1, 3, and 4): + ``` wget https://dl.fbaipublicfiles.com/nllb/hallucination_detection_data.zip unzip hallucination_detection_data.zip @@ -62,8 +67,8 @@ unzip hallucination_detection_data.zip Now you can run any notebook from the `experiments` folder. 
- # Citation + If you use refer to this code or results in your work, please cite: ```bibtex diff --git a/demo/alti/minimal_example/download_nllb.sh b/demo/alti/minimal_example/download_nllb.sh index f675bc3..632ce24 100644 --- a/demo/alti/minimal_example/download_nllb.sh +++ b/demo/alti/minimal_example/download_nllb.sh @@ -6,12 +6,12 @@ mkdir nllb cd nllb -# downloading the vocabulary; +# downloading the vocabulary; wget --trust-server-names https://tinyurl.com/flores200sacrebleuspm wget --trust-server-names https://tinyurl.com/nllb200dictionary # downloading the smallest NLLB200 model; it may take about 5 minutes -wget --trust-server-names https://tinyurl.com/nllb200densedst600mcheckpoint +wget --trust-server-names https://tinyurl.com/nllb200densedst600mcheckpoint for lang in ace_Latn acm_Arab acq_Arab aeb_Arab afr_Latn ajp_Arab aka_Latn als_Latn amh_Ethi apc_Arab arb_Arab ars_Arab ary_Arab arz_Arab asm_Beng ast_Latn awa_Deva ayr_Latn azb_Arab azj_Latn bak_Cyrl bam_Latn ban_Latn bel_Cyrl bem_Latn ben_Beng bho_Deva bjn_Latn bod_Tibt bos_Latn bul_Cyrl cat_Latn ceb_Latn ces_Latn cjk_Latn ckb_Arab crh_Latn cym_Latn dan_Latn deu_Latn dik_Latn dyu_Latn dzo_Tibt ell_Grek eng_Latn epo_Latn est_Latn eus_Latn ewe_Latn fao_Latn fij_Latn fin_Latn fon_Latn fra_Latn fur_Latn fuv_Latn gaz_Latn gla_Latn gle_Latn glg_Latn grn_Latn guj_Gujr hat_Latn hau_Latn heb_Hebr hin_Deva hne_Deva hrv_Latn hun_Latn hye_Armn ibo_Latn ilo_Latn ind_Latn isl_Latn ita_Latn jav_Latn jpn_Jpan kab_Latn kac_Latn kam_Latn kan_Knda kas_Arab kas_Deva kat_Geor kaz_Cyrl kbp_Latn kea_Latn khk_Cyrl khm_Khmr kik_Latn kin_Latn kir_Cyrl kmb_Latn kmr_Latn knc_Arab knc_Latn kon_Latn kor_Hang lao_Laoo lij_Latn lim_Latn lin_Latn lit_Latn lmo_Latn ltg_Latn ltz_Latn lua_Latn lug_Latn luo_Latn lus_Latn lvs_Latn mag_Deva mai_Deva mal_Mlym mar_Deva min_Latn mkd_Cyrl mlt_Latn mni_Beng mos_Latn mri_Latn mya_Mymr nld_Latn nno_Latn nob_Latn npi_Deva nso_Latn nus_Latn nya_Latn oci_Latn ory_Orya pag_Latn pan_Guru pap_Latn 
pbt_Arab pes_Arab plt_Latn pol_Latn por_Latn prs_Arab quy_Latn ron_Latn run_Latn rus_Cyrl sag_Latn san_Deva scn_Latn shn_Mymr sin_Sinh slk_Latn slv_Latn smo_Latn sna_Latn snd_Arab som_Latn sot_Latn spa_Latn srd_Latn srp_Cyrl ssw_Latn sun_Latn swe_Latn swh_Latn szl_Latn tam_Taml taq_Latn tat_Cyrl tel_Telu tgk_Cyrl tgl_Latn tha_Thai tir_Ethi tpi_Latn tsn_Latn tso_Latn tuk_Latn tum_Latn tur_Latn twi_Latn tzm_Tfng uig_Arab ukr_Cyrl umb_Latn urd_Arab uzn_Latn vec_Latn vie_Latn war_Latn wol_Latn xho_Latn ydd_Hebr yor_Latn yue_Hant zho_Hans zho_Hant zsm_Latn zul_Latn; do cp dictionary.txt dict.${lang}.txt diff --git a/demo/alti/minimal_example/preset/nllb_demo.yaml b/demo/alti/minimal_example/preset/nllb_demo.yaml index 9c8a380..2b46269 100644 --- a/demo/alti/minimal_example/preset/nllb_demo.yaml +++ b/demo/alti/minimal_example/preset/nllb_demo.yaml @@ -3,7 +3,7 @@ demo_dir: ??? DATADIR: ${demo_dir} is_multilingual: True -checkpoint: ${demo_dir}/checkpoint.pt +checkpoint: ${demo_dir}/checkpoint.pt data_dir: ${demo_dir} spm: ${demo_dir}/flores200_sacrebleu_tokenizer_spm.model src_col: src diff --git a/demo/iwslt_blaser_eval/README.md b/demo/iwslt_blaser_eval/README.md index 641e9f4..8d93447 100644 --- a/demo/iwslt_blaser_eval/README.md +++ b/demo/iwslt_blaser_eval/README.md @@ -1,4 +1,3 @@ - # Getting started with mining Welcome to `stopes`, this is a quickstart guide to discover how to run automated pipelines with `stopes`. In this example we describe how to use BLASER to evaluate speech translation as described in the https://iwslt.org/2023/s2s task. 
@@ -68,8 +67,8 @@ Make sure to replace: - `PATH_TARGET_MANIFEST.tsv` to the manifest you've generated for the translation files - `PATH_REFERENCE_MANIFEST.tsv` to the manifest you've generated for the reference translations - ## Citation + If you use `blaser` in your work or any of its models, please cite: ```bibtex @@ -84,4 +83,5 @@ If you use `blaser` in your work or any of its models, please cite: ``` ## License + The `blaser` code is MIT licensed, as found in the LICENSE file in the root directory. diff --git a/demo/iwslt_blaser_eval/conf/eval_blaser.yaml b/demo/iwslt_blaser_eval/conf/eval_blaser.yaml index 678b125..2ebf638 100644 --- a/demo/iwslt_blaser_eval/conf/eval_blaser.yaml +++ b/demo/iwslt_blaser_eval/conf/eval_blaser.yaml @@ -29,14 +29,12 @@ blaser_model: config_file: ${demo_dir}/blaser_model/model.config model_checkpoint: ${demo_dir}/blaser_model/model.pt - # for this IWSLT we will to EN to CMN evaluation (source is en, tgt and reference are Mandarin) src_lang: en tgt_lang: cmn # by default tgt and ref are the same lang ref_lang: ${tgt_lang} - max_tokens: 2_560_000 # mapping from lang code to encoder checkpoint diff --git a/demo/iwslt_blaser_eval/conf/launcher b/demo/iwslt_blaser_eval/conf/launcher index e5fb673..c149d04 120000 --- a/demo/iwslt_blaser_eval/conf/launcher +++ b/demo/iwslt_blaser_eval/conf/launcher @@ -1 +1 @@ -../../../stopes/pipelines/bitext/conf/launcher \ No newline at end of file +../../../stopes/pipelines/bitext/conf/launcher diff --git a/demo/toxicity-alti-hb/ETOX/README.md b/demo/toxicity-alti-hb/ETOX/README.md index fdb55bd..c0a1d4e 100644 --- a/demo/toxicity-alti-hb/ETOX/README.md +++ b/demo/toxicity-alti-hb/ETOX/README.md @@ -3,24 +3,28 @@ Contains scripts for calculating toxicity results, given files of input strings and toxicity lists. 
Prerequisites: + - Install the HolisticBias module ([setup instructions](https://github.com/facebookresearch/ResponsibleNLP/tree/main/holistic_bias)) - Define paths to include etox.py Files: + - `ETOX example calls.ipynb`: Example Usage of the main ETOX toxicity tool functions. - `etox.py`: contains all the python functions for the ETOX tool - `README.md`: this file Functions: Main Functions: -- `etox_single`: Takes a Pandas dataframe and a toxicity list filename, and outputs multiple dataframes of toxicity report results. -- `etox_paired`: Paired language toxicity evaluation function. Takes 2 Pandas dataframes and a toxicity list filenames, and outputs an annotated line by line labeled table of toxicity matches for further analysis. -- `etox_paired_file_wrapper`: File reading/writing wrapper for the paired language toxicity evaluation function. + +- `etox_single`: Takes a Pandas dataframe and a toxicity list filename, and outputs multiple dataframes of toxicity report results. +- `etox_paired`: Paired language toxicity evaluation function. Takes 2 Pandas dataframes and a toxicity list filenames, and outputs an annotated line by line labeled table of toxicity matches for further analysis. +- `etox_paired_file_wrapper`: File reading/writing wrapper for the paired language toxicity evaluation function. Support Functions + - `load_eval_data_line_by_line` Loads a text file of strings, returns a Pandas Dataframe -- `txt_format`: simple data cleaning function. Lowercases and uses regex to remove punctuation, etc. -- `import_toxicity_list_file`: reads a toxicity list file into memory given a filename. Returns a List. -- `token_checker`: Checks for matches between a string and a toxic word list used if 'space' tokenization selected -- `substring_checker`: checks for character level matches ignoring spaces. Will find subwords. 
Used if 'character' tokenization selected -- `SPM_token_checker`: Toxic phrase checker utilizing sub-word spm_tokenization rather than simply using spaces like the stard checker. Useful for a few languages where space tokenization is unreliable, or when matching subtokens may be important. Requires the Sentencepiece library to function. +- `txt_format`: simple data cleaning function. Lowercases and uses regex to remove punctuation, etc. +- `import_toxicity_list_file`: reads a toxicity list file into memory given a filename. Returns a List. +- `token_checker`: Checks for matches between a string and a toxic word list used if 'space' tokenization selected +- `substring_checker`: checks for character level matches ignoring spaces. Will find subwords. Used if 'character' tokenization selected +- `SPM_token_checker`: Toxic phrase checker utilizing sub-word spm_tokenization rather than simply using spaces like the stard checker. Useful for a few languages where space tokenization is unreliable, or when matching subtokens may be important. Requires the Sentencepiece library to function. diff --git a/demo/toxicity-alti-hb/README.md b/demo/toxicity-alti-hb/README.md index 1a15de9..6c794f4 100644 --- a/demo/toxicity-alti-hb/README.md +++ b/demo/toxicity-alti-hb/README.md @@ -1,11 +1,12 @@ # mtoxicity-alti-holisticbias + MT toxicity at scale: deep detection and analysis. Subfolders: -- `alti/`: contains (1) the outputs of the translation models and (2) the source contributions and word alignments for the MT outputs of holisticbias with the NLLB 3B dense model. We used the github repository: https://github.com/mt-upc/transformer-contributions + +- `alti/`: contains (1) the outputs of the translation models and (2) the source contributions and word alignments for the MT outputs of holisticbias with the NLLB 3B dense model. 
We used the github repository: https://github.com/mt-upc/transformer-contributions - `analysis/`: scripts for calculating/plotting toxicity results, given (1) toxicities precomputed with ETOX and (2) ALTI+ scores. - `annotation/`: contains the false positive and the false negative analysis conducted for 8 outputs on the holisticbias toxicity detection. - `ETOX/`: contains the tool for detecting toxicity - # Contributors: Marta R. Costa-jussà, alti/ @@ -22,13 +23,13 @@ If you use toxicity-alti-hb in your work, please cite : @article{toxicity2022, - title={Toxicity in Multilingual Machine Translation at Scale}, +title={Toxicity in Multilingual Machine Translation at Scale}, - author={Costa-jussà, M.R., Smith, E, Ropers, C., Licht.,D., Ferrando, J., Escolano, C.}, +author={Costa-jussà, M.R., Smith, E, Ropers, C., Licht.,D., Ferrando, J., Escolano, C.}, - journal={Arxiv, abs/2210.03070}, +journal={Arxiv, abs/2210.03070}, - url={https://arxiv.org/abs/2210.03070.pdf}, +url={https://arxiv.org/abs/2210.03070.pdf}, - year={2022} +year={2022} } diff --git a/demo/toxicity-alti-hb/alti/README.md b/demo/toxicity-alti-hb/alti/README.md index 8fed5c5..708df50 100644 --- a/demo/toxicity-alti-hb/alti/README.md +++ b/demo/toxicity-alti-hb/alti/README.md @@ -4,21 +4,18 @@ Data can be download from: wget --trust-server-names https://tinyurl.com/toxtranslationaltioutputs - `HB-dense3B-outputs/`: 164 folder from English to 164 languages. Each folder has the non-tokenized and the spm translation output (of HolisticBias) for the NLLB 3B dense model . Example of folder for LANGX in the 164 languages: eng_Latn-LANGX/holistic.eng_Latn-LANGX eng_Latn-LANGX/spm_holistic.eng_Latn-LANGX - -`HB-distilled600M-outputs`: 164 folder from English to 164 languages. Each folder has the non-tokenized and the spm translation output (of HolisticBias) for the NLLB 600M distilled model. 
Example of 1 of these folders for LANGX in the 164 languages: +`HB-distilled600M-outputs`: 164 folder from English to 164 languages. Each folder has the non-tokenized and the spm translation output (of HolisticBias) for the NLLB 600M distilled model. Example of 1 of these folders for LANGX in the 164 languages: eng_Latn-LANGX/holistic.eng_Latn-LANGX eng_Latn-LANGX/spm_holistic.eng_Latn-LANGX - `alti-outputs/`: 164 folder from English to 164 languages. Each folder has two files: the outputs of the source contributions and alignments for the MT outputs of HolisticBias with the NLLB 3B dense model. Example 1 of these folders for LANGX in the 164 languages: eng_Latn-LANGX/output.eng_Latn-LANGX diff --git a/demo/toxicity-alti-hb/analysis/README.md b/demo/toxicity-alti-hb/analysis/README.md index 0ed6c45..ba55a37 100644 --- a/demo/toxicity-alti-hb/analysis/README.md +++ b/demo/toxicity-alti-hb/analysis/README.md @@ -1,10 +1,12 @@ # Analysis code Contains scripts for calculating/plotting toxicity results, given precomputed toxicities and ALTI+ scores. Prerequisites: + - Install the HolisticBias module ([setup instructions](https://github.com/facebookresearch/ResponsibleNLP/tree/main/holistic_bias)) - Define paths for loading in pre-existing files of source/target sentences, toxicity results, ALTI+ source contribution scores, etc. in `util.py` (see `'TODO'`s) Scripts: + - `00_compile_toxicity_stats.py`: compute the course-grained analysis of toxicity as a function of language, axis, noun, template, etc. 
- `00c_plot_toxicity_per_lang.py`: plot the breakdown of toxicity across HolisticBias axes as a function of language - `01_sample_high_risk_translations.py`: sample translations likely to be toxic despite not being labeled as toxic, for the false negative analysis diff --git a/demo/toxicity-alti-hb/annotation/README.md b/demo/toxicity-alti-hb/annotation/README.md index 2a69236..a10d46f 100644 --- a/demo/toxicity-alti-hb/annotation/README.md +++ b/demo/toxicity-alti-hb/annotation/README.md @@ -7,36 +7,43 @@ https://tinyurl.com/hbtoxicannotation (you might need to copy paste it in a new browser window for the download to work) The folder contains 16 TSV files, 2 files for each of the below languoids. -* cat_Latn: Catalan -* eus_Latn: Basque -* fra_Latn: French -* pes_Arab: Western Persian -* spa_Latn: Spanish -* zho_Hans: Chinese (simplified script) -* zho_Hant: Chinese (traditional script) + +- cat_Latn: Catalan +- eus_Latn: Basque +- fra_Latn: French +- pes_Arab: Western Persian +- spa_Latn: Spanish +- zho_Hans: Chinese (simplified script) +- zho_Hant: Chinese (traditional script) For each languoid, one file includes annotations for sentences where candidate toxicity was automatically detected (true|false positives), the other file includes annotations for a sample of sentences where no toxicity was automatically detected (true|false negatives). 
## Positives + Each file displays for each annotated item: -* the BCP47 code for the input language -* the BCP47 code for the output language -* the input sentence -* the output sentence -* the detected toxicity list entry -* the TRUE | FALSE annotation (TRUE = confirmed toxicity) + +- the BCP47 code for the input language +- the BCP47 code for the output language +- the input sentence +- the output sentence +- the detected toxicity list entry +- the TRUE | FALSE annotation (TRUE = confirmed toxicity) ## Negatives + Each file displays for each annotated item: -* the BCP47 code for the input language -* the BCP47 code for the output language -* the input sentence -* the output sentence -* the TRUE | FALSE annotation (TRUE = confirmed toxicity) + +- the BCP47 code for the input language +- the BCP47 code for the output language +- the input sentence +- the output sentence +- the TRUE | FALSE annotation (TRUE = confirmed toxicity) ## Confirmed toxicity + A positive detection is confirmed toxic when: -* it matches a toxicity list entry, and: - * it is always toxic (context-independent entries), or - * it is assessed toxic in the context of the sentence (context-dependent entries). -A negative detection is confirmed toxic when it matches a morphological variant of a toxicity list entry. + +- it matches a toxicity list entry, and: + _ it is always toxic (context-independent entries), or + _ it is assessed toxic in the context of the sentence (context-dependent entries). + A negative detection is confirmed toxic when it matches a morphological variant of a toxicity list entry. diff --git a/stopes/eval/alti/LICENSE.md b/stopes/eval/alti/LICENSE.md index 261eeb9..c61b663 100644 --- a/stopes/eval/alti/LICENSE.md +++ b/stopes/eval/alti/LICENSE.md @@ -2,180 +2,180 @@ Version 2.0, January 2004 http://www.apache.org/licenses/ - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" @@ -186,16 +186,16 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] +Copyright [yyyy] [name of copyright owner] - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/stopes/eval/alti/README.md b/stopes/eval/alti/README.md index 0a07565..1fad0f3 120000 --- a/stopes/eval/alti/README.md +++ b/stopes/eval/alti/README.md @@ -1 +1 @@ -../../../website/docs/eval/alti.md \ No newline at end of file +../../../website/docs/eval/alti.md diff --git a/stopes/eval/blaser/README.md b/stopes/eval/blaser/README.md index bf7ffcb..2815067 120000 --- a/stopes/eval/blaser/README.md +++ b/stopes/eval/blaser/README.md @@ -1 +1 @@ -../../../website/docs/eval/blaser.md \ No newline at end of file +../../../website/docs/eval/blaser.md diff --git a/stopes/eval/blaser/conf/score.yaml b/stopes/eval/blaser/conf/score.yaml index 879eef4..30e19d5 100644 --- a/stopes/eval/blaser/conf/score.yaml +++ b/stopes/eval/blaser/conf/score.yaml @@ -12,4 +12,4 @@ model: batch_size: 100 use_gpu: True -output_dir: test \ No newline at end of file +output_dir: test diff --git a/stopes/pipelines/bitext/README.md b/stopes/pipelines/bitext/README.md index 1e7844c..1ab32a9 120000 --- a/stopes/pipelines/bitext/README.md +++ b/stopes/pipelines/bitext/README.md @@ -1 +1 @@ -../../../website/docs/pipelines/global_mining.md \ No newline at end of file +../../../website/docs/pipelines/global_mining.md diff --git a/stopes/pipelines/bitext/conf/embed_text/config/encoder/hf_encoder.yaml b/stopes/pipelines/bitext/conf/embed_text/config/encoder/hf_encoder.yaml index 68209f6..7964cf8 100644 --- a/stopes/pipelines/bitext/conf/embed_text/config/encoder/hf_encoder.yaml +++ b/stopes/pipelines/bitext/conf/embed_text/config/encoder/hf_encoder.yaml @@ -7,5 +7,5 @@ fp16: False # these are not used, but required as the laser3 uses it. 
but sentence transformer doesn't need them # we just set it to nothing -spm_model: '' -spm_vocab: '' +spm_model: "" +spm_vocab: "" diff --git a/stopes/pipelines/bitext/conf/eval/generate_multi_bleu_detok.yaml b/stopes/pipelines/bitext/conf/eval/generate_multi_bleu_detok.yaml index 4c0cb22..a59a0e4 100644 --- a/stopes/pipelines/bitext/conf/eval/generate_multi_bleu_detok.yaml +++ b/stopes/pipelines/bitext/conf/eval/generate_multi_bleu_detok.yaml @@ -4,7 +4,7 @@ tgt_lang: ??? checkpoint_dir: ??? # directory containing checkpoints binarized_dir: ??? # contains binarized files for each lang and split output_dir: ??? -checkpoint_glob: "checkpoint[0-9]*.pt" # pattern for file checkpoints in checkpoint_dir +checkpoint_glob: "checkpoint[0-9]*.pt" # pattern for file checkpoints in checkpoint_dir beam: 5 batch_size: 32 batch_memory: 2 # mem_gb in Distributed requirements is = batch_memory * batch_size diff --git a/stopes/pipelines/bitext/conf/generate/standard_conf.yaml b/stopes/pipelines/bitext/conf/generate/standard_conf.yaml index 54ffe8a..905c695 100644 --- a/stopes/pipelines/bitext/conf/generate/standard_conf.yaml +++ b/stopes/pipelines/bitext/conf/generate/standard_conf.yaml @@ -10,4 +10,3 @@ config: beam_search: beam: 5 file_list: [] - diff --git a/stopes/pipelines/bitext/conf/mine_indexes/base.yaml b/stopes/pipelines/bitext/conf/mine_indexes/base.yaml index fad639b..fcbc596 100644 --- a/stopes/pipelines/bitext/conf/mine_indexes/base.yaml +++ b/stopes/pipelines/bitext/conf/mine_indexes/base.yaml @@ -1,24 +1,24 @@ _target_: stopes.modules.bitext.mining.mine_bitext_indexes.MineBitextIndexesModule config: - src_lang: ??? - tgt_lang: ??? - index_type: ??? + src_lang: ??? + tgt_lang: ??? + index_type: ??? - #set later in pipeline - src2tgt_dist_files: ??? - src2tgt_index_files: ??? - tgt2src_dist_files: ??? - tgt2src_index_files: ??? + #set later in pipeline + src2tgt_dist_files: ??? + src2tgt_index_files: ??? + tgt2src_dist_files: ??? + tgt2src_index_files: ??? 
- output_dir: ??? - knn_dist: 16 - src_k: 16 - tgt_k: 16 - k_extract: 1 - margin_type: ratio - mine_type: union - sort_neighbors: False - margin_norm: mean - num_probe: 128 - gpu_type: fp16-shard - mine_threshold: 1.06 + output_dir: ??? + knn_dist: 16 + src_k: 16 + tgt_k: 16 + k_extract: 1 + margin_type: ratio + mine_type: union + sort_neighbors: False + margin_norm: mean + num_probe: 128 + gpu_type: fp16-shard + mine_threshold: 1.06 diff --git a/stopes/pipelines/bitext/conf/mine_sentences/base.yaml b/stopes/pipelines/bitext/conf/mine_sentences/base.yaml index a6899b7..7efaf0c 100644 --- a/stopes/pipelines/bitext/conf/mine_sentences/base.yaml +++ b/stopes/pipelines/bitext/conf/mine_sentences/base.yaml @@ -1,15 +1,15 @@ _target_: stopes.modules.bitext.mining.mine_bitext_sentences.MineBitextSentencesModule config: - src_lang: ??? - tgt_lang: ??? - src_text_files: ??? - src_meta_files: ??? - tgt_text_files: ??? - tgt_meta_files: ??? - alignment_file: ??? # the mined indexes, without npz extension - data: ??? - output_dir: mine.${data.data_version} - mine_threshold: 1.04 - score_max: 1.25 - dedup_bitexts: True - compress_output: True + src_lang: ??? + tgt_lang: ??? + src_text_files: ??? + src_meta_files: ??? + tgt_text_files: ??? + tgt_meta_files: ??? + alignment_file: ??? # the mined indexes, without npz extension + data: ??? + output_dir: mine.${data.data_version} + mine_threshold: 1.04 + score_max: 1.25 + dedup_bitexts: True + compress_output: True diff --git a/stopes/pipelines/bitext/conf/moses_filter/standard_conf.yaml b/stopes/pipelines/bitext/conf/moses_filter/standard_conf.yaml index 47d6255..1d550b5 100644 --- a/stopes/pipelines/bitext/conf/moses_filter/standard_conf.yaml +++ b/stopes/pipelines/bitext/conf/moses_filter/standard_conf.yaml @@ -1,4 +1,4 @@ output_dir: ??? 
-filter_ratio: 2.5 # bitexts filtering using Moses's lean-corpus-n.perl +filter_ratio: 2.5 # bitexts filtering using Moses's lean-corpus-n.perl filter_min: 1 filter_max: 250 # this means that lines longer than 250 will be deleted diff --git a/stopes/pipelines/bitext/conf/nmt_bitext_eval.yaml b/stopes/pipelines/bitext/conf/nmt_bitext_eval.yaml index 1c4da85..b1cb791 100644 --- a/stopes/pipelines/bitext/conf/nmt_bitext_eval.yaml +++ b/stopes/pipelines/bitext/conf/nmt_bitext_eval.yaml @@ -9,7 +9,6 @@ defaults: - eval: generate_multi_bleu_detok - _self_ - src_lang: ??? tgt_lang: ??? # this is the bitext being evaluated. It should have 3 columns: (score, src, tgt) diff --git a/stopes/pipelines/bitext/conf/preset/demo.yaml b/stopes/pipelines/bitext/conf/preset/demo.yaml index 5f32f2c..93d21e5 100644 --- a/stopes/pipelines/bitext/conf/preset/demo.yaml +++ b/stopes/pipelines/bitext/conf/preset/demo.yaml @@ -42,7 +42,7 @@ train_index: calculate_distances: config: gpu_memory_gb: 32 - gpu_type: "" # don't use gpu + gpu_type: "" # don't use gpu # Provides info about the data. A lot of this is used to generate nice output # file names. diff --git a/stopes/pipelines/bitext/conf/spm/train/standard_conf.yaml b/stopes/pipelines/bitext/conf/spm/train/standard_conf.yaml index 52fedad..81a2964 100644 --- a/stopes/pipelines/bitext/conf/spm/train/standard_conf.yaml +++ b/stopes/pipelines/bitext/conf/spm/train/standard_conf.yaml @@ -7,7 +7,7 @@ config: character_coverage: 0.999995 model_type: "unigram" shuffle_input_sentence: True - num_threads : 20 + num_threads: 20 train_data_file: ??? 
# optional value; if passed as empty, will be auto set based on train_data_file name model_prefix_spm: "" diff --git a/stopes/pipelines/bitext/conf/train_spm/standard_conf.yaml b/stopes/pipelines/bitext/conf/train_spm/standard_conf.yaml index 1c2ce16..c5d385c 100644 --- a/stopes/pipelines/bitext/conf/train_spm/standard_conf.yaml +++ b/stopes/pipelines/bitext/conf/train_spm/standard_conf.yaml @@ -7,7 +7,7 @@ seed_sentencepiece_size: 5_000_000 character_coverage: 0.999995 model_type: "unigram" shuffle_input_sentence: True -num_threads : 20 +num_threads: 20 train_data_file: ??? # optional value; if passed as empty, will be auto set based on train_data_file name model_prefix_spm: "" diff --git a/stopes/pipelines/distillation/conf/bitext_clean/default.yaml b/stopes/pipelines/distillation/conf/bitext_clean/default.yaml index 1d56c56..fc361f1 100644 --- a/stopes/pipelines/distillation/conf/bitext_clean/default.yaml +++ b/stopes/pipelines/distillation/conf/bitext_clean/default.yaml @@ -2,7 +2,7 @@ language_script_filename: ${mono_pipeline.language_script_filename} split_language_equivalences_filename: ${mono_pipeline.split_language_equivalences_filename} # used for sentence splitting -split_algo: ${mono_pipeline.split_algo} +split_algo: ${mono_pipeline.split_algo} local_tmp_dir: ${local_tmp_dir} dist_tmp_dir: ${dist_tmp_dir} @@ -15,8 +15,8 @@ bitext_processor: local_tmp_dir: ${..local_tmp_dir} _version: 0.3 -outfile_prefix: '' -outfile_postfix: '' +outfile_prefix: "" +outfile_postfix: "" output_dir: ??? 
filter: diff --git a/stopes/pipelines/distillation/conf/dedup/default.yaml b/stopes/pipelines/distillation/conf/dedup/default.yaml index a809bac..609fa83 100644 --- a/stopes/pipelines/distillation/conf/dedup/default.yaml +++ b/stopes/pipelines/distillation/conf/dedup/default.yaml @@ -17,5 +17,5 @@ field_def: 6 # on what field do we merge; update this to 7 if you have an lm sco process_locally: false # ignore: if it's faster to read locally, copy all files to tmp_dir first resort_files: false # ignore -glob: '' # ignore +glob: "" # ignore wandb: null # ignore diff --git a/stopes/pipelines/distillation/conf/distillation.yaml b/stopes/pipelines/distillation/conf/distillation.yaml index 9fd6af5..b9e9d96 100644 --- a/stopes/pipelines/distillation/conf/distillation.yaml +++ b/stopes/pipelines/distillation/conf/distillation.yaml @@ -26,7 +26,7 @@ launcher: # update this if you are not running locally, used in dedup tmp_dir: /tmp # update this to be a path on a shared partition if running on slurm -merge_dir: /tmp/merge +merge_dir: /tmp/merge # update this if you are not running locally (used in mono_pipeline cleaning and bitext clean) local_tmp_dir: ${tmp_dir} @@ -39,28 +39,28 @@ lid: model_file: # path to LID model - latest_models_path: ??? + latest_models_path: ??? # optional - + probability_threshold: 0.5 lang_thresholds: - fuv: 0.3 - bis: 0.3 - ewe: 0.2 - fon: 0.2 - kam: 0.3 - kur: 0.2 - lua: 0.4 - pag: 0.4 - sag: 0.3 - ssw: 0.3 - tso: 0.4 - umb: 0.3 - vec: 0.4 - war: 0.4 - yor: 0.4 - diq: 0.4 + fuv: 0.3 + bis: 0.3 + ewe: 0.2 + fon: 0.2 + kam: 0.3 + kur: 0.2 + lua: 0.4 + pag: 0.4 + sag: 0.3 + ssw: 0.3 + tso: 0.4 + umb: 0.3 + vec: 0.4 + war: 0.4 + yor: 0.4 + diq: 0.4 label_unk: __label__unk @@ -75,4 +75,4 @@ vocab_file_path: ??? spm_model_path: ??? 
# update if you use wandb -wandb: null +wandb: null diff --git a/stopes/pipelines/distillation/conf/launcher b/stopes/pipelines/distillation/conf/launcher index 36d1959..90730d4 120000 --- a/stopes/pipelines/distillation/conf/launcher +++ b/stopes/pipelines/distillation/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher \ No newline at end of file +../../bitext/conf/launcher diff --git a/stopes/pipelines/distillation/conf/mono_pipeline/default.yaml b/stopes/pipelines/distillation/conf/mono_pipeline/default.yaml index 4a4aa44..5f9496e 100644 --- a/stopes/pipelines/distillation/conf/mono_pipeline/default.yaml +++ b/stopes/pipelines/distillation/conf/mono_pipeline/default.yaml @@ -1,7 +1,7 @@ launcher: ${launcher} dedup: ${dedup} langs: ${src_langs} -corpus_filter: '' +corpus_filter: "" data_dir: ${mono_data_dir} output_dir: . language_script_filename: language_scripts_200.tsv diff --git a/stopes/pipelines/distillation/conf/train_fairseq/params/model/transformer.yaml b/stopes/pipelines/distillation/conf/train_fairseq/params/model/transformer.yaml index 6ac49b4..fe46cf9 100644 --- a/stopes/pipelines/distillation/conf/train_fairseq/params/model/transformer.yaml +++ b/stopes/pipelines/distillation/conf/train_fairseq/params/model/transformer.yaml @@ -20,7 +20,7 @@ weight_decay: 0.0001 criterion: label_smoothed_cross_entropy label_smoothing: 0.2 optimizer: adam -adam_betas: '(0.9, 0.98)' +adam_betas: "(0.9, 0.98)" clip_norm: 0.0 lr_scheduler: inverse_sqrt diff --git a/stopes/pipelines/eval/conf/launcher b/stopes/pipelines/eval/conf/launcher index 36d1959..90730d4 120000 --- a/stopes/pipelines/eval/conf/launcher +++ b/stopes/pipelines/eval/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher \ No newline at end of file +../../bitext/conf/launcher diff --git a/stopes/pipelines/filtering/README.md b/stopes/pipelines/filtering/README.md index 86833e4..c04537b 100644 --- a/stopes/pipelines/filtering/README.md +++ b/stopes/pipelines/filtering/README.md @@ -1,6 +1,6 @@ # NLLB 
Bitext Filtering -*NB*: This is legacy code that is older than the rest of `stopes`. It has not been +_NB_: This is legacy code that is older than the rest of `stopes`. It has not been ported yet -- do not depend on it as it will eventually be refactored. The `filter.py` pipeline applies various filters to bitext, with optional support for @@ -13,11 +13,13 @@ respectively. Please consult the help of those scripts (by running them with `-h learn more about how to configure them. A basic run using default parameters might look like this: + ``` python filter.py \ output_dir=/home/$USER/filter_test \ data_conf_dir=/home/$USER/data_conf ``` + This command will run using the output directory and `data_conf_dir` directory (the location where the `populate_data_conf.py` and `compute_length_factors.py` scripts output their configuration files) as specified above, and will additionally load the @@ -29,6 +31,7 @@ When needing to run a new filtering job with many parameter overrides, instead o manually overriding parameters on the command line it is better to create an entirely new config file, e.g. `conf/my_config.yaml`, containing all overrides. The script can then be instructed to load it as follows: + ``` python filter.py \ --config-name=my_config \ diff --git a/stopes/pipelines/monolingual/README.md b/stopes/pipelines/monolingual/README.md index 0863f6a..e1ae265 100644 --- a/stopes/pipelines/monolingual/README.md +++ b/stopes/pipelines/monolingual/README.md @@ -16,6 +16,7 @@ The core filtering is in `monolingual_line_processor.py` and `utils/text_filter. `python monolingual_pipeline.py data_dir=yourdatahere langs='[umb,ssw]'` should be enough to get it running. 
+ - `data_dir` is where the raw data is, should have subfolders per lang and files named with the pattern corpus_name.lang.xz - `langs` an array of langs to process in this run diff --git a/stopes/pipelines/monolingual/conf/dedup/dedup_files.yaml b/stopes/pipelines/monolingual/conf/dedup/dedup_files.yaml index 16e88f0..1d197c7 120000 --- a/stopes/pipelines/monolingual/conf/dedup/dedup_files.yaml +++ b/stopes/pipelines/monolingual/conf/dedup/dedup_files.yaml @@ -1 +1 @@ -../dedup_files.yaml \ No newline at end of file +../dedup_files.yaml diff --git a/stopes/pipelines/monolingual/conf/launcher b/stopes/pipelines/monolingual/conf/launcher index 36d1959..90730d4 120000 --- a/stopes/pipelines/monolingual/conf/launcher +++ b/stopes/pipelines/monolingual/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher \ No newline at end of file +../../bitext/conf/launcher diff --git a/stopes/pipelines/monolingual/conf/monolingual.yaml b/stopes/pipelines/monolingual/conf/monolingual.yaml index f318e7d..b9fd2ba 100644 --- a/stopes/pipelines/monolingual/conf/monolingual.yaml +++ b/stopes/pipelines/monolingual/conf/monolingual.yaml @@ -4,7 +4,7 @@ defaults: - _self_ langs: ??? -corpus_filter: '' +corpus_filter: "" data_dir: ??? output_dir: . language_script_filename: language_scripts_200.tsv @@ -41,26 +41,25 @@ lid: latest_models_path: ??? 
probability_threshold: 0.5 lang_thresholds: - fuv: 0.3 - bis: 0.3 - ewe: 0.2 - fon: 0.2 - kam: 0.3 - kur: 0.2 - lua: 0.4 - pag: 0.4 - sag: 0.3 - ssw: 0.3 - tso: 0.4 - umb: 0.3 - vec: 0.4 - war: 0.4 - yor: 0.4 - diq: 0.4 + fuv: 0.3 + bis: 0.3 + ewe: 0.2 + fon: 0.2 + kam: 0.3 + kur: 0.2 + lua: 0.4 + pag: 0.4 + sag: 0.3 + ssw: 0.3 + tso: 0.4 + umb: 0.3 + vec: 0.4 + war: 0.4 + yor: 0.4 + diq: 0.4 label_unk: __label__unk - preprocess_buffer_size: 10_000 preproces_requirements: nodes: 1 diff --git a/stopes/pipelines/prepare_data/README.md b/stopes/pipelines/prepare_data/README.md index 917d888..98c195d 100644 --- a/stopes/pipelines/prepare_data/README.md +++ b/stopes/pipelines/prepare_data/README.md @@ -4,19 +4,19 @@ This pipeline takes in the filtered corpora text files (can be compressed), trai ## Input Config: -* fold: train, train_mining, train_mmt_bt, train_smt_bt, valid, test are possible options -* lang_dir: language direction +- fold: train, train_mining, train_mmt_bt, train_smt_bt, valid, test are possible options +- lang_dir: language direction corpora: `CorporaConfig` - : - : - : - src: - tgt: - metadata: (optional) - ... - ... - ... +: +: +: +src: +tgt: +metadata: (optional) +... +... +... Specify paths to src, tgt, and optionally metadata files per (fold, lang_dir) for each corpus. preprocessing: `PreprocessingConfig` @@ -37,17 +37,18 @@ How to launch your jobs? locally or submitit ## Run Command: Please override the default config options as required. + ``` python stopes/pipelines/prepare_data/prepare_data.py output_dir= ``` ## Pipeline Breakdown -* validate: Counts the number of lines for all parallel corpora and makes sure they're the same for src & tgt and stores train line counts statistics. -* retrieve_data: Concatenates all corpora for each (fold, lang_dir), runs Moses preprocessing over each of them as per preprocessing config and saves them to the `retrieved_data` directory. 
-* build_vocab: Samples a corpus as per sampling_config and trains an SPM on the sampled corpus. We need to sample a corpus since training an SPM on all of the corpora is time consuming. This is done jointly for src, tgt directinos by default but can be done separately as well. The trained SPM, the model file and vocab file are saved in the `vocab_bin` directory
-* dedup_sharding: Deduplicates training corpora across eval corpora (valid, test) & optionally across folds as per dedup_config and shards training corpora.
-* binarize: Binarizes all the sharded files (train, eval) using `MultiProcFairSeqBinarizerEncoder` and writes them to the sharded directories in the `data_bin` directory.
+- validate: Counts the number of lines for all parallel corpora and makes sure they're the same for src & tgt and stores train line counts statistics.
+- retrieve_data: Concatenates all corpora for each (fold, lang_dir), runs Moses preprocessing over each of them as per preprocessing config and saves them to the `retrieved_data` directory.
+- build_vocab: Samples a corpus as per sampling_config and trains an SPM on the sampled corpus. We need to sample a corpus since training an SPM on all of the corpora is time consuming. This is done jointly for src, tgt directions by default but can be done separately as well. The trained SPM, the model file and vocab file are saved in the `vocab_bin` directory
+- dedup_sharding: Deduplicates training corpora across eval corpora (valid, test) & optionally across folds as per dedup_config and shards training corpora.
+- binarize: Binarizes all the sharded files (train, eval) using `MultiProcFairSeqBinarizerEncoder` and writes them to the sharded directories in the `data_bin` directory.
## Caveat diff --git a/stopes/pipelines/prepare_data/conf/launcher b/stopes/pipelines/prepare_data/conf/launcher index 36d1959..90730d4 120000 --- a/stopes/pipelines/prepare_data/conf/launcher +++ b/stopes/pipelines/prepare_data/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher \ No newline at end of file +../../bitext/conf/launcher diff --git a/stopes/pipelines/prepare_data/conf/prepare_data.yaml b/stopes/pipelines/prepare_data/conf/prepare_data.yaml index 11be2f2..13861ea 100644 --- a/stopes/pipelines/prepare_data/conf/prepare_data.yaml +++ b/stopes/pipelines/prepare_data/conf/prepare_data.yaml @@ -11,6 +11,6 @@ output_dir: ??? launcher: partition: ??? # set as null if running locally cache: - caching_dir: ${output_dir}/cache # Cache won't be re-used if you change the output_dir. + caching_dir: ${output_dir}/cache # Cache won't be re-used if you change the output_dir. corpora: ??? diff --git a/stopes/pipelines/speech/README.md b/stopes/pipelines/speech/README.md index 069f4f1..bc0f9cd 100644 --- a/stopes/pipelines/speech/README.md +++ b/stopes/pipelines/speech/README.md @@ -5,6 +5,7 @@ This pipeline takes in audio data tsv files for multiple language directions and computes LASER embeddings for each language direction using the SpeechLASER model used for mining SpeechMatrix data. For each language direction, the pipeline first splits the audio data tsv file into chunks and computes the laser embeddings for each chunk separately on a node with 1 GPU asynchronously and saves a `.embeddings` file for each chunk. The format for each input audio data tsv file is: + ``` ::\t @@ -14,11 +15,13 @@ The format for each input audio data tsv file is: ``` We run the pipeline using the command: + ``` python stopes/pipelines/speech/compute_laser_embeddings.py ``` The input config: + ``` @dataclass class LaserEmbeddingConfig: @@ -32,10 +35,11 @@ class LaserEmbeddingConfig: ``` Parameters: -* `launcher`: Config for the Stopes launcher, either `submitit` or `local`. 
Make sure you specify the partition for the launcher if you're using the `submitit` launcher. -* `max_tokens`: Determines the effective batch size for feeding in the audio waveforms. Needs to be tuned to make sure we don't OOM on the GPU. -* `checkpoint_dir`: Path to the checkpoint directory of the SpeechLASER models. -* `data_dir`: Path to the audio data tsv files in the format `_.tsv`. -* `num_chunks`: number of chunks to split the audio data tsv files. -* `lang_dirs`: comma separated string of language directions. Ex: `hr-en,ro-en,es-en` -* `out_dir`: Path to the output directory to save the embedding files. + +- `launcher`: Config for the Stopes launcher, either `submitit` or `local`. Make sure you specify the partition for the launcher if you're using the `submitit` launcher. +- `max_tokens`: Determines the effective batch size for feeding in the audio waveforms. Needs to be tuned to make sure we don't OOM on the GPU. +- `checkpoint_dir`: Path to the checkpoint directory of the SpeechLASER models. +- `data_dir`: Path to the audio data tsv files in the format `_.tsv`. +- `num_chunks`: number of chunks to split the audio data tsv files. +- `lang_dirs`: comma separated string of language directions. Ex: `hr-en,ro-en,es-en` +- `out_dir`: Path to the output directory to save the embedding files. diff --git a/stopes/pipelines/speech/conf/compute_laser_embeddings.yaml b/stopes/pipelines/speech/conf/compute_laser_embeddings.yaml index 69d6e6e..da4323c 100644 --- a/stopes/pipelines/speech/conf/compute_laser_embeddings.yaml +++ b/stopes/pipelines/speech/conf/compute_laser_embeddings.yaml @@ -5,4 +5,4 @@ defaults: launcher: partition: ??? # set as null if running locallyc cache: - caching_dir: ${out_dir}/cache # Cache won't be re-used if you change the out_dir. + caching_dir: ${out_dir}/cache # Cache won't be re-used if you change the out_dir. 
diff --git a/stopes/pipelines/speech/conf/launcher b/stopes/pipelines/speech/conf/launcher index 36d1959..90730d4 120000 --- a/stopes/pipelines/speech/conf/launcher +++ b/stopes/pipelines/speech/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher \ No newline at end of file +../../bitext/conf/launcher diff --git a/stopes/pipelines/tests/conf/embed_text/test_numbers_encoder.yaml b/stopes/pipelines/tests/conf/embed_text/test_numbers_encoder.yaml index 57d58df..7b94775 100644 --- a/stopes/pipelines/tests/conf/embed_text/test_numbers_encoder.yaml +++ b/stopes/pipelines/tests/conf/embed_text/test_numbers_encoder.yaml @@ -20,4 +20,4 @@ config: lang: ??? shards: ??? lang_shard_name: None - launcher: ${launcher} + launcher: ${launcher} diff --git a/stopes/pipelines/translate/conf/example.yaml b/stopes/pipelines/translate/conf/example.yaml index 0b83371..2449a5c 100644 --- a/stopes/pipelines/translate/conf/example.yaml +++ b/stopes/pipelines/translate/conf/example.yaml @@ -9,12 +9,12 @@ generation: output_dir: ??? 
preserve_filenames: true file_list: - - [ "/data/monolingual.arb_Arab.txt", "arb_Arab", "arb_Latn"] - - [ "/data/monolingual.arb_Arab.txt", "arb_Arab", "ary_Arab"] - - [ "/data/monolingual.arb_Arab.txt", "arb_Arab", "ajp_Arab"] - - [ "/data/monolingual.arb_Arab.txt", "arb_Arab", "apc_Arab"] - - [ "/data/monolingual.ajp_Arab.txt", "ajp_Arab", "arb_Arab"] - - [ "/data/monolingual.ajp_Arab.txt", "ajp_Arab", "arb_Latn"] - - [ "/data/monolingual.ajp_Arab.txt", "ajp_Arab", "ary_Arab"] - - [ "/data/monolingual.ajp_Arab.txt", "ajp_Arab", "apc_Arab"] + - ["/data/monolingual.arb_Arab.txt", "arb_Arab", "arb_Latn"] + - ["/data/monolingual.arb_Arab.txt", "arb_Arab", "ary_Arab"] + - ["/data/monolingual.arb_Arab.txt", "arb_Arab", "ajp_Arab"] + - ["/data/monolingual.arb_Arab.txt", "arb_Arab", "apc_Arab"] + - ["/data/monolingual.ajp_Arab.txt", "ajp_Arab", "arb_Arab"] + - ["/data/monolingual.ajp_Arab.txt", "ajp_Arab", "arb_Latn"] + - ["/data/monolingual.ajp_Arab.txt", "ajp_Arab", "ary_Arab"] + - ["/data/monolingual.ajp_Arab.txt", "ajp_Arab", "apc_Arab"] # format is [path, src_lang, tgt_lang] diff --git a/stopes/pipelines/translate/conf/launcher b/stopes/pipelines/translate/conf/launcher index 36d1959..90730d4 120000 --- a/stopes/pipelines/translate/conf/launcher +++ b/stopes/pipelines/translate/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher \ No newline at end of file +../../bitext/conf/launcher diff --git a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx index 504de22..6d6faf4 100644 --- a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx +++ b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx @@ -121,14 +121,14 @@ function useFileNavigate() { } const Files = (): JSX.Element => { - const [displayHelper, setDisplayHelper] = useState(false); + const [displayHelper, setDisplayHelper] = useState(false); const navigate = useFileNavigate(); let { filename, pageNumber, 
numberLines, files, audioBlob, error } = useLoaderData() as LoaderReturn; const [newFilename, setNewFilename] = useState( filename || config.default_path ); - + // if we have a location, we are in a transition between two urls const navigation = useNavigation(); const locationParams = parseLocation(navigation.location); @@ -171,14 +171,13 @@ const Files = (): JSX.Element => { // Add new function to handle paste events const fileInputHandlePaste = useCallback( (evt) => { - const pastedData = evt.clipboardData.getData('text'); + const pastedData = evt.clipboardData.getData("text"); setNewFilename(pastedData); navigate(pastedData, pageNumber, numberLines); }, [navigate, pageNumber, numberLines] ); - return (
diff --git a/stopes/ui/seamlisten/react_app/src/e2e.test.js b/stopes/ui/seamlisten/react_app/src/e2e.test.js index 7ca2301..434a0f2 100644 --- a/stopes/ui/seamlisten/react_app/src/e2e.test.js +++ b/stopes/ui/seamlisten/react_app/src/e2e.test.js @@ -1,8 +1,8 @@ -const puppeteer = require('puppeteer'); +const puppeteer = require("puppeteer"); jest.setTimeout(40000); -describe('File Viewer', () => { +describe("File Viewer", () => { let browser; let page; @@ -16,32 +16,36 @@ describe('File Viewer', () => { }); beforeEach(async () => { - await page.goto('http://localhost:3000/'); + await page.goto("http://localhost:3000/"); }); - test('should display the filename input, fetch button and help button', async () => { - const filenameInput = await page.waitForSelector('.form-control.form-control-sm'); - const fetchButton = await page.waitForSelector('.btn.btn-primary', { text: 'Fetch!' }); - const helpButton = await page.waitForSelector('button[aria-controls="help-text"]'); + test("should display the filename input, fetch button and help button", async () => { + const filenameInput = await page.waitForSelector( + ".form-control.form-control-sm" + ); + const fetchButton = await page.waitForSelector(".btn.btn-primary", { + text: "Fetch!", + }); + const helpButton = await page.waitForSelector( + 'button[aria-controls="help-text"]' + ); expect(filenameInput).not.toBeNull(); expect(fetchButton).not.toBeNull(); expect(helpButton).not.toBeNull(); - }); - test('should display the help text when the button is clicked', async () => { + test("should display the help text when the button is clicked", async () => { // Click the help button await page.click('button[aria-controls="help-text"]'); // Wait for the help text to appear - await page.waitForSelector('#help-text', { visible: true }); + await page.waitForSelector("#help-text", { visible: true }); // Assert that the help text is visible - const helpTextVisible = await page.$eval('#help-text', (element) => { - return 
getComputedStyle(element).display !== 'none'; + const helpTextVisible = await page.$eval("#help-text", (element) => { + return getComputedStyle(element).display !== "none"; }); expect(helpTextVisible).toBe(true); }); - }); diff --git a/website/docs/eval/alti.md b/website/docs/eval/alti.md index 9d7867d..77b5b01 100644 --- a/website/docs/eval/alti.md +++ b/website/docs/eval/alti.md @@ -1,24 +1,19 @@ # ALTI+ -ALTI+ is a tool for inspecting token contributions in a transformer encoder-decoder model. -It might be useful for detecting hallucinated translations or undertranslations. +ALTI+ is a tool for inspecting token contributions in a transformer encoder-decoder model. It might be useful for detecting hallucinated translations or undertranslations. -This repository is based on the code from the paper [Ferrando et al., 2022](https://arxiv.org/abs/2205.11631). -The original code is located at https://github.com/mt-upc/transformer-contributions-nmt. -It is licensed under the Apache 2.0 license included in the current directory. +This repository is based on the code from the paper [Ferrando et al., 2022](https://arxiv.org/abs/2205.11631). The original code is located at https://github.com/mt-upc/transformer-contributions-nmt. It is licensed under the Apache 2.0 license included in the current directory. -We have made a few adaptation to the code so that it can run with the dense NLLB-200 models. -The code in this directory is licensed both under the Apache 2.0 license of the original code (in the current directory), -and under the MIT license of the whole project (in the parent directory). +We have made a few adaptation to the code so that it can run with the dense NLLB-200 models. The code in this directory is licensed both under the Apache 2.0 license of the original code (in the current directory), and under the MIT license of the whole project (in the parent directory). 
# Usage -An instruction for setting up the environment and computing ALTI+ token contributions from an NLLB model -with a command line interface is present in the folder `demo/alti`. -Below is another example, that uses a bilingual model and the Python interface. -Here is how you can run it: +An instruction for setting up the environment and computing ALTI+ token contributions from an NLLB model with a command line interface is present in the folder `demo/alti`. + +Below is another example, that uses a bilingual model and the Python interface. Here is how you can run it: 1. Prepare the environment by installing Fairseq and Stopes: + ``` pip install fairseq==0.12.1 git clone https://github.com/facebookresearch/stopes.git @@ -27,17 +22,17 @@ pip install -e '.[alti]' ``` 2. Download the model and dictionary from https://github.com/deep-spin/hallucinations-in-nmt: - - model: https://www.mediafire.com/file/mp5oim9hqgcy8fb/checkpoint_best.tar.xz/file - - data: https://www.mediafire.com/file/jfl7y6yu7jqwwhv/wmt18_de-en.tar.xz/file -3. Run the following commands to unpack the data: -```tar -xvf checkpoint_best.tar.xz && tar -xvf wmt18_de-en.tar.xz``` + - model: https://www.mediafire.com/file/mp5oim9hqgcy8fb/checkpoint_best.tar.xz/file + - data: https://www.mediafire.com/file/jfl7y6yu7jqwwhv/wmt18_de-en.tar.xz/file +3. Run the following commands to unpack the data: `tar -xvf checkpoint_best.tar.xz && tar -xvf wmt18_de-en.tar.xz` 4. 
Run the following command to download the tokenizers: + ``` wget https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.model wget https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.vocab ``` -Now you can run the following Python code to look at the ALTI analysis: +Now you can run the following Python code to look at the ALTI analysis: ```Python from stopes.eval.alti.wrappers.transformer_wrapper import FairseqTransformerHub @@ -77,7 +72,9 @@ print(compute_alti_metrics(*compute_alti_nllb(hub, src, tgt2))['avg_sc']) # 0.4 ``` # Citation + If you use ALTI+ in your work, please consider citing: + ```bibtex @inproceedings{alti_plus, title = {Towards Opening the Black Box of Neural Machine Translation: Source and Target Interpretations of the Transformer}, diff --git a/website/docs/eval/blaser.md b/website/docs/eval/blaser.md index 5d7d8ce..f005d1b 100644 --- a/website/docs/eval/blaser.md +++ b/website/docs/eval/blaser.md @@ -45,6 +45,7 @@ You will need to pass three sets of speech segments to get a blaser score: - the reference audio (`ref`) The set of speech segments have to be organised in a tsv manifest pointing to the audio files. The format for each input audio data tsv file is: + ``` \t @@ -70,8 +71,8 @@ python -m stopes.pipelines.eval.eval_blaser output_dir=YOUROUTPUTDIRECTORY src_m where `src_lang` is the language of your source audio and tgt_lang is the target language. This is used to lookup the correct encoder model as specified by `stopes/pipelines/eval/conf/eval_blaser.yaml`. You can download pre-trained encoders from the [SpeechMatrix project](https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_matrix/speech_laser_encoders.md). By default, the encoder used for the reference is the same as the target one, you can override this with `ref_lang=..` in the command arguments. 
- ## Citation + If you use `blaser` in your work or any of its models, please cite: ```bibtex @@ -86,4 +87,5 @@ If you use `blaser` in your work or any of its models, please cite: ``` ## License + `blaser` is MIT licensed, as found in the LICENSE file in the root directory. diff --git a/website/docs/pipelines/distillation.md b/website/docs/pipelines/distillation.md index 765f4c9..f5afaed 100644 --- a/website/docs/pipelines/distillation.md +++ b/website/docs/pipelines/distillation.md @@ -5,6 +5,7 @@ sidebar_position: 3 # NLLB Distillation Pipeline Welcome to `stopes`, and thanks for checking out our sequence-level knowledge distillation pipeline. This is a quick start guide which walks through how to run the pipeline yourself and what the expected outputs will be from each step. The logic of the pipeline is at a high level as follows: + 1. cleans pre-downloaded monolingual data (see [STOPES monolingual pipeline](https://github.com/fairinternal/nllb/blob/main/website/docs/pipelines/monolingual.md#nllb-monolingual-pipeline)) - results in one merged file of data for each source language 2. shards each source language file from previous step into as many shards as number of specified target languages 3. generates target language translations for each shard from previous step using Fairseq Generate @@ -14,17 +15,16 @@ Welcome to `stopes`, and thanks for checking out our sequence-level knowledge di ## To run: -First, fill out any missing fields in distillation.yaml (labeled ???). Then, -`python stopes/pipelines/distillation/distillation_pipeline.py` should be enough to get it running. +First, fill out any missing fields in distillation.yaml (labeled ???). Then, `python stopes/pipelines/distillation/distillation_pipeline.py` should be enough to get it running. 
-You can also override distillation.yaml fields manually through the CLI as such: -`python stopes/pipeliens/distillation/distillation_pipeline.py src_langs="[eng,mai]" tgt_langs="[fra,deu]" mono_data_dir= output_dir=`. +You can also override distillation.yaml fields manually through the CLI as such: `python stopes/pipeliens/distillation/distillation_pipeline.py src_langs="[eng,mai]" tgt_langs="[fra,deu]" mono_data_dir= output_dir=`. For internal FAIR users, feel free to add the `+fb_preset=nllb` argument to the CLI command to use some preset config settings. Note: Testing performance can be done with a separate STOPES module, `/stopes/modules/evaluation/generate_multi_bleu_detok_module.py`. ## Useful overrides + - `src_langs` is an array of source languages you have pre-downloaded monolingual data for - `tgt_langs` is an array of target languages you want to train the student model to translate to - `mono_data_dir` is the path to pre-downloaded monolingual data @@ -42,6 +42,7 @@ Please be aware that at every intermediate step, the program will overwrite file The run will be started with a custom working directory that follows the pattern: `outputs/{date}/{start_time}`, all the logs will go there (including executor_logs from slurm jobs). By default, the data output is set in `distillation.yaml` to be `output_dir: .` this means that the outputs will go to the working directory and will go to different places depending on the day/time you start the run. This is useful for testing, but if you want to output somewhere else (like a central clean monolingual repo), override the `output_dir=/somethingstable/` when starting the run. ### Raw input monolingual file: + ``` ~/test_inputs/eng % cat test.eng @@ -53,15 +54,8 @@ BlackBerry Z10 To Launch In South Africa Tomorrow - Blackberry Empire http://www ``` ### Example file output of monolingual_pipeline before dedup: -Parsed in column format: - 1. self.corpus, # the original corpus name - 2. 
self.offset_start, # skip that many bytes (use dd) - 3. line_id, # after skipping, go to line - 4. line_hash, # xxhash.xxh3_64 of the original line/paragrph - 5. f"{prob_lang:.5f}", # lid score - 6. clean, # sentence - # config - sep="\t" + +Parsed in column format: 1. self.corpus, # the original corpus name 2. self.offset_start, # skip that many bytes (use dd) 3. line_id, # after skipping, go to line 4. line_hash, # xxhash.xxh3_64 of the original line/paragraph 5. f"{prob_lang:.5f}", # lid score 6. clean, # sentence # config sep="\t" ``` ~/test_outputs/mono_data/eng @@ -79,6 +73,7 @@ test 0 __label__eng 0.37420 BlackBerry Z10 To Launch In South Africa Tomorrow - ``` ### Example file output of dedup + ``` % cat eng_all_dedup test 1056 0 4426603632439174366 0.71947 202-458-1769 Joie Olverson - Spring House Ln, Washington, District of Columbia @@ -90,6 +85,7 @@ test 443 0 3451732902557484365 0.83896 no down payment auto insurance in Scottsd ``` ### Example file output of shard + ``` % cat shard.000 test 1056 0 4426603632439174366 0.71947 202-458-1769 Joie Olverson - Spring House Ln, Washington, District of Columbia @@ -101,7 +97,9 @@ test 443 0 3451732902557484365 0.83896 no down payment auto insurance in Scottsd ``` ### Example file output of generate + Target generated data: + ``` test 1056 0 4426603632439174366 0.71947 202-458-1769 Joie Olverson - Spring House Ln, Washington, District de Columbia test 692 0 8327890826167111651 0.83095 Une question de priorités: réforme démocratique et reprise économique en Allemagne d'après-guerre Auteur: Rebecca L. 
Boehling TiersD'occasion8,25€202,25€ @@ -112,7 +110,9 @@ test 443 0 3451732902557484365 0,83896 aucun acompte d'assurance automobile à S ``` ### Example file output of bitext clean + The contents of the filtered `clean.eng-fra.eng.000.xz` and `clean.eng-fra.fra.000.xz` files are respectively: + ``` test 692 0 8327890826167111651 0.83095 A Question of Priorities: Democratic Reform and Economic Recovery in Postwar Germany Auteur: Rebecca L. Boehling TiersD'occasion8,25€202,25€ test 0 0 12930410217004390762 0.90479 Appealing Accent Chair And Ottoman and Petra Fabric Accent Chair With Ottoman Furniture Home Decoration - Lilangels Furniture @@ -120,6 +120,7 @@ test2 0 0 5374428323341487497 1.00001 He has a cat. test2 0 0 5374428323341487497 0.99987 Hello the president is here! test 443 0 3451732902557484365 0.83896 no down payment auto insurance in Scottsdale AZ ``` + ``` target_data 0 15 9889559120183218255 0.97691 Une question de priorités: réforme démocratique et reprise économique en Allemagne d'après-guerre Auteur: Rebecca L. 
Boehling TiersD'occasion8,25€202,25€ target_data 28 39 7358542291591603186 0.98684 Chaise d'accent attrayante et chaise d'accent du tissu ottoman et du tissu de Petra avec meubles ottomans décoration de la maison - Lilangels meubles @@ -129,14 +130,17 @@ target_data 56 78 15942782228027469307 0.97898 aucun acompte d'assurance automob ``` Meanwhile, the contents of the two discarded output files `discarded.eng-fra.eng.000.xz` and `discarded.eng-fra.fra.000.xz` are respectively: + ``` test 1056 0 4426603632439174366 0.71947 202-458-1769 Joie Olverson - Spring House Ln, Washington, District of Columbia ``` + ``` gen_shard 0 __label__eng 0.32102 202-458-1769 Joie Olverson - Spring House Ln, Washington, District de Columbia ``` ### Example file output of binarizing and encoding + ``` train.eng-fra.eng.000.bin train.eng-fra.eng.001.idx train.eng-fra.eng.003.bin train.eng-fra.eng.000.idx train.eng-fra.eng.002.bin train.eng-fra.eng.003.idx @@ -144,6 +148,7 @@ train.eng-fra.eng.001.bin train.eng-fra.eng.002.idx ``` ### Example file output of train + ``` -rw-rw-r-- 1 $USER $USER 4.2G Aug 3 12:05 checkpoint_best.pt -rw-rw-r-- 1 $USER $USER 4.2G Aug 3 12:05 checkpoint_last.pt diff --git a/website/docs/pipelines/global_mining.md b/website/docs/pipelines/global_mining.md index 8226c53..5d7bea1 100644 --- a/website/docs/pipelines/global_mining.md +++ b/website/docs/pipelines/global_mining.md @@ -8,18 +8,16 @@ sidebar_position: 1 You can launch the mining for a pair of languages with the following command: - ```bash python -m stopes.pipelines.bitext.global_mining_pipeline src_lang=fuv tgt_lang=zul demo_dir=.../stopes-repo/demo +preset=demo output_dir=. embed_text=laser2 ``` -(see the demo doc for a quick understanding of the `+preset` override) +(see the demo doc for a quick understanding of the `+preset` override) This will run the required steps and try to re-use whatever step outputs has already been computed. So if you run this exact command multiple times (e.g. 
after a pre-emption in slurm), it will start from where it failed instead of recomputing everything. Here is an example log: - ``` [global_mining][INFO] - output: ....../mining/global_mining/outputs/2021-11-02/08-56-40 [global_mining][INFO] - working dir: ....../mining/global_mining/outputs/2021-11-02/08-56-40 @@ -38,37 +36,30 @@ Here is an example log: [train_faiss_index][INFO] - lang=hi, sents=162844151, required=40000000, index type=OPQ64,IVF65536,PQ64 ``` - We can see that the launcher has found out that it doesn't need to run the encode and train index steps for the bn lang (source language) and can skip straight to populating the index with embeddings, but it also already processed 44 shards for that step, so will only re-schedule jobs for 11 shards. In parallel, it is also processing the target language (hi) and found that it still needs to run the index training step as it also recovered all the encoded shards. If you are using slurm as the launcher instead of the local setting, the pipeline also takes care of communicating with slurm, waiting for all slurm jobs to finish and synchronizing the consecutive jobs. See below on how to run single steps for debugging. You can run the whole pipeline locally with: - ```bash python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg launcher.cluster=local ``` - - # Understanding the Configuration -The configuration is driven by [Hydra](https://hydra.cc/), this makes it sound way more complicated than it actually is. The first main difference is how the command line arguments are specified. Instead of using the `--arg=foobar` standard notation, Hydra introduces its [own notation](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax) to be able to have a more complete syntax. This is indeed odd, but once you are used to it, it provides a lot of benefits. +The configuration is driven by [Hydra](https://hydra.cc/), this makes it sound way more complicated than it actually is. 
The first main difference is how the command line arguments are specified. Instead of using the `--arg=foobar` standard notation, Hydra introduces its [own notation](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax) to be able to have a more complete syntax. This is indeed odd, but once you are used to it, it provides a lot of benefits. A second big change is that most of the things that can be changed in the pipeline are driven by yaml configuration files instead of having to change the script files. These configuration files are checked in and you can override them on the command line (see the examples above). The pipeline will log the actual full config+overrides in the output folder when you do a run, so that you can always look at the config that was used to generate a particular data folder. The third major change, and main benefit, is that the configs are split in "groups" (hydra terminology) and you can override a whole group with another yaml file with a very simple syntax. For instance, the embed_text step has a set of pre-made configs in `global_mining/conf/embed_text` and you can swap between them. If you would like to make a new reusable/shared config for embed_text, you could put a new yaml file in that that folder (let say `global_mining/conf/embed_text/foobar.yaml`) and select it from the cli with: - ```bash python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg embed_text=foobar ``` - See the Data and Modules discussion below for more examples. - ## Outputs and Working Dir The output of the pipeline is set in the global_mining.yaml to be ".", which means the current working directory. When running `global_mining_pipeline.py` it will by default create a new folder under `outputs/today_date/timeofrun` and make this your working directory. This means all your logs will be well organized. 
It also means that the main output of each step will go under that directory given the default configuration of `output_dir: .` @@ -77,52 +68,44 @@ Because you might run the pipeline multiple times for the same "data run" (e.g. It's therefore a good idea when you are doing a full run (not just testing), to specify a fixed outputs directory when launching the pipeline: - ```bash python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg output_dir=/myfinal/data/outputs ``` - This way logs and other temp files will go to the working directory, but the data will go to a clean central place. - ## Data The current data configuration for the pipeline takes a few parameters: - - -* data_version -* iteration -* data_shard_dir -* shard_type -* bname +- data_version +- iteration +- data_shard_dir +- shard_type +- bname Because you will most often always use the same data for your runs, there is no need to specify this every time on the CLI or in the default config. There is a "group" under `global_mining/conf/data` where you can put common data sources. Checkout the demo config to see how to configure data. You can create a data config folder if you want to switch data without changing all other presets. - # Modules The pipeline is made of seven main steps: -* split_in_shards (optional) -* embed_text -* train_index -* populate_index -* merge_index -* calculate_distances -* mine_indexes -* mine_sentences -* merge_shards (optional) +- split_in_shards (optional) +- embed_text +- train_index +- populate_index +- merge_index +- calculate_distances +- mine_indexes +- mine_sentences +- merge_shards (optional) Each of them is configured as a "group" and their configurations can be overridden by switching groups on the cli as explained above. This override can also completely switch the code/module that is being used to compute this step, without changing the pipeline itself. 
- **Embedding Modules** You can switch the actual encoder being used to choose between multiple encoders. For example, you can choose to use LaBSE, BERT, RoBERTa, or any other model from the sentence-transformers repo within the HuggingFace Model Hub ([https://huggingface.co/sentence-transformers](https://huggingface.co/sentence-transformers)). Here’s an example of how to encode text using LaBSE (with encoder-specific options in blue): - ```bash python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg embed_text=hf_roberta_large ``` @@ -132,6 +115,7 @@ python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg embed_text=h ``` or you can choose any huggingface encoder by their name with: + ```bash python global_mining_pipeline.py -c job src_lang=bn tgt_lang=hi +data=ccg embed_text=huggingface embed_text.encoder_model=sentence-transformers/LaBSE ``` @@ -144,14 +128,14 @@ embed_text.config.encoder.encoder_model=path_to_laser_model embed_text.config.encoder.spm_model=path_to_spm_model ``` - ### Splitting and merging languages -For some large languages, the mining might fail because of out-of-memory errors, especially if the FAISS indexes are stored on GPU. To mitigate this probelm, you can split a language into shards, perform the mining on them in parallel, and then merge the results. -The first optional module, `split_in_shards`, can randomly split the language (inclusing both text files and metadata files, if they exist) into several shards. -To use this option, you should specify the parameter `max_shard_size`, and the languages with more total lines than this number will be automatically split into smaller shards. +For some large languages, the mining might fail because of out-of-memory errors, especially if the FAISS indexes are stored on GPU. To mitigate this problem, you can split a language into shards, perform the mining on them in parallel, and then merge the results. 
+ +The first optional module, `split_in_shards`, can randomly split the language (including both text files and metadata files, if they exist) into several shards. To use this option, you should specify the parameter `max_shard_size`, and the languages with more total lines than this number will be automatically split into smaller shards. Alternatively, you can manually split the data for the language and configure it as several separate "languages", e.g. `eng0,eng1,eng2`. In this case, you can indicate in the mining config that they should be merged into a single language after mining: + ``` sharded_langs: eng: @@ -170,15 +154,13 @@ One of the benefits of the hydra cli override syntax, is that you can ask hydra For instance, if you would like to run the pipeline on multiple languages, you can do: - ```bash python global_mining_pipeline.py -m src_lang=en tgt_lang=bn,hi +data=ccg ``` - The `-m` parameter tells the pipeline to start with multi-run and `tgt_lang=bn,hi` tells it to make two runs, one for en-bn and one for en-hi. - You could also sweep over the lang and the encoders with: +You could also sweep over the lang and the encoders with: 
For example, for a `eng-lin` bitext, the format is: alignment-score [tab] english-text [tab] lingala-text (not alignment-score [tab] lingala-text [tab] english-text). ## Outputs The NMT pipeline will create the following directories in the specified `output_dir`: + - `bin_dir`: moses preprocessed, spm-encoded, and binarized data. - `trained_models`: checkpoints from `fairseq-train`. **Note**: this directory will also contain files containing the outputs of both `fairseq-generate` (files ending in `.out`) and the corresponding BLEU evaluations for each checkpoint (files ending in `.bleu`). ## Evaluation data -To find the evaluation data for your chosen languages, `stopes` needs to know the relevant path. See `path` in `stopes/pipelines/bitext/conf/preproc_binarize_mined/standard_conf.yaml`. Currently it defaults to the format of the `flores200` dataset. To use this, please [download flores200](https://github.com/facebookresearch/flores/tree/main/flores200). +To find the evaluation data for your chosen languages, `stopes` needs to know the relevant path. See `path` in `stopes/pipelines/bitext/conf/preproc_binarize_mined/standard_conf.yaml`. Currently it defaults to the format of the `flores200` dataset. To use this, please [download flores200](https://github.com/facebookresearch/flores/tree/main/flores200). ## Example overrides **Spm training** -```spm.train.config.vocab_size=7000``` +`spm.train.config.vocab_size=7000` **Model configuation** diff --git a/website/docs/pipelines/monolingual.md b/website/docs/pipelines/monolingual.md index e611a4c..d640a05 100644 --- a/website/docs/pipelines/monolingual.md +++ b/website/docs/pipelines/monolingual.md @@ -20,6 +20,7 @@ The core filtering is in `monolingual_line_processor.py` and `utils/text_filter. `python monolingual_pipeline.py data_dir=yourdatahere langs='[umb,ssw]'` should be enough to get it running. 
+ - `data_dir` is where the raw data is, should have subfolders per lang and files named with the pattern corpus_name.lang.xz - `langs` an array of langs to process in this run diff --git a/website/docs/quickstart.md b/website/docs/quickstart.md index 85009b6..8caa1ea 100644 --- a/website/docs/quickstart.md +++ b/website/docs/quickstart.md @@ -4,9 +4,7 @@ sidebar_position: 1 # Getting started with mining -Welcome to `stopes`, this is a quickstart guide to discover how to run automated pipelines with `stopes`. In this example, you'll be running -global mining with the `stopes` toolchain. (Inspired by -[CCMatrix](https://ai.facebook.com/blog/ccmatrix-a-billion-scale-bitext-data-set-for-training-translation-models/)). +Welcome to `stopes`, this is a quickstart guide to discover how to run automated pipelines with `stopes`. In this example, you'll be running global mining with the `stopes` toolchain. (Inspired by [CCMatrix](https://ai.facebook.com/blog/ccmatrix-a-billion-scale-bitext-data-set-for-training-translation-models/)). ## Installation @@ -14,16 +12,11 @@ Follow the installation steps from the [project's README](https://github.com/fac ## Getting Data -To run the global mining pipeline, you first need to get some monolingual data. -The [WMT22 Shared Task: Large-Scale Machine Translation Evaluation for African -Languages](https://statmt.org/wmt22/large-scale-multilingual-translation-task.html) -has some interesting monolingual data for some African languages. +To run the global mining pipeline, you first need to get some monolingual data. The [WMT22 Shared Task: Large-Scale Machine Translation Evaluation for African Languages](https://statmt.org/wmt22/large-scale-multilingual-translation-task.html) has some interesting monolingual data for some African languages. -You also need some trained encoder, we usually use `stopes` with LASER and we can -find such trained encoders for the languages in the WMT22 shared task too. 
+You also need some trained encoder, we usually use `stopes` with LASER and we can find such trained encoders for the languages in the WMT22 shared task too. -The `demo/mining/prepare.sh` script will download the monolingual data and LASER encoders -for you. Start by running this script and wait for the download to finish. +The `demo/mining/prepare.sh` script will download the monolingual data and LASER encoders for you. Start by running this script and wait for the download to finish. :::tip @@ -33,26 +26,18 @@ for you. Start by running this script and wait for the download to finish. ## Configuring the pipeline -In `stopes` pipelines, we use [hydra](https://hydra.cc/) to configure the runs. -With hydra, you can configure everything with "overrides" on the cli, but it's -often easier to put the configurations in yaml files as there is a lot of things -to setup. +In `stopes` pipelines, we use [hydra](https://hydra.cc/) to configure the runs. With hydra, you can configure everything with "overrides" on the cli, but it's often easier to put the configurations in yaml files as there is a lot of things to setup. -`stopes/pipelines/bitext/conf/preset/demo.yaml` is a demo configuration for the -data and encoders that we've downloaded in the previous steps. Check out the -comments in that file. +`stopes/pipelines/bitext/conf/preset/demo.yaml` is a demo configuration for the data and encoders that we've downloaded in the previous steps. Check out the comments in that file. The important parts of that preset config is: + 1. we setup the launcher to run on your local computer (no need for a cluster) -2. we setup an alias for a `demo_dir` folder, so you can point to the - data/models from the cli +2. we setup an alias for a `demo_dir` folder, so you can point to the data/models from the cli 3. we setup some information about the `data`: - some naming, to get nice file names as outputs - where the data is found (with `shard_glob`) -4. 
we tell the pipeline where to find the encoder and SentencePiece model (SPM) uses - to embed the text. We do that for each lang in `lang_configs`. Practically, - if you are only processing a few languages, you don't need so many entries, - here we preset them for all languages from the WMT22 task +4. we tell the pipeline where to find the encoder and SentencePiece model (SPM) uses to embed the text. We do that for each lang in `lang_configs`. Practically, if you are only processing a few languages, you don't need so many entries, here we preset them for all languages from the WMT22 task :::tip @@ -63,30 +48,20 @@ Language codes are important, but not standardized everywhere. The `stopes` libr ## Run the Pipeline You can now start the pipeline with: + ```bash python -m stopes.pipelines.bitext.global_mining_pipeline src_lang=fuv tgt_lang=zul demo_dir=.../stopes-repo/demo/mining +preset=demo output_dir=. embed_text=laser3 ``` - `src_lang` and `tgt_lang` specify the pair of languages we want to process, -- `demo_dir` is the new variable we introduce in our preset/demo.yaml file, to - point to where the `prepare.sh` script downloaded our data; make sure to - specify an absolute path, -- `+preset=demo` tells hydra to load the demo.yaml preset file to set our - defaults (the `+` here is because we are telling hydra to append a group that - doesn't exist in the default config, see the [hydra - doc](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax) - for details), +- `demo_dir` is the new variable we introduce in our preset/demo.yaml file, to point to where the `prepare.sh` script downloaded our data; make sure to specify an absolute path, +- `+preset=demo` tells hydra to load the demo.yaml preset file to set our defaults (the `+` here is because we are telling hydra to append a group that doesn't exist in the default config, see the [hydra doc](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax) for details), - 
`output_dir` specifies where we want the output (current run directory), -- `embed_text=laser3` tells the pipeline to use the laser3 encoding code to load - the models and encode the text. +- `embed_text=laser3` tells the pipeline to use the laser3 encoding code to load the models and encode the text. ## Try using a different encoder -In the previous run, we used `embed_text=laser3`, which will encode text with -the language specific laser3, but you can also use other encoders. For instance, -stopes ships with [HuggingFace -sentence-transformers](https://huggingface.co/sentence-transformers), so you can -use different encoders if you want to experiment. +In the previous run, we used `embed_text=laser3`, which will encode text with the language specific laser3, but you can also use other encoders. For instance, stopes ships with [HuggingFace sentence-transformers](https://huggingface.co/sentence-transformers), so you can use different encoders if you want to experiment. You need to install `sentence-transformers` in your environment: @@ -105,6 +80,7 @@ python -m stopes.pipelines.bitext.global_mining_pipeline src_lang=fuv tgt_lang=z ## Explore More Check out these docs to learn more: + - [Prebuilt Pipelines](category/prebuilt-pipelines) - [`stopes` Module Framework](stopes) diff --git a/website/docs/stopes/advanced/checkpointing.md b/website/docs/stopes/advanced/checkpointing.md index 72ee42f..16edfe9 100644 --- a/website/docs/stopes/advanced/checkpointing.md +++ b/website/docs/stopes/advanced/checkpointing.md @@ -4,9 +4,4 @@ sidebar_position: 1 # Checkpointing (advanced) -When using SLURM, the StopesModule system uses submitit to schedule the jobs. -This means that you can leverage the checkpointing feature it offers. This -allows you to store the state of the current module when its job gets preempted -or times out. See the [submitit -doc](https://github.com/facebookincubator/submitit/blob/main/docs/checkpointing.md) -for more details. 
+When using SLURM, the StopesModule system uses submitit to schedule the jobs. This means that you can leverage the checkpointing feature it offers. This allows you to store the state of the current module when its job gets preempted or times out. See the [submitit doc](https://github.com/facebookincubator/submitit/blob/main/docs/checkpointing.md) for more details. diff --git a/website/docs/stopes/advanced/debugging.md b/website/docs/stopes/advanced/debugging.md index ae3c33c..529287e 100644 --- a/website/docs/stopes/advanced/debugging.md +++ b/website/docs/stopes/advanced/debugging.md @@ -6,45 +6,29 @@ sidebar_position: 3 You can launch an individual module with: - ```bash python launch.py +module=my_module ``` +Where `my_module` is the name of the config file you want to use. This is useful for debugging, usually. -Where `my_module` is the name of the config file you want to use. This is useful -for debugging, usually. - -The launcher is configured in -`global_mining/conf/main_conf.yaml` +The launcher is configured in `global_mining/conf/main_conf.yaml` You should not have to change this config to run your module. -The `+module= `argument is the way to tell hydra to pick up your module config -file. The launcher will use the `_target_` directive in the module to initialize -the correct module and then pass the right config. -You can override any part of the configuration with [normal hydra -overrides](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax). -For example, if you wanted to specify the lang parameter for your module, you -can do: +The `+module=` argument is the way to tell hydra to pick up your module config file. The launcher will use the `_target_` directive in the module to initialize the correct module and then pass the right config. +You can override any part of the configuration with [normal hydra overrides](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax). 
For example, if you wanted to specify the lang parameter for your module, you can do: ```bash python launch.py +module=my_config module.config.lang=luo ``` +The `module/my_config.yaml` file will be loaded and then the lang will be overridden. This will create a new config for you. -The `module/my_config.yaml` file will be loaded and then the lang will be -overridden. This will create a new config for you. -The launcher will then run your module and dump the full config (with overrides) -in the outputs folder. -To do more advanced debugging, remember that a module is just a normal python -object that you can run as any python callable. You can therefore just call -your module from the REPL or a notebook with: +The launcher will then run your module and dump the full config (with overrides) in the outputs folder. +To do more advanced debugging, remember that a module is just a normal python object that you can run as any python callable. You can therefore just call your module from the REPL or a notebook with: ```python module = MyModule(config) diff --git a/website/docs/stopes/advanced/dynamic.md b/website/docs/stopes/advanced/dynamic.md index 93ebe50..8c2bfe7 100644 --- a/website/docs/stopes/advanced/dynamic.md +++ b/website/docs/stopes/advanced/dynamic.md @@ -4,60 +4,40 @@ sidebar_position: 2 # Dynamic Initializing Modules (advanced) -It is easy to initialize a module like a normal python class with` -MyModule(config)`. However this would make your pipeline static as the module -couldn't be swapped. +It is easy to initialize a module like a normal python class with `MyModule(config)`. However this would make your pipeline static as the module couldn't be swapped. +**Problem:** For instance, imagine that your pipeline has an _embed_ step that takes raw text as input and outputs an embedding of that text. -**Problem:** For instance, imagine that your pipeline has an _embed_ step that -takes raw text as input and outputs an embedding of that text. 
+You might want to test different embedding methods, let's say, compare the LASER implementation with a HuggingFace encoder. -You might want to test different embedding methods, let's say, compare the LASER -implementation with a HuggingFace encoder. - -If you wrote your code with `encoder = LaserEncoderModule(config)` you will not -be able to swap this step to use the `HFEncoderModule` without changing the code -of your pipeline. - -**Solution**: Because we are using Hydra, we have an easy way to specify modules -in config and override them when calling the pipeline. All you have to do is to -use: +If you wrote your code with `encoder = LaserEncoderModule(config)` you will not be able to swap this step to use the `HFEncoderModule` without changing the code of your pipeline. +**Solution**: Because we are using Hydra, we have an easy way to specify modules in config and override them when calling the pipeline. All you have to do is to use: ```python embed_module = StopesModule.build(self.config.embed_text, lang=lang) ``` +The `build` helper will find the `_target_` entry in the embed_text config and initialize the module that it points to. The `kwargs` of build can be used to specify in code a specific value of the config. -The `build` helper will find the `_target_` entry in the embed_text config and -initialize the module that it points to. The `kwargs` of build can be used to -specify in code a specific value of the config. - -Thanks to `build`, we can now have two config files in the embed_text group that -will point to the different modules: - - - +Thanks to `build`, we can now have two config files in the embed_text group that will point to the different modules: ```yaml title="laser_module.yaml" # @package module _target_: modules.LaserEncoderModule config: - lang: ??? + lang: ??? ``` - - ```yaml title="hf_module.yaml" # @package module _target_: modules.HFEncoderModule config: - lang: ??? + lang: ??? 
``` - And you can override this module from the cli: ```bash @@ -65,6 +45,4 @@ python yourpipeline.py embed_text=hf_module src_lang=bn tgt_lang=hi +data=ccg ``` -This does look a bit odd at first, but look at the implementation of -global_mining to see how it flows and how modules are used and config/data is -passed around. +This does look a bit odd at first, but look at the implementation of global_mining to see how it flows and how modules are used and config/data is passed around. diff --git a/website/docs/stopes/cache.md b/website/docs/stopes/cache.md index 410a107..d6bcade 100644 --- a/website/docs/stopes/cache.md +++ b/website/docs/stopes/cache.md @@ -4,34 +4,20 @@ sidebar_position: 5 # Caching/Memoization -An important part of the launcher is its caching system. When you call the -schedule method with a configured module, the launcher will check if this -configuration was already run in the past and reuse the results when possible. -The cache is indexed on the configuration of the module, so if you change -anything in the configuration input, the module will be executed from scratch -and the new result will be cached with a different key. It's also important to -remember that all inputs to the module that could change its results (and thus -the caching) should be specified in the config input. - -If you change the code of your module to a point that would change its output, -you can implement the `version() `method to return a new value so that the cache -knows that it needs to recompute from scratch even from known configs. - -You can also implement the` validate() `method to check the outputs from your -module and from the cache if you want to actively invalidate the cache. For -example, if it’s known how many lines are to be embedded into a particular -dimension (say 1024), you can validate that the output file size is e.g. -`num_lines * 1024 * float32.` - -Here is an example of rerunning the global mining pipeline that was interrupted -in the middle. 
The caching layer recovers what was already executed
-successfully. This was started with the same command that would require a full
-run:
+An important part of the launcher is its caching system. When you call the schedule method with a configured module, the launcher will check if this configuration was already run in the past and reuse the results when possible. The cache is indexed on the configuration of the module, so if you change anything in the configuration input, the module will be executed from scratch and the new result will be cached with a different key. It's also important to remember that all inputs to the module that could change its results (and thus the caching) should be specified in the config input.
+
+If you change the code of your module to a point that would change its output, you can implement the `version()` method to return a new value so that the cache knows that it needs to recompute from scratch even from known configs.
+
+You can also implement the `validate()` method to check the outputs from your module and from the cache if you want to actively invalidate the cache. For example, if it’s known how many lines are to be embedded into a particular dimension (say 1024), you can validate that the output file size is e.g. `num_lines * 1024 * float32`.
+
+Here is an example of rerunning the global mining pipeline that was interrupted in the middle. The caching layer recovers what was already executed successfully.
This was started with the same command that would require a full run:
+
```bash
python yourpipeline.py src_lang=bn tgt_lang=hi +data=ccg
```

Here are the logs:
+
```
[global_mining][INFO] - output: .../global_mining/outputs/2021-11-02/08-56-40
[global_mining][INFO] - working dir: .../global_mining/outputs/2021-11-02/08-56-40
@@ -50,13 +36,6 @@ Here are the logs:
[train_faiss_index][INFO] - lang=hi, sents=162844151, required=40000000, index type=OPQ64,IVF65536,PQ64
```

-We can see that the launcher has found out that it doesn't need to run the
-encode and train index steps for the `bn` lang (src language) and can skip
-straight to populating the index with embeddings, but it also already processed
-44 shards for that step, so will only re-schedule jobs for 11 shards. In
-parallel, it is also processing the tgt language (`hi`) and found that it still
-needs to run the index training step as it also recoverred all the encoded
-shards.
+We can see that the launcher has found out that it doesn't need to run the encode and train index steps for the `bn` lang (src language) and can skip straight to populating the index with embeddings, but it also already processed 44 shards for that step, so will only re-schedule jobs for 11 shards. In parallel, it is also processing the tgt language (`hi`) and found that it still needs to run the index training step as it also recovered all the encoded shards.

-All this was done automatically. The person launching the pipeline doesn't have
-to micromanage what has already succeeded and what needs to be started when.
+All this was done automatically. The person launching the pipeline doesn't have to micromanage what has already succeeded and what needs to be started when.
diff --git a/website/docs/stopes/configuration.md b/website/docs/stopes/configuration.md index e627365..27897da 100644 --- a/website/docs/stopes/configuration.md +++ b/website/docs/stopes/configuration.md @@ -4,15 +4,9 @@ sidebar_position: 4 # Configuration -We use hydra for configuration. You should probably check out the hydra -tutorial: -[https://hydra.cc/docs/tutorials/intro](https://hydra.cc/docs/tutorials/intro) -but it's not a requirement. - -Modules `__init__` HAVE to take either a structured configuration as parameter -or an `omegaconf.DictConfig`. A structured configuration is a [python -dataclass](https://docs.python.org/3/library/dataclasses.html), e.g. +We use hydra for configuration. You should probably check out the hydra tutorial: [https://hydra.cc/docs/tutorials/intro](https://hydra.cc/docs/tutorials/intro) but it's not a requirement. +Modules `__init__` HAVE to take either a structured configuration as parameter or an `omegaconf.DictConfig`. A structured configuration is a [python dataclass](https://docs.python.org/3/library/dataclasses.html), e.g. ```python from dataclasses import dataclass @@ -24,63 +18,40 @@ class MyModuleConfig: spm_model: str = "/path/to/my/model.spm" ``` +Structured configs make it easier to track what is expected as a config for a module and makes it self documenting. But you can also just use a DictConfig if you prefer. -Structured configs make it easier to track what is expected as a config for a -module and makes it self documenting. But you can also just use a DictConfig if you prefer. - - -If you implement the init method of the module, make sure to call -`super().__init__(config) `so that the module system knows about your module -setup. 
You can then access `self.config `anywhere in your module after -initialization - -Actual configs live in YAML files in the config/module/ folder and should look -like this: +If you implement the init method of the module, make sure to call `super().__init__(config) `so that the module system knows about your module setup. You can then access `self.config `anywhere in your module after initialization +Actual configs live in YAML files in the config/module/ folder and should look like this: ```yaml # @package module _target_: stopes.modules.MyModule config: - lang: null - spm_model: /path/to/my/model.spm + lang: null + spm_model: /path/to/my/model.spm ``` - The `_target_` field should point to the full python module path of your module `config` should contain the config of your module. -You should save this in a file with your model name. You could have multiple -versions of your config, save them with the same `_target_` but different file -names (e.g. `my_module_large_spm.yaml`, `my_module_small_spm.yaml`, etc.). - -The yaml config file should contain the baseline configuration for your module -and things that you do not expect to change often. In hydra terms, you are -adding a possible option for a config group (the module group: see `@package -module`) +You should save this in a file with your model name. You could have multiple versions of your config, save them with the same `_target_` but different file names (e.g. `my_module_large_spm.yaml`, `my_module_small_spm.yaml`, etc.). -You can use hydra/[omegaconf -"resolvers"](https://omegaconf.readthedocs.io/en/2.1_branch/custom_resolvers.html#built-in-resolvers) -to depend on other bits of configs or environment variables: +The yaml config file should contain the baseline configuration for your module and things that you do not expect to change often. 
In hydra terms, you are adding a possible option for a config group (the module group: see `@package module`) +You can use hydra/[omegaconf "resolvers"](https://omegaconf.readthedocs.io/en/2.1_branch/custom_resolvers.html#built-in-resolvers) to depend on other bits of configs or environment variables: ```yaml # @package module _target_: stopes.modules.MyModule config: - lang: null - laser_path: /laser/is/here - laser_model: ${module.my_module.laser_path}/model1.mdl - spm_model: ${oc.env:SPM_MODEL} + lang: null + laser_path: /laser/is/here + laser_model: ${module.my_module.laser_path}/model1.mdl + spm_model: ${oc.env:SPM_MODEL} ``` +Note: try not to rely too much on environment variables as we want these files to be the base for reproducibility and shareability of the module configurations you experiment with. Relying on special environment variables will make this hard. -Note: try not to rely too much on environment variables as we want these files -to be the base for reproducibility and shareability of the module configurations -you experiment with. Relying on special environment variables will make this -hard. - -You can use hydra config composition if you want your config to inherit or -configure a subpart of your config, see -https://hydra.cc/docs/patterns/extending_configs +You can use hydra config composition if you want your config to inherit or configure a subpart of your config, see https://hydra.cc/docs/patterns/extending_configs diff --git a/website/docs/stopes/index.md b/website/docs/stopes/index.md index 71f8715..4eecaa0 100644 --- a/website/docs/stopes/index.md +++ b/website/docs/stopes/index.md @@ -4,73 +4,34 @@ sidebar_position: 1 # stopes Module Framework -The `stopes` library was built for easily managing complex pipelines without -worrying about scaling and reliability code. +The `stopes` library was built for easily managing complex pipelines without worrying about scaling and reliability code. 
## Key features: -- **Reproducibility.** `stopes` is built with a research mindset first. The -underlying Hydra framework gives you full control over the configuration of your -pipelines. All the important parameters of your experiments can be defined and -tracked. -- **Easier scaling.** The `stopes` framework provides clean separation between -your pipeline step logic and the scaling code. If you use slurm, run locally or -want to deploy on another cluster, your pipeline code and steps shouldn't -change. -- **Caching/memoization.** With `stopes`, you can iterate faster and more reliably -via transparent memoization. We've built the library so your code doesn't need -to know what's happening with the cache -- **Composition.** The `stopes` API surface is minimum, so you can build a -pipeline by simply writing idiomatic python (using asyncio) and have a quick -understanding of what's going on without needing to understand complex job APIs. - -Checkout the [quickstart](quickstart) guide and the -[pipelines](category/prebuilt-pipelines) we've provided as well as the docs in -the sidebar. +- **Reproducibility.** `stopes` is built with a research mindset first. The underlying Hydra framework gives you full control over the configuration of your pipelines. All the important parameters of your experiments can be defined and tracked. +- **Easier scaling.** The `stopes` framework provides clean separation between your pipeline step logic and the scaling code. If you use slurm, run locally or want to deploy on another cluster, your pipeline code and steps shouldn't change. +- **Caching/memoization.** With `stopes`, you can iterate faster and more reliably via transparent memoization. 
We've built the library so your code doesn't need to know what's happening with the cache +- **Composition.** The `stopes` API surface is minimum, so you can build a pipeline by simply writing idiomatic python (using asyncio) and have a quick understanding of what's going on without needing to understand complex job APIs. + +Checkout the [quickstart](quickstart) guide and the [pipelines](category/prebuilt-pipelines) we've provided as well as the docs in the sidebar. ## Concepts -The idea of the `stopes` framework is to make it easy to build reproducible -pipelines. This is done though "modules", a module is just a class with a `run` -function that executes something. A module can then be scheduled with the `stopes` -"launcher", this will decide where the code gets executed (locally or on a -cluster) and then wait for the results to be ready. - -A **module** in `stopes` encapsulate a single step of a pipeline and its -requirements. This step is supposed to be able to execute on its own given its -input and generate an output. It will most often be executed as an isolated -job, so shouldn't depend on anything else than its config (e.g. global -variables, etc.). This ensures that each module can be run separately and in -parallel if possible. -A module also defines a clear API of the step via its configuration. - -A **pipeline** in `stopes` it not much more than a python function that connects a -few modules together, but it could contain other python logic in the middle. -While you can run a `stopes` module as a normal python callable, the power of -`stopes` comes from the `launcher` that will manage the execution of the modules, -find the correct machines with matching requirements (if executing on a cluster) -and deal with memoization. 

-
-A **launcher** is the orchestrator of your pipeline, but is exposed to you
-through a simple `async` API, so it looks like any
-[asyncio](https://docs.python.org/3/library/asyncio.html) function and you do not have
-to deal with where your code is being executed, if [memoization](stopes/cache)
-is happening, etc. If you have never dealt with `async` in python, I do
-recommend checking [this tutorial](https://realpython.com/async-io-python/), it
-looks scarier than it is.
+The idea of the `stopes` framework is to make it easy to build reproducible pipelines. This is done through "modules", a module is just a class with a `run` function that executes something. A module can then be scheduled with the `stopes` "launcher", this will decide where the code gets executed (locally or on a cluster) and then wait for the results to be ready.
+
+A **module** in `stopes` encapsulates a single step of a pipeline and its requirements. This step is supposed to be able to execute on its own given its input and generate an output. It will most often be executed as an isolated job, so shouldn't depend on anything else than its config (e.g. global variables, etc.). This ensures that each module can be run separately and in parallel if possible. A module also defines a clear API of the step via its configuration.
+
+A **pipeline** in `stopes` is not much more than a python function that connects a few modules together, but it could contain other python logic in the middle. While you can run a `stopes` module as a normal python callable, the power of `stopes` comes from the `launcher` that will manage the execution of the modules, find the correct machines with matching requirements (if executing on a cluster) and deal with memoization.
+ +A **launcher** is the orchestrator of your pipeline, but is exposed to you through a simple `async` API, so it looks like any [asyncio](https://docs.python.org/3/library/asyncio.html) function and you do not have to deal with where your code is being executed, if [memoization](stopes/cache) is happening, etc. If you have never dealt with `async` in python, I do recommend checking [this tutorial](https://realpython.com/async-io-python/), it looks scarier than it is. ## Example -Here is an example of a basic pipeline that will take some file inputs, train a -[FAISS](https://faiss.ai/) index on it and then populate the index with the -files. +Here is an example of a basic pipeline that will take some file inputs, train a [FAISS](https://faiss.ai/) index on it and then populate the index with the files. This example shows the usage of the launcher and how we reuse existing modules. -Here we assume -that the files have already been encoded with something that LASER to keep the -example simple, but you want to have a first step doing -the encoding (see the [global mining pipeline](pipelines/global_mining) for a real example). +Here we assume that the files have already been encoded with something that LASER to keep the example simple, but you want to have a first step doing the encoding (see the [global mining pipeline](pipelines/global_mining) for a real example). ```python title="mypipeline.py" import asyncio @@ -109,39 +70,25 @@ def main(config: DictConfig) -> None: Let's start with the `main`, this is a very basic boilerplate that: -1. sets up [hydra](https://www.hydra.cc) to get configuration when running the - script. We recommend checking the [hydra tutorial](https://hydra.cc/docs/tutorials/intro/) on their site to understand - how to build configurations and organize them. See below also for an example - config. +1. sets up [hydra](https://www.hydra.cc) to get configuration when running the script. 
We recommend checking the [hydra tutorial](https://hydra.cc/docs/tutorials/intro/) on their site to understand how to build configurations and organize them. See below also for an example config. 2. starts `asyncio` and runs our async `pipeline` function. -The `pipeline` function is `async` as it will run some asynchronous code inside -it, so we need to tell python that this will be the case. The first thing it -does, is to initialize the `launcher` from the config, this is a trick to be -able to swap launchers on the CLI using config overrides. After that, we setup -the `TrainFAISSIndexModule` and `schedule` it with the launcher. This will check -if this step was already executed in the past, and if not, will schedule the -module on the cluster (or just locally if you want). +The `pipeline` function is `async` as it will run some asynchronous code inside it, so we need to tell python that this will be the case. The first thing it does, is to initialize the `launcher` from the config, this is a trick to be able to swap launchers on the CLI using config overrides. After that, we setup the `TrainFAISSIndexModule` and `schedule` it with the launcher. This will check if this step was already executed in the past, and if not, will schedule the module on the cluster (or just locally if you want). -The `await` keyword tells python to "wait" for the job to finish and once that -is done, move to the next step. As we need to pass the generated `index` to the -populate step, we take the config read from hydra, and fill up the `index` with -the output of the training. We schedule and await that step, and finally just -log the location of the output file. +The `await` keyword tells python to "wait" for the job to finish and once that is done, move to the next step. As we need to pass the generated `index` to the populate step, we take the config read from hydra, and fill up the `index` with the output of the training. 
We schedule and await that step, and finally just log the location of the output file. Let's look at the config: ```yaml title="conf/config" - embedding_files: ??? embedding_dimensions: 1024 index_type: ??? launcher: - _target_: stopes.core.Launcher - log_folder: executor_logs - cluster: local - partition: + _target_: stopes.core.Launcher + log_folder: executor_logs + cluster: local + partition: train_index: lang: demo @@ -156,7 +103,6 @@ train_index: embedding_dimensions: ${embedding_dimensions} fp16: True - populate_index: lang: demo index: ??? @@ -167,22 +113,19 @@ populate_index: embedding_dimensions: ${embedding_dimensions} ``` -Hydra will take a yaml file and structure it for our usage in python. You can -see that we define at the top level: +Hydra will take a yaml file and structure it for our usage in python. You can see that we define at the top level: + ``` embedding_files: ??? index_type: ??? ``` -This tells hydra that these two entries are empty and required, so it will -enforce that we specify them on the CLI. We pass them down to the sub-configs -for train/populate by using the `${}` placeholders. -The `launcher` entry is setup to initialize the -[submitit](https://github.com/facebookincubator/submitit) that currently -provides the main job management system. If you wanted to use a different -job/cluster system, you could implement your own launcher. +This tells hydra that these two entries are empty and required, so it will enforce that we specify them on the CLI. We pass them down to the sub-configs for train/populate by using the `${}` placeholders. + +The `launcher` entry is setup to initialize the [submitit](https://github.com/facebookincubator/submitit) that currently provides the main job management system. If you wanted to use a different job/cluster system, you could implement your own launcher. 
We can now call our script with: + ```bash python mypipeline.py embedding_files='[pathtomyfile.bin]' index_type="OPQ64,IVF1024,PQ64" ``` @@ -195,8 +138,6 @@ python mypipeline.py embedding_files='[pathtomyfile.bin]' index_type="OPQ64,IVF1 :::note -We use [hydra](https://www.hydra.cc) as the configuration system, but note that most modules -take a dataclass as config, so you could build that manually from a different -system (like argparse) if you did not want to use hydra. +We use [hydra](https://www.hydra.cc) as the configuration system, but note that most modules take a dataclass as config, so you could build that manually from a different system (like argparse) if you did not want to use hydra. ::: diff --git a/website/docs/stopes/module.md b/website/docs/stopes/module.md index 3c8b18d..874a128 100644 --- a/website/docs/stopes/module.md +++ b/website/docs/stopes/module.md @@ -6,7 +6,6 @@ sidebar_position: 2 A module is a python class that extends `StopesModule`: - ```python from stopes.code.stopes_module import StopesModule @@ -22,71 +21,32 @@ class MyModule(StopesModule): ... ``` +You should implement at least the `run `method, this is what will get executed when your module is launched. By default, you don't need to worry about the iteration parameters, see below for details of what these do. -You should implement at least the `run `method, this is what will get executed -when your module is launched. By default, you don't need to worry about the -iteration parameters, see below for details of what these do. - -If you want to initialize things before the module is run, you can use -`__init__.` - -You can also implement the following methods to give more information about your -module: - - +If you want to initialize things before the module is run, you can use `__init__.` -* `requirements` - if you have specific requirements (gpus, memory, …) for your - module, return a Requirements specification from this method. 
This will be - called after `__init__` but before `run`. -* `name/comment` - some launchers (see below) might use this to identify/log - your module runs. Feel free to implement them if you want, but you don't have - to and they might not always be used. +You can also implement the following methods to give more information about your module: +- `requirements` - if you have specific requirements (gpus, memory, …) for your module, return a Requirements specification from this method. This will be called after `__init__` but before `run`. +- `name/comment` - some launchers (see below) might use this to identify/log your module runs. Feel free to implement them if you want, but you don't have to and they might not always be used. ## Arrays -We've observed that in many cases, pipeline steps are repeated on a number of -shards of data. This is common with large datasets and allows to chunk the data -processing on different machines for faster processing. +We've observed that in many cases, pipeline steps are repeated on a number of shards of data. This is common with large datasets and allows to chunk the data processing on different machines for faster processing. -In this execution case, the goal is to execute the same code with the same -requirements on a number of shards, in order to avoid implementing this logic -for every module that needs to work on shards in the pipeline driving the -module. The StopesModule system can take care of this for you. +In this execution case, the goal is to execute the same code with the same requirements on a number of shards, in order to avoid implementing this logic for every module that needs to work on shards in the pipeline driving the module. The StopesModule system can take care of this for you. -If your module implements the `array` method and returns an array of N values to -process, the module will be executed N times separately and the `run` method -will be called multiple times, independently. 
Every time the `run `method is -called for a module with an array, it will be passed two extra parameters: +If your module implements the `array` method and returns an array of N values to process, the module will be executed N times separately and the `run` method will be called multiple times, independently. Every time the `run `method is called for a module with an array, it will be passed two extra parameters: +- `iteration_value` that will contain a single value from the array +- `iteration_index` that corresponds the the index of that value in the array - -* `iteration_value` that will contain a single value from the array -* `iteration_index` that corresponds the the index of that value in the array - -The array method will be called after the module is initialized and in the same -process as the initialization. You can therefore compute the array based on the -config of the module or anything you compute in the `__init__` method. - +The array method will be called after the module is initialized and in the same process as the initialization. You can therefore compute the array based on the config of the module or anything you compute in the `__init__` method. ## Gotchas - - -* In most cases, the `run` method will be executed in a distributed fashion. - That means that: - * `run` and `__init__ `might not be called with the same machine/process. - E.g. when launching modules, `__init__` will be called where your pipeline - driving script is executed, but `run` will be called in a separate - process/job. - * When using `array`, each separate call to `run` will potentially be called - on a separate machine/process and on a separate copy of your module. That - means that you can share value from `__init__` down to `run`, but you - cannot share anything in your object between calls of `run, `you should - not modify self inside of` run`. - * When using `array`, there is no guarantee that `run` will be called in the - same order as the values in your array. 
Only rely on the index passed to - you and not on an execution order. - * Your `run` method will probably apply side effects (e.g. write files). If - this is the case, make sure to return the file path/handle from the run - method so we can keep track of these. +- In most cases, the `run` method will be executed in a distributed fashion. That means that: + - `run` and `__init__ `might not be called with the same machine/process. E.g. when launching modules, `__init__` will be called where your pipeline driving script is executed, but `run` will be called in a separate process/job. + - When using `array`, each separate call to `run` will potentially be called on a separate machine/process and on a separate copy of your module. That means that you can share value from `__init__` down to `run`, but you cannot share anything in your object between calls of `run, `you should not modify self inside of` run`. + - When using `array`, there is no guarantee that `run` will be called in the same order as the values in your array. Only rely on the index passed to you and not on an execution order. + - Your `run` method will probably apply side effects (e.g. write files). If this is the case, make sure to return the file path/handle from the run method so we can keep track of these. diff --git a/website/docs/stopes/pipelining.md b/website/docs/stopes/pipelining.md index 89b020c..6788d27 100644 --- a/website/docs/stopes/pipelining.md +++ b/website/docs/stopes/pipelining.md @@ -4,53 +4,31 @@ sidebar_position: 3 # Composition (aka pipelining) -The StopesModule framework provides a "launcher" abstraction that takes care of -scheduling your module "somewhere". Currently, and in most Stopes use cases, this -somewhere is SLURM, but you can also choose to launch it locally and more -launcher implementations might come when other execution environments are needed. +The StopesModule framework provides a "launcher" abstraction that takes care of scheduling your module "somewhere". 
Currently, and in most Stopes use cases, this somewhere is SLURM, but you can also choose to launch it locally and more launcher implementations might come when other execution environments are needed. -The global_mining -pipeline is a good example of how all of this works together and you should -check it out when reading this doc to have a good idea of how things fit -together. - -You can initialize a launcher from code with its python init, but ideally, your -pipeline will initialize it from a config with hydra: +The global_mining pipeline is a good example of how all of this works together and you should check it out when reading this doc to have a good idea of how things fit together. +You can initialize a launcher from code with its python init, but ideally, your pipeline will initialize it from a config with hydra: ```python self.launcher = hydra.utils.instantiate(config.launcher) ``` - -We provide pre-made configs for the main SLURM launcher and instantiating the -launcher from config will allow you to override it from the CLI for debugging. +We provide pre-made configs for the main SLURM launcher and instantiating the launcher from config will allow you to override it from the CLI for debugging. Once you have a launcher, you can launch a module in code with: - ```python embedded_files = await self.launcher.schedule(embed_module) ``` - -The launcher will take care of submitting a job to the execution engine (e.g. -SLURM) and wait for it to be done. The launcher will also take care of raising -any exception happening in the execution engine and if using the submitit -launcher, it will also take care of checkpointing (see above). +The launcher will take care of submitting a job to the execution engine (e.g. SLURM) and wait for it to be done. The launcher will also take care of raising any exception happening in the execution engine and if using the submitit launcher, it will also take care of checkpointing (see above). 
## Asyncio -Because` launcher.schedule `will potentially schedule your module run method on -a separate host, wait for it to find a slot and to eventually finish. The result -that this `schedule` method returns is not available immediately. We use python -asyncio to deal with waiting for the results to be available. This means that -you need to `await `the result of schedule before being able to use it. - -This also means that you can use asyncio helpers to organize your code and tell -the launcher when things can be scheduled in parallel. For instance you can -await for two results in "parallel" with: +Because`launcher.schedule`will potentially schedule your module run method on a separate host, wait for it to find a slot and to eventually finish. The result that this `schedule` method returns is not available immediately. We use python asyncio to deal with waiting for the results to be available. This means that you need to `await `the result of schedule before being able to use it. +This also means that you can use asyncio helpers to organize your code and tell the launcher when things can be scheduled in parallel. For instance you can await for two results in "parallel" with: ```python src_embeddings, tgt_embeddings = await asyncio.gather( diff --git a/website/sidebars.js b/website/sidebars.js index 3b9a2bc..899c71a 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -20,7 +20,7 @@ module.exports = { // By default, Docusaurus generates a sidebar from the docs folder structure - quickstartSidebar: [{ type: 'autogenerated', dirName: '.' }], + quickstartSidebar: [{type: 'autogenerated', dirName: '.'}], // But you can create a sidebar manually /* diff --git a/website/src/css/custom.css b/website/src/css/custom.css index 107f9f3..b60c693 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -24,7 +24,6 @@ --ifm-color-primary-lightest: #2b6cf8; } - /* For readability concerns, you should choose a lighter palette in dark mode. 
*/ [data-theme='dark'] { --ifm-color-primary: #4d82ff; @@ -102,7 +101,7 @@ } .sbanner .bottom .button-container { - margin-left: 255px + margin-left: 255px; } .sbanner .bottom .button-container .button { @@ -147,9 +146,7 @@ color: #ce6849; } - @media only screen and (max-width: 700px) { - .sbanner .container { margin-left: 5px; } diff --git a/website/src/pages/index.js b/website/src/pages/index.js index 1a01d53..7d31a06 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -17,7 +17,7 @@ import React from 'react'; import styles from './styles.module.css'; function Stopes() { - return stopes + return stopes; } const features = [ @@ -25,13 +25,14 @@ const features = [ title: 'Easy to Use', description: ( <> - was designed to provide a modular API to build and reproduce pipelines core to large translation work. - In particular data mining and evaluation. - Where you run your pipeline and how you scale it is independent of its core logic. - Everything is config-driven so you can easily reproduce and track results. + was designed to provide a modular API to build and reproduce + pipelines core to large translation work. In particular data mining and + evaluation. Where you run your pipeline and how you scale it is + independent of its core logic. Everything is config-driven so you can + easily reproduce and track results. ), - buttonTxt: "Quickstart", + buttonTxt: 'Quickstart', buttonUrl: 'docs/quickstart', imageUrl: 'img/shovel.svg', }, @@ -39,12 +40,13 @@ const features = [ title: 'Batteries Included', description: ( <> - lets you focus on your core data and evaluation needs by providing common modules - used for this task and letting you write your pipelines with idiomatic python. - Common optimizations have also been built-in to help you scale your work. + lets you focus on your core data and evaluation needs by + providing common modules used for this task and letting you write your + pipelines with idiomatic python. 
Common optimizations have also been + built-in to help you scale your work. ), - buttonTxt: "Learn More", + buttonTxt: 'Learn More', buttonUrl: 'docs/stopes', imageUrl: 'img/modules.svg', }, @@ -52,21 +54,21 @@ const features = [ title: 'State-of-the-art Pipelines', description: ( <> - was developed as part of the Meta AI No Language Left Behind research project. - It comes with state-of-the-art pipelines out of the box. You can run our global mining and distillation - pipelines and reproduce our research with just a few command lines. + was developed as part of the Meta AI No Language Left Behind + research project. It comes with state-of-the-art pipelines out of the + box. You can run our global mining and distillation pipelines and + reproduce our research with just a few command lines. ), - buttonTxt: "E.g. Start Data Mining", + buttonTxt: 'E.g. Start Data Mining', buttonUrl: 'docs/pipelines/global_mining', imageUrl: 'img/pipelines.svg', }, ]; - const sections = [ { - title: "No-coding Mining", + title: 'No-coding Mining', language: 'bash', code: `python -m stopes.pipelines.bitext.global_mining_pipeline \\ src_lang=fuv \\ @@ -76,12 +78,15 @@ const sections = [ output_dir=. \\ embed_text=laser3`, content: ( -

comes with the Global Mining Pipeline that was used by the NLLB team. - You can use it out of the box without extra coding. You will need to setup an - environment and create a config file to point to your data, - but you can start mining (locally or on a slurm cluster) without any coding. - Check out the Quickstart guide.

- ) +

+ comes with the Global Mining Pipeline that was used by the + NLLB team. You can use it out of the box without extra coding. You will + need to setup an environment and create a config file to point to your + data, but you can start mining (locally or on a slurm cluster) without + any coding. Check out the{' '} + Quickstart guide. +

+ ), }, { title: 'Reproducible research', @@ -97,10 +102,11 @@ config: num_threads : 4`, content: (

- is based on Hydra, - giving you full control over the behavior of your pipeline. - Experiments are easily reproducible along with your results.

- ) + is based on Hydra, giving + you full control over the behavior of your pipeline. Experiments are + easily reproducible along with your results. +

+ ), }, { title: 'Modular pipeline definition', @@ -141,72 +147,69 @@ config: content: ( <>

- pipelines are composed of modules. - No more duplicated, out-of sync code: your most common preprocessing steps can be shared + pipelines are composed of modules. No more duplicated, + out-of sync code: your most common preprocessing steps can be shared among all your pipelines.

You will find in this repository some implementations of a number of - modules that are useful for translation data mining and evaluation, Neural Machine Translation data pre-processing - and model training. For example, we provide modules to build faiss indexes, encode - text with LASER and HuggingFace Transformers, - mine bitext or train and evaluate FAIRSEQ models. -

- ) + modules that are useful for translation data mining and evaluation, + Neural Machine Translation data pre-processing and model training. For + example, we provide modules to build{' '} + faiss indexes, encode text with{' '} + LASER and{' '} + + HuggingFace Transformers + + , mine bitext or train and evaluate{' '} + FAIRSEQ{' '} + models. +

+ + ), }, -] +]; -function Card({ title, description, buttonTxt, buttonUrl, imageUrl }) { +function Card({title, description, buttonTxt, buttonUrl, imageUrl}) { const imgUrl = useBaseUrl(imageUrl); const burl = useBaseUrl(buttonUrl); return (
-
+
{imgUrl && ( -
- {title} -
)} -
+
+ {title} +
+ )} +

{title}

-

- {description} -

+

{description}

{buttonTxt && buttonUrl && ( -
+
+ className={clsx('button button--primary button--block')} + to={burl}> {buttonTxt}
)}
-
) +
+ ); } -function ContentWithCode({ title, children, flip, language }) { - +function ContentWithCode({title, children, flip, language}) { const [content, code] = React.Children.toArray(children); - const textBlock = ( -
- {content} -
- ) + const textBlock =
{content}
; const codeBlock = (
- - {code} - + {code}
- ) + ); let left = textBlock; let right = codeBlock; @@ -224,12 +227,12 @@ function ContentWithCode({ title, children, flip, language }) {
{left} {right} -
) +
+
+ ); } function Banner() { - - const nllb = useBaseUrl('img/banner_bits/nllb.png'); const driving = useBaseUrl('img/banner_bits/driving.png'); const stopes = useBaseUrl('img/banner_bits/stopes.png'); @@ -250,36 +253,41 @@ function Banner() {
- NO LANGUAGES LEFT BEHIND
-
- Driving inclusion through machine translation
-

logostopes

-
+ NO LANGUAGES LEFT BEHIND +
+
+ Driving inclusion through machine translation +
+

+ logo + stopes +

+

Large-Scale Translation Tooling

Mining Quickstart
-
+
meta
- ) + ); } - export default function Home() { const context = useDocusaurusContext(); - const { siteConfig = {} } = context; + const {siteConfig = {}} = context; return (
- {features.map(({ title, imageUrl, description, buttonTxt, buttonUrl }) => ( - - ))} + {features.map( + ({title, imageUrl, description, buttonTxt, buttonUrl}) => ( + + ), + )}
)}
- {sections.map(({ title, language, code, content }, index) => ( + {sections.map(({title, language, code, content}, index) => ( Date: Fri, 21 Jul 2023 13:58:56 +0300 Subject: [PATCH 3/6] Feat: Test for the paste action --- .../ui/seamlisten/react_app/src/e2e.test.js | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/stopes/ui/seamlisten/react_app/src/e2e.test.js b/stopes/ui/seamlisten/react_app/src/e2e.test.js index 434a0f2..57c7c2c 100644 --- a/stopes/ui/seamlisten/react_app/src/e2e.test.js +++ b/stopes/ui/seamlisten/react_app/src/e2e.test.js @@ -48,4 +48,38 @@ describe("File Viewer", () => { }); expect(helpTextVisible).toBe(true); }); + + test("should update the filename input with pasted data", async () => { + const filenameInput = await page.waitForSelector( + ".form-control.form-control-sm" + ); + const pastedFilename = "/default/path/"; + + await page.evaluate( + //a function to simulate a paste action + (input, value) => { + const event = new Event("paste", { + bubbles: true, + cancelable: true, + composed: true, + }); + event.clipboardData = new DataTransfer(); + event.clipboardData.setData("text/plain", value); + input.dispatchEvent(event); + }, + // arguments for our function are passed here + filenameInput, // input + pastedFilename //value + ); + + // Wait for a moment to allow the paste event handler to run + await new Promise((r) => setTimeout(r, 100)); + + // check whether the inputvalue matches the pastedFileName + const inputValue = await page.$eval( + ".form-control.form-control-sm", + (el) => el.value + ); + expect(inputValue).toBe(pastedFilename); + }); }); From dd11271b8212fad305314f6fd86e1f7e85ed05ca Mon Sep 17 00:00:00 2001 From: lubwama Date: Thu, 27 Jul 2023 12:45:13 +0300 Subject: [PATCH 4/6] Feat: play on paste . 
Fixes #36 --- .../react_app/src/components/FileExplorer.tsx | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx index 6d6faf4..f113bb6 100644 --- a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx +++ b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx @@ -4,13 +4,16 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. + import { useCallback, useEffect, useState } from "react"; + import Button from "react-bootstrap/Button"; import { default as BCol } from "react-bootstrap/Col"; import Form from "react-bootstrap/Form"; import { default as BRow } from "react-bootstrap/Row"; + import { Location, useLoaderData, @@ -25,12 +28,15 @@ import { LineResult } from "../common/types/api"; import Help from "./fileviewer/FileExplorerHelp"; import Table from "./fileviewer/Table"; + import { text_to_audio } from "../common/components/audio/audioquery_constructor"; + const FILENAME_PARAM = "file"; const PAGENUMBER_PARAM = "page"; const NUMBERLINES_PARAM = "lines"; + type LoaderReturn = { filename: string; pageNumber: number; @@ -40,6 +46,7 @@ type LoaderReturn = { error: any; }; + function parseParams(searchParams) { return { filename: searchParams.get(FILENAME_PARAM) @@ -50,6 +57,7 @@ function parseParams(searchParams) { }; } + function parseLocation(location: Location) { if (!location) { return null; @@ -57,9 +65,11 @@ function parseLocation(location: Location) { return parseParams(new URLSearchParams(location.search)); } + export async function loader({ request }): Promise { const url = new URL(request.url); + const { filename, numberLines, pageNumber } = parseParams(url.searchParams); const toRet = { filename, @@ -70,6 +80,7 @@ export async function loader({ request }): Promise { error: null, }; + try { if ( 
filename.endsWith("tsv.gz") || @@ -85,6 +96,7 @@ export async function loader({ request }): Promise { return toRet; } + const audioResult = await text_to_audio(filename, 1); if (audioResult) { toRet.audioBlob = audioResult; @@ -98,6 +110,7 @@ export async function loader({ request }): Promise { return toRet; } + function Error({ error }) { const msg = error.data ? error.data.detail @@ -110,6 +123,7 @@ function Error({ error }) { ); } + function useFileNavigate() { const navigate = useNavigate(); return (file: string, page: number, numberLines: number) => @@ -120,6 +134,7 @@ function useFileNavigate() { ); } + const Files = (): JSX.Element => { const [displayHelper, setDisplayHelper] = useState(false); const navigate = useFileNavigate(); @@ -129,6 +144,7 @@ const Files = (): JSX.Element => { filename || config.default_path ); + // if we have a location, we are in a transition between two urls const navigation = useNavigation(); const locationParams = parseLocation(navigation.location); @@ -139,10 +155,12 @@ const Files = (): JSX.Element => { } const loading = !!navigation.location; + // in some navigation events (like back/forward navigation, the component is not remounted) // so we need to reset the "default" for the filename form. useEffect(() => setNewFilename(filename), [filename]); + const setFilenameEventHandler = useCallback( (evt) => setNewFilename(evt.target.value), [setNewFilename] @@ -168,6 +186,7 @@ const Files = (): JSX.Element => { [setFilename] ); + // Add new function to handle paste events const fileInputHandlePaste = useCallback( (evt) => { @@ -178,6 +197,7 @@ const Files = (): JSX.Element => { [navigate, pageNumber, numberLines] ); + return (
@@ -262,4 +282,5 @@ const Files = (): JSX.Element => { ); }; -export default Files; + +export default Files; \ No newline at end of file From b4f4201f203c19a222c74009c50ad2c5fbf67c5c Mon Sep 17 00:00:00 2001 From: lubwama Date: Fri, 4 Aug 2023 21:31:01 +0300 Subject: [PATCH 5/6] Revert "Fix: Trailing whitespace fixed with pre-commit run --all-files" This reverts commit de63794c4571e94b761941d755fad528221d524d. --- .github/workflows/lint_and_tests.yaml | 48 +-- CHANGELOG.md | 6 +- CODE_OF_CONDUCT.md | 26 +- CONTRIBUTING.md | 6 +- README.md | 21 +- demo/alti/README.md | 26 +- demo/alti/detecting_hallucinations/README.md | 49 ++- demo/alti/minimal_example/download_nllb.sh | 4 +- .../minimal_example/preset/nllb_demo.yaml | 2 +- demo/iwslt_blaser_eval/README.md | 4 +- demo/iwslt_blaser_eval/conf/eval_blaser.yaml | 2 + demo/iwslt_blaser_eval/conf/launcher | 2 +- demo/toxicity-alti-hb/ETOX/README.md | 20 +- demo/toxicity-alti-hb/README.md | 15 +- demo/toxicity-alti-hb/alti/README.md | 5 +- demo/toxicity-alti-hb/analysis/README.md | 2 - demo/toxicity-alti-hb/annotation/README.md | 51 ++- stopes/eval/alti/LICENSE.md | 366 +++++++++--------- stopes/eval/alti/README.md | 2 +- stopes/eval/blaser/README.md | 2 +- stopes/eval/blaser/conf/score.yaml | 2 +- stopes/pipelines/bitext/README.md | 2 +- .../embed_text/config/encoder/hf_encoder.yaml | 4 +- .../conf/eval/generate_multi_bleu_detok.yaml | 2 +- .../bitext/conf/generate/standard_conf.yaml | 1 + .../bitext/conf/mine_indexes/base.yaml | 40 +- .../bitext/conf/mine_sentences/base.yaml | 26 +- .../conf/moses_filter/standard_conf.yaml | 2 +- .../bitext/conf/nmt_bitext_eval.yaml | 1 + stopes/pipelines/bitext/conf/preset/demo.yaml | 2 +- .../bitext/conf/spm/train/standard_conf.yaml | 2 +- .../bitext/conf/train_spm/standard_conf.yaml | 2 +- .../conf/bitext_clean/default.yaml | 6 +- .../distillation/conf/dedup/default.yaml | 2 +- .../distillation/conf/distillation.yaml | 40 +- stopes/pipelines/distillation/conf/launcher | 2 +- 
.../conf/mono_pipeline/default.yaml | 2 +- .../params/model/transformer.yaml | 2 +- stopes/pipelines/eval/conf/launcher | 2 +- stopes/pipelines/filtering/README.md | 5 +- stopes/pipelines/monolingual/README.md | 1 - .../monolingual/conf/dedup/dedup_files.yaml | 2 +- stopes/pipelines/monolingual/conf/launcher | 2 +- .../monolingual/conf/monolingual.yaml | 35 +- stopes/pipelines/prepare_data/README.md | 33 +- stopes/pipelines/prepare_data/conf/launcher | 2 +- .../prepare_data/conf/prepare_data.yaml | 2 +- stopes/pipelines/speech/README.md | 18 +- .../speech/conf/compute_laser_embeddings.yaml | 2 +- stopes/pipelines/speech/conf/launcher | 2 +- .../conf/embed_text/test_numbers_encoder.yaml | 2 +- stopes/pipelines/translate/conf/example.yaml | 16 +- stopes/pipelines/translate/conf/launcher | 2 +- .../react_app/src/components/FileExplorer.tsx | 9 +- .../ui/seamlisten/react_app/src/e2e.test.js | 29 +- website/docs/eval/alti.md | 31 +- website/docs/eval/blaser.md | 4 +- website/docs/pipelines/distillation.md | 31 +- website/docs/pipelines/global_mining.md | 69 ++-- website/docs/pipelines/monolingual.md | 1 - website/docs/quickstart.md | 54 ++- website/docs/stopes/advanced/checkpointing.md | 7 +- website/docs/stopes/advanced/debugging.md | 30 +- website/docs/stopes/advanced/dynamic.md | 42 +- website/docs/stopes/cache.md | 43 +- website/docs/stopes/configuration.md | 61 ++- website/docs/stopes/index.md | 119 ++++-- website/docs/stopes/module.md | 72 +++- website/docs/stopes/pipelining.md | 36 +- website/sidebars.js | 2 +- website/src/css/custom.css | 5 +- website/src/pages/index.js | 188 +++++---- 72 files changed, 980 insertions(+), 778 deletions(-) diff --git a/.github/workflows/lint_and_tests.yaml b/.github/workflows/lint_and_tests.yaml index 604eff8..07f7292 100644 --- a/.github/workflows/lint_and_tests.yaml +++ b/.github/workflows/lint_and_tests.yaml @@ -18,30 +18,30 @@ jobs: runs-on: ${{ matrix.platform }} steps: - - uses: actions/checkout@v2 + - uses: 
actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - cache: "pip" + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' - - name: Install - # Fairseq doesn't install with pip==22.1 we need to upgrade past it. - # Also the version on pypi is from before Oct 2020. - run: | - python --version - python -m pip install --upgrade 'pip>=22.1.2' - python -m pip show pip - python -m pip install 'git+https://github.com/facebookresearch/fairseq.git@v0.12.1' - python -m pip install -e '.[dev,mono,mining]' + - name: Install + # Fairseq doesn't install with pip==22.1 we need to upgrade past it. + # Also the version on pypi is from before Oct 2020. + run: | + python --version + python -m pip install --upgrade 'pip>=22.1.2' + python -m pip show pip + python -m pip install 'git+https://github.com/facebookresearch/fairseq.git@v0.12.1' + python -m pip install -e '.[dev,mono,mining]' - - name: isort - run: isort --check --diff . - - name: black - run: black --check --diff . - - name: pytest - run: pytest - # TODO: fix type issues - - name: mypy - run: 'mypy || echo "Warning: mypy still does not pass"' + - name: isort + run: isort --check --diff . + - name: black + run: black --check --diff . 
+ - name: pytest + run: pytest + # TODO: fix type issues + - name: mypy + run: 'mypy || echo "Warning: mypy still does not pass"' diff --git a/CHANGELOG.md b/CHANGELOG.md index 16b333f..eb03757 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,20 +20,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - when mining, keep metadata about what pairs come from the forward and backward pass - when mining, choose if you want to do only forward, backward or both passes + + ### Changed - embeddings for mining are now stored in real npy files with headers - `StopesModule` is not `async` anymore, just the APIs of `Launcher`. You should write your `run` function as - a normal non-async function +a normal non-async function - mining neighbours is now optimized to have a smaller memory load - progress bar of pipelines is simplified to avoid overly busy logs - do not rely on existing line count files and compute them as part of the pipeline in the mining + ### Fixed - many improvements in the mining code - many fixes in the NMT eval pipeline + ## 1.0.0 Initial release diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index c4a3c1d..83f431e 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -14,22 +14,22 @@ appearance, race, religion, or sexual identity and orientation. 
Examples of behavior that contributes to creating a positive environment include: -- Using welcoming and inclusive language -- Being respectful of differing viewpoints and experiences -- Gracefully accepting constructive criticism -- Focusing on what is best for the community -- Showing empathy towards other community members +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members Examples of unacceptable behavior by participants include: -- The use of sexualized language or imagery and unwelcome sexual attention or - advances -- Trolling, insulting/derogatory comments, and personal or political attacks -- Public or private harassment -- Publishing others' private information, such as a physical or electronic - address, without explicit permission -- Other conduct which could reasonably be considered inappropriate in a - professional setting +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting ## Our Responsibilities diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 703968e..fad26ca 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,4 @@ # Contributing to `stopes` - We want to make contributing to this project as easy and transparent as possible. @@ -10,7 +9,6 @@ and push our changes to the open source community when we have a stable version or interesting results. ## Pull Requests - We actively welcome your pull requests. 1. Fork the repo and create your branch from `main`. 
@@ -21,14 +19,12 @@ We actively welcome your pull requests. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). ## Contributor License Agreement ("CLA") - In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Meta's open source projects. Complete your CLA here: ## Issues - We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. @@ -36,7 +32,7 @@ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe disclosure of security bugs. In those cases, please go through the process outlined on that page and do not file a public issue. -## License +## License By contributing to `stopes`, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. diff --git a/README.md b/README.md index 0625092..ddc051d 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ ![stopes](/website/static/img/banner.png?raw=true "stopes by NLLB.") + # `stopes`: A library for preparing data for machine translation research As part of the FAIR No Language Left Behind (NLLB) ([Paper](https://research.facebook.com/publications/no-language-left-behind/), [Website](https://ai.facebook.com/research/no-language-left-behind/), [Blog](https://ai.facebook.com/blog/nllb-200-high-quality-machine-translation/)) @@ -17,14 +18,12 @@ checkout the `demo` directory for an example usage with the [WMT22 Shared Task: Languages](https://statmt.org/wmt22/large-scale-multilingual-translation-task.html) data. 
## Requirements - `stopes` relies on: - -- submitit to schedule jobs when ran on clusters -- hydra-core version >= 1.2.0 for configuration -- fairseq to use LASER encoders -- PyTorch version >= 1.5.0 -- Python version >= 3.8 +* submitit to schedule jobs when ran on clusters +* hydra-core version >= 1.2.0 for configuration +* fairseq to use LASER encoders +* PyTorch version >= 1.5.0 +* Python version >= 3.8 ## Installing stopes @@ -33,13 +32,11 @@ pip for the install to work. We recommend that you first upgrade pip: `python -m pip install --upgrade pip` The mining pipeline relies on fairseq to run LASER encoders, because of competing dependency version, you'll have to first install fairseq with pip separately: - ``` pip install fairseq==0.12.1 ``` You can then install stopes with pip: - ``` git clone https://github.com/facebookresearch/stopes.git cd stopes @@ -51,7 +48,6 @@ You can choose what to install. If you are only interested in `mining`, you do n Currently `fairseq` and `stopes` require different version of hydra, so `pip` might output some warnings, do not worry about them, we want hydra>=1.1. If you plan to train a lot of NMT model you will also want to setup apex to get a faster training. - ``` git clone https://github.com/NVIDIA/apex cd apex @@ -63,19 +59,16 @@ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cud ## How `stopes` works `stopes` is made of a few different parts: - 1. `core` provides a library to write readable piplines 2. `modules` provides a set of modules using the core library and implementing common steps in our mining and evaluation pipelines 3. `pipelines` provides pipeline implementation for the data pipelines we use in NLLB: - - `monolingual` to preprocess and clean single language data - `bitext` to run the "global mining" pipeline and extract aligned sentences from two monolingual datasets. 
(inspired by [CCMatrix](https://ai.facebook.com/blog/ccmatrix-a-billion-scale-bitext-data-set-for-training-translation-models/)) - `distilation` to run our sequence-level knowledge distillation pipeline which trains a small student model from a pre-trained large teacher model (approach based on https://arxiv.org/abs/1606.07947) - 4. `eval` provides a set of evaluation tools, including ALTI+ and BLASER for text-free speech translation evaluation. 5. `demo` contains applications of stopes, including a quickstart demo that you can run at home of mining as well as a example usage of ALTI+ for toxicity and hallucination analysis. @@ -106,7 +99,6 @@ See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out. (in alphabetical order) ## Citation - If you use `stopes` in your work, please cite: ```bibtex @@ -133,5 +125,4 @@ Some of the tools in stopes, like BLASER and ALTI have their own publications, p ``` ## License - `stopes` is MIT licensed, as found in the LICENSE file. diff --git a/demo/alti/README.md b/demo/alti/README.md index 6d6cc34..0bf0d69 100644 --- a/demo/alti/README.md +++ b/demo/alti/README.md @@ -6,37 +6,32 @@ Our implementation is based on the code from the paper [Ferrando et al., 2022](h The code and readme for it are located at `stopes/eval/alti`. # Installation - To use the core ALTI+ code, you need to install Stopes with the `alti` dependencies: - ``` git clone https://github.com/facebookresearch/stopes.git cd stopes && pip install -e '.[alti]' && cd .. ``` -You will also need Fairseq. To work with [NLLB models](https://github.com/facebookresearch/fairseq/tree/nllb), +You will also need Fairseq. To work with [NLLB models](https://github.com/facebookresearch/fairseq/tree/nllb), you have to checkout the corresponding branch: - ``` git clone https://github.com/pytorch/fairseq cd fairseq && git checkout nllb && pip install -e . && python setup.py build_ext --inplace && cd .. 
``` # The minimal example (CLI with an NLLB model) - The code, configs and toy data for this example is in the `minimal_example` directory. To download the official 600M checkpoint of the [NLLB-200 model](https://github.com/facebookresearch/fairseq/tree/nllb), -run the script `download_nllb.sh` from the `minimal_example` directory; +run the script `download_nllb.sh` from the `minimal_example` directory; it will create the `nllb` directory there and download the model and the dictiory into it. The following command will read sentence pairs `test_input.tsv` and compute ALTI and ALTI-based metrics for them. -It will output various text-level scores into `test_output.tsv`, +It will output various text-level scores into `test_output.tsv`, and token-level contributions and alignments will be stored in `test_output_alignments.jsonl`. The file `test_input.tsv` contains columns `src` and `mt` with a few French-English translation pairs with different pathologies: - ``` src mt Traduction normale. A normal translation. @@ -59,11 +54,10 @@ python compute_nllb_alti.py \ tgt_lang=eng_Latn \ +preset=nllb_demo +demo_dir=$(pwd)/nllb ``` - Some arguments to the command are stored in the configuration file: `preset/nllb_demo.yaml`. You can edit this file or create another preset, if you want. -The file `test_output.tsv` contains multiple columns with various ALTI-based metrics +The file `test_output.tsv` contains multiple columns with various ALTI-based metrics (here we show only a few columns for better readability): ``` @@ -76,24 +70,22 @@ The file `test_output.tsv` contains multiple columns with various ALTI-based met 5 A transfer in which I have accessed an error. 0.61 0.40 0.14 0.00 6 A translation with with with with with with a cyclical hallucination. 
0.61 0.28 0.13 0.00 ``` - -One can see that the metrics `avg_sc`, `min_sc` and `top_sc_mean` may help to detect hallucinations, +One can see that the metrics `avg_sc`, `min_sc` and `top_sc_mean` may help to detect hallucinations, whereas `src_sum_contr_below_01` indicates incomplete translations. -The file `test_output_alignments.jsonl` contains individual subword tokens of the source and target sentences, -the raw ALTI+ contribution matrices produced for these tokens, and the alignments computed from these matrices: +The file `test_output_alignments.jsonl` contains individual subword tokens of the source and target sentences, +the raw ALTI+ contribution matrices produced for these tokens, and the alignments computed from these matrices: ``` {"contributions": [[0.52, 0.06, ...]], "alignment": [[0, 0], [0, 4], [0, 6], [1, 0], ...], "src_toks": ["__fra_Latn__", "▁Trad", "uction", "▁normale", ".", ""], "tgt_toks": ["", "__eng_Latn__", "▁A", "▁normal", "▁translation", "."], "pred_toks": ["__eng_Latn__", "▁A", "▁normal", "▁translation", ".", ""]} {"contributions": ... ... ``` - `src_toks` are the encoder inputs, `tgt_toks` are the inputs to the decoder, whereas `pred_toks` are its outputs. In fact, `tgt_toks` are `pred_toks` shifted by one position. -# Reproducing the hallucination detection experiments +# Reproducing the hallucination detection experiments The folder `detecting_hallucinations` contains the code for reproducing the experiments on hallucination detection from the paper [Detecting and Mitigating Hallucinations in Machine Translation: Model Internal Workings Alone Do Well, Sentence Similarity Even Better](https://arxiv.org/abs/2212.08597). @@ -101,6 +93,7 @@ The detailed instructions for reproduction are in that folder. 
To refer to these results, please cite: + ```bibtex @article{dale2022detecting, title={Detecting and Mitigating Hallucinations in Machine Translation: Model Internal Workings Alone Do Well, Sentence Similarity Even Better}, @@ -110,3 +103,4 @@ To refer to these results, please cite: year={2022} } ``` + diff --git a/demo/alti/detecting_hallucinations/README.md b/demo/alti/detecting_hallucinations/README.md index fb65821..665e3bb 100644 --- a/demo/alti/detecting_hallucinations/README.md +++ b/demo/alti/detecting_hallucinations/README.md @@ -1,27 +1,26 @@ # Token contributions for hallucination detection -This folder contains the code for reproducing the experiments from the paper +This folder contains the code for reproducing the experiments from the paper [Detecting and Mitigating Hallucinations in Machine Translation: Model Internal Workings Alone Do Well, Sentence Similarity Even Better](https://arxiv.org/abs/2212.08597). The structure is following: - -- `annotated_data`: - - `guerreiro2022_corpus_w_annotations.csv`: the corpus from the [Guerreiro, 2022 repository](https://github.com/deep-spin/hallucinations-in-nmt) with German-English translations annotated by pathology type. - - `annotate_hallucination_mitigation_v7_stacked.tsv`: a subset of this corpus translated by 3 improved systems and re-annottated. +- `annotated_data`: + - `guerreiro2022_corpus_w_annotations.csv`: the corpus from the [Guerreiro, 2022 repository](https://github.com/deep-spin/hallucinations-in-nmt) with German-English translations annotated by pathology type. + - `annotate_hallucination_mitigation_v7_stacked.tsv`: a subset of this corpus translated by 3 improved systems and re-annottated. - `computed_data`: should contain the files created by our code. They can either be downloaded from [this link](https://dl.fbaipublicfiles.com/nllb/hallucination_detection_data.zip) or re-computed from scratch by the code below. 
- `experiments`: the code for the experiments, organized in 5 Jupyter notebooks: - - `01_Detection.ipynb`: computing various metrics of translation quality on the aforementioned corpus. - - `02_Detection_analysis.ipynb`: evaluating the metrics computed above. - - `03_Mitigation.ipynb`: translating German sentences to English by generating multiple hypotheses with various methods and reranking them with various scores. - - `04_Mitigation_more_hypotheses.ipynb`: the same experiments as above, with fewer generation methods and a larger pool of hypotheses. - - `05_Mitigation_analysis.ipynb`: evaluating the translations computed above. + - `01_Detection.ipynb`: computing various metrics of translation quality on the aforementioned corpus. + - `02_Detection_analysis.ipynb`: evaluating the metrics computed above. + - `03_Mitigation.ipynb`: translating German sentences to English by generating multiple hypotheses with various methods and reranking them with various scores. + - `04_Mitigation_more_hypotheses.ipynb`: the same experiments as above, with fewer generation methods and a larger pool of hypotheses. + - `05_Mitigation_analysis.ipynb`: evaluating the translations computed above. The notebooks `02_Detection_analysis.ipynb` and `05_Mitigation_analysis.ipynb` reproduce most of the figures and tables mentioned in the paper. + # Setup 1. Prepare the environment by installing Fairseq, Stopes, and some extra libraries: - ``` pip install fairseq==0.12.1 git clone https://github.com/facebookresearch/stopes.git @@ -33,24 +32,21 @@ pip install -r requirements.txt 2. Set up the translation model - 2.1. Download the translation [model](https://www.mediafire.com/file/mp5oim9hqgcy8fb/checkpoint_best.tar.xz/file) and [data](https://www.mediafire.com/file/jfl7y6yu7jqwwhv/wmt18_de-en.tar.xz/file) from https://github.com/deep-spin/hallucinations-in-nmt and put them in the `model` directory. - - 2.2. 
Run the following commands to unpack the data: - - ``` - tar -xvf model/checkpoint_best.tar.xz - tar -xvf model/wmt18_de-en.tar.xz - ``` + 2.1. Download the translation [model](https://www.mediafire.com/file/mp5oim9hqgcy8fb/checkpoint_best.tar.xz/file) and [data](https://www.mediafire.com/file/jfl7y6yu7jqwwhv/wmt18_de-en.tar.xz/file) from https://github.com/deep-spin/hallucinations-in-nmt and put them in the `model` directory. - 2.3. Run the following command to download the tokenizers: + 2.2. Run the following commands to unpack the data: + ``` + tar -xvf model/checkpoint_best.tar.xz + tar -xvf model/wmt18_de-en.tar.xz + ``` - ``` - wget -P model https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.model - wget -P model https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.vocab - ``` + 2.3. Run the following command to download the tokenizers: + ``` + wget -P model https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.model + wget -P model https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.vocab + ``` 3. Download LASER2 encoder (see https://github.com/facebookresearch/LASER/tree/main/nllb) by running: - ``` mkdir laser cd laser @@ -59,7 +55,6 @@ cd .. ``` 4. Optionally, download the computed translations and scores (instead of re-computing it by notebooks 1, 3, and 4): - ``` wget https://dl.fbaipublicfiles.com/nllb/hallucination_detection_data.zip unzip hallucination_detection_data.zip @@ -67,8 +62,8 @@ unzip hallucination_detection_data.zip Now you can run any notebook from the `experiments` folder. 
-# Citation +# Citation If you use refer to this code or results in your work, please cite: ```bibtex diff --git a/demo/alti/minimal_example/download_nllb.sh b/demo/alti/minimal_example/download_nllb.sh index 632ce24..f675bc3 100644 --- a/demo/alti/minimal_example/download_nllb.sh +++ b/demo/alti/minimal_example/download_nllb.sh @@ -6,12 +6,12 @@ mkdir nllb cd nllb -# downloading the vocabulary; +# downloading the vocabulary; wget --trust-server-names https://tinyurl.com/flores200sacrebleuspm wget --trust-server-names https://tinyurl.com/nllb200dictionary # downloading the smallest NLLB200 model; it may take about 5 minutes -wget --trust-server-names https://tinyurl.com/nllb200densedst600mcheckpoint +wget --trust-server-names https://tinyurl.com/nllb200densedst600mcheckpoint for lang in ace_Latn acm_Arab acq_Arab aeb_Arab afr_Latn ajp_Arab aka_Latn als_Latn amh_Ethi apc_Arab arb_Arab ars_Arab ary_Arab arz_Arab asm_Beng ast_Latn awa_Deva ayr_Latn azb_Arab azj_Latn bak_Cyrl bam_Latn ban_Latn bel_Cyrl bem_Latn ben_Beng bho_Deva bjn_Latn bod_Tibt bos_Latn bul_Cyrl cat_Latn ceb_Latn ces_Latn cjk_Latn ckb_Arab crh_Latn cym_Latn dan_Latn deu_Latn dik_Latn dyu_Latn dzo_Tibt ell_Grek eng_Latn epo_Latn est_Latn eus_Latn ewe_Latn fao_Latn fij_Latn fin_Latn fon_Latn fra_Latn fur_Latn fuv_Latn gaz_Latn gla_Latn gle_Latn glg_Latn grn_Latn guj_Gujr hat_Latn hau_Latn heb_Hebr hin_Deva hne_Deva hrv_Latn hun_Latn hye_Armn ibo_Latn ilo_Latn ind_Latn isl_Latn ita_Latn jav_Latn jpn_Jpan kab_Latn kac_Latn kam_Latn kan_Knda kas_Arab kas_Deva kat_Geor kaz_Cyrl kbp_Latn kea_Latn khk_Cyrl khm_Khmr kik_Latn kin_Latn kir_Cyrl kmb_Latn kmr_Latn knc_Arab knc_Latn kon_Latn kor_Hang lao_Laoo lij_Latn lim_Latn lin_Latn lit_Latn lmo_Latn ltg_Latn ltz_Latn lua_Latn lug_Latn luo_Latn lus_Latn lvs_Latn mag_Deva mai_Deva mal_Mlym mar_Deva min_Latn mkd_Cyrl mlt_Latn mni_Beng mos_Latn mri_Latn mya_Mymr nld_Latn nno_Latn nob_Latn npi_Deva nso_Latn nus_Latn nya_Latn oci_Latn ory_Orya pag_Latn pan_Guru 
pap_Latn pbt_Arab pes_Arab plt_Latn pol_Latn por_Latn prs_Arab quy_Latn ron_Latn run_Latn rus_Cyrl sag_Latn san_Deva scn_Latn shn_Mymr sin_Sinh slk_Latn slv_Latn smo_Latn sna_Latn snd_Arab som_Latn sot_Latn spa_Latn srd_Latn srp_Cyrl ssw_Latn sun_Latn swe_Latn swh_Latn szl_Latn tam_Taml taq_Latn tat_Cyrl tel_Telu tgk_Cyrl tgl_Latn tha_Thai tir_Ethi tpi_Latn tsn_Latn tso_Latn tuk_Latn tum_Latn tur_Latn twi_Latn tzm_Tfng uig_Arab ukr_Cyrl umb_Latn urd_Arab uzn_Latn vec_Latn vie_Latn war_Latn wol_Latn xho_Latn ydd_Hebr yor_Latn yue_Hant zho_Hans zho_Hant zsm_Latn zul_Latn; do cp dictionary.txt dict.${lang}.txt diff --git a/demo/alti/minimal_example/preset/nllb_demo.yaml b/demo/alti/minimal_example/preset/nllb_demo.yaml index 2b46269..9c8a380 100644 --- a/demo/alti/minimal_example/preset/nllb_demo.yaml +++ b/demo/alti/minimal_example/preset/nllb_demo.yaml @@ -3,7 +3,7 @@ demo_dir: ??? DATADIR: ${demo_dir} is_multilingual: True -checkpoint: ${demo_dir}/checkpoint.pt +checkpoint: ${demo_dir}/checkpoint.pt data_dir: ${demo_dir} spm: ${demo_dir}/flores200_sacrebleu_tokenizer_spm.model src_col: src diff --git a/demo/iwslt_blaser_eval/README.md b/demo/iwslt_blaser_eval/README.md index 8d93447..641e9f4 100644 --- a/demo/iwslt_blaser_eval/README.md +++ b/demo/iwslt_blaser_eval/README.md @@ -1,3 +1,4 @@ + # Getting started with mining Welcome to `stopes`, this is a quickstart guide to discover how to run automated pipelines with `stopes`. In this example we describe how to use BLASER to evaluate speech translation as described in the https://iwslt.org/2023/s2s task. 
@@ -67,8 +68,8 @@ Make sure to replace: - `PATH_TARGET_MANIFEST.tsv` to the manifest you've generated for the translation files - `PATH_REFERENCE_MANIFEST.tsv` to the manifest you've generated for the reference translations -## Citation +## Citation If you use `blaser` in your work or any of its models, please cite: ```bibtex @@ -83,5 +84,4 @@ If you use `blaser` in your work or any of its models, please cite: ``` ## License - The `blaser` code is MIT licensed, as found in the LICENSE file in the root directory. diff --git a/demo/iwslt_blaser_eval/conf/eval_blaser.yaml b/demo/iwslt_blaser_eval/conf/eval_blaser.yaml index 2ebf638..678b125 100644 --- a/demo/iwslt_blaser_eval/conf/eval_blaser.yaml +++ b/demo/iwslt_blaser_eval/conf/eval_blaser.yaml @@ -29,12 +29,14 @@ blaser_model: config_file: ${demo_dir}/blaser_model/model.config model_checkpoint: ${demo_dir}/blaser_model/model.pt + # for this IWSLT we will to EN to CMN evaluation (source is en, tgt and reference are Mandarin) src_lang: en tgt_lang: cmn # by default tgt and ref are the same lang ref_lang: ${tgt_lang} + max_tokens: 2_560_000 # mapping from lang code to encoder checkpoint diff --git a/demo/iwslt_blaser_eval/conf/launcher b/demo/iwslt_blaser_eval/conf/launcher index c149d04..e5fb673 120000 --- a/demo/iwslt_blaser_eval/conf/launcher +++ b/demo/iwslt_blaser_eval/conf/launcher @@ -1 +1 @@ -../../../stopes/pipelines/bitext/conf/launcher +../../../stopes/pipelines/bitext/conf/launcher \ No newline at end of file diff --git a/demo/toxicity-alti-hb/ETOX/README.md b/demo/toxicity-alti-hb/ETOX/README.md index c0a1d4e..fdb55bd 100644 --- a/demo/toxicity-alti-hb/ETOX/README.md +++ b/demo/toxicity-alti-hb/ETOX/README.md @@ -3,28 +3,24 @@ Contains scripts for calculating toxicity results, given files of input strings and toxicity lists. 
Prerequisites: - - Install the HolisticBias module ([setup instructions](https://github.com/facebookresearch/ResponsibleNLP/tree/main/holistic_bias)) - Define paths to include etox.py Files: - - `ETOX example calls.ipynb`: Example Usage of the main ETOX toxicity tool functions. - `etox.py`: contains all the python functions for the ETOX tool - `README.md`: this file Functions: Main Functions: - -- `etox_single`: Takes a Pandas dataframe and a toxicity list filename, and outputs multiple dataframes of toxicity report results. -- `etox_paired`: Paired language toxicity evaluation function. Takes 2 Pandas dataframes and a toxicity list filenames, and outputs an annotated line by line labeled table of toxicity matches for further analysis. -- `etox_paired_file_wrapper`: File reading/writing wrapper for the paired language toxicity evaluation function. +- `etox_single`: Takes a Pandas dataframe and a toxicity list filename, and outputs multiple dataframes of toxicity report results. +- `etox_paired`: Paired language toxicity evaluation function. Takes 2 Pandas dataframes and a toxicity list filenames, and outputs an annotated line by line labeled table of toxicity matches for further analysis. +- `etox_paired_file_wrapper`: File reading/writing wrapper for the paired language toxicity evaluation function. Support Functions - - `load_eval_data_line_by_line` Loads a text file of strings, returns a Pandas Dataframe -- `txt_format`: simple data cleaning function. Lowercases and uses regex to remove punctuation, etc. -- `import_toxicity_list_file`: reads a toxicity list file into memory given a filename. Returns a List. -- `token_checker`: Checks for matches between a string and a toxic word list used if 'space' tokenization selected -- `substring_checker`: checks for character level matches ignoring spaces. Will find subwords. 
Used if 'character' tokenization selected -- `SPM_token_checker`: Toxic phrase checker utilizing sub-word spm_tokenization rather than simply using spaces like the stard checker. Useful for a few languages where space tokenization is unreliable, or when matching subtokens may be important. Requires the Sentencepiece library to function. +- `txt_format`: simple data cleaning function. Lowercases and uses regex to remove punctuation, etc. +- `import_toxicity_list_file`: reads a toxicity list file into memory given a filename. Returns a List. +- `token_checker`: Checks for matches between a string and a toxic word list used if 'space' tokenization selected +- `substring_checker`: checks for character level matches ignoring spaces. Will find subwords. Used if 'character' tokenization selected +- `SPM_token_checker`: Toxic phrase checker utilizing sub-word spm_tokenization rather than simply using spaces like the stard checker. Useful for a few languages where space tokenization is unreliable, or when matching subtokens may be important. Requires the Sentencepiece library to function. diff --git a/demo/toxicity-alti-hb/README.md b/demo/toxicity-alti-hb/README.md index 6c794f4..1a15de9 100644 --- a/demo/toxicity-alti-hb/README.md +++ b/demo/toxicity-alti-hb/README.md @@ -1,12 +1,11 @@ # mtoxicity-alti-holisticbias - MT toxicity at scale: deep detection and analysis. Subfolders: - -- `alti/`: contains (1) the outputs of the translation models and (2) the source contributions and word alignments for the MT outputs of holisticbias with the NLLB 3B dense model. We used the github repository: https://github.com/mt-upc/transformer-contributions +- `alti/`: contains (1) the outputs of the translation models and (2) the source contributions and word alignments for the MT outputs of holisticbias with the NLLB 3B dense model. 
We used the github repository: https://github.com/mt-upc/transformer-contributions - `analysis/`: scripts for calculating/plotting toxicity results, given (1) toxicities precomputed with ETOX and (2) ALTI+ scores. - `annotation/`: contains the false positive and the false negative analysis conducted for 8 outputs on the holisticbias toxicity detection. - `ETOX/`: contains the tool for detecting toxicity + # Contributors: Marta R. Costa-jussà, alti/ @@ -23,13 +22,13 @@ If you use toxicity-alti-hb in your work, please cite : @article{toxicity2022, -title={Toxicity in Multilingual Machine Translation at Scale}, + title={Toxicity in Multilingual Machine Translation at Scale}, -author={Costa-jussà, M.R., Smith, E, Ropers, C., Licht.,D., Ferrando, J., Escolano, C.}, + author={Costa-jussà, M.R., Smith, E, Ropers, C., Licht.,D., Ferrando, J., Escolano, C.}, -journal={Arxiv, abs/2210.03070}, + journal={Arxiv, abs/2210.03070}, -url={https://arxiv.org/abs/2210.03070.pdf}, + url={https://arxiv.org/abs/2210.03070.pdf}, -year={2022} + year={2022} } diff --git a/demo/toxicity-alti-hb/alti/README.md b/demo/toxicity-alti-hb/alti/README.md index 708df50..8fed5c5 100644 --- a/demo/toxicity-alti-hb/alti/README.md +++ b/demo/toxicity-alti-hb/alti/README.md @@ -4,18 +4,21 @@ Data can be download from: wget --trust-server-names https://tinyurl.com/toxtranslationaltioutputs + `HB-dense3B-outputs/`: 164 folder from English to 164 languages. Each folder has the non-tokenized and the spm translation output (of HolisticBias) for the NLLB 3B dense model . Example of folder for LANGX in the 164 languages: eng_Latn-LANGX/holistic.eng_Latn-LANGX eng_Latn-LANGX/spm_holistic.eng_Latn-LANGX -`HB-distilled600M-outputs`: 164 folder from English to 164 languages. Each folder has the non-tokenized and the spm translation output (of HolisticBias) for the NLLB 600M distilled model. 
Example of 1 of these folders for LANGX in the 164 languages: + +`HB-distilled600M-outputs`: 164 folder from English to 164 languages. Each folder has the non-tokenized and the spm translation output (of HolisticBias) for the NLLB 600M distilled model. Example of 1 of these folders for LANGX in the 164 languages: eng_Latn-LANGX/holistic.eng_Latn-LANGX eng_Latn-LANGX/spm_holistic.eng_Latn-LANGX + `alti-outputs/`: 164 folder from English to 164 languages. Each folder has two files: the outputs of the source contributions and alignments for the MT outputs of HolisticBias with the NLLB 3B dense model. Example 1 of these folders for LANGX in the 164 languages: eng_Latn-LANGX/output.eng_Latn-LANGX diff --git a/demo/toxicity-alti-hb/analysis/README.md b/demo/toxicity-alti-hb/analysis/README.md index ba55a37..0ed6c45 100644 --- a/demo/toxicity-alti-hb/analysis/README.md +++ b/demo/toxicity-alti-hb/analysis/README.md @@ -1,12 +1,10 @@ # Analysis code Contains scripts for calculating/plotting toxicity results, given precomputed toxicities and ALTI+ scores. Prerequisites: - - Install the HolisticBias module ([setup instructions](https://github.com/facebookresearch/ResponsibleNLP/tree/main/holistic_bias)) - Define paths for loading in pre-existing files of source/target sentences, toxicity results, ALTI+ source contribution scores, etc. in `util.py` (see `'TODO'`s) Scripts: - - `00_compile_toxicity_stats.py`: compute the course-grained analysis of toxicity as a function of language, axis, noun, template, etc. 
- `00c_plot_toxicity_per_lang.py`: plot the breakdown of toxicity across HolisticBias axes as a function of language - `01_sample_high_risk_translations.py`: sample translations likely to be toxic despite not being labeled as toxic, for the false negative analysis diff --git a/demo/toxicity-alti-hb/annotation/README.md b/demo/toxicity-alti-hb/annotation/README.md index a10d46f..2a69236 100644 --- a/demo/toxicity-alti-hb/annotation/README.md +++ b/demo/toxicity-alti-hb/annotation/README.md @@ -7,43 +7,36 @@ https://tinyurl.com/hbtoxicannotation (you might need to copy paste it in a new browser window for the download to work) The folder contains 16 TSV files, 2 files for each of the below languoids. - -- cat_Latn: Catalan -- eus_Latn: Basque -- fra_Latn: French -- pes_Arab: Western Persian -- spa_Latn: Spanish -- zho_Hans: Chinese (simplified script) -- zho_Hant: Chinese (traditional script) +* cat_Latn: Catalan +* eus_Latn: Basque +* fra_Latn: French +* pes_Arab: Western Persian +* spa_Latn: Spanish +* zho_Hans: Chinese (simplified script) +* zho_Hant: Chinese (traditional script) For each languoid, one file includes annotations for sentences where candidate toxicity was automatically detected (true|false positives), the other file includes annotations for a sample of sentences where no toxicity was automatically detected (true|false negatives). 
## Positives - Each file displays for each annotated item: - -- the BCP47 code for the input language -- the BCP47 code for the output language -- the input sentence -- the output sentence -- the detected toxicity list entry -- the TRUE | FALSE annotation (TRUE = confirmed toxicity) +* the BCP47 code for the input language +* the BCP47 code for the output language +* the input sentence +* the output sentence +* the detected toxicity list entry +* the TRUE | FALSE annotation (TRUE = confirmed toxicity) ## Negatives - Each file displays for each annotated item: - -- the BCP47 code for the input language -- the BCP47 code for the output language -- the input sentence -- the output sentence -- the TRUE | FALSE annotation (TRUE = confirmed toxicity) +* the BCP47 code for the input language +* the BCP47 code for the output language +* the input sentence +* the output sentence +* the TRUE | FALSE annotation (TRUE = confirmed toxicity) ## Confirmed toxicity - A positive detection is confirmed toxic when: - -- it matches a toxicity list entry, and: - _ it is always toxic (context-independent entries), or - _ it is assessed toxic in the context of the sentence (context-dependent entries). - A negative detection is confirmed toxic when it matches a morphological variant of a toxicity list entry. +* it matches a toxicity list entry, and: + * it is always toxic (context-independent entries), or + * it is assessed toxic in the context of the sentence (context-dependent entries). +A negative detection is confirmed toxic when it matches a morphological variant of a toxicity list entry. diff --git a/stopes/eval/alti/LICENSE.md b/stopes/eval/alti/LICENSE.md index c61b663..261eeb9 100644 --- a/stopes/eval/alti/LICENSE.md +++ b/stopes/eval/alti/LICENSE.md @@ -2,180 +2,180 @@ Version 2.0, January 2004 http://www.apache.org/licenses/ -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the 
following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - -6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - -8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- -END OF TERMS AND CONDITIONS - -APPENDIX: How to apply the Apache License to your work. + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" @@ -186,16 +186,16 @@ APPENDIX: How to apply the Apache License to your work. same "printed page" as the copyright notice for easier identification within third-party archives. -Copyright [yyyy] [name of copyright owner] + Copyright [yyyy] [name of copyright owner] -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/stopes/eval/alti/README.md b/stopes/eval/alti/README.md index 1fad0f3..0a07565 120000 --- a/stopes/eval/alti/README.md +++ b/stopes/eval/alti/README.md @@ -1 +1 @@ -../../../website/docs/eval/alti.md +../../../website/docs/eval/alti.md \ No newline at end of file diff --git a/stopes/eval/blaser/README.md b/stopes/eval/blaser/README.md index 2815067..bf7ffcb 120000 --- a/stopes/eval/blaser/README.md +++ b/stopes/eval/blaser/README.md @@ -1 +1 @@ -../../../website/docs/eval/blaser.md +../../../website/docs/eval/blaser.md \ No newline at end of file diff --git a/stopes/eval/blaser/conf/score.yaml b/stopes/eval/blaser/conf/score.yaml index 30e19d5..879eef4 100644 --- a/stopes/eval/blaser/conf/score.yaml +++ b/stopes/eval/blaser/conf/score.yaml @@ -12,4 +12,4 @@ model: batch_size: 100 use_gpu: True -output_dir: test +output_dir: test \ No newline at end of file diff --git a/stopes/pipelines/bitext/README.md b/stopes/pipelines/bitext/README.md index 1ab32a9..1e7844c 120000 --- a/stopes/pipelines/bitext/README.md +++ b/stopes/pipelines/bitext/README.md @@ -1 +1 @@ -../../../website/docs/pipelines/global_mining.md +../../../website/docs/pipelines/global_mining.md \ No newline at end of file diff --git a/stopes/pipelines/bitext/conf/embed_text/config/encoder/hf_encoder.yaml b/stopes/pipelines/bitext/conf/embed_text/config/encoder/hf_encoder.yaml index 7964cf8..68209f6 100644 --- a/stopes/pipelines/bitext/conf/embed_text/config/encoder/hf_encoder.yaml +++ b/stopes/pipelines/bitext/conf/embed_text/config/encoder/hf_encoder.yaml @@ -7,5 +7,5 @@ fp16: False # these are not used, but required as the laser3 uses it. 
but sentence transformer doesn't need them # we just set it to nothing -spm_model: "" -spm_vocab: "" +spm_model: '' +spm_vocab: '' diff --git a/stopes/pipelines/bitext/conf/eval/generate_multi_bleu_detok.yaml b/stopes/pipelines/bitext/conf/eval/generate_multi_bleu_detok.yaml index a59a0e4..4c0cb22 100644 --- a/stopes/pipelines/bitext/conf/eval/generate_multi_bleu_detok.yaml +++ b/stopes/pipelines/bitext/conf/eval/generate_multi_bleu_detok.yaml @@ -4,7 +4,7 @@ tgt_lang: ??? checkpoint_dir: ??? # directory containing checkpoints binarized_dir: ??? # contains binarized files for each lang and split output_dir: ??? -checkpoint_glob: "checkpoint[0-9]*.pt" # pattern for file checkpoints in checkpoint_dir +checkpoint_glob: "checkpoint[0-9]*.pt" # pattern for file checkpoints in checkpoint_dir beam: 5 batch_size: 32 batch_memory: 2 # mem_gb in Distributed requirements is = batch_memory * batch_size diff --git a/stopes/pipelines/bitext/conf/generate/standard_conf.yaml b/stopes/pipelines/bitext/conf/generate/standard_conf.yaml index 905c695..54ffe8a 100644 --- a/stopes/pipelines/bitext/conf/generate/standard_conf.yaml +++ b/stopes/pipelines/bitext/conf/generate/standard_conf.yaml @@ -10,3 +10,4 @@ config: beam_search: beam: 5 file_list: [] + diff --git a/stopes/pipelines/bitext/conf/mine_indexes/base.yaml b/stopes/pipelines/bitext/conf/mine_indexes/base.yaml index fcbc596..fad639b 100644 --- a/stopes/pipelines/bitext/conf/mine_indexes/base.yaml +++ b/stopes/pipelines/bitext/conf/mine_indexes/base.yaml @@ -1,24 +1,24 @@ _target_: stopes.modules.bitext.mining.mine_bitext_indexes.MineBitextIndexesModule config: - src_lang: ??? - tgt_lang: ??? - index_type: ??? + src_lang: ??? + tgt_lang: ??? + index_type: ??? - #set later in pipeline - src2tgt_dist_files: ??? - src2tgt_index_files: ??? - tgt2src_dist_files: ??? - tgt2src_index_files: ??? + #set later in pipeline + src2tgt_dist_files: ??? + src2tgt_index_files: ??? + tgt2src_dist_files: ??? + tgt2src_index_files: ??? 
- output_dir: ??? - knn_dist: 16 - src_k: 16 - tgt_k: 16 - k_extract: 1 - margin_type: ratio - mine_type: union - sort_neighbors: False - margin_norm: mean - num_probe: 128 - gpu_type: fp16-shard - mine_threshold: 1.06 + output_dir: ??? + knn_dist: 16 + src_k: 16 + tgt_k: 16 + k_extract: 1 + margin_type: ratio + mine_type: union + sort_neighbors: False + margin_norm: mean + num_probe: 128 + gpu_type: fp16-shard + mine_threshold: 1.06 diff --git a/stopes/pipelines/bitext/conf/mine_sentences/base.yaml b/stopes/pipelines/bitext/conf/mine_sentences/base.yaml index 7efaf0c..a6899b7 100644 --- a/stopes/pipelines/bitext/conf/mine_sentences/base.yaml +++ b/stopes/pipelines/bitext/conf/mine_sentences/base.yaml @@ -1,15 +1,15 @@ _target_: stopes.modules.bitext.mining.mine_bitext_sentences.MineBitextSentencesModule config: - src_lang: ??? - tgt_lang: ??? - src_text_files: ??? - src_meta_files: ??? - tgt_text_files: ??? - tgt_meta_files: ??? - alignment_file: ??? # the mined indexes, without npz extension - data: ??? - output_dir: mine.${data.data_version} - mine_threshold: 1.04 - score_max: 1.25 - dedup_bitexts: True - compress_output: True + src_lang: ??? + tgt_lang: ??? + src_text_files: ??? + src_meta_files: ??? + tgt_text_files: ??? + tgt_meta_files: ??? + alignment_file: ??? # the mined indexes, without npz extension + data: ??? + output_dir: mine.${data.data_version} + mine_threshold: 1.04 + score_max: 1.25 + dedup_bitexts: True + compress_output: True diff --git a/stopes/pipelines/bitext/conf/moses_filter/standard_conf.yaml b/stopes/pipelines/bitext/conf/moses_filter/standard_conf.yaml index 1d550b5..47d6255 100644 --- a/stopes/pipelines/bitext/conf/moses_filter/standard_conf.yaml +++ b/stopes/pipelines/bitext/conf/moses_filter/standard_conf.yaml @@ -1,4 +1,4 @@ output_dir: ??? 
-filter_ratio: 2.5 # bitexts filtering using Moses's lean-corpus-n.perl +filter_ratio: 2.5 # bitexts filtering using Moses's lean-corpus-n.perl filter_min: 1 filter_max: 250 # this means that lines longer than 250 will be deleted diff --git a/stopes/pipelines/bitext/conf/nmt_bitext_eval.yaml b/stopes/pipelines/bitext/conf/nmt_bitext_eval.yaml index b1cb791..1c4da85 100644 --- a/stopes/pipelines/bitext/conf/nmt_bitext_eval.yaml +++ b/stopes/pipelines/bitext/conf/nmt_bitext_eval.yaml @@ -9,6 +9,7 @@ defaults: - eval: generate_multi_bleu_detok - _self_ + src_lang: ??? tgt_lang: ??? # this is the bitext being evaluated. It should have 3 columns: (score, src, tgt) diff --git a/stopes/pipelines/bitext/conf/preset/demo.yaml b/stopes/pipelines/bitext/conf/preset/demo.yaml index 93d21e5..5f32f2c 100644 --- a/stopes/pipelines/bitext/conf/preset/demo.yaml +++ b/stopes/pipelines/bitext/conf/preset/demo.yaml @@ -42,7 +42,7 @@ train_index: calculate_distances: config: gpu_memory_gb: 32 - gpu_type: "" # don't use gpu + gpu_type: "" # don't use gpu # Provides info about the data. A lot of this is used to generate nice output # file names. diff --git a/stopes/pipelines/bitext/conf/spm/train/standard_conf.yaml b/stopes/pipelines/bitext/conf/spm/train/standard_conf.yaml index 81a2964..52fedad 100644 --- a/stopes/pipelines/bitext/conf/spm/train/standard_conf.yaml +++ b/stopes/pipelines/bitext/conf/spm/train/standard_conf.yaml @@ -7,7 +7,7 @@ config: character_coverage: 0.999995 model_type: "unigram" shuffle_input_sentence: True - num_threads: 20 + num_threads : 20 train_data_file: ??? 
# optional value; if passed as empty, will be auto set based on train_data_file name model_prefix_spm: "" diff --git a/stopes/pipelines/bitext/conf/train_spm/standard_conf.yaml b/stopes/pipelines/bitext/conf/train_spm/standard_conf.yaml index c5d385c..1c2ce16 100644 --- a/stopes/pipelines/bitext/conf/train_spm/standard_conf.yaml +++ b/stopes/pipelines/bitext/conf/train_spm/standard_conf.yaml @@ -7,7 +7,7 @@ seed_sentencepiece_size: 5_000_000 character_coverage: 0.999995 model_type: "unigram" shuffle_input_sentence: True -num_threads: 20 +num_threads : 20 train_data_file: ??? # optional value; if passed as empty, will be auto set based on train_data_file name model_prefix_spm: "" diff --git a/stopes/pipelines/distillation/conf/bitext_clean/default.yaml b/stopes/pipelines/distillation/conf/bitext_clean/default.yaml index fc361f1..1d56c56 100644 --- a/stopes/pipelines/distillation/conf/bitext_clean/default.yaml +++ b/stopes/pipelines/distillation/conf/bitext_clean/default.yaml @@ -2,7 +2,7 @@ language_script_filename: ${mono_pipeline.language_script_filename} split_language_equivalences_filename: ${mono_pipeline.split_language_equivalences_filename} # used for sentence splitting -split_algo: ${mono_pipeline.split_algo} +split_algo: ${mono_pipeline.split_algo} local_tmp_dir: ${local_tmp_dir} dist_tmp_dir: ${dist_tmp_dir} @@ -15,8 +15,8 @@ bitext_processor: local_tmp_dir: ${..local_tmp_dir} _version: 0.3 -outfile_prefix: "" -outfile_postfix: "" +outfile_prefix: '' +outfile_postfix: '' output_dir: ??? 
filter: diff --git a/stopes/pipelines/distillation/conf/dedup/default.yaml b/stopes/pipelines/distillation/conf/dedup/default.yaml index 609fa83..a809bac 100644 --- a/stopes/pipelines/distillation/conf/dedup/default.yaml +++ b/stopes/pipelines/distillation/conf/dedup/default.yaml @@ -17,5 +17,5 @@ field_def: 6 # on what field do we merge; update this to 7 if you have an lm sco process_locally: false # ignore: if it's faster to read locally, copy all files to tmp_dir first resort_files: false # ignore -glob: "" # ignore +glob: '' # ignore wandb: null # ignore diff --git a/stopes/pipelines/distillation/conf/distillation.yaml b/stopes/pipelines/distillation/conf/distillation.yaml index b9e9d96..9fd6af5 100644 --- a/stopes/pipelines/distillation/conf/distillation.yaml +++ b/stopes/pipelines/distillation/conf/distillation.yaml @@ -26,7 +26,7 @@ launcher: # update this if you are not running locally, used in dedup tmp_dir: /tmp # update this to be a path on a shared partition if running on slurm -merge_dir: /tmp/merge +merge_dir: /tmp/merge # update this if you are not running locally (used in mono_pipeline cleaning and bitext clean) local_tmp_dir: ${tmp_dir} @@ -39,28 +39,28 @@ lid: model_file: # path to LID model - latest_models_path: ??? + latest_models_path: ??? # optional - + probability_threshold: 0.5 lang_thresholds: - fuv: 0.3 - bis: 0.3 - ewe: 0.2 - fon: 0.2 - kam: 0.3 - kur: 0.2 - lua: 0.4 - pag: 0.4 - sag: 0.3 - ssw: 0.3 - tso: 0.4 - umb: 0.3 - vec: 0.4 - war: 0.4 - yor: 0.4 - diq: 0.4 + fuv: 0.3 + bis: 0.3 + ewe: 0.2 + fon: 0.2 + kam: 0.3 + kur: 0.2 + lua: 0.4 + pag: 0.4 + sag: 0.3 + ssw: 0.3 + tso: 0.4 + umb: 0.3 + vec: 0.4 + war: 0.4 + yor: 0.4 + diq: 0.4 label_unk: __label__unk @@ -75,4 +75,4 @@ vocab_file_path: ??? spm_model_path: ??? 
# update if you use wandb -wandb: null +wandb: null diff --git a/stopes/pipelines/distillation/conf/launcher b/stopes/pipelines/distillation/conf/launcher index 90730d4..36d1959 120000 --- a/stopes/pipelines/distillation/conf/launcher +++ b/stopes/pipelines/distillation/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher +../../bitext/conf/launcher \ No newline at end of file diff --git a/stopes/pipelines/distillation/conf/mono_pipeline/default.yaml b/stopes/pipelines/distillation/conf/mono_pipeline/default.yaml index 5f9496e..4a4aa44 100644 --- a/stopes/pipelines/distillation/conf/mono_pipeline/default.yaml +++ b/stopes/pipelines/distillation/conf/mono_pipeline/default.yaml @@ -1,7 +1,7 @@ launcher: ${launcher} dedup: ${dedup} langs: ${src_langs} -corpus_filter: "" +corpus_filter: '' data_dir: ${mono_data_dir} output_dir: . language_script_filename: language_scripts_200.tsv diff --git a/stopes/pipelines/distillation/conf/train_fairseq/params/model/transformer.yaml b/stopes/pipelines/distillation/conf/train_fairseq/params/model/transformer.yaml index fe46cf9..6ac49b4 100644 --- a/stopes/pipelines/distillation/conf/train_fairseq/params/model/transformer.yaml +++ b/stopes/pipelines/distillation/conf/train_fairseq/params/model/transformer.yaml @@ -20,7 +20,7 @@ weight_decay: 0.0001 criterion: label_smoothed_cross_entropy label_smoothing: 0.2 optimizer: adam -adam_betas: "(0.9, 0.98)" +adam_betas: '(0.9, 0.98)' clip_norm: 0.0 lr_scheduler: inverse_sqrt diff --git a/stopes/pipelines/eval/conf/launcher b/stopes/pipelines/eval/conf/launcher index 90730d4..36d1959 120000 --- a/stopes/pipelines/eval/conf/launcher +++ b/stopes/pipelines/eval/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher +../../bitext/conf/launcher \ No newline at end of file diff --git a/stopes/pipelines/filtering/README.md b/stopes/pipelines/filtering/README.md index c04537b..86833e4 100644 --- a/stopes/pipelines/filtering/README.md +++ b/stopes/pipelines/filtering/README.md @@ -1,6 +1,6 @@ # NLLB 
Bitext Filtering -_NB_: This is legacy code that is older than the rest of `stopes`. It has not been +*NB*: This is legacy code that is older than the rest of `stopes`. It has not been ported yet -- do not depend on it as it will eventually be refactored. The `filter.py` pipeline applies various filters to bitext, with optional support for @@ -13,13 +13,11 @@ respectively. Please consult the help of those scripts (by running them with `-h learn more about how to configure them. A basic run using default parameters might look like this: - ``` python filter.py \ output_dir=/home/$USER/filter_test \ data_conf_dir=/home/$USER/data_conf ``` - This command will run using the output directory and `data_conf_dir` directory (the location where the `populate_data_conf.py` and `compute_length_factors.py` scripts output their configuration files) as specified above, and will additionally load the @@ -31,7 +29,6 @@ When needing to run a new filtering job with many parameter overrides, instead o manually overriding parameters on the command line it is better to create an entirely new config file, e.g. `conf/my_config.yaml`, containing all overrides. The script can then be instructed to load it as follows: - ``` python filter.py \ --config-name=my_config \ diff --git a/stopes/pipelines/monolingual/README.md b/stopes/pipelines/monolingual/README.md index e1ae265..0863f6a 100644 --- a/stopes/pipelines/monolingual/README.md +++ b/stopes/pipelines/monolingual/README.md @@ -16,7 +16,6 @@ The core filtering is in `monolingual_line_processor.py` and `utils/text_filter. `python monolingual_pipeline.py data_dir=yourdatahere langs='[umb,ssw]'` should be enough to get it running. 
- - `data_dir` is where the raw data is, should have subfolders per lang and files named with the pattern corpus_name.lang.xz - `langs` an array of langs to process in this run diff --git a/stopes/pipelines/monolingual/conf/dedup/dedup_files.yaml b/stopes/pipelines/monolingual/conf/dedup/dedup_files.yaml index 1d197c7..16e88f0 120000 --- a/stopes/pipelines/monolingual/conf/dedup/dedup_files.yaml +++ b/stopes/pipelines/monolingual/conf/dedup/dedup_files.yaml @@ -1 +1 @@ -../dedup_files.yaml +../dedup_files.yaml \ No newline at end of file diff --git a/stopes/pipelines/monolingual/conf/launcher b/stopes/pipelines/monolingual/conf/launcher index 90730d4..36d1959 120000 --- a/stopes/pipelines/monolingual/conf/launcher +++ b/stopes/pipelines/monolingual/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher +../../bitext/conf/launcher \ No newline at end of file diff --git a/stopes/pipelines/monolingual/conf/monolingual.yaml b/stopes/pipelines/monolingual/conf/monolingual.yaml index b9fd2ba..f318e7d 100644 --- a/stopes/pipelines/monolingual/conf/monolingual.yaml +++ b/stopes/pipelines/monolingual/conf/monolingual.yaml @@ -4,7 +4,7 @@ defaults: - _self_ langs: ??? -corpus_filter: "" +corpus_filter: '' data_dir: ??? output_dir: . language_script_filename: language_scripts_200.tsv @@ -41,25 +41,26 @@ lid: latest_models_path: ??? 
probability_threshold: 0.5 lang_thresholds: - fuv: 0.3 - bis: 0.3 - ewe: 0.2 - fon: 0.2 - kam: 0.3 - kur: 0.2 - lua: 0.4 - pag: 0.4 - sag: 0.3 - ssw: 0.3 - tso: 0.4 - umb: 0.3 - vec: 0.4 - war: 0.4 - yor: 0.4 - diq: 0.4 + fuv: 0.3 + bis: 0.3 + ewe: 0.2 + fon: 0.2 + kam: 0.3 + kur: 0.2 + lua: 0.4 + pag: 0.4 + sag: 0.3 + ssw: 0.3 + tso: 0.4 + umb: 0.3 + vec: 0.4 + war: 0.4 + yor: 0.4 + diq: 0.4 label_unk: __label__unk + preprocess_buffer_size: 10_000 preproces_requirements: nodes: 1 diff --git a/stopes/pipelines/prepare_data/README.md b/stopes/pipelines/prepare_data/README.md index 98c195d..917d888 100644 --- a/stopes/pipelines/prepare_data/README.md +++ b/stopes/pipelines/prepare_data/README.md @@ -4,19 +4,19 @@ This pipeline takes in the filtered corpora text files (can be compressed), trai ## Input Config: -- fold: train, train_mining, train_mmt_bt, train_smt_bt, valid, test are possible options -- lang_dir: language direction +* fold: train, train_mining, train_mmt_bt, train_smt_bt, valid, test are possible options +* lang_dir: language direction corpora: `CorporaConfig` -: -: -: -src: -tgt: -metadata: (optional) -... -... -... + : + : + : + src: + tgt: + metadata: (optional) + ... + ... + ... Specify paths to src, tgt, and optionally metadata files per (fold, lang_dir) for each corpus. preprocessing: `PreprocessingConfig` @@ -37,18 +37,17 @@ How to launch your jobs? locally or submitit ## Run Command: Please override the default config options as required. - ``` python stopes/pipelines/prepare_data/prepare_data.py output_dir= ``` ## Pipeline Breakdown -- validate: Counts the number of lines for all parallel corpora and makes sure they're the same for src & tgt and stores train line counts statistics. -- retrieve_data: Concatenates all corpora for each (fold, lang_dir), runs Moses preprocessing over each of them as per preprocessing config and saves them to the `retrieved_data` directory. 
-- build_vocab: Samples a corpus as per sampling_config and trains an SPM on the sampled corpus. We need to sample a corpus since training an SPM on all of the corpora is time consuming. This is done jointly for src, tgt directinos by default but can be done separately as well. The trained SPM, the model file and vocab file are saved in the `vocab_bin` directory -- dedup_sharding: Deduplicates training corpora across eval corpora (valid, test) & optionally across folds as per dedup_config and shards training corpora. -- binarize: Binarizes all the sharded files (train, eval) using `MultiProcFairSeqBinarizerEncoder` and writes them to the sharded directories in the `data_bin` directory. +* validate: Counts the number of lines for all parallel corpora and makes sure they're the same for src & tgt and stores train line counts statistics. +* retrieve_data: Concatenates all corpora for each (fold, lang_dir), runs Moses preprocessing over each of them as per preprocessing config and saves them to the `retrieved_data` directory. +* build_vocab: Samples a corpus as per sampling_config and trains an SPM on the sampled corpus. We need to sample a corpus since training an SPM on all of the corpora is time consuming. This is done jointly for src, tgt directinos by default but can be done separately as well. The trained SPM, the model file and vocab file are saved in the `vocab_bin` directory +* dedup_sharding: Deduplicates training corpora across eval corpora (valid, test) & optionally across folds as per dedup_config and shards training corpora. +* binarize: Binarizes all the sharded files (train, eval) using `MultiProcFairSeqBinarizerEncoder` and writes them to the sharded directories in the `data_bin` directory. 
## Caveat diff --git a/stopes/pipelines/prepare_data/conf/launcher b/stopes/pipelines/prepare_data/conf/launcher index 90730d4..36d1959 120000 --- a/stopes/pipelines/prepare_data/conf/launcher +++ b/stopes/pipelines/prepare_data/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher +../../bitext/conf/launcher \ No newline at end of file diff --git a/stopes/pipelines/prepare_data/conf/prepare_data.yaml b/stopes/pipelines/prepare_data/conf/prepare_data.yaml index 13861ea..11be2f2 100644 --- a/stopes/pipelines/prepare_data/conf/prepare_data.yaml +++ b/stopes/pipelines/prepare_data/conf/prepare_data.yaml @@ -11,6 +11,6 @@ output_dir: ??? launcher: partition: ??? # set as null if running locally cache: - caching_dir: ${output_dir}/cache # Cache won't be re-used if you change the output_dir. + caching_dir: ${output_dir}/cache # Cache won't be re-used if you change the output_dir. corpora: ??? diff --git a/stopes/pipelines/speech/README.md b/stopes/pipelines/speech/README.md index bc0f9cd..069f4f1 100644 --- a/stopes/pipelines/speech/README.md +++ b/stopes/pipelines/speech/README.md @@ -5,7 +5,6 @@ This pipeline takes in audio data tsv files for multiple language directions and computes LASER embeddings for each language direction using the SpeechLASER model used for mining SpeechMatrix data. For each language direction, the pipeline first splits the audio data tsv file into chunks and computes the laser embeddings for each chunk separately on a node with 1 GPU asynchronously and saves a `.embeddings` file for each chunk. The format for each input audio data tsv file is: - ``` ::\t @@ -15,13 +14,11 @@ The format for each input audio data tsv file is: ``` We run the pipeline using the command: - ``` python stopes/pipelines/speech/compute_laser_embeddings.py ``` The input config: - ``` @dataclass class LaserEmbeddingConfig: @@ -35,11 +32,10 @@ class LaserEmbeddingConfig: ``` Parameters: - -- `launcher`: Config for the Stopes launcher, either `submitit` or `local`. 
Make sure you specify the partition for the launcher if you're using the `submitit` launcher. -- `max_tokens`: Determines the effective batch size for feeding in the audio waveforms. Needs to be tuned to make sure we don't OOM on the GPU. -- `checkpoint_dir`: Path to the checkpoint directory of the SpeechLASER models. -- `data_dir`: Path to the audio data tsv files in the format `_.tsv`. -- `num_chunks`: number of chunks to split the audio data tsv files. -- `lang_dirs`: comma separated string of language directions. Ex: `hr-en,ro-en,es-en` -- `out_dir`: Path to the output directory to save the embedding files. +* `launcher`: Config for the Stopes launcher, either `submitit` or `local`. Make sure you specify the partition for the launcher if you're using the `submitit` launcher. +* `max_tokens`: Determines the effective batch size for feeding in the audio waveforms. Needs to be tuned to make sure we don't OOM on the GPU. +* `checkpoint_dir`: Path to the checkpoint directory of the SpeechLASER models. +* `data_dir`: Path to the audio data tsv files in the format `_.tsv`. +* `num_chunks`: number of chunks to split the audio data tsv files. +* `lang_dirs`: comma separated string of language directions. Ex: `hr-en,ro-en,es-en` +* `out_dir`: Path to the output directory to save the embedding files. diff --git a/stopes/pipelines/speech/conf/compute_laser_embeddings.yaml b/stopes/pipelines/speech/conf/compute_laser_embeddings.yaml index da4323c..69d6e6e 100644 --- a/stopes/pipelines/speech/conf/compute_laser_embeddings.yaml +++ b/stopes/pipelines/speech/conf/compute_laser_embeddings.yaml @@ -5,4 +5,4 @@ defaults: launcher: partition: ??? # set as null if running locallyc cache: - caching_dir: ${out_dir}/cache # Cache won't be re-used if you change the out_dir. + caching_dir: ${out_dir}/cache # Cache won't be re-used if you change the out_dir. 
diff --git a/stopes/pipelines/speech/conf/launcher b/stopes/pipelines/speech/conf/launcher index 90730d4..36d1959 120000 --- a/stopes/pipelines/speech/conf/launcher +++ b/stopes/pipelines/speech/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher +../../bitext/conf/launcher \ No newline at end of file diff --git a/stopes/pipelines/tests/conf/embed_text/test_numbers_encoder.yaml b/stopes/pipelines/tests/conf/embed_text/test_numbers_encoder.yaml index 7b94775..57d58df 100644 --- a/stopes/pipelines/tests/conf/embed_text/test_numbers_encoder.yaml +++ b/stopes/pipelines/tests/conf/embed_text/test_numbers_encoder.yaml @@ -20,4 +20,4 @@ config: lang: ??? shards: ??? lang_shard_name: None - launcher: ${launcher} + launcher: ${launcher} diff --git a/stopes/pipelines/translate/conf/example.yaml b/stopes/pipelines/translate/conf/example.yaml index 2449a5c..0b83371 100644 --- a/stopes/pipelines/translate/conf/example.yaml +++ b/stopes/pipelines/translate/conf/example.yaml @@ -9,12 +9,12 @@ generation: output_dir: ??? 
preserve_filenames: true file_list: - - ["/data/monolingual.arb_Arab.txt", "arb_Arab", "arb_Latn"] - - ["/data/monolingual.arb_Arab.txt", "arb_Arab", "ary_Arab"] - - ["/data/monolingual.arb_Arab.txt", "arb_Arab", "ajp_Arab"] - - ["/data/monolingual.arb_Arab.txt", "arb_Arab", "apc_Arab"] - - ["/data/monolingual.ajp_Arab.txt", "ajp_Arab", "arb_Arab"] - - ["/data/monolingual.ajp_Arab.txt", "ajp_Arab", "arb_Latn"] - - ["/data/monolingual.ajp_Arab.txt", "ajp_Arab", "ary_Arab"] - - ["/data/monolingual.ajp_Arab.txt", "ajp_Arab", "apc_Arab"] + - [ "/data/monolingual.arb_Arab.txt", "arb_Arab", "arb_Latn"] + - [ "/data/monolingual.arb_Arab.txt", "arb_Arab", "ary_Arab"] + - [ "/data/monolingual.arb_Arab.txt", "arb_Arab", "ajp_Arab"] + - [ "/data/monolingual.arb_Arab.txt", "arb_Arab", "apc_Arab"] + - [ "/data/monolingual.ajp_Arab.txt", "ajp_Arab", "arb_Arab"] + - [ "/data/monolingual.ajp_Arab.txt", "ajp_Arab", "arb_Latn"] + - [ "/data/monolingual.ajp_Arab.txt", "ajp_Arab", "ary_Arab"] + - [ "/data/monolingual.ajp_Arab.txt", "ajp_Arab", "apc_Arab"] # format is [path, src_lang, tgt_lang] diff --git a/stopes/pipelines/translate/conf/launcher b/stopes/pipelines/translate/conf/launcher index 90730d4..36d1959 120000 --- a/stopes/pipelines/translate/conf/launcher +++ b/stopes/pipelines/translate/conf/launcher @@ -1 +1 @@ -../../bitext/conf/launcher +../../bitext/conf/launcher \ No newline at end of file diff --git a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx index f113bb6..b155484 100644 --- a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx +++ b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx @@ -136,15 +136,14 @@ function useFileNavigate() { const Files = (): JSX.Element => { - const [displayHelper, setDisplayHelper] = useState(false); + const [displayHelper, setDisplayHelper] = useState(false); const navigate = useFileNavigate(); let { filename, pageNumber, 
numberLines, files, audioBlob, error } = useLoaderData() as LoaderReturn; const [newFilename, setNewFilename] = useState( filename || config.default_path ); - - + // if we have a location, we are in a transition between two urls const navigation = useNavigation(); const locationParams = parseLocation(navigation.location); @@ -190,14 +189,14 @@ const Files = (): JSX.Element => { // Add new function to handle paste events const fileInputHandlePaste = useCallback( (evt) => { - const pastedData = evt.clipboardData.getData("text"); + const pastedData = evt.clipboardData.getData('text'); setNewFilename(pastedData); navigate(pastedData, pageNumber, numberLines); }, [navigate, pageNumber, numberLines] ); - + return (
diff --git a/stopes/ui/seamlisten/react_app/src/e2e.test.js b/stopes/ui/seamlisten/react_app/src/e2e.test.js index 57c7c2c..cf8f5ea 100644 --- a/stopes/ui/seamlisten/react_app/src/e2e.test.js +++ b/stopes/ui/seamlisten/react_app/src/e2e.test.js @@ -1,8 +1,8 @@ -const puppeteer = require("puppeteer"); +const puppeteer = require('puppeteer'); jest.setTimeout(40000); -describe("File Viewer", () => { +describe('File Viewer', () => { let browser; let page; @@ -16,35 +16,30 @@ describe("File Viewer", () => { }); beforeEach(async () => { - await page.goto("http://localhost:3000/"); + await page.goto('http://localhost:3000/'); }); - test("should display the filename input, fetch button and help button", async () => { - const filenameInput = await page.waitForSelector( - ".form-control.form-control-sm" - ); - const fetchButton = await page.waitForSelector(".btn.btn-primary", { - text: "Fetch!", - }); - const helpButton = await page.waitForSelector( - 'button[aria-controls="help-text"]' - ); + test('should display the filename input, fetch button and help button', async () => { + const filenameInput = await page.waitForSelector('.form-control.form-control-sm'); + const fetchButton = await page.waitForSelector('.btn.btn-primary', { text: 'Fetch!' 
}); + const helpButton = await page.waitForSelector('button[aria-controls="help-text"]'); expect(filenameInput).not.toBeNull(); expect(fetchButton).not.toBeNull(); expect(helpButton).not.toBeNull(); + }); - test("should display the help text when the button is clicked", async () => { + test('should display the help text when the button is clicked', async () => { // Click the help button await page.click('button[aria-controls="help-text"]'); // Wait for the help text to appear - await page.waitForSelector("#help-text", { visible: true }); + await page.waitForSelector('#help-text', { visible: true }); // Assert that the help text is visible - const helpTextVisible = await page.$eval("#help-text", (element) => { - return getComputedStyle(element).display !== "none"; + const helpTextVisible = await page.$eval('#help-text', (element) => { + return getComputedStyle(element).display !== 'none'; }); expect(helpTextVisible).toBe(true); }); diff --git a/website/docs/eval/alti.md b/website/docs/eval/alti.md index 77b5b01..9d7867d 100644 --- a/website/docs/eval/alti.md +++ b/website/docs/eval/alti.md @@ -1,19 +1,24 @@ # ALTI+ -ALTI+ is a tool for inspecting token contributions in a transformer encoder-decoder model. It might be useful for detecting hallucinated translations or undertranslations. +ALTI+ is a tool for inspecting token contributions in a transformer encoder-decoder model. +It might be useful for detecting hallucinated translations or undertranslations. -This repository is based on the code from the paper [Ferrando et al., 2022](https://arxiv.org/abs/2205.11631). The original code is located at https://github.com/mt-upc/transformer-contributions-nmt. It is licensed under the Apache 2.0 license included in the current directory. +This repository is based on the code from the paper [Ferrando et al., 2022](https://arxiv.org/abs/2205.11631). +The original code is located at https://github.com/mt-upc/transformer-contributions-nmt. 
+It is licensed under the Apache 2.0 license included in the current directory. -We have made a few adaptation to the code so that it can run with the dense NLLB-200 models. The code in this directory is licensed both under the Apache 2.0 license of the original code (in the current directory), and under the MIT license of the whole project (in the parent directory). +We have made a few adaptation to the code so that it can run with the dense NLLB-200 models. +The code in this directory is licensed both under the Apache 2.0 license of the original code (in the current directory), +and under the MIT license of the whole project (in the parent directory). # Usage +An instruction for setting up the environment and computing ALTI+ token contributions from an NLLB model +with a command line interface is present in the folder `demo/alti`. -An instruction for setting up the environment and computing ALTI+ token contributions from an NLLB model with a command line interface is present in the folder `demo/alti`. - -Below is another example, that uses a bilingual model and the Python interface. Here is how you can run it: +Below is another example, that uses a bilingual model and the Python interface. +Here is how you can run it: 1. Prepare the environment by installing Fairseq and Stopes: - ``` pip install fairseq==0.12.1 git clone https://github.com/facebookresearch/stopes.git @@ -22,18 +27,18 @@ pip install -e '.[alti]' ``` 2. Download the model and dictionary from https://github.com/deep-spin/hallucinations-in-nmt: - - model: https://www.mediafire.com/file/mp5oim9hqgcy8fb/checkpoint_best.tar.xz/file - - data: https://www.mediafire.com/file/jfl7y6yu7jqwwhv/wmt18_de-en.tar.xz/file -3. Run the following commands to unpack the data: `tar -xvf checkpoint_best.tar.xz && tar -xvf wmt18_de-en.tar.xz` + - model: https://www.mediafire.com/file/mp5oim9hqgcy8fb/checkpoint_best.tar.xz/file + - data: https://www.mediafire.com/file/jfl7y6yu7jqwwhv/wmt18_de-en.tar.xz/file +3. 
Run the following commands to unpack the data: +```tar -xvf checkpoint_best.tar.xz && tar -xvf wmt18_de-en.tar.xz``` 4. Run the following command to download the tokenizers: - ``` wget https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.model wget https://github.com/deep-spin/hallucinations-in-nmt/raw/main/sentencepiece_models/sentencepiece.joint.bpe.vocab ``` - Now you can run the following Python code to look at the ALTI analysis: + ```Python from stopes.eval.alti.wrappers.transformer_wrapper import FairseqTransformerHub from stopes.eval.alti.alti_metrics.alti_metrics_utils import compute_alti_nllb, compute_alti_metrics @@ -72,9 +77,7 @@ print(compute_alti_metrics(*compute_alti_nllb(hub, src, tgt2))['avg_sc']) # 0.4 ``` # Citation - If you use ALTI+ in your work, please consider citing: - ```bibtex @inproceedings{alti_plus, title = {Towards Opening the Black Box of Neural Machine Translation: Source and Target Interpretations of the Transformer}, diff --git a/website/docs/eval/blaser.md b/website/docs/eval/blaser.md index f005d1b..5d7d8ce 100644 --- a/website/docs/eval/blaser.md +++ b/website/docs/eval/blaser.md @@ -45,7 +45,6 @@ You will need to pass three sets of speech segments to get a blaser score: - the reference audio (`ref`) The set of speech segments have to be organised in a tsv manifest pointing to the audio files. The format for each input audio data tsv file is: - ``` \t @@ -71,8 +70,8 @@ python -m stopes.pipelines.eval.eval_blaser output_dir=YOUROUTPUTDIRECTORY src_m where `src_lang` is the language of your source audio and tgt_lang is the target language. This is used to lookup the correct encoder model as specified by `stopes/pipelines/eval/conf/eval_blaser.yaml`. You can download pre-trained encoders from the [SpeechMatrix project](https://github.com/facebookresearch/fairseq/blob/ust/examples/speech_matrix/speech_laser_encoders.md). 
By default, the encoder used for the reference is the same as the target one, you can override this with `ref_lang=..` in the command arguments. -## Citation +## Citation If you use `blaser` in your work or any of its models, please cite: ```bibtex @@ -87,5 +86,4 @@ If you use `blaser` in your work or any of its models, please cite: ``` ## License - `blaser` is MIT licensed, as found in the LICENSE file in the root directory. diff --git a/website/docs/pipelines/distillation.md b/website/docs/pipelines/distillation.md index f5afaed..765f4c9 100644 --- a/website/docs/pipelines/distillation.md +++ b/website/docs/pipelines/distillation.md @@ -5,7 +5,6 @@ sidebar_position: 3 # NLLB Distillation Pipeline Welcome to `stopes`, and thanks for checking out our sequence-level knowledge distillation pipeline. This is a quick start guide which walks through how to run the pipeline yourself and what the expected outputs will be from each step. The logic of the pipeline is at a high level as follows: - 1. cleans pre-downloaded monolingual data (see [STOPES monolingual pipeline](https://github.com/fairinternal/nllb/blob/main/website/docs/pipelines/monolingual.md#nllb-monolingual-pipeline)) - results in one merged file of data for each source language 2. shards each source language file from previous step into as many shards as number of specified target languages 3. generates target language translations for each shard from previous step using Fairseq Generate @@ -15,16 +14,17 @@ Welcome to `stopes`, and thanks for checking out our sequence-level knowledge di ## To run: -First, fill out any missing fields in distillation.yaml (labeled ???). Then, `python stopes/pipelines/distillation/distillation_pipeline.py` should be enough to get it running. +First, fill out any missing fields in distillation.yaml (labeled ???). Then, +`python stopes/pipelines/distillation/distillation_pipeline.py` should be enough to get it running. 
-You can also override distillation.yaml fields manually through the CLI as such: `python stopes/pipeliens/distillation/distillation_pipeline.py src_langs="[eng,mai]" tgt_langs="[fra,deu]" mono_data_dir= output_dir=`. +You can also override distillation.yaml fields manually through the CLI as such: +`python stopes/pipeliens/distillation/distillation_pipeline.py src_langs="[eng,mai]" tgt_langs="[fra,deu]" mono_data_dir= output_dir=`. For internal FAIR users, feel free to add the `+fb_preset=nllb` argument to the CLI command to use some preset config settings. Note: Testing performance can be done with a separate STOPES module, `/stopes/modules/evaluation/generate_multi_bleu_detok_module.py`. ## Useful overrides - - `src_langs` is an array of source languages you have pre-downloaded monolingual data for - `tgt_langs` is an array of target languages you want to train the student model to translate to - `mono_data_dir` is the path to pre-downloaded monolingual data @@ -42,7 +42,6 @@ Please be aware that at every intermediate step, the program will overwrite file The run will be started with a custom working directory that follows the pattern: `outputs/{date}/{start_time}`, all the logs will go there (including executor_logs from slurm jobs). By default, the data output is set in `distillation.yaml` to be `output_dir: .` this means that the outputs will go to the working directory and will go to different places depending on the day/time you start the run. This is useful for testing, but if you want to output somewhere else (like a central clean monolingual repo), override the `output_dir=/somethingstable/` when starting the run. ### Raw input monolingual file: - ``` ~/test_inputs/eng % cat test.eng @@ -54,8 +53,15 @@ BlackBerry Z10 To Launch In South Africa Tomorrow - Blackberry Empire http://www ``` ### Example file output of monolingual_pipeline before dedup: - -Parsed in column format: 1. self.corpus, # the original corpus name 2. 
self.offset_start, # skip that many bytes (use dd) 3. line_id, # after skipping, go to line 4. line_hash, # xxhash.xxh3_64 of the original line/paragrph 5. f"{prob_lang:.5f}", # lid score 6. clean, # sentence # config sep="\t" +Parsed in column format: + 1. self.corpus, # the original corpus name + 2. self.offset_start, # skip that many bytes (use dd) + 3. line_id, # after skipping, go to line + 4. line_hash, # xxhash.xxh3_64 of the original line/paragrph + 5. f"{prob_lang:.5f}", # lid score + 6. clean, # sentence + # config + sep="\t" ``` ~/test_outputs/mono_data/eng @@ -73,7 +79,6 @@ test 0 __label__eng 0.37420 BlackBerry Z10 To Launch In South Africa Tomorrow - ``` ### Example file output of dedup - ``` % cat eng_all_dedup test 1056 0 4426603632439174366 0.71947 202-458-1769 Joie Olverson - Spring House Ln, Washington, District of Columbia @@ -85,7 +90,6 @@ test 443 0 3451732902557484365 0.83896 no down payment auto insurance in Scottsd ``` ### Example file output of shard - ``` % cat shard.000 test 1056 0 4426603632439174366 0.71947 202-458-1769 Joie Olverson - Spring House Ln, Washington, District of Columbia @@ -97,9 +101,7 @@ test 443 0 3451732902557484365 0.83896 no down payment auto insurance in Scottsd ``` ### Example file output of generate - Target generated data: - ``` test 1056 0 4426603632439174366 0.71947 202-458-1769 Joie Olverson - Spring House Ln, Washington, District de Columbia test 692 0 8327890826167111651 0.83095 Une question de priorités: réforme démocratique et reprise économique en Allemagne d'après-guerre Auteur: Rebecca L. 
Boehling TiersD'occasion8,25€202,25€ @@ -110,9 +112,7 @@ test 443 0 3451732902557484365 0,83896 aucun acompte d'assurance automobile à S ``` ### Example file output of bitext clean - The contents of the filtered `clean.eng-fra.eng.000.xz` and `clean.eng-fra.fra.000.xz` files are respectively: - ``` test 692 0 8327890826167111651 0.83095 A Question of Priorities: Democratic Reform and Economic Recovery in Postwar Germany Auteur: Rebecca L. Boehling TiersD'occasion8,25€202,25€ test 0 0 12930410217004390762 0.90479 Appealing Accent Chair And Ottoman and Petra Fabric Accent Chair With Ottoman Furniture Home Decoration - Lilangels Furniture @@ -120,7 +120,6 @@ test2 0 0 5374428323341487497 1.00001 He has a cat. test2 0 0 5374428323341487497 0.99987 Hello the president is here! test 443 0 3451732902557484365 0.83896 no down payment auto insurance in Scottsdale AZ ``` - ``` target_data 0 15 9889559120183218255 0.97691 Une question de priorités: réforme démocratique et reprise économique en Allemagne d'après-guerre Auteur: Rebecca L. 
Boehling TiersD'occasion8,25€202,25€ target_data 28 39 7358542291591603186 0.98684 Chaise d'accent attrayante et chaise d'accent du tissu ottoman et du tissu de Petra avec meubles ottomans décoration de la maison - Lilangels meubles @@ -130,17 +129,14 @@ target_data 56 78 15942782228027469307 0.97898 aucun acompte d'assurance automob ``` Meanwhile, the contents of the two discarded output files `discarded.eng-fra.eng.000.xz` and `discarded.eng-fra.fra.000.xz` are respectively: - ``` test 1056 0 4426603632439174366 0.71947 202-458-1769 Joie Olverson - Spring House Ln, Washington, District of Columbia ``` - ``` gen_shard 0 __label__eng 0.32102 202-458-1769 Joie Olverson - Spring House Ln, Washington, District de Columbia ``` ### Example file output of binarizing and encoding - ``` train.eng-fra.eng.000.bin train.eng-fra.eng.001.idx train.eng-fra.eng.003.bin train.eng-fra.eng.000.idx train.eng-fra.eng.002.bin train.eng-fra.eng.003.idx @@ -148,7 +144,6 @@ train.eng-fra.eng.001.bin train.eng-fra.eng.002.idx ``` ### Example file output of train - ``` -rw-rw-r-- 1 $USER $USER 4.2G Aug 3 12:05 checkpoint_best.pt -rw-rw-r-- 1 $USER $USER 4.2G Aug 3 12:05 checkpoint_last.pt diff --git a/website/docs/pipelines/global_mining.md b/website/docs/pipelines/global_mining.md index 5d7bea1..8226c53 100644 --- a/website/docs/pipelines/global_mining.md +++ b/website/docs/pipelines/global_mining.md @@ -8,16 +8,18 @@ sidebar_position: 1 You can launch the mining for a pair of languages with the following command: + ```bash python -m stopes.pipelines.bitext.global_mining_pipeline src_lang=fuv tgt_lang=zul demo_dir=.../stopes-repo/demo +preset=demo output_dir=. embed_text=laser2 ``` - (see the demo doc for a quick understanding of the `+preset` override) + This will run the required steps and try to re-use whatever step outputs has already been computed. So if you run this exact command multiple times (e.g. 
after a pre-emption in slurm), it will start from where it failed instead of recomputing everything. Here is an example log: + ``` [global_mining][INFO] - output: ....../mining/global_mining/outputs/2021-11-02/08-56-40 [global_mining][INFO] - working dir: ....../mining/global_mining/outputs/2021-11-02/08-56-40 @@ -36,30 +38,37 @@ Here is an example log: [train_faiss_index][INFO] - lang=hi, sents=162844151, required=40000000, index type=OPQ64,IVF65536,PQ64 ``` + We can see that the launcher has found out that it doesn't need to run the encode and train index steps for the bn lang (source language) and can skip straight to populating the index with embeddings, but it also already processed 44 shards for that step, so will only re-schedule jobs for 11 shards. In parallel, it is also processing the target language (hi) and found that it still needs to run the index training step as it also recovered all the encoded shards. If you are using slurm as the launcher instead of the local setting, the pipeline also takes care of communicating with slurm, waiting for all slurm jobs to finish and synchronizing the consecutive jobs. See below on how to run single steps for debugging. You can run the whole pipeline locally with: + ```bash python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg launcher.cluster=local ``` + + # Understanding the Configuration -The configuration is driven by [Hydra](https://hydra.cc/), this makes it sound way more complicated than it actually is. The first main difference is how the command line arguments are specified. Instead of using the `--arg=foobar` standard notation, Hydra introduces its [own notation](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax) to be able to have a more complete syntax. This is indeed odd, but once you are used to it, it provides a lot of benefits. +The configuration is driven by [Hydra](https://hydra.cc/), this makes it sound way more complicated than it actually is. 
The first main difference is how the command line arguments are specified. Instead of using the `--arg=foobar` standard notation, Hydra introduces its [own notation](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax) to be able to have a more complete syntax. This is indeed odd, but once you are used to it, it provides a lot of benefits. A second big change is that most of the things that can be changed in the pipeline are driven by yaml configuration files instead of having to change the script files. These configuration files are checked in and you can override them on the command line (see the examples above). The pipeline will log the actual full config+overrides in the output folder when you do a run, so that you can always look at the config that was used to generate a particular data folder. The third major change, and main benefit, is that the configs are split in "groups" (hydra terminology) and you can override a whole group with another yaml file with a very simple syntax. For instance, the embed_text step has a set of pre-made configs in `global_mining/conf/embed_text` and you can swap between them. If you would like to make a new reusable/shared config for embed_text, you could put a new yaml file in that that folder (let say `global_mining/conf/embed_text/foobar.yaml`) and select it from the cli with: + ```bash python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg embed_text=foobar ``` + See the Data and Modules discussion below for more examples. + ## Outputs and Working Dir The output of the pipeline is set in the global_mining.yaml to be ".", which means the current working directory. When running `global_mining_pipeline.py` it will by default create a new folder under `outputs/today_date/timeofrun` and make this your working directory. This means all your logs will be well organized. 
It also means that the main output of each step will go under that directory given the default configuration of `output_dir: .` @@ -68,44 +77,52 @@ Because you might run the pipeline multiple times for the same "data run" (e.g. It's therefore a good idea when you are doing a full run (not just testing), to specify a fixed outputs directory when launching the pipeline: + ```bash python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg output_dir=/myfinal/data/outputs ``` + This way logs and other temp files will go to the working directory, but the data will go to a clean central place. + ## Data The current data configuration for the pipeline takes a few parameters: -- data_version -- iteration -- data_shard_dir -- shard_type -- bname + + +* data_version +* iteration +* data_shard_dir +* shard_type +* bname Because you will most often always use the same data for your runs, there is no need to specify this every time on the CLI or in the default config. There is a "group" under `global_mining/conf/data` where you can put common data sources. Checkout the demo config to see how to configure data. You can create a data config folder if you want to switch data without changing all other presets. + # Modules The pipeline is made of seven main steps: -- split_in_shards (optional) -- embed_text -- train_index -- populate_index -- merge_index -- calculate_distances -- mine_indexes -- mine_sentences -- merge_shards (optional) +* split_in_shards (optional) +* embed_text +* train_index +* populate_index +* merge_index +* calculate_distances +* mine_indexes +* mine_sentences +* merge_shards (optional) Each of them is configured as a "group" and their configurations can be overridden by switching groups on the cli as explained above. This override can also completely switch the code/module that is being used to compute this step, without changing the pipeline itself. 
+ **Embedding Modules** You can switch the actual encoder being used to choose between multiple encoders. For example, you can choose to use LaBSE, BERT, RoBERTa, or any other model from the sentence-transformers repo within the HuggingFace Model Hub ([https://huggingface.co/sentence-transformers](https://huggingface.co/sentence-transformers)). Here’s an example of how to encode text using LaBSE (with encoder-specific options in blue): + ```bash python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg embed_text=hf_roberta_large ``` @@ -115,7 +132,6 @@ python global_mining_pipeline.py src_lang=bn tgt_lang=hi +data=ccg embed_text=h ``` or you can choose any huggingface encoder by their name with: - ```bash python global_mining_pipeline.py -c job src_lang=bn tgt_lang=hi +data=ccg embed_text=huggingface embed_text.encoder_model=sentence-transformers/LaBSE ``` @@ -128,14 +144,14 @@ embed_text.config.encoder.encoder_model=path_to_laser_model embed_text.config.encoder.spm_model=path_to_spm_model ``` -### Splitting and merging languages -For some large languages, the mining might fail because of out-of-memory errors, especially if the FAISS indexes are stored on GPU. To mitigate this probelm, you can split a language into shards, perform the mining on them in parallel, and then merge the results. +### Splitting and merging languages +For some large languages, the mining might fail because of out-of-memory errors, especially if the FAISS indexes are stored on GPU. To mitigate this probelm, you can split a language into shards, perform the mining on them in parallel, and then merge the results. -The first optional module, `split_in_shards`, can randomly split the language (inclusing both text files and metadata files, if they exist) into several shards. To use this option, you should specify the parameter `max_shard_size`, and the languages with more total lines than this number will be automatically split into smaller shards. 
+The first optional module, `split_in_shards`, can randomly split the language (inclusing both text files and metadata files, if they exist) into several shards. +To use this option, you should specify the parameter `max_shard_size`, and the languages with more total lines than this number will be automatically split into smaller shards. Alternatively, you can manually split the data for the language and configure it as several separate "languages", e.g. `eng0,eng1,eng2`. In this case, you can indicate in the mining config that they should be merged into a single language after mining: - ``` sharded_langs: eng: @@ -154,13 +170,15 @@ One of the benefits of the hydra cli override syntax, is that you can ask hydra For instance, if you would like to run the pipeline on multiple languages, you can do: + ```bash python global_mining_pipeline.py -m src_lang=en tgt_lang=bn,hi +data=ccg ``` + The `-m` parameter tells the pipeline to start with multi-run and `tgt_lang=bn,hi` tells it to make two runs, one for en-bn and one for en-hi. -You could also sweep over the lang and the encoders with: + You could also sweep over the lang and the encoders with: ```bash python global_mining_pipeline.py -m src_lang=en tgt_lang=bn,hi +data=ccg embed_text=hf_roberta_large,hf_labse @@ -193,24 +211,23 @@ launcher.cache.caching_dir=/path/to/cache \ maximum_epoch=20 ``` -**NOTE**: In order for the training pipeline to know which column of the bitext corresponds to the selected `src_lang` and `tgt_lang`, it presumes that the two text columns in the bitext are ordered by their sorted language names. For example, for a `eng-lin` bitext, the format is: alignment-score [tab] english-text [tab] lingala-text (not alignment-score [tab] lingala-text [tab] english-text). +**NOTE**: In order for the training pipeline to know which column of the bitext corresponds to the selected `src_lang` and `tgt_lang`, it presumes that the two text columns in the bitext are ordered by their sorted language names. 
For example, for a `eng-lin` bitext, the format is: alignment-score [tab] english-text [tab] lingala-text (not alignment-score [tab] lingala-text [tab] english-text). ## Outputs The NMT pipeline will create the following directories in the specified `output_dir`: - - `bin_dir`: moses preprocessed, spm-encoded, and binarized data. - `trained_models`: checkpoints from `fairseq-train`. **Note**: this directory will also contain files containing the outputs of both `fairseq-generate` (files ending in `.out`) and the corresponding BLEU evaluations for each checkpoint (files ending in `.bleu`). ## Evaluation data -To find the evaluation data for your chosen languages, `stopes` needs to know the relevant path. See `path` in `stopes/pipelines/bitext/conf/preproc_binarize_mined/standard_conf.yaml`. Currently it defaults to the format of the `flores200` dataset. To use this, please [download flores200](https://github.com/facebookresearch/flores/tree/main/flores200). +To find the evaluation data for your chosen languages, `stopes` needs to know the relevant path. See `path` in `stopes/pipelines/bitext/conf/preproc_binarize_mined/standard_conf.yaml`. Currently it defaults to the format of the `flores200` dataset. To use this, please [download flores200](https://github.com/facebookresearch/flores/tree/main/flores200). ## Example overrides **Spm training** -`spm.train.config.vocab_size=7000` +```spm.train.config.vocab_size=7000``` **Model configuation** diff --git a/website/docs/pipelines/monolingual.md b/website/docs/pipelines/monolingual.md index d640a05..e611a4c 100644 --- a/website/docs/pipelines/monolingual.md +++ b/website/docs/pipelines/monolingual.md @@ -20,7 +20,6 @@ The core filtering is in `monolingual_line_processor.py` and `utils/text_filter. `python monolingual_pipeline.py data_dir=yourdatahere langs='[umb,ssw]'` should be enough to get it running. 
- - `data_dir` is where the raw data is, should have subfolders per lang and files named with the pattern corpus_name.lang.xz - `langs` an array of langs to process in this run diff --git a/website/docs/quickstart.md b/website/docs/quickstart.md index 8caa1ea..85009b6 100644 --- a/website/docs/quickstart.md +++ b/website/docs/quickstart.md @@ -4,7 +4,9 @@ sidebar_position: 1 # Getting started with mining -Welcome to `stopes`, this is a quickstart guide to discover how to run automated pipelines with `stopes`. In this example, you'll be running global mining with the `stopes` toolchain. (Inspired by [CCMatrix](https://ai.facebook.com/blog/ccmatrix-a-billion-scale-bitext-data-set-for-training-translation-models/)). +Welcome to `stopes`, this is a quickstart guide to discover how to run automated pipelines with `stopes`. In this example, you'll be running +global mining with the `stopes` toolchain. (Inspired by +[CCMatrix](https://ai.facebook.com/blog/ccmatrix-a-billion-scale-bitext-data-set-for-training-translation-models/)). ## Installation @@ -12,11 +14,16 @@ Follow the installation steps from the [project's README](https://github.com/fac ## Getting Data -To run the global mining pipeline, you first need to get some monolingual data. The [WMT22 Shared Task: Large-Scale Machine Translation Evaluation for African Languages](https://statmt.org/wmt22/large-scale-multilingual-translation-task.html) has some interesting monolingual data for some African languages. +To run the global mining pipeline, you first need to get some monolingual data. +The [WMT22 Shared Task: Large-Scale Machine Translation Evaluation for African +Languages](https://statmt.org/wmt22/large-scale-multilingual-translation-task.html) +has some interesting monolingual data for some African languages. -You also need some trained encoder, we usually use `stopes` with LASER and we can find such trained encoders for the languages in the WMT22 shared task too. 
+You also need some trained encoder, we usually use `stopes` with LASER and we can +find such trained encoders for the languages in the WMT22 shared task too. -The `demo/mining/prepare.sh` script will download the monolingual data and LASER encoders for you. Start by running this script and wait for the download to finish. +The `demo/mining/prepare.sh` script will download the monolingual data and LASER encoders +for you. Start by running this script and wait for the download to finish. :::tip @@ -26,18 +33,26 @@ The `demo/mining/prepare.sh` script will download the monolingual data and LASER ## Configuring the pipeline -In `stopes` pipelines, we use [hydra](https://hydra.cc/) to configure the runs. With hydra, you can configure everything with "overrides" on the cli, but it's often easier to put the configurations in yaml files as there is a lot of things to setup. +In `stopes` pipelines, we use [hydra](https://hydra.cc/) to configure the runs. +With hydra, you can configure everything with "overrides" on the cli, but it's +often easier to put the configurations in yaml files as there is a lot of things +to setup. -`stopes/pipelines/bitext/conf/preset/demo.yaml` is a demo configuration for the data and encoders that we've downloaded in the previous steps. Check out the comments in that file. +`stopes/pipelines/bitext/conf/preset/demo.yaml` is a demo configuration for the +data and encoders that we've downloaded in the previous steps. Check out the +comments in that file. The important parts of that preset config is: - 1. we setup the launcher to run on your local computer (no need for a cluster) -2. we setup an alias for a `demo_dir` folder, so you can point to the data/models from the cli +2. we setup an alias for a `demo_dir` folder, so you can point to the + data/models from the cli 3. we setup some information about the `data`: - some naming, to get nice file names as outputs - where the data is found (with `shard_glob`) -4. 
we tell the pipeline where to find the encoder and SentencePiece model (SPM) uses to embed the text. We do that for each lang in `lang_configs`. Practically, if you are only processing a few languages, you don't need so many entries, here we preset them for all languages from the WMT22 task +4. we tell the pipeline where to find the encoder and SentencePiece model (SPM) uses + to embed the text. We do that for each lang in `lang_configs`. Practically, + if you are only processing a few languages, you don't need so many entries, + here we preset them for all languages from the WMT22 task :::tip @@ -48,20 +63,30 @@ Language codes are important, but not standardized everywhere. The `stopes` libr ## Run the Pipeline You can now start the pipeline with: - ```bash python -m stopes.pipelines.bitext.global_mining_pipeline src_lang=fuv tgt_lang=zul demo_dir=.../stopes-repo/demo/mining +preset=demo output_dir=. embed_text=laser3 ``` - `src_lang` and `tgt_lang` specify the pair of languages we want to process, -- `demo_dir` is the new variable we introduce in our preset/demo.yaml file, to point to where the `prepare.sh` script downloaded our data; make sure to specify an absolute path, -- `+preset=demo` tells hydra to load the demo.yaml preset file to set our defaults (the `+` here is because we are telling hydra to append a group that doesn't exist in the default config, see the [hydra doc](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax) for details), +- `demo_dir` is the new variable we introduce in our preset/demo.yaml file, to + point to where the `prepare.sh` script downloaded our data; make sure to + specify an absolute path, +- `+preset=demo` tells hydra to load the demo.yaml preset file to set our + defaults (the `+` here is because we are telling hydra to append a group that + doesn't exist in the default config, see the [hydra + doc](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax) + for details), - 
`output_dir` specifies where we want the output (current run directory), -- `embed_text=laser3` tells the pipeline to use the laser3 encoding code to load the models and encode the text. +- `embed_text=laser3` tells the pipeline to use the laser3 encoding code to load + the models and encode the text. ## Try using a different encoder -In the previous run, we used `embed_text=laser3`, which will encode text with the language specific laser3, but you can also use other encoders. For instance, stopes ships with [HuggingFace sentence-transformers](https://huggingface.co/sentence-transformers), so you can use different encoders if you want to experiment. +In the previous run, we used `embed_text=laser3`, which will encode text with +the language specific laser3, but you can also use other encoders. For instance, +stopes ships with [HuggingFace +sentence-transformers](https://huggingface.co/sentence-transformers), so you can +use different encoders if you want to experiment. You need to install `sentence-transformers` in your environment: @@ -80,7 +105,6 @@ python -m stopes.pipelines.bitext.global_mining_pipeline src_lang=fuv tgt_lang=z ## Explore More Check out these docs to learn more: - - [Prebuilt Pipelines](category/prebuilt-pipelines) - [`stopes` Module Framework](stopes) diff --git a/website/docs/stopes/advanced/checkpointing.md b/website/docs/stopes/advanced/checkpointing.md index 16edfe9..72ee42f 100644 --- a/website/docs/stopes/advanced/checkpointing.md +++ b/website/docs/stopes/advanced/checkpointing.md @@ -4,4 +4,9 @@ sidebar_position: 1 # Checkpointing (advanced) -When using SLURM, the StopesModule system uses submitit to schedule the jobs. This means that you can leverage the checkpointing feature it offers. This allows you to store the state of the current module when its job gets preempted or times out. See the [submitit doc](https://github.com/facebookincubator/submitit/blob/main/docs/checkpointing.md) for more details. 
+When using SLURM, the StopesModule system uses submitit to schedule the jobs. +This means that you can leverage the checkpointing feature it offers. This +allows you to store the state of the current module when its job gets preempted +or times out. See the [submitit +doc](https://github.com/facebookincubator/submitit/blob/main/docs/checkpointing.md) +for more details. diff --git a/website/docs/stopes/advanced/debugging.md b/website/docs/stopes/advanced/debugging.md index 529287e..ae3c33c 100644 --- a/website/docs/stopes/advanced/debugging.md +++ b/website/docs/stopes/advanced/debugging.md @@ -6,29 +6,45 @@ sidebar_position: 3 You can launch an individual module with: + ```bash python launch.py +module=my_module ``` -Where `my_module` is the name of the config file you want to use. This is useful for debugging, usually. -The launcher is configured in `global_mining/conf/main_conf.yaml` +Where `my_module` is the name of the config file you want to use. This is useful +for debugging, usually. + +The launcher is configured in +`global_mining/conf/main_conf.yaml` You should not have to change this config to run your module. -The `+module= `argument is the way to tell hydra to pick up your module config file. The launcher will use the `_target_` directive in the module to initialize the correct module and then pass the right config. +The `+module= `argument is the way to tell hydra to pick up your module config +file. The launcher will use the `_target_` directive in the module to initialize +the correct module and then pass the right config. + +You can override any part of the configuration with [normal hydra +overrides](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax). +For example, if you wanted to specify the lang parameter for your module, you +can do: -You can override any part of the configuration with [normal hydra overrides](https://hydra.cc/docs/1.0/advanced/override_grammar/basic/#basic-override-syntax). 
For example, if you wanted to specify the lang parameter for your module, you can do: ```bash python launch.py +module=my_config module.config.lang=luo ``` -The `module/my_config.yaml` file will be loaded and then the lang will be overridden. This will create a new config for you. -The launcher will then run your module and dump the full config (with overrides) in the outputs folder. +The `module/my_config.yaml` file will be loaded and then the lang will be +overridden. This will create a new config for you. + +The launcher will then run your module and dump the full config (with overrides) +in the outputs folder. + +To do more advanced debugging, remember that a module is just a normal python +object that you can run as any python callable. You can therefore just call +your module from the REPL or a notebook with: -To do more advanced debugging, remember that a module is just a normal python object that you can run as any python callable. You can therefore just call your module from the REPL or a notebook with: ```python module = MyModule(config) diff --git a/website/docs/stopes/advanced/dynamic.md b/website/docs/stopes/advanced/dynamic.md index 8c2bfe7..93ebe50 100644 --- a/website/docs/stopes/advanced/dynamic.md +++ b/website/docs/stopes/advanced/dynamic.md @@ -4,40 +4,60 @@ sidebar_position: 2 # Dynamic Initializing Modules (advanced) -It is easy to initialize a module like a normal python class with` MyModule(config)`. However this would make your pipeline static as the module couldn't be swapped. +It is easy to initialize a module like a normal python class with` +MyModule(config)`. However this would make your pipeline static as the module +couldn't be swapped. -**Problem:** For instance, imagine that your pipeline has an _embed_ step that takes raw text as input and outputs an embedding of that text. -You might want to test different embedding methods, let's say, compare the LASER implementation with a HuggingFace encoder. 
+**Problem:** For instance, imagine that your pipeline has an _embed_ step that +takes raw text as input and outputs an embedding of that text. -If you wrote your code with `encoder = LaserEncoderModule(config)` you will not be able to swap this step to use the `HFEncoderModule` without changing the code of your pipeline. +You might want to test different embedding methods, let's say, compare the LASER +implementation with a HuggingFace encoder. + +If you wrote your code with `encoder = LaserEncoderModule(config)` you will not +be able to swap this step to use the `HFEncoderModule` without changing the code +of your pipeline. + +**Solution**: Because we are using Hydra, we have an easy way to specify modules +in config and override them when calling the pipeline. All you have to do is to +use: -**Solution**: Because we are using Hydra, we have an easy way to specify modules in config and override them when calling the pipeline. All you have to do is to use: ```python embed_module = StopesModule.build(self.config.embed_text, lang=lang) ``` -The `build` helper will find the `_target_` entry in the embed_text config and initialize the module that it points to. The `kwargs` of build can be used to specify in code a specific value of the config. -Thanks to `build`, we can now have two config files in the embed_text group that will point to the different modules: +The `build` helper will find the `_target_` entry in the embed_text config and +initialize the module that it points to. The `kwargs` of build can be used to +specify in code a specific value of the config. + +Thanks to `build`, we can now have two config files in the embed_text group that +will point to the different modules: + + + ```yaml title="laser_module.yaml" # @package module _target_: modules.LaserEncoderModule config: - lang: ??? + lang: ??? ``` + + ```yaml title="hf_module.yaml" # @package module _target_: modules.HFEncoderModule config: - lang: ??? + lang: ??? 
``` + And you can override this module from the cli: ```bash @@ -45,4 +65,6 @@ python yourpipeline.py embed_text=hf_module src_lang=bn tgt_lang=hi +data=ccg ``` -This does look a bit odd at first, but look at the implementation of global_mining to see how it flows and how modules are used and config/data is passed around. +This does look a bit odd at first, but look at the implementation of +global_mining to see how it flows and how modules are used and config/data is +passed around. diff --git a/website/docs/stopes/cache.md b/website/docs/stopes/cache.md index d6bcade..410a107 100644 --- a/website/docs/stopes/cache.md +++ b/website/docs/stopes/cache.md @@ -4,20 +4,34 @@ sidebar_position: 5 # Caching/Memoization -An important part of the launcher is its caching system. When you call the schedule method with a configured module, the launcher will check if this configuration was already run in the past and reuse the results when possible. The cache is indexed on the configuration of the module, so if you change anything in the configuration input, the module will be executed from scratch and the new result will be cached with a different key. It's also important to remember that all inputs to the module that could change its results (and thus the caching) should be specified in the config input. - -If you change the code of your module to a point that would change its output, you can implement the `version() `method to return a new value so that the cache knows that it needs to recompute from scratch even from known configs. - -You can also implement the`validate()`method to check the outputs from your module and from the cache if you want to actively invalidate the cache. For example, if it’s known how many lines are to be embedded into a particular dimension (say 1024), you can validate that the output file size is e.g. `num_lines * 1024 * float32.` - -Here is an example of rerunning the global mining pipeline that was interrupted in the middle. 
The caching layer recovers what was already executed successfully. This was started with the same command that would require a full run: - +An important part of the launcher is its caching system. When you call the +schedule method with a configured module, the launcher will check if this +configuration was already run in the past and reuse the results when possible. +The cache is indexed on the configuration of the module, so if you change +anything in the configuration input, the module will be executed from scratch +and the new result will be cached with a different key. It's also important to +remember that all inputs to the module that could change its results (and thus +the caching) should be specified in the config input. + +If you change the code of your module to a point that would change its output, +you can implement the `version() `method to return a new value so that the cache +knows that it needs to recompute from scratch even from known configs. + +You can also implement the` validate() `method to check the outputs from your +module and from the cache if you want to actively invalidate the cache. For +example, if it’s known how many lines are to be embedded into a particular +dimension (say 1024), you can validate that the output file size is e.g. +`num_lines * 1024 * float32.` + +Here is an example of rerunning the global mining pipeline that was interrupted +in the middle. The caching layer recovers what was already executed +successfully. 
This was started with the same command that would require a full
+run:

 ```bash
 python yourpipeline.py src_lang=bn tgt_lang=hi +data=ccg
 ```

 Here are the logs:
-
 ```
 [global_mining][INFO] - output: .../global_mining/outputs/2021-11-02/08-56-40
 [global_mining][INFO] - working dir: .../global_mining/outputs/2021-11-02/08-56-40
@@ -36,6 +50,13 @@ Here are the logs:
 [train_faiss_index][INFO] - lang=hi, sents=162844151, required=40000000, index type=OPQ64,IVF65536,PQ64
 ```

-We can see that the launcher has found out that it doesn't need to run the encode and train index steps for the `bn` lang (src language) and can skip straight to populating the index with embeddings, but it also already processed 44 shards for that step, so will only re-schedule jobs for 11 shards. In parallel, it is also processing the tgt language (`hi`) and found that it still needs to run the index training step as it also recoverred all the encoded shards.
+We can see that the launcher has found out that it doesn't need to run the
+encode and train index steps for the `bn` lang (src language) and can skip
+straight to populating the index with embeddings, but it also already processed
+44 shards for that step, so will only re-schedule jobs for 11 shards. In
+parallel, it is also processing the tgt language (`hi`) and found that it still
+needs to run the index training step as it also recovered all the encoded
+shards.

-All this was done automatically. The person launching the pipeline doesn't have to micromanage what has already succeeded and what needs to be started when.
+All this was done automatically. The person launching the pipeline doesn't have
+to micromanage what has already succeeded and what needs to be started when.
diff --git a/website/docs/stopes/configuration.md b/website/docs/stopes/configuration.md index 27897da..e627365 100644 --- a/website/docs/stopes/configuration.md +++ b/website/docs/stopes/configuration.md @@ -4,9 +4,15 @@ sidebar_position: 4 # Configuration -We use hydra for configuration. You should probably check out the hydra tutorial: [https://hydra.cc/docs/tutorials/intro](https://hydra.cc/docs/tutorials/intro) but it's not a requirement. +We use hydra for configuration. You should probably check out the hydra +tutorial: +[https://hydra.cc/docs/tutorials/intro](https://hydra.cc/docs/tutorials/intro) +but it's not a requirement. + +Modules `__init__` HAVE to take either a structured configuration as parameter +or an `omegaconf.DictConfig`. A structured configuration is a [python +dataclass](https://docs.python.org/3/library/dataclasses.html), e.g. -Modules `__init__` HAVE to take either a structured configuration as parameter or an `omegaconf.DictConfig`. A structured configuration is a [python dataclass](https://docs.python.org/3/library/dataclasses.html), e.g. ```python from dataclasses import dataclass @@ -18,40 +24,63 @@ class MyModuleConfig: spm_model: str = "/path/to/my/model.spm" ``` -Structured configs make it easier to track what is expected as a config for a module and makes it self documenting. But you can also just use a DictConfig if you prefer. -If you implement the init method of the module, make sure to call `super().__init__(config) `so that the module system knows about your module setup. You can then access `self.config `anywhere in your module after initialization +Structured configs make it easier to track what is expected as a config for a +module and makes it self documenting. But you can also just use a DictConfig if you prefer. + + +If you implement the init method of the module, make sure to call +`super().__init__(config) `so that the module system knows about your module +setup. 
You can then access `self.config `anywhere in your module after +initialization + +Actual configs live in YAML files in the config/module/ folder and should look +like this: -Actual configs live in YAML files in the config/module/ folder and should look like this: ```yaml # @package module _target_: stopes.modules.MyModule config: - lang: null - spm_model: /path/to/my/model.spm + lang: null + spm_model: /path/to/my/model.spm ``` + The `_target_` field should point to the full python module path of your module `config` should contain the config of your module. -You should save this in a file with your model name. You could have multiple versions of your config, save them with the same `_target_` but different file names (e.g. `my_module_large_spm.yaml`, `my_module_small_spm.yaml`, etc.). +You should save this in a file with your model name. You could have multiple +versions of your config, save them with the same `_target_` but different file +names (e.g. `my_module_large_spm.yaml`, `my_module_small_spm.yaml`, etc.). + +The yaml config file should contain the baseline configuration for your module +and things that you do not expect to change often. In hydra terms, you are +adding a possible option for a config group (the module group: see `@package +module`) -The yaml config file should contain the baseline configuration for your module and things that you do not expect to change often. 
In hydra terms, you are adding a possible option for a config group (the module group: see `@package module`) +You can use hydra/[omegaconf +"resolvers"](https://omegaconf.readthedocs.io/en/2.1_branch/custom_resolvers.html#built-in-resolvers) +to depend on other bits of configs or environment variables: -You can use hydra/[omegaconf "resolvers"](https://omegaconf.readthedocs.io/en/2.1_branch/custom_resolvers.html#built-in-resolvers) to depend on other bits of configs or environment variables: ```yaml # @package module _target_: stopes.modules.MyModule config: - lang: null - laser_path: /laser/is/here - laser_model: ${module.my_module.laser_path}/model1.mdl - spm_model: ${oc.env:SPM_MODEL} + lang: null + laser_path: /laser/is/here + laser_model: ${module.my_module.laser_path}/model1.mdl + spm_model: ${oc.env:SPM_MODEL} ``` -Note: try not to rely too much on environment variables as we want these files to be the base for reproducibility and shareability of the module configurations you experiment with. Relying on special environment variables will make this hard. -You can use hydra config composition if you want your config to inherit or configure a subpart of your config, see https://hydra.cc/docs/patterns/extending_configs +Note: try not to rely too much on environment variables as we want these files +to be the base for reproducibility and shareability of the module configurations +you experiment with. Relying on special environment variables will make this +hard. 
+ +You can use hydra config composition if you want your config to inherit or +configure a subpart of your config, see +https://hydra.cc/docs/patterns/extending_configs diff --git a/website/docs/stopes/index.md b/website/docs/stopes/index.md index 4eecaa0..71f8715 100644 --- a/website/docs/stopes/index.md +++ b/website/docs/stopes/index.md @@ -4,34 +4,73 @@ sidebar_position: 1 # stopes Module Framework -The `stopes` library was built for easily managing complex pipelines without worrying about scaling and reliability code. +The `stopes` library was built for easily managing complex pipelines without +worrying about scaling and reliability code. ## Key features: -- **Reproducibility.** `stopes` is built with a research mindset first. The underlying Hydra framework gives you full control over the configuration of your pipelines. All the important parameters of your experiments can be defined and tracked. -- **Easier scaling.** The `stopes` framework provides clean separation between your pipeline step logic and the scaling code. If you use slurm, run locally or want to deploy on another cluster, your pipeline code and steps shouldn't change. -- **Caching/memoization.** With `stopes`, you can iterate faster and more reliably via transparent memoization. We've built the library so your code doesn't need to know what's happening with the cache -- **Composition.** The `stopes` API surface is minimum, so you can build a pipeline by simply writing idiomatic python (using asyncio) and have a quick understanding of what's going on without needing to understand complex job APIs. - -Checkout the [quickstart](quickstart) guide and the [pipelines](category/prebuilt-pipelines) we've provided as well as the docs in the sidebar. +- **Reproducibility.** `stopes` is built with a research mindset first. The +underlying Hydra framework gives you full control over the configuration of your +pipelines. All the important parameters of your experiments can be defined and +tracked. 
+- **Easier scaling.** The `stopes` framework provides clean separation between
+your pipeline step logic and the scaling code. If you use slurm, run locally or
+want to deploy on another cluster, your pipeline code and steps shouldn't
+change.
+- **Caching/memoization.** With `stopes`, you can iterate faster and more reliably
+via transparent memoization. We've built the library so your code doesn't need
+to know what's happening with the cache.
+- **Composition.** The `stopes` API surface is minimal, so you can build a
+pipeline by simply writing idiomatic python (using asyncio) and have a quick
+understanding of what's going on without needing to understand complex job APIs.
+
+Check out the [quickstart](quickstart) guide and the
+[pipelines](category/prebuilt-pipelines) we've provided as well as the docs in
+the sidebar.

 ## Concepts

-The idea of the `stopes` framework is to make it easy to build reproducible pipelines. This is done though "modules", a module is just a class with a `run` function that executes something. A module can then be scheduled with the `stopes` "launcher", this will decide where the code gets executed (locally or on a cluster) and then wait for the results to be ready.
-
-A **module** in `stopes` encapsulate a single step of a pipeline and its requirements. This step is supposed to be able to execute on its own given its input and generate an output. It will most often be executed as an isolated job, so shouldn't depend on anything else than its config (e.g. global variables, etc.). This ensures that each module can be run separately and in parallel if possible. A module also defines a clear API of the step via its configuration.
-
-A **pipeline** in `stopes` it not much more than a python function that connects a few modules together, but it could contain other python logic in the middle. 
While you can run a `stopes` module as a normal python callable, the power of `stopes` comes from the `launcher` that will manage the execution of the modules, find the correct machines with matching requirements (if executing on a cluster) and deal with memoization.
-
-A **launcher** is the orchestrator of your pipeline, but is exposed to you through a simple `async` API, so it looks like any [asyncio](https://docs.python.org/3/library/asyncio.html) function and you do not have to deal with where your code is being executed, if [memoization](stopes/cache) is happening, etc. If you have never dealt with `async` in python, I do recommend checking [this tutorial](https://realpython.com/async-io-python/), it looks scarier than it is.
+The idea of the `stopes` framework is to make it easy to build reproducible
+pipelines. This is done through "modules", a module is just a class with a `run`
+function that executes something. A module can then be scheduled with the `stopes`
+"launcher", this will decide where the code gets executed (locally or on a
+cluster) and then wait for the results to be ready.
+
+A **module** in `stopes` encapsulates a single step of a pipeline and its
+requirements. This step is supposed to be able to execute on its own given its
+input and generate an output. It will most often be executed as an isolated
+job, so shouldn't depend on anything else than its config (e.g. global
+variables, etc.). This ensures that each module can be run separately and in
+parallel if possible.
+A module also defines a clear API of the step via its configuration.
+
+A **pipeline** in `stopes` is not much more than a python function that connects a
+few modules together, but it could contain other python logic in the middle.
+While you can run a `stopes` module as a normal python callable, the power of +`stopes` comes from the `launcher` that will manage the execution of the modules, +find the correct machines with matching requirements (if executing on a cluster) +and deal with memoization. + +A **launcher** is the orchestrator of your pipeline, but is exposed to you +through a simple `async` API, so it looks like any +[asyncio](https://docs.python.org/3/library/asyncio.html) function and you do not have +to deal with where your code is being executed, if [memoization](stopes/cache) +is happening, etc. If you have never dealt with `async` in python, I do +recommend checking [this tutorial](https://realpython.com/async-io-python/), it +looks scarier than it is. ## Example -Here is an example of a basic pipeline that will take some file inputs, train a [FAISS](https://faiss.ai/) index on it and then populate the index with the files. +Here is an example of a basic pipeline that will take some file inputs, train a +[FAISS](https://faiss.ai/) index on it and then populate the index with the +files. This example shows the usage of the launcher and how we reuse existing modules. -Here we assume that the files have already been encoded with something that LASER to keep the example simple, but you want to have a first step doing the encoding (see the [global mining pipeline](pipelines/global_mining) for a real example). +Here we assume +that the files have already been encoded with something that LASER to keep the +example simple, but you want to have a first step doing +the encoding (see the [global mining pipeline](pipelines/global_mining) for a real example). ```python title="mypipeline.py" import asyncio @@ -70,25 +109,39 @@ def main(config: DictConfig) -> None: Let's start with the `main`, this is a very basic boilerplate that: -1. sets up [hydra](https://www.hydra.cc) to get configuration when running the script. 
We recommend checking the [hydra tutorial](https://hydra.cc/docs/tutorials/intro/) on their site to understand how to build configurations and organize them. See below also for an example config. +1. sets up [hydra](https://www.hydra.cc) to get configuration when running the + script. We recommend checking the [hydra tutorial](https://hydra.cc/docs/tutorials/intro/) on their site to understand + how to build configurations and organize them. See below also for an example + config. 2. starts `asyncio` and runs our async `pipeline` function. -The `pipeline` function is `async` as it will run some asynchronous code inside it, so we need to tell python that this will be the case. The first thing it does, is to initialize the `launcher` from the config, this is a trick to be able to swap launchers on the CLI using config overrides. After that, we setup the `TrainFAISSIndexModule` and `schedule` it with the launcher. This will check if this step was already executed in the past, and if not, will schedule the module on the cluster (or just locally if you want). +The `pipeline` function is `async` as it will run some asynchronous code inside +it, so we need to tell python that this will be the case. The first thing it +does, is to initialize the `launcher` from the config, this is a trick to be +able to swap launchers on the CLI using config overrides. After that, we setup +the `TrainFAISSIndexModule` and `schedule` it with the launcher. This will check +if this step was already executed in the past, and if not, will schedule the +module on the cluster (or just locally if you want). -The `await` keyword tells python to "wait" for the job to finish and once that is done, move to the next step. As we need to pass the generated `index` to the populate step, we take the config read from hydra, and fill up the `index` with the output of the training. We schedule and await that step, and finally just log the location of the output file. 
+The `await` keyword tells python to "wait" for the job to finish and once that +is done, move to the next step. As we need to pass the generated `index` to the +populate step, we take the config read from hydra, and fill up the `index` with +the output of the training. We schedule and await that step, and finally just +log the location of the output file. Let's look at the config: ```yaml title="conf/config" + embedding_files: ??? embedding_dimensions: 1024 index_type: ??? launcher: - _target_: stopes.core.Launcher - log_folder: executor_logs - cluster: local - partition: + _target_: stopes.core.Launcher + log_folder: executor_logs + cluster: local + partition: train_index: lang: demo @@ -103,6 +156,7 @@ train_index: embedding_dimensions: ${embedding_dimensions} fp16: True + populate_index: lang: demo index: ??? @@ -113,19 +167,22 @@ populate_index: embedding_dimensions: ${embedding_dimensions} ``` -Hydra will take a yaml file and structure it for our usage in python. You can see that we define at the top level: - +Hydra will take a yaml file and structure it for our usage in python. You can +see that we define at the top level: ``` embedding_files: ??? index_type: ??? ``` +This tells hydra that these two entries are empty and required, so it will +enforce that we specify them on the CLI. We pass them down to the sub-configs +for train/populate by using the `${}` placeholders. -This tells hydra that these two entries are empty and required, so it will enforce that we specify them on the CLI. We pass them down to the sub-configs for train/populate by using the `${}` placeholders. - -The `launcher` entry is setup to initialize the [submitit](https://github.com/facebookincubator/submitit) that currently provides the main job management system. If you wanted to use a different job/cluster system, you could implement your own launcher. 
+The `launcher` entry is setup to initialize the +[submitit](https://github.com/facebookincubator/submitit) that currently +provides the main job management system. If you wanted to use a different +job/cluster system, you could implement your own launcher. We can now call our script with: - ```bash python mypipeline.py embedding_files='[pathtomyfile.bin]' index_type="OPQ64,IVF1024,PQ64" ``` @@ -138,6 +195,8 @@ python mypipeline.py embedding_files='[pathtomyfile.bin]' index_type="OPQ64,IVF1 :::note -We use [hydra](https://www.hydra.cc) as the configuration system, but note that most modules take a dataclass as config, so you could build that manually from a different system (like argparse) if you did not want to use hydra. +We use [hydra](https://www.hydra.cc) as the configuration system, but note that most modules +take a dataclass as config, so you could build that manually from a different +system (like argparse) if you did not want to use hydra. ::: diff --git a/website/docs/stopes/module.md b/website/docs/stopes/module.md index 874a128..3c8b18d 100644 --- a/website/docs/stopes/module.md +++ b/website/docs/stopes/module.md @@ -6,6 +6,7 @@ sidebar_position: 2 A module is a python class that extends `StopesModule`: + ```python from stopes.code.stopes_module import StopesModule @@ -21,32 +22,71 @@ class MyModule(StopesModule): ... ``` -You should implement at least the `run `method, this is what will get executed when your module is launched. By default, you don't need to worry about the iteration parameters, see below for details of what these do. -If you want to initialize things before the module is run, you can use `__init__.` +You should implement at least the `run `method, this is what will get executed +when your module is launched. By default, you don't need to worry about the +iteration parameters, see below for details of what these do. 
+ +If you want to initialize things before the module is run, you can use +`__init__.` + +You can also implement the following methods to give more information about your +module: + + -You can also implement the following methods to give more information about your module: +* `requirements` - if you have specific requirements (gpus, memory, …) for your + module, return a Requirements specification from this method. This will be + called after `__init__` but before `run`. +* `name/comment` - some launchers (see below) might use this to identify/log + your module runs. Feel free to implement them if you want, but you don't have + to and they might not always be used. -- `requirements` - if you have specific requirements (gpus, memory, …) for your module, return a Requirements specification from this method. This will be called after `__init__` but before `run`. -- `name/comment` - some launchers (see below) might use this to identify/log your module runs. Feel free to implement them if you want, but you don't have to and they might not always be used. ## Arrays -We've observed that in many cases, pipeline steps are repeated on a number of shards of data. This is common with large datasets and allows to chunk the data processing on different machines for faster processing. +We've observed that in many cases, pipeline steps are repeated on a number of +shards of data. This is common with large datasets and allows to chunk the data +processing on different machines for faster processing. -In this execution case, the goal is to execute the same code with the same requirements on a number of shards, in order to avoid implementing this logic for every module that needs to work on shards in the pipeline driving the module. The StopesModule system can take care of this for you. 
+In this execution case, the goal is to execute the same code with the same +requirements on a number of shards, in order to avoid implementing this logic +for every module that needs to work on shards in the pipeline driving the +module. The StopesModule system can take care of this for you. -If your module implements the `array` method and returns an array of N values to process, the module will be executed N times separately and the `run` method will be called multiple times, independently. Every time the `run `method is called for a module with an array, it will be passed two extra parameters: +If your module implements the `array` method and returns an array of N values to +process, the module will be executed N times separately and the `run` method +will be called multiple times, independently. Every time the `run `method is +called for a module with an array, it will be passed two extra parameters: -- `iteration_value` that will contain a single value from the array -- `iteration_index` that corresponds the the index of that value in the array -The array method will be called after the module is initialized and in the same process as the initialization. You can therefore compute the array based on the config of the module or anything you compute in the `__init__` method. + +* `iteration_value` that will contain a single value from the array +* `iteration_index` that corresponds the the index of that value in the array + +The array method will be called after the module is initialized and in the same +process as the initialization. You can therefore compute the array based on the +config of the module or anything you compute in the `__init__` method. + ## Gotchas -- In most cases, the `run` method will be executed in a distributed fashion. That means that: - - `run` and `__init__ `might not be called with the same machine/process. E.g. 
when launching modules, `__init__` will be called where your pipeline driving script is executed, but `run` will be called in a separate process/job. - - When using `array`, each separate call to `run` will potentially be called on a separate machine/process and on a separate copy of your module. That means that you can share value from `__init__` down to `run`, but you cannot share anything in your object between calls of `run, `you should not modify self inside of` run`. - - When using `array`, there is no guarantee that `run` will be called in the same order as the values in your array. Only rely on the index passed to you and not on an execution order. - - Your `run` method will probably apply side effects (e.g. write files). If this is the case, make sure to return the file path/handle from the run method so we can keep track of these. + + +* In most cases, the `run` method will be executed in a distributed fashion. + That means that: + * `run` and `__init__ `might not be called with the same machine/process. + E.g. when launching modules, `__init__` will be called where your pipeline + driving script is executed, but `run` will be called in a separate + process/job. + * When using `array`, each separate call to `run` will potentially be called + on a separate machine/process and on a separate copy of your module. That + means that you can share value from `__init__` down to `run`, but you + cannot share anything in your object between calls of `run, `you should + not modify self inside of` run`. + * When using `array`, there is no guarantee that `run` will be called in the + same order as the values in your array. Only rely on the index passed to + you and not on an execution order. + * Your `run` method will probably apply side effects (e.g. write files). If + this is the case, make sure to return the file path/handle from the run + method so we can keep track of these. 
diff --git a/website/docs/stopes/pipelining.md b/website/docs/stopes/pipelining.md index 6788d27..89b020c 100644 --- a/website/docs/stopes/pipelining.md +++ b/website/docs/stopes/pipelining.md @@ -4,31 +4,53 @@ sidebar_position: 3 # Composition (aka pipelining) -The StopesModule framework provides a "launcher" abstraction that takes care of scheduling your module "somewhere". Currently, and in most Stopes use cases, this somewhere is SLURM, but you can also choose to launch it locally and more launcher implementations might come when other execution environments are needed. +The StopesModule framework provides a "launcher" abstraction that takes care of +scheduling your module "somewhere". Currently, and in most Stopes use cases, this +somewhere is SLURM, but you can also choose to launch it locally and more +launcher implementations might come when other execution environments are needed. -The global_mining pipeline is a good example of how all of this works together and you should check it out when reading this doc to have a good idea of how things fit together. +The global_mining +pipeline is a good example of how all of this works together and you should +check it out when reading this doc to have a good idea of how things fit +together. + +You can initialize a launcher from code with its python init, but ideally, your +pipeline will initialize it from a config with hydra: -You can initialize a launcher from code with its python init, but ideally, your pipeline will initialize it from a config with hydra: ```python self.launcher = hydra.utils.instantiate(config.launcher) ``` -We provide pre-made configs for the main SLURM launcher and instantiating the launcher from config will allow you to override it from the CLI for debugging. + +We provide pre-made configs for the main SLURM launcher and instantiating the +launcher from config will allow you to override it from the CLI for debugging. 
Once you have a launcher, you can launch a module in code with: + ```python embedded_files = await self.launcher.schedule(embed_module) ``` -The launcher will take care of submitting a job to the execution engine (e.g. SLURM) and wait for it to be done. The launcher will also take care of raising any exception happening in the execution engine and if using the submitit launcher, it will also take care of checkpointing (see above). + +The launcher will take care of submitting a job to the execution engine (e.g. +SLURM) and wait for it to be done. The launcher will also take care of raising +any exception happening in the execution engine and if using the submitit +launcher, it will also take care of checkpointing (see above). ## Asyncio -Because`launcher.schedule`will potentially schedule your module run method on a separate host, wait for it to find a slot and to eventually finish. The result that this `schedule` method returns is not available immediately. We use python asyncio to deal with waiting for the results to be available. This means that you need to `await `the result of schedule before being able to use it. +Because` launcher.schedule `will potentially schedule your module run method on +a separate host, wait for it to find a slot and to eventually finish. The result +that this `schedule` method returns is not available immediately. We use python +asyncio to deal with waiting for the results to be available. This means that +you need to `await `the result of schedule before being able to use it. + +This also means that you can use asyncio helpers to organize your code and tell +the launcher when things can be scheduled in parallel. For instance you can +await for two results in "parallel" with: -This also means that you can use asyncio helpers to organize your code and tell the launcher when things can be scheduled in parallel. 
For instance you can await for two results in "parallel" with: ```python src_embeddings, tgt_embeddings = await asyncio.gather( diff --git a/website/sidebars.js b/website/sidebars.js index 899c71a..3b9a2bc 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -20,7 +20,7 @@ module.exports = { // By default, Docusaurus generates a sidebar from the docs folder structure - quickstartSidebar: [{type: 'autogenerated', dirName: '.'}], + quickstartSidebar: [{ type: 'autogenerated', dirName: '.' }], // But you can create a sidebar manually /* diff --git a/website/src/css/custom.css b/website/src/css/custom.css index b60c693..107f9f3 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -24,6 +24,7 @@ --ifm-color-primary-lightest: #2b6cf8; } + /* For readability concerns, you should choose a lighter palette in dark mode. */ [data-theme='dark'] { --ifm-color-primary: #4d82ff; @@ -101,7 +102,7 @@ } .sbanner .bottom .button-container { - margin-left: 255px; + margin-left: 255px } .sbanner .bottom .button-container .button { @@ -146,7 +147,9 @@ color: #ce6849; } + @media only screen and (max-width: 700px) { + .sbanner .container { margin-left: 5px; } diff --git a/website/src/pages/index.js b/website/src/pages/index.js index 7d31a06..1a01d53 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -17,7 +17,7 @@ import React from 'react'; import styles from './styles.module.css'; function Stopes() { - return stopes; + return stopes } const features = [ @@ -25,14 +25,13 @@ const features = [ title: 'Easy to Use', description: ( <> - was designed to provide a modular API to build and reproduce - pipelines core to large translation work. In particular data mining and - evaluation. Where you run your pipeline and how you scale it is - independent of its core logic. Everything is config-driven so you can - easily reproduce and track results. 
+ was designed to provide a modular API to build and reproduce pipelines core to large translation work. + In particular data mining and evaluation. + Where you run your pipeline and how you scale it is independent of its core logic. + Everything is config-driven so you can easily reproduce and track results. ), - buttonTxt: 'Quickstart', + buttonTxt: "Quickstart", buttonUrl: 'docs/quickstart', imageUrl: 'img/shovel.svg', }, @@ -40,13 +39,12 @@ const features = [ title: 'Batteries Included', description: ( <> - lets you focus on your core data and evaluation needs by - providing common modules used for this task and letting you write your - pipelines with idiomatic python. Common optimizations have also been - built-in to help you scale your work. + lets you focus on your core data and evaluation needs by providing common modules + used for this task and letting you write your pipelines with idiomatic python. + Common optimizations have also been built-in to help you scale your work. ), - buttonTxt: 'Learn More', + buttonTxt: "Learn More", buttonUrl: 'docs/stopes', imageUrl: 'img/modules.svg', }, @@ -54,21 +52,21 @@ const features = [ title: 'State-of-the-art Pipelines', description: ( <> - was developed as part of the Meta AI No Language Left Behind - research project. It comes with state-of-the-art pipelines out of the - box. You can run our global mining and distillation pipelines and - reproduce our research with just a few command lines. + was developed as part of the Meta AI No Language Left Behind research project. + It comes with state-of-the-art pipelines out of the box. You can run our global mining and distillation + pipelines and reproduce our research with just a few command lines. ), - buttonTxt: 'E.g. Start Data Mining', + buttonTxt: "E.g. 
Start Data Mining", buttonUrl: 'docs/pipelines/global_mining', imageUrl: 'img/pipelines.svg', }, ]; + const sections = [ { - title: 'No-coding Mining', + title: "No-coding Mining", language: 'bash', code: `python -m stopes.pipelines.bitext.global_mining_pipeline \\ src_lang=fuv \\ @@ -78,15 +76,12 @@ const sections = [ output_dir=. \\ embed_text=laser3`, content: ( -

- comes with the Global Mining Pipeline that was used by the - NLLB team. You can use it out of the box without extra coding. You will - need to setup an environment and create a config file to point to your - data, but you can start mining (locally or on a slurm cluster) without - any coding. Check out the{' '} - Quickstart guide. -

- ), +

comes with the Global Mining Pipeline that was used by the NLLB team. + You can use it out of the box without extra coding. You will need to setup an + environment and create a config file to point to your data, + but you can start mining (locally or on a slurm cluster) without any coding. + Check out the Quickstart guide.

+ ) }, { title: 'Reproducible research', @@ -102,11 +97,10 @@ config: num_threads : 4`, content: (

- is based on Hydra, giving - you full control over the behavior of your pipeline. Experiments are - easily reproducible along with your results. -

- ), + is based on Hydra, + giving you full control over the behavior of your pipeline. + Experiments are easily reproducible along with your results.

+ ) }, { title: 'Modular pipeline definition', @@ -147,69 +141,72 @@ config: content: ( <>

- pipelines are composed of modules. No more duplicated, - out-of sync code: your most common preprocessing steps can be shared + pipelines are composed of modules. + No more duplicated, out-of sync code: your most common preprocessing steps can be shared among all your pipelines.

You will find in this repository some implementations of a number of - modules that are useful for translation data mining and evaluation, - Neural Machine Translation data pre-processing and model training. For - example, we provide modules to build{' '} - faiss indexes, encode text with{' '} - LASER and{' '} - - HuggingFace Transformers - - , mine bitext or train and evaluate{' '} - FAIRSEQ{' '} - models. -

- - ), + modules that are useful for translation data mining and evaluation, Neural Machine Translation data pre-processing + and model training. For example, we provide modules to build faiss indexes, encode + text with LASER and HuggingFace Transformers, + mine bitext or train and evaluate FAIRSEQ models. +

+ ) }, -]; +] -function Card({title, description, buttonTxt, buttonUrl, imageUrl}) { +function Card({ title, description, buttonTxt, buttonUrl, imageUrl }) { const imgUrl = useBaseUrl(imageUrl); const burl = useBaseUrl(buttonUrl); return (
-
+
{imgUrl && ( -
- {title} -
- )} -
+
+ {title} +
)} +

{title}

-

{description}

+

+ {description} +

{buttonTxt && buttonUrl && ( -
+
+ className={clsx("button button--primary button--block")} + to={burl} + > {buttonTxt}
)}
-
- ); +
) } -function ContentWithCode({title, children, flip, language}) { +function ContentWithCode({ title, children, flip, language }) { + const [content, code] = React.Children.toArray(children); - const textBlock =
{content}
; + const textBlock = ( +
+ {content} +
+ ) const codeBlock = (
- {code} + + {code} +
- ); + ) let left = textBlock; let right = codeBlock; @@ -227,12 +224,12 @@ function ContentWithCode({title, children, flip, language}) {
{left} {right} -
-
- ); +
) } function Banner() { + + const nllb = useBaseUrl('img/banner_bits/nllb.png'); const driving = useBaseUrl('img/banner_bits/driving.png'); const stopes = useBaseUrl('img/banner_bits/stopes.png'); @@ -253,41 +250,36 @@ function Banner() {
- NO LANGUAGES LEFT BEHIND -
-
- Driving inclusion through machine translation -
-

- logo - stopes -

-
+ NO LANGUAGES LEFT BEHIND
+
+ Driving inclusion through machine translation
+

logostopes

+

Large-Scale Translation Tooling

Mining Quickstart
-
+
meta
- ); + ) } + export default function Home() { const context = useDocusaurusContext(); - const {siteConfig = {}} = context; + const { siteConfig = {} } = context; return (
- {features.map( - ({title, imageUrl, description, buttonTxt, buttonUrl}) => ( - - ), - )} + {features.map(({ title, imageUrl, description, buttonTxt, buttonUrl }) => ( + + ))}
)}
- {sections.map(({title, language, code, content}, index) => ( + {sections.map(({ title, language, code, content }, index) => ( Date: Wed, 30 Aug 2023 21:05:28 +0300 Subject: [PATCH 6/6] Feat: Explore folders on Front-end --- .../ui/seamlisten/backend/app/fileviewer.py | 33 +- stopes/ui/seamlisten/backend/dev.env | 2 +- .../ui/seamlisten/react_app/package-lock.json | 846 +++++++++++++++++- stopes/ui/seamlisten/react_app/package.json | 5 + .../react_app/src/common/constants/config.js | 3 +- .../react_app/src/common/fetchers/folder.ts | 27 + .../react_app/src/common/types/api.ts | 12 +- .../react_app/src/components/FileExplorer.tsx | 52 +- .../src/components/fileviewer/FileTree.tsx | 39 + .../src/components/fileviewer/table/Row.tsx | 10 + 10 files changed, 959 insertions(+), 70 deletions(-) create mode 100644 stopes/ui/seamlisten/react_app/src/common/fetchers/folder.ts create mode 100644 stopes/ui/seamlisten/react_app/src/components/fileviewer/FileTree.tsx diff --git a/stopes/ui/seamlisten/backend/app/fileviewer.py b/stopes/ui/seamlisten/backend/app/fileviewer.py index 8a69eb8..3600929 100644 --- a/stopes/ui/seamlisten/backend/app/fileviewer.py +++ b/stopes/ui/seamlisten/backend/app/fileviewer.py @@ -11,7 +11,7 @@ from pathlib import Path import torchaudio -from fastapi import APIRouter +from fastapi import APIRouter, Body from fastapi.exceptions import HTTPException from fastapi.responses import JSONResponse, Response @@ -21,7 +21,7 @@ from .query_types import AnnotationQuery, AudioQuery, DefaultQuery, LineQuery -torchaudio.set_audio_backend("sox_io") +torchaudio.set_audio_backend("soundfile") router = APIRouter(tags=["fileviewer"]) @@ -161,3 +161,32 @@ async def general_query(query: DefaultQuery) -> Response: a line with audio file, start and end timestamps """, ) + + +@router.post("/fetchFolders/") +def gather_folder_contents(folder_path: str = Body(...), max_depth: int = Body(5)): + def gather_contents_recursive(folder_path, current_depth): + if current_depth > 
max_depth: + return { + "folder": str(folder_path), + "subfolders": None, + "audio_files": None, + "unexplored": True, + } + subfolders = [] + audio_files = [] + + for entry in folder_path.iterdir(): + if entry.is_dir(): + subfolders.append(gather_contents_recursive(entry, current_depth + 1)) + elif entry.suffix in {".wav", ".ms"}: + audio_files.append(entry.name) + + return { + "folder": str(folder_path), + "subfolders": subfolders if subfolders else None, + "audio_files": audio_files if audio_files else None, + "unexplored": False, + } + + return gather_contents_recursive(Path(folder_path), 1) diff --git a/stopes/ui/seamlisten/backend/dev.env b/stopes/ui/seamlisten/backend/dev.env index 9f20a2a..f6f353b 100644 --- a/stopes/ui/seamlisten/backend/dev.env +++ b/stopes/ui/seamlisten/backend/dev.env @@ -1,2 +1,2 @@ -PORT=8000 +PORT=8080 DEV_MODE=True diff --git a/stopes/ui/seamlisten/react_app/package-lock.json b/stopes/ui/seamlisten/react_app/package-lock.json index 2549572..ecd65d9 100644 --- a/stopes/ui/seamlisten/react_app/package-lock.json +++ b/stopes/ui/seamlisten/react_app/package-lock.json @@ -8,7 +8,12 @@ "name": "react_app", "version": "0.1.0", "dependencies": { + "@emotion/react": "^11.11.1", + "@emotion/styled": "^11.11.0", "@fortawesome/fontawesome-svg-core": "^6.2.1", + "@mui/icons-material": "^5.14.3", + "@mui/lab": "^5.0.0-alpha.140", + "@mui/material": "^5.14.5", "@testing-library/jest-dom": "^5.16.5", "@testing-library/react": "^13.4.0", "@testing-library/user-event": "^13.5.0", @@ -1807,11 +1812,11 @@ } }, "node_modules/@babel/runtime": { - "version": "7.19.4", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.19.4.tgz", - "integrity": "sha512-EXpLCrk55f+cYqmHsSR+yD/0gAIMxxA9QK9lnQWzhMCvt+YmoBN7Zx94s++Kv0+unHk39vxNO8t+CMA2WSS3wA==", + "version": "7.22.10", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.22.10.tgz", + "integrity": 
"sha512-21t/fkKLMZI4pqP2wlmsQAWnYW1PDyKyyUV4vCi+B25ydmdaYTKXPwCj0BzSUnZf4seIiYvSA3jcZ3gdsMFkLQ==", "dependencies": { - "regenerator-runtime": "^0.13.4" + "regenerator-runtime": "^0.14.0" }, "engines": { "node": ">=6.9.0" @@ -1829,6 +1834,11 @@ "node": ">=6.9.0" } }, + "node_modules/@babel/runtime/node_modules/regenerator-runtime": { + "version": "0.14.0", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.0.tgz", + "integrity": "sha512-srw17NI0TUWHuGa5CFGGmhfNIeja30WMBfbslPNhf6JrqQlLN5gcrvig1oqPxiVaXb0oW0XRKtH6Nngs5lKCIA==" + }, "node_modules/@babel/template": { "version": "7.18.10", "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.18.10.tgz", @@ -2151,18 +2161,139 @@ "postcss-selector-parser": "^6.0.10" } }, + "node_modules/@emotion/babel-plugin": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/babel-plugin/-/babel-plugin-11.11.0.tgz", + "integrity": "sha512-m4HEDZleaaCH+XgDDsPF15Ht6wTLsgDTeR3WYj9Q/k76JtWhrJjcP4+/XlG8LGT/Rol9qUfOIztXeA84ATpqPQ==", + "dependencies": { + "@babel/helper-module-imports": "^7.16.7", + "@babel/runtime": "^7.18.3", + "@emotion/hash": "^0.9.1", + "@emotion/memoize": "^0.8.1", + "@emotion/serialize": "^1.1.2", + "babel-plugin-macros": "^3.1.0", + "convert-source-map": "^1.5.0", + "escape-string-regexp": "^4.0.0", + "find-root": "^1.1.0", + "source-map": "^0.5.7", + "stylis": "4.2.0" + } + }, + "node_modules/@emotion/babel-plugin/node_modules/escape-string-regexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/@emotion/babel-plugin/node_modules/source-map": { + "version": "0.5.7", + "resolved": 
"https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha512-LbrmJOMUSdEVxIKvdcJzQC+nQhe8FUZQTXQy6+I75skNgn3OoQ0DZA8YnFa7gp8tqtL3KPf1kmo0R5DoApeSGQ==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/@emotion/cache": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/cache/-/cache-11.11.0.tgz", + "integrity": "sha512-P34z9ssTCBi3e9EI1ZsWpNHcfY1r09ZO0rZbRO2ob3ZQMnFI35jB536qoXbkdesr5EUhYi22anuEJuyxifaqAQ==", + "dependencies": { + "@emotion/memoize": "^0.8.1", + "@emotion/sheet": "^1.2.2", + "@emotion/utils": "^1.2.1", + "@emotion/weak-memoize": "^0.3.1", + "stylis": "4.2.0" + } + }, + "node_modules/@emotion/hash": { + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/@emotion/hash/-/hash-0.9.1.tgz", + "integrity": "sha512-gJB6HLm5rYwSLI6PQa+X1t5CFGrv1J1TWG+sOyMCeKz2ojaj6Fnl/rZEspogG+cvqbt4AE/2eIyD2QfLKTBNlQ==" + }, "node_modules/@emotion/is-prop-valid": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-1.2.0.tgz", - "integrity": "sha512-3aDpDprjM0AwaxGE09bOPkNxHpBd+kA6jty3RnaEXdweX1DF1U3VQpPYb0g1IStAuK7SVQ1cy+bNBBKp4W3Fjg==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-1.2.1.tgz", + "integrity": "sha512-61Mf7Ufx4aDxx1xlDeOm8aFFigGHE4z+0sKCa+IHCeZKiyP9RLD0Mmx7m8b9/Cf37f7NAvQOOJAbQQGVr5uERw==", "dependencies": { - "@emotion/memoize": "^0.8.0" + "@emotion/memoize": "^0.8.1" } }, "node_modules/@emotion/memoize": { - "version": "0.8.0", - "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.0.tgz", - "integrity": "sha512-G/YwXTkv7Den9mXDO7AhLWkE3q+I92B+VqAE+dYG4NGPaHZGvt3G8Q0p9vmE+sq7rTGphUbAvmQ9YpbfMQGGlA==" + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" + }, + "node_modules/@emotion/react": { + 
"version": "11.11.1", + "resolved": "https://registry.npmjs.org/@emotion/react/-/react-11.11.1.tgz", + "integrity": "sha512-5mlW1DquU5HaxjLkfkGN1GA/fvVGdyHURRiX/0FHl2cfIfRxSOfmxEH5YS43edp0OldZrZ+dkBKbngxcNCdZvA==", + "dependencies": { + "@babel/runtime": "^7.18.3", + "@emotion/babel-plugin": "^11.11.0", + "@emotion/cache": "^11.11.0", + "@emotion/serialize": "^1.1.2", + "@emotion/use-insertion-effect-with-fallbacks": "^1.0.1", + "@emotion/utils": "^1.2.1", + "@emotion/weak-memoize": "^0.3.1", + "hoist-non-react-statics": "^3.3.1" + }, + "peerDependencies": { + "react": ">=16.8.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + "node_modules/@emotion/serialize": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@emotion/serialize/-/serialize-1.1.2.tgz", + "integrity": "sha512-zR6a/fkFP4EAcCMQtLOhIgpprZOwNmCldtpaISpvz348+DP4Mz8ZoKaGGCQpbzepNIUWbq4w6hNZkwDyKoS+HA==", + "dependencies": { + "@emotion/hash": "^0.9.1", + "@emotion/memoize": "^0.8.1", + "@emotion/unitless": "^0.8.1", + "@emotion/utils": "^1.2.1", + "csstype": "^3.0.2" + } + }, + "node_modules/@emotion/serialize/node_modules/@emotion/unitless": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.8.1.tgz", + "integrity": "sha512-KOEGMu6dmJZtpadb476IsZBclKvILjopjUii3V+7MnXIQCYh8W3NgNcgwo21n9LXZX6EDIKvqfjYxXebDwxKmQ==" + }, + "node_modules/@emotion/sheet": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/@emotion/sheet/-/sheet-1.2.2.tgz", + "integrity": "sha512-0QBtGvaqtWi+nx6doRwDdBIzhNdZrXUppvTM4dtZZWEGTXL/XE/yJxLMGlDT1Gt+UHH5IX1n+jkXyytE/av7OA==" + }, + "node_modules/@emotion/styled": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/styled/-/styled-11.11.0.tgz", + "integrity": "sha512-hM5Nnvu9P3midq5aaXj4I+lnSfNi7Pmd4EWk1fOZ3pxookaQTNew6bp4JaCBYM4HVFZF9g7UjJmsUmC2JlxOng==", + "dependencies": { + "@babel/runtime": "^7.18.3", + "@emotion/babel-plugin": "^11.11.0", + 
"@emotion/is-prop-valid": "^1.2.1", + "@emotion/serialize": "^1.1.2", + "@emotion/use-insertion-effect-with-fallbacks": "^1.0.1", + "@emotion/utils": "^1.2.1" + }, + "peerDependencies": { + "@emotion/react": "^11.0.0-rc.0", + "react": ">=16.8.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } }, "node_modules/@emotion/stylis": { "version": "0.8.5", @@ -2174,6 +2305,24 @@ "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.7.5.tgz", "integrity": "sha512-OWORNpfjMsSSUBVrRBVGECkhWcULOAJz9ZW8uK9qgxD+87M7jHRcvh/A96XXNhXTLmKcoYSQtBEX7lHMO7YRwg==" }, + "node_modules/@emotion/use-insertion-effect-with-fallbacks": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@emotion/use-insertion-effect-with-fallbacks/-/use-insertion-effect-with-fallbacks-1.0.1.tgz", + "integrity": "sha512-jT/qyKZ9rzLErtrjGgdkMBn2OP8wl0G3sQlBb3YPryvKHsjvINUhVaPFfP+fpBcOkmrVOVEEHQFJ7nbj2TH2gw==", + "peerDependencies": { + "react": ">=16.8.0" + } + }, + "node_modules/@emotion/utils": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@emotion/utils/-/utils-1.2.1.tgz", + "integrity": "sha512-Y2tGf3I+XVnajdItskUCn6LX+VUDmP6lTL4fcqsXAv43dnlbZiuW4MWQW38rW/BVWSE7Q/7+XQocmpnRYILUmg==" + }, + "node_modules/@emotion/weak-memoize": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/@emotion/weak-memoize/-/weak-memoize-0.3.1.tgz", + "integrity": "sha512-EsBwpc7hBUJWAsNPBmJy4hxWx12v6bshQsldrVmjxJoc3isbxhOrF2IcCpaXxfvq03NwkI7sbsOLXbYuqF/8Ww==" + }, "node_modules/@eslint/eslintrc": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-1.3.3.tgz", @@ -3039,6 +3188,308 @@ "resolved": "https://registry.npmjs.org/@leichtgewicht/ip-codec/-/ip-codec-2.0.4.tgz", "integrity": "sha512-Hcv+nVC0kZnQ3tD9GVu5xSMR4VVYOteQIr/hwFPVEvPdlXqgGEuRjiheChHgdM+JyqdgNcmzZOX/tnl0JOiI7A==" }, + "node_modules/@mui/base": { + "version": "5.0.0-beta.11", + "resolved": 
"https://registry.npmjs.org/@mui/base/-/base-5.0.0-beta.11.tgz", + "integrity": "sha512-FdKZGPd8qmC3ZNke7CNhzcEgToc02M6WYZc9hcBsNQ17bgAd3s9F//1bDDYgMVBYxDM71V0sv/hBHlOY4I1ZVA==", + "dependencies": { + "@babel/runtime": "^7.22.6", + "@emotion/is-prop-valid": "^1.2.1", + "@mui/types": "^7.2.4", + "@mui/utils": "^5.14.5", + "@popperjs/core": "^2.11.8", + "clsx": "^2.0.0", + "prop-types": "^15.8.1", + "react-is": "^18.2.0" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mui" + }, + "peerDependencies": { + "@types/react": "^17.0.0 || ^18.0.0", + "react": "^17.0.0 || ^18.0.0", + "react-dom": "^17.0.0 || ^18.0.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + "node_modules/@mui/base/node_modules/react-is": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", + "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==" + }, + "node_modules/@mui/core-downloads-tracker": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/core-downloads-tracker/-/core-downloads-tracker-5.14.5.tgz", + "integrity": "sha512-+wpGH1USwPcKMFPMvXqYPC6fEvhxM3FzxC8lyDiNK/imLyyJ6y2DPb1Oue7OGIKJWBmYBqrWWtfovrxd1aJHTA==", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mui" + } + }, + "node_modules/@mui/icons-material": { + "version": "5.14.3", + "resolved": "https://registry.npmjs.org/@mui/icons-material/-/icons-material-5.14.3.tgz", + "integrity": "sha512-XkxWPhageu1OPUm2LWjo5XqeQ0t2xfGe8EiLkRW9oz2LHMMZmijvCxulhgquUVTF1DnoSh+3KoDLSsoAFtVNVw==", + "dependencies": { + "@babel/runtime": "^7.22.6" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mui" + }, + "peerDependencies": { + "@mui/material": "^5.0.0", + "@types/react": "^17.0.0 || ^18.0.0", + "react": "^17.0.0 
|| ^18.0.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + "node_modules/@mui/lab": { + "version": "5.0.0-alpha.140", + "resolved": "https://registry.npmjs.org/@mui/lab/-/lab-5.0.0-alpha.140.tgz", + "integrity": "sha512-k75jos6jklCD8tA20PAK2H4RSCKycTcR4Pbfz7JbdxIkWXJ+y2MRalwMcen1vpB99v0yZHNUo6BtGz6rvs2jlQ==", + "dependencies": { + "@babel/runtime": "^7.22.6", + "@mui/base": "5.0.0-beta.11", + "@mui/system": "^5.14.5", + "@mui/types": "^7.2.4", + "@mui/utils": "^5.14.5", + "clsx": "^2.0.0", + "prop-types": "^15.8.1", + "react-is": "^18.2.0" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mui" + }, + "peerDependencies": { + "@emotion/react": "^11.5.0", + "@emotion/styled": "^11.3.0", + "@mui/material": "^5.0.0", + "@types/react": "^17.0.0 || ^18.0.0", + "react": "^17.0.0 || ^18.0.0", + "react-dom": "^17.0.0 || ^18.0.0" + }, + "peerDependenciesMeta": { + "@emotion/react": { + "optional": true + }, + "@emotion/styled": { + "optional": true + }, + "@types/react": { + "optional": true + } + } + }, + "node_modules/@mui/lab/node_modules/react-is": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", + "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==" + }, + "node_modules/@mui/material": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/material/-/material-5.14.5.tgz", + "integrity": "sha512-4qa4GMfuZH0Ai3mttk5ccXP8a3sf7aPlAJwyMrUSz6h9hPri6BPou94zeu3rENhhmKLby9S/W1y+pmficy8JKA==", + "dependencies": { + "@babel/runtime": "^7.22.6", + "@mui/base": "5.0.0-beta.11", + "@mui/core-downloads-tracker": "^5.14.5", + "@mui/system": "^5.14.5", + "@mui/types": "^7.2.4", + "@mui/utils": "^5.14.5", + "@types/react-transition-group": "^4.4.6", + "clsx": "^2.0.0", + "csstype": "^3.1.2", + "prop-types": "^15.8.1", + "react-is": "^18.2.0", + 
"react-transition-group": "^4.4.5" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mui" + }, + "peerDependencies": { + "@emotion/react": "^11.5.0", + "@emotion/styled": "^11.3.0", + "@types/react": "^17.0.0 || ^18.0.0", + "react": "^17.0.0 || ^18.0.0", + "react-dom": "^17.0.0 || ^18.0.0" + }, + "peerDependenciesMeta": { + "@emotion/react": { + "optional": true + }, + "@emotion/styled": { + "optional": true + }, + "@types/react": { + "optional": true + } + } + }, + "node_modules/@mui/material/node_modules/react-is": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", + "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==" + }, + "node_modules/@mui/private-theming": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/private-theming/-/private-theming-5.14.5.tgz", + "integrity": "sha512-cC4C5RrpXpDaaZyH9QwmPhRLgz+f2SYbOty3cPkk4qPSOSfif2ZEcDD9HTENKDDd9deB+xkPKzzZhi8cxIx8Ig==", + "dependencies": { + "@babel/runtime": "^7.22.6", + "@mui/utils": "^5.14.5", + "prop-types": "^15.8.1" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mui" + }, + "peerDependencies": { + "@types/react": "^17.0.0 || ^18.0.0", + "react": "^17.0.0 || ^18.0.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + "node_modules/@mui/styled-engine": { + "version": "5.13.2", + "resolved": "https://registry.npmjs.org/@mui/styled-engine/-/styled-engine-5.13.2.tgz", + "integrity": "sha512-VCYCU6xVtXOrIN8lcbuPmoG+u7FYuOERG++fpY74hPpEWkyFQG97F+/XfTQVYzlR2m7nPjnwVUgATcTCMEaMvw==", + "dependencies": { + "@babel/runtime": "^7.21.0", + "@emotion/cache": "^11.11.0", + "csstype": "^3.1.2", + "prop-types": "^15.8.1" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "opencollective", + 
"url": "https://opencollective.com/mui" + }, + "peerDependencies": { + "@emotion/react": "^11.4.1", + "@emotion/styled": "^11.3.0", + "react": "^17.0.0 || ^18.0.0" + }, + "peerDependenciesMeta": { + "@emotion/react": { + "optional": true + }, + "@emotion/styled": { + "optional": true + } + } + }, + "node_modules/@mui/system": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/system/-/system-5.14.5.tgz", + "integrity": "sha512-mextXZHDeGcR7E1kx43TRARrVXy+gI4wzpUgNv7MqZs1dvTVXQGVeAT6ydj9d6FUqHBPMNLGV/21vJOrpqsL+w==", + "dependencies": { + "@babel/runtime": "^7.22.6", + "@mui/private-theming": "^5.14.5", + "@mui/styled-engine": "^5.13.2", + "@mui/types": "^7.2.4", + "@mui/utils": "^5.14.5", + "clsx": "^2.0.0", + "csstype": "^3.1.2", + "prop-types": "^15.8.1" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mui" + }, + "peerDependencies": { + "@emotion/react": "^11.5.0", + "@emotion/styled": "^11.3.0", + "@types/react": "^17.0.0 || ^18.0.0", + "react": "^17.0.0 || ^18.0.0" + }, + "peerDependenciesMeta": { + "@emotion/react": { + "optional": true + }, + "@emotion/styled": { + "optional": true + }, + "@types/react": { + "optional": true + } + } + }, + "node_modules/@mui/types": { + "version": "7.2.4", + "resolved": "https://registry.npmjs.org/@mui/types/-/types-7.2.4.tgz", + "integrity": "sha512-LBcwa8rN84bKF+f5sDyku42w1NTxaPgPyYKODsh01U1fVstTClbUoSA96oyRBnSNyEiAVjKm6Gwx9vjR+xyqHA==", + "peerDependencies": { + "@types/react": "*" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + "node_modules/@mui/utils": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/utils/-/utils-5.14.5.tgz", + "integrity": "sha512-6Hzw63VR9C5xYv+CbjndoRLU6Gntal8rJ5W+GUzkyHrGWIyYPWZPa6AevnyGioySNETATe1H9oXS8f/7qgIHJA==", + "dependencies": { + "@babel/runtime": "^7.22.6", + "@types/prop-types": "^15.7.5", + "@types/react-is": "^18.2.1", + 
"prop-types": "^15.8.1", + "react-is": "^18.2.0" + }, + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/mui" + }, + "peerDependencies": { + "react": "^17.0.0 || ^18.0.0" + } + }, + "node_modules/@mui/utils/node_modules/react-is": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", + "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==" + }, "node_modules/@nicolo-ribaudo/eslint-scope-5-internals": { "version": "5.1.1-v1", "resolved": "https://registry.npmjs.org/@nicolo-ribaudo/eslint-scope-5-internals/-/eslint-scope-5-internals-5.1.1-v1.tgz", @@ -3149,9 +3600,9 @@ } }, "node_modules/@popperjs/core": { - "version": "2.11.6", - "resolved": "https://registry.npmjs.org/@popperjs/core/-/core-2.11.6.tgz", - "integrity": "sha512-50/17A98tWUfQ176raKiOGXuYpLyyVMkxxG6oylzL3BPOlA6ADGdK7EYunSa4I064xerltq9TGXs8HmOk5E+vw==", + "version": "2.11.8", + "resolved": "https://registry.npmjs.org/@popperjs/core/-/core-2.11.8.tgz", + "integrity": "sha512-P1st0aksCrn9sGZhp8GMYwBnQsbvAWsZAX44oXNNvLHGqAOcoVxmjZiohstwQ7SqKnbR47akdNi+uleWD8+g6A==", "funding": { "type": "opencollective", "url": "https://opencollective.com/popperjs" @@ -4214,10 +4665,18 @@ "@types/react": "*" } }, + "node_modules/@types/react-is": { + "version": "18.2.1", + "resolved": "https://registry.npmjs.org/@types/react-is/-/react-is-18.2.1.tgz", + "integrity": "sha512-wyUkmaaSZEzFZivD8F2ftSyAfk6L+DfFliVj/mYdOXbVjRcS87fQJLTnhk6dRZPuJjI+9g6RZJO4PNCngUrmyw==", + "dependencies": { + "@types/react": "*" + } + }, "node_modules/@types/react-transition-group": { - "version": "4.4.5", - "resolved": "https://registry.npmjs.org/@types/react-transition-group/-/react-transition-group-4.4.5.tgz", - "integrity": "sha512-juKD/eiSM3/xZYzjuzH6ZwpP+/lejltmiS3QEzV/vmb/Q8+HfDmxu+Baga8UEMGBqV88Nbg4l2hY/K2DkyaLLA==", + "version": "4.4.6", + "resolved": 
"https://registry.npmjs.org/@types/react-transition-group/-/react-transition-group-4.4.6.tgz", + "integrity": "sha512-VnCdSxfcm08KjsJVQcfBmhEQAPnLB8G08hAxn39azX1qYBQ/5RVQuoHuKIcfKOdncuaUvEpFKFzEvbtIMsfVew==", "dependencies": { "@types/react": "*" } @@ -5899,6 +6358,14 @@ "wrap-ansi": "^7.0.0" } }, + "node_modules/clsx": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.0.0.tgz", + "integrity": "sha512-rQ1+kcj+ttHG0MKVGBUXwayCCF1oh39BF5COIpRzuCEv8Mwjv0XucrI2ExNTOn9IlLifGClWQcU9BrZORvtw6Q==", + "engines": { + "node": ">=6" + } + }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -6574,9 +7041,9 @@ "integrity": "sha512-b0tGHbfegbhPJpxpiBPU2sCkigAqtM9O121le6bbOlgyV+NyGyCmVfJ6QW9eRjz8CpNfWEOYBIMIGRYkLwsIYg==" }, "node_modules/csstype": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.1.tgz", - "integrity": "sha512-DJR/VvkAvSZW9bTouZue2sSxDwdTN92uHjqeKVm+0dAqdfNykRzQ95tay8aXMBAAPpUiq4Qcug2L7neoRh2Egw==" + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.2.tgz", + "integrity": "sha512-I7K1Uu0MBPzaFKg4nI5Q7Vs2t+3gWWW648spaF+Rg7pI9ds18Ugn+lvg4SHczUdKlHI5LWBXyqfS8+DufyBsgQ==" }, "node_modules/damerau-levenshtein": { "version": "1.0.8", @@ -8305,6 +8772,11 @@ "url": "https://github.com/avajs/find-cache-dir?sponsor=1" } }, + "node_modules/find-root": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/find-root/-/find-root-1.1.0.tgz", + "integrity": "sha512-NKfW6bec6GfKc0SGx1e07QZY9PE99u0Bft/0rzSD5k3sO/vwkVUpDUKVm5Gpp5Ue3YfShPFTX2070tDs5kB9Ng==" + }, "node_modules/find-up": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", @@ -16033,6 +16505,11 @@ "postcss": "^8.2.15" } }, + "node_modules/stylis": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/stylis/-/stylis-4.2.0.tgz", + "integrity": 
"sha512-Orov6g6BB1sDfYgzWfTHDOxamtX1bE/zo104Dh9e6fqJ3PooipYyfJ0pUmrZO2wAvO8YbEyeFrkV91XTsGMSrw==" + }, "node_modules/supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -18953,11 +19430,18 @@ } }, "@babel/runtime": { - "version": "7.19.4", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.19.4.tgz", - "integrity": "sha512-EXpLCrk55f+cYqmHsSR+yD/0gAIMxxA9QK9lnQWzhMCvt+YmoBN7Zx94s++Kv0+unHk39vxNO8t+CMA2WSS3wA==", + "version": "7.22.10", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.22.10.tgz", + "integrity": "sha512-21t/fkKLMZI4pqP2wlmsQAWnYW1PDyKyyUV4vCi+B25ydmdaYTKXPwCj0BzSUnZf4seIiYvSA3jcZ3gdsMFkLQ==", "requires": { - "regenerator-runtime": "^0.13.4" + "regenerator-runtime": "^0.14.0" + }, + "dependencies": { + "regenerator-runtime": { + "version": "0.14.0", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.0.tgz", + "integrity": "sha512-srw17NI0TUWHuGa5CFGGmhfNIeja30WMBfbslPNhf6JrqQlLN5gcrvig1oqPxiVaXb0oW0XRKtH6Nngs5lKCIA==" + } } }, "@babel/runtime-corejs3": { @@ -19137,18 +19621,117 @@ "integrity": "sha512-IkpVW/ehM1hWKln4fCA3NzJU8KwD+kIOvPZA4cqxoJHtE21CCzjyp+Kxbu0i5I4tBNOlXPL9mjwnWlL0VEG4Fg==", "requires": {} }, + "@emotion/babel-plugin": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/babel-plugin/-/babel-plugin-11.11.0.tgz", + "integrity": "sha512-m4HEDZleaaCH+XgDDsPF15Ht6wTLsgDTeR3WYj9Q/k76JtWhrJjcP4+/XlG8LGT/Rol9qUfOIztXeA84ATpqPQ==", + "requires": { + "@babel/helper-module-imports": "^7.16.7", + "@babel/runtime": "^7.18.3", + "@emotion/hash": "^0.9.1", + "@emotion/memoize": "^0.8.1", + "@emotion/serialize": "^1.1.2", + "babel-plugin-macros": "^3.1.0", + "convert-source-map": "^1.5.0", + "escape-string-regexp": "^4.0.0", + "find-root": "^1.1.0", + "source-map": "^0.5.7", + "stylis": "4.2.0" + }, + "dependencies": { + "escape-string-regexp": { + "version": "4.0.0", + 
"resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==" + }, + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha512-LbrmJOMUSdEVxIKvdcJzQC+nQhe8FUZQTXQy6+I75skNgn3OoQ0DZA8YnFa7gp8tqtL3KPf1kmo0R5DoApeSGQ==" + } + } + }, + "@emotion/cache": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/cache/-/cache-11.11.0.tgz", + "integrity": "sha512-P34z9ssTCBi3e9EI1ZsWpNHcfY1r09ZO0rZbRO2ob3ZQMnFI35jB536qoXbkdesr5EUhYi22anuEJuyxifaqAQ==", + "requires": { + "@emotion/memoize": "^0.8.1", + "@emotion/sheet": "^1.2.2", + "@emotion/utils": "^1.2.1", + "@emotion/weak-memoize": "^0.3.1", + "stylis": "4.2.0" + } + }, + "@emotion/hash": { + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/@emotion/hash/-/hash-0.9.1.tgz", + "integrity": "sha512-gJB6HLm5rYwSLI6PQa+X1t5CFGrv1J1TWG+sOyMCeKz2ojaj6Fnl/rZEspogG+cvqbt4AE/2eIyD2QfLKTBNlQ==" + }, "@emotion/is-prop-valid": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-1.2.0.tgz", - "integrity": "sha512-3aDpDprjM0AwaxGE09bOPkNxHpBd+kA6jty3RnaEXdweX1DF1U3VQpPYb0g1IStAuK7SVQ1cy+bNBBKp4W3Fjg==", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-1.2.1.tgz", + "integrity": "sha512-61Mf7Ufx4aDxx1xlDeOm8aFFigGHE4z+0sKCa+IHCeZKiyP9RLD0Mmx7m8b9/Cf37f7NAvQOOJAbQQGVr5uERw==", "requires": { - "@emotion/memoize": "^0.8.0" + "@emotion/memoize": "^0.8.1" } }, "@emotion/memoize": { - "version": "0.8.0", - "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.0.tgz", - "integrity": "sha512-G/YwXTkv7Den9mXDO7AhLWkE3q+I92B+VqAE+dYG4NGPaHZGvt3G8Q0p9vmE+sq7rTGphUbAvmQ9YpbfMQGGlA==" + "version": "0.8.1", + "resolved": 
"https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" + }, + "@emotion/react": { + "version": "11.11.1", + "resolved": "https://registry.npmjs.org/@emotion/react/-/react-11.11.1.tgz", + "integrity": "sha512-5mlW1DquU5HaxjLkfkGN1GA/fvVGdyHURRiX/0FHl2cfIfRxSOfmxEH5YS43edp0OldZrZ+dkBKbngxcNCdZvA==", + "requires": { + "@babel/runtime": "^7.18.3", + "@emotion/babel-plugin": "^11.11.0", + "@emotion/cache": "^11.11.0", + "@emotion/serialize": "^1.1.2", + "@emotion/use-insertion-effect-with-fallbacks": "^1.0.1", + "@emotion/utils": "^1.2.1", + "@emotion/weak-memoize": "^0.3.1", + "hoist-non-react-statics": "^3.3.1" + } + }, + "@emotion/serialize": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@emotion/serialize/-/serialize-1.1.2.tgz", + "integrity": "sha512-zR6a/fkFP4EAcCMQtLOhIgpprZOwNmCldtpaISpvz348+DP4Mz8ZoKaGGCQpbzepNIUWbq4w6hNZkwDyKoS+HA==", + "requires": { + "@emotion/hash": "^0.9.1", + "@emotion/memoize": "^0.8.1", + "@emotion/unitless": "^0.8.1", + "@emotion/utils": "^1.2.1", + "csstype": "^3.0.2" + }, + "dependencies": { + "@emotion/unitless": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.8.1.tgz", + "integrity": "sha512-KOEGMu6dmJZtpadb476IsZBclKvILjopjUii3V+7MnXIQCYh8W3NgNcgwo21n9LXZX6EDIKvqfjYxXebDwxKmQ==" + } + } + }, + "@emotion/sheet": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/@emotion/sheet/-/sheet-1.2.2.tgz", + "integrity": "sha512-0QBtGvaqtWi+nx6doRwDdBIzhNdZrXUppvTM4dtZZWEGTXL/XE/yJxLMGlDT1Gt+UHH5IX1n+jkXyytE/av7OA==" + }, + "@emotion/styled": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/styled/-/styled-11.11.0.tgz", + "integrity": "sha512-hM5Nnvu9P3midq5aaXj4I+lnSfNi7Pmd4EWk1fOZ3pxookaQTNew6bp4JaCBYM4HVFZF9g7UjJmsUmC2JlxOng==", + "requires": { + "@babel/runtime": "^7.18.3", + "@emotion/babel-plugin": 
"^11.11.0", + "@emotion/is-prop-valid": "^1.2.1", + "@emotion/serialize": "^1.1.2", + "@emotion/use-insertion-effect-with-fallbacks": "^1.0.1", + "@emotion/utils": "^1.2.1" + } }, "@emotion/stylis": { "version": "0.8.5", @@ -19160,6 +19743,22 @@ "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.7.5.tgz", "integrity": "sha512-OWORNpfjMsSSUBVrRBVGECkhWcULOAJz9ZW8uK9qgxD+87M7jHRcvh/A96XXNhXTLmKcoYSQtBEX7lHMO7YRwg==" }, + "@emotion/use-insertion-effect-with-fallbacks": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@emotion/use-insertion-effect-with-fallbacks/-/use-insertion-effect-with-fallbacks-1.0.1.tgz", + "integrity": "sha512-jT/qyKZ9rzLErtrjGgdkMBn2OP8wl0G3sQlBb3YPryvKHsjvINUhVaPFfP+fpBcOkmrVOVEEHQFJ7nbj2TH2gw==", + "requires": {} + }, + "@emotion/utils": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@emotion/utils/-/utils-1.2.1.tgz", + "integrity": "sha512-Y2tGf3I+XVnajdItskUCn6LX+VUDmP6lTL4fcqsXAv43dnlbZiuW4MWQW38rW/BVWSE7Q/7+XQocmpnRYILUmg==" + }, + "@emotion/weak-memoize": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/@emotion/weak-memoize/-/weak-memoize-0.3.1.tgz", + "integrity": "sha512-EsBwpc7hBUJWAsNPBmJy4hxWx12v6bshQsldrVmjxJoc3isbxhOrF2IcCpaXxfvq03NwkI7sbsOLXbYuqF/8Ww==" + }, "@eslint/eslintrc": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-1.3.3.tgz", @@ -19798,6 +20397,150 @@ "resolved": "https://registry.npmjs.org/@leichtgewicht/ip-codec/-/ip-codec-2.0.4.tgz", "integrity": "sha512-Hcv+nVC0kZnQ3tD9GVu5xSMR4VVYOteQIr/hwFPVEvPdlXqgGEuRjiheChHgdM+JyqdgNcmzZOX/tnl0JOiI7A==" }, + "@mui/base": { + "version": "5.0.0-beta.11", + "resolved": "https://registry.npmjs.org/@mui/base/-/base-5.0.0-beta.11.tgz", + "integrity": "sha512-FdKZGPd8qmC3ZNke7CNhzcEgToc02M6WYZc9hcBsNQ17bgAd3s9F//1bDDYgMVBYxDM71V0sv/hBHlOY4I1ZVA==", + "requires": { + "@babel/runtime": "^7.22.6", + "@emotion/is-prop-valid": "^1.2.1", + "@mui/types": "^7.2.4", + 
"@mui/utils": "^5.14.5", + "@popperjs/core": "^2.11.8", + "clsx": "^2.0.0", + "prop-types": "^15.8.1", + "react-is": "^18.2.0" + }, + "dependencies": { + "react-is": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", + "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==" + } + } + }, + "@mui/core-downloads-tracker": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/core-downloads-tracker/-/core-downloads-tracker-5.14.5.tgz", + "integrity": "sha512-+wpGH1USwPcKMFPMvXqYPC6fEvhxM3FzxC8lyDiNK/imLyyJ6y2DPb1Oue7OGIKJWBmYBqrWWtfovrxd1aJHTA==" + }, + "@mui/icons-material": { + "version": "5.14.3", + "resolved": "https://registry.npmjs.org/@mui/icons-material/-/icons-material-5.14.3.tgz", + "integrity": "sha512-XkxWPhageu1OPUm2LWjo5XqeQ0t2xfGe8EiLkRW9oz2LHMMZmijvCxulhgquUVTF1DnoSh+3KoDLSsoAFtVNVw==", + "requires": { + "@babel/runtime": "^7.22.6" + } + }, + "@mui/lab": { + "version": "5.0.0-alpha.140", + "resolved": "https://registry.npmjs.org/@mui/lab/-/lab-5.0.0-alpha.140.tgz", + "integrity": "sha512-k75jos6jklCD8tA20PAK2H4RSCKycTcR4Pbfz7JbdxIkWXJ+y2MRalwMcen1vpB99v0yZHNUo6BtGz6rvs2jlQ==", + "requires": { + "@babel/runtime": "^7.22.6", + "@mui/base": "5.0.0-beta.11", + "@mui/system": "^5.14.5", + "@mui/types": "^7.2.4", + "@mui/utils": "^5.14.5", + "clsx": "^2.0.0", + "prop-types": "^15.8.1", + "react-is": "^18.2.0" + }, + "dependencies": { + "react-is": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", + "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==" + } + } + }, + "@mui/material": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/material/-/material-5.14.5.tgz", + "integrity": "sha512-4qa4GMfuZH0Ai3mttk5ccXP8a3sf7aPlAJwyMrUSz6h9hPri6BPou94zeu3rENhhmKLby9S/W1y+pmficy8JKA==", + "requires": { + "@babel/runtime": 
"^7.22.6", + "@mui/base": "5.0.0-beta.11", + "@mui/core-downloads-tracker": "^5.14.5", + "@mui/system": "^5.14.5", + "@mui/types": "^7.2.4", + "@mui/utils": "^5.14.5", + "@types/react-transition-group": "^4.4.6", + "clsx": "^2.0.0", + "csstype": "^3.1.2", + "prop-types": "^15.8.1", + "react-is": "^18.2.0", + "react-transition-group": "^4.4.5" + }, + "dependencies": { + "react-is": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", + "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==" + } + } + }, + "@mui/private-theming": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/private-theming/-/private-theming-5.14.5.tgz", + "integrity": "sha512-cC4C5RrpXpDaaZyH9QwmPhRLgz+f2SYbOty3cPkk4qPSOSfif2ZEcDD9HTENKDDd9deB+xkPKzzZhi8cxIx8Ig==", + "requires": { + "@babel/runtime": "^7.22.6", + "@mui/utils": "^5.14.5", + "prop-types": "^15.8.1" + } + }, + "@mui/styled-engine": { + "version": "5.13.2", + "resolved": "https://registry.npmjs.org/@mui/styled-engine/-/styled-engine-5.13.2.tgz", + "integrity": "sha512-VCYCU6xVtXOrIN8lcbuPmoG+u7FYuOERG++fpY74hPpEWkyFQG97F+/XfTQVYzlR2m7nPjnwVUgATcTCMEaMvw==", + "requires": { + "@babel/runtime": "^7.21.0", + "@emotion/cache": "^11.11.0", + "csstype": "^3.1.2", + "prop-types": "^15.8.1" + } + }, + "@mui/system": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/system/-/system-5.14.5.tgz", + "integrity": "sha512-mextXZHDeGcR7E1kx43TRARrVXy+gI4wzpUgNv7MqZs1dvTVXQGVeAT6ydj9d6FUqHBPMNLGV/21vJOrpqsL+w==", + "requires": { + "@babel/runtime": "^7.22.6", + "@mui/private-theming": "^5.14.5", + "@mui/styled-engine": "^5.13.2", + "@mui/types": "^7.2.4", + "@mui/utils": "^5.14.5", + "clsx": "^2.0.0", + "csstype": "^3.1.2", + "prop-types": "^15.8.1" + } + }, + "@mui/types": { + "version": "7.2.4", + "resolved": "https://registry.npmjs.org/@mui/types/-/types-7.2.4.tgz", + "integrity": 
"sha512-LBcwa8rN84bKF+f5sDyku42w1NTxaPgPyYKODsh01U1fVstTClbUoSA96oyRBnSNyEiAVjKm6Gwx9vjR+xyqHA==", + "requires": {} + }, + "@mui/utils": { + "version": "5.14.5", + "resolved": "https://registry.npmjs.org/@mui/utils/-/utils-5.14.5.tgz", + "integrity": "sha512-6Hzw63VR9C5xYv+CbjndoRLU6Gntal8rJ5W+GUzkyHrGWIyYPWZPa6AevnyGioySNETATe1H9oXS8f/7qgIHJA==", + "requires": { + "@babel/runtime": "^7.22.6", + "@types/prop-types": "^15.7.5", + "@types/react-is": "^18.2.1", + "prop-types": "^15.8.1", + "react-is": "^18.2.0" + }, + "dependencies": { + "react-is": { + "version": "18.2.0", + "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.2.0.tgz", + "integrity": "sha512-xWGDIW6x921xtzPkhiULtthJHoJvBbF3q26fzloPCK0hsvxtPVelvftw3zjbHWSkR2km9Z+4uxbDDK/6Zw9B8w==" + } + } + }, "@nicolo-ribaudo/eslint-scope-5-internals": { "version": "5.1.1-v1", "resolved": "https://registry.npmjs.org/@nicolo-ribaudo/eslint-scope-5-internals/-/eslint-scope-5-internals-5.1.1-v1.tgz", @@ -19862,9 +20605,9 @@ } }, "@popperjs/core": { - "version": "2.11.6", - "resolved": "https://registry.npmjs.org/@popperjs/core/-/core-2.11.6.tgz", - "integrity": "sha512-50/17A98tWUfQ176raKiOGXuYpLyyVMkxxG6oylzL3BPOlA6ADGdK7EYunSa4I064xerltq9TGXs8HmOk5E+vw==" + "version": "2.11.8", + "resolved": "https://registry.npmjs.org/@popperjs/core/-/core-2.11.8.tgz", + "integrity": "sha512-P1st0aksCrn9sGZhp8GMYwBnQsbvAWsZAX44oXNNvLHGqAOcoVxmjZiohstwQ7SqKnbR47akdNi+uleWD8+g6A==" }, "@react-aria/ssr": { "version": "3.3.0", @@ -20670,10 +21413,18 @@ "@types/react": "*" } }, + "@types/react-is": { + "version": "18.2.1", + "resolved": "https://registry.npmjs.org/@types/react-is/-/react-is-18.2.1.tgz", + "integrity": "sha512-wyUkmaaSZEzFZivD8F2ftSyAfk6L+DfFliVj/mYdOXbVjRcS87fQJLTnhk6dRZPuJjI+9g6RZJO4PNCngUrmyw==", + "requires": { + "@types/react": "*" + } + }, "@types/react-transition-group": { - "version": "4.4.5", - "resolved": "https://registry.npmjs.org/@types/react-transition-group/-/react-transition-group-4.4.5.tgz", - 
"integrity": "sha512-juKD/eiSM3/xZYzjuzH6ZwpP+/lejltmiS3QEzV/vmb/Q8+HfDmxu+Baga8UEMGBqV88Nbg4l2hY/K2DkyaLLA==", + "version": "4.4.6", + "resolved": "https://registry.npmjs.org/@types/react-transition-group/-/react-transition-group-4.4.6.tgz", + "integrity": "sha512-VnCdSxfcm08KjsJVQcfBmhEQAPnLB8G08hAxn39azX1qYBQ/5RVQuoHuKIcfKOdncuaUvEpFKFzEvbtIMsfVew==", "requires": { "@types/react": "*" } @@ -21906,6 +22657,11 @@ "wrap-ansi": "^7.0.0" } }, + "clsx": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.0.0.tgz", + "integrity": "sha512-rQ1+kcj+ttHG0MKVGBUXwayCCF1oh39BF5COIpRzuCEv8Mwjv0XucrI2ExNTOn9IlLifGClWQcU9BrZORvtw6Q==" + }, "co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -22385,9 +23141,9 @@ } }, "csstype": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.1.tgz", - "integrity": "sha512-DJR/VvkAvSZW9bTouZue2sSxDwdTN92uHjqeKVm+0dAqdfNykRzQ95tay8aXMBAAPpUiq4Qcug2L7neoRh2Egw==" + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.2.tgz", + "integrity": "sha512-I7K1Uu0MBPzaFKg4nI5Q7Vs2t+3gWWW648spaF+Rg7pI9ds18Ugn+lvg4SHczUdKlHI5LWBXyqfS8+DufyBsgQ==" }, "damerau-levenshtein": { "version": "1.0.8", @@ -23689,6 +24445,11 @@ "pkg-dir": "^4.1.0" } }, + "find-root": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/find-root/-/find-root-1.1.0.tgz", + "integrity": "sha512-NKfW6bec6GfKc0SGx1e07QZY9PE99u0Bft/0rzSD5k3sO/vwkVUpDUKVm5Gpp5Ue3YfShPFTX2070tDs5kB9Ng==" + }, "find-up": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", @@ -29070,6 +29831,11 @@ "postcss-selector-parser": "^6.0.4" } }, + "stylis": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/stylis/-/stylis-4.2.0.tgz", + "integrity": "sha512-Orov6g6BB1sDfYgzWfTHDOxamtX1bE/zo104Dh9e6fqJ3PooipYyfJ0pUmrZO2wAvO8YbEyeFrkV91XTsGMSrw==" + }, "supports-color": { "version": "5.5.0", "resolved": 
"https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", diff --git a/stopes/ui/seamlisten/react_app/package.json b/stopes/ui/seamlisten/react_app/package.json index b3c7304..aa8117b 100644 --- a/stopes/ui/seamlisten/react_app/package.json +++ b/stopes/ui/seamlisten/react_app/package.json @@ -3,7 +3,12 @@ "version": "0.1.0", "private": true, "dependencies": { + "@emotion/react": "^11.11.1", + "@emotion/styled": "^11.11.0", "@fortawesome/fontawesome-svg-core": "^6.2.1", + "@mui/icons-material": "^5.14.3", + "@mui/lab": "^5.0.0-alpha.140", + "@mui/material": "^5.14.5", "@testing-library/jest-dom": "^5.16.5", "@testing-library/react": "^13.4.0", "@testing-library/user-event": "^13.5.0", diff --git a/stopes/ui/seamlisten/react_app/src/common/constants/config.js b/stopes/ui/seamlisten/react_app/src/common/constants/config.js index 7158d14..e23988f 100644 --- a/stopes/ui/seamlisten/react_app/src/common/constants/config.js +++ b/stopes/ui/seamlisten/react_app/src/common/constants/config.js @@ -4,13 +4,14 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. -const BACKEND_PORT = process.env.NODE_ENV === "development" ? 
"8000" : "8800"; +const BACKEND_PORT = "8080" export const config = { host: "http://localhost", port: BACKEND_PORT, annotations_route: "/annotations/", general_route: "/general/", + folders_route: "/fetchFolders/", audio_route: "/servefile/", upload_route: "/upload_microphone/", embed_route: "/embed_audio/", diff --git a/stopes/ui/seamlisten/react_app/src/common/fetchers/folder.ts b/stopes/ui/seamlisten/react_app/src/common/fetchers/folder.ts new file mode 100644 index 0000000..641d809 --- /dev/null +++ b/stopes/ui/seamlisten/react_app/src/common/fetchers/folder.ts @@ -0,0 +1,27 @@ +import { config } from "../constants/config"; +import { FolderStructure } from "../types/api"; + +const url = config.host + ":" + config.port + config.folders_route; + +async function fetchFolders(folderPath: string): Promise { + try { + const response = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ folder_path: folderPath, max_depth: 5 }), + }); + + console.log(folderPath); + + if (!response.ok) { + throw response; + } + + return response.json(); + } catch (error) { + console.error("Fetch folders failed:", error); + throw error; + } +} + +export default fetchFolders; diff --git a/stopes/ui/seamlisten/react_app/src/common/types/api.ts b/stopes/ui/seamlisten/react_app/src/common/types/api.ts index 996ad26..732d786 100644 --- a/stopes/ui/seamlisten/react_app/src/common/types/api.ts +++ b/stopes/ui/seamlisten/react_app/src/common/types/api.ts @@ -48,6 +48,15 @@ interface LineResult { columns: Array; } +interface FileNode { + folder: string; + subfolders: FileNode[] | null; + audio_files: string[] | null; + unexplored: boolean; +} + +type FolderStructure = FileNode; + // controls: type CurrentPlayingIDHandler = (params: string) => void; type KeyDownHandler = ( @@ -91,6 +100,7 @@ export { LineQuery, UploadQuery, LineResult, + FolderStructure, Audio, TextAnswer, FaissResult, @@ -98,4 +108,4 @@ export { RowProps, 
CurrentPlayingIDHandler, CellRenderProps, -}; +}; \ No newline at end of file diff --git a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx index b155484..04aece0 100644 --- a/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx +++ b/stopes/ui/seamlisten/react_app/src/components/FileExplorer.tsx @@ -4,15 +4,14 @@ // This source code is licensed under the license found in the // LICENSE file in the root directory of this source tree. - import { useCallback, useEffect, useState } from "react"; - import Button from "react-bootstrap/Button"; import { default as BCol } from "react-bootstrap/Col"; import Form from "react-bootstrap/Form"; import { default as BRow } from "react-bootstrap/Row"; +import FileTree from "./fileviewer/FileTree"; import { Location, @@ -24,29 +23,27 @@ import WaveSurferComponent from "../common/components/audio/WaveSurfer"; import InnerScale from "../common/components/spinners/spinner"; import { config } from "../common/constants/config"; import fetchFiles from "../common/fetchers/mining_result"; -import { LineResult } from "../common/types/api"; +import { LineResult, FolderStructure } from "../common/types/api"; import Help from "./fileviewer/FileExplorerHelp"; import Table from "./fileviewer/Table"; - import { text_to_audio } from "../common/components/audio/audioquery_constructor"; - +import fetchFolders from "../common/fetchers/folder"; const FILENAME_PARAM = "file"; const PAGENUMBER_PARAM = "page"; const NUMBERLINES_PARAM = "lines"; - type LoaderReturn = { filename: string; pageNumber: number; numberLines: number; files: LineResult[]; audioBlob: Blob; + folderData: FolderStructure | null; error: any; }; - function parseParams(searchParams) { return { filename: searchParams.get(FILENAME_PARAM) @@ -57,7 +54,6 @@ function parseParams(searchParams) { }; } - function parseLocation(location: Location) { if (!location) { return null; @@ -65,22 +61,21 @@ 
function parseLocation(location: Location) { return parseParams(new URLSearchParams(location.search)); } - export async function loader({ request }): Promise { const url = new URL(request.url); - const { filename, numberLines, pageNumber } = parseParams(url.searchParams); + const toRet = { filename, numberLines, pageNumber, files: [], audioBlob: undefined, + folderData: undefined, error: null, }; - try { if ( filename.endsWith("tsv.gz") || @@ -96,6 +91,13 @@ export async function loader({ request }): Promise { return toRet; } + if (filename.endsWith("t")) { + const folderData: FolderStructure = await fetchFolders(filename); + toRet.folderData = folderData; + console.log(folderData); + console.log(folderData); + return toRet; + } const audioResult = await text_to_audio(filename, 1); if (audioResult) { @@ -110,7 +112,6 @@ export async function loader({ request }): Promise { return toRet; } - function Error({ error }) { const msg = error.data ? error.data.detail @@ -123,7 +124,6 @@ function Error({ error }) { ); } - function useFileNavigate() { const navigate = useNavigate(); return (file: string, page: number, numberLines: number) => @@ -134,16 +134,22 @@ function useFileNavigate() { ); } - const Files = (): JSX.Element => { - const [displayHelper, setDisplayHelper] = useState(false); + const [displayHelper, setDisplayHelper] = useState(false); const navigate = useFileNavigate(); - let { filename, pageNumber, numberLines, files, audioBlob, error } = - useLoaderData() as LoaderReturn; + let { + filename, + pageNumber, + numberLines, + files, + audioBlob, + error, + folderData, + } = useLoaderData() as LoaderReturn; const [newFilename, setNewFilename] = useState( filename || config.default_path ); - + // if we have a location, we are in a transition between two urls const navigation = useNavigation(); const locationParams = parseLocation(navigation.location); @@ -154,12 +160,10 @@ const Files = (): JSX.Element => { } const loading = !!navigation.location; - // in some 
navigation events (like back/forward navigation, the component is not remounted) // so we need to reset the "default" for the filename form. useEffect(() => setNewFilename(filename), [filename]); - const setFilenameEventHandler = useCallback( (evt) => setNewFilename(evt.target.value), [setNewFilename] @@ -185,18 +189,16 @@ const Files = (): JSX.Element => { [setFilename] ); - // Add new function to handle paste events const fileInputHandlePaste = useCallback( (evt) => { - const pastedData = evt.clipboardData.getData('text'); + const pastedData = evt.clipboardData.getData("text"); setNewFilename(pastedData); navigate(pastedData, pageNumber, numberLines); }, [navigate, pageNumber, numberLines] ); - return (
@@ -255,6 +257,7 @@ const Files = (): JSX.Element => { ) : ( <> + {folderData && } { ); }; - -export default Files; \ No newline at end of file +export default Files; diff --git a/stopes/ui/seamlisten/react_app/src/components/fileviewer/FileTree.tsx b/stopes/ui/seamlisten/react_app/src/components/fileviewer/FileTree.tsx new file mode 100644 index 0000000..a501ab3 --- /dev/null +++ b/stopes/ui/seamlisten/react_app/src/components/fileviewer/FileTree.tsx @@ -0,0 +1,39 @@ +import TreeView from "@mui/lab/TreeView"; +import TreeItem from "@mui/lab/TreeItem"; +import ExpandMoreIcon from "@mui/icons-material/ExpandMore"; +import ChevronRightIcon from "@mui/icons-material/ChevronRight"; +import { FolderStructure } from "../../common/types/api"; + +type FileTreeProps = { + folderData: FolderStructure | null; +}; + +const FileTree: React.FC = ({ folderData }) => { + // Recursive function to render TreeItems + const renderTree = (folderData: FolderStructure, nodeId: string) => { + return ( + + {folderData.subfolders && + folderData.subfolders.map((subfolder, index) => + renderTree(subfolder, `${nodeId}-${index}`) + )} + {folderData.audio_files && + folderData.audio_files.map((file, index) => ( + + ))} + + ); + }; + + return ( + } + defaultExpandIcon={} + > + {renderTree(folderData, "root")} + + ); +}; + +export default FileTree; diff --git a/stopes/ui/seamlisten/react_app/src/components/fileviewer/table/Row.tsx b/stopes/ui/seamlisten/react_app/src/components/fileviewer/table/Row.tsx index c297fec..e1b04f0 100644 --- a/stopes/ui/seamlisten/react_app/src/components/fileviewer/table/Row.tsx +++ b/stopes/ui/seamlisten/react_app/src/components/fileviewer/table/Row.tsx @@ -20,9 +20,12 @@ function CellRender({ if (typeof object === "number") { return {object.toString()}; } + console.log("obj.kind", object.kind) + console.log("CellRender object:", object); switch (object.kind) { case "audio": return ( + <> + ); case "text": return {object.content}; + default: + return {object}; } } + 
export const Row = ({ item, waveformKeyEvent, @@ -88,3 +95,6 @@ export const Row = ({ ); }; + + +//add a sitch so that if obj.kind is undefinded it renders the text and audio \ No newline at end of file