From bce7ede9cd073bdf811af16200f0403be1df54f5 Mon Sep 17 00:00:00 2001
From: Jordan Smith
Date: Tue, 15 Apr 2025 01:03:15 -0700
Subject: [PATCH 1/4] removals

---
 CMakeLists.txt | 80 -
 alignment/README.md | 67 -
 alignment/align.py | 145 --
 alignment/eval.py | 60 -
 alignment/example.sh | 51 -
 alignment/unsup_align.py | 109 --
 alignment/unsup_multialign.py | 198 --
 alignment/utils.py | 154 --
 classification-example.sh | 41 -
 classification-results.sh | 94 -
 crawl/README.md | 26 -
 crawl/dedup.cc | 51 -
 crawl/download_crawl.sh | 57 -
 crawl/filter_dedup.sh | 13 -
 crawl/filter_utf8.cc | 105 --
 crawl/process_wet_file.sh | 30 -
 docs/aligned-vectors.md | 64 -
 docs/api.md | 6 -
 docs/autotune.md | 156 --
 docs/cheatsheet.md | 89 -
 docs/crawl-vectors.md | 203 ---
 docs/dataset.md | 6 -
 docs/english-vectors.md | 53 -
 docs/faqs.md | 66 -
 docs/language-identification.md | 74 -
 docs/options.md | 60 -
 docs/pretrained-vectors.md | 142 --
 docs/python-module.md | 314 ----
 docs/references.md | 41 -
 docs/supervised-models.md | 54 -
 docs/supervised-tutorial.md | 597 ------
 docs/support.md | 58 -
 docs/unsupervised-tutorials.md | 483 -----
 docs/webassembly-module.md | 338 ----
 download_model.py | 48 -
 eval.py | 95 -
 fasttext.pc.in | 10 -
 get-wikimedia.sh | 79 -
 python/README.md | 322 ----
 python/README.rst | 406 -----
 python/benchmarks/README.rst | 3 -
 python/benchmarks/get_word_vector.py | 50 -
 python/doc/examples/FastTextEmbeddingBag.py | 81 -
 python/doc/examples/bin_to_vec.py | 41 -
 python/doc/examples/compute_accuracy.py | 163 --
 python/doc/examples/get_vocab.py | 48 -
 python/doc/examples/train_supervised.py | 43 -
 python/doc/examples/train_unsupervised.py | 56 -
 .../fasttext/tests/__init__.py | 14 -
 .../fasttext/tests/test_configurations.py | 239 ---
 .../fasttext/tests/test_script.py | 629 -------
 .../fasttext_module/fasttext/util/__init__.py | 15 -
 python/fasttext_module/fasttext/util/util.py | 209 ---
 quantization-example.sh | 40 -
 reduce_model.py | 98 -
 runtests.py | 60 -
 scripts/kbcompletion/README.md | 19 -
 scripts/kbcompletion/data.sh | 69 -
 scripts/kbcompletion/eval.cpp | 108 --
 scripts/kbcompletion/fb15k.sh | 49 -
 scripts/kbcompletion/fb15k237.sh | 45 -
 scripts/kbcompletion/svo.sh | 38 -
 scripts/kbcompletion/wn18.sh | 49 -
 scripts/quantization/quantization-results.sh | 43 -
 src/autotune.cc | 477 -----
 src/autotune.h | 89 -
 src/loss.cc | 103 --
 src/loss.h | 39 -
 src/main.cc | 454 -----
 src/meter.cc | 214 ---
 src/meter.h | 91 -
 src/model.cc | 37 -
 src/model.h | 11 -
 src/productquantizer.cc | 155 --
 src/productquantizer.h | 9 -
 src/quantmatrix.cc | 58 +-
 src/quantmatrix.h | 11 +-
 src/utils.cc | 53 -
 src/utils.h | 42 -
 src/vector.cc | 34 -
 src/vector.h | 4 -
 tests/fetch_test_data.sh | 203 ---
 webassembly/README.md | 37 -
 webassembly/doc/examples/misc.html | 62 -
 webassembly/doc/examples/predict.html | 42 -
 .../doc/examples/train_supervised.html | 66 -
 .../doc/examples/train_unsupervised.html | 44 -
 webassembly/fasttext.js | 520 ------
 webassembly/fasttext_wasm.cc | 328 ----
 website/README.md | 6 -
 website/blog/2016-08-18-blog-post.md | 42 -
 website/blog/2017-05-02-blog-post.md | 60 -
 website/blog/2017-10-02-blog-post.md | 90 -
 website/blog/2019-06-25-blog-post.md | 168 --
 website/core/Footer.js | 127 --
 website/package.json | 12 -
 website/pages/en/index.js | 325 ----
 website/sidebars.json | 18 -
 website/siteConfig.js | 105 --
 ...assfasttext_1_1QMatrix-members.html.i4eKqy | 0
 website/static/docs/en/html/annotated.html | 115 --
 website/static/docs/en/html/annotated_dup.js | 4 -
 website/static/docs/en/html/args_8cc.html | 113 --
 website/static/docs/en/html/args_8h.html | 134 --
 website/static/docs/en/html/args_8h.js | 14 -
 .../static/docs/en/html/args_8h_source.html | 139 --
 website/static/docs/en/html/bc_s.png | Bin 676 -> 0 bytes
 website/static/docs/en/html/bdwn.png | Bin 147 -> 0 bytes
 website/static/docs/en/html/classes.html | 121 --
 .../html/classfasttext_1_1Args-members.html | 140 --
 .../docs/en/html/classfasttext_1_1Args.html | 753 --------
 .../docs/en/html/classfasttext_1_1Args.js | 40 -
 .../classfasttext_1_1Dictionary-members.html | 148 --
 .../en/html/classfasttext_1_1Dictionary.html | 1266 -------------
 .../en/html/classfasttext_1_1Dictionary.js | 43 -
 .../classfasttext_1_1FastText-members.html | 145 --
 .../en/html/classfasttext_1_1FastText.html | 1149 ------------
 .../docs/en/html/classfasttext_1_1FastText.js | 45 -
 .../html/classfasttext_1_1Matrix-members.html | 123 --
 .../docs/en/html/classfasttext_1_1Matrix.html | 610 -------
 .../docs/en/html/classfasttext_1_1Matrix.js | 23 -
 .../html/classfasttext_1_1Model-members.html | 150 --
 .../docs/en/html/classfasttext_1_1Model.html | 1400 ---------------
 .../docs/en/html/classfasttext_1_1Model.js | 48 -
 ...sfasttext_1_1ProductQuantizer-members.html | 131 --
 .../classfasttext_1_1ProductQuantizer.html | 950 ----------
 .../html/classfasttext_1_1ProductQuantizer.js | 31 -
 .../classfasttext_1_1QMatrix-members.html | 122 --
 .../en/html/classfasttext_1_1QMatrix.html | 565 ------
 .../docs/en/html/classfasttext_1_1QMatrix.js | 22 -
 .../html/classfasttext_1_1Vector-members.html | 121 --
 .../docs/en/html/classfasttext_1_1Vector.html | 542 ------
 .../docs/en/html/classfasttext_1_1Vector.js | 21 -
 website/static/docs/en/html/closed.png | Bin 132 -> 0 bytes
 .../static/docs/en/html/dictionary_8cc.html | 116 --
 .../static/docs/en/html/dictionary_8h.html | 142 --
 website/static/docs/en/html/dictionary_8h.js | 10 -
 .../docs/en/html/dictionary_8h_source.html | 127 --
 .../dir_68267d1309a1af8e8297ef4c3efbcdba.html | 145 --
 .../dir_68267d1309a1af8e8297ef4c3efbcdba.js | 29 -
 website/static/docs/en/html/doc.png | Bin 746 -> 0 bytes
 website/static/docs/en/html/doxygen.css | 1596 -----------------
 website/static/docs/en/html/doxygen.png | Bin 3779 -> 0 bytes
 website/static/docs/en/html/dynsections.js | 97 -
 website/static/docs/en/html/fasttext_8cc.html | 119 --
 website/static/docs/en/html/fasttext_8h.html | 168 --
 website/static/docs/en/html/fasttext_8h.js | 6 -
 .../docs/en/html/fasttext_8h_source.html | 155 --
 website/static/docs/en/html/favicon.png | Bin 4259 -> 0 bytes
 website/static/docs/en/html/files.html | 125 --
 website/static/docs/en/html/files.js | 4 -
 website/static/docs/en/html/folderclosed.png | Bin 616 -> 0 bytes
 website/static/docs/en/html/folderopen.png | Bin 597 -> 0 bytes
 website/static/docs/en/html/functions.html | 139 --
 .../static/docs/en/html/functions_0x7e.html | 112 --
 website/static/docs/en/html/functions_b.html | 115 --
 website/static/docs/en/html/functions_c.html | 143 --
 website/static/docs/en/html/functions_d.html | 135 --
 website/static/docs/en/html/functions_dup.js | 27 -
 website/static/docs/en/html/functions_e.html | 115 --
 website/static/docs/en/html/functions_f.html | 112 --
 .../static/docs/en/html/functions_func.html | 563 ------
 website/static/docs/en/html/functions_g.html | 145 --
 website/static/docs/en/html/functions_h.html | 112 --
 website/static/docs/en/html/functions_i.html | 121 --
 website/static/docs/en/html/functions_k.html | 106 --
 website/static/docs/en/html/functions_l.html | 140 --
 website/static/docs/en/html/functions_m.html | 153 --
 website/static/docs/en/html/functions_n.html | 164 --
 website/static/docs/en/html/functions_o.html | 116 --
 website/static/docs/en/html/functions_p.html | 161 --
 website/static/docs/en/html/functions_q.html | 135 --
 website/static/docs/en/html/functions_r.html | 116 --
 website/static/docs/en/html/functions_s.html | 159 --
 website/static/docs/en/html/functions_t.html | 138 --
 website/static/docs/en/html/functions_u.html | 106 --
 website/static/docs/en/html/functions_v.html | 106 --
 .../static/docs/en/html/functions_vars.html | 486 -----
 website/static/docs/en/html/functions_w.html | 124 --
 website/static/docs/en/html/functions_z.html | 104 --
 website/static/docs/en/html/globals.html | 170 --
 website/static/docs/en/html/globals_defs.html | 113 --
 website/static/docs/en/html/globals_func.html | 155 --
 website/static/docs/en/html/index.html | 100 --
 website/static/docs/en/html/jquery.js | 87 -
 website/static/docs/en/html/main_8cc.html | 582 ------
 website/static/docs/en/html/main_8cc.js | 22 -
 website/static/docs/en/html/matrix_8cc.html | 114 --
 website/static/docs/en/html/matrix_8h.html | 121 --
 .../static/docs/en/html/matrix_8h_source.html | 123 --
 website/static/docs/en/html/menu.js | 26 -
 website/static/docs/en/html/menudata.js | 90 -
 website/static/docs/en/html/model_8cc.html | 113 --
 website/static/docs/en/html/model_8h.html | 183 --
 website/static/docs/en/html/model_8h.js | 8 -
 .../static/docs/en/html/model_8h_source.html | 139 --
 .../docs/en/html/namespacefasttext.html | 343 ----
 .../static/docs/en/html/namespacefasttext.js | 13 -
 .../en/html/namespacefasttext_1_1utils.html | 158 --
 .../static/docs/en/html/namespacemembers.html | 125 --
 .../docs/en/html/namespacemembers_enum.html | 107 --
 .../docs/en/html/namespacemembers_func.html | 110 --
 .../docs/en/html/namespacemembers_type.html | 104 --
 website/static/docs/en/html/namespaces.html | 106 --
 website/static/docs/en/html/namespaces.js | 4 -
 website/static/docs/en/html/nav_f.png | Bin 153 -> 0 bytes
 website/static/docs/en/html/nav_g.png | Bin 95 -> 0 bytes
 website/static/docs/en/html/nav_h.png | Bin 98 -> 0 bytes
 website/static/docs/en/html/navtree.css | 146 --
 website/static/docs/en/html/navtree.js | 517 ------
 website/static/docs/en/html/navtreedata.js | 40 -
 website/static/docs/en/html/navtreeindex0.js | 253 ---
 website/static/docs/en/html/navtreeindex1.js | 139 --
 website/static/docs/en/html/open.png | Bin 123 -> 0 bytes
 .../docs/en/html/productquantizer_8cc.html | 118 --
 .../docs/en/html/productquantizer_8cc.js | 4 -
 .../docs/en/html/productquantizer_8h.html | 124 --
 .../en/html/productquantizer_8h_source.html | 133 --
 website/static/docs/en/html/qmatrix_8cc.html | 112 --
 website/static/docs/en/html/qmatrix_8h.html | 126 --
 .../docs/en/html/qmatrix_8h_source.html | 128 --
 website/static/docs/en/html/real_8h.html | 117 --
 website/static/docs/en/html/real_8h.js | 4 -
 .../static/docs/en/html/real_8h_source.html | 103 --
 website/static/docs/en/html/resize.js | 114 --
 .../docs/en/html/search/.files_7.html.StRRNc | 0
 .../en/html/search/.variables_a.html.1MGQ27 | 0
 website/static/docs/en/html/search/all_0.html | 26 -
 website/static/docs/en/html/search/all_0.js | 17 -
 website/static/docs/en/html/search/all_1.html | 26 -
 website/static/docs/en/html/search/all_1.js | 8 -
 .../static/docs/en/html/search/all_10.html | 26 -
 website/static/docs/en/html/search/all_10.js | 10 -
 .../static/docs/en/html/search/all_11.html | 26 -
 website/static/docs/en/html/search/all_11.js | 25 -
 .../static/docs/en/html/search/all_12.html | 26 -
 website/static/docs/en/html/search/all_12.js | 15 -
 .../static/docs/en/html/search/all_13.html | 26 -
 website/static/docs/en/html/search/all_13.js | 7 -
 .../static/docs/en/html/search/all_14.html | 26 -
 website/static/docs/en/html/search/all_14.js | 7 -
 .../static/docs/en/html/search/all_15.html | 26 -
 website/static/docs/en/html/search/all_15.js | 11 -
 .../static/docs/en/html/search/all_16.html | 26 -
 website/static/docs/en/html/search/all_16.js | 4 -
 .../static/docs/en/html/search/all_17.html | 26 -
 website/static/docs/en/html/search/all_17.js | 7 -
 website/static/docs/en/html/search/all_2.html | 26 -
 website/static/docs/en/html/search/all_2.js | 17 -
 website/static/docs/en/html/search/all_3.html | 26 -
 website/static/docs/en/html/search/all_3.js | 17 -
 website/static/docs/en/html/search/all_4.html | 26 -
 website/static/docs/en/html/search/all_4.js | 10 -
 website/static/docs/en/html/search/all_5.html | 26 -
 website/static/docs/en/html/search/all_5.js | 12 -
 website/static/docs/en/html/search/all_6.html | 26 -
 website/static/docs/en/html/search/all_6.js | 18 -
 website/static/docs/en/html/search/all_7.html | 26 -
 website/static/docs/en/html/search/all_7.js | 8 -
 website/static/docs/en/html/search/all_8.html | 26 -
 website/static/docs/en/html/search/all_8.js | 11 -
 website/static/docs/en/html/search/all_9.html | 26 -
 website/static/docs/en/html/search/all_9.js | 5 -
 website/static/docs/en/html/search/all_a.html | 26 -
 website/static/docs/en/html/search/all_a.js | 17 -
 website/static/docs/en/html/search/all_b.html | 26 -
 website/static/docs/en/html/search/all_b.js | 27 -
 website/static/docs/en/html/search/all_c.html | 26 -
 website/static/docs/en/html/search/all_c.js | 26 -
 website/static/docs/en/html/search/all_d.html | 26 -
 website/static/docs/en/html/search/all_d.js | 9 -
 website/static/docs/en/html/search/all_e.html | 26 -
 website/static/docs/en/html/search/all_e.js | 35 -
 website/static/docs/en/html/search/all_f.html | 26 -
 website/static/docs/en/html/search/all_f.js | 16 -
 .../static/docs/en/html/search/classes_0.html | 26 -
 .../static/docs/en/html/search/classes_0.js | 4 -
 .../static/docs/en/html/search/classes_1.html | 26 -
 .../static/docs/en/html/search/classes_1.js | 4 -
 .../static/docs/en/html/search/classes_2.html | 26 -
 .../static/docs/en/html/search/classes_2.js | 4 -
 .../static/docs/en/html/search/classes_3.html | 26 -
 .../static/docs/en/html/search/classes_3.js | 4 -
 .../static/docs/en/html/search/classes_4.html | 26 -
 .../static/docs/en/html/search/classes_4.js | 5 -
 .../static/docs/en/html/search/classes_5.html | 26 -
 .../static/docs/en/html/search/classes_5.js | 4 -
 .../static/docs/en/html/search/classes_6.html | 26 -
 .../static/docs/en/html/search/classes_6.js | 4 -
 .../static/docs/en/html/search/classes_7.html | 26 -
 .../static/docs/en/html/search/classes_7.js | 4 -
 .../static/docs/en/html/search/classes_8.html | 26 -
 .../static/docs/en/html/search/classes_8.js | 4 -
 website/static/docs/en/html/search/close.png | Bin 273 -> 0 bytes
 .../static/docs/en/html/search/defines_0.html | 26 -
 .../static/docs/en/html/search/defines_0.js | 5 -
 .../static/docs/en/html/search/defines_1.html | 26 -
 .../static/docs/en/html/search/defines_1.js | 4 -
 .../static/docs/en/html/search/defines_2.html | 26 -
 .../static/docs/en/html/search/defines_2.js | 4 -
 .../static/docs/en/html/search/defines_3.html | 26 -
 .../static/docs/en/html/search/defines_3.js | 4 -
 .../static/docs/en/html/search/enums_0.html | 26 -
 website/static/docs/en/html/search/enums_0.js | 4 -
 .../static/docs/en/html/search/enums_1.html | 26 -
 website/static/docs/en/html/search/enums_1.js | 4 -
 .../static/docs/en/html/search/enums_2.html | 26 -
 website/static/docs/en/html/search/enums_2.js | 4 -
 .../docs/en/html/search/enumvalues_0.html | 26 -
 .../docs/en/html/search/enumvalues_0.js | 4 -
 .../docs/en/html/search/enumvalues_1.html | 26 -
 .../docs/en/html/search/enumvalues_1.js | 4 -
 .../docs/en/html/search/enumvalues_2.html | 26 -
 .../docs/en/html/search/enumvalues_2.js | 4 -
 .../docs/en/html/search/enumvalues_3.html | 26 -
 .../docs/en/html/search/enumvalues_3.js | 4 -
 .../docs/en/html/search/enumvalues_4.html | 26 -
 .../docs/en/html/search/enumvalues_4.js | 6 -
 .../docs/en/html/search/enumvalues_5.html | 26 -
 .../docs/en/html/search/enumvalues_5.js | 4 -
 .../static/docs/en/html/search/files_0.html | 26 -
 website/static/docs/en/html/search/files_0.js | 5 -
 .../static/docs/en/html/search/files_1.html | 26 -
 website/static/docs/en/html/search/files_1.js | 5 -
 .../static/docs/en/html/search/files_2.html | 26 -
 website/static/docs/en/html/search/files_2.js | 5 -
 .../static/docs/en/html/search/files_3.html | 26 -
 website/static/docs/en/html/search/files_3.js | 8 -
 .../static/docs/en/html/search/files_4.html | 26 -
 website/static/docs/en/html/search/files_4.js | 5 -
 .../static/docs/en/html/search/files_5.html | 26 -
 website/static/docs/en/html/search/files_5.js | 5 -
 .../static/docs/en/html/search/files_6.html | 26 -
 website/static/docs/en/html/search/files_6.js | 4 -
 .../static/docs/en/html/search/files_7.html | 26 -
 website/static/docs/en/html/search/files_7.js | 5 -
 .../static/docs/en/html/search/files_8.html | 26 -
 website/static/docs/en/html/search/files_8.js | 5 -
 .../docs/en/html/search/functions_0.html | 26 -
 .../static/docs/en/html/search/functions_0.js | 14 -
 .../docs/en/html/search/functions_1.html | 26 -
 .../static/docs/en/html/search/functions_1.js | 5 -
 .../docs/en/html/search/functions_10.html | 26 -
 .../docs/en/html/search/functions_10.js | 5 -
 .../docs/en/html/search/functions_11.html | 26 -
 .../docs/en/html/search/functions_11.js | 18 -
 .../docs/en/html/search/functions_12.html | 26 -
 .../docs/en/html/search/functions_12.js | 8 -
 .../docs/en/html/search/functions_13.html | 26 -
 .../docs/en/html/search/functions_13.js | 5 -
 .../docs/en/html/search/functions_14.html | 26 -
 .../docs/en/html/search/functions_14.js | 4 -
 .../docs/en/html/search/functions_15.html | 26 -
 .../docs/en/html/search/functions_15.js | 4 -
 .../docs/en/html/search/functions_16.html | 26 -
 .../docs/en/html/search/functions_16.js | 4 -
 .../docs/en/html/search/functions_17.html | 26 -
 .../docs/en/html/search/functions_17.js | 7 -
 .../docs/en/html/search/functions_2.html | 26 -
 .../static/docs/en/html/search/functions_2.js | 11 -
 .../docs/en/html/search/functions_3.html | 26 -
 .../static/docs/en/html/search/functions_3.js | 9 -
 .../docs/en/html/search/functions_4.html | 26 -
 .../static/docs/en/html/search/functions_4.js | 4 -
 .../docs/en/html/search/functions_5.html | 26 -
 .../static/docs/en/html/search/functions_5.js | 7 -
 .../docs/en/html/search/functions_6.html | 26 -
 .../static/docs/en/html/search/functions_6.js | 17 -
 .../docs/en/html/search/functions_7.html | 26 -
 .../static/docs/en/html/search/functions_7.js | 5 -
 .../docs/en/html/search/functions_8.html | 26 -
 .../static/docs/en/html/search/functions_8.js | 8 -
 .../docs/en/html/search/functions_9.html | 26 -
 .../static/docs/en/html/search/functions_9.js | 4 -
 .../docs/en/html/search/functions_a.html | 26 -
 .../static/docs/en/html/search/functions_a.js | 8 -
 .../docs/en/html/search/functions_b.html | 26 -
 .../static/docs/en/html/search/functions_b.js | 10 -
 .../docs/en/html/search/functions_c.html | 26 -
 .../static/docs/en/html/search/functions_c.js | 10 -
 .../docs/en/html/search/functions_d.html | 26 -
 .../static/docs/en/html/search/functions_d.js | 6 -
 .../docs/en/html/search/functions_e.html | 26 -
 .../static/docs/en/html/search/functions_e.js | 26 -
 .../docs/en/html/search/functions_f.html | 26 -
 .../static/docs/en/html/search/functions_f.js | 6 -
 .../static/docs/en/html/search/mag_sel.png | Bin 563 -> 0 bytes
 .../docs/en/html/search/namespaces_0.html | 26 -
 .../docs/en/html/search/namespaces_0.js | 5 -
 .../static/docs/en/html/search/nomatches.html | 12 -
 website/static/docs/en/html/search/search.css | 271 ---
 website/static/docs/en/html/search/search.js | 791 --------
 .../static/docs/en/html/search/search_l.png | Bin 604 -> 0 bytes
 .../static/docs/en/html/search/search_m.png | Bin 158 -> 0 bytes
 .../static/docs/en/html/search/search_r.png | Bin 612 -> 0 bytes
 .../static/docs/en/html/search/searchdata.js | 42 -
 .../docs/en/html/search/typedefs_0.html | 26 -
 .../static/docs/en/html/search/typedefs_0.js | 4 -
 .../docs/en/html/search/typedefs_1.html | 26 -
 .../static/docs/en/html/search/typedefs_1.js | 4 -
 .../docs/en/html/search/variables_0.html | 26 -
 .../static/docs/en/html/search/variables_0.js | 4 -
 .../docs/en/html/search/variables_1.html | 26 -
 .../static/docs/en/html/search/variables_1.js | 6 -
 .../docs/en/html/search/variables_10.html | 26 -
 .../docs/en/html/search/variables_10.js | 8 -
 .../docs/en/html/search/variables_11.html | 26 -
 .../docs/en/html/search/variables_11.js | 11 -
 .../docs/en/html/search/variables_12.html | 26 -
 .../docs/en/html/search/variables_12.js | 4 -
 .../docs/en/html/search/variables_13.html | 26 -
 .../docs/en/html/search/variables_13.js | 10 -
 .../docs/en/html/search/variables_2.html | 26 -
 .../static/docs/en/html/search/variables_2.js | 9 -
 .../docs/en/html/search/variables_3.html | 26 -
 .../static/docs/en/html/search/variables_3.js | 9 -
 .../docs/en/html/search/variables_4.html | 26 -
 .../static/docs/en/html/search/variables_4.js | 7 -
 .../docs/en/html/search/variables_5.html | 26 -
 .../static/docs/en/html/search/variables_5.js | 4 -
 .../docs/en/html/search/variables_6.html | 26 -
 .../static/docs/en/html/search/variables_6.js | 5 -
 .../docs/en/html/search/variables_7.html | 26 -
 .../static/docs/en/html/search/variables_7.js | 5 -
 .../docs/en/html/search/variables_8.html | 26 -
 .../static/docs/en/html/search/variables_8.js | 4 -
 .../docs/en/html/search/variables_9.html | 26 -
 .../static/docs/en/html/search/variables_9.js | 10 -
 .../docs/en/html/search/variables_a.html | 26 -
 .../static/docs/en/html/search/variables_a.js | 14 -
 .../docs/en/html/search/variables_b.html | 26 -
 .../static/docs/en/html/search/variables_b.js | 17 -
 .../docs/en/html/search/variables_c.html | 26 -
 .../static/docs/en/html/search/variables_c.js | 6 -
 .../docs/en/html/search/variables_d.html | 26 -
 .../static/docs/en/html/search/variables_d.js | 10 -
 .../docs/en/html/search/variables_e.html | 26 -
 .../static/docs/en/html/search/variables_e.js | 11 -
 .../docs/en/html/search/variables_f.html | 26 -
 .../static/docs/en/html/search/variables_f.js | 6 -
 website/static/docs/en/html/splitbar.png | Bin 314 -> 0 bytes
 .../html/structfasttext_1_1Node-members.html | 108 --
 .../docs/en/html/structfasttext_1_1Node.html | 194 --
 .../docs/en/html/structfasttext_1_1Node.js | 8 -
 .../html/structfasttext_1_1entry-members.html | 107 --
 .../docs/en/html/structfasttext_1_1entry.html | 178 --
 .../docs/en/html/structfasttext_1_1entry.js | 7 -
 website/static/docs/en/html/sync_off.png | Bin 853 -> 0 bytes
 website/static/docs/en/html/sync_on.png | Bin 845 -> 0 bytes
 website/static/docs/en/html/tab_a.png | Bin 142 -> 0 bytes
 website/static/docs/en/html/tab_b.png | Bin 169 -> 0 bytes
 website/static/docs/en/html/tab_h.png | Bin 177 -> 0 bytes
 website/static/docs/en/html/tab_s.png | Bin 184 -> 0 bytes
 website/static/docs/en/html/tabs.css | 1 -
 website/static/docs/en/html/utils_8cc.html | 121 --
 website/static/docs/en/html/utils_8cc.js | 5 -
 website/static/docs/en/html/utils_8h.html | 122 --
 website/static/docs/en/html/utils_8h.js | 5 -
 .../static/docs/en/html/utils_8h_source.html | 104 --
 website/static/docs/en/html/vector_8cc.html | 121 --
 website/static/docs/en/html/vector_8cc.js | 4 -
 website/static/docs/en/html/vector_8h.html | 126 --
 website/static/docs/en/html/vector_8h.js | 5 -
 .../static/docs/en/html/vector_8h_source.html | 120 --
 website/static/fasttext.css | 57 -
 website/static/img/authors/armand_joulin.jpg | Bin 12566 -> 0 bytes
 .../static/img/authors/christian_puhrsch.png | Bin 125356 -> 0 bytes
 website/static/img/authors/edouard_grave.jpeg | Bin 14397 -> 0 bytes
 .../static/img/authors/piotr_bojanowski.jpg | Bin 823035 -> 0 bytes
 website/static/img/authors/tomas_mikolov.jpg | Bin 2184912 -> 0 bytes
 .../img/blog/2016-08-18-blog-post-img1.png | Bin 13686 -> 0 bytes
 .../img/blog/2016-08-18-blog-post-img2.png | Bin 21772 -> 0 bytes
 .../img/blog/2017-05-02-blog-post-img1.jpg | Bin 157398 -> 0 bytes
 .../img/blog/2017-05-02-blog-post-img2.jpg | Bin 116081 -> 0 bytes
 .../img/blog/2017-10-02-blog-post-img1.png | Bin 38634 -> 0 bytes
 website/static/img/cbo_vs_skipgram.png | Bin 51144 -> 0 bytes
 website/static/img/fasttext-icon-api.png | Bin 1573 -> 0 bytes
 website/static/img/fasttext-icon-bg-web.png | Bin 6104 -> 0 bytes
 .../static/img/fasttext-icon-color-square.png | Bin 3587 -> 0 bytes
 .../static/img/fasttext-icon-color-web.png | Bin 3393 -> 0 bytes
 website/static/img/fasttext-icon-faq.png | Bin 5584 -> 0 bytes
 website/static/img/fasttext-icon-tutorial.png | Bin 5728 -> 0 bytes
 .../static/img/fasttext-icon-white-web.png | Bin 4611 -> 0 bytes
 .../static/img/fasttext-logo-color-web.png | Bin 29206 -> 0 bytes
 .../static/img/fasttext-logo-white-web.png | Bin 25504 -> 0 bytes
 website/static/img/logo-color.png | Bin 29206 -> 0 bytes
 website/static/img/model-black.png | Bin 13144 -> 0 bytes
 website/static/img/model-blue.png | Bin 22703 -> 0 bytes
 website/static/img/model-red.png | Bin 23274 -> 0 bytes
 website/static/img/ogimage.png | Bin 26643 -> 0 bytes
 website/static/img/oss_logo.png | Bin 4370 -> 0 bytes
 website/static/tabber.js | 42 -
 wikifil.pl | 57 -
 word-vector-example.sh | 39 -
 494 files changed, 22 insertions(+), 39137 deletions(-)
 delete mode 100644 CMakeLists.txt
 delete mode 100644 alignment/README.md
 delete mode 100644 alignment/align.py
 delete mode 100644 alignment/eval.py
 delete mode 100755 alignment/example.sh
 delete mode 100644 alignment/unsup_align.py
 delete mode 100644 alignment/unsup_multialign.py
 delete mode 100644 alignment/utils.py
 delete mode 100755 classification-example.sh
 delete mode 100755 classification-results.sh
 delete mode 100644 crawl/README.md
 delete mode 100644 crawl/dedup.cc
 delete mode 100644 crawl/download_crawl.sh
 delete mode 100644 crawl/filter_dedup.sh
 delete mode 100644 crawl/filter_utf8.cc
 delete mode 100644 crawl/process_wet_file.sh
 delete mode 100644 docs/aligned-vectors.md
 delete mode 100644 docs/api.md
 delete mode 100644 docs/autotune.md
 delete mode 100644 docs/cheatsheet.md
 delete mode 100644 docs/crawl-vectors.md
 delete mode 100644 docs/dataset.md
 delete mode 100644 docs/english-vectors.md
 delete mode 100644 docs/faqs.md
 delete mode 100644 docs/language-identification.md
 delete mode 100644 docs/options.md
 delete mode 100644 docs/pretrained-vectors.md
 delete mode 100644 docs/python-module.md
 delete mode 100644 docs/references.md
 delete mode 100644 docs/supervised-models.md
 delete mode 100644 docs/supervised-tutorial.md
 delete mode 100644 docs/support.md
 delete mode 100644 docs/unsupervised-tutorials.md
 delete mode 100644 docs/webassembly-module.md
 delete mode 100755 download_model.py
 delete mode 100644 eval.py
 delete mode 100644 fasttext.pc.in
 delete mode 100755 get-wikimedia.sh
 delete mode 100644 python/README.md
 delete mode 100644 python/README.rst
 delete mode 100644 python/benchmarks/README.rst
 delete mode 100644 python/benchmarks/get_word_vector.py
 delete mode 100644 python/doc/examples/FastTextEmbeddingBag.py
 delete mode 100644 python/doc/examples/bin_to_vec.py
 delete mode 100644 python/doc/examples/compute_accuracy.py
 delete mode 100644 python/doc/examples/get_vocab.py
 delete mode 100644 python/doc/examples/train_supervised.py
 delete mode 100644 python/doc/examples/train_unsupervised.py
 delete mode 100644 python/fasttext_module/fasttext/tests/__init__.py
 delete mode 100644 python/fasttext_module/fasttext/tests/test_configurations.py
 delete mode 100644 python/fasttext_module/fasttext/tests/test_script.py
 delete mode 100644 python/fasttext_module/fasttext/util/__init__.py
 delete mode 100644 python/fasttext_module/fasttext/util/util.py
 delete mode 100755 quantization-example.sh
 delete mode 100755 reduce_model.py
 delete mode 100644 runtests.py
 delete mode 100644 scripts/kbcompletion/README.md
 delete mode 100755 scripts/kbcompletion/data.sh
 delete mode 100644 scripts/kbcompletion/eval.cpp
 delete mode 100755 scripts/kbcompletion/fb15k.sh
 delete mode 100755 scripts/kbcompletion/fb15k237.sh
 delete mode 100755 scripts/kbcompletion/svo.sh
 delete mode 100755 scripts/kbcompletion/wn18.sh
 delete mode 100644 scripts/quantization/quantization-results.sh
 delete mode 100644 src/autotune.cc
 delete mode 100644 src/autotune.h
 delete mode 100644 src/main.cc
 delete mode 100644 src/meter.cc
 delete mode 100644 src/meter.h
 delete mode 100644 src/utils.cc
 delete mode 100755 tests/fetch_test_data.sh
 delete mode 100644 webassembly/README.md
 delete mode 100644 webassembly/doc/examples/misc.html
 delete mode 100644 webassembly/doc/examples/predict.html
 delete mode 100644 webassembly/doc/examples/train_supervised.html
 delete mode 100644 webassembly/doc/examples/train_unsupervised.html
 delete mode 100644 webassembly/fasttext.js
 delete mode 100644 webassembly/fasttext_wasm.cc
 delete mode 100644 website/README.md
 delete mode 100644 website/blog/2016-08-18-blog-post.md
 delete mode 100755 website/blog/2017-05-02-blog-post.md
 delete mode 100644 website/blog/2017-10-02-blog-post.md
 delete mode 100644 website/blog/2019-06-25-blog-post.md
 delete mode 100644 website/core/Footer.js
 delete mode 100644 website/package.json
 delete mode 100755 website/pages/en/index.js
 delete mode 100644 website/sidebars.json
 delete mode 100644 website/siteConfig.js
 delete mode 100644 website/static/docs/en/html/.classfasttext_1_1QMatrix-members.html.i4eKqy
 delete mode 100644 website/static/docs/en/html/annotated.html
 delete mode 100644 website/static/docs/en/html/annotated_dup.js
 delete mode 100644 website/static/docs/en/html/args_8cc.html
 delete mode 100644 website/static/docs/en/html/args_8h.html
 delete mode 100644 website/static/docs/en/html/args_8h.js
 delete mode 100644 website/static/docs/en/html/args_8h_source.html
 delete mode 100644 website/static/docs/en/html/bc_s.png
 delete mode 100644 website/static/docs/en/html/bdwn.png
 delete mode 100644 website/static/docs/en/html/classes.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Args-members.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Args.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Args.js
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Dictionary-members.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Dictionary.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Dictionary.js
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1FastText-members.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1FastText.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1FastText.js
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Matrix-members.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Matrix.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Matrix.js
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Model-members.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Model.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Model.js
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1ProductQuantizer-members.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1ProductQuantizer.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1ProductQuantizer.js
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1QMatrix-members.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1QMatrix.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1QMatrix.js
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Vector-members.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Vector.html
 delete mode 100644 website/static/docs/en/html/classfasttext_1_1Vector.js
 delete mode 100644 website/static/docs/en/html/closed.png
 delete mode 100644 website/static/docs/en/html/dictionary_8cc.html
 delete mode 100644 website/static/docs/en/html/dictionary_8h.html
 delete mode 100644 website/static/docs/en/html/dictionary_8h.js
 delete mode 100644 website/static/docs/en/html/dictionary_8h_source.html
 delete mode 100644 website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html
 delete mode 100644 website/static/docs/en/html/dir_68267d1309a1af8e8297ef4c3efbcdba.js
 delete mode 100644 website/static/docs/en/html/doc.png
 delete mode 100644 website/static/docs/en/html/doxygen.css
 delete mode 100644 website/static/docs/en/html/doxygen.png
 delete mode 100644 website/static/docs/en/html/dynsections.js
 delete mode 100644 website/static/docs/en/html/fasttext_8cc.html
 delete mode 100644 website/static/docs/en/html/fasttext_8h.html
 delete mode 100644 website/static/docs/en/html/fasttext_8h.js
 delete mode 100644 website/static/docs/en/html/fasttext_8h_source.html
 delete mode 100644 website/static/docs/en/html/favicon.png
 delete mode 100644 website/static/docs/en/html/files.html
 delete mode 100644 website/static/docs/en/html/files.js
 delete mode 100644 website/static/docs/en/html/folderclosed.png
 delete mode 100644 website/static/docs/en/html/folderopen.png
 delete mode 100644 website/static/docs/en/html/functions.html
 delete mode 100644 website/static/docs/en/html/functions_0x7e.html
 delete mode 100644 website/static/docs/en/html/functions_b.html
 delete mode 100644 website/static/docs/en/html/functions_c.html
 delete mode 100644 website/static/docs/en/html/functions_d.html
 delete mode 100644 website/static/docs/en/html/functions_dup.js
 delete mode 100644 website/static/docs/en/html/functions_e.html
 delete mode 100644 website/static/docs/en/html/functions_f.html
 delete mode 100644 website/static/docs/en/html/functions_func.html
 delete mode 100644 website/static/docs/en/html/functions_g.html
 delete mode 100644 website/static/docs/en/html/functions_h.html
 delete mode 100644 website/static/docs/en/html/functions_i.html
 delete mode 100644 website/static/docs/en/html/functions_k.html
 delete mode 100644 website/static/docs/en/html/functions_l.html
 delete mode 100644 website/static/docs/en/html/functions_m.html
 delete mode 100644 website/static/docs/en/html/functions_n.html
 delete mode 100644 website/static/docs/en/html/functions_o.html
 delete mode 100644 website/static/docs/en/html/functions_p.html
 delete mode 100644 website/static/docs/en/html/functions_q.html
 delete mode 100644 website/static/docs/en/html/functions_r.html
 delete mode 100644 website/static/docs/en/html/functions_s.html
 delete mode 100644 website/static/docs/en/html/functions_t.html
 delete mode 100644 website/static/docs/en/html/functions_u.html
 delete mode 100644 website/static/docs/en/html/functions_v.html
 delete mode 100644 website/static/docs/en/html/functions_vars.html
 delete mode 100644 website/static/docs/en/html/functions_w.html
 delete mode 100644 website/static/docs/en/html/functions_z.html
 delete mode 100644 website/static/docs/en/html/globals.html
 delete mode 100644 website/static/docs/en/html/globals_defs.html
 delete mode 100644 website/static/docs/en/html/globals_func.html
 delete mode 100644 website/static/docs/en/html/index.html
 delete mode 100644 website/static/docs/en/html/jquery.js
 delete mode 100644 website/static/docs/en/html/main_8cc.html
 delete mode 100644 website/static/docs/en/html/main_8cc.js
 delete mode 100644 website/static/docs/en/html/matrix_8cc.html
 delete mode 100644 website/static/docs/en/html/matrix_8h.html
 delete mode 100644 website/static/docs/en/html/matrix_8h_source.html
 delete mode 100644 website/static/docs/en/html/menu.js
 delete mode 100644 website/static/docs/en/html/menudata.js
 delete mode 100644 website/static/docs/en/html/model_8cc.html
 delete mode 100644 website/static/docs/en/html/model_8h.html
 delete mode 100644 website/static/docs/en/html/model_8h.js
 delete mode 100644 website/static/docs/en/html/model_8h_source.html
 delete mode 100644 website/static/docs/en/html/namespacefasttext.html
 delete mode 100644 website/static/docs/en/html/namespacefasttext.js
 delete mode 100644 website/static/docs/en/html/namespacefasttext_1_1utils.html
 delete mode 100644 website/static/docs/en/html/namespacemembers.html
 delete mode 100644 website/static/docs/en/html/namespacemembers_enum.html
 delete mode 100644 website/static/docs/en/html/namespacemembers_func.html
 delete mode 100644 website/static/docs/en/html/namespacemembers_type.html
 delete mode 100644 website/static/docs/en/html/namespaces.html
 delete mode 100644 website/static/docs/en/html/namespaces.js
 delete mode 100644 website/static/docs/en/html/nav_f.png
 delete mode 100644 website/static/docs/en/html/nav_g.png
 delete mode 100644 website/static/docs/en/html/nav_h.png
 delete mode 100644 website/static/docs/en/html/navtree.css
 delete mode 100644 website/static/docs/en/html/navtree.js
 delete mode 100644 website/static/docs/en/html/navtreedata.js
 delete mode 100644 website/static/docs/en/html/navtreeindex0.js
 delete mode 100644 website/static/docs/en/html/navtreeindex1.js
 delete mode 100644 website/static/docs/en/html/open.png
 delete mode 100644 website/static/docs/en/html/productquantizer_8cc.html
 delete mode 100644 website/static/docs/en/html/productquantizer_8cc.js
 delete mode 100644 website/static/docs/en/html/productquantizer_8h.html
 delete mode 100644 website/static/docs/en/html/productquantizer_8h_source.html
 delete mode 100644 website/static/docs/en/html/qmatrix_8cc.html
 delete mode 100644 website/static/docs/en/html/qmatrix_8h.html
 delete mode 100644 website/static/docs/en/html/qmatrix_8h_source.html
 delete mode 100644 website/static/docs/en/html/real_8h.html
 delete mode 100644 website/static/docs/en/html/real_8h.js
 delete mode 100644 website/static/docs/en/html/real_8h_source.html
 delete mode 100644 website/static/docs/en/html/resize.js
 delete mode 100644 website/static/docs/en/html/search/.files_7.html.StRRNc
 delete mode 100644 website/static/docs/en/html/search/.variables_a.html.1MGQ27
 delete mode 100644 website/static/docs/en/html/search/all_0.html
 delete mode 100644 website/static/docs/en/html/search/all_0.js
 delete mode 100644 website/static/docs/en/html/search/all_1.html
 delete mode 100644 website/static/docs/en/html/search/all_1.js
 delete mode 100644 website/static/docs/en/html/search/all_10.html
 delete mode 100644 website/static/docs/en/html/search/all_10.js
 delete mode 100644 website/static/docs/en/html/search/all_11.html
 delete mode 100644 website/static/docs/en/html/search/all_11.js
 delete mode 100644 website/static/docs/en/html/search/all_12.html
 delete mode 100644 website/static/docs/en/html/search/all_12.js
 delete mode 100644 website/static/docs/en/html/search/all_13.html
 delete mode 100644 website/static/docs/en/html/search/all_13.js
 delete mode 100644 website/static/docs/en/html/search/all_14.html
 delete mode 100644 website/static/docs/en/html/search/all_14.js
 delete mode 100644 website/static/docs/en/html/search/all_15.html
 delete mode 100644 website/static/docs/en/html/search/all_15.js
 delete mode 100644 website/static/docs/en/html/search/all_16.html
 delete mode 100644 website/static/docs/en/html/search/all_16.js
 delete mode 100644 website/static/docs/en/html/search/all_17.html
 delete mode 100644 website/static/docs/en/html/search/all_17.js
 delete mode 100644 website/static/docs/en/html/search/all_2.html
 delete mode 100644 website/static/docs/en/html/search/all_2.js
 delete mode 100644 website/static/docs/en/html/search/all_3.html
 delete mode 100644 website/static/docs/en/html/search/all_3.js
 delete mode 100644 website/static/docs/en/html/search/all_4.html
 delete mode 100644 website/static/docs/en/html/search/all_4.js
 delete mode 100644 website/static/docs/en/html/search/all_5.html
 delete mode 100644 website/static/docs/en/html/search/all_5.js
 delete mode 100644 website/static/docs/en/html/search/all_6.html
 delete mode 100644 website/static/docs/en/html/search/all_6.js
 delete mode 100644 website/static/docs/en/html/search/all_7.html
 delete mode 100644 website/static/docs/en/html/search/all_7.js
 delete mode 100644 website/static/docs/en/html/search/all_8.html
 delete mode 100644 website/static/docs/en/html/search/all_8.js
 delete mode 100644 website/static/docs/en/html/search/all_9.html
 delete mode 100644 website/static/docs/en/html/search/all_9.js
 delete mode 100644 website/static/docs/en/html/search/all_a.html
 delete mode 100644 website/static/docs/en/html/search/all_a.js
 delete mode 100644 website/static/docs/en/html/search/all_b.html
 delete mode 100644 website/static/docs/en/html/search/all_b.js
 delete mode 100644 website/static/docs/en/html/search/all_c.html
 delete mode 100644 website/static/docs/en/html/search/all_c.js
 delete mode 100644 website/static/docs/en/html/search/all_d.html
 delete mode 100644 website/static/docs/en/html/search/all_d.js
 delete mode 100644 website/static/docs/en/html/search/all_e.html
 delete mode 100644 website/static/docs/en/html/search/all_e.js
 delete mode 100644 website/static/docs/en/html/search/all_f.html
 delete mode 100644 website/static/docs/en/html/search/all_f.js
 delete mode 100644 website/static/docs/en/html/search/classes_0.html
 delete mode 100644 website/static/docs/en/html/search/classes_0.js
 delete mode 100644 website/static/docs/en/html/search/classes_1.html
 delete mode 100644 website/static/docs/en/html/search/classes_1.js
 delete mode 100644 website/static/docs/en/html/search/classes_2.html
 delete mode 100644 website/static/docs/en/html/search/classes_2.js
 delete mode 100644 website/static/docs/en/html/search/classes_3.html
 delete mode 100644 website/static/docs/en/html/search/classes_3.js
 delete mode 100644 website/static/docs/en/html/search/classes_4.html
 delete mode 100644 website/static/docs/en/html/search/classes_4.js
 delete mode 100644 website/static/docs/en/html/search/classes_5.html
 delete mode 100644 website/static/docs/en/html/search/classes_5.js
 delete mode 100644 website/static/docs/en/html/search/classes_6.html
 delete mode 100644 website/static/docs/en/html/search/classes_6.js
 delete mode 100644 website/static/docs/en/html/search/classes_7.html
 delete mode 100644 website/static/docs/en/html/search/classes_7.js
 delete mode 100644 website/static/docs/en/html/search/classes_8.html
 delete mode 100644 website/static/docs/en/html/search/classes_8.js
 delete mode 100644 website/static/docs/en/html/search/close.png
 delete mode 100644 website/static/docs/en/html/search/defines_0.html
 delete mode 100644 website/static/docs/en/html/search/defines_0.js
 delete mode 100644 website/static/docs/en/html/search/defines_1.html
 delete mode 100644 website/static/docs/en/html/search/defines_1.js
 delete mode 100644 website/static/docs/en/html/search/defines_2.html
 delete mode 100644 website/static/docs/en/html/search/defines_2.js
 delete mode 100644 website/static/docs/en/html/search/defines_3.html
 delete mode 100644 website/static/docs/en/html/search/defines_3.js
 delete mode 100644 website/static/docs/en/html/search/enums_0.html
 delete mode 100644 website/static/docs/en/html/search/enums_0.js
 delete mode 100644 website/static/docs/en/html/search/enums_1.html
 delete mode 100644 website/static/docs/en/html/search/enums_1.js
 delete mode 100644 website/static/docs/en/html/search/enums_2.html
 delete mode 100644 website/static/docs/en/html/search/enums_2.js
 delete mode 100644 website/static/docs/en/html/search/enumvalues_0.html
 delete mode 100644 website/static/docs/en/html/search/enumvalues_0.js
 delete mode 100644 website/static/docs/en/html/search/enumvalues_1.html
 delete mode 100644 website/static/docs/en/html/search/enumvalues_1.js
 delete mode 100644 website/static/docs/en/html/search/enumvalues_2.html
 delete mode 100644 website/static/docs/en/html/search/enumvalues_2.js
 delete mode 100644 website/static/docs/en/html/search/enumvalues_3.html
 delete mode 100644 website/static/docs/en/html/search/enumvalues_3.js
 delete mode 100644 website/static/docs/en/html/search/enumvalues_4.html
 delete mode 100644 website/static/docs/en/html/search/enumvalues_4.js
 delete mode 100644 website/static/docs/en/html/search/enumvalues_5.html
 delete mode 100644 website/static/docs/en/html/search/enumvalues_5.js
 delete mode 100644 website/static/docs/en/html/search/files_0.html
 delete mode 100644 website/static/docs/en/html/search/files_0.js
 delete mode 100644 website/static/docs/en/html/search/files_1.html
 delete mode 100644 website/static/docs/en/html/search/files_1.js
 delete mode 100644 website/static/docs/en/html/search/files_2.html
 delete mode 100644 website/static/docs/en/html/search/files_2.js
 delete mode 100644 website/static/docs/en/html/search/files_3.html
 delete mode 100644 website/static/docs/en/html/search/files_3.js
 delete mode 100644 website/static/docs/en/html/search/files_4.html
 delete mode 100644 website/static/docs/en/html/search/files_4.js
 delete mode 100644 website/static/docs/en/html/search/files_5.html
 delete mode 100644 website/static/docs/en/html/search/files_5.js
 delete mode 100644 website/static/docs/en/html/search/files_6.html
 delete mode 100644 website/static/docs/en/html/search/files_6.js
 delete mode 100644 website/static/docs/en/html/search/files_7.html
 delete mode 100644 website/static/docs/en/html/search/files_7.js
 delete mode 100644 website/static/docs/en/html/search/files_8.html
 delete mode 100644 website/static/docs/en/html/search/files_8.js
 delete mode 100644 website/static/docs/en/html/search/functions_0.html
 delete mode 100644 website/static/docs/en/html/search/functions_0.js
 delete mode 100644 website/static/docs/en/html/search/functions_1.html
 delete mode 100644 website/static/docs/en/html/search/functions_1.js
 delete mode 100644 website/static/docs/en/html/search/functions_10.html
 delete mode 100644 website/static/docs/en/html/search/functions_10.js
 delete mode 100644 website/static/docs/en/html/search/functions_11.html
 delete mode 100644 website/static/docs/en/html/search/functions_11.js
 delete mode 100644 website/static/docs/en/html/search/functions_12.html
 delete mode 100644 website/static/docs/en/html/search/functions_12.js
 delete mode 100644 website/static/docs/en/html/search/functions_13.html
 delete mode 100644 website/static/docs/en/html/search/functions_13.js
 delete mode 100644 website/static/docs/en/html/search/functions_14.html
 delete mode 100644 website/static/docs/en/html/search/functions_14.js
 delete mode 100644 website/static/docs/en/html/search/functions_15.html
 delete mode 100644 website/static/docs/en/html/search/functions_15.js
 delete mode 100644 website/static/docs/en/html/search/functions_16.html
 delete mode 100644 website/static/docs/en/html/search/functions_16.js
 delete mode 100644 website/static/docs/en/html/search/functions_17.html
 delete mode 100644 website/static/docs/en/html/search/functions_17.js
 delete mode 100644 website/static/docs/en/html/search/functions_2.html
 delete mode 100644 website/static/docs/en/html/search/functions_2.js
 delete mode 100644 website/static/docs/en/html/search/functions_3.html
 delete mode 100644 website/static/docs/en/html/search/functions_3.js
 delete mode 100644 website/static/docs/en/html/search/functions_4.html
 delete mode 100644 website/static/docs/en/html/search/functions_4.js
 delete mode 100644 website/static/docs/en/html/search/functions_5.html
 delete mode 100644 website/static/docs/en/html/search/functions_5.js
 delete mode 100644 website/static/docs/en/html/search/functions_6.html
 delete mode 100644 website/static/docs/en/html/search/functions_6.js
 delete mode 100644 website/static/docs/en/html/search/functions_7.html
 delete mode 100644 website/static/docs/en/html/search/functions_7.js
 delete mode 100644 website/static/docs/en/html/search/functions_8.html
 delete mode 100644 website/static/docs/en/html/search/functions_8.js
 delete mode 100644 website/static/docs/en/html/search/functions_9.html
 delete mode 100644 website/static/docs/en/html/search/functions_9.js
 delete mode 100644 website/static/docs/en/html/search/functions_a.html
 delete mode 100644 website/static/docs/en/html/search/functions_a.js
 delete mode 100644 website/static/docs/en/html/search/functions_b.html
 delete mode 100644 website/static/docs/en/html/search/functions_b.js
 delete mode 100644 website/static/docs/en/html/search/functions_c.html
 delete mode 100644 website/static/docs/en/html/search/functions_c.js
 delete mode 100644 website/static/docs/en/html/search/functions_d.html
 delete mode 100644 website/static/docs/en/html/search/functions_d.js
 delete mode 100644 website/static/docs/en/html/search/functions_e.html
 delete mode 100644 website/static/docs/en/html/search/functions_e.js
 delete mode 100644 website/static/docs/en/html/search/functions_f.html
 delete mode 100644 website/static/docs/en/html/search/functions_f.js
 delete mode 100644 website/static/docs/en/html/search/mag_sel.png
 delete mode 100644 website/static/docs/en/html/search/namespaces_0.html
 delete mode 100644 website/static/docs/en/html/search/namespaces_0.js
 delete mode 100644 website/static/docs/en/html/search/nomatches.html
 delete mode 100644 website/static/docs/en/html/search/search.css
 delete mode 100644 website/static/docs/en/html/search/search.js
 delete mode 100644 website/static/docs/en/html/search/search_l.png
 delete mode 100644 website/static/docs/en/html/search/search_m.png
 delete mode 100644 website/static/docs/en/html/search/search_r.png
 delete mode 100644 website/static/docs/en/html/search/searchdata.js
 delete mode 100644 website/static/docs/en/html/search/typedefs_0.html
 delete mode 100644 website/static/docs/en/html/search/typedefs_0.js
 delete mode 100644 website/static/docs/en/html/search/typedefs_1.html
 delete mode 100644 website/static/docs/en/html/search/typedefs_1.js
 delete mode 100644 website/static/docs/en/html/search/variables_0.html
 delete mode 100644 website/static/docs/en/html/search/variables_0.js
 delete mode 100644 website/static/docs/en/html/search/variables_1.html
 delete mode 100644 website/static/docs/en/html/search/variables_1.js
 delete mode 100644 website/static/docs/en/html/search/variables_10.html
 delete mode 100644 website/static/docs/en/html/search/variables_10.js
 delete mode 100644 website/static/docs/en/html/search/variables_11.html
 delete mode 100644 website/static/docs/en/html/search/variables_11.js
 delete mode 100644 website/static/docs/en/html/search/variables_12.html
 delete mode 100644 website/static/docs/en/html/search/variables_12.js
 delete mode 100644 website/static/docs/en/html/search/variables_13.html
 delete mode 100644 website/static/docs/en/html/search/variables_13.js
 delete mode 100644 website/static/docs/en/html/search/variables_2.html
 delete mode 100644 website/static/docs/en/html/search/variables_2.js
 delete mode 100644 website/static/docs/en/html/search/variables_3.html
 delete mode 100644 website/static/docs/en/html/search/variables_3.js
 delete mode 100644 website/static/docs/en/html/search/variables_4.html
 delete mode 100644 website/static/docs/en/html/search/variables_4.js
 delete mode 100644 website/static/docs/en/html/search/variables_5.html
 delete mode 100644 website/static/docs/en/html/search/variables_5.js
 delete mode 100644 website/static/docs/en/html/search/variables_6.html
 delete mode 100644 website/static/docs/en/html/search/variables_6.js
 delete mode 100644 website/static/docs/en/html/search/variables_7.html
 delete mode 100644 website/static/docs/en/html/search/variables_7.js
 delete mode 100644 website/static/docs/en/html/search/variables_8.html
 delete mode 100644 website/static/docs/en/html/search/variables_8.js
 delete mode 100644 website/static/docs/en/html/search/variables_9.html
 delete mode 100644 website/static/docs/en/html/search/variables_9.js
 delete mode 100644 website/static/docs/en/html/search/variables_a.html
 delete mode 100644 website/static/docs/en/html/search/variables_a.js
 delete mode 100644 website/static/docs/en/html/search/variables_b.html
 delete mode 100644 website/static/docs/en/html/search/variables_b.js
 delete mode 100644 website/static/docs/en/html/search/variables_c.html
 delete mode 100644 website/static/docs/en/html/search/variables_c.js
 delete mode 100644 website/static/docs/en/html/search/variables_d.html
 delete mode 100644 website/static/docs/en/html/search/variables_d.js
 delete mode 100644 website/static/docs/en/html/search/variables_e.html
 delete mode 100644 website/static/docs/en/html/search/variables_e.js
 delete mode 100644 website/static/docs/en/html/search/variables_f.html
 delete mode 100644 website/static/docs/en/html/search/variables_f.js
 delete mode 100644 website/static/docs/en/html/splitbar.png
 delete mode 100644 website/static/docs/en/html/structfasttext_1_1Node-members.html
 delete mode 100644 website/static/docs/en/html/structfasttext_1_1Node.html
 delete mode 100644 website/static/docs/en/html/structfasttext_1_1Node.js
 delete mode 100644 website/static/docs/en/html/structfasttext_1_1entry-members.html
 delete mode 100644 website/static/docs/en/html/structfasttext_1_1entry.html
 delete mode 100644 website/static/docs/en/html/structfasttext_1_1entry.js
 delete mode 100644 website/static/docs/en/html/sync_off.png
 delete mode 100644 website/static/docs/en/html/sync_on.png
 delete mode 100644 website/static/docs/en/html/tab_a.png
 delete mode 100644 website/static/docs/en/html/tab_b.png
 delete mode 100644 website/static/docs/en/html/tab_h.png
 delete mode 100644 website/static/docs/en/html/tab_s.png
 delete mode 100644 website/static/docs/en/html/tabs.css
 delete mode 100644 website/static/docs/en/html/utils_8cc.html
 delete mode 100644 website/static/docs/en/html/utils_8cc.js
 delete mode 100644 website/static/docs/en/html/utils_8h.html
 delete mode 100644 website/static/docs/en/html/utils_8h.js
 delete mode 100644 website/static/docs/en/html/utils_8h_source.html
 delete mode 100644 website/static/docs/en/html/vector_8cc.html
 delete mode 100644 website/static/docs/en/html/vector_8cc.js
 delete mode 100644 website/static/docs/en/html/vector_8h.html
 delete mode 100644 website/static/docs/en/html/vector_8h.js
 delete mode 100644 website/static/docs/en/html/vector_8h_source.html
 delete mode 100644 website/static/fasttext.css
 delete mode 100644 website/static/img/authors/armand_joulin.jpg
 delete mode 100644 website/static/img/authors/christian_puhrsch.png
 delete mode 100644 website/static/img/authors/edouard_grave.jpeg
 delete mode 100644 website/static/img/authors/piotr_bojanowski.jpg
 delete mode 100644 website/static/img/authors/tomas_mikolov.jpg
 delete mode 100644 website/static/img/blog/2016-08-18-blog-post-img1.png
 delete mode 100644 website/static/img/blog/2016-08-18-blog-post-img2.png
 delete mode 100644 website/static/img/blog/2017-05-02-blog-post-img1.jpg
 delete mode 100644 website/static/img/blog/2017-05-02-blog-post-img2.jpg
 delete mode 100644 website/static/img/blog/2017-10-02-blog-post-img1.png
 delete mode 100644 website/static/img/cbo_vs_skipgram.png
 delete mode 100644 website/static/img/fasttext-icon-api.png
 delete mode 100644 website/static/img/fasttext-icon-bg-web.png
 delete mode 100644 website/static/img/fasttext-icon-color-square.png
 delete mode 100644 website/static/img/fasttext-icon-color-web.png
 delete mode 100644 website/static/img/fasttext-icon-faq.png
 delete mode 100644 website/static/img/fasttext-icon-tutorial.png
 delete mode 100644 website/static/img/fasttext-icon-white-web.png
 delete mode 100644 website/static/img/fasttext-logo-color-web.png
 delete mode 100644 website/static/img/fasttext-logo-white-web.png
 delete mode 100644 website/static/img/logo-color.png
 delete mode 100644 website/static/img/model-black.png
 delete mode 100644 website/static/img/model-blue.png
 delete mode 100644 website/static/img/model-red.png
 delete mode 100644 website/static/img/ogimage.png
 delete mode 100644 website/static/img/oss_logo.png
 delete mode 100644 website/static/tabber.js
 delete mode 100644 wikifil.pl
 delete mode 100755 word-vector-example.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 0ca55b14d..000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-#
-# Copyright (c) 2016-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-#
-
-cmake_minimum_required(VERSION 2.8.9)
-project(fasttext)
-
-set(CMAKE_CXX_STANDARD 17)
-
-# The version number.
-set (fasttext_VERSION_MAJOR 0) -set (fasttext_VERSION_MINOR 1) - -include_directories(fasttext) - -set(CMAKE_CXX_FLAGS " -pthread -std=c++17 -funroll-loops -O3 -march=native") - -set(HEADER_FILES - src/args.h - src/autotune.h - src/densematrix.h - src/dictionary.h - src/fasttext.h - src/loss.h - src/matrix.h - src/meter.h - src/model.h - src/productquantizer.h - src/quantmatrix.h - src/real.h - src/utils.h - src/vector.h) - -set(SOURCE_FILES - src/args.cc - src/autotune.cc - src/densematrix.cc - src/dictionary.cc - src/fasttext.cc - src/loss.cc - src/main.cc - src/matrix.cc - src/meter.cc - src/model.cc - src/productquantizer.cc - src/quantmatrix.cc - src/utils.cc - src/vector.cc) - - -if (NOT MSVC) - include(GNUInstallDirs) - configure_file("fasttext.pc.in" "fasttext.pc" @ONLY) - install(FILES "${CMAKE_BINARY_DIR}/fasttext.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -endif() - -add_library(fasttext-shared SHARED ${SOURCE_FILES} ${HEADER_FILES}) -add_library(fasttext-static STATIC ${SOURCE_FILES} ${HEADER_FILES}) -add_library(fasttext-static_pic STATIC ${SOURCE_FILES} ${HEADER_FILES}) -set_target_properties(fasttext-shared PROPERTIES OUTPUT_NAME fasttext - SOVERSION "${fasttext_VERSION_MAJOR}") -set_target_properties(fasttext-static PROPERTIES OUTPUT_NAME fasttext) -set_target_properties(fasttext-static_pic PROPERTIES OUTPUT_NAME fasttext_pic - POSITION_INDEPENDENT_CODE True) -add_executable(fasttext-bin src/main.cc) -target_link_libraries(fasttext-bin pthread fasttext-static) -set_target_properties(fasttext-bin PROPERTIES PUBLIC_HEADER "${HEADER_FILES}" OUTPUT_NAME fasttext) -install (TARGETS fasttext-shared - LIBRARY DESTINATION lib) -install (TARGETS fasttext-static - ARCHIVE DESTINATION lib) -install (TARGETS fasttext-static_pic - ARCHIVE DESTINATION lib) -install (TARGETS fasttext-bin - RUNTIME DESTINATION bin - PUBLIC_HEADER DESTINATION include/fasttext) diff --git a/alignment/README.md b/alignment/README.md deleted file mode 100644 index 4113007ca..000000000 --- a/alignment/README.md +++ /dev/null @@ -1,67 +0,0 @@ -## Alignment of Word Embeddings - -This directory provides code for learning alignments between word embeddings in different languages. - -The code is in Python 3 and requires [NumPy](http://www.numpy.org/). - -The script `example.sh` shows how to use this code to learn and evaluate a bilingual alignment of word embeddings. - -The word embeddings used in [1] can be found on the [fastText project page](https://fasttext.cc) and the supervised bilingual lexicons on the [MUSE project page](https://github.com/facebookresearch/MUSE). - -### Supervised alignment - -The script `align.py` aligns word embeddings from two languages using a bilingual lexicon as supervision. -The details of this approach can be found in [1]. - -### Unsupervised alignment - -The script `unsup_align.py` aligns word embeddings from two languages without requiring any supervision. -Additionally, the script `unsup_multialign.py` aligns multiple languages to a common space with no supervision. -The details of these approaches can be found in [2] and [3] respectively. - -In addition to NumPy, the unsupervised methods require the [Python Optimal Transport](https://pot.readthedocs.io/en/stable/) toolbox. - -### Download - -Wikipedia fastText embeddings aligned with our method can be found [here](https://fasttext.cc/docs/en/aligned-vectors.html). - -### References - -If you use the supervised alignment method, please cite: - -[1] A. Joulin, P. Bojanowski, T. Mikolov, H. Jegou, E. 
Grave, [*Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion*](https://arxiv.org/abs/1804.07745) - -``` -@InProceedings{joulin2018loss, - title={Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion}, - author={Joulin, Armand and Bojanowski, Piotr and Mikolov, Tomas and J\'egou, Herv\'e and Grave, Edouard}, - year={2018}, - booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, -} -``` - -If you use the unsupervised bilingual alignment method, please cite: - -[2] E. Grave, A. Joulin, Q. Berthet, [*Unsupervised Alignment of Embeddings with Wasserstein Procrustes*](https://arxiv.org/abs/1805.11222) - -``` -@article{grave2018unsupervised, - title={Unsupervised Alignment of Embeddings with Wasserstein Procrustes}, - author={Grave, Edouard and Joulin, Armand and Berthet, Quentin}, - journal={arXiv preprint arXiv:1805.11222}, - year={2018} -} -``` - -If you use the unsupervised alignment script `unsup_multialign.py`, please cite: - -[3] J. Alaux, E. Grave, M. Cuturi, A. Joulin, [*Unsupervised Hyperalignment for Multilingual Word Embeddings*](https://arxiv.org/abs/1811.01124) - -``` -@article{alaux2018unsupervised, - title={Unsupervised hyperalignment for multilingual word embeddings}, - author={Alaux, Jean and Grave, Edouard and Cuturi, Marco and Joulin, Armand}, - journal={arXiv preprint arXiv:1811.01124}, - year={2018} -} -``` diff --git a/alignment/align.py b/alignment/align.py deleted file mode 100644 index e29b8ab94..000000000 --- a/alignment/align.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# Copyright (c) 2018-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
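# (Overview of align.py, summarizing the code below: starting from the
# Procrustes solution computed on a training lexicon, the script refines a
# linear mapping R from source to target embeddings by gradient steps on the
# RCSLS criterion of [1], optionally under a spectral constraint on R, and
# reports nearest-neighbor translation accuracy on the test lexicon.)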
- -import numpy as np -import argparse -from utils import * -import sys - -parser = argparse.ArgumentParser(description='RCSLS for supervised word alignment') - -parser.add_argument("--src_emb", type=str, default='', help="Load source embeddings") -parser.add_argument("--tgt_emb", type=str, default='', help="Load target embeddings") -parser.add_argument('--center', action='store_true', help='whether to center embeddings or not') - -parser.add_argument("--dico_train", type=str, default='', help="train dictionary") -parser.add_argument("--dico_test", type=str, default='', help="validation dictionary") - -parser.add_argument("--output", type=str, default='', help="where to save aligned embeddings") - -parser.add_argument("--knn", type=int, default=10, help="number of nearest neighbors in RCSL/CSLS") -parser.add_argument("--maxneg", type=int, default=200000, help="Maximum number of negatives for the Extended RCSLS") -parser.add_argument("--maxsup", type=int, default=-1, help="Maximum number of training examples") -parser.add_argument("--maxload", type=int, default=200000, help="Maximum number of loaded vectors") - -parser.add_argument("--model", type=str, default="none", help="Set of constraints: spectral or none") -parser.add_argument("--reg", type=float, default=0.0 , help='regularization parameters') - -parser.add_argument("--lr", type=float, default=1.0, help='learning rate') -parser.add_argument("--niter", type=int, default=10, help='number of iterations') -parser.add_argument('--sgd', action='store_true', help='use sgd') -parser.add_argument("--batchsize", type=int, default=10000, help="batch size for sgd") - -params = parser.parse_args() - -###### SPECIFIC FUNCTIONS ###### -# functions specific to RCSLS -# the rest of the functions are in utils.py - -def getknn(sc, x, y, k=10): - sidx = np.argpartition(sc, -k, axis=1)[:, -k:] - ytopk = y[sidx.flatten(), :] - ytopk = ytopk.reshape(sidx.shape[0], sidx.shape[1], y.shape[1]) - f = np.sum(sc[np.arange(sc.shape[0])[:, None], sidx]) - df = np.dot(ytopk.sum(1).T, x) - return f / k, df / k - - -def rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, knn=10): - X_trans = np.dot(X_src, R.T) - f = 2 * np.sum(X_trans * Y_tgt) - df = 2 * np.dot(Y_tgt.T, X_src) - fk0, dfk0 = getknn(np.dot(X_trans, Z_tgt.T), X_src, Z_tgt, knn) - fk1, dfk1 = getknn(np.dot(np.dot(Z_src, R.T), Y_tgt.T).T, Y_tgt, Z_src, knn) - f = f - fk0 -fk1 - df = df - dfk0 - dfk1.T - return -f / X_src.shape[0], -df / X_src.shape[0] - - -def proj_spectral(R): - U, s, V = np.linalg.svd(R) - s[s > 1] = 1 - s[s < 0] = 0 - return np.dot(U, np.dot(np.diag(s), V)) - - -###### MAIN ###### - -# load word embeddings -words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center) -words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center) - -# load validation bilingual lexicon -src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt) - -# word --> vector indices -idx_src = idx(words_src) -idx_tgt = idx(words_tgt) - -# load train bilingual lexicon -pairs = load_pairs(params.dico_train, idx_src, idx_tgt) -if params.maxsup > 0 and params.maxsup < len(pairs): - pairs = pairs[:params.maxsup] - -# selecting training vector pairs -X_src, Y_tgt = select_vectors_from_pairs(x_src, x_tgt, pairs) - -# adding negatives for RCSLS -Z_src = x_src[:params.maxneg, :] -Z_tgt = x_tgt[:params.maxneg, :] - -# initialization: -R = procrustes(X_src, Y_tgt) -nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size) 
-print("[init -- Procrustes] NN: %.4f"%(nnacc))
-sys.stdout.flush()
-
-# optimization
-fold, Rold = 0, []
-niter, lr = params.niter, params.lr
-
-for it in range(0, niter + 1):
-    if lr < 1e-4:
-        break
-
-    if params.sgd:
-        indices = np.random.choice(X_src.shape[0], size=params.batchsize, replace=False)
-        f, df = rcsls(X_src[indices, :], Y_tgt[indices, :], Z_src, Z_tgt, R, params.knn)
-    else:
-        f, df = rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, params.knn)
-
-    if params.reg > 0:
-        R *= (1 - lr * params.reg)
-    R -= lr * df
-    if params.model == "spectral":
-        R = proj_spectral(R)
-
-    print("[it=%d] f = %.4f" % (it, f))
-    sys.stdout.flush()
-
-    if f > fold and it > 0 and not params.sgd:
-        lr /= 2
-        f, R = fold, Rold
-
-    fold, Rold = f, R
-
-    if (it > 0 and it % 10 == 0) or it == niter:
-        nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
-        print("[it=%d] NN = %.4f - Coverage = %.4f" % (it, nnacc, len(src2tgt) / lexicon_size))
-
-nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
-print("[final] NN = %.4f - Coverage = %.4f" % (nnacc, len(src2tgt) / lexicon_size))
-
-if params.output != "":
-    print("Saving all aligned vectors at %s" % params.output)
-    words_full, x_full = load_vectors(params.src_emb, maxload=-1, center=params.center, verbose=False)
-    x = np.dot(x_full, R.T)
-    x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
-    save_vectors(params.output, x, words_full)
-    save_matrix(params.output + "-mat", R)
diff --git a/alignment/eval.py b/alignment/eval.py
deleted file mode 100644
index c09e4d567..000000000
--- a/alignment/eval.py
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-#
-# Copyright (c) 2018-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-import io
-import numpy as np
-import argparse
-from utils import *
-
-parser = argparse.ArgumentParser(description='Evaluation of word alignment')
-parser.add_argument("--src_emb", type=str, default='', help="Load source embeddings")
-parser.add_argument("--tgt_emb", type=str, default='', help="Load target embeddings")
-parser.add_argument('--center', action='store_true', help='whether to center embeddings or not')
-parser.add_argument("--src_mat", type=str, default='', help="Load source alignment matrix. If none given, the alignment matrix is the identity.")
-parser.add_argument("--tgt_mat", type=str, default='', help="Load target alignment matrix. If none given, the alignment matrix is the identity.")
-parser.add_argument("--dico_test", type=str, default='', help="test dictionary")
-parser.add_argument("--maxload", type=int, default=200000)
-parser.add_argument("--nomatch", action='store_true', help="no exact match in lexicon")
-params = parser.parse_args()
-
-
-###### SPECIFIC FUNCTIONS ######
-# function specific to evaluation
-# the rest of the functions are in utils.py
-
-def load_transform(fname, d1=300, d2=300):
-    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
-    R = np.zeros([d1, d2])
-    for i, line in enumerate(fin):
-        tokens = line.split(' ')
-        R[i, :] = np.array(tokens[0:d2], dtype=float)
-    return R
-
-
-###### MAIN ######
-
-print("Evaluation of alignment on %s" % params.dico_test)
-if params.nomatch:
-    print("running without exact string matches")
-
-words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center)
-words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center)
-
-if params.tgt_mat != "":
-    R_tgt = load_transform(params.tgt_mat)
-    x_tgt = np.dot(x_tgt, R_tgt)
-if params.src_mat != "":
-    R_src = load_transform(params.src_mat)
-    x_src = np.dot(x_src, R_src)
-
-src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt)
-
-nnacc = compute_nn_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
-cslsproc = compute_csls_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
-print("NN = %.4f - CSLS = %.4f - Coverage = %.4f" % (nnacc, cslsproc, len(src2tgt) / lexicon_size))
diff --git a/alignment/example.sh b/alignment/example.sh
deleted file mode 100755
index 51ef292d5..000000000
--- a/alignment/example.sh
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/bin/usr/env sh
-# Copyright (c) 2018-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-set -e
-s=${1:-en}
-t=${2:-es}
-echo "Example based on the ${s}->${t} alignment"
-
-if [ ! -d data/ ]; then
-  mkdir -p data;
-fi
-
-if [ ! -d res/ ]; then
-  mkdir -p res;
-fi
-
-dico_train=data/${s}-${t}.0-5000.txt
-if [ ! -f "${dico_train}" ]; then
-  DICO=$(basename -- "${dico_train}")
-  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
-fi
-
-dico_test=data/${s}-${t}.5000-6500.txt
-if [ ! -f "${dico_test}" ]; then
-  DICO=$(basename -- "${dico_test}")
-  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
-fi
-
-src_emb=data/wiki.${s}.vec
-if [ ! -f "${src_emb}" ]; then
-  EMB=$(basename -- "${src_emb}")
-  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
-fi
-
-tgt_emb=data/wiki.${t}.vec
-if [ ! -f "${tgt_emb}" ]; then
-  EMB=$(basename -- "${tgt_emb}")
-  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
-fi
-
-output=res/wiki.${s}-${t}.vec
-
-python3 align.py --src_emb "${src_emb}" --tgt_emb "${tgt_emb}" \
-  --dico_train "${dico_train}" --dico_test "${dico_test}" --output "${output}" \
-  --lr 25 --niter 10
-python3 eval.py --src_emb "${output}" --tgt_emb "${tgt_emb}" \
-  --dico_test "${dico_test}"
diff --git a/alignment/unsup_align.py b/alignment/unsup_align.py
deleted file mode 100644
index 189420747..000000000
--- a/alignment/unsup_align.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) 2018-present, Facebook, Inc.
-# All rights reserved.
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import codecs, sys, time, math, argparse, ot -import numpy as np -from utils import * - -parser = argparse.ArgumentParser(description='Wasserstein Procrustes for Embedding Alignment') -parser.add_argument('--model_src', type=str, help='Path to source word embeddings') -parser.add_argument('--model_tgt', type=str, help='Path to target word embeddings') -parser.add_argument('--lexicon', type=str, help='Path to the evaluation lexicon') -parser.add_argument('--output_src', default='', type=str, help='Path to save the aligned source embeddings') -parser.add_argument('--output_tgt', default='', type=str, help='Path to save the aligned target embeddings') -parser.add_argument('--seed', default=1111, type=int, help='Random number generator seed') -parser.add_argument('--nepoch', default=5, type=int, help='Number of epochs') -parser.add_argument('--niter', default=5000, type=int, help='Initial number of iterations') -parser.add_argument('--bsz', default=500, type=int, help='Initial batch size') -parser.add_argument('--lr', default=500., type=float, help='Learning rate') -parser.add_argument('--nmax', default=20000, type=int, help='Vocabulary size for learning the alignment') -parser.add_argument('--reg', default=0.05, type=float, help='Regularization parameter for sinkhorn') -args = parser.parse_args() - - -def objective(X, Y, R, n=5000): - Xn, Yn = X[:n], Y[:n] - C = -np.dot(np.dot(Xn, R), Yn.T) - P = ot.sinkhorn(np.ones(n), np.ones(n), C, 0.025, stopThr=1e-3) - return 1000 * np.linalg.norm(np.dot(Xn, R) - np.dot(P, Yn)) / n - - -def sqrt_eig(x): - U, s, VT = np.linalg.svd(x, full_matrices=False) - return np.dot(U, np.dot(np.diag(np.sqrt(s)), VT)) - - -def align(X, Y, R, lr=10., bsz=200, nepoch=5, niter=1000, - nmax=10000, reg=0.05, verbose=True): - for epoch in range(1, nepoch + 1): - for _it in range(1, niter + 1): - # sample mini-batch - xt = X[np.random.permutation(nmax)[:bsz], :] - yt = Y[np.random.permutation(nmax)[:bsz], :] - # compute OT on minibatch - C = -np.dot(np.dot(xt, R), yt.T) - P = ot.sinkhorn(np.ones(bsz), np.ones(bsz), C, reg, stopThr=1e-3) - # compute gradient - G = - np.dot(xt.T, np.dot(P, yt)) - R -= lr / bsz * G - # project on orthogonal matrices - U, s, VT = np.linalg.svd(R) - R = np.dot(U, VT) - bsz *= 2 - niter //= 4 - if verbose: - print("epoch: %d obj: %.3f" % (epoch, objective(X, Y, R))) - return R - - -def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False): - n, d = X.shape - if apply_sqrt: - X, Y = sqrt_eig(X), sqrt_eig(Y) - K_X, K_Y = np.dot(X, X.T), np.dot(Y, Y.T) - K_Y *= np.linalg.norm(K_X) / np.linalg.norm(K_Y) - K2_X, K2_Y = np.dot(K_X, K_X), np.dot(K_Y, K_Y) - P = np.ones([n, n]) / float(n) - for it in range(1, niter + 1): - G = np.dot(P, K2_X) + np.dot(K2_Y, P) - 2 * np.dot(K_Y, np.dot(P, K_X)) - q = ot.sinkhorn(np.ones(n), np.ones(n), G, reg, stopThr=1e-3) - alpha = 2.0 / float(2.0 + it) - P = alpha * q + (1.0 - alpha) * P - obj = np.linalg.norm(np.dot(P, K_X) - np.dot(K_Y, P)) - print(obj) - return procrustes(np.dot(P, X), Y).T - - -print("\n*** Wasserstein Procrustes ***\n") - -np.random.seed(args.seed) - -maxload = 200000 -w_src, x_src = load_vectors(args.model_src, maxload, norm=True, center=True) -w_tgt, x_tgt = load_vectors(args.model_tgt, maxload, norm=True, center=True) -src2trg, _ = load_lexicon(args.lexicon, w_src, w_tgt) - -print("\nComputing initial mapping with convex relaxation...") -t0 = time.time() -R0 = 
convex_init(x_src[:2500], x_tgt[:2500], reg=args.reg, apply_sqrt=True) -print("Done [%03d sec]" % math.floor(time.time() - t0)) - -print("\nComputing mapping with Wasserstein Procrustes...") -t0 = time.time() -R = align(x_src, x_tgt, R0.copy(), bsz=args.bsz, lr=args.lr, niter=args.niter, - nepoch=args.nepoch, reg=args.reg, nmax=args.nmax) -print("Done [%03d sec]" % math.floor(time.time() - t0)) - -acc = compute_nn_accuracy(x_src, np.dot(x_tgt, R.T), src2trg) -print("\nPrecision@1: %.3f\n" % acc) - -if args.output_src != '': - x_src = x_src / np.linalg.norm(x_src, 2, 1).reshape([-1, 1]) - save_vectors(args.output_src, x_src, w_src) -if args.output_tgt != '': - x_tgt = x_tgt / np.linalg.norm(x_tgt, 2, 1).reshape([-1, 1]) - save_vectors(args.output_tgt, np.dot(x_tgt, R.T), w_tgt) diff --git a/alignment/unsup_multialign.py b/alignment/unsup_multialign.py deleted file mode 100644 index e7bfe0ceb..000000000 --- a/alignment/unsup_multialign.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# Copyright (c) 2019-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. - -import io, os, ot, argparse, random -import numpy as np -from utils import * - -parser = argparse.ArgumentParser(description=' ') - -parser.add_argument('--embdir', default='data/', type=str) -parser.add_argument('--outdir', default='output/', type=str) -parser.add_argument('--lglist', default='en-fr-es-it-pt-de-pl-ru-da-nl-cs', type=str, - help='list of languages. The first element is the pivot. Example: en-fr-es to align English, French and Spanish with English as the pivot.') - -parser.add_argument('--maxload', default=20000, type=int, help='Max number of loaded vectors') -parser.add_argument('--uniform', action='store_true', help='switch to uniform probability of picking language pairs') - -# optimization parameters for the square loss -parser.add_argument('--epoch', default=2, type=int, help='nb of epochs for square loss') -parser.add_argument('--niter', default=500, type=int, help='max number of iteration per epoch for square loss') -parser.add_argument('--lr', default=0.1, type=float, help='learning rate for square loss') -parser.add_argument('--bsz', default=500, type=int, help='batch size for square loss') - -# optimization parameters for the RCSLS loss -parser.add_argument('--altepoch', default=100, type=int, help='nb of epochs for RCSLS loss') -parser.add_argument('--altlr', default=25, type=float, help='learning rate for RCSLS loss') -parser.add_argument("--altbsz", type=int, default=1000, help="batch size for RCSLS") - -args = parser.parse_args() - -###### SPECIFIC FUNCTIONS ###### - -def getknn(sc, x, y, k=10): - sidx = np.argpartition(sc, -k, axis=1)[:, -k:] - ytopk = y[sidx.flatten(), :] - ytopk = ytopk.reshape(sidx.shape[0], sidx.shape[1], y.shape[1]) - f = np.sum(sc[np.arange(sc.shape[0])[:, None], sidx]) - df = np.dot(ytopk.sum(1).T, x) - return f / k, df / k - - -def rcsls(Xi, Xj, Zi, Zj, R, knn=10): - X_trans = np.dot(Xi, R.T) - f = 2 * np.sum(X_trans * Xj) - df = 2 * np.dot(Xj.T, Xi) - fk0, dfk0 = getknn(np.dot(X_trans, Zj.T), Xi, Zj, knn) - fk1, dfk1 = getknn(np.dot(np.dot(Zi, R.T), Xj.T).T, Xj, Zi, knn) - f = f - fk0 -fk1 - df = df - dfk0 - dfk1.T - return -f / Xi.shape[0], -df.T / Xi.shape[0] - - -def GWmatrix(emb0): - N = np.shape(emb0)[0] - N2 = .5* np.linalg.norm(emb0, axis=1).reshape(1, N) - C2 = np.tile(N2.transpose(), (1, N)) + np.tile(N2, (N, 1)) - 
C2 -= np.dot(emb0,emb0.T) - return C2 - - -def gromov_wasserstein(x_src, x_tgt, C2): - N = x_src.shape[0] - C1 = GWmatrix(x_src) - M = ot.gromov_wasserstein(C1,C2,np.ones(N),np.ones(N),'square_loss',epsilon=0.55,max_iter=100,tol=1e-4) - return procrustes(np.dot(M,x_tgt), x_src) - - -def align(EMB, TRANS, lglist, args): - nmax, l = args.maxload, len(lglist) - # create a list of language pairs to sample from - # (default == higher probability to pick a language pair contianing the pivot) - # if --uniform: uniform probability of picking a language pair - samples = [] - for i in range(l): - for j in range(l): - if j == i : - continue - if j > 0 and args.uniform == False: - samples.append((0,j)) - if i > 0 and args.uniform == False: - samples.append((i,0)) - samples.append((i,j)) - - # optimization of the l2 loss - print('start optimizing L2 loss') - lr0, bsz, nepoch, niter = args.lr, args.bsz, args.epoch, args.niter - for epoch in range(nepoch): - print("start epoch %d / %d"%(epoch+1, nepoch)) - ones = np.ones(bsz) - f, fold, nb, lr = 0.0, 0.0, 0.0, lr0 - for it in range(niter): - if it > 1 and f > fold + 1e-3: - lr /= 2 - if lr < .05: - break - fold = f - f, nb = 0.0, 0.0 - for k in range(100 * (l-1)): - (i,j) = random.choice(samples) - embi = EMB[i][np.random.permutation(nmax)[:bsz], :] - embj = EMB[j][np.random.permutation(nmax)[:bsz], :] - perm = ot.sinkhorn(ones, ones, np.linalg.multi_dot([embi, -TRANS[i], TRANS[j].T,embj.T]), reg = 0.025, stopThr = 1e-3) - grad = np.linalg.multi_dot([embi.T, perm, embj]) - f -= np.trace(np.linalg.multi_dot([TRANS[i].T, grad, TRANS[j]])) / embi.shape[0] - nb += 1 - if i > 0: - TRANS[i] = proj_ortho(TRANS[i] + lr * np.dot(grad, TRANS[j])) - if j > 0: - TRANS[j] = proj_ortho(TRANS[j] + lr * np.dot(grad.transpose(), TRANS[i])) - print("iter %d / %d - epoch %d - loss: %.5f lr: %.4f" % (it, niter, epoch+1, f / nb , lr)) - print("end of epoch %d - loss: %.5f - lr: %.4f" % (epoch+1, f / max(nb,1), lr)) - niter, bsz = max(int(niter/2),2), min(1000, bsz * 2) - #end for epoch in range(nepoch): - - # optimization of the RCSLS loss - print('start optimizing RCSLS loss') - f, fold, nb, lr = 0.0, 0.0, 0.0, args.altlr - for epoch in range(args.altepoch): - if epoch > 1 and f-fold > -1e-4 * abs(fold): - lr/= 2 - if lr < 1e-1: - break - fold = f - f, nb = 0.0, 0.0 - for k in range(round(nmax / args.altbsz) * 10 * (l-1)): - (i,j) = random.choice(samples) - sgdidx = np.random.choice(nmax, size=args.altbsz, replace=False) - embi = EMB[i][sgdidx, :] - embj = EMB[j][:nmax, :] - # crude alignment approximation: - T = np.dot(TRANS[i], TRANS[j].T) - scores = np.linalg.multi_dot([embi, T, embj.T]) - perm = np.zeros_like(scores) - perm[np.arange(len(scores)), scores.argmax(1)] = 1 - embj = np.dot(perm, embj) - # normalization over a subset of embeddings for speed up - fi, grad = rcsls(embi, embj, embi, embj, T.T) - f += fi - nb += 1 - if i > 0: - TRANS[i] = proj_ortho(TRANS[i] - lr * np.dot(grad, TRANS[j])) - if j > 0: - TRANS[j] = proj_ortho(TRANS[j] - lr * np.dot(grad.transpose(), TRANS[i])) - print("epoch %d - loss: %.5f - lr: %.4f" % (epoch+1, f / max(nb,1), lr)) - #end for epoch in range(args.altepoch): - return TRANS - -def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False): - n, d = X.shape - K_X, K_Y = np.dot(X, X.T), np.dot(Y, Y.T) - K_Y *= np.linalg.norm(K_X) / np.linalg.norm(K_Y) - K2_X, K2_Y = np.dot(K_X, K_X), np.dot(K_Y, K_Y) - P = np.ones([n, n]) / float(n) - for it in range(1, niter + 1): - G = np.dot(P, K2_X) + np.dot(K2_Y, P) - 2 * np.dot(K_Y, np.dot(P, 
K_X))
-        q = ot.sinkhorn(np.ones(n), np.ones(n), G, reg, stopThr=1e-3)
-        alpha = 2.0 / float(2.0 + it)
-        P = alpha * q + (1.0 - alpha) * P
-    return procrustes(np.dot(P, X), Y).T
-
-
-###### MAIN ######
-
-lglist = args.lglist.split('-')
-l = len(lglist)
-
-# embs:
-EMB = {}
-for i in range(l):
-    fn = args.embdir + '/wiki.' + lglist[i] + '.vec'
-    _, vecs = load_vectors(fn, maxload=args.maxload)
-    EMB[i] = vecs
-
-#init
-print("Computing initial bilingual mapping with Gromov-Wasserstein...")
-TRANS={}
-maxinit = 2000
-emb0 = EMB[0][:maxinit,:]
-C0 = GWmatrix(emb0)
-TRANS[0] = np.eye(300)
-for i in range(1, l):
-    print("init "+lglist[i])
-    embi = EMB[i][:maxinit,:]
-    TRANS[i] = gromov_wasserstein(embi, emb0, C0)
-
-# align
-align(EMB, TRANS, lglist, args)
-
-print('saving matrices in ' + args.outdir)
-languages=''.join(lglist)
-for i in range(l):
-    save_matrix(args.outdir + '/W-' + languages + '-' + lglist[i], TRANS[i])
diff --git a/alignment/utils.py b/alignment/utils.py
deleted file mode 100644
index 5adac5e2c..000000000
--- a/alignment/utils.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) 2018-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-
-import io
-import numpy as np
-import collections
-
-
-def load_vectors(fname, maxload=200000, norm=True, center=False, verbose=True):
-    if verbose:
-        print("Loading vectors from %s" % fname)
-    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
-    n, d = map(int, fin.readline().split())
-    if maxload > 0:
-        n = min(n, maxload)
-    x = np.zeros([n, d])
-    words = []
-    for i, line in enumerate(fin):
-        if i >= n:
-            break
-        tokens = line.rstrip().split(' ')
-        words.append(tokens[0])
-        v = np.array(tokens[1:], dtype=float)
-        x[i, :] = v
-    if norm:
-        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
-    if center:
-        x -= x.mean(axis=0)[np.newaxis, :]
-        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
-    if verbose:
-        print("%d word vectors loaded" % (len(words)))
-    return words, x
-
-
-def idx(words):
-    w2i = {}
-    for i, w in enumerate(words):
-        if w not in w2i:
-            w2i[w] = i
-    return w2i
-
-
-def save_vectors(fname, x, words):
-    n, d = x.shape
-    fout = io.open(fname, 'w', encoding='utf-8')
-    fout.write(u"%d %d\n" % (n, d))
-    for i in range(n):
-        fout.write(words[i] + " " + " ".join(map(lambda a: "%.4f" % a, x[i, :])) + "\n")
-    fout.close()
-
-
-def save_matrix(fname, x):
-    n, d = x.shape
-    fout = io.open(fname, 'w', encoding='utf-8')
-    fout.write(u"%d %d\n" % (n, d))
-    for i in range(n):
-        fout.write(" ".join(map(lambda a: "%.4f" % a, x[i, :])) + "\n")
-    fout.close()
-
-
-def procrustes(X_src, Y_tgt):
-    U, s, V = np.linalg.svd(np.dot(Y_tgt.T, X_src))
-    return np.dot(U, V)
-
-
-def select_vectors_from_pairs(x_src, y_tgt, pairs):
-    n = len(pairs)
-    d = x_src.shape[1]
-    x = np.zeros([n, d])
-    y = np.zeros([n, d])
-    for k, ij in enumerate(pairs):
-        i, j = ij
-        x[k, :] = x_src[i, :]
-        y[k, :] = y_tgt[j, :]
-    return x, y
-
-
-def load_lexicon(filename, words_src, words_tgt, verbose=True):
-    f = io.open(filename, 'r', encoding='utf-8')
-    lexicon = collections.defaultdict(set)
-    idx_src, idx_tgt = idx(words_src), idx(words_tgt)
-    vocab = set()
-    for line in f:
-        word_src, word_tgt = line.split()
-        if word_src in idx_src and word_tgt in idx_tgt:
-            lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
-        vocab.add(word_src)
-    if verbose:
-        coverage = len(lexicon) / float(len(vocab))
print("Coverage of source vocab: %.4f" % (coverage)) - return lexicon, float(len(vocab)) - - -def load_pairs(filename, idx_src, idx_tgt, verbose=True): - f = io.open(filename, 'r', encoding='utf-8') - pairs = [] - tot = 0 - for line in f: - a, b = line.rstrip().split(' ') - tot += 1 - if a in idx_src and b in idx_tgt: - pairs.append((idx_src[a], idx_tgt[b])) - if verbose: - coverage = (1.0 * len(pairs)) / tot - print("Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f" % (len(pairs), tot, coverage)) - return pairs - - -def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1): - if lexicon_size < 0: - lexicon_size = len(lexicon) - idx_src = list(lexicon.keys()) - acc = 0.0 - x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8 - x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8 - for i in range(0, len(idx_src), bsz): - e = min(i + bsz, len(idx_src)) - scores = np.dot(x_tgt, x_src[idx_src[i:e]].T) - pred = scores.argmax(axis=0) - for j in range(i, e): - if pred[j - i] in lexicon[idx_src[j]]: - acc += 1.0 - return acc / lexicon_size - - -def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024): - if lexicon_size < 0: - lexicon_size = len(lexicon) - idx_src = list(lexicon.keys()) - - x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8 - x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8 - - sr = x_src[list(idx_src)] - sc = np.dot(sr, x_tgt.T) - similarities = 2 * sc - sc2 = np.zeros(x_tgt.shape[0]) - for i in range(0, x_tgt.shape[0], bsz): - j = min(i + bsz, x_tgt.shape[0]) - sc_batch = np.dot(x_tgt[i:j, :], x_src.T) - dotprod = np.partition(sc_batch, -k, axis=1)[:, -k:] - sc2[i:j] = np.mean(dotprod, axis=1) - similarities -= sc2[np.newaxis, :] - - nn = np.argmax(similarities, axis=1).tolist() - correct = 0.0 - for k in range(0, len(lexicon)): - if nn[k] in lexicon[idx_src[k]]: - correct += 1.0 - return correct / lexicon_size diff --git a/classification-example.sh b/classification-example.sh deleted file mode 100755 index 10717ae2b..000000000 --- a/classification-example.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright (c) 2016-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# - -myshuf() { - perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@"; -} - -normalize_text() { - tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \ - sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/
<br \/>/ /g' \
-    -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
-    -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
-}
-
-RESULTDIR=result
-DATADIR=data
-
-mkdir -p "${RESULTDIR}"
-mkdir -p "${DATADIR}"
-
-if [ ! -f "${DATADIR}/dbpedia.train" ]
-then
-  wget -c "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" -O "${DATADIR}/dbpedia_csv.tar.gz"
-  tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}"
-  cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "${DATADIR}/dbpedia.train"
-  cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "${DATADIR}/dbpedia.test"
-fi
-
-make
-
-./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4
-
-./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
-
-./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
diff --git a/classification-results.sh b/classification-results.sh
deleted file mode 100755
index 0c945cf68..000000000
--- a/classification-results.sh
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright (c) 2016-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-#
-
-# This script produces the results from Table 1 in the following paper:
-# Bag of Tricks for Efficient Text Classification, arXiv 1607.01759, 2016
-
-myshuf() {
-  perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
-}
-
-normalize_text() {
-  tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
-    sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
-    -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
-    -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
-}
-
-DATASET=(
-  ag_news
-  sogou_news
-  dbpedia
-  yelp_review_polarity
-  yelp_review_full
-  yahoo_answers
-  amazon_review_full
-  amazon_review_polarity
-)
-
-ID=(
-  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
-  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
-  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
-  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
-  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
-  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
-  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
-  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
-)
-
-# These learning rates were chosen by validation on a subset of the training set.
-LR=( 0.25 0.5 0.5 0.1 0.1 0.1 0.05 0.05 )
-
-RESULTDIR=result
-DATADIR=data
-
-mkdir -p "${RESULTDIR}"
-mkdir -p "${DATADIR}"
-
-# Small datasets first
-
-for i in {0..0}
-do
-  echo "Downloading dataset ${DATASET[i]}"
-  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
-  then
-    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
-    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
-    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
-    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
-  fi
-done
-
-# Large datasets require a bit more work due to the extra request page
-
-for i in {1..7}
-do
-  echo "Downloading dataset ${DATASET[i]}"
-  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
-  then
-    curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
-    curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
-    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
-    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
-    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
-  fi
-done
-
-make
-
-for i in {0..7}
-do
-  echo "Working on dataset ${DATASET[i]}"
-  ./fasttext supervised -input "${DATADIR}/${DATASET[i]}.train" \
-    -output "${RESULTDIR}/${DATASET[i]}" -dim 10 -lr "${LR[i]}" -wordNgrams 2 \
-    -minCount 1 -bucket 10000000 -epoch 5 -thread 4 > /dev/null
-  ./fasttext test "${RESULTDIR}/${DATASET[i]}.bin" \
-    "${DATADIR}/${DATASET[i]}.test"
-done
diff --git a/crawl/README.md b/crawl/README.md
deleted file mode 100644
index 7cbd87de5..000000000
--- a/crawl/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-## Preprocessing Common Crawl
-
-This code downloads, preprocesses, and splits the data from [Common Crawl](http://commoncrawl.org/) per language.
-
-This script uses the scripts and language identifier of [1].
-
-This code inherits its requirements from [fastText](https://github.com/facebookresearch/fastText).
-
-Set the variable WET_PATHS_URL to the crawl you want to process.
-Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
-Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each uses 2GB of RAM).
-
-### Reference
-
-If you use this code, please cite:
-
-[1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T.
Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
-
-```
-@inproceedings{grave2018learning,
-  title={Learning Word Vectors for 157 Languages},
-  author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
-  booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
-  year={2018}
-}
-```
diff --git a/crawl/dedup.cc b/crawl/dedup.cc
deleted file mode 100644
index 67df66c16..000000000
--- a/crawl/dedup.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2018-present, Facebook, Inc.
-// All rights reserved.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <cstdint>
-#include <cstdio>
-#include <iostream>
-#include <string>
-#include <vector>
-
-uint64_t fnv1a_64(uint8_t *data, size_t sz, uint64_t h=14695981039346656037ull)
-{
-  for (size_t i = 0; i < sz; i++, data++) {
-    h ^= uint64_t(*data);
-    h *= 1099511628211ull;
-  }
-  return h;
-}
-
-int main(int argc, char** argv)
-{
-  uint64_t init_values[] = {
-    14695981039346656037ull,
-    9425296925403859339ull,
-    13716263814064014149ull,
-    3525492407291847033ull,
-    8607404175481815707ull,
-    9818874561736458749ull,
-    10026508429719773353ull,
-    3560712257386009938ull
-  };
-  size_t n = 1ull<<34, num_hashes = 2;
-  std::vector<bool> seen(n);
-
-  std::ios_base::sync_with_stdio(false);
-
-  for (std::string line; std::getline(std::cin, line);) {
-    bool b = true;
-    for (size_t i = 0; i < num_hashes; i++) {
-      uint64_t h = fnv1a_64((uint8_t*) line.data(), line.length(), init_values[i]) % n;
-      b = b && seen[h];
-      seen[h] = true;
-    }
-    if (!b) {
-      std::cout << line << std::endl;
-    }
-  }
-  return 0;
-}
diff --git a/crawl/download_crawl.sh b/crawl/download_crawl.sh
deleted file mode 100644
index b94f1f296..000000000
--- a/crawl/download_crawl.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/usr/env sh
-# Copyright (c) 2018-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-set -e
-
-# Set this variable to the crawl you want to process.
-WET_PATHS_URL="https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2018-34/wet.paths.gz"
-
-# Set NUM_LANGID and NUM_DEDUP according to the capacity of your machine.
-# Please note that each dedup process uses 2GB of RAM, while langid is
-# mostly limited by cpu usage.
-NUM_LANGID=12
-NUM_DEDUP=8
-URL="https://commoncrawl.s3.amazonaws.com/"
-
-if [ ! -d fastText ]; then
-  git clone https://github.com/facebookresearch/fastText.git
-fi
-
-if [ ! -f fastText/fasttext ]; then
-  cd fastText
-  make
-  cd ..
-fi
-
-if [ ! -f lid.176.bin ]; then
-  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
-fi
-
-if [ ! -d tmp ]; then
-  mkdir tmp
-fi
-
-if [ ! -d shard ]; then
-  mkdir shard
-fi
-
-if [ ! -f wet.paths ]; then
-  wget "${WET_PATHS_URL}"
-  gunzip wet.paths.gz
-fi
-
-## Language identification
-cat wet.paths | xargs -n 1 -P "${NUM_LANGID}" -I '{}' sh process_wet_file.sh "${URL}{}"
-
-## Deduplication
-g++ -std=c++11 -O3 -o dedup dedup.cc
-g++ -std=c++11 -O3 -o filter_utf8 filter_utf8.cc
-find shard -name '*.txt' | xargs -n 1 -P "${NUM_DEDUP}" -I '{}' sh filter_dedup.sh "{}"
-
-## Example of data filtering + tokenization
-git clone https://github.com/moses-smt/mosesdecoder.git
-perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l es < shard/es.dedup > shard/es.tok
diff --git a/crawl/filter_dedup.sh b/crawl/filter_dedup.sh
deleted file mode 100644
index f07f9ceaa..000000000
--- a/crawl/filter_dedup.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/usr/env sh
-# Copyright (c) 2018-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-set -e
-
-LG=$(basename --suffix=".txt" "${1}")
-
-./filter_utf8 < "shard/${LG}.txt" \
-  | ./dedup > "shard/${LG}.dedup"
diff --git a/crawl/filter_utf8.cc b/crawl/filter_utf8.cc
deleted file mode 100644
index 5e493e3d2..000000000
--- a/crawl/filter_utf8.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (c) 2018-present, Facebook, Inc.
-// All rights reserved.
-//
-// This source code is licensed under the MIT license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <cstdint>
-#include <iostream>
-#include <string>
-
-// Check that the next n bytes are continuation bytes.
-bool continuation(uint8_t* str, int n)
-{
-  for (int i = 0; i < n; i++) {
-    if ((str[i] & 0xc0) != 0x80) return false;
-  }
-  return true;
-}
-
-// Invalid UTF8 corresponds to codepoints which are larger than U+10FFFF.
-// This value is encoded in UTF8 as:
-//   * 11110.100 10.001111 10.111111 10.111111
-// We thus check if the first byte is larger than 0xf4, or if it is equal
-// to 0xf4 and the second byte is larger than 0x8f.
-bool invalid(uint8_t* str)
-{
-  return str[0] > 0xf4 || (str[0] == 0xf4 && str[1] > 0x8f);
-}
-
-// Surrogate halves correspond to the range U+D800 through U+DFFF,
-// which are encoded in UTF8 as:
-//   * 1110.1101 10.100000 10.000000
-//   * 1110.1101 10.111111 10.111111
-// We thus check if the first byte is equal to 0xed and if the
-// sixth bit of the second byte is set.
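// (For instance, U+D800, the first surrogate half, would be encoded as the
// bytes ED A0 80: the first byte is 0xed and the second byte 0xa0 has the
// sixth bit (0x20) set, so the check below rejects it.)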
-bool surrogate(uint8_t* str) -{ - return str[0] == 0xed && str[1] & 0x20; -} - -// Sequences of length 2 are overlong if the leading 4 bits (noted as y) -// are equal to 0: 110.yyyyx 10xxxxxx -bool overlong_2(uint8_t* str) -{ - return (str[0] & 0x1e) == 0; -} - -// Sequences of lenth 3 are overlong if the leading 5 bits (noted as y) -// are equal to 0: 1110.yyyy 10.yxxxxx 10.xxxxxx -bool overlong_3(uint8_t* str) -{ - return (str[0] & 0x0f) == 0 && (str[1] & 0x20) == 0; -} - -// Sequences of length 4 are overlong if the leading 5 bits (noted as y) -// are equal to 0: 11110.yyy 10.yyxxxx 10.xxxxxx 10.xxxxxx -bool overlong_4(uint8_t* str) -{ - return (str[0] & 0x07) == 0 && (str[1] & 0x30) == 0; -} - -bool valid_utf8(uint8_t* str, size_t length) -{ - uint8_t* end = str + length; - while (str < end) { - if (str[0] < 0x80) { - // 0.xxxxxxx - str += 1; - } else if ((str[0] & 0xe0) == 0xc0) { - // 110.xxxxx 10.xxxxxx - if (str + 1 >= end) return false; - if (!continuation(str + 1, 1)) return false; - if (overlong_2(str)) return false; - str += 2; - } else if ((str[0] & 0xf0) == 0xe0) { - // 1110.xxxx 10.xxxxxx 10.xxxxxx - if (str + 2 >= end) return false; - if (!continuation(str + 1, 2)) return false; - if (overlong_3(str)) return false; - if (surrogate(str)) return false; - str += 3; - } else if ((str[0] & 0xf8) == 0xf0) { - // 11110.xxx 10.xxxxxx 10.xxxxxx 10.xxxxxx - if (str + 3 >= end) return false; - if (!continuation(str + 1, 3)) return false; - if (overlong_4(str)) return false; - if (invalid(str)) return false; - str += 4; - } else { - return false; - } - } - return true; -} - -int main(int argc, char** argv) -{ - std::ios_base::sync_with_stdio(false); - for (std::string line; std::getline(std::cin, line);) { - if (valid_utf8((uint8_t*) line.data(), line.length())) { - std::cout << line << std::endl; - } - } - return 0; -} diff --git a/crawl/process_wet_file.sh b/crawl/process_wet_file.sh deleted file mode 100644 index 77d311eb1..000000000 --- a/crawl/process_wet_file.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/usr/env sh -# Copyright (c) 2018-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -set -e - -URL=$1 - -FILENAME=$(basename --suffix=".warc.wet.gz" "${URL}") - -echo "Processing ${FILENAME}." - -wget -q -P tmp "${URL}" - -#echo "Extracting ${FILENAME}.warc.wet.gz" -gunzip "tmp/${FILENAME}.warc.wet.gz" - -#echo "Language identification for ${FILENAME}.warc.wet" -fastText/fasttext predict-prob lid.176.bin "tmp/${FILENAME}.warc.wet" > "tmp/${FILENAME}.lid" - -#echo "Splitting ${FILENAME}.warc.wet per language" -paste "tmp/${FILENAME}.lid" "tmp/${FILENAME}.warc.wet" | \ - awk '($2 > 0.8 || ($1=="__label__hr" && $2 > 0.4)) && length() > 100 {lang = substr($1, 10); $1=""; $2=""; print $0 >> "shard/"lang".txt"}' - -#echo "Removing tmp files" -rm "tmp/${FILENAME}.lid" -rm "tmp/${FILENAME}.warc.wet" diff --git a/docs/aligned-vectors.md b/docs/aligned-vectors.md deleted file mode 100644 index ebcbafafe..000000000 --- a/docs/aligned-vectors.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -id: aligned-vectors -title: Aligned word vectors ---- - -We are publishing aligned word vectors for 44 languages based on the pre-trained vectors computed on [*Wikipedia*](https://www.wikipedia.org) using fastText. -The alignments are performed with the RCSLS method described in [*Joulin et al (2018)*](https://arxiv.org/abs/1804.07745). 
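As a minimal sketch of how these files can be used for word translation, assuming two of the `.vec` files listed below have been downloaded (the loader simply follows the text format described in the Format section, and the English-French word pair is only illustrative):

```py
import io
import numpy as np

def load_vectors(fname, maxload=50000):
    # First line is "<number of vectors> <dimension>"; then one word + vector per line.
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    n = min(n, maxload)
    words, x = [], np.zeros((n, d))
    for i, line in enumerate(fin):
        if i >= n:
            break
        tokens = line.rstrip().split(' ')
        words.append(tokens[0])
        x[i, :] = np.array(tokens[1:], dtype=float)
    # Normalize rows so that dot products are cosine similarities.
    x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    return words, x

en_words, en_vecs = load_vectors('wiki.en.align.vec')
fr_words, fr_vecs = load_vectors('wiki.fr.align.vec')

# Both files live in the same vector space, so the nearest French neighbor
# of an English vector is a translation candidate.
query = en_vecs[en_words.index('cat')]
print(fr_words[int(np.argmax(fr_vecs @ query))])
```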
- -### Vectors - -The aligned vectors can be downloaded from: - -||||| -|-|-|-|-| -| Afrikaans: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.af.align.vec) | Arabic: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ar.align.vec) | Bulgarian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bg.align.vec) | Bengali: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bn.align.vec) | -| Bosnian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.bs.align.vec) | Catalan: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ca.align.vec) | Czech: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.cs.align.vec) | Danish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.da.align.vec) | -| German: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.de.align.vec) | Greek: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.el.align.vec) | English: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.en.align.vec) | Spanish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.es.align.vec) | -| Estonian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.et.align.vec) | Persian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fa.align.vec) | Finnish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fi.align.vec) | French: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.fr.align.vec) | -| Hebrew: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.he.align.vec) | Hindi: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hi.align.vec) | Croatian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hr.align.vec) | Hungarian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.hu.align.vec) | -| Indonesian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.id.align.vec) | Italian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.it.align.vec) | Korean: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ko.align.vec) | Lithuanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.lt.align.vec) | -| Latvian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.lv.align.vec) | Macedonian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.mk.align.vec) | Malay: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ms.align.vec) | Dutch: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.nl.align.vec) | -| Norwegian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.no.align.vec) | Polish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.pl.align.vec) | Portuguese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.pt.align.vec) | Romanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ro.align.vec) | -| Russian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ru.align.vec) | Slovak: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sk.align.vec) | Slovenian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sl.align.vec) | Albanian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sq.align.vec) | -| Swedish: 
[*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.sv.align.vec) | Tamil: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.ta.align.vec) | Thai: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.th.align.vec) | Tagalog: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.tl.align.vec) |
-| Turkish: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.tr.align.vec) | Ukrainian: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.uk.align.vec) | Vietnamese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.vi.align.vec) | Chinese: [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.zh.align.vec) |
-
-### Format
-
-The word vectors come in the default text format of fastText.
-The first line gives the number of vectors and their dimension.
-The other lines contain a word followed by its vector. Each value is space separated.
-
-### License
-
-The word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
-
-### References
-
-If you use these word vectors, please cite the following papers:
-
-[1] A. Joulin, P. Bojanowski, T. Mikolov, H. Jegou, E. Grave, [*Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion*](https://arxiv.org/abs/1804.07745)
-
-```markup
-@InProceedings{joulin2018loss,
-  title={Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion},
-  author={Joulin, Armand and Bojanowski, Piotr and Mikolov, Tomas and J\'egou, Herv\'e and Grave, Edouard},
-  year={2018},
-  booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
-}
-```
-
-[2] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606)
-
-```markup
-@article{bojanowski2017enriching,
-  title={Enriching Word Vectors with Subword Information},
-  author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
-  journal={Transactions of the Association for Computational Linguistics},
-  volume={5},
-  year={2017},
-  issn={2307-387X},
-  pages={135--146}
-}
-```
diff --git a/docs/api.md b/docs/api.md
deleted file mode 100644
index 4e89b60b1..000000000
--- a/docs/api.md
+++ /dev/null
@@ -1,6 +0,0 @@
----
-id: api
-title: API
----
-
-We automatically generate our [API documentation](/docs/en/html/index.html) with doxygen.
diff --git a/docs/autotune.md b/docs/autotune.md
deleted file mode 100644
index 09dc904e7..000000000
--- a/docs/autotune.md
+++ /dev/null
@@ -1,156 +0,0 @@
----
-id: autotune
-title: Automatic hyperparameter optimization
----
-
-As we saw in [the tutorial](/docs/en/supervised-tutorial.html#more-epochs-and-larger-learning-rate), finding the best hyperparameters is crucial for building efficient models. However, searching for the best hyperparameters manually is difficult. Parameters are dependent, and the effect of each parameter varies from one dataset to another.
-
-FastText's autotune feature allows you to automatically find the best hyperparameters for your dataset.
-
-# How to use it
-
-In order to activate hyperparameter optimization, we must provide a validation file with the `-autotune-validation` argument.
-
-For example, using the same data as our [tutorial example](/docs/en/supervised-tutorial.html#our-first-classifier), autotune can be used in the following way:
-
-
-
-```sh
->> ./fasttext supervised -input cooking.train -output model_cooking -autotune-validation cooking.valid
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input='cooking.train', autotuneValidationFile='cooking.valid')
-```
-
-
-Then, fastText will search for the hyperparameters that give the best f1-score on the `cooking.valid` file:
-```sh
-Progress: 100.0% Trials: 27 Best score: 0.406763 ETA: 0h 0m 0s
-```
-
-Now we can test the obtained model with:
-
-
-```sh
->> ./fasttext test model_cooking.bin cooking.valid
-N 3000
-P@1 0.666
-R@1 0.288
-```
-
-```py
->>> model.test("cooking.valid")
-(3000L, 0.666, 0.288)
-```
-
-
-By default, the search will take 5 minutes. You can set the timeout in seconds with the `-autotune-duration` argument. For example, if you want to set the limit to 10 minutes:
-
-
-
-```sh
->> ./fasttext supervised -input cooking.train -output model_cooking -autotune-validation cooking.valid -autotune-duration 600
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input='cooking.train', autotuneValidationFile='cooking.valid', autotuneDuration=600)
-```
-
-
-
-While autotuning, fastText displays the best f1-score found so far. If we decide to stop the tuning before the time limit, we can send one `SIGINT` signal (via `CTRL-C` for example). FastText will then finish the current training, and retrain with the best parameters found so far.
-
-
-
-# Constrain model size
-
-As you may know, fastText can compress the model with [quantization](/docs/en/cheatsheet.html#quantization). However, this compression task comes with its own [hyperparameters](/docs/en/options.html) (`-cutoff`, `-retrain`, `-qnorm`, `-qout`, `-dsub`) that affect both the accuracy and the size of the final model.
-
-Fortunately, autotune can also find the hyperparameters for this compression task while targeting the desired model size. To this end, we can set the `-autotune-modelsize` argument:
-
-
-
-```sh
->> ./fasttext supervised -input cooking.train -output model_cooking -autotune-validation cooking.valid -autotune-modelsize 2M
-```
-This will produce a `.ftz` file with the best accuracy that fits within the desired size:
-```sh
->> ls -la model_cooking.ftz
--rw-r--r--. 1 celebio users 1990862 Aug 25 05:39 model_cooking.ftz
->> ./fasttext test model_cooking.ftz cooking.valid
-N 3000
-P@1 0.57
-R@1 0.246
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input='cooking.train', autotuneValidationFile='cooking.valid', autotuneModelSize="2M")
-```
-If you save the model, you will obtain a model file with the desired size:
-```py
->>> model.save_model("model_cooking.ftz")
->>> import os
->>> os.stat("model_cooking.ftz").st_size
-1990862
->>> model.test("cooking.valid")
-(3000L, 0.57, 0.246)
-```
-
-
-
-# How to set the optimization metric?
-
-
-
-By default, autotune will test the validation file you provide, exactly the same way as `./fasttext test model_cooking.bin cooking.valid` and try to optimize to get the highest [f1-score](https://en.wikipedia.org/wiki/F1_score). - -But, if we want to optimize the score of a specific label, say `__label__baking`, we can set the `-autotune-metric` argument: - -```sh ->> ./fasttext supervised -input cooking.train -output model_cooking -autotune-validation cooking.valid -autotune-metric f1:__label__baking -``` - -This is equivalent to manually optimize the f1-score we get when we test with `./fasttext test-label model_cooking.bin cooking.valid | grep __label__baking` in command line. - -Sometimes, you may be interested in predicting more than one label. For example, if you were optimizing the hyperparameters manually to get the best score to predict two labels, you would test with `./fasttext test model_cooking.bin cooking.valid 2`. You can also tell autotune to optimize the parameters by testing two labels with the `-autotune-predictions` argument. - -
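For instance, a minimal sketch with the Python API (assuming the same cooking data as in the examples above; `autotunePredictions` is the Python-side counterpart of `-autotune-predictions`):

```py
>>> import fasttext
>>> # Tune while scoring each trial as if predicting two labels per example,
>>> # i.e. the way model.test("cooking.valid", k=2) measures it.
>>> model = fasttext.train_supervised(input='cooking.train',
...                                   autotuneValidationFile='cooking.valid',
...                                   autotunePredictions=2)
```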
-By default, autotune will test the validation file you provide, exactly the same way as `model.test("cooking.valid")` and try to optimize to get the highest [f1-score](https://en.wikipedia.org/wiki/F1_score). - -But, if we want to optimize the score of a specific label, say `__label__baking`, we can set the `autotuneMetric` argument: - -```py ->>> import fasttext ->>> model = fasttext.train_supervised(input='cooking.train', autotuneValidationFile='cooking.valid', autotuneMetric="f1:__label__baking") -``` - -This is equivalent to manually optimize the f1-score we get when we test with `model.test_label('cooking.valid')['__label__baking']`. - -Sometimes, you may be interested in predicting more than one label. For example, if you were optimizing the hyperparameters manually to get the best score to predict two labels, you would test with `model.test("cooking.valid", k=2)`. You can also tell autotune to optimize the parameters by testing two labels with the `autotunePredictions` argument. - - -You can also force autotune to optimize for the best precision for a given recall, or the best recall for a given precision, for all labels, or for a specific label: - -For example, in order to get the best precision at recall = `30%`: -```sh ->> ./fasttext supervised [...] -autotune-metric precisionAtRecall:30 -``` -And to get the best precision at recall = `30%` for the label `__label__baking`: -```sh ->> ./fasttext supervised [...] -autotune-metric precisionAtRecall:30:__label__baking -``` - -Similarly, you can use `recallAtPrecision`: -```sh ->> ./fasttext supervised [...] -autotune-metric recallAtPrecision:30 ->> ./fasttext supervised [...] -autotune-metric recallAtPrecision:30:__label__baking -``` - - diff --git a/docs/cheatsheet.md b/docs/cheatsheet.md deleted file mode 100644 index be240e0f6..000000000 --- a/docs/cheatsheet.md +++ /dev/null @@ -1,89 +0,0 @@ ---- -id: cheatsheet -title: Cheatsheet ---- - -## Word representation learning - -In order to learn word vectors do: - -```bash -$ ./fasttext skipgram -input data.txt -output model -``` - -## Obtaining word vectors - -Print word vectors for a text file `queries.txt` containing words. 
- -```bash -$ ./fasttext print-word-vectors model.bin < queries.txt -``` - -## Text classification - -In order to train a text classifier do: - -```bash -$ ./fasttext supervised -input train.txt -output model -``` - -Once the model was trained, you can evaluate it by computing the precision and recall at k (P@k and R@k) on a test set using: - -```bash -$ ./fasttext test model.bin test.txt 1 -``` - -In order to obtain the k most likely labels for a piece of text, use: - -```bash -$ ./fasttext predict model.bin test.txt k -``` - -In order to obtain the k most likely labels and their associated probabilities for a piece of text, use: - -```bash -$ ./fasttext predict-prob model.bin test.txt k -``` - -If you want to compute vector representations of sentences or paragraphs, please use: - -```bash -$ ./fasttext print-sentence-vectors model.bin < text.txt -``` - -## Quantization - -In order to create a `.ftz` file with a smaller memory footprint do: - -```bash -$ ./fasttext quantize -output model -``` - -All other commands such as test also work with this model - -```bash -$ ./fasttext test model.ftz test.txt -``` - -## Autotune - -Activate hyperparameter optimization with `-autotune-validation` argument: - -```bash -$ ./fasttext supervised -input train.txt -output model -autotune-validation valid.txt -``` - -Set timeout (in seconds): -```bash -$ ./fasttext supervised -input train.txt -output model -autotune-validation valid.txt -autotune-duration 600 -``` - -Constrain the final model size: -```bash -$ ./fasttext supervised -input train.txt -output model -autotune-validation valid.txt -autotune-modelsize 2M -``` - - - - - diff --git a/docs/crawl-vectors.md b/docs/crawl-vectors.md deleted file mode 100644 index 671e7dad7..000000000 --- a/docs/crawl-vectors.md +++ /dev/null @@ -1,203 +0,0 @@ ---- -id: crawl-vectors -title: Word vectors for 157 languages ---- - -We distribute pre-trained word vectors for 157 languages, trained on [*Common Crawl*](http://commoncrawl.org/) and [*Wikipedia*](https://www.wikipedia.org) using fastText. -These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives. -We also distribute three new word analogy datasets, for French, Hindi and Polish. - -### Download directly with command line or from python - -In order to download with command line or from python code, you must have installed the python package as [described here](/docs/en/support.html#building-fasttext-python-module). - - - -```bash -$ ./download_model.py en # English -Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz - (19.78%) [=========> ] -``` -Once the download is finished, use the model as usual: -```bash -$ ./fasttext nn cc.en.300.bin 10 -Query word? -``` - -```py ->>> import fasttext.util ->>> fasttext.util.download_model('en', if_exists='ignore') # English ->>> ft = fasttext.load_model('cc.en.300.bin') -``` - - -#### 🤗 HuggingFace Integration -Word vectors for 157 languages available on the Hugging Face Hub under the [`fasttext`](https://huggingface.co/models?library=fasttext) tag and more documentation is available [here](https://huggingface.co/facebook/fasttext-en-vectors/blob/main/README.md). 
-
-```python
->>> import fasttext
->>> from huggingface_hub import hf_hub_download
-
->>> model_path = hf_hub_download(repo_id="facebook/fasttext-en-vectors", filename="model.bin")
->>> model = fasttext.load_model(model_path)
-```
-
-### Adapt the dimension
-
-The pre-trained word vectors we distribute have dimension 300. If you need a smaller size, you can use our dimension reducer.
-In order to use that feature, you must have installed the Python package as [described here](/docs/en/support.html#building-fasttext-python-module).
-
-For example, in order to get vectors of dimension 100:
-
-
-```bash
-$ ./reduce_model.py cc.en.300.bin 100
-Loading model
-Reducing matrix dimensions
-Saving model
-cc.en.100.bin saved
-```
-Then you can use the `cc.en.100.bin` model file as usual.
-
-
-```py
->>> import fasttext
->>> import fasttext.util
->>> ft = fasttext.load_model('cc.en.300.bin')
->>> ft.get_dimension()
-300
->>> fasttext.util.reduce_model(ft, 100)
->>> ft.get_dimension()
-100
-```
-Then you can use the `ft` model object as usual:
-```py
->>> ft.get_word_vector('hello').shape
-(100,)
->>> ft.get_nearest_neighbors('hello')
-[(0.775576114654541, u'heyyyy'), (0.7686290144920349, u'hellow'), (0.7663413286209106, u'hello-'), (0.7579624056816101, u'heyyyyy'), (0.7495524287223816, u'hullo'), (0.7473770380020142, u'.hello'), (0.7407292127609253, u'Hiiiii'), (0.7402616739273071, u'hellooo'), (0.7399682402610779, u'hello.'), (0.7396857738494873, u'Heyyyyy')]
-```
-or save it for later use:
-```py
->>> ft.save_model('cc.en.100.bin')
-```
-
-
-### Format
-
-The word vectors are available in both binary and text formats.
-
-Using the binary models, vectors for out-of-vocabulary words can be obtained with
-```bash
-$ ./fasttext print-word-vectors wiki.it.300.bin < oov_words.txt
-```
-where the file `oov_words.txt` contains out-of-vocabulary words.
-
-In the text format, each line contains a word followed by its vector.
-Each value is space separated, and words are sorted by frequency in descending order.
-These text models can easily be loaded in Python using the following code:
-```python
-import io
-
-def load_vectors(fname):
-    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
-    n, d = map(int, fin.readline().split())  # header: vocabulary size and vector dimension
-    data = {}
-    for line in fin:
-        tokens = line.rstrip().split(' ')
-        data[tokens[0]] = list(map(float, tokens[1:]))  # list() so the values can be reused
-    return data
-```
-
-### Tokenization
-
-We used the [*Stanford word segmenter*](https://nlp.stanford.edu/software/segmenter.html) for Chinese, [*Mecab*](http://taku910.github.io/mecab/) for Japanese and [*UETsegmenter*](https://github.com/phongnt570/UETsegmenter) for Vietnamese.
-For languages using the Latin, Cyrillic, Hebrew or Greek scripts, we used the tokenizer from the [*Europarl*](http://www.statmt.org/europarl/) preprocessing tools.
-For the remaining languages, we used the ICU tokenizer.
-
-More information about the training of these models can be found in the article [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893).
-
-### License
-
-The word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
-
-### References
-
-If you use these word vectors, please cite the following paper:
-
-E. Grave\*, P. Bojanowski\*, P. Gupta, A. Joulin, T.
Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893) - -```markup -@inproceedings{grave2018learning, - title={Learning Word Vectors for 157 Languages}, - author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas}, - booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)}, - year={2018} -} -``` - -### Evaluation datasets - -The analogy evaluation datasets described in the paper are available here: [French](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-fr.txt), [Hindi](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-hi.txt), [Polish](https://dl.fbaipublicfiles.com/fasttext/word-analogies/questions-words-pl.txt). - -### Models - -The models can be downloaded from: - -|||| -|-|-|-| -| Afrikaans: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.af.300.vec.gz) | Albanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sq.300.vec.gz) | Alemannic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.als.300.vec.gz) | -| Amharic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.am.300.vec.gz) | Arabic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ar.300.vec.gz) | Aragonese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.an.300.vec.gz) | -| Armenian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hy.300.vec.gz) | Assamese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.as.300.vec.gz) | Asturian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ast.300.vec.gz) | -| Azerbaijani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.az.300.vec.gz) | Bashkir: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ba.300.vec.gz) | Basque: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eu.300.vec.gz) | -| Bavarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bar.300.vec.gz) | Belarusian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.be.300.vec.gz) | Bengali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bn.300.vec.gz) | -| Bihari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.bin.gz), 
[text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bh.300.vec.gz) | Bishnupriya Manipuri: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bpy.300.vec.gz) | Bosnian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bs.300.vec.gz) | -| Breton: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.br.300.vec.gz) | Bulgarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bg.300.vec.gz) | Burmese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.my.300.vec.gz) | -| Catalan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ca.300.vec.gz) | Cebuano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ceb.300.vec.gz) | Central Bicolano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bcl.300.vec.gz) | -| Chechen: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ce.300.vec.gz) | Chinese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zh.300.vec.gz) | Chuvash: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cv.300.vec.gz) | -| Corsican: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.co.300.vec.gz) | Croatian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hr.300.vec.gz) | Czech: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.vec.gz) | -| Danish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.da.300.vec.gz) | Divehi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.dv.300.vec.gz) | Dutch: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz) | -| Eastern Punjabi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pa.300.vec.gz) | Egyptian Arabic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.arz.300.vec.gz) | Emilian-Romagnol: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eml.300.vec.gz) | -| English: 
[bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz) | Erzya: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.myv.300.vec.gz) | Esperanto: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.eo.300.vec.gz) | -| Estonian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.et.300.vec.gz) | Fiji Hindi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hif.300.vec.gz) | Finnish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fi.300.vec.gz) | -| French: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz) | Galician: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gl.300.vec.gz) | Georgian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ka.300.vec.gz) | -| German: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz) | Goan Konkani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gom.300.vec.gz) | Greek: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.el.300.vec.gz) | -| Gujarati: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gu.300.vec.gz) | Haitian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ht.300.vec.gz) | Hebrew: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.he.300.vec.gz) | -| Hill Mari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mrj.300.vec.gz) | Hindi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz) | Hungarian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hu.300.vec.gz) | -| Icelandic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.is.300.vec.gz) | Ido: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.io.300.vec.gz) | Ilokano: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.bin.gz), 
[text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ilo.300.vec.gz) | -| Indonesian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz) | Interlingua: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ia.300.vec.gz) | Irish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ga.300.vec.gz) | -| Italian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.it.300.vec.gz) | Japanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz) | Javanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.jv.300.vec.gz) | -| Kannada: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kn.300.vec.gz) | Kapampangan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pam.300.vec.gz) | Kazakh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.kk.300.vec.gz) | -| Khmer: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.km.300.vec.gz) | Kirghiz: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ky.300.vec.gz) | Korean: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.vec.gz) | -| Kurdish (Kurmanji): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ku.300.vec.gz) | Kurdish (Sorani): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ckb.300.vec.gz) | Latin: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.la.300.vec.gz) | -| Latvian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lv.300.vec.gz) | Limburgish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.li.300.vec.gz) | Lithuanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lt.300.vec.gz) | -| Lombard: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lmo.300.vec.gz) | Low Saxon: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nds.300.vec.gz) | Luxembourgish: 
[bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.lb.300.vec.gz) | -| Macedonian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mk.300.vec.gz) | Maithili: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mai.300.vec.gz) | Malagasy: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mg.300.vec.gz) | -| Malay: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ms.300.vec.gz) | Malayalam: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ml.300.vec.gz) | Maltese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mt.300.vec.gz) | -| Manx: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gv.300.vec.gz) | Marathi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.vec.gz) | Mazandarani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mzn.300.vec.gz) | -| Meadow Mari: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mhr.300.vec.gz) | Minangkabau: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.min.300.vec.gz) | Mingrelian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.xmf.300.vec.gz) | -| Mirandese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mwl.300.vec.gz) | Mongolian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mn.300.vec.gz) | Nahuatl: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nah.300.vec.gz) | -| Neapolitan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nap.300.vec.gz) | Nepali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ne.300.vec.gz) | Newar: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.new.300.vec.gz) | -| North Frisian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.frr.300.vec.gz) | Northern Sotho: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.bin.gz), 
[text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nso.300.vec.gz) | Norwegian (Bokmål): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.no.300.vec.gz) | -| Norwegian (Nynorsk): [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nn.300.vec.gz) | Occitan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.oc.300.vec.gz) | Oriya: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.or.300.vec.gz) | -| Ossetian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.os.300.vec.gz) | Palatinate German: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pfl.300.vec.gz) | Pashto: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ps.300.vec.gz) | -| Persian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz) | Piedmontese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pms.300.vec.gz) | Polish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz) | -| Portuguese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz) | Quechua: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.qu.300.vec.gz) | Romanian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ro.300.vec.gz) | -| Romansh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.rm.300.vec.gz) | Russian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz) | Sakha: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sah.300.vec.gz) | -| Sanskrit: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sa.300.vec.gz) | Sardinian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sc.300.vec.gz) | Scots: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sco.300.vec.gz) | -| Scottish Gaelic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.gd.300.vec.gz) | Serbian: 
[bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sr.300.vec.gz) | Serbo-Croatian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sh.300.vec.gz) | -| Sicilian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.scn.300.vec.gz) | Sindhi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sd.300.vec.gz) | Sinhalese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.si.300.vec.gz) | -| Slovak: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sk.300.vec.gz) | Slovenian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sl.300.vec.gz) | Somali: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.so.300.vec.gz) | -| Southern Azerbaijani: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.azb.300.vec.gz) | Spanish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz) | Sundanese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.su.300.vec.gz) | -| Swahili: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sw.300.vec.gz) | Swedish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.sv.300.vec.gz) | Tagalog: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tl.300.vec.gz) | -| Tajik: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tg.300.vec.gz) | Tamil: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.vec.gz) | Tatar: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tt.300.vec.gz) | -| Telugu: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.te.300.vec.gz) | Thai: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.th.300.vec.gz) | Tibetan: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.bo.300.vec.gz) | -| Turkish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.bin.gz), 
[text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tr.300.vec.gz) | Turkmen: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.tk.300.vec.gz) | Ukrainian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uk.300.vec.gz) | -| Upper Sorbian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hsb.300.vec.gz) | Urdu: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ur.300.vec.gz) | Uyghur: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ug.300.vec.gz) | -| Uzbek: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.uz.300.vec.gz) | Venetian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vec.300.vec.gz) | Vietnamese: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vi.300.vec.gz) | -| Volapük: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vo.300.vec.gz) | Walloon: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.wa.300.vec.gz) | Waray: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.war.300.vec.gz) | -| Welsh: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cy.300.vec.gz) | West Flemish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.vls.300.vec.gz) | West Frisian: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fy.300.vec.gz) | -| Western Punjabi: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pnb.300.vec.gz) | Yiddish: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yi.300.vec.gz) | Yoruba: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.yo.300.vec.gz) | -| Zazaki: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.diq.300.vec.gz) | Zeelandic: [bin](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.bin.gz), [text](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.zea.300.vec.gz) | diff --git a/docs/dataset.md b/docs/dataset.md deleted file mode 100644 index 9161a6b08..000000000 --- a/docs/dataset.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -id: dataset -title: Datasets ---- - -[Download 
YFCC100M Dataset](https://fb-public.box.com/s/htfdbrvycvroebv9ecaezaztocbcnsdn)
diff --git a/docs/english-vectors.md b/docs/english-vectors.md
deleted file mode 100644
index 52c35f1f5..000000000
--- a/docs/english-vectors.md
+++ /dev/null
@@ -1,53 +0,0 @@
----
-id: english-vectors
-title: English word vectors
----
-
-This page gathers several pre-trained word vectors trained using fastText.
-
-### Download pre-trained word vectors
-
-Pre-trained word vectors learned on different sources can be downloaded below:
-
-1. [wiki-news-300d-1M.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip): 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
-2. [wiki-news-300d-1M-subword.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M-subword.vec.zip): 1 million word vectors trained with subword information on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).
-3. [crawl-300d-2M.vec.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip): 2 million word vectors trained on Common Crawl (600B tokens).
-4. [crawl-300d-2M-subword.zip](https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip): 2 million word vectors trained with subword information on Common Crawl (600B tokens).
-
-### Format
-
-The first line of the file contains the number of words in the vocabulary and the size of the vectors.
-Each line contains a word followed by its vector, like in the default fastText text format.
-Each value is space separated. Words are ordered by descending frequency.
-These text models can easily be loaded in Python using the following code:
-```python
-import io
-
-def load_vectors(fname):
-    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
-    n, d = map(int, fin.readline().split())  # header: vocabulary size and vector dimension
-    data = {}
-    for line in fin:
-        tokens = line.rstrip().split(' ')
-        data[tokens[0]] = list(map(float, tokens[1:]))  # list() so the values can be reused
-    return data
-```
-
-### License
-
-These word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
-
-### References
-
-If you use these word vectors, please cite the following paper:
-
-T. Mikolov, E. Grave, P. Bojanowski, C. Puhrsch, A. Joulin. [*Advances in Pre-Training Distributed Word Representations*](https://arxiv.org/abs/1712.09405)
-
-```markup
-@inproceedings{mikolov2018advances,
-  title={Advances in Pre-Training Distributed Word Representations},
-  author={Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and Puhrsch, Christian and Joulin, Armand},
-  booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
-  year={2018}
-}
-```
diff --git a/docs/faqs.md b/docs/faqs.md
deleted file mode 100644
index c92b62479..000000000
--- a/docs/faqs.md
+++ /dev/null
@@ -1,66 +0,0 @@
----
-id: faqs
-title: FAQ
----
-
-## What is fastText? Are there tutorials?
-
-FastText is a library for text classification and representation. It transforms text into continuous vectors that can later be used on any language-related task. A few tutorials are available.
-
-## How can I reduce the size of my fastText models?
-
-fastText uses a hashtable for either word or character ngrams. The size of the hashtable directly impacts the size of a model. To reduce the size of the model, it is possible to reduce the size of this table with the option '-hash'. For example, a good value is 20000. Another option that greatly impacts the size of a model is the size of the vectors (`-dim`). This dimension can be reduced to save space, but this can significantly impact performance. If that still produces a model that is too big, one can further reduce the size of a trained model with the quantization option.
-```bash
-./fasttext quantize -output model
-```
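-
-Quantization is also exposed in the Python module. A minimal sketch, assuming a supervised model trained on a local `train.txt` (the cutoff value here is just an example):
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input='train.txt')
->>> model.quantize(input='train.txt', qnorm=True, retrain=True, cutoff=100000)  # retrain embeddings after applying the cutoff
->>> model.save_model('model.ftz')
-```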
-
-## What would be the best way to represent word phrases rather than words?
-
-Currently, the best approach to represent word phrases or sentences is to take a bag of words of the word vectors. Additionally, for phrases like “New York”, preprocessing the data so that it becomes a single token “New_York” can greatly help.
-
-## Why does fastText produce vectors even for unknown words?
-
-One of the key features of fastText word representation is its ability to produce vectors for any word, even made-up ones.
-Indeed, fastText word vectors are built from the vectors of the character substrings they contain.
-This makes it possible to build vectors even for misspelled words or concatenations of words.
-
-## Why is the hierarchical softmax slightly worse in performance than the full softmax?
-
-The hierarchical softmax is an approximation of the full softmax loss that allows training on a large number of classes efficiently. This often comes at the cost of a few percent of accuracy.
-Note also that this loss is designed for unbalanced classes, that is, when some classes are more frequent than others. If your dataset has a balanced number of examples per class, it is worth trying the negative sampling loss (`-loss ns -neg 100`).
-However, negative sampling will still be very slow at test time, since the full softmax will be computed.
-
-## Can we run the fastText program on a GPU?
-
-As of now, fastText only works on CPU.
-Please note that one of the goals of fastText is to be an efficient CPU tool, allowing models to be trained without requiring a GPU.
-
-## Can I use fastText with Python? Or other languages?
-
-[Python is officially supported](/docs/en/support.html#building-fasttext-python-module).
-There are a few unofficial wrappers for JavaScript, Lua and other languages available on GitHub.
-
-## Can I use fastText with continuous data?
-
-FastText works on discrete tokens and thus cannot be directly used on continuous tokens. However, one can discretize continuous tokens in order to use fastText on them, for example by rounding values to a specific digit ("12.3" becomes "12").
-
-## There are misspellings in the dictionary. Should we improve text normalization?
-
-If the words are infrequent, there is no need to worry.
-
-## I'm encountering a NaN, why could this be?
-
-You'll likely see this behavior because your learning rate is too high. Try reducing it until you don't see this error anymore.
-
-## My compiler / architecture can't build fastText. What should I do?
-Try a newer version of your compiler. We try to maintain compatibility with older versions of gcc and many platforms; however, sometimes maintaining backwards compatibility becomes very hard. In general, compilers and toolchains that ship with LTS versions of major Linux distributions should be fair game. In any case, create an issue with your compiler version and architecture and we'll try to implement compatibility.
-
-## How do I run fastText in a fully reproducible way? Each time I run it I get different results.
-If you run fastText multiple times you'll obtain slightly different results each time, due to the optimization algorithm (asynchronous stochastic gradient descent, or Hogwild). If you need to get the same results (e.g. to compare different sets of input parameters), set the 'thread' parameter to 1. In this way you'll get exactly the same results at each run (for the same input parameters).
-
-## Why do I get a probability of 1.00001?
-This is a known rounding issue. You can consider it as 1.0.
-
-## How can I change the dimension of the word vectors of a model file?
-If you already trained a model, or downloaded a pre-trained word vectors model, you can adapt the dimension of the word vectors with the `reduce_model.py` script or by calling `fasttext.util.reduce_model` from Python, as [described here](/docs/en/crawl-vectors.html#adapt-the-dimension).
diff --git a/docs/language-identification.md b/docs/language-identification.md
deleted file mode 100644
index 8ee0d3891..000000000
--- a/docs/language-identification.md
+++ /dev/null
@@ -1,74 +0,0 @@
----
-id: language-identification
-title: Language identification
----
-
-### Description
-
-We distribute two models for language identification, which can recognize 176 languages (see the list of ISO codes below). These models were trained on data from [Wikipedia](https://www.wikipedia.org/), [Tatoeba](https://tatoeba.org/eng/) and [SETimes](http://nlp.ffzg.hr/resources/corpora/setimes/), used under [CC-BY-SA](http://creativecommons.org/licenses/by-sa/3.0/).
-
-We distribute two versions of the models:
-
-* [lid.176.bin](https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin), which is faster and slightly more accurate, but has a file size of 126MB;
-* [lid.176.ftz](https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz), which is the compressed version of the model, with a file size of 917kB.
-
-These models were trained on UTF-8 data, and therefore expect UTF-8 as input.
-
-#### Updated model (NLLB project)
-A newer LID (**L**anguage **ID**entification) model was [released as part of the NLLB project](https://github.com/facebookresearch/fairseq/tree/nllb#lid-model) under a [CC-BY-NC 4.0](LICENSE.model.md) license.
-
-* [lid218e.bin](https://tinyurl.com/nllblid218e) uses different language codes from the original models: the ISO 639-3 code (e.g. "eng", "fra", "rus") plus an additional code describing the script (e.g. "eng_Latn", "ukr_Cyrl"). It has a file size of 1.2GB.
-
-You can read more about the data the model was trained on [here](https://github.com/facebookresearch/fairseq/blob/nllb/README.md#datasets).
-
-#### 🤗 HuggingFace Integration
-This model is [available](https://huggingface.co/facebook/fasttext-language-identification) on the Hugging Face Hub.
-
-```python
->>> import fasttext
->>> from huggingface_hub import hf_hub_download
-
->>> model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
->>> model = fasttext.load_model(model_path)
->>> model.predict("Hello, world!")
-
-(('__label__eng_Latn',), array([0.81148803]))
-
->>> model.predict("Hello, world!", k=5)
-
-(('__label__eng_Latn', '__label__vie_Latn', '__label__nld_Latn', '__label__pol_Latn', '__label__deu_Latn'),
- array([0.61224753, 0.21323682, 0.09696738, 0.01359863, 0.01319415]))
-```
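-
-The original `lid.176.bin` and `lid.176.ftz` files can be used directly with the fastText Python module as well. A minimal sketch, assuming `lid.176.ftz` has been downloaded to the working directory:
-
-```py
->>> import fasttext
->>> model = fasttext.load_model('lid.176.ftz')
->>> model.predict('Bonjour tout le monde', k=3)  # top-3 predictions; labels use the ISO codes listed below
-```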
-
-### License
-
-The models are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/).
-
-### List of supported languages
-```
-af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh
-```
-
-### References
-
-If you use these models, please cite the following papers:
-
-[1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759)
-```
-@article{joulin2016bag,
-  title={Bag of Tricks for Efficient Text Classification},
-  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
-  journal={arXiv preprint arXiv:1607.01759},
-  year={2016}
-}
-```
-[2] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651)
-```
-@article{joulin2016fasttext,
-  title={FastText.zip: Compressing text classification models},
-  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas},
-  journal={arXiv preprint arXiv:1612.03651},
-  year={2016}
-}
-```
diff --git a/docs/options.md b/docs/options.md
deleted file mode 100644
index 931966409..000000000
--- a/docs/options.md
+++ /dev/null
@@ -1,60 +0,0 @@
----
-id: options
-title: List of options
----
-
-Invoke a command without arguments to list available arguments and their default values:
-
-```bash
-$ ./fasttext supervised
-Empty input or output path.
-
-The following arguments are mandatory:
-  -input              training file path
-  -output             output file path
-
-The following arguments are optional:
-  -verbose            verbosity level [2]
-
-The following arguments for the dictionary are optional:
-  -minCount           minimal number of word occurrences [1]
-  -minCountLabel      minimal number of label occurrences [0]
-  -wordNgrams         max length of word ngram [1]
-  -bucket             number of buckets [2000000]
-  -minn               min length of char ngram [0]
-  -maxn               max length of char ngram [0]
-  -t                  sampling threshold [0.0001]
-  -label              labels prefix [__label__]
-
-The following arguments for training are optional:
-  -lr                 learning rate [0.1]
-  -lrUpdateRate       change the rate of updates for the learning rate [100]
-  -dim                size of word vectors [100]
-  -ws                 size of the context window [5]
-  -epoch              number of epochs [5]
-  -neg                number of negatives sampled [5]
-  -loss               loss function {ns, hs, softmax} [softmax]
-  -thread             number of threads [12]
-  -pretrainedVectors  pretrained word vectors for supervised learning []
-  -saveOutput         whether output params should be saved [0]
-
-The following arguments for quantization are optional:
-  -cutoff             number of words and ngrams to retain [0]
-  -retrain            finetune embeddings if a cutoff is applied [0]
-  -qnorm              quantizing the norm separately [0]
-  -qout               quantizing the classifier [0]
-  -dsub               size of each sub-vector [2]
-```
-
-Defaults may vary by mode. (Word-representation modes `skipgram` and `cbow` use a default `-minCount` of 5.)
-
-Hyperparameter optimization (autotune) is activated when you provide a validation file with the `-autotune-validation` argument.
-```text
-The following arguments are for autotune:
-  -autotune-validation   validation file to be used for evaluation
-  -autotune-metric       metric objective {f1, f1:labelname} [f1]
-  -autotune-predictions  number of predictions used for evaluation [1]
-  -autotune-duration     maximum duration in seconds [300]
-  -autotune-modelsize    constraint model file size [] (empty = do not quantize)
-```
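-
-These options map to keyword arguments in the Python module. A minimal sketch of an autotuned training run (assuming local `train.txt` and `valid.txt` files):
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input='train.txt', autotuneValidationFile='valid.txt', autotuneDuration=600, autotuneModelSize='2M')
->>> model.test('valid.txt')  # evaluate the tuned model
-```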
diff --git a/docs/pretrained-vectors.md b/docs/pretrained-vectors.md
deleted file mode 100644
index 2034b347d..000000000
--- a/docs/pretrained-vectors.md
+++ /dev/null
@@ -1,142 +0,0 @@
----
-id: pretrained-vectors
-title: Wiki word vectors
----
-
-We are publishing pre-trained word vectors for 294 languages, trained on [*Wikipedia*](https://www.wikipedia.org) using fastText.
-These vectors in dimension 300 were obtained using the skip-gram model described in [*Bojanowski et al. (2016)*](https://arxiv.org/abs/1607.04606) with default parameters.
-
-Please note that a newer version of multilingual word vectors is available at: [Word vectors for 157 languages](https://fasttext.cc/docs/en/crawl-vectors.html).
-
-### Models
-
-The models can be downloaded from:
-
-||||
-|-|-|-|
-| Abkhazian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ab.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ab.vec) | Acehnese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ace.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ace.vec) | Adyghe: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ady.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ady.vec) |
-| Afar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.aa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.aa.vec) | Afrikaans: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.af.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.af.vec) | Akan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ak.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ak.vec) |
-| Albanian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sq.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sq.vec) | Alemannic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.als.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.als.vec) | Amharic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.am.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.am.vec) |
-| Anglo-Saxon: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ang.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ang.vec) | Arabic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ar.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ar.vec) | Aragonese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.an.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.an.vec) |
-| Aramaic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.arc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.arc.vec) | Armenian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hy.vec) | Aromanian:
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.roa_rup.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.roa_rup.vec) | -| Assamese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.as.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.as.vec) | Asturian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ast.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ast.vec) | Avar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.av.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.av.vec) | -| Aymara: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ay.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ay.vec) | Azerbaijani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.az.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.az.vec) | Bambara: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bm.vec) | -| Banjar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bjn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bjn.vec) | Banyumasan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.map_bms.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.map_bms.vec) | Bashkir: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ba.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ba.vec) | -| Basque: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eu.vec) | Bavarian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bar.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bar.vec) | Belarusian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.be.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.be.vec) | -| Bengali: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bn.vec) | Bihari: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bh.vec) | Bishnupriya Manipuri: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bpy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bpy.vec) | -| Bislama: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bi.vec) | Bosnian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bs.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bs.vec) | Breton: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.br.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.br.vec) | -| Buginese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bug.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bug.vec) | Bulgarian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bg.vec) | 
Burmese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.my.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.my.vec) | -| Buryat: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bxr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bxr.vec) | Cantonese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_yue.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_yue.vec) | Catalan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ca.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ca.vec) | -| Cebuano: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ceb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ceb.vec) | Central Bicolano: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bcl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bcl.vec) | Chamorro: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ch.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ch.vec) | -| Chavacano: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cbk_zam.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cbk_zam.vec) | Chechen: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ce.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ce.vec) | Cherokee: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.chr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.chr.vec) | -| Cheyenne: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.chy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.chy.vec) | Chichewa: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ny.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ny.vec) | Chinese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.vec) | -| Choctaw: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cho.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cho.vec) | Chuvash: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cv.vec) | Classical Chinese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.vec) | -| Cornish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kw.vec) | Corsican: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.co.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.co.vec) | Cree: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cr.vec) | -| Crimean Tatar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.crh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.crh.vec) | Croatian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hr.zip), 
[*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hr.vec) | Czech: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cs.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cs.vec) | -| Danish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.da.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.da.vec) | Divehi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dv.vec) | Dutch: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nl.vec) | -| Dutch Low Saxon: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nds_nl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nds_nl.vec) | Dzongkha: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dz.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dz.vec) | Eastern Punjabi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pa.vec) | -| Egyptian Arabic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.arz.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.arz.vec) | Emilian_Romagnol: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eml.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eml.vec) | English: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.vec) | -| Erzya: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.myv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.myv.vec) | Esperanto: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.eo.vec) | Estonian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.et.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.et.vec) | -| Ewe: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ee.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ee.vec) | Extremaduran: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ext.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ext.vec) | Faroese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fo.vec) | -| Fiji Hindi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hif.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hif.vec) | Fijian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fj.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fj.vec) | Finnish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fi.vec) | -| Franco_Provençal: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.frp.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.frp.vec) | French: 
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fr.vec) | Friulian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fur.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fur.vec) | -| Fula: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ff.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ff.vec) | Gagauz: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gag.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gag.vec) | Galician: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gl.vec) | -| Gan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gan.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gan.vec) | Georgian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ka.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ka.vec) | German: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.de.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.de.vec) | -| Gilaki: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.glk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.glk.vec) | Goan Konkani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gom.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gom.vec) | Gothic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.got.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.got.vec) | -| Greek: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.el.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.el.vec) | Greenlandic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kl.vec) | Guarani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gn.vec) | -| Gujarati: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gu.vec) | Haitian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ht.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ht.vec) | Hakka: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hak.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hak.vec) | -| Hausa: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ha.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ha.vec) | Hawaiian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.haw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.haw.vec) | Hebrew: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.he.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.he.vec) | -| Herero: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hz.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hz.vec) | Hill Mari: 
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mrj.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mrj.vec) | Hindi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hi.vec) | -| Hiri Motu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ho.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ho.vec) | Hungarian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hu.vec) | Icelandic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.is.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.is.vec) | -| Ido: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.io.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.io.vec) | Igbo: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ig.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ig.vec) | Ilokano: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ilo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ilo.vec) | -| Indonesian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.id.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.id.vec) | Interlingua: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ia.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ia.vec) | Interlingue: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ie.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ie.vec) | -| Inuktitut: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.iu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.iu.vec) | Inupiak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ik.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ik.vec) | Irish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ga.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ga.vec) | -| Italian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.it.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.it.vec) | Jamaican Patois: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jam.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jam.vec) | Japanese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ja.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ja.vec) | -| Javanese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jv.vec) | Kabardian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kbd.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kbd.vec) | Kabyle: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kab.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kab.vec) | -| Kalmyk: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xal.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xal.vec) | Kannada: 
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kn.vec) | Kanuri: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kr.vec) | -| Kapampangan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pam.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pam.vec) | Karachay_Balkar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.krc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.krc.vec) | Karakalpak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kaa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kaa.vec) | -| Kashmiri: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ks.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ks.vec) | Kashubian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.csb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.csb.vec) | Kazakh: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kk.vec) | -| Khmer: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.km.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.km.vec) | Kikuyu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ki.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ki.vec) | Kinyarwanda: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rw.vec) | -| Kirghiz: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ky.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ky.vec) | Kirundi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rn.vec) | Komi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kv.vec) | -| Komi_Permyak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.koi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.koi.vec) | Kongo: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kg.vec) | Korean: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ko.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ko.vec) | -| Kuanyama: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kj.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.kj.vec) | Kurdish (Kurmanji): [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ku.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ku.vec) | Kurdish (Sorani): [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ckb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ckb.vec) | -| Ladino: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lad.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lad.vec) | Lak: 
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lbe.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lbe.vec) | Lao: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lo.vec) | -| Latgalian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ltg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ltg.vec) | Latin: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.la.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.la.vec) | Latvian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lv.vec) | -| Lezgian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lez.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lez.vec) | Ligurian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lij.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lij.vec) | Limburgish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.li.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.li.vec) | -| Lingala: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ln.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ln.vec) | Lithuanian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lt.vec) | Livvi_Karelian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.olo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.olo.vec) | -| Lojban: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jbo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.jbo.vec) | Lombard: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lmo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lmo.vec) | Low Saxon: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nds.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nds.vec) | -| Lower Sorbian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dsb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.dsb.vec) | Luganda: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lg.vec) | Luxembourgish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lb.vec) | -| Macedonian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mk.vec) | Maithili: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mai.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mai.vec) | Malagasy: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mg.vec) | -| Malay: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ms.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ms.vec) | Malayalam: 
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ml.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ml.vec) | Maltese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mt.vec) | -| Manx: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gv.vec) | Maori: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mi.vec) | Marathi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mr.vec) | -| Marshallese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mh.vec) | Mazandarani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mzn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mzn.vec) | Meadow Mari: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mhr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mhr.vec) | -| Min Dong: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cdo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cdo.vec) | Min Nan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_min_nan.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_min_nan.vec) | Minangkabau: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.min.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.min.vec) | -| Mingrelian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xmf.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xmf.vec) | Mirandese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mwl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mwl.vec) | Moksha: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mdf.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mdf.vec) | -| Moldovan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mo.vec) | Mongolian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mn.vec) | Muscogee: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mus.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.mus.vec) | -| Nahuatl: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nah.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nah.vec) | Nauruan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.na.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.na.vec) | Navajo: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nv.vec) | -| Ndonga: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ng.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ng.vec) | 
Neapolitan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nap.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nap.vec) | Nepali: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ne.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ne.vec) | -| Newar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.new.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.new.vec) | Norfolk: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pih.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pih.vec) | Norman: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nrm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nrm.vec) | -| North Frisian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.frr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.frr.vec) | Northern Luri: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lrc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.lrc.vec) | Northern Sami: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.se.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.se.vec) | -| Northern Sotho: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nso.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nso.vec) | Norwegian (Bokmål): [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.no.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.no.vec) | Norwegian (Nynorsk): [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nn.vec) | -| Novial: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nov.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.nov.vec) | Nuosu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ii.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ii.vec) | Occitan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.oc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.oc.vec) | -| Old Church Slavonic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cu.vec) | Oriya: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.or.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.or.vec) | Oromo: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.om.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.om.vec) | -| Ossetian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.os.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.os.vec) | Palatinate German: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pfl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pfl.vec) | Pali: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pi.vec) | -| Pangasinan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pag.zip), 
[*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pag.vec) | Papiamentu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pap.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pap.vec) | Pashto: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ps.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ps.vec) | -| Pennsylvania German: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pdc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pdc.vec) | Persian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fa.vec) | Picard: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pcd.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pcd.vec) | -| Piedmontese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pms.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pms.vec) | Polish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pl.vec) | Pontic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pnt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pnt.vec) | -| Portuguese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pt.vec) | Quechua: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.qu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.qu.vec) | Ripuarian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ksh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ksh.vec) | -| Romani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rmy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rmy.vec) | Romanian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ro.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ro.vec) | Romansh: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rm.vec) | -| Russian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ru.vec) | Rusyn: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rue.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.rue.vec) | Sakha: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sah.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sah.vec) | -| Samoan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sm.vec) | Samogitian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bat_smg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bat_smg.vec) | Sango: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sg.vec) | -| Sanskrit: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sa.zip), 
[*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sa.vec) | Sardinian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sc.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sc.vec) | Saterland Frisian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.stq.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.stq.vec) | -| Scots: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sco.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sco.vec) | Scottish Gaelic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gd.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.gd.vec) | Serbian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sr.vec) | -| Serbo_Croatian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sh.vec) | Sesotho: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.st.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.st.vec) | Shona: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sn.vec) | -| Sicilian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.scn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.scn.vec) | Silesian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.szl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.szl.vec) | Simple English: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec) | -| Sindhi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sd.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sd.vec) | Sinhalese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.si.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.si.vec) | Slovak: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sk.vec) | -| Slovenian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sl.vec) | Somali: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.so.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.so.vec) | Southern Azerbaijani: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.azb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.azb.vec) | -| Spanish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.es.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.es.vec) | Sranan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.srn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.srn.vec) | Sundanese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.su.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.su.vec) | -| Swahili: 
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sw.vec) | Swati: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ss.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ss.vec) | Swedish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sv.vec) | -| Tagalog: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tl.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tl.vec) | Tahitian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ty.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ty.vec) | Tajik: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tg.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tg.vec) | -| Tamil: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ta.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ta.vec) | Tarantino: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.roa_tara.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.roa_tara.vec) | Tatar: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tt.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tt.vec) | -| Telugu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.te.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.te.vec) | Tetum: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tet.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tet.vec) | Thai: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.th.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.th.vec) | -| Tibetan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.bo.vec) | Tigrinya: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ti.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ti.vec) | Tok Pisin: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tpi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tpi.vec) | -| Tongan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.to.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.to.vec) | Tsonga: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ts.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ts.vec) | Tswana: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tn.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tn.vec) | -| Tulu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tcy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tcy.vec) | Tumbuka: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tum.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tum.vec) | Turkish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tr.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tr.vec) | -| Turkmen: 
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tk.vec) | Tuvan: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tyv.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tyv.vec) | Twi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tw.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.tw.vec) | -| Udmurt: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.udm.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.udm.vec) | Ukrainian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.uk.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.uk.vec) | Upper Sorbian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hsb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hsb.vec) | -| Urdu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ur.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ur.vec) | Uyghur: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ug.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ug.vec) | Uzbek: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.uz.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.uz.vec) | -| Venda: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ve.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ve.vec) | Venetian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vec.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vec.vec) | Vepsian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vep.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vep.vec) | -| Vietnamese: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vi.vec) | Volapük: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vo.vec) | Võro: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fiu_vro.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fiu_vro.vec) | -| Walloon: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wa.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wa.vec) | Waray: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.war.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.war.vec) | Welsh: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.cy.vec) | -| West Flemish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vls.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.vls.vec) | West Frisian: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fy.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fy.vec) | Western Punjabi: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pnb.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pnb.vec) | -| Wolof: 
[*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wo.vec) | Wu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wuu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.wuu.vec) | Xhosa: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xh.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.xh.vec) | -| Yiddish: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.yi.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.yi.vec) | Yoruba: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.yo.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.yo.vec) | Zazaki: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.diq.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.diq.vec) | -| Zeelandic: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zea.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zea.vec) | Zhuang: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.za.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.za.vec) | Zulu: [*bin+text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zu.zip), [*text*](https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zu.vec) | - -### Format - -The word vectors come in fastText's default binary and text formats. -In the text format, each line contains a word followed by its vector. Values are space-separated. -Words are ordered by descending frequency. - -### License - -The word vectors are distributed under the [*Creative Commons Attribution-Share-Alike License 3.0*](https://creativecommons.org/licenses/by-sa/3.0/). - -### References - -If you use these word vectors, please cite the following paper: - -P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606) - -```markup -@article{bojanowski2017enriching, - title={Enriching Word Vectors with Subword Information}, - author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, - journal={Transactions of the Association for Computational Linguistics}, - volume={5}, - year={2017}, - issn={2307-387X}, - pages={135--146} -} -``` diff --git a/docs/python-module.md b/docs/python-module.md deleted file mode 100644 index 73a4d3c89..000000000 --- a/docs/python-module.md +++ /dev/null @@ -1,314 +0,0 @@ ---- -id: python-module -title: Python module ---- - -In this document we present how to use fastText in Python. - -## Table of contents - -* [Requirements](#requirements) -* [Installation](#installation) -* [Usage overview](#usage-overview) - * [Word representation model](#word-representation-model) - * [Text classification model](#text-classification-model) - * [IMPORTANT: Preprocessing data / encoding conventions](#important-preprocessing-data-encoding-conventions) - * [More examples](#more-examples) -* [API](#api) - * [`train_unsupervised` parameters](#train_unsupervised-parameters) - * [`train_supervised` parameters](#train_supervised-parameters) - * [`model` object](#model-object) - - -# Requirements - -[fastText](https://fasttext.cc/) builds on modern macOS and Linux distributions. -Since it uses C\++11 features, it requires a compiler with good C++11 support.
You will need [Python](https://www.python.org/) (version 2.7 or ≥ 3.4), [NumPy](http://www.numpy.org/) & [SciPy](https://www.scipy.org/) and [pybind11](https://github.com/pybind/pybind11). - - -# Installation - -To install the latest release, you can do: -```bash -$ pip install fasttext -``` - -or, to get the latest development version of fasttext, you can install from our GitHub repository: -```bash -$ git clone https://github.com/facebookresearch/fastText.git -$ cd fastText -$ sudo pip install . -$ # or: -$ sudo python setup.py install -``` - -# Usage overview - - -## Word representation model - -In order to learn word vectors, as [described here](/docs/en/references.html#enriching-word-vectors-with-subword-information), we can use the `fasttext.train_unsupervised` function like this: - - -```py -import fasttext - -# Skipgram model: -model = fasttext.train_unsupervised('data.txt', model='skipgram') - -# or, cbow model: -model = fasttext.train_unsupervised('data.txt', model='cbow') - -``` - -where `data.txt` is a training file containing UTF-8 encoded text. - - -The returned `model` object represents your learned model, and you can use it to retrieve information. - -```py -print(model.words) # list of words in dictionary -print(model['king']) # get the vector of the word 'king' -``` - - -### Saving and loading a model object - -You can save your trained model object by calling the function `save_model`. -```py -model.save_model("model_filename.bin") -``` - -and retrieve it later with the function `load_model`: -```py -model = fasttext.load_model("model_filename.bin") -``` - -For more information about word representation usage of fastText, you can refer to our [word representations tutorial](/docs/en/unsupervised-tutorial.html). - - -## Text classification model - -In order to train a text classifier using the method [described here](/docs/en/references.html#bag-of-tricks-for-efficient-text-classification), we can use the `fasttext.train_supervised` function like this: - - -```py -import fasttext - -model = fasttext.train_supervised('data.train.txt') -``` - -where `data.train.txt` is a text file containing one training sentence per line, along with its labels. By default, we assume that labels are words prefixed by the string `__label__`. - -Once the model is trained, we can retrieve the list of words and labels: - -```py -print(model.words) -print(model.labels) -``` - -To evaluate our model by computing the precision at 1 (P@1) and the recall on a test set, we use the `test` function: - -```py -def print_results(N, p, r): - print("N\t" + str(N)) - print("P@{}\t{:.3f}".format(1, p)) - print("R@{}\t{:.3f}".format(1, r)) - -print_results(*model.test('test.txt')) -``` - -We can also predict labels for a specific text: - -```py -model.predict("Which baking dish is best to bake a banana bread ?") -``` - -By default, `predict` returns only one label: the one with the highest probability. You can also predict more than one label by specifying the parameter `k`: -```py -model.predict("Which baking dish is best to bake a banana bread ?", k=3) -``` - -If you want to predict more than one sentence, you can pass an array of strings: - -```py -model.predict(["Which baking dish is best to bake a banana bread ?", "Why not put knives in the dishwasher?"], k=3) -``` - - -Of course, you can also save and load a model to/from a file as [in the word representation usage](#saving-and-loading-a-model-object).
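To make `predict`'s return format concrete, here is a minimal sketch; `data.train.txt` is a placeholder path and the labels shown in the comments are illustrative, not guaranteed outputs:

```py
import fasttext

model = fasttext.train_supervised('data.train.txt')  # placeholder training file

# For a single string, predict returns a tuple of labels and a parallel
# array of probabilities, sorted by decreasing probability.
labels, probs = model.predict("Which baking dish is best to bake a banana bread ?", k=3)
print(labels)  # e.g. ('__label__baking', '__label__equipment', '__label__bread')
print(probs)   # e.g. [0.83 0.09 0.04]

# For a list of strings, you get one list of labels and one list of
# probabilities per input text.
all_labels, all_probs = model.predict(
    ["Which baking dish is best to bake a banana bread ?",
     "Why not put knives in the dishwasher?"], k=3)
```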
- -For more information about text classification usage of fastText, you can refer to our [text classification tutorial](/docs/en/supervised-tutorial.html). - - - - -### Compress model files with quantization - -When you want to save a supervised model file, fastText can compress it so as to produce a much smaller model file, sacrificing only a little bit of performance. - -```py -# with the previously trained `model` object, call: -model.quantize(input='data.train.txt', retrain=True) - -# then display results (here `valid_data` is the path to a validation file) and save the new model: -print_results(*model.test(valid_data)) -model.save_model("model_filename.ftz") -``` - -`model_filename.ftz` will have a much smaller size than `model_filename.bin`. - -For further reading on quantization, you can refer to [this paragraph from our blog post](/blog/2017/10/02/blog-post.html#model-compression). - -## IMPORTANT: Preprocessing data / encoding conventions - -In general it is important to properly preprocess your data. In particular, our example scripts in the [root folder](https://github.com/facebookresearch/fastText) do this. - -fastText assumes UTF-8 encoded text. All text must be [unicode for Python2](https://docs.python.org/2/library/functions.html#unicode) and [str for Python3](https://docs.python.org/3.5/library/stdtypes.html#textseq). The passed text will be [encoded as UTF-8 by pybind11](https://pybind11.readthedocs.io/en/master/advanced/cast/strings.html?highlight=utf-8#strings-bytes-and-unicode-conversions) before being passed to the fastText C++ library. This means it is important to use UTF-8 encoded text when building a model. On Unix-like systems you can convert text using [iconv](https://en.wikipedia.org/wiki/Iconv). - -fastText will tokenize (split text into pieces) based on the following ASCII characters (bytes). In particular, it is not aware of UTF-8 whitespace. We advise the user to convert UTF-8 whitespace / word boundaries into one of the following symbols as appropriate. - -* space -* tab -* vertical tab -* carriage return -* formfeed -* the null character - -The newline character is used to delimit lines of text. In particular, the EOS token is appended to a line of text if a newline character is encountered. The only exception is if the number of tokens exceeds the MAX\_LINE\_SIZE constant as defined in the [Dictionary header](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h). This means that if you have text that is not separated by newlines, such as the [fil9 dataset](http://mattmahoney.net/dc/textdata), it will be broken into chunks of MAX\_LINE\_SIZE tokens and the EOS token is not appended. - -The length of a token is the number of UTF-8 characters, computed by considering the [leading two bits of a byte](https://en.wikipedia.org/wiki/UTF-8#Description) to identify [subsequent bytes of a multi-byte sequence](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc). Knowing this is especially important when choosing the minimum and maximum length of subwords. Further, the EOS token (as specified in the [Dictionary header](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h)) is considered a character and will not be broken into subwords. - -## More examples - -To gain a better understanding of fastText models, please consult the main [README](https://github.com/facebookresearch/fastText/blob/master/README.md) and in particular [the tutorials on our website](https://fasttext.cc/docs/en/supervised-tutorial.html).
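As one more example, here is a rough pure-Python re-implementation of the tokenization rule described in the preprocessing section above. It is an illustration only: the real tokenizer lives in the C++ code and is exposed as `fasttext.tokenize`.

```py
# Only these ASCII bytes act as token delimiters; UTF-8 whitespace
# (e.g. a non-breaking space) does not, as described above.
DELIMITERS = {' ', '\t', '\v', '\r', '\f', '\0'}

def split_like_fasttext(line):
    tokens, current = [], []
    for ch in line:
        if ch in DELIMITERS:
            if current:
                tokens.append(''.join(current))
                current = []
        else:
            current.append(ch)
    if current:
        tokens.append(''.join(current))
    return tokens

print(split_like_fasttext("hello\u00a0world foo"))
# ['hello\xa0world', 'foo'] -- the non-breaking space stays inside the first token
```

In other words, two words separated only by a non-breaking space are treated as a single token, which is why converting such whitespace beforehand matters.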
- -You can find further Python examples in [the doc folder](https://github.com/facebookresearch/fastText/tree/master/python/doc/examples). - -As with any package, you can get help on any Python function using the `help` function. - -For example: - -``` ->>> import fasttext ->>> help(fasttext.FastText) - -Help on module fasttext.FastText in fasttext: - -NAME - fasttext.FastText - -DESCRIPTION - # Copyright (c) 2017-present, Facebook, Inc. - # All rights reserved. - # - # This source code is licensed under the MIT license found in the - # LICENSE file in the root directory of this source tree. - -FUNCTIONS - load_model(path) - Load a model given a filepath and return a model object. - - tokenize(text) - Given a string of text, tokenize it and return a list of tokens -[...] -``` - - -# API - - -## `train_unsupervised` parameters - -```python - input # training file path (required) - model # unsupervised fasttext model {cbow, skipgram} [skipgram] - lr # learning rate [0.05] - dim # size of word vectors [100] - ws # size of the context window [5] - epoch # number of epochs [5] - minCount # minimal number of word occurrences [5] - minn # min length of char ngram [3] - maxn # max length of char ngram [6] - neg # number of negatives sampled [5] - wordNgrams # max length of word ngram [1] - loss # loss function {ns, hs, softmax, ova} [ns] - bucket # number of buckets [2000000] - thread # number of threads [number of cpus] - lrUpdateRate # change the rate of updates for the learning rate [100] - t # sampling threshold [0.0001] - verbose # verbose [2] -``` - -## `train_supervised` parameters - -```python - input # training file path (required) - lr # learning rate [0.1] - dim # size of word vectors [100] - ws # size of the context window [5] - epoch # number of epochs [5] - minCount # minimal number of word occurrences [1] - minCountLabel # minimal number of label occurrences [1] - minn # min length of char ngram [0] - maxn # max length of char ngram [0] - neg # number of negatives sampled [5] - wordNgrams # max length of word ngram [1] - loss # loss function {ns, hs, softmax, ova} [softmax] - bucket # number of buckets [2000000] - thread # number of threads [number of cpus] - lrUpdateRate # change the rate of updates for the learning rate [100] - t # sampling threshold [0.0001] - label # label prefix ['__label__'] - verbose # verbose [2] - pretrainedVectors # pretrained word vectors (.vec file) for supervised learning [] -``` - -## `model` object - -The `train_supervised`, `train_unsupervised` and `load_model` functions return an instance of the `_FastText` class, which we generally call the `model` object. - -This object exposes the training arguments as properties: `lr`, `dim`, `ws`, `epoch`, `minCount`, `minCountLabel`, `minn`, `maxn`, `neg`, `wordNgrams`, `loss`, `bucket`, `thread`, `lrUpdateRate`, `t`, `label`, `verbose`, `pretrainedVectors`. So `model.wordNgrams` will give you the max length of word ngrams used for training this model. - -In addition, the object exposes several functions: - -```python - get_dimension # Get the dimension (size) of a lookup vector (hidden layer). - # This is equivalent to `dim` property. - get_input_vector # Given an index, get the corresponding vector of the Input Matrix. - get_input_matrix # Get a copy of the full input matrix of a Model. - get_labels # Get the entire list of labels of the dictionary - # This is equivalent to `labels` property. - get_line # Split a line of text into words and labels. - get_output_matrix # Get a copy of the full output matrix of a Model.
- get_sentence_vector # Given a string, get a single vector representation. This function - # assumes it is given a single line of text. We split words on - # whitespace (space, newline, tab, vertical tab) and the control - # characters carriage return, formfeed and the null character. - get_subword_id # Given a subword, return the index (within input matrix) it hashes to. - get_subwords # Given a word, get the subwords and their indices. - get_word_id # Given a word, get the word id within the dictionary. - get_word_vector # Get the vector representation of a word. - get_words # Get the entire list of words of the dictionary - # This is equivalent to `words` property. - is_quantized # Whether the model has been quantized. - predict # Given a string, get a list of labels and a list of corresponding probabilities. - quantize # Quantize the model, reducing its size and memory footprint. - save_model # Save the model to the given path. - test # Evaluate the supervised model using the file given by path. - test_label # Return the precision and recall score for each label. -``` - -The properties `words`, `labels` return the words and labels from the dictionary: -```py -model.words # equivalent to model.get_words() -model.labels # equivalent to model.get_labels() -``` - -The object overrides the `__getitem__` and `__contains__` functions in order to return the representation of a word and to check if a word is in the vocabulary. - -```py -model['king'] # equivalent to model.get_word_vector('king') -'king' in model # equivalent to `'king' in model.get_words()` -``` diff --git a/docs/references.md b/docs/references.md deleted file mode 100644 index 304b257ed..000000000 --- a/docs/references.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -id: references -title: References ---- - -Please cite [1](#enriching-word-vectors-with-subword-information) if using this code for learning word representations, or [2](#bag-of-tricks-for-efficient-text-classification) if using it for text classification. - -[1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606) - -```markup -@article{bojanowski2016enriching, - title={Enriching Word Vectors with Subword Information}, - author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, - journal={arXiv preprint arXiv:1607.04606}, - year={2016} -} -``` - -[2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759) - -```markup -@article{joulin2016bag, - title={Bag of Tricks for Efficient Text Classification}, - author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, - journal={arXiv preprint arXiv:1607.01759}, - year={2016} -} -``` - -[3] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651) - -```markup -@article{joulin2016fasttext, - title={FastText.zip: Compressing text classification models}, - author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas}, - journal={arXiv preprint arXiv:1612.03651}, - year={2016} -} -``` - -(\* These authors contributed equally.)
diff --git a/docs/supervised-models.md b/docs/supervised-models.md deleted file mode 100644 index 41519b20a..000000000 --- a/docs/supervised-models.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -id: supervised-models -title: Supervised models ---- - -This page gathers pre-trained supervised models trained on several datasets. - -### Description - -The regular models are trained using the procedure described in [1]. They can be reproduced using the `classification-results.sh` script in our GitHub repository. The quantized models are built using the respective supervised settings, adding the following flags to the `quantize` subcommand. - -```bash --qnorm -retrain -cutoff 100000 -``` - -### Table of models - -Each entry describes the test accuracy and size of the model. You can click on a table cell to download the corresponding model. - -| dataset | ag news | amazon review full | amazon review polarity | dbpedia | -|-----------|-----------------------|-----------------------|------------------------|------------------------| -| regular | [0.924 / 387MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/ag_news.bin) | [0.603 / 462MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/amazon_review_full.bin) | [0.946 / 471MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/amazon_review_polarity.bin) | [0.986 / 427MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/dbpedia.bin) | -| compressed | [0.92 / 1.6MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/ag_news.ftz) | [0.599 / 1.6MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/amazon_review_full.ftz) | [0.93 / 1.6MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/amazon_review_polarity.ftz) | [0.984 / 1.7MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/dbpedia.ftz) | - -| dataset | sogou news | yahoo answers | yelp review polarity | yelp review full | -|-----------|----------------------|------------------------|----------------------|------------------------| -| regular | [0.969 / 402MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/sogou_news.bin) | [0.724 / 494MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/yahoo_answers.bin)| [0.957 / 409MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/yelp_review_polarity.bin)| [0.639 / 412MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/yelp_review_full.bin)| -| compressed | [0.968 / 1.4MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/sogou_news.ftz) | [0.717 / 1.6MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/yahoo_answers.ftz) | [0.957 / 1.5MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/yelp_review_polarity.ftz) | [0.636 / 1.5MB](https://dl.fbaipublicfiles.com/fasttext/supervised-models/yelp_review_full.ftz) | - -### References - -If you use these models, please cite the following papers: - -[1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759) - -```markup -@article{joulin2016bag, - title={Bag of Tricks for Efficient Text Classification}, - author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas}, - journal={arXiv preprint arXiv:1607.01759}, - year={2016} -} -``` - -[2] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T.
Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651) - -```markup -@article{joulin2016fasttext, - title={FastText.zip: Compressing text classification models}, - author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas}, - journal={arXiv preprint arXiv:1612.03651}, - year={2016} -} -``` diff --git a/docs/supervised-tutorial.md b/docs/supervised-tutorial.md deleted file mode 100644 index 1f0a1fb06..000000000 --- a/docs/supervised-tutorial.md +++ /dev/null @@ -1,597 +0,0 @@ ---- -id: supervised-tutorial -title: Text classification ---- - -Text classification is a core problem in many applications, such as spam detection, sentiment analysis or smart replies. In this tutorial, we describe how to build a text classifier with the fastText tool. - -## What is text classification? - -The goal of text classification is to assign documents (such as emails, posts, text messages, product reviews, etc.) to one or multiple categories. Such categories can be review scores, spam vs. non-spam, or the language in which the document was typed. Nowadays, the dominant approach to building such classifiers is machine learning, that is, learning classification rules from examples. In order to build such classifiers, we need labeled data, which consists of documents and their corresponding categories (or tags, or labels). - -As an example, we build a classifier which automatically classifies Stack Exchange questions about cooking into one of several possible tags, such as `pot`, `bowl` or `baking`. - -## Installing fastText - -The first step of this tutorial is to install and build fastText. It only requires a C++ compiler with good C++11 support. - -Let us start by downloading the [most recent release](https://github.com/facebookresearch/fastText/releases): - -```bash -$ wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip -$ unzip v0.9.2.zip -``` - -Move to the fastText directory and build it: - -```bash -$ cd fastText-0.9.2 -# for the command line tool: -$ make -# for the Python bindings: -$ pip install . -``` - - - -
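If you installed the Python bindings, a quick sanity check is to import the module and call one of its helpers (a minimal sketch; `fasttext.tokenize` is one of the functions shown in the help output later in this tutorial):

```py
import fasttext  # should import without error if the bindings built correctly

# tokenize splits on fastText's whitespace conventions
print(fasttext.tokenize("hello world"))  # expect: ['hello', 'world']
```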
-
-Running the binary without any argument will print the high level documentation, showing the different use cases supported by fastText:
-
-```bash
->> ./fasttext
-usage: fasttext <command> <args>
-
-The commands supported by fasttext are:
-
-  supervised              train a supervised classifier
-  quantize                quantize a model to reduce the memory usage
-  test                    evaluate a supervised classifier
-  predict                 predict most likely labels
-  predict-prob            predict most likely labels with probabilities
-  skipgram                train a skipgram model
-  cbow                    train a cbow model
-  print-word-vectors      print word vectors given a trained model
-  print-sentence-vectors  print sentence vectors given a trained model
-  nn                      query for nearest neighbors
-  analogies               query for analogies
-
-```
-
-In this tutorial, we mainly use the `supervised`, `test` and `predict` subcommands, which correspond to learning (and using) a text classifier. For an introduction to the other functionalities of fastText, please see the [tutorial about learning word vectors](https://fasttext.cc/docs/en/unsupervised-tutorial.html).
-
-Calling the help function will show the high level documentation of the library:
-```py
->>> import fasttext
->>> help(fasttext.FastText)
-Help on module fasttext.FastText in fasttext:
-
-NAME
-    fasttext.FastText
-
-DESCRIPTION
-    # Copyright (c) 2017-present, Facebook, Inc.
-    # All rights reserved.
-    #
-    # This source code is licensed under the MIT license found in the
-    # LICENSE file in the root directory of this source tree.
-
-FUNCTIONS
-    load_model(path)
-        Load a model given a filepath and return a model object.
-
-    read_args(arg_list, arg_dict, arg_names, default_values)
-
-    tokenize(text)
-        Given a string of text, tokenize it and return a list of tokens
-
-    train_supervised(*kargs, **kwargs)
-        Train a supervised model and return a model object.
-
-        input must be a filepath. The input text does not need to be tokenized
-        as per the tokenize function, but it must be preprocessed and encoded
-        as UTF-8. You might want to consult standard preprocessing scripts such
-        as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html
-
-        The input file must contain at least one label per line. For an
-        example consult the example datasets which are part of the fastText
-        repository such as the dataset pulled by classification-example.sh.
-
-    train_unsupervised(*kargs, **kwargs)
-        Train an unsupervised model and return a model object.
-
-        input must be a filepath. The input text does not need to be tokenized
-        as per the tokenize function, but it must be preprocessed and encoded
-        as UTF-8. You might want to consult standard preprocessing scripts such
-        as tokenizer.perl mentioned here: http://www.statmt.org/wmt07/baseline.html
-
-        The input field must not contain any labels or use the specified label prefix
-        unless it is ok for those words to be ignored. For an example consult the
-        dataset pulled by the example script word-vector-example.sh, which is
-        part of the fastText repository.
-```
-
-In this tutorial, we mainly use `train_supervised`, which returns a model object, and call `test` and `predict` on this object. This corresponds to learning (and using) a text classifier. For an introduction to the other functionalities of fastText, please see the [tutorial about learning word vectors](https://fasttext.cc/docs/en/unsupervised-tutorial.html).
-
-
-## Getting and preparing the data
-
-As mentioned in the introduction, we need labeled data to train our supervised classifier. In this tutorial, we are interested in building a classifier to automatically recognize the topic of a Stack Exchange question about cooking. Let's download examples of questions from [the cooking section of Stackexchange](http://cooking.stackexchange.com/), and their associated tags:
-
-```bash
->> wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz && tar xvzf cooking.stackexchange.tar.gz
->> head cooking.stackexchange.txt
-```
-
-Each line of the text file contains a list of labels, followed by the corresponding document. All the labels start with the `__label__` prefix, which is how fastText recognizes what is a label and what is a word. The model is then trained to predict the labels given the words in the document.
-
-Before training our first classifier, we need to split the data into training and validation sets. We will use the validation set to evaluate how good the learned classifier is on new data.
-
-```bash
->> wc cooking.stackexchange.txt
-   15404  169582 1401900 cooking.stackexchange.txt
-```
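Before splitting, it can be useful to sanity-check the label format. A throwaway sketch (ours, not a fastText helper; it relies only on the `__label__` convention described above):

```py
from collections import Counter

# Count how many __label__ tags each line of the dataset carries.
counts = Counter()
with open("cooking.stackexchange.txt", encoding="utf-8") as f:
    for line in f:
        n_labels = sum(1 for tok in line.split() if tok.startswith("__label__"))
        counts[n_labels] += 1
print(counts.most_common())
```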
-Our full dataset contains 15404 examples. Let's split it into a training set of 12404 examples and a validation set of 3000 examples:
-
-```bash
->> head -n 12404 cooking.stackexchange.txt > cooking.train
->> tail -n 3000 cooking.stackexchange.txt > cooking.valid
-```
-
-## Our first classifier
-
-We are now ready to train our first classifier:
-
-```bash
->> ./fasttext supervised -input cooking.train -output model_cooking
-Read 0M words
-Number of words:  14598
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   75109 lr:  0.000000 loss:  5.708354 eta: 0h0m
-```
-
-The `-input` command line option indicates the file containing the training examples, while the `-output` option indicates where to save the model. At the end of training, a file `model_cooking.bin`, containing the trained classifier, is created in the current directory.
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input="cooking.train")
-Read 0M words
-Number of words:  14598
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   75109 lr:  0.000000 loss:  5.708354 eta: 0h0m
-```
-The `input` argument indicates the file containing the training examples. We can now use the `model` variable to access information on the trained model.
-
-We can also call `save_model` to save it as a file and load it later with the `load_model` function.
-```py
->>> model.save_model("model_cooking.bin")
-```
-
-Now, we can test our classifier by running:
-
-```bash
->> ./fasttext predict model_cooking.bin -
-```
-
-and then typing a sentence. Let's first try the sentence:
-
-*Which baking dish is best to bake a banana bread ?*
-
-The predicted tag is `baking`, which fits this question well. Let us now try a second example:
-
-*Why not put knives in the dishwasher?*
-
-```py
->>> model.predict("Which baking dish is best to bake a banana bread ?")
-((u'__label__baking',), array([0.15613931]))
-```
-The predicted tag is `baking`, which fits this question well. Let us now try a second example:
-
-```py
->>> model.predict("Why not put knives in the dishwasher?")
-((u'__label__food-safety',), array([0.08686075]))
-```
-
-The label predicted by the model is `food-safety`, which is not relevant. Somehow, the model seems to fail on simple examples.
-
-To get a better sense of its quality, let's test it on the validation data by running:
-
-```bash
->> ./fasttext test model_cooking.bin cooking.valid
-N  3000
-P@1  0.124
-R@1  0.0541
-Number of examples: 3000
-```
-The output of fastText is the precision at one (`P@1`) and the recall at one (`R@1`).
-
-```py
->>> model.test("cooking.valid")
-(3000L, 0.124, 0.0541)
-```
-The output is the number of samples (here `3000`), the precision at one (`0.124`) and the recall at one (`0.0541`).
-
-We can also compute the precision at five and recall at five with:
-
-```bash
->> ./fasttext test model_cooking.bin cooking.valid 5
-N  3000
-P@5  0.0668
-R@5  0.146
-Number of examples: 3000
-```
-
-```py
->>> model.test("cooking.valid", k=5)
-(3000L, 0.0668, 0.146)
-```
-
-## Advanced readers: precision and recall
-
-The precision is the fraction of correct labels among the labels predicted by fastText. The recall is the fraction of the real labels that were successfully predicted. Let's take an example to make this clearer:
-
-*Why not put knives in the dishwasher?*
-
-On Stack Exchange, this sentence is labeled with three tags: `equipment`, `cleaning` and `knives`. The top five labels predicted by the model can be obtained with:
-
-```bash
->> ./fasttext predict model_cooking.bin - 5
-```
-
-```py
->>> model.predict("Why not put knives in the dishwasher?", k=5)
-((u'__label__food-safety', u'__label__baking', u'__label__equipment', u'__label__substitutions', u'__label__bread'), array([0.0857 , 0.0657, 0.0454, 0.0333, 0.0333]))
-```
-
-They are `food-safety`, `baking`, `equipment`, `substitutions` and `bread`.
-
-Thus, one out of five labels predicted by the model is correct, giving a precision of 0.20. Out of the three real labels, only one is predicted by the model, giving a recall of 0.33.
-
-For more details, see [the related Wikipedia page](https://en.wikipedia.org/wiki/Precision_and_recall).
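These two numbers are easy to verify by hand. A self-contained sketch (the helper below is ours, not a fastText function) reproducing the arithmetic for the dishwasher example:

```py
def precision_recall_at_k(predicted, true_labels, k):
    """P@k and R@k for a single example."""
    correct = len(set(predicted[:k]) & set(true_labels))
    return correct / k, correct / len(true_labels)

predicted = ["food-safety", "baking", "equipment", "substitutions", "bread"]
true_labels = ["equipment", "cleaning", "knives"]
print(precision_recall_at_k(predicted, true_labels, 5))  # (0.2, 0.333...)
```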
-
-## Making the model better
-
-The model obtained by running fastText with the default arguments is pretty bad at classifying new questions. Let's try to improve the performance by changing the default parameters.
-
-### preprocessing the data
-
-Looking at the data, we observe that some words contain uppercase letters or punctuation. One of the first steps to improve the performance of our model is to apply some simple pre-processing. A crude normalization can be obtained using command line tools such as `sed` and `tr`:
-
-```bash
->> cat cooking.stackexchange.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" > cooking.preprocessed.txt
->> head -n 12404 cooking.preprocessed.txt > cooking.train
->> tail -n 3000 cooking.preprocessed.txt > cooking.valid
-```
-
-Let's train a new model on the pre-processed data:
-
-```bash
->> ./fasttext supervised -input cooking.train -output model_cooking
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   82041 lr:  0.000000 loss:  5.671649 eta: 0h0m
-
->> ./fasttext test model_cooking.bin cooking.valid
-N  3000
-P@1  0.164
-R@1  0.0717
-Number of examples: 3000
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input="cooking.train")
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   82041 lr:  0.000000 loss:  5.671649 eta: 0h0m
-
->>> model.test("cooking.valid")
-(3000L, 0.164, 0.0717)
-```
-
-We observe that thanks to the pre-processing, the vocabulary is smaller (from 14k words to 9k). The precision also goes up by 4 points!
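The same crude normalization can also be done from Python if you prefer to keep everything in one script. A rough equivalent of the `sed`/`tr` pipeline above (our sketch, not a fastText helper):

```py
import re

def normalize(line):
    # Pad the same punctuation characters with spaces, then lowercase.
    line = re.sub(r"([.!?,'/()])", r" \1 ", line)
    return line.lower()

print(normalize("Which baking dish is best?"))  # 'which baking dish is best ? '
```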
-
-### more epochs and larger learning rate
-
-By default, fastText sees each training example only five times during training, which is pretty small, given that our training set only has 12k training examples. The number of times each example is seen (also known as the number of epochs) can be increased using the `-epoch` option:
-
-```bash
->> ./fasttext supervised -input cooking.train -output model_cooking -epoch 25
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   77633 lr:  0.000000 loss:  7.147976 eta: 0h0m
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input="cooking.train", epoch=25)
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   77633 lr:  0.000000 loss:  7.147976 eta: 0h0m
-```
-
-Let's test the new model:
-
-```bash
->> ./fasttext test model_cooking.bin cooking.valid
-N  3000
-P@1  0.501
-R@1  0.218
-Number of examples: 3000
-```
-
-```py
->>> model.test("cooking.valid")
-(3000L, 0.501, 0.218)
-```
-
-This is much better! Another way to change the learning speed of our model is to increase (or decrease) the learning rate of the algorithm. This corresponds to how much the model changes after processing each example. A learning rate of 0 would mean that the model does not change at all, and thus, does not learn anything. Good values of the learning rate are in the range `0.1 - 1.0`.
-
-```bash
->> ./fasttext supervised -input cooking.train -output model_cooking -lr 1.0
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   81469 lr:  0.000000 loss:  6.405640 eta: 0h0m
-
->> ./fasttext test model_cooking.bin cooking.valid
-N  3000
-P@1  0.563
-R@1  0.245
-Number of examples: 3000
-```
-
-```py
->>> model = fasttext.train_supervised(input="cooking.train", lr=1.0)
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   81469 lr:  0.000000 loss:  6.405640 eta: 0h0m
-
->>> model.test("cooking.valid")
-(3000L, 0.563, 0.245)
-```
-
-Even better! Let's try both together:
-
-```bash
->> ./fasttext supervised -input cooking.train -output model_cooking -lr 1.0 -epoch 25
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   76394 lr:  0.000000 loss:  4.350277 eta: 0h0m
-
->> ./fasttext test model_cooking.bin cooking.valid
-N  3000
-P@1  0.585
-R@1  0.255
-Number of examples: 3000
-```
-
-```py
->>> model = fasttext.train_supervised(input="cooking.train", lr=1.0, epoch=25)
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   76394 lr:  0.000000 loss:  4.350277 eta: 0h0m
-
->>> model.test("cooking.valid")
-(3000L, 0.585, 0.255)
-```
-
-Let us now add a few more features to improve our performance even further!
-
-### word n-grams
-
-Finally, we can improve the performance of a model by using word bigrams, instead of just unigrams. This is especially important for classification problems where word order is important, such as sentiment analysis.
-
-```bash
->> ./fasttext supervised -input cooking.train -output model_cooking -lr 1.0 -epoch 25 -wordNgrams 2
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   75366 lr:  0.000000 loss:  3.226064 eta: 0h0m
-
->> ./fasttext test model_cooking.bin cooking.valid
-N  3000
-P@1  0.599
-R@1  0.261
-Number of examples: 3000
-```
-
-```py
->>> model = fasttext.train_supervised(input="cooking.train", lr=1.0, epoch=25, wordNgrams=2)
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread:   75366 lr:  0.000000 loss:  3.226064 eta: 0h0m
-
->>> model.test("cooking.valid")
-(3000L, 0.599, 0.261)
-```
-
-With a few steps, we were able to go from a precision at one of 12.4% to 59.9%. Important steps included:
-
-* preprocessing the data;
-* changing the number of epochs (using the option `-epoch`, standard range `[5 - 50]`);
-* changing the learning rate (using the option `-lr`, standard range `[0.1 - 1.0]`);
-* using word n-grams (using the option `-wordNgrams`, standard range `[1 - 5]`).
-
-## Advanced readers: What is a Bigram?
-
-A 'unigram' refers to a single indivisible unit, or token, usually used as an input to a model. For example, a unigram can be a word or a letter depending on the model. In fastText, we work at the word level and thus unigrams are words.
-
-Similarly, we denote by 'bigram' the concatenation of 2 consecutive tokens or words, and by 'n-gram' the concatenation of any n consecutive tokens.
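A toy sketch (independent of fastText) makes the definition concrete; the next paragraph walks through the same example:

```py
def word_ngrams(tokens, n):
    """All n-grams of a token list, joined as strings."""
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

tokens = "Last donut of the night".split()
print(word_ngrams(tokens, 1))  # ['Last', 'donut', 'of', 'the', 'night']
print(word_ngrams(tokens, 2))  # ['Last donut', 'donut of', 'of the', 'the night']
```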
-
-For example, in the sentence 'Last donut of the night', the unigrams are 'last', 'donut', 'of', 'the' and 'night'. The bigrams are: 'Last donut', 'donut of', 'of the' and 'the night'.
-
-Bigrams are particularly interesting because, for most sentences, you can reconstruct the order of the words just by looking at a bag of n-grams.
-
-Let us illustrate this with a simple exercise: given the following bigrams, try to reconstruct the original sentence: 'all out', 'I am', 'of bubblegum', 'out of' and 'am all'.
-
-## Scaling things up
-
-Since we are training our model on a few thousand examples, the training only takes a few seconds. But training models on larger datasets, with more labels, can start to be too slow. A potential solution to make the training faster is to use the [hierarchical softmax](#advanced-readers-hierarchical-softmax) instead of the regular softmax. This can be done with the option `-loss hs`:
-
-```bash
->> ./fasttext supervised -input cooking.train -output model_cooking -lr 1.0 -epoch 25 -wordNgrams 2 -bucket 200000 -dim 50 -loss hs
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread: 2199406 lr:  0.000000 loss:  1.718807 eta: 0h0m
-```
-
-```py
->>> model = fasttext.train_supervised(input="cooking.train", lr=1.0, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='hs')
-Read 0M words
-Number of words:  9012
-Number of labels: 734
-Progress: 100.0% words/sec/thread: 2199406 lr:  0.000000 loss:  1.718807 eta: 0h0m
-```
-
-Training should now take less than a second.
-
-## Advanced readers: hierarchical softmax
-
-The hierarchical softmax is a loss function that approximates the softmax with a much faster computation.
-
-The idea is to build a binary tree whose leaves correspond to the labels. Each intermediate node has a binary decision activation (e.g. sigmoid) that is trained, and predicts if we should go to the left or to the right. The probability of the output unit is then given by the product of the probabilities of the intermediate nodes along the path from the root to the output unit's leaf.
-
-For a detailed explanation, you can have a look at [this video](https://www.youtube.com/watch?v=B95LTf2rVWM).
-
-In fastText, we use a Huffman tree, so that the lookup time is faster for more frequent outputs and thus the average lookup time for the output is optimal.
-
-## Multi-label classification
-
-When we want to assign a document to multiple labels, we can still use the softmax loss and play with the parameters for prediction, namely the number of labels to predict and the threshold for the predicted probability. However, playing with these arguments can be tricky and unintuitive since the probabilities must sum to 1.
-
-A convenient way to handle multiple labels is to use independent binary classifiers for each label. This can be done with `-loss one-vs-all` or `-loss ova`:
-
-```bash
->> ./fasttext supervised -input cooking.train -output model_cooking -lr 0.5 -epoch 25 -wordNgrams 2 -bucket 200000 -dim 50 -loss one-vs-all
-Read 0M words
-Number of words:  14543
-Number of labels: 735
-Progress: 100.0% words/sec/thread:   72104 lr:  0.000000 loss:  4.340807 ETA: 0h 0m
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_supervised(input="cooking.train", lr=0.5, epoch=25, wordNgrams=2, bucket=200000, dim=50, loss='ova')
-Read 0M words
-Number of words:  14543
-Number of labels: 735
-Progress: 100.0% words/sec/thread:   72104 lr:  0.000000 loss:  4.340807 ETA: 0h 0m
-```
-
-It is a good idea to decrease the learning rate compared to other loss functions.
-
-Now let's have a look at our predictions: we want as many predictions as possible (argument `-1`) and we want only labels with a probability higher than or equal to `0.5`:
-
-```bash
->> ./fasttext predict-prob model_cooking.bin - -1 0.5
-```
-and then type the sentence:
-
-*Which baking dish is best to bake a banana bread ?*
-
-we get:
-```
-__label__baking 1.00000 __label__bananas 0.939923 __label__bread 0.592677
-```
-
-```py
->>> model.predict("Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5)
-((u'__label__baking', u'__label__bananas', u'__label__bread'), array([1.00000, 0.939923, 0.592677]))
-```
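To make the thresholding explicit, the pair of label and probability arrays returned above can be zipped into a plain dictionary (a small sketch on top of the documented `predict` call):

```py
labels, probs = model.predict(
    "Which baking dish is best to bake a banana bread ?", k=-1, threshold=0.5)
# Strip the default __label__ prefix and pair each label with its probability.
assigned = {l.replace("__label__", ""): float(p) for l, p in zip(labels, probs)}
print(assigned)  # e.g. {'baking': 1.0, 'bananas': 0.94, 'bread': 0.59}
```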
-
-We can also evaluate our results with the `test` command:
-```bash
->> ./fasttext test model_cooking.bin cooking.valid -1 0.5
-N  3000
-P@-1  0.702
-R@-1  0.2
-Number of examples: 3000
-```
-and play with the threshold to obtain the desired precision/recall trade-off:
-
-```bash
->> ./fasttext test model_cooking.bin cooking.valid -1 0.1
-N  3000
-P@-1  0.591
-R@-1  0.272
-Number of examples: 3000
-```
-
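Such a sweep is easy to script with the Python API (a sketch, assuming the model trained above and a fastText version whose `test` accepts `k` and `threshold` arguments):

```py
import fasttext

model = fasttext.load_model("model_cooking.bin")
for threshold in (0.1, 0.3, 0.5, 0.7):
    n, precision, recall = model.test("cooking.valid", k=-1, threshold=threshold)
    print("threshold=%.1f  P@-1=%.3f  R@-1=%.3f" % (threshold, precision, recall))
```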
-We can also evaluate our results with the `test` function:
-```py
->>> model.test("cooking.valid", k=-1)
-(3000L, 0.702, 0.2)
-```
-
-## Conclusion
-
-In this tutorial, we gave a brief overview of how to use fastText to train powerful text classifiers. We also took a quick look at some of the most important options to tune.
diff --git a/docs/support.md b/docs/support.md
deleted file mode 100644
index 46ca97147..000000000
--- a/docs/support.md
+++ /dev/null
@@ -1,58 +0,0 @@
----
-id: support
-title: Get started
----
-
-## What is fastText?
-
-fastText is a library for efficient learning of word representations and sentence classification.
-
-## Requirements
-
-fastText builds on modern Mac OS and Linux distributions.
-Since it uses C++11 features, it requires a compiler with good C++11 support.
-These include:
-
-* (gcc-4.6.3 or newer) or (clang-3.3 or newer)
-
-Compilation is carried out using a Makefile, so you will need to have a working **make**.
-For the word-similarity evaluation script you will need:
-
-* python 2.6 or newer
-* numpy & scipy
-
-## Building fastText as a command line tool
-
-In order to build `fastText`, use the following:
-
-```bash
-$ git clone https://github.com/facebookresearch/fastText.git
-$ cd fastText
-$ make
-```
-
-This will produce object files for all the classes as well as the main binary `fasttext`.
-If you do not plan on using the default system-wide compiler, update the two macros defined at the beginning of the Makefile (CC and INCLUDES).
-
-## Building the `fasttext` python module
-
-In order to build the `fasttext` module for python, use the following:
-
-```bash
-$ git clone https://github.com/facebookresearch/fastText.git
-$ cd fastText
-$ sudo pip install .
-$ # or :
-$ sudo python setup.py install
-```
-
-Then verify the installation went well:
-```bash
-$ python
-Python 2.7.15 |(default, May 1 2018, 18:37:05)
-Type "help", "copyright", "credits" or "license" for more information.
->>> import fasttext
->>>
-```
-If you don't see any error message, the installation was successful.
diff --git a/docs/unsupervised-tutorials.md b/docs/unsupervised-tutorials.md
deleted file mode 100644
index 278c1e1ec..000000000
--- a/docs/unsupervised-tutorials.md
+++ /dev/null
@@ -1,483 +0,0 @@
----
-id: unsupervised-tutorial
-title: Word representations
----
-A popular idea in modern machine learning is to represent words by vectors. These vectors capture hidden information about a language, like word analogies or semantics. They are also used to improve the performance of text classifiers.
-
-In this tutorial, we show how to build these word vectors with the fastText tool. To download and install fastText, follow the first steps of [the tutorial on text classification](https://fasttext.cc/docs/en/supervised-tutorial.html).
-
-## Getting the data
-
-In order to compute word vectors, you need a large text corpus. Depending on the corpus, the word vectors will capture different information. In this tutorial, we focus on Wikipedia's articles but other sources could be considered, like news or web crawls (more examples [here](http://statmt.org/)). To download a raw dump of Wikipedia, run the following command:
-
-```bash
-wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
-```
-
-Downloading the Wikipedia corpus takes some time. Instead, let's restrict our study to the first 1 billion bytes of English Wikipedia.
-They can be found on Matt Mahoney's [website](http://mattmahoney.net/):
-
-```bash
-$ mkdir data
-$ wget -c http://mattmahoney.net/dc/enwik9.zip -P data
-$ unzip data/enwik9.zip -d data
-```
-
-A raw Wikipedia dump contains a lot of HTML / XML data. We pre-process it with the wikifil.pl script bundled with fastText (this script was originally developed by Matt Mahoney, and can be found on his [website](http://mattmahoney.net/)).
-
-```bash
-$ perl wikifil.pl data/enwik9 > data/fil9
-```
-
-We can check the file by running the following command:
-
-```bash
-$ head -c 80 data/fil9
-anarchism originated as a term of abuse first used against early working class
-```
-
-The text is nicely pre-processed and can be used to learn our word vectors.
-
-## Training word vectors
-
-Learning word vectors on this data can now be achieved with a single command:
-
-```bash
-$ mkdir result
-$ ./fasttext skipgram -input data/fil9 -output result/fil9
-```
-
-To decompose this command line: ./fasttext calls the fastText binary (see how to install fastText [here](https://fasttext.cc/docs/en/support.html)) with the 'skipgram' model (it can also be 'cbow'). We then specify the required options '-input' for the location of the data and '-output' for the location where the word representations will be saved.
-
-While fastText is running, the progress and estimated time to completion are shown on your screen. Once the program finishes, there should be two files in the result directory:
-
-```bash
-$ ls -l result
--rw-r-r-- 1 bojanowski 1876110778 978480850 Dec 20 11:01 fil9.bin
--rw-r-r-- 1 bojanowski 1876110778 190004182 Dec 20 11:01 fil9.vec
-```
-
-The `fil9.bin` file is a binary file that stores the whole fastText model and can be subsequently loaded. The `fil9.vec` file is a text file that contains the word vectors, one per line for each word in the vocabulary:
-
-```bash
-$ head -n 4 result/fil9.vec
-218316 100
-the -0.10363 -0.063669 0.032436 -0.040798 0.53749 0.00097867 0.10083 0.24829 ...
-of -0.0083724 0.0059414 -0.046618 -0.072735 0.83007 0.038895 -0.13634 0.60063 ...
-one 0.32731 0.044409 -0.46484 0.14716 0.7431 0.24684 -0.11301 0.51721 0.73262 ...
-```
-
-The first line is a header containing the number of words and the dimensionality of the vectors. The subsequent lines are the word vectors for all words in the vocabulary, sorted by decreasing frequency.
-
-Learning word vectors on this data can now be achieved with a single command:
-```py
->>> import fasttext
->>> model = fasttext.train_unsupervised('data/fil9')
-```
-While fastText is running, the progress and estimated time to completion are shown on your screen. Once the training finishes, the `model` variable contains information on the trained model and can be used for querying:
-
-```py
->>> model.words
-[u'the', u'of', u'one', u'zero', u'and', u'in', u'two', u'a', u'nine', u'to', u'is', ...
-```
-It returns all words in the vocabulary, sorted by decreasing frequency. We can get the vector of a word by:
-```py
->>> model.get_word_vector("the")
-array([-0.03087516,  0.09221972,  0.17660329,  0.17308897,  0.12863874,
-        0.13912526, -0.09851588,  0.00739991,  0.37038437, -0.00845221,
-        ...
-       -0.21184735, -0.05048715, -0.34571868,  0.23765688,  0.23726143],
-      dtype=float32)
-```
-
-We can save this model to disk as a binary file:
-```py
->>> model.save_model("result/fil9.bin")
-```
-and reload it later instead of training again:
-```py
-$ python
->>> import fasttext
->>> model = fasttext.load_model("result/fil9.bin")
-```
-
-## Advanced readers: skipgram versus cbow
-
-fastText provides two models for computing word representations: skipgram and cbow ('**c**ontinuous-**b**ag-**o**f-**w**ords').
-
-The skipgram model learns to predict a target word thanks to a nearby word. On the other hand, the cbow model predicts the target word according to its context. The context is represented as a bag of the words contained in a fixed size window around the target word.
-
-Let us illustrate this difference with an example: given the sentence *'Poets have been mysteriously silent on the subject of cheese'* and the target word '*silent*', a skipgram model tries to predict the target using a random close-by word, like '*subject*' or '*mysteriously*'. The cbow model takes all the words in a surrounding window, like {*been*, *mysteriously*, *on*, *the*}, and uses the sum of their vectors to predict the target. The figure below summarizes this difference with another example.
-
-![cbow vs skipgram](https://fasttext.cc/img/cbo_vs_skipgram.png)
-To train a cbow model with fastText, you run the following command:
-
-```bash
-./fasttext cbow -input data/fil9 -output result/fil9
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_unsupervised('data/fil9', "cbow")
-```
-
-In practice, we observe that skipgram models work better with subword information than cbow.
-
-## Advanced readers: playing with the parameters
-
-So far, we ran fastText with the default parameters, but depending on the data, these parameters may not be optimal. Let us give an introduction to some of the key parameters for word vectors.
-
-The most important parameters of the model are its dimension and the range of size for the subwords. The dimension (*dim*) controls the size of the vectors: the larger they are, the more information they can capture, but they require more data to be learned. And if they are too large, they are harder and slower to train. By default, we use 100 dimensions, but any value in the 100-300 range is popular. The subwords are all the substrings contained in a word between the minimum size (*minn*) and the maximal size (*maxn*). By default, we take all the subwords between 3 and 6 characters, but other ranges could be more appropriate for different languages:
-
-```bash
-$ ./fasttext skipgram -input data/fil9 -output result/fil9 -minn 2 -maxn 5 -dim 300
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_unsupervised('data/fil9', minn=2, maxn=5, dim=300)
-```
-
-Depending on the quantity of data you have, you may want to change the parameters of the training. The *epoch* parameter controls how many times the model will loop over your data. By default, we loop over the dataset 5 times. If your dataset is extremely massive, you may want to loop over it less often. Another important parameter is the learning rate *lr*. The higher the learning rate is, the faster the model converges to a solution, but at the risk of overfitting to the dataset. The default value is 0.05, which is a good compromise.
-If you want to play with it, we suggest staying in the range [0.01, 1]:
-
-```bash
-$ ./fasttext skipgram -input data/fil9 -output result/fil9 -epoch 1 -lr 0.5
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_unsupervised('data/fil9', epoch=1, lr=0.5)
-```
-
-Finally, fastText is multi-threaded and uses 12 threads by default. If you have fewer CPU cores (say 4), you can easily set the number of threads using the *thread* flag:
-
-```bash
-$ ./fasttext skipgram -input data/fil9 -output result/fil9 -thread 4
-```
-
-```py
->>> import fasttext
->>> model = fasttext.train_unsupervised('data/fil9', thread=4)
-```
-
-## Printing word vectors
-
-Searching and printing word vectors directly from the `fil9.vec` file is cumbersome. Fortunately, there is a `print-word-vectors` functionality in fastText.
-
-For example, we can print the word vectors of the words *asparagus*, *pidgey* and *yellow* with the following command:
-
-```bash
-$ echo "asparagus pidgey yellow" | ./fasttext print-word-vectors result/fil9.bin
-asparagus 0.46826 -0.20187 -0.29122 -0.17918 0.31289 -0.31679 0.17828 -0.04418 ...
-pidgey -0.16065 -0.45867 0.10565 0.036952 -0.11482 0.030053 0.12115 0.39725 ...
-yellow -0.39965 -0.41068 0.067086 -0.034611 0.15246 -0.12208 -0.040719 -0.30155 ...
-```
-
-```py
->>> [model.get_word_vector(x) for x in ["asparagus", "pidgey", "yellow"]]
-[array([-0.25751096, -0.18716481,  0.06921121,  0.06455903,  0.29168844,
-         0.15426874, -0.33448914, -0.427215  ,  0.7813013 , -0.10600132,
-         ...
-         0.37090245,  0.39266172, -0.4555302 ,  0.27452755,  0.00467369],
-       dtype=float32),
- array([-0.20613593, -0.25325796, -0.2422259 , -0.21067499,  0.32879013,
-         0.7269511 ,  0.3782259 ,  0.11274897,  0.246764  , -0.6423613 ,
-         ...
-         0.46302193,  0.2530962 , -0.35795924,  0.5755718 ,  0.09843876],
-       dtype=float32),
- array([-0.304823  ,  0.2543754 , -0.2198013 , -0.25421786,  0.11219151,
-         0.38286993, -0.22636674, -0.54023844,  0.41095474, -0.3505803 ,
-         ...
-         0.54788435,  0.36740595, -0.5678512 ,  0.07523401, -0.08701935],
-       dtype=float32)]
-```
-
-A nice feature is that you can also query for words that did not appear in your data! Indeed, words are represented by the sum of their substrings. As long as the unknown word is made of known substrings, there is a representation of it!
-
-As an example, let's try with a misspelled word:
-
-```bash
-$ echo "enviroment" | ./fasttext print-word-vectors result/fil9.bin
-```
-
-```py
->>> model.get_word_vector("enviroment")
-```
-
-You still get a word vector for it! But how good is it? Let's find out in the next sections!
-
-## Nearest neighbor queries
-
-A simple way to check the quality of a word vector is to look at its nearest neighbors. This gives an intuition of the type of semantic information the vectors are able to capture.
-
-This can be achieved with the nearest neighbor (*nn*) functionality. For example, we can query the 10 nearest neighbors of a word by running the following command:
-
-```bash
-$ ./fasttext nn result/fil9.bin
-Pre-computing word vectors... done.
-```
-
-Then we are prompted to type our query word, let us try *asparagus*:
-
-```bash
-Query word? asparagus
-beetroot 0.812384
-tomato 0.806688
-horseradish 0.805928
-spinach 0.801483
-licorice 0.791697
-lingonberries 0.781507
-asparagales 0.780756
-lingonberry 0.778534
-celery 0.774529
-beets 0.773984
-```
-
-```py
->>> model.get_nearest_neighbors('asparagus')
-[(0.812384, u'beetroot'), (0.806688, u'tomato'), (0.805928, u'horseradish'), (0.801483, u'spinach'), (0.791697, u'licorice'), (0.781507, u'lingonberries'), (0.780756, u'asparagales'), (0.778534, u'lingonberry'), (0.774529, u'celery'), (0.773984, u'beets')]
-```
-
-Nice! It seems that vegetable vectors are similar. Note that the nearest neighbor is the word *asparagus* itself, which means that this word appeared in the dataset. What about pokemons?
-
-```bash
-Query word? pidgey
-pidgeot 0.891801
-pidgeotto 0.885109
-pidge 0.884739
-pidgeon 0.787351
-pok 0.781068
-pikachu 0.758688
-charizard 0.749403
-squirtle 0.742582
-beedrill 0.741579
-charmeleon 0.733625
-```
-
-```py
->>> model.get_nearest_neighbors('pidgey')
-[(0.891801, u'pidgeot'), (0.885109, u'pidgeotto'), (0.884739, u'pidge'), (0.787351, u'pidgeon'), (0.781068, u'pok'), (0.758688, u'pikachu'), (0.749403, u'charizard'), (0.742582, u'squirtle'), (0.741579, u'beedrill'), (0.733625, u'charmeleon')]
-```
-
-Different evolutions of the same Pokemon have close-by vectors! But what about our misspelled word? Is its vector close to anything reasonable? Let's find out:
-
-```bash
-Query word? enviroment
-enviromental 0.907951
-environ 0.87146
-enviro 0.855381
-environs 0.803349
-environnement 0.772682
-enviromission 0.761168
-realclimate 0.716746
-environment 0.702706
-acclimatation 0.697196
-ecotourism 0.697081
-```
-
-```py
->>> model.get_nearest_neighbors('enviroment')
-[(0.907951, u'enviromental'), (0.87146, u'environ'), (0.855381, u'enviro'), (0.803349, u'environs'), (0.772682, u'environnement'), (0.761168, u'enviromission'), (0.716746, u'realclimate'), (0.702706, u'environment'), (0.697196, u'acclimatation'), (0.697081, u'ecotourism')]
-```
-
-Thanks to the information contained within the word, the vector of our misspelled word matches reasonable words! It is not perfect, but the main information has been captured.
-
-## Advanced reader: measure of similarity
-
-In order to find nearest neighbors, we need to compute a similarity score between words. Our words are represented by continuous word vectors and we can thus apply simple similarities to them. In particular, we use the cosine of the angle between two vectors. This similarity is computed for all words in the vocabulary, and the 10 most similar words are shown. Of course, if the word appears in the vocabulary, it will appear on top, with a similarity of 1.
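As an illustration, the score itself is a one-liner with numpy (a toy sketch, not fastText's internal implementation):

```py
import numpy as np

def cosine(u, v):
    """Cosine of the angle between two vectors."""
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

u = np.array([1.0, 0.0, 1.0])
print(cosine(u, u))                          # a word compared with itself gives 1.0
print(cosine(u, np.array([1.0, 0.2, 0.9])))  # nearby directions score close to 1
```

The word analogies below are ranked with the same similarity, applied to the combined vector `A - B + C`.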
-
-## Word analogies
-
-In a similar spirit, one can play around with word analogies. For example, we can see if our model can guess what is to France what Berlin is to Germany.
-
-This can be done with the *analogies* functionality. It takes a word triplet (like *Germany Berlin France*) and outputs the analogy:
-
-```bash
-$ ./fasttext analogies result/fil9.bin
-Pre-computing word vectors... done.
-Query triplet (A - B + C)? berlin germany france
-paris 0.896462
-bourges 0.768954
-louveciennes 0.765569
-toulouse 0.761916
-valenciennes 0.760251
-montpellier 0.752747
-strasbourg 0.744487
-meudon 0.74143
-bordeaux 0.740635
-pigneaux 0.736122
-```
-
-```py
->>> model.get_analogies("berlin", "germany", "france")
-[(0.896462, u'paris'), (0.768954, u'bourges'), (0.765569, u'louveciennes'), (0.761916, u'toulouse'), (0.760251, u'valenciennes'), (0.752747, u'montpellier'), (0.744487, u'strasbourg'), (0.74143, u'meudon'), (0.740635, u'bordeaux'), (0.736122, u'pigneaux')]
-```
-
-The answer provided by our model is *Paris*, which is correct. Let's have a look at a less obvious example:
-
-```bash
-Query triplet (A - B + C)? psx sony nintendo
-gamecube 0.803352
-nintendogs 0.792646
-playstation 0.77344
-sega 0.772165
-gameboy 0.767959
-arcade 0.754774
-playstationjapan 0.753473
-gba 0.752909
-dreamcast 0.74907
-famicom 0.745298
-```
-
-```py
->>> model.get_analogies("psx", "sony", "nintendo")
-[(0.803352, u'gamecube'), (0.792646, u'nintendogs'), (0.77344, u'playstation'), (0.772165, u'sega'), (0.767959, u'gameboy'), (0.754774, u'arcade'), (0.753473, u'playstationjapan'), (0.752909, u'gba'), (0.74907, u'dreamcast'), (0.745298, u'famicom')]
-```
-
-Our model considers that the *nintendo* analogy of a *psx* is the *gamecube*, which seems reasonable. Of course, the quality of the analogies depends on the dataset used to train the model: one can only hope to cover the fields present in the dataset.
-
-## Importance of character n-grams
-
-Using subword-level information is particularly interesting to build vectors for unknown words. For example, the word *gearshift* does not exist on Wikipedia but we can still query its closest existing words:
-
-```bash
-Query word? gearshift
-gearing 0.790762
-flywheels 0.779804
-flywheel 0.777859
-gears 0.776133
-driveshafts 0.756345
-driveshaft 0.755679
-daisywheel 0.749998
-wheelsets 0.748578
-epicycles 0.744268
-gearboxes 0.73986
-```
-
-```py
->>> model.get_nearest_neighbors('gearshift')
-[(0.790762, u'gearing'), (0.779804, u'flywheels'), (0.777859, u'flywheel'), (0.776133, u'gears'), (0.756345, u'driveshafts'), (0.755679, u'driveshaft'), (0.749998, u'daisywheel'), (0.748578, u'wheelsets'), (0.744268, u'epicycles'), (0.73986, u'gearboxes')]
-```
-
-Most of the retrieved words share substantial substrings, but a few are actually quite different, like *cogwheel*. You can try other words like *sunbathe* or *grandnieces*.
-
-Now that we have seen the usefulness of subword information for unknown words, let's check how it compares to a model that does not use subword information. To train a model without subwords, just run the following command:
-
-```bash
-$ ./fasttext skipgram -input data/fil9 -output result/fil9-none -maxn 0
-```
-The results are saved in result/fil9-none.vec and result/fil9-none.bin.
-
-```py
->>> model_without_subwords = fasttext.train_unsupervised('data/fil9', maxn=0)
-```
-
-To illustrate the difference, let us take an uncommon word in Wikipedia, like *accomodation*, which is a misspelling of *accommodation*. Here are the nearest neighbors obtained without subwords:
-
-```bash
-$ ./fasttext nn result/fil9-none.bin
-Query word? accomodation
-sunnhordland 0.775057
-accomodations 0.769206
-administrational 0.753011
-laponian 0.752274
-ammenities 0.750805
-dachas 0.75026
-vuosaari 0.74172
-hostelling 0.739995
-greenbelts 0.733975
-asserbo 0.732465
-```
-
-```py
->>> model_without_subwords.get_nearest_neighbors('accomodation')
-[(0.775057, u'sunnhordland'), (0.769206, u'accomodations'), (0.753011, u'administrational'), (0.752274, u'laponian'), (0.750805, u'ammenities'), (0.75026, u'dachas'), (0.74172, u'vuosaari'), (0.739995, u'hostelling'), (0.733975, u'greenbelts'), (0.732465, u'asserbo')]
-```
-
-The result does not make much sense: most of these words are unrelated. On the other hand, using subword information gives the following list of nearest neighbors:
-
-```bash
-Query word? accomodation
-accomodations 0.96342
-accommodation 0.942124
-accommodations 0.915427
-accommodative 0.847751
-accommodating 0.794353
-accomodated 0.740381
-amenities 0.729746
-catering 0.725975
-accomodate 0.703177
-hospitality 0.701426
-```
-
-```py
->>> model.get_nearest_neighbors('accomodation')
-[(0.96342, u'accomodations'), (0.942124, u'accommodation'), (0.915427, u'accommodations'), (0.847751, u'accommodative'), (0.794353, u'accommodating'), (0.740381, u'accomodated'), (0.729746, u'amenities'), (0.725975, u'catering'), (0.703177, u'accomodate'), (0.701426, u'hospitality')]
-```
-
-The nearest neighbors capture different variations around the word *accommodation*. We also get semantically related words such as *amenities* or *catering*.
-
-## Conclusion
-
-In this tutorial, we showed how to obtain word vectors from Wikipedia. This can be done for any language, and we provide [pre-trained models](https://fasttext.cc/docs/en/pretrained-vectors.html) with the default settings for 294 of them.
diff --git a/docs/webassembly-module.md b/docs/webassembly-module.md
deleted file mode 100644
index c1e984de1..000000000
--- a/docs/webassembly-module.md
+++ /dev/null
@@ -1,338 +0,0 @@
----
-id: webassembly-module
-title: WebAssembly module
----
-
-In this document we present how to use fastText in javascript with WebAssembly.
-
-## Table of contents
-
-* [Requirements](#requirements)
-* [Building WebAssembly binaries](#building-webassembly-binaries)
-* [Build a webpage that uses fastText](#build-a-webpage-that-uses-fasttext)
-* [Load a model](#load-a-model)
-* [Train a model](#train-a-model)
-   * [Disclaimer](#disclaimer)
-   * [Text classification](#text-classification)
-   * [Word representations](#word-representations)
-* [Quantized models](#quantized-models)
-* [API](#api)
-   * [`model` object](#model-object)
-   * [`loadModel`](#loadmodel)
-   * [`trainSupervised`](#trainsupervised)
-   * [`trainUnsupervised`](#trainunsupervised)
-
-# Requirements
-
-For building [fastText](https://fasttext.cc/) with WebAssembly bindings, we will need:
- - a compiler with good C++11 support, since it uses C\++11 features,
- - [emscripten](https://emscripten.org/),
 - - a [browser that supports WebAssembly](https://caniuse.com/#feat=wasm).
-
-# Building WebAssembly binaries
-
-First, download and install the emscripten sdk as [described here](https://emscripten.org/docs/getting_started/downloads.html#installation-instructions).
-
-We need to make sure we have activated the PATH for emscripten:
-```bash
-$ source /path/to/emsdk/emsdk_env.sh
-```
-
-Clone the [fastText repository](https://github.com/facebookresearch/fastText/):
-
-```bash
-$ git clone git@github.com:facebookresearch/fastText.git
-```
-
-Build the WebAssembly binaries:
-```bash
-$ cd fastText
-$ make wasm
-```
-
-This will create `fasttext_wasm.wasm` and `fasttext_wasm.js` in the `webassembly` folder.
-
-- `fasttext_wasm.wasm` is the binary file that will be loaded in the webassembly's virtual machine.
-- `fasttext_wasm.js` is a javascript file built by emscripten, that helps to load the `fasttext_wasm.wasm` file in the virtual machine and provides some helper functions.
-- `fasttext.js` is the wrapper that provides a nice API for fastText.
-
-As the user of the library, we will interact with classes and methods defined in `fasttext.js`. We won't deal with the `fasttext_wasm.*` files, but they are necessary to run fastText in the javascript VM.
-
-# Build a webpage that uses fastText
-
-In this section we are going to build a minimal HTML page that loads the fastText WebAssembly module.
-
-At the root of the repository, create a folder `webassembly-test`, and copy the files mentioned in the previous section:
-
-```bash
-$ mkdir webassembly-test
-$ cp webassembly/fasttext_wasm.wasm webassembly-test/
-$ cp webassembly/fasttext_wasm.js webassembly-test/
-$ cp webassembly/fasttext.js webassembly-test/
-```
-
-Inside that folder, create a `test.html` file containing:
-```html
-<!doctype html>
-<html>
-<body>
-    <!-- Minimal reconstruction (the original snippet was lost in this copy of the page):
-         load fasttext.js as an ES6 module and log the instance once the runtime is ready. -->
-    <script type="module">
-        import {FastText, addOnPostRun} from "./fasttext.js";
-
-        addOnPostRun(() => {
-            let ft = new FastText();
-            console.log(ft);
-        });
-    </script>
-</body>
-</html>
-```
-
-It is important to add the attribute `type="module"` to the script tag, because we use ES6 style imports. `addOnPostRun` is a function that helps to provide a handler that is called when fastText is successfully loaded in the virtual machine. Once we are called inside that function, we can create an instance of `FastText`, which we will use to access the API.
-
-Let's test it.
-
-Opening `test.html` directly in the browser won't work since we are dynamically loading webassembly resources. The `test.html` file must be served from a webserver. The easiest way to achieve this is to use python's simple http server module:
-
-```bash
-$ cd webassembly-test
-$ python -m SimpleHTTPServer
-```
-
-Then browse `http://localhost:8000/test.html` in your browser. If everything worked as expected, you should see `FastText {f: FastText}` in the javascript console.
-
-# Load a model
-
-In order to load a fastText model that was already trained, we can use the `loadModel` function. In the example below we use `lid.176.ftz` that you can download from [here](/docs/en/language-identification.html).
-
-Place the model file you want to load inside the same directory as the HTML file, and inside the script part:
-```javascript
-import {FastText, addOnPostRun} from "./fasttext.js";
-
-const printVector = function(predictions) {
-    // Loop body reconstructed (the comparison was lost in this copy of the page):
-    // iterate over the embind vector and log each prediction.
-    for (let i = 0; i < predictions.size(); i++) {
-        console.log(predictions.get(i));
-    }
-}
-
-addOnPostRun(() => {
-    let ft = new FastText();
-
-    const url = "lid.176.ftz";
-    ft.loadModel(url).then(model => {
-
-        console.log("Model loaded.")
-
-        let text = "Bonjour à tous. Ceci est du français";
-        console.log(text);
-        printVector(model.predict(text, 5, 0.0));
-
-        text = "Hello, world. This is english";
-        console.log(text);
-        printVector(model.predict(text, 5, 0.0));
-
-        text = "Merhaba dünya. Bu da türkçe"
-        console.log(text);
-        printVector(model.predict(text, 5, 0.0));
-    });
-});
-```
-
-The `loadModel` function returns a [Promise](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise) that resolves to a `model` object.
-We can then use the [`model` object](#model-object) to call various methods, such as `predict`.
-
-We define a `printVector` function that loops through a representation of `std::vector` in javascript, and displays the items. Here, we use it to display prediction results.
-
-You can also refer to `webassembly/doc/examples/predict.html` in the source code.
-
-# Calling other methods
-
-Once the model is loaded, you can call any method like `model.getDimension()` or `model.getSubwords(word)`. You can refer to [this](#api) section of the document for a complete API. You can also have a look at the `webassembly/doc/examples/misc.html` file in the source code for further examples.
-
-# Train a model
-
-### Disclaimer
-
-It is also possible to train a model inside the browser with fastText's WebAssembly API. The training can be slow because, at the time of writing, it is not possible to use multithreading in WebAssembly (along with dynamic memory growth). So most of the time, we would train a model with the python or command line tool, optionally quantize it, and load it in the WebAssembly module. However, training a model inside the browser can be useful for creating animations or educational tools.
-
-### Text classification
-
-Place the `cooking.train` file (as described [here](/docs/en/supervised-tutorial.html)) inside the same directory:
-
-```javascript
-import {FastText, addOnPostRun} from "./fasttext.js";
-
-const trainCallback = (progress, loss, wst, lr, eta) => {
-    console.log([progress, loss, wst, lr, eta]);
-};
-
-addOnPostRun(() => {
-    let ft = new FastText();
-
-    ft.trainSupervised("cooking.train", {
-        'lr':1.0,
-        'epoch':10,
-        'loss':'hs',
-        'wordNgrams':2,
-        'dim':50,
-        'bucket':200000
-    }, trainCallback).then(model => {
-        console.log('Trained.');
-    });
-});
-```
-
-The `trainCallback` function is called by the module to report the progress, the average training loss, the number of words per second (per thread, but there is only one thread), the learning rate, and the estimated remaining time.
-
-### Word representations
-
-Place the `fil9` file (as described [here](/docs/en/unsupervised-tutorial.html)) inside the same directory:
-
-```javascript
-import {FastText, addOnPostRun} from "./fasttext.js";
-
-const trainCallback = (progress, loss, wst, lr, eta) => {
-    console.log([progress, loss, wst, lr, eta]);
-};
-
-addOnPostRun(() => {
-    let ft = new FastText();
-
-    ft.trainUnsupervised("fil9", 'skipgram', {
-        'lr':0.1,
-        'epoch':1,
-        'loss':'ns',
-        'wordNgrams':2,
-        'dim':50,
-        'bucket':200000
-    }, trainCallback).then(model => {
-        console.log('Trained.');
-    });
-});
-```
-
-# Quantized models
-
-Quantization is a technique that reduces the size of your models. You can quantize your model as [described here](/docs/en/faqs.html#how-can-i-reduce-the-size-of-my-fasttext-models).
-
-You can load a quantized model in fastText's WebAssembly module, as we did in the ["Load a model" section](#load-a-model).
-
-In the context of the web, it is particularly useful to have smaller models since they can be downloaded much faster. You can use our autotune feature as [described here](/docs/en/autotune.html#constrain-model-size) in order to find the best trade-off between accuracy and model size that fits your needs.
-
-# API
-
-## `model` object
-
-The `trainSupervised`, `trainUnsupervised` and `loadModel` functions return a Promise that resolves to an instance of the `FastTextModel` class, which we generally call the `model` object.
-
-This object exposes several functions:
-
-```javascript
-isQuant                 // true if the model is quantized.
-getDimension            // the dimension (size) of a lookup vector (hidden layer).
-getWordVector(word)     // the vector representation of `word`.
-getSentenceVector(text) // the vector representation of `text`.
-getNearestNeighbors(word, k=10)      // nearest `k` neighbors of `word`.
-getAnalogies(wordA, wordB, wordC, k) // nearest `k` neighbors of the operation `wordA - wordB + wordC`.
-getWordId(word)         // get the word id within the dictionary.
-getSubwordId(subword)   // the index (within input matrix) a subword hashes to.
-getSubwords(word)       // the subwords and their indices.
-getInputVector(ind)     // given an index, get the corresponding vector of the Input Matrix.
-predict(text, k = 1, threshold = 0.0) // Given a string, get a list of labels and a list of corresponding
-                                      // probabilities. k controls the number of returned labels.
-getInputMatrix()        // get a reference to the full input matrix of a (non-quantized) Model.
-getOutputMatrix()       // get a reference to the full output matrix of a (non-quantized) Model.
-getWords()              // get the entire list of words of the dictionary including the frequency
-                        // of the individual words. This does not include any subwords. For that
-                        // please consult the function getSubwords.
-getLabels()             // get the entire list of labels of the dictionary including the frequency
-                        // of the individual labels.
-getLine(text)           // split a line of text into words and labels.
-saveModel()             // saves the model file in WebAssembly's in-memory FS and returns a blob
-test(url, k, threshold) // downloads the test file from the specified url, evaluates the supervised model with it.
-```
-
-You can also have a look at the `webassembly/doc/examples/misc.html` file in the source code for further examples.
-
-## `loadModel`
-
-You can load a model as follows:
-
-`ft.loadModel(url);`
-
-`loadModel` returns a [Promise](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise) that resolves to a [`model` object](#model-object).
-
-## `trainSupervised`
-
-You can train a text classification model with fastText's WebAssembly API as follows:
-
-`ft.trainSupervised(trainFile, args, trainCallback);`
-
-- `trainFile`: the url of the input file
-- `args`: a dictionary with the following keys:
-```javascript
-    lr              # learning rate [0.1]
-    dim             # size of word vectors [100]
-    ws              # size of the context window [5]
-    epoch           # number of epochs [5]
-    minCount        # minimal number of word occurrences [1]
-    minCountLabel   # minimal number of label occurrences [1]
-    minn            # min length of char ngram [0]
-    maxn            # max length of char ngram [0]
-    neg             # number of negatives sampled [5]
-    wordNgrams      # max length of word ngram [1]
-    loss            # loss function {ns, hs, softmax, ova} [softmax]
-    bucket          # number of buckets [2000000]
-    thread          # number of threads [number of cpus]
-    lrUpdateRate    # change the rate of updates for the learning rate [100]
-    t               # sampling threshold [0.0001]
-    label           # label prefix ['__label__']
-```
-- `trainCallback` is the name of the function that will be called during training to provide various information. Set this argument to `null` if you don't need a callback, or provide a function that has the following signature: `function myCallback(progress, loss, wst, lr, eta){ ... }`
-
-`trainSupervised` returns a [Promise](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise) that resolves to a [`model` object](#model-object).
-
-## `trainUnsupervised`
-
-You can train a word representation model with fastText's WebAssembly API as follows:
-
-`ft.trainUnsupervised(trainFile, modelName, args, trainCallback);`
-
-- `trainFile`: the url of the input file
-- `modelName`: must be `"cbow"` or `"skipgram"`
-- `args`: a dictionary with the following keys:
-```javascript
-    lr              # learning rate [0.05]
-    dim             # size of word vectors [100]
-    ws              # size of the context window [5]
-    epoch           # number of epochs [5]
-    minCount        # minimal number of word occurrences [5]
-    minn            # min length of char ngram [3]
-    maxn            # max length of char ngram [6]
-    neg             # number of negatives sampled [5]
-    wordNgrams      # max length of word ngram [1]
-    loss            # loss function {ns, hs, softmax, ova} [ns]
-    bucket          # number of buckets [2000000]
-    thread          # number of threads [number of cpus]
-    lrUpdateRate    # change the rate of updates for the learning rate [100]
-    t               # sampling threshold [0.0001]
-```
-- `trainCallback` is the name of the function that will be called during training to provide various information. Set this argument to `null` if you don't need a callback, or provide a function that has the following signature: `function myCallback(progress, loss, wst, lr, eta){ ... }`
-
-`trainUnsupervised` returns a [Promise](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Promise) that resolves to a [`model` object](#model-object).
diff --git a/download_model.py b/download_model.py
deleted file mode 100755
index b457f13b8..000000000
--- a/download_model.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import argparse
-
-import fasttext.util
-
-
-args = None
-
-
-def command_download(lang_id, if_exists):
-    """
-    Download pre-trained common-crawl vectors from fastText's website
-    https://fasttext.cc/docs/en/crawl-vectors.html
-    """
-    fasttext.util.download_model(lang_id, if_exists)
-
-
-def main():
-    global args
-
-    parser = argparse.ArgumentParser(
-        description='fastText helper tool to download pre-trained models.')
-    parser.add_argument("language", type=str, default="en",
-                        help="language identifier of the pre-trained vectors. For example `en` or `fr`.")
-    parser.add_argument("--overwrite", action="store_true",
-                        help="overwrite if file exists.")
-
-    args = parser.parse_args()
-
-    command_download(args.language, if_exists=(
-        'overwrite' if args.overwrite else 'strict'))
-
-
-if __name__ == '__main__':
-    main()
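For reference, the script above is a thin wrapper around `fasttext.util`, so the same download can be triggered directly from Python (a sketch of the equivalent call, mirroring `command_download`):

```py
import fasttext.util

# Equivalent of `python download_model.py en`;
# pass if_exists='overwrite' to mirror the --overwrite flag.
fasttext.util.download_model('en', if_exists='strict')
```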
-# - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -import numpy as np -from scipy import stats -import os -import math -import argparse - - -def compat_splitting(line): - return line.decode('utf8').split() - - -def similarity(v1, v2): - n1 = np.linalg.norm(v1) - n2 = np.linalg.norm(v2) - return np.dot(v1, v2) / n1 / n2 - - -parser = argparse.ArgumentParser(description='Process some integers.') -parser.add_argument( - '--model', - '-m', - dest='modelPath', - action='store', - required=True, - help='path to model' -) -parser.add_argument( - '--data', - '-d', - dest='dataPath', - action='store', - required=True, - help='path to data' -) -args = parser.parse_args() - -vectors = {} -fin = open(args.modelPath, 'rb') -for _, line in enumerate(fin): - try: - tab = compat_splitting(line) - vec = np.array(tab[1:], dtype=float) - word = tab[0] - if np.linalg.norm(vec) == 0: - continue - if not word in vectors: - vectors[word] = vec - except ValueError: - continue - except UnicodeDecodeError: - continue -fin.close() - -mysim = [] -gold = [] -drop = 0.0 -nwords = 0.0 - -fin = open(args.dataPath, 'rb') -for line in fin: - tline = compat_splitting(line) - word1 = tline[0].lower() - word2 = tline[1].lower() - nwords = nwords + 1.0 - - if (word1 in vectors) and (word2 in vectors): - v1 = vectors[word1] - v2 = vectors[word2] - d = similarity(v1, v2) - mysim.append(d) - gold.append(float(tline[2])) - else: - drop = drop + 1.0 -fin.close() - -corr = stats.spearmanr(mysim, gold) -dataset = os.path.basename(args.dataPath) -print( - "{0:20s}: {1:2.0f} (OOV: {2:2.0f}%)" - .format(dataset, corr[0] * 100, math.ceil(drop / nwords * 100.0)) -) diff --git a/fasttext.pc.in b/fasttext.pc.in deleted file mode 100644 index 6522d34e9..000000000 --- a/fasttext.pc.in +++ /dev/null @@ -1,10 +0,0 @@ -prefix=@CMAKE_INSTALL_PREFIX@ -exec_prefix=@CMAKE_INSTALL_FULL_LIBEXECDIR@ -libdir=@CMAKE_INSTALL_FULL_LIBDIR@ -includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ - -Name: @PROJECT_NAME@ -Description: Efficient learning of word representations and sentence classification -Version: @PROJECT_VERSION@ -Libs: -L${libdir} -lfasttext -Cflags: -I${includedir} diff --git a/get-wikimedia.sh b/get-wikimedia.sh deleted file mode 100755 index d39a0337f..000000000 --- a/get-wikimedia.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright (c) 2016-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# - -set -e - -normalize_text() { - sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \ - -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/
diff --git a/fasttext.pc.in b/fasttext.pc.in
deleted file mode 100644
index 6522d34e9..000000000
--- a/fasttext.pc.in
+++ /dev/null
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=@CMAKE_INSTALL_FULL_LIBEXECDIR@
-libdir=@CMAKE_INSTALL_FULL_LIBDIR@
-includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
-
-Name: @PROJECT_NAME@
-Description: Efficient learning of word representations and sentence classification
-Version: @PROJECT_VERSION@
-Libs: -L${libdir} -lfasttext
-Cflags: -I${includedir}
diff --git a/get-wikimedia.sh b/get-wikimedia.sh
deleted file mode 100755
index d39a0337f..000000000
--- a/get-wikimedia.sh
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright (c) 2016-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-#
-
-set -e
-
-normalize_text() {
-  sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
-  -e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
-  -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
-  -e 's/«/ /g' | tr 0-9 " "
-}
-
-export LANGUAGE=en_US.UTF-8
-export LC_ALL=en_US.UTF-8
-export LANG=en_US.UTF-8
-
-NOW=$(date +"%Y%m%d")
-
-ROOT="data/wikimedia/${NOW}"
-mkdir -p "${ROOT}"
-echo "Saving data in ""$ROOT"
-read -r -p "Choose a language (e.g. en, bh, fr, etc.): " choice
-LANG="$choice"
-echo "Chosen language: ""$LANG"
-read -r -p "Continue to download (WARNING: This might be big and can take a long time!)(y/n)? " choice
-case "$choice" in
-  y|Y ) echo "Starting download...";;
-  n|N ) echo "Exiting";exit 1;;
-  * ) echo "Invalid answer";exit 1;;
-esac
-wget -c "https://dumps.wikimedia.org/""$LANG""wiki/latest/""${LANG}""wiki-latest-pages-articles.xml.bz2" -P "${ROOT}"
-echo "Processing ""$ROOT"/"$LANG""wiki-latest-pages-articles.xml.bz2"
-bzip2 -c -d "$ROOT"/"$LANG""wiki-latest-pages-articles.xml.bz2" | awk '{print tolower($0);}' | perl -e '
-# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
-# letters (a-z, converted from A-Z), and spaces (never consecutive).
-# All other characters are converted to spaces. Only text which normally appears
-# in the web browser is displayed. Tables are removed. Image captions are
-# preserved. Links are converted to normal text. Digits are spelled out.
-# *** Modified to not spell digits or throw away non-ASCII characters ***
-# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
-$/=">";                     # input record separator
-while (<>) {
-  if (/<text /) {$text=1;}  # remove all but between <text> ... </text>
-  if (/#redirect/i) {$text=0;}  # remove #REDIRECT
-  if ($text) {
-
-    # Remove any text not normally visible
-    if (/<\/text>/) {$text=0;}
-    s/<.*>//;               # remove xml tags
-    s/&amp;/&/g;            # decode URL encoded chars
-    s/&lt;/</g;
-    s/&gt;/>/g;
-    s/<ref[^<]*<\/ref>//g;  # remove references <ref...> ... </ref>
-    s/<[^>]*>//g;           # remove xhtml tags
-    s/\[http:[^] ]*/[/g;    # remove normal url, preserve visible text
-    s/\|thumb//ig;          # remove images links, preserve caption
-    s/\|left//ig;
-    s/\|right//ig;
-    s/\|\d+px//ig;
-    s/\[\[image:[^\[\]]*\|//ig;
-    s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig;  # show categories without markup
-    s/\[\[[a-z\-]*:[^\]]*\]\]//g;  # remove links to other languages
-    s/\[\[[^\|\]]*\|/[[/g;  # remove wiki url, preserve visible text
-    s/{{[^}]*}}//g;         # remove {{icons}} and {tables}
-    s/{[^}]*}//g;
-    s/\[//g;                # remove [ and ]
-    s/\]//g;
-    s/&[^;]*;/ /g;          # remove URL encoded chars
-    $_=" $_ ";
-    chop;
-    print $_;
-  }
-}
-' | normalize_text | awk '{if (NF>1) print;}' | tr -s " " | shuf > "${ROOT}"/wiki."${LANG}".txt
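The `normalize_text` shell function above is a chain of byte-level substitutions. A rough Python rendering, for readability only (a sketch, not an exact reproduction of every sed rule):

```py
import re

def normalize_text(text):
    # Map curly quotes to ASCII, pad punctuation with spaces and blank
    # out digits, mirroring the sed pipeline above.
    text = text.replace("’", "'").replace("′", "'")
    text = text.replace("''", " ").replace("'", " ' ")
    text = text.replace("“", '"').replace("”", '"')
    for ch in '".,()!?':
        text = text.replace(ch, " " + ch + " ")
    for ch in ";:=*|«":
        text = text.replace(ch, " ")
    text = text.replace("-", " - ")
    return re.sub(r"[0-9]", " ", text)
```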
diff --git a/python/README.md b/python/README.md
deleted file mode 100644
index f6dbda228..000000000
--- a/python/README.md
+++ /dev/null
@@ -1,322 +0,0 @@
-# fastText [![CircleCI](https://circleci.com/gh/facebookresearch/fastText/tree/master.svg?style=svg)](https://circleci.com/gh/facebookresearch/fastText/tree/master)
-
-[fastText](https://fasttext.cc/) is a library for efficient learning of word representations and sentence classification.
-
-In this document we present how to use fastText in Python.
-
-## Table of contents
-
-* [Requirements](#requirements)
-* [Installation](#installation)
-* [Usage overview](#usage-overview)
-    * [Word representation model](#word-representation-model)
-    * [Text classification model](#text-classification-model)
-    * [IMPORTANT: Preprocessing data / encoding conventions](#important-preprocessing-data-encoding-conventions)
-    * [More examples](#more-examples)
-* [API](#api)
-    * [`train_unsupervised` parameters](#train_unsupervised-parameters)
-    * [`train_supervised` parameters](#train_supervised-parameters)
-    * [`model` object](#model-object)
-
-
-# Requirements
-
-[fastText](https://fasttext.cc/) builds on modern Mac OS and Linux distributions.
-Since it uses C++11 features, it requires a compiler with good C++11 support. You will need [Python](https://www.python.org/) (version 2.7 or ≥ 3.4), [NumPy](http://www.numpy.org/) & [SciPy](https://www.scipy.org/) and [pybind11](https://github.com/pybind/pybind11).
-
-
-# Installation
-
-To install the latest release, you can do:
-```bash
-$ pip install fasttext
-```
-
-or, to get the latest development version of fasttext, you can install from our GitHub repository:
-```bash
-$ git clone https://github.com/facebookresearch/fastText.git
-$ cd fastText
-$ sudo pip install .
-$ # or:
-$ sudo python setup.py install
-```
-
-# Usage overview
-
-
-## Word representation model
-
-In order to learn word vectors, as [described here](https://fasttext.cc/docs/en/references.html#enriching-word-vectors-with-subword-information), we can use the `fasttext.train_unsupervised` function like this:
-
-
-```py
-import fasttext
-
-# Skipgram model:
-model = fasttext.train_unsupervised('data.txt', model='skipgram')
-
-# or, cbow model:
-model = fasttext.train_unsupervised('data.txt', model='cbow')
-
-```
-
-where `data.txt` is a training file containing utf-8 encoded text.
-
-
-The returned `model` object represents your learned model, and you can use it to retrieve information.
-
-```py
-print(model.words)   # list of words in dictionary
-print(model['king']) # get the vector of the word 'king'
-```
-
-
-### Saving and loading a model object
-
-You can save your trained model object by calling the function `save_model`.
-```py
-model.save_model("model_filename.bin")
-```
-
-and retrieve it later with the `load_model` function:
-```py
-model = fasttext.load_model("model_filename.bin")
-```
-
-For more information about word representation usage of fasttext, you can refer to our [word representations tutorial](https://fasttext.cc/docs/en/unsupervised-tutorial.html).
-
-
-## Text classification model
-
-In order to train a text classifier using the method [described here](https://fasttext.cc/docs/en/references.html#bag-of-tricks-for-efficient-text-classification), we can use the `fasttext.train_supervised` function like this:
-
-
-```py
-import fasttext
-
-model = fasttext.train_supervised('data.train.txt')
-```
-
-where `data.train.txt` is a text file containing a training sentence per line along with the labels. By default, we assume that labels are words that are prefixed by the string `__label__`.
-
-Once the model is trained, we can retrieve the list of words and labels:
-
-```py
-print(model.words)
-print(model.labels)
-```
-
-To evaluate our model by computing the precision at 1 (P@1) and the recall on a test set, we use the `test` function:
-
-```py
-def print_results(N, p, r):
-    print("N\t" + str(N))
-    print("P@{}\t{:.3f}".format(1, p))
-    print("R@{}\t{:.3f}".format(1, r))
-
-print_results(*model.test('test.txt'))
-```
-
-We can also predict labels for a specific text:
-
-```py
-model.predict("Which baking dish is best to bake a banana bread ?")
-```
-
-By default, `predict` returns only one label: the one with the highest probability. You can also predict more than one label by specifying the parameter `k`:
-```py
-model.predict("Which baking dish is best to bake a banana bread ?", k=3)
-```
-
-If you want to predict more than one sentence, you can pass an array of strings:
-
-```py
-model.predict(["Which baking dish is best to bake a banana bread ?", "Why not put knives in the dishwasher?"], k=3)
-```
-
-
-Of course, you can also save and load a model to/from a file as [in the word representation usage](#saving-and-loading-a-model-object).
-
-For more information about text classification usage of fasttext, you can refer to our [text classification tutorial](https://fasttext.cc/docs/en/supervised-tutorial.html).
-
-
-
-
-### Compress model files with quantization
-
-When you want to save a supervised model file, fastText can compress it in order to have a much smaller model file by sacrificing only a little bit of performance.
-
-```py
-# with the previously trained `model` object, call:
-model.quantize(input='data.train.txt', retrain=True)
-
-# then display results and save the new model:
-print_results(*model.test(valid_data))
-model.save_model("model_filename.ftz")
-```
-
-`model_filename.ftz` will have a much smaller size than `model_filename.bin`.
-
-For further reading on quantization, you can refer to [this paragraph from our blog post](https://fasttext.cc/blog/2017/10/02/blog-post.html#model-compression).
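The `quantize` call accepts further options; the supervised example shipped in `python/doc/examples/train_supervised.py` combines norm-aware quantization with a feature cutoff and retraining (the values below are the ones from that example):

```py
# qnorm quantizes the vector norms separately, cutoff keeps only the
# 100k most important features, retrain fine-tunes the remaining weights.
model.quantize(input='data.train.txt', qnorm=True, retrain=True, cutoff=100000)
print_results(*model.test(valid_data))
model.save_model("model_filename.ftz")
```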
-
-## IMPORTANT: Preprocessing data / encoding conventions
-
-In general it is important to properly preprocess your data. In particular, our example scripts in the [root folder](https://github.com/facebookresearch/fastText) do this.
-
-fastText assumes UTF-8 encoded text. All text must be [unicode for Python2](https://docs.python.org/2/library/functions.html#unicode) and [str for Python3](https://docs.python.org/3.5/library/stdtypes.html#textseq). The passed text will be [encoded as UTF-8 by pybind11](https://pybind11.readthedocs.io/en/master/advanced/cast/strings.html?highlight=utf-8#strings-bytes-and-unicode-conversions) before being passed to the fastText C++ library. This means it is important to use UTF-8 encoded text when building a model. On Unix-like systems you can convert text using [iconv](https://en.wikipedia.org/wiki/Iconv).
-
-fastText will tokenize (split text into pieces) based on the following ASCII characters (bytes). In particular, it is not aware of UTF-8 whitespace. We advise the user to convert UTF-8 whitespace / word boundaries into one of the following symbols as appropriate.
-
-* space
-* tab
-* vertical tab
-* carriage return
-* formfeed
-* the null character
-
-The newline character is used to delimit lines of text. In particular, the EOS token is appended to a line of text if a newline character is encountered. The only exception is if the number of tokens exceeds the MAX\_LINE\_SIZE constant as defined in the [Dictionary header](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h). This means if you have text that is not separated by newlines, such as the [fil9 dataset](http://mattmahoney.net/dc/textdata), it will be broken into chunks of MAX\_LINE\_SIZE tokens and the EOS token is not appended.
-
-The length of a token is the number of UTF-8 characters, obtained by considering the [leading two bits of a byte](https://en.wikipedia.org/wiki/UTF-8#Description) to identify [subsequent bytes of a multi-byte sequence](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc). Knowing this is especially important when choosing the minimum and maximum length of subwords. Further, the EOS token (as specified in the [Dictionary header](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h)) is considered a character and will not be broken into subwords.
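These conventions are easy to check from Python with the module's `tokenize` helper; the behaviour below follows directly from the rules just described:

```py
import fasttext

print(fasttext.tokenize("hello world"))    # ['hello', 'world']
print(fasttext.tokenize("hello world\n"))  # ['hello', 'world', '</s>'] -- fasttext.EOS
print(fasttext.tokenize("   "))            # [] -- whitespace alone yields no tokens
```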
-
-## More examples
-
-In order to have a better understanding of fastText models, please consider the main [README](https://github.com/facebookresearch/fastText/blob/master/README.md) and in particular [the tutorials on our website](https://fasttext.cc/docs/en/supervised-tutorial.html).
-
-You can find further Python examples in [the doc folder](https://github.com/facebookresearch/fastText/tree/master/python/doc/examples).
-
-As with any package, you can get help on any Python function using the `help` function.
-
-For example
-
-```
->>> import fasttext
->>> help(fasttext.FastText)
-
-Help on module fasttext.FastText in fasttext:
-
-NAME
-    fasttext.FastText
-
-DESCRIPTION
-    # Copyright (c) 2017-present, Facebook, Inc.
-    # All rights reserved.
-    #
-    # This source code is licensed under the MIT license found in the
-    # LICENSE file in the root directory of this source tree.
-
-FUNCTIONS
-    load_model(path)
-        Load a model given a filepath and return a model object.
-
-    tokenize(text)
-        Given a string of text, tokenize it and return a list of tokens
-[...]
-```
-
-
-# API
-
-
-## `train_unsupervised` parameters
-
-```python
-    input             # training file path (required)
-    model             # unsupervised fasttext model {cbow, skipgram} [skipgram]
-    lr                # learning rate [0.05]
-    dim               # size of word vectors [100]
-    ws                # size of the context window [5]
-    epoch             # number of epochs [5]
-    minCount          # minimal number of word occurrences [5]
-    minn              # min length of char ngram [3]
-    maxn              # max length of char ngram [6]
-    neg               # number of negatives sampled [5]
-    wordNgrams        # max length of word ngram [1]
-    loss              # loss function {ns, hs, softmax, ova} [ns]
-    bucket            # number of buckets [2000000]
-    thread            # number of threads [number of cpus]
-    lrUpdateRate      # change the rate of updates for the learning rate [100]
-    t                 # sampling threshold [0.0001]
-    verbose           # verbose [2]
-```
-
-## `train_supervised` parameters
-
-```python
-    input             # training file path (required)
-    lr                # learning rate [0.1]
-    dim               # size of word vectors [100]
-    ws                # size of the context window [5]
-    epoch             # number of epochs [5]
-    minCount          # minimal number of word occurrences [1]
-    minCountLabel     # minimal number of label occurrences [1]
-    minn              # min length of char ngram [0]
-    maxn              # max length of char ngram [0]
-    neg               # number of negatives sampled [5]
-    wordNgrams        # max length of word ngram [1]
-    loss              # loss function {ns, hs, softmax, ova} [softmax]
-    bucket            # number of buckets [2000000]
-    thread            # number of threads [number of cpus]
-    lrUpdateRate      # change the rate of updates for the learning rate [100]
-    t                 # sampling threshold [0.0001]
-    label             # label prefix ['__label__']
-    verbose           # verbose [2]
-    pretrainedVectors # pretrained word vectors (.vec file) for supervised learning []
-```
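As an illustration of these knobs, the cooking example bundled with the repository raises the epoch count and learning rate and adds word bigrams (file names are those used by that example):

```py
import fasttext

model = fasttext.train_supervised(
    input='cooking.train', epoch=25, lr=1.0, wordNgrams=2, minCount=1
)
# test returns (N, precision@1, recall@1)
print(model.test('cooking.valid'))
```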
-
-## `model` object
-
-`train_supervised`, `train_unsupervised` and `load_model` functions return an instance of the `_FastText` class, that we generally name `model` object.
-
-This object exposes those training arguments as properties: `lr`, `dim`, `ws`, `epoch`, `minCount`, `minCountLabel`, `minn`, `maxn`, `neg`, `wordNgrams`, `loss`, `bucket`, `thread`, `lrUpdateRate`, `t`, `label`, `verbose`, `pretrainedVectors`. So `model.wordNgrams` will give you the max length of word ngram used for training this model.
-
-In addition, the object exposes several functions:
-
-```python
-    get_dimension           # Get the dimension (size) of a lookup vector (hidden layer).
-                            # This is equivalent to `dim` property.
-    get_input_vector        # Given an index, get the corresponding vector of the Input Matrix.
-    get_input_matrix        # Get a copy of the full input matrix of a Model.
-    get_labels              # Get the entire list of labels of the dictionary
-                            # This is equivalent to `labels` property.
-    get_line                # Split a line of text into words and labels.
-    get_output_matrix       # Get a copy of the full output matrix of a Model.
-    get_sentence_vector     # Given a string, get a single vector representation. This function
-                            # assumes to be given a single line of text. We split words on
-                            # whitespace (space, newline, tab, vertical tab) and the control
-                            # characters carriage return, formfeed and the null character.
-    get_subword_id          # Given a subword, return the index (within input matrix) it hashes to.
-    get_subwords            # Given a word, get the subwords and their indices.
-    get_word_id             # Given a word, get the word id within the dictionary.
-    get_word_vector         # Get the vector representation of word.
-    get_words               # Get the entire list of words of the dictionary
-                            # This is equivalent to `words` property.
-    is_quantized            # whether the model has been quantized
-    predict                 # Given a string, get a list of labels and a list of corresponding probabilities.
-    quantize                # Quantize the model reducing the size of the model and its memory footprint.
-    save_model              # Save the model to the given path
-    test                    # Evaluate supervised model using file given by path
-    test_label              # Return the precision and recall score for each label.
-```
-
-The properties `words`, `labels` return the words and labels from the dictionary:
-```py
-model.words   # equivalent to model.get_words()
-model.labels  # equivalent to model.get_labels()
-```
-
-The object overrides `__getitem__` and `__contains__` functions in order to return the representation of a word and to check if a word is in the vocabulary.
-
-```py
-model['king']   # equivalent to model.get_word_vector('king')
-'king' in model # equivalent to `'king' in model.get_words()`
-```
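These accessors fit together in a predictable way: a word's vector is the average of the input vectors of its subwords. The sketch below (with a `model` trained as above, mirroring a check from the module's own test suite) verifies that:

```py
import numpy as np

subwords, subinds = model.get_subwords('king')
vecs = np.vstack([model.get_input_vector(i) for i in subinds])
reconstructed = np.sum(vecs / len(subwords), 0)
assert np.isclose(model.get_word_vector('king'), reconstructed, atol=1e-5).all()
```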
-
-Join the fastText community
----------------------------
-
-- [Facebook page](https://www.facebook.com/groups/1174547215919768)
-- [Stack overflow](https://stackoverflow.com/questions/tagged/fasttext)
-- [Google group](https://groups.google.com/forum/#!forum/fasttext-library)
-- [GitHub](https://github.com/facebookresearch/fastText)
diff --git a/python/README.rst b/python/README.rst
deleted file mode 100644
index 6e6f38d84..000000000
--- a/python/README.rst
+++ /dev/null
@@ -1,406 +0,0 @@
-fastText |CircleCI|
-===================
-
-`fastText <https://fasttext.cc/>`__ is a library for efficient learning
-of word representations and sentence classification.
-
-In this document we present how to use fastText in Python.
-
-Table of contents
------------------
-
-- `Requirements <#requirements>`__
-- `Installation <#installation>`__
-- `Usage overview <#usage-overview>`__
-- `Word representation model <#word-representation-model>`__
-- `Text classification model <#text-classification-model>`__
-- `IMPORTANT: Preprocessing data / encoding
-  conventions <#important-preprocessing-data-encoding-conventions>`__
-- `More examples <#more-examples>`__
-- `API <#api>`__
-- `train_unsupervised parameters <#train_unsupervised-parameters>`__
-- `train_supervised parameters <#train_supervised-parameters>`__
-- `model object <#model-object>`__
-
-Requirements
-============
-
-`fastText <https://fasttext.cc/>`__ builds on modern Mac OS and Linux
-distributions. Since it uses C++11 features, it requires a compiler with
-good C++11 support. You will need `Python <https://www.python.org/>`__
-(version 2.7 or ≥ 3.4), `NumPy <http://www.numpy.org/>`__ &
-`SciPy <https://www.scipy.org/>`__ and
-`pybind11 <https://github.com/pybind/pybind11>`__.
-
-Installation
-============
-
-To install the latest release, you can do:
-
-.. code:: bash
-
-    $ pip install fasttext
-
-or, to get the latest development version of fasttext, you can install
-from our GitHub repository:
-
-.. code:: bash
-
-    $ git clone https://github.com/facebookresearch/fastText.git
-    $ cd fastText
-    $ sudo pip install .
-    $ # or:
-    $ sudo python setup.py install
-
-Usage overview
-==============
-
-Word representation model
--------------------------
-
-In order to learn word vectors, as `described
-here <https://fasttext.cc/docs/en/references.html#enriching-word-vectors-with-subword-information>`__,
-we can use the ``fasttext.train_unsupervised`` function like this:
-
-.. code:: py
-
-    import fasttext
-
-    # Skipgram model:
-    model = fasttext.train_unsupervised('data.txt', model='skipgram')
-
-    # or, cbow model:
-    model = fasttext.train_unsupervised('data.txt', model='cbow')
-
-where ``data.txt`` is a training file containing utf-8 encoded text.
-
-The returned ``model`` object represents your learned model, and you can
-use it to retrieve information.
-
-.. code:: py
-
-    print(model.words)   # list of words in dictionary
-    print(model['king']) # get the vector of the word 'king'
-
-Saving and loading a model object
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You can save your trained model object by calling the function
-``save_model``.
-
-.. code:: py
-
-    model.save_model("model_filename.bin")
-
-and retrieve it later with the ``load_model`` function:
-
-.. code:: py
-
-    model = fasttext.load_model("model_filename.bin")
-
-For more information about word representation usage of fasttext, you
-can refer to our `word representations
-tutorial <https://fasttext.cc/docs/en/unsupervised-tutorial.html>`__.
-
-Text classification model
--------------------------
-
-In order to train a text classifier using the method `described
-here <https://fasttext.cc/docs/en/references.html#bag-of-tricks-for-efficient-text-classification>`__,
-we can use the ``fasttext.train_supervised`` function like this:
-
-.. code:: py
-
-    import fasttext
-
-    model = fasttext.train_supervised('data.train.txt')
-
-where ``data.train.txt`` is a text file containing a training sentence
-per line along with the labels. By default, we assume that labels are
-words that are prefixed by the string ``__label__``.
-
-Once the model is trained, we can retrieve the list of words and labels:
-
-.. code:: py
-
-    print(model.words)
-    print(model.labels)
-
-To evaluate our model by computing the precision at 1 (P@1) and the
-recall on a test set, we use the ``test`` function:
-
-.. code:: py
-
-    def print_results(N, p, r):
-        print("N\t" + str(N))
-        print("P@{}\t{:.3f}".format(1, p))
-        print("R@{}\t{:.3f}".format(1, r))
-
-    print_results(*model.test('test.txt'))
-
-We can also predict labels for a specific text:
-
-.. code:: py

-    model.predict("Which baking dish is best to bake a banana bread ?")
-
-By default, ``predict`` returns only one label: the one with the
-highest probability. You can also predict more than one label by
-specifying the parameter ``k``:
-
-.. code:: py
-
-    model.predict("Which baking dish is best to bake a banana bread ?", k=3)
-
-If you want to predict more than one sentence, you can pass an array of
-strings:
-
-.. code:: py
-
-    model.predict(["Which baking dish is best to bake a banana bread ?", "Why not put knives in the dishwasher?"], k=3)
-
-Of course, you can also save and load a model to/from a file as `in the
-word representation usage <#saving-and-loading-a-model-object>`__.
-
-For more information about text classification usage of fasttext, you
-can refer to our `text classification
-tutorial <https://fasttext.cc/docs/en/supervised-tutorial.html>`__.
-
-Compress model files with quantization
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-When you want to save a supervised model file, fastText can compress it
-in order to have a much smaller model file by sacrificing only a little
-bit of performance.
-
-.. code:: py
-
-    # with the previously trained `model` object, call:
-    model.quantize(input='data.train.txt', retrain=True)
-
-    # then display results and save the new model:
-    print_results(*model.test(valid_data))
-    model.save_model("model_filename.ftz")
-
-``model_filename.ftz`` will have a much smaller size than
-``model_filename.bin``.
-
-For further reading on quantization, you can refer to `this paragraph
-from our blog
-post <https://fasttext.cc/blog/2017/10/02/blog-post.html#model-compression>`__.
-
-IMPORTANT: Preprocessing data / encoding conventions
-----------------------------------------------------
-
-In general it is important to properly preprocess your data. In
-particular, our example scripts in the `root
-folder <https://github.com/facebookresearch/fastText>`__ do this.
-
-fastText assumes UTF-8 encoded text. All text must be `unicode for
-Python2 <https://docs.python.org/2/library/functions.html#unicode>`__
-and `str for
-Python3 <https://docs.python.org/3.5/library/stdtypes.html#textseq>`__.
-The passed text will be `encoded as UTF-8 by
-pybind11 <https://pybind11.readthedocs.io/en/master/advanced/cast/strings.html?highlight=utf-8#strings-bytes-and-unicode-conversions>`__
-before being passed to the fastText C++ library. This means it is
-important to use UTF-8 encoded text when building a model. On Unix-like
-systems you can convert text using
-`iconv <https://en.wikipedia.org/wiki/Iconv>`__.
-
-fastText will tokenize (split text into pieces) based on the following
-ASCII characters (bytes). In particular, it is not aware of UTF-8
-whitespace. We advise the user to convert UTF-8 whitespace / word
-boundaries into one of the following symbols as appropriate.
-
--  space
--  tab
--  vertical tab
--  carriage return
--  formfeed
--  the null character
-
-The newline character is used to delimit lines of text. In particular,
-the EOS token is appended to a line of text if a newline character is
-encountered. The only exception is if the number of tokens exceeds the
-MAX\_LINE\_SIZE constant as defined in the `Dictionary
-header <https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h>`__.
-This means if you have text that is not separated by newlines, such as
-the `fil9 dataset <http://mattmahoney.net/dc/textdata>`__, it will be
-broken into chunks of MAX\_LINE\_SIZE tokens and the EOS token is
-not appended.
-
-The length of a token is the number of UTF-8 characters, obtained by
-considering the `leading two bits of a
-byte <https://en.wikipedia.org/wiki/UTF-8#Description>`__ to identify
-`subsequent bytes of a multi-byte
-sequence <https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc>`__.
-Knowing this is especially important when choosing the minimum and
-maximum length of subwords. Further, the EOS token (as specified in the
-`Dictionary
-header <https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h>`__)
-is considered a character and will not be broken into subwords.
-
-More examples
--------------
-
-In order to have a better understanding of fastText models, please
-consider the main
-`README <https://github.com/facebookresearch/fastText/blob/master/README.md>`__
-and in particular `the tutorials on our
-website <https://fasttext.cc/docs/en/supervised-tutorial.html>`__.
-
-You can find further Python examples in `the doc
-folder <https://github.com/facebookresearch/fastText/tree/master/python/doc/examples>`__.
-
-As with any package, you can get help on any Python function using the
-``help`` function.
-
-For example
-
-::
-
-    >>> import fasttext
-    >>> help(fasttext.FastText)
-
-    Help on module fasttext.FastText in fasttext:
-
-    NAME
-        fasttext.FastText
-
-    DESCRIPTION
-        # Copyright (c) 2017-present, Facebook, Inc.
-        # All rights reserved.
-        #
-        # This source code is licensed under the MIT license found in the
-        # LICENSE file in the root directory of this source tree.
-
-    FUNCTIONS
-        load_model(path)
-            Load a model given a filepath and return a model object.
-
-        tokenize(text)
-            Given a string of text, tokenize it and return a list of tokens
-    [...]
-
-API
-===
-
-``train_unsupervised`` parameters
----------------------------------
-
-.. code:: python
-
-    input             # training file path (required)
-    model             # unsupervised fasttext model {cbow, skipgram} [skipgram]
-    lr                # learning rate [0.05]
-    dim               # size of word vectors [100]
-    ws                # size of the context window [5]
-    epoch             # number of epochs [5]
-    minCount          # minimal number of word occurrences [5]
-    minn              # min length of char ngram [3]
-    maxn              # max length of char ngram [6]
-    neg               # number of negatives sampled [5]
-    wordNgrams        # max length of word ngram [1]
-    loss              # loss function {ns, hs, softmax, ova} [ns]
-    bucket            # number of buckets [2000000]
-    thread            # number of threads [number of cpus]
-    lrUpdateRate      # change the rate of updates for the learning rate [100]
-    t                 # sampling threshold [0.0001]
-    verbose           # verbose [2]
-
-``train_supervised`` parameters
--------------------------------
-
-.. code:: python
-
-    input             # training file path (required)
-    lr                # learning rate [0.1]
-    dim               # size of word vectors [100]
-    ws                # size of the context window [5]
-    epoch             # number of epochs [5]
-    minCount          # minimal number of word occurrences [1]
-    minCountLabel     # minimal number of label occurrences [1]
-    minn              # min length of char ngram [0]
-    maxn              # max length of char ngram [0]
-    neg               # number of negatives sampled [5]
-    wordNgrams        # max length of word ngram [1]
-    loss              # loss function {ns, hs, softmax, ova} [softmax]
-    bucket            # number of buckets [2000000]
-    thread            # number of threads [number of cpus]
-    lrUpdateRate      # change the rate of updates for the learning rate [100]
-    t                 # sampling threshold [0.0001]
-    label             # label prefix ['__label__']
-    verbose           # verbose [2]
-    pretrainedVectors # pretrained word vectors (.vec file) for supervised learning []
-
-``model`` object
-----------------
-
-``train_supervised``, ``train_unsupervised`` and ``load_model``
-functions return an instance of the ``_FastText`` class, that we
-generally name ``model`` object.
-
-This object exposes those training arguments as properties: ``lr``,
-``dim``, ``ws``, ``epoch``, ``minCount``, ``minCountLabel``, ``minn``,
-``maxn``, ``neg``, ``wordNgrams``, ``loss``, ``bucket``, ``thread``,
-``lrUpdateRate``, ``t``, ``label``, ``verbose``, ``pretrainedVectors``.
-So ``model.wordNgrams`` will give you the max length of word ngram used
-for training this model.
-
-In addition, the object exposes several functions:
-
-.. code:: python
-
-    get_dimension           # Get the dimension (size) of a lookup vector (hidden layer).
-                            # This is equivalent to `dim` property.
-    get_input_vector        # Given an index, get the corresponding vector of the Input Matrix.
-    get_input_matrix        # Get a copy of the full input matrix of a Model.
-    get_labels              # Get the entire list of labels of the dictionary
-                            # This is equivalent to `labels` property.
-    get_line                # Split a line of text into words and labels.
-    get_output_matrix       # Get a copy of the full output matrix of a Model.
-    get_sentence_vector     # Given a string, get a single vector representation. This function
-                            # assumes to be given a single line of text. We split words on
-                            # whitespace (space, newline, tab, vertical tab) and the control
-                            # characters carriage return, formfeed and the null character.
-    get_subword_id          # Given a subword, return the index (within input matrix) it hashes to.
-    get_subwords            # Given a word, get the subwords and their indices.
-    get_word_id             # Given a word, get the word id within the dictionary.
-    get_word_vector         # Get the vector representation of word.
-    get_words               # Get the entire list of words of the dictionary
-                            # This is equivalent to `words` property.
-    is_quantized            # whether the model has been quantized
-    predict                 # Given a string, get a list of labels and a list of corresponding probabilities.
-    quantize                # Quantize the model reducing the size of the model and its memory footprint.
-    save_model              # Save the model to the given path
-    test                    # Evaluate supervised model using file given by path
-    test_label              # Return the precision and recall score for each label.
-
-The properties ``words``, ``labels`` return the words and labels from
-the dictionary:
-
-.. code:: py
-
-    model.words   # equivalent to model.get_words()
-    model.labels  # equivalent to model.get_labels()
-
-The object overrides ``__getitem__`` and ``__contains__`` functions in
-order to return the representation of a word and to check if a word is
-in the vocabulary.
-
-.. code:: py
-
-    model['king']   # equivalent to model.get_word_vector('king')
-    'king' in model # equivalent to `'king' in model.get_words()`
-
-Join the fastText community
----------------------------
-
-- `Facebook page <https://www.facebook.com/groups/1174547215919768>`__
-- `Stack
-  overflow <https://stackoverflow.com/questions/tagged/fasttext>`__
-- `Google
-  group <https://groups.google.com/forum/#!forum/fasttext-library>`__
-- `GitHub <https://github.com/facebookresearch/fastText>`__
-
-.. |CircleCI| image:: https://circleci.com/gh/facebookresearch/fastText/tree/master.svg?style=svg
-   :target: https://circleci.com/gh/facebookresearch/fastText/tree/master
diff --git a/python/benchmarks/README.rst b/python/benchmarks/README.rst
deleted file mode 100644
index 01c63e7a3..000000000
--- a/python/benchmarks/README.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-These programs allow us to compare the performance of a few key operations when considering changes.
-
-It is important to run these to make sure a change doesn't introduce a regression.
diff --git a/python/benchmarks/get_word_vector.py b/python/benchmarks/get_word_vector.py
deleted file mode 100644
index 385997d5f..000000000
--- a/python/benchmarks/get_word_vector.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-from fasttext import load_model
-from fasttext import tokenize
-import sys
-import time
-import argparse
-
-
-def get_word_vector(data, model):
-    t1 = time.time()
-    print("Reading")
-    with open(data, "r") as f:
-        tokens = tokenize(f.read())
-    t2 = time.time()
-    print("Read TIME: " + str(t2 - t1))
-    print("Read NUM : " + str(len(tokens)))
-    f = load_model(model)
-    # This is not equivalent to piping the data into
-    # print-word-vector, because the data is tokenized
-    # first.
-    t3 = time.time()
-    i = 0
-    for t in tokens:
-        f.get_word_vector(t)
-        i += 1
-        if i % 10000 == 0:
-            sys.stderr.write("\ri: " + str(float(i / len(tokens))))
-            sys.stderr.flush()
-    t4 = time.time()
-    print("\nVectoring: " + str(t4 - t3))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Simple benchmark for get_word_vector."
-    )
-    parser.add_argument("model", help="A model file to use for benchmarking.")
-    parser.add_argument("data", help="A data file to use for benchmarking.")
-    args = parser.parse_args()
-    get_word_vector(args.data, args.model)
diff --git a/python/doc/examples/FastTextEmbeddingBag.py b/python/doc/examples/FastTextEmbeddingBag.py
deleted file mode 100644
index d49e95aaf..000000000
--- a/python/doc/examples/FastTextEmbeddingBag.py
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-# NOTE: This requires PyTorch! We do not provide installation scripts to install PyTorch.
-# It is up to you to install this dependency if you want to execute this example.
-# PyTorch's website should give you clear instructions on this: http://pytorch.org/
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-from torch.nn.modules.sparse import EmbeddingBag
-import numpy as np
-import torch
-import random
-import string
-import time
-from fasttext import load_model
-from torch.autograd import Variable
-
-
-class FastTextEmbeddingBag(EmbeddingBag):
-    def __init__(self, model_path):
-        self.model = load_model(model_path)
-        input_matrix = self.model.get_input_matrix()
-        input_matrix_shape = input_matrix.shape
-        super().__init__(input_matrix_shape[0], input_matrix_shape[1])
-        self.weight.data.copy_(torch.FloatTensor(input_matrix))
-
-    def forward(self, words):
-        word_subinds = np.empty([0], dtype=np.int64)
-        word_offsets = [0]
-        for word in words:
-            _, subinds = self.model.get_subwords(word)
-            word_subinds = np.concatenate((word_subinds, subinds))
-            word_offsets.append(word_offsets[-1] + len(subinds))
-        word_offsets = word_offsets[:-1]
-        ind = Variable(torch.LongTensor(word_subinds))
-        offsets = Variable(torch.LongTensor(word_offsets))
-        return super().forward(ind, offsets)
-
-
-def random_word(N):
-    return ''.join(
-        random.choices(
-            string.ascii_uppercase + string.ascii_lowercase + string.digits,
-            k=N
-        )
-    )
-
-
-if __name__ == "__main__":
-    ft_emb = FastTextEmbeddingBag("fil9.bin")
-    model = load_model("fil9.bin")
-    num_lines = 200
-    total_seconds = 0.0
-    total_words = 0
-    for _ in range(num_lines):
-        words = [
-            random_word(random.randint(1, 10))
-            for _ in range(random.randint(15, 25))
-        ]
-        total_words += len(words)
-        words_average_length = sum([len(word) for word in words]) / len(words)
-        start = time.time()
-        words_emb = ft_emb(words)
-        total_seconds += (time.time() - start)
-        for i in range(len(words)):
-            word = words[i]
-            ft_word_emb = model.get_word_vector(word)
-            py_emb = np.array(words_emb[i].data)
-            assert (np.isclose(ft_word_emb, py_emb).all())
-    print(
-        "Took {:2.5f} seconds in total to build embeddings for {} lines with a total of {} words.".
-        format(total_seconds, num_lines, total_words)
-    )
diff --git a/python/doc/examples/bin_to_vec.py b/python/doc/examples/bin_to_vec.py
deleted file mode 100644
index 17f359a7b..000000000
--- a/python/doc/examples/bin_to_vec.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright (c) 2017-present, Facebook, Inc.
-# All rights reserved.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division, absolute_import, print_function - -from fasttext import load_model -import argparse -import errno - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=("Print fasttext .vec file to stdout from .bin file") - ) - parser.add_argument( - "model", - help="Model to use", - ) - args = parser.parse_args() - - f = load_model(args.model) - words = f.get_words() - print(str(len(words)) + " " + str(f.get_dimension())) - for w in words: - v = f.get_word_vector(w) - vstr = "" - for vi in v: - vstr += " " + str(vi) - try: - print(w + vstr) - except IOError as e: - if e.errno == errno.EPIPE: - pass diff --git a/python/doc/examples/compute_accuracy.py b/python/doc/examples/compute_accuracy.py deleted file mode 100644 index c359c64ac..000000000 --- a/python/doc/examples/compute_accuracy.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division, absolute_import, print_function - -from fasttext import load_model -from fasttext import util -import argparse -import numpy as np - - -def process_question(question, cossims, model, words, vectors): - correct = 0 - num_qs = 0 - num_lines = 0 - for line in question: - num_lines += 1 - qwords = line.split() - # We lowercase all words to correspond to the preprocessing - # we applied to our data. - qwords = [x.lower().strip() for x in qwords] - # If one of the words is not in the vocabulary we skip this question - found = True - for w in qwords: - if w not in words: - found = False - break - if not found: - continue - # The first three words form the query - # We retrieve their word vectors and normalize them - query = qwords[:3] - query = [model.get_word_vector(x) for x in query] - query = [x / np.linalg.norm(x) for x in query] - # Get the query vector. Example: - # Germany - Berlin + France - query = query[1] - query[0] + query[2] - # We don't need to rank all the words, only until we found - # the first word not equal to our set of query words. - ban_set = list(map(lambda x: words.index(x), qwords[:3])) - if words[util.find_nearest_neighbor( - query, vectors, ban_set, cossims=cossims - )] == qwords[3]: - correct += 1 - num_qs += 1 - return correct, num_qs, num_lines - - -# We use the same conventions as within compute-accuracy -def print_compute_accuracy_score( - question, correct, num_qs, total_accuracy, semantic_accuracy, - syntactic_accuracy -): - print( - ( - "{0:>30}: ACCURACY TOP1: {3:.2f} % ({1} / {2})\t Total accuracy: {4:.2f} % Semantic accuracy: {5:.2f} % Syntactic accuracy: {6:.2f} %" - ).format( - question, - correct, - num_qs, - correct / float(num_qs) * 100 if num_qs > 0 else 0, - total_accuracy * 100, - semantic_accuracy * 100, - syntactic_accuracy * 100, - ) - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=( - "compute_accuracy equivalent in Python. 
" - "See https://github.com/tmikolov/word2vec/blob/master/demo-word-accuracy.sh" - ) - ) - parser.add_argument( - "model", - help="Model to use", - ) - parser.add_argument( - "question_words", - help="word questions similar to tmikolov's file (see help for link)", - ) - parser.add_argument( - "threshold", - help="threshold used to limit number of words used", - ) - args = parser.parse_args() - args.threshold = int(args.threshold) - - # Retrieve list of normalized word vectors for the first words up - # until the threshold count. - f = load_model(args.model) - # Gets words with associated frequeny sorted by default by descending order - words, freq = f.get_words(include_freq=True) - words = words[:args.threshold] - vectors = np.zeros((len(words), f.get_dimension()), dtype=float) - for i in range(len(words)): - wv = f.get_word_vector(words[i]) - wv = wv / np.linalg.norm(wv) - vectors[i] = wv - - total_correct = 0 - total_qs = 0 - total_num_lines = 0 - - total_se_correct = 0 - total_se_qs = 0 - - total_sy_correct = 0 - total_sy_qs = 0 - - qid = 0 - questions = [] - with open(args.question_words, 'r') as fqw: - questions = fqw.read().split(':')[1:] - # For efficiency preallocate the memory to calculate cosine similarities - cossims = np.zeros(len(words), dtype=float) - for question in questions: - quads = question.split('\n') - question = quads[0].strip() - quads = quads[1:-1] - correct, num_qs, num_lines = process_question( - quads, cossims, f, words, vectors - ) - total_qs += num_qs - total_correct += correct - total_num_lines += num_lines - - if (qid < 5): - total_se_correct += correct - total_se_qs += num_qs - else: - total_sy_correct += correct - total_sy_qs += num_qs - - print_compute_accuracy_score( - question, - correct, - num_qs, - total_correct / float(total_qs) if total_qs > 0 else 0, - total_se_correct / float(total_se_qs) if total_se_qs > 0 else 0, - total_sy_correct / float(total_sy_qs) if total_sy_qs > 0 else 0, - ) - qid += 1 - - print( - "Questions seen / total: {0} {1} {2:.2f} %". - format( - total_qs, - total_num_lines, - total_qs / total_num_lines * 100 if total_num_lines > 0 else 0, - ) - ) diff --git a/python/doc/examples/get_vocab.py b/python/doc/examples/get_vocab.py deleted file mode 100644 index 60f0814df..000000000 --- a/python/doc/examples/get_vocab.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division, absolute_import, print_function - -from fasttext import load_model -import argparse -import errno - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=( - "Print words or labels and frequency of a model's dictionary" - ) - ) - parser.add_argument( - "model", - help="Model to use", - ) - parser.add_argument( - "-l", - "--labels", - help="Print labels instead of words", - action='store_true', - default=False, - ) - args = parser.parse_args() - - f = load_model(args.model) - if args.labels: - words, freq = f.get_labels(include_freq=True) - else: - words, freq = f.get_words(include_freq=True) - for w, f in zip(words, freq): - try: - print(w + "\t" + str(f)) - except IOError as e: - if e.errno == errno.EPIPE: - pass diff --git a/python/doc/examples/train_supervised.py b/python/doc/examples/train_supervised.py deleted file mode 100644 index 28ab9ef1e..000000000 --- a/python/doc/examples/train_supervised.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import os -from fasttext import train_supervised - - -def print_results(N, p, r): - print("N\t" + str(N)) - print("P@{}\t{:.3f}".format(1, p)) - print("R@{}\t{:.3f}".format(1, r)) - - -if __name__ == "__main__": - train_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.train') - valid_data = os.path.join(os.getenv("DATADIR", ''), 'cooking.valid') - - # train_supervised uses the same arguments and defaults as the fastText cli - model = train_supervised( - input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1 - ) - print_results(*model.test(valid_data)) - - model = train_supervised( - input=train_data, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1, - loss="hs" - ) - print_results(*model.test(valid_data)) - model.save_model("cooking.bin") - - model.quantize(input=train_data, qnorm=True, retrain=True, cutoff=100000) - print_results(*model.test(valid_data)) - model.save_model("cooking.ftz") diff --git a/python/doc/examples/train_unsupervised.py b/python/doc/examples/train_unsupervised.py deleted file mode 100644 index 52476dfe6..000000000 --- a/python/doc/examples/train_unsupervised.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division, absolute_import, print_function - -from fasttext import train_unsupervised -import numpy as np -import os -from scipy import stats - - -# Because of fasttext we don't need to account for OOV -def compute_similarity(data_path): - def similarity(v1, v2): - n1 = np.linalg.norm(v1) - n2 = np.linalg.norm(v2) - return np.dot(v1, v2) / n1 / n2 - - mysim = [] - gold = [] - - with open(data_path, 'rb') as fin: - for line in fin: - tline = line.split() - word1 = tline[0].lower() - word2 = tline[1].lower() - - v1 = model.get_word_vector(word1) - v2 = model.get_word_vector(word2) - d = similarity(v1, v2) - mysim.append(d) - gold.append(float(tline[2])) - - corr = stats.spearmanr(mysim, gold) - dataset = os.path.basename(data_path) - correlation = corr[0] * 100 - return dataset, correlation, 0 - - -if __name__ == "__main__": - model = train_unsupervised( - input=os.path.join(os.getenv("DATADIR", ''), 'fil9'), - model='skipgram', - ) - model.save_model("fil9.bin") - dataset, corr, oov = compute_similarity('rw.txt') - print("{0:20s}: {1:2.0f} (OOV: {2:2.0f}%)".format(dataset, corr, 0)) diff --git a/python/fasttext_module/fasttext/tests/__init__.py b/python/fasttext_module/fasttext/tests/__init__.py deleted file mode 100644 index ff6443122..000000000 --- a/python/fasttext_module/fasttext/tests/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -from .test_configurations import get_supervised_models -from .test_script import gen_tests -from .test_script import gen_unit_tests diff --git a/python/fasttext_module/fasttext/tests/test_configurations.py b/python/fasttext_module/fasttext/tests/test_configurations.py deleted file mode 100644 index a0d5c5416..000000000 --- a/python/fasttext_module/fasttext/tests/test_configurations.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import multiprocessing - -# This script represents a collection of integration tests -# Each integration test comes with a full set of parameters, -# a dataset, and expected metrics. -# These configurations can be used by various fastText APIs -# to confirm some level of correctness. 
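To make the shape of these configuration dictionaries concrete, a consumer might look like the sketch below; `run_config` is hypothetical (the real consumers live in test_script.py), but the keys and the metric checks match what this module produces:

```py
def run_config(config, train_supervised):
    # Hypothetical illustration: train with the stored arguments, then
    # check the expected metrics recorded under config["test"].
    model = train_supervised(**config["args"])
    n, p1, r1 = model.test(config["test"]["data"])
    assert n == config["test"]["n"]
    assert p1 >= config["test"]["p1"] and r1 >= config["test"]["r1"]
    return model
```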
- - -def max_thread(): - return multiprocessing.cpu_count() - 1 - - -def check_supervised_configuration(configuration, verbose=1): - configuration["args"]["verbose"] = verbose - configuration["quant_args"]["verbose"] = verbose - return configuration - - -def check_supervised_configurations(configurations, verbose=1): - for i in range(len(configurations)): - configurations[i] = check_supervised_configuration( - configurations[i], verbose=verbose - ) - return configurations - - -def flickr_job(thread=None): - if thread is None: - thread = max_thread() - config = {} - config["dataset"] = "YFCC100M" - config["args"] = { - "dim": 256, - "wordNgrams": 2, - "minCount": 10, - "bucket": 10000000, - "epoch": 20, - "loss": "hs", - "minCountLabel": 100, - "thread": thread - } - config["args"]["input"] = "YFCC100M/train" - config["quant_args"] = { - "dsub": 2, - "lr": 0.1, - "epoch": 5, - "cutoff": 100000, - "qnorm": True, - "retrain": True, - "qout": True - } - config["quant_args"]["input"] = config["args"]["input"] - config["test"] = { - "n": 647224, - "p1": 0.470, - "r1": 0.071, - "size": 12060039727, - "data": "YFCC100M/test", - } - # One quant example (to illustrate slack): 0.344, 0.0528, 64506972 - config["quant_test"] = { - "n": 647224, - "p1": 0.300, - "r1": 0.0450, - "size": 70000000, - "data": "YFCC100M/test", - } - return config - - -def langid_job1(thread=None): - if thread is None: - thread = max_thread() - config = {} - config["dataset"] = "langid" - config["args"] = {"dim": 16, "minn": 2, "maxn": 4, "thread": thread} - config["args"]["input"] = "langid.train" - config["quant_args"] = {"qnorm": True, "cutoff": 50000, "retrain": True} - config["quant_args"]["input"] = config["args"]["input"] - config["test"] = { - "n": 10000, - "p1": 0.985, - "r1": 0.985, - "size": 368132610, - "data": "langid.valid", - } - # One quant example (to illustrate slack): 0.984 0.984 932793 - config["quant_test"] = { - "p1": 0.97, - "r1": 0.97, - "size": 1000000, - } - config["quant_test"]["n"] = config["test"]["n"] - config["quant_test"]["data"] = config["test"]["data"] - return config - - -def langid_job2(thread=None): - if thread is None: - thread = max_thread() - config = langid_job1(thread).copy() - config["args"]["loss"] = "hs" - return config - - -def cooking_job1(thread=None): - if thread is None: - thread = max_thread() - config = {} - config["dataset"] = "cooking" - config["args"] = { - "epoch": 25, - "lr": 1.0, - "wordNgrams": 2, - "minCount": 1, - "thread": thread, - } - config["args"]["input"] = "cooking.train" - config["quant_args"] = {"qnorm": True, "cutoff": 50000, "retrain": True} - config["quant_args"]["input"] = config["args"]["input"] - config["test"] = { - "n": 3000, - "p1": 0.59, - "r1": 0.25, - "size": 804047585, - "data": "cooking.valid", - } - # One quant example (to illustrate slack): 0.602 0.26 3439172 - config["quant_test"] = { - "p1": 0.55, - "r1": 0.20, - "size": 4000000, - } - config["quant_test"]["n"] = config["test"]["n"] - config["quant_test"]["data"] = config["test"]["data"] - return config - - -def cooking_job2(thread=None): - if thread is None: - thread = max_thread() - config = cooking_job1(thread).copy() - config["args"]["loss"] = "hs" - return config - - -# Supervised models -# See https://fasttext.cc/docs/en/supervised-models.html -def get_supervised_models(thread=None, verbose=1): - if thread is None: - thread = max_thread() - sup_job_dataset = [ - "ag_news", "sogou_news", "dbpedia", "yelp_review_polarity", - "yelp_review_full", "yahoo_answers", "amazon_review_full", - 
"amazon_review_polarity" - ] - - sup_params = { - "dim": 10, - "wordNgrams": 2, - "minCount": 1, - "bucket": 10000000, - "epoch": 5, - "thread": thread, - "verbose": 1, - } - quant_params = { - "retrain": True, - "cutoff": 100000, - "qnorm": True, - "verbose": 1, - } - sup_job_lr = [0.25, 0.5, 0.5, 0.1, 0.1, 0.1, 0.05, 0.05] - - sup_job_n = [7600, 60000, 70000, 38000, 50000, 60000, 650000, 400000] - - sup_job_p1 = [0.915, 0.968, 0.983, 0.956, 0.638, 0.723, 0.600, 0.940] - sup_job_r1 = [0.915, 0.968, 0.983, 0.956, 0.638, 0.723, 0.600, 0.940] - sup_job_size = [ - 405607193, 421445471, 447481878, 427867393, 431292576, 517549567, - 483742593, 493604598 - ] - - sup_job_quant_p1 = [0.918, 0.965, 0.983, 0.950, 0.625, 0.707, 0.58, 0.920] - sup_job_quant_r1 = [0.918, 0.965, 0.983, 0.950, 0.625, 0.707, 0.58, 0.920] - sup_job_quant_size = [ - 1600000, 1500000, 1700000, 1600000, 1600000, 1700000, 1600000, 1600000 - ] - - configurations = [] - for i in range(len(sup_job_dataset)): - configuration = {} - configuration["dataset"] = sup_job_dataset[i] - args = sup_params.copy() - quant_args = quant_params.copy() - args["lr"] = sup_job_lr[i] - args["input"] = sup_job_dataset[i] + ".train" - quant_args["lr"] = sup_job_lr[i] - quant_args["input"] = sup_job_dataset[i] + ".train" - configuration["args"] = args - configuration["quant_args"] = quant_args - test = { - "n": sup_job_n[i], - "p1": sup_job_p1[i], - "r1": sup_job_r1[i], - "size": sup_job_size[i], - "data": sup_job_dataset[i] + ".test", - } - quant_test = { - "n": sup_job_n[i], - "p1": sup_job_quant_p1[i], - "r1": sup_job_quant_r1[i], - "size": sup_job_quant_size[i], - "data": sup_job_dataset[i] + ".test", - } - configuration["test"] = test - configuration["quant_test"] = quant_test - configurations.append(configuration) - configurations.append(flickr_job()) - configurations.append(langid_job1()) - configurations.append(langid_job2()) - configurations.append(cooking_job1()) - configurations.append(cooking_job2()) - configurations = check_supervised_configurations( - configurations, verbose=verbose - ) - return configurations diff --git a/python/fasttext_module/fasttext/tests/test_script.py b/python/fasttext_module/fasttext/tests/test_script.py deleted file mode 100644 index a41515ad0..000000000 --- a/python/fasttext_module/fasttext/tests/test_script.py +++ /dev/null @@ -1,629 +0,0 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-from fasttext import train_supervised
-from fasttext import train_unsupervised
-from fasttext import util
-import fasttext
-import os
-import subprocess
-import unittest
-import tempfile
-import random
-import sys
-import copy
-import numpy as np
-try:
-    import unicode
-except ImportError:
-    pass
-from fasttext.tests.test_configurations import get_supervised_models
-
-
-def eprint(cls, *args, **kwargs):
-    print(*args, file=sys.stderr, **kwargs)
-
-
-def get_random_unicode(length):
-    # See: https://stackoverflow.com/questions/1477294/generate-random-utf-8-string-in-python
-
-    try:
-        get_char = unichr
-    except NameError:
-        get_char = chr
-
-    # Update this to include code point ranges to be sampled
-    include_ranges = [
-        (0x0021, 0x0021),
-        (0x0023, 0x0026),
-        (0x0028, 0x007E),
-        (0x00A1, 0x00AC),
-        (0x00AE, 0x00FF),
-        (0x0100, 0x017F),
-        (0x0180, 0x024F),
-        (0x2C60, 0x2C7F),
-        (0x16A0, 0x16F0),
-        (0x0370, 0x0377),
-        (0x037A, 0x037E),
-        (0x0384, 0x038A),
-        (0x038C, 0x038C),
-    ]
-
-    alphabet = [
-        get_char(code_point)
-        for current_range in include_ranges
-        for code_point in range(current_range[0], current_range[1] + 1)
-    ]
-    return ''.join(random.choice(alphabet) for i in range(length))
-
-
-def get_random_words(N, a=1, b=20, unique=True):
-    words = []
-    while (len(words) < N):
-        length = random.randint(a, b)
-        word = get_random_unicode(length)
-        # Only skip duplicates when unique words were requested.
-        if not unique or word not in words:
-            words.append(word)
-    return words
-
-
-def get_random_data(
-    num_lines=100,
-    max_vocab_size=100,
-    min_words_line=0,
-    max_words_line=20,
-    min_len_word=1,
-    max_len_word=10,
-    unique_words=True,
-):
-    random_words = get_random_words(
-        max_vocab_size, min_len_word, max_len_word, unique=unique_words
-    )
-    lines = []
-    for _ in range(num_lines):
-        line = []
-        line_length = random.randint(min_words_line, max_words_line)
-        for _ in range(line_length):
-            i = random.randint(0, max_vocab_size - 1)
-            line.append(random_words[i])
-        line = " ".join(line)
-        lines.append(line)
-    return lines
-
-
-def default_kwargs(kwargs):
-    default = {"thread": 1, "epoch": 1, "minCount": 1, "bucket": 1000}
-    for k, v in default.items():
-        if k not in kwargs:
-            kwargs[k] = v
-    return kwargs
-
-
-def build_unsupervised_model(data, kwargs):
-    kwargs = default_kwargs(kwargs)
-    with tempfile.NamedTemporaryFile(delete=False) as tmpf:
-        for line in data:
-            tmpf.write((line + "\n").encode("UTF-8"))
-        tmpf.flush()
-        model = train_unsupervised(input=tmpf.name, **kwargs)
-    return model
-
-
-def build_supervised_model(data, kwargs):
-    kwargs = default_kwargs(kwargs)
-    with tempfile.NamedTemporaryFile(delete=False) as tmpf:
-        for line in data:
-            line = "__label__" + line.strip() + "\n"
-            tmpf.write(line.encode("UTF-8"))
-        tmpf.flush()
-        model = train_supervised(input=tmpf.name, **kwargs)
-    return model
-
-
-def read_labels(data_file):
-    labels = []
-    lines = []
-    with open(data_file, 'r') as f:
-        for line in f:
-            labels_line = []
-            words_line = []
-            try:
-                line = unicode(line, "UTF-8").split()
-            except NameError:
-                line = line.split()
-            for word in line:
-                if word.startswith("__label__"):
-                    labels_line.append(word)
-                else:
-                    words_line.append(word)
-            labels.append(labels_line)
-            lines.append(" ".join(words_line))
-    return lines, labels
-
-
-class TestFastTextUnitPy(unittest.TestCase):
-    # TODO: Unit test copy behavior of fasttext
-
-    def gen_test_get_vector(self, 
kwargs): - # Confirm if no subwords, OOV is zero, confirm min=10 means words < 10 get zeros - - f = build_unsupervised_model(get_random_data(100), kwargs) - words, _ = f.get_words(include_freq=True) - words += get_random_words(100) - for word in words: - f.get_word_vector(word) - - def gen_test_multi_get_line(self, kwargs): - data = get_random_data(100) - model1 = build_supervised_model(data, kwargs) - model2 = build_unsupervised_model(data, kwargs) - lines1 = [] - lines2 = [] - for line in data: - words, labels = model1.get_line(line) - lines1.append(words) - self.assertEqual(len(labels), 0) - words, labels = model2.get_line(line) - lines2.append(words) - self.assertEqual(len(labels), 0) - all_lines1, all_labels1 = model1.get_line(data) - all_lines2, all_labels2 = model2.get_line(data) - self.assertEqual(lines1, all_lines1) - self.assertEqual(lines2, all_lines2) - for labels in all_labels1: - self.assertEqual(len(labels), 0) - for labels in all_labels2: - self.assertEqual(len(labels), 0) - - def gen_test_supervised_util_test(self, kwargs): - def check(data): - third = int(len(data) / 3) - train_data = data[:2 * third] - valid_data = data[third:] - with tempfile.NamedTemporaryFile( - delete=False - ) as tmpf, tempfile.NamedTemporaryFile(delete=False) as tmpf2: - for line in train_data: - tmpf.write( - ("__label__" + line.strip() + "\n").encode("UTF-8") - ) - tmpf.flush() - for line in valid_data: - tmpf2.write( - ("__label__" + line.strip() + "\n").encode("UTF-8") - ) - tmpf2.flush() - model = train_supervised(input=tmpf.name, **kwargs) - true_labels = [] - all_words = [] - with open(tmpf2.name, 'r') as fid: - for line in fid: - if sys.version_info < (3, 0): - line = line.decode("UTF-8") - if len(line.strip()) == 0: - continue - words, labels = model.get_line(line.strip()) - if len(labels) == 0: - continue - all_words.append(" ".join(words)) - true_labels += [labels] - predictions, _ = model.predict(all_words) - p, r = util.test(predictions, true_labels) - N = len(predictions) - Nt, pt, rt = model.test(tmpf2.name) - self.assertEqual(N, Nt) - self.assertEqual(p, pt) - self.assertEqual(r, rt) - - # Need at least one word to have a label and a word to prevent error - check(get_random_data(100, min_words_line=2)) - - def gen_test_supervised_predict(self, kwargs): - # Confirm number of labels, confirm labels for easy dataset - # Confirm 1 label and 0 label dataset - - f = build_supervised_model(get_random_data(100), kwargs) - words = get_random_words(100) - for k in [1, 2, 5]: - for w in words: - labels, probs = f.predict(w, k) - data = get_random_data(100) - for line in data: - labels, probs = f.predict(line, k) - - def gen_test_supervised_multiline_predict(self, kwargs): - # Confirm number of labels, confirm labels for easy dataset - # Confirm 1 label and 0 label dataset - - def check_predict(f): - for k in [1, 2, 5]: - words = get_random_words(10) - agg_labels = [] - agg_probs = [] - for w in words: - labels, probs = f.predict(w, k) - agg_labels += [labels] - agg_probs += [probs] - all_labels1, all_probs1 = f.predict(words, k) - data = get_random_data(10) - for line in data: - labels, probs = f.predict(line, k) - agg_labels += [labels] - agg_probs += [probs] - all_labels2, all_probs2 = f.predict(data, k) - all_labels = list(all_labels1) + list(all_labels2) - all_probs = list(all_probs1) + list(all_probs2) - for label1, label2 in zip(all_labels, agg_labels): - self.assertEqual(list(label1), list(label2)) - for prob1, prob2 in zip(all_probs, agg_probs): - self.assertEqual(list(prob1), 
list(prob2)) - - check_predict(build_supervised_model(get_random_data(100), kwargs)) - check_predict( - build_supervised_model( - get_random_data(100, min_words_line=1), kwargs - ) - ) - - def gen_test_vocab(self, kwargs): - # Confirm empty dataset, confirm all label dataset - - data = get_random_data(100) - words_python = {} - for line in data: - line_words = line.split() - for w in line_words: - if w not in words_python: - words_python[w] = 0 - words_python[w] += 1 - f = build_unsupervised_model(data, kwargs) - words, freqs = f.get_words(include_freq=True) - foundEOS = False - for word, freq in zip(words, freqs): - if word == fasttext.EOS: - foundEOS = True - else: - self.assertEqual(words_python[word], freq) - # EOS is special to fasttext, but still part of the vocab - self.assertEqual(len(words_python), len(words) - 1) - self.assertTrue(foundEOS) - - # Should cause "Empty vocabulary" error. - data = get_random_data(0) - gotError = False - try: - build_unsupervised_model(data, kwargs) - except ValueError: - gotError = True - self.assertTrue(gotError) - - def gen_test_subwords(self, kwargs): - # Define expected behavior - f = build_unsupervised_model(get_random_data(100), kwargs) - words, _ = f.get_words(include_freq=True) - words += get_random_words(10, 1, 10) - for w in words: - f.get_subwords(w) - - def gen_test_tokenize(self, kwargs): - self.assertEqual(["asdf", "asdb"], fasttext.tokenize("asdf asdb")) - self.assertEqual(["asdf"], fasttext.tokenize("asdf")) - self.assertEqual([fasttext.EOS], fasttext.tokenize("\n")) - self.assertEqual(["asdf", fasttext.EOS], fasttext.tokenize("asdf\n")) - self.assertEqual([], fasttext.tokenize("")) - self.assertEqual([], fasttext.tokenize(" ")) - # An empty string is not a token (it's just whitespace) - # So the minimum length must be 1 - words = get_random_words(100, 1, 20) - self.assertEqual(words, fasttext.tokenize(" ".join(words))) - - def gen_test_unsupervised_dimension(self, kwargs): - if "dim" in kwargs: - f = build_unsupervised_model(get_random_data(100), kwargs) - self.assertEqual(f.get_dimension(), kwargs["dim"]) - - def gen_test_supervised_dimension(self, kwargs): - if "dim" in kwargs: - f = build_supervised_model(get_random_data(100), kwargs) - self.assertEqual(f.get_dimension(), kwargs["dim"]) - - def gen_test_subword_vector(self, kwargs): - f = build_unsupervised_model(get_random_data(100), kwargs) - words, _ = f.get_words(include_freq=True) - words += get_random_words(100, 1, 20) - input_matrix = f.get_input_matrix() - for word in words: - # Universal API to get word vector - vec1 = f.get_word_vector(word) - - # Build word vector from subwords - subwords, subinds = f.get_subwords(word) - subvectors = list(map(lambda x: f.get_input_vector(x), subinds)) - if len(subvectors) == 0: - vec2 = np.zeros((f.get_dimension(), )) - else: - subvectors = np.vstack(subvectors) - vec2 = np.sum((subvectors / len(subwords)), 0) - - # Build word vector from subinds - if len(subinds) == 0: - vec3 = np.zeros((f.get_dimension(), )) - else: - vec3 = np.sum(input_matrix[subinds] / len(subinds), 0) - - # Build word vectors from word and subword ids - wid = f.get_word_id(word) - if wid >= 0: - swids = list(map(lambda x: f.get_subword_id(x), subwords[1:])) - swids.append(wid) - else: - swids = list(map(lambda x: f.get_subword_id(x), subwords)) - if len(swids) == 0: - vec4 = np.zeros((f.get_dimension(), )) - else: - swids = np.array(swids) - vec4 = np.sum(input_matrix[swids] / len(swids), 0) - - self.assertTrue(np.isclose(vec1, vec2, atol=1e-5, rtol=0).all()) - 
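            # The four constructions above must agree: vec1 comes from the
            # public get_word_vector API; vec2 averages the subword input
            # vectors returned by get_input_vector; vec3 averages the same
            # rows of the raw input matrix indexed by subword ids; vec4
            # rebuilds the index list via get_word_id/get_subword_id first.
            # Any drift between them would indicate an indexing mismatch
            # between the Python bindings and the input matrix.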
self.assertTrue(np.isclose(vec2, vec3, atol=1e-5, rtol=0).all()) - self.assertTrue(np.isclose(vec3, vec4, atol=1e-5, rtol=0).all()) - self.assertTrue(np.isclose(vec4, vec1, atol=1e-5, rtol=0).all()) - - def gen_test_unsupervised_get_words(self, kwargs): - # Check more corner cases of 0 vocab, empty file etc. - f = build_unsupervised_model(get_random_data(100), kwargs) - words1, freq1 = f.get_words(include_freq=True) - words2 = f.get_words(include_freq=False) - self.assertEqual(len(words1), len(words2)) - self.assertEqual(len(words1), len(freq1)) - - def gen_test_supervised_get_words(self, kwargs): - f = build_supervised_model(get_random_data(100), kwargs) - words1, freq1 = f.get_words(include_freq=True) - words2 = f.get_words(include_freq=False) - self.assertEqual(len(words1), len(words2)) - self.assertEqual(len(words1), len(freq1)) - - def gen_test_unsupervised_get_labels(self, kwargs): - f = build_unsupervised_model(get_random_data(100), kwargs) - labels1, freq1 = f.get_labels(include_freq=True) - labels2 = f.get_labels(include_freq=False) - words2 = f.get_words(include_freq=False) - self.assertEqual(len(labels1), len(labels2)) - self.assertEqual(len(labels1), len(freq1)) - self.assertEqual(len(labels1), len(words2)) - for w1, w2 in zip(labels2, words2): - self.assertEqual(w1, w2) - - def gen_test_supervised_get_labels(self, kwargs): - f = build_supervised_model(get_random_data(100), kwargs) - labels1, freq1 = f.get_labels(include_freq=True) - labels2 = f.get_labels(include_freq=False) - self.assertEqual(len(labels1), len(labels2)) - self.assertEqual(len(labels1), len(freq1)) - - def gen_test_unsupervised_exercise_is_quant(self, kwargs): - f = build_unsupervised_model(get_random_data(100), kwargs) - gotError = False - try: - f.quantize() - except ValueError: - gotError = True - self.assertTrue(gotError) - - def gen_test_supervised_exercise_is_quant(self, kwargs): - f = build_supervised_model( - get_random_data(1000, max_vocab_size=1000), kwargs - ) - self.assertTrue(not f.is_quantized()) - f.quantize() - self.assertTrue(f.is_quantized()) - - def gen_test_newline_predict_sentence(self, kwargs): - f = build_supervised_model(get_random_data(100), kwargs) - sentence = " ".join(get_random_words(20)) - f.predict(sentence, k=5) - sentence += "\n" - gotError = False - try: - f.predict(sentence, k=5) - except ValueError: - gotError = True - self.assertTrue(gotError) - - f = build_supervised_model(get_random_data(100), kwargs) - sentence = " ".join(get_random_words(20)) - f.get_sentence_vector(sentence) - sentence += "\n" - gotError = False - try: - f.get_sentence_vector(sentence) - except ValueError: - gotError = True - self.assertTrue(gotError) - - -# Generate a supervised test case -# The returned function will be set as an attribute to a test class -def gen_sup_test(configuration, data_dir): - def sup_test(self): - def get_path_size(path): - path_size = subprocess.check_output(["stat", "-c", "%s", - path]).decode('utf-8') - path_size = int(path_size) - return path_size - - def check(model, model_filename, test, lessthan, msg_prefix=""): - N_local_out, p1_local_out, r1_local_out = model.test(test["data"]) - self.assertEqual( - N_local_out, test["n"], msg_prefix + "N: Want: " + - str(test["n"]) + " Is: " + str(N_local_out) - ) - self.assertTrue( - p1_local_out >= test["p1"], msg_prefix + "p1: Want: " + - str(test["p1"]) + " Is: " + str(p1_local_out) - ) - self.assertTrue( - r1_local_out >= test["r1"], msg_prefix + "r1: Want: " + - str(test["r1"]) + " Is: " + str(r1_local_out) - ) - path_size = 
get_path_size(model_filename) - size_msg = str(test["size"]) + " Is: " + str(path_size) - if lessthan: - self.assertTrue( - path_size <= test["size"], - msg_prefix + "Size: Want at most: " + size_msg - ) - else: - self.assertTrue( - path_size == test["size"], - msg_prefix + "Size: Want: " + size_msg - ) - - configuration["args"]["input"] = os.path.join( - data_dir, configuration["args"]["input"] - ) - configuration["quant_args"]["input"] = configuration["args"]["input"] - configuration["test"]["data"] = os.path.join( - data_dir, configuration["test"]["data"] - ) - configuration["quant_test"]["data"] = configuration["test"]["data"] - output = os.path.join(tempfile.mkdtemp(), configuration["dataset"]) - print() - model = train_supervised(**configuration["args"]) - model.save_model(output + ".bin") - check( - model, - output + ".bin", - configuration["test"], - False, - msg_prefix="Supervised: " - ) - print() - model.quantize(**configuration["quant_args"]) - model.save_model(output + ".ftz") - check( - model, - output + ".ftz", - configuration["quant_test"], - True, - msg_prefix="Quantized: " - ) - - return sup_test - - -def gen_unit_tests(verbose=0): - gen_funcs = [ - func for func in dir(TestFastTextUnitPy) - if callable(getattr(TestFastTextUnitPy, func)) - if func.startswith("gen_test_") - ] - general_settings = [ - { - "minn": 2, - "maxn": 4, - }, { - "minn": 0, - "maxn": 0, - "bucket": 0 - }, { - "dim": 1 - }, { - "dim": 5 - } - ] - supervised_settings = [ - { - "minn": 2, - "maxn": 4, - }, { - "minn": 0, - "maxn": 0, - "bucket": 0 - }, { - "dim": 1 - }, { - "dim": 5 - }, { - "dim": 5, - "loss": "hs" - } - ] - unsupervised_settings = [ - { - "minn": 2, - "maxn": 4, - }, { - "minn": 0, - "maxn": 0, - "bucket": 0 - }, { - "dim": 1 - }, { - "dim": 5, - "model": "cbow" - }, { - "dim": 5, - "model": "skipgram" - } - ] - for gen_func in gen_funcs: - - def build_test(test_name, kwargs=None): - if kwargs is None: - kwargs = {} - kwargs["verbose"] = verbose - - def test(self): - return getattr(TestFastTextUnitPy, - "gen_" + test_name)(self, copy.deepcopy(kwargs)) - - return test - - test_name = gen_func[4:] - if "_unsupervised_" in test_name: - for i, setting in enumerate(unsupervised_settings): - setattr( - TestFastTextUnitPy, test_name + "_" + str(i), - build_test(test_name, setting) - ) - elif "_supervised_" in test_name: - for i, setting in enumerate(supervised_settings): - setattr( - TestFastTextUnitPy, test_name + "_" + str(i), - build_test(test_name, setting) - ) - else: - for i, setting in enumerate(general_settings): - setattr( - TestFastTextUnitPy, test_name + "_" + str(i), - build_test(test_name, setting) - ) - - return TestFastTextUnitPy - - -def gen_tests(data_dir, verbose=1): - class TestFastTextPy(unittest.TestCase): - pass - - i = 0 - for configuration in get_supervised_models(verbose=verbose): - setattr( - TestFastTextPy, - "test_sup_" + str(i) + "_" + configuration["dataset"], - gen_sup_test(configuration, data_dir) - ) - i += 1 - return TestFastTextPy diff --git a/python/fasttext_module/fasttext/util/__init__.py b/python/fasttext_module/fasttext/util/__init__.py deleted file mode 100644 index 01136f94b..000000000 --- a/python/fasttext_module/fasttext/util/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
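A note on the test-generation loop that ends above: `gen_unit_tests` multiplies every `gen_test_*` method by a grid of hyperparameter settings and attaches one concrete `unittest` method per (generator, setting) pair with `setattr`. Below is a minimal, self-contained sketch of the same pattern; the class and method names are illustrative, not from the codebase:

    import copy
    import unittest

    class SketchTests(unittest.TestCase):
        # Generator: a parameterized check that unittest does not discover
        # on its own (its name does not start with "test").
        def gen_test_dim(self, kwargs):
            self.assertGreater(kwargs["dim"], 0)

    settings = [{"dim": 1}, {"dim": 5}]

    def build_test(name, kwargs):
        def test(self):
            # Deep-copy so one generated test cannot mutate the shared setting.
            return getattr(SketchTests, "gen_" + name)(self, copy.deepcopy(kwargs))
        return test

    for i, setting in enumerate(settings):
        # Produces the discoverable methods test_dim_0 and test_dim_1.
        setattr(SketchTests, "test_dim_%d" % i, build_test("test_dim", setting))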
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -from .util import test -from .util import find_nearest_neighbor -from .util import reduce_model -from .util import download_model diff --git a/python/fasttext_module/fasttext/util/util.py b/python/fasttext_module/fasttext/util/util.py deleted file mode 100644 index 7218b3671..000000000 --- a/python/fasttext_module/fasttext/util/util.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# NOTE: The purpose of this file is not to accumulate all useful utility -# functions. This file should contain very commonly used and requested functions -# (such as test). If you think you have a function at that level, please create -# an issue and we will happily review your suggestion. This file is also not supposed -# to pull in dependencies outside of numpy/scipy without very good reasons. For -# example, this file should not use sklearn and matplotlib to produce a t-sne -# plot of word embeddings or such. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np -import sys -import shutil -import os -import gzip - -try: - from urllib.request import urlopen -except ImportError: - from urllib2 import urlopen - - -valid_lang_ids = {"af", "sq", "als", "am", "ar", "an", "hy", "as", "ast", - "az", "ba", "eu", "bar", "be", "bn", "bh", "bpy", "bs", - "br", "bg", "my", "ca", "ceb", "bcl", "ce", "zh", "cv", - "co", "hr", "cs", "da", "dv", "nl", "pa", "arz", "eml", - "en", "myv", "eo", "et", "hif", "fi", "fr", "gl", "ka", - "de", "gom", "el", "gu", "ht", "he", "mrj", "hi", "hu", - "is", "io", "ilo", "id", "ia", "ga", "it", "ja", "jv", - "kn", "pam", "kk", "km", "ky", "ko", "ku", "ckb", "la", - "lv", "li", "lt", "lmo", "nds", "lb", "mk", "mai", "mg", - "ms", "ml", "mt", "gv", "mr", "mzn", "mhr", "min", "xmf", - "mwl", "mn", "nah", "nap", "ne", "new", "frr", "nso", - "no", "nn", "oc", "or", "os", "pfl", "ps", "fa", "pms", - "pl", "pt", "qu", "ro", "rm", "ru", "sah", "sa", "sc", - "sco", "gd", "sr", "sh", "scn", "sd", "si", "sk", "sl", - "so", "azb", "es", "su", "sw", "sv", "tl", "tg", "ta", - "tt", "te", "th", "bo", "tr", "tk", "uk", "hsb", "ur", - "ug", "uz", "vec", "vi", "vo", "wa", "war", "cy", "vls", - "fy", "pnb", "yi", "yo", "diq", "zea"} - - -# TODO: Add example on reproducing model.test with util.test and model.get_line -def test(predictions, labels, k=1): - """ - Return precision and recall modeled after fasttext's test - """ - precision = 0.0 - nexamples = 0 - nlabels = 0 - for prediction, labels in zip(predictions, labels): - for p in prediction: - if p in labels: - precision += 1 - nexamples += 1 - nlabels += len(labels) - return (precision / (k * nexamples), precision / nlabels) - - -def find_nearest_neighbor(query, vectors, ban_set, cossims=None): - """ - query is a 1d numpy array corresponding to the vector to which you want to - find the closest vector - vectors is a 2d numpy array corresponding to the vectors you want to consider - ban_set is a set of indicies within vectors you want to ignore for nearest match - cossims is a 1d numpy array of size len(vectors), which can be passed for efficiency - 
- returns the index of the closest match to query within vectors - - """ - if cossims is None: - cossims = np.matmul(vectors, query, out=cossims) - else: - np.matmul(vectors, query, out=cossims) - rank = len(cossims) - 1 - result_i = np.argpartition(cossims, rank)[rank] - while result_i in ban_set: - rank -= 1 - result_i = np.argpartition(cossims, rank)[rank] - return result_i - - -def _reduce_matrix(X_orig, dim, eigv): - """ - Reduces the dimension of a (m × n) matrix `X_orig` to - to a (m × dim) matrix `X_reduced` - It uses only the first 100000 rows of `X_orig` to do the mapping. - Matrix types are all `np.float32` in order to avoid unncessary copies. - """ - if eigv is None: - mapping_size = 100000 - X = X_orig[:mapping_size] - X = X - X.mean(axis=0, dtype=np.float32) - C = np.divide(np.matmul(X.T, X), X.shape[0] - 1, dtype=np.float32) - _, U = np.linalg.eig(C) - eigv = U[:, :dim] - - X_reduced = np.matmul(X_orig, eigv) - - return (X_reduced, eigv) - - -def reduce_model(ft_model, target_dim): - """ - ft_model is an instance of `_FastText` class - This function computes the PCA of the input and the output matrices - and sets the reduced ones. - """ - inp_reduced, proj = _reduce_matrix( - ft_model.get_input_matrix(), target_dim, None) - out_reduced, _ = _reduce_matrix( - ft_model.get_output_matrix(), target_dim, proj) - - ft_model.set_matrices(inp_reduced, out_reduced) - - return ft_model - - -def _print_progress(downloaded_bytes, total_size): - percent = float(downloaded_bytes) / total_size - bar_size = 50 - bar = int(percent * bar_size) - percent = round(percent * 100, 2) - sys.stdout.write(" (%0.2f%%) [" % percent) - sys.stdout.write("=" * bar) - sys.stdout.write(">") - sys.stdout.write(" " * (bar_size - bar)) - sys.stdout.write("]\r") - sys.stdout.flush() - - if downloaded_bytes >= total_size: - sys.stdout.write('\n') - - -def _download_file(url, write_file_name, chunk_size=2**13): - print("Downloading %s" % url) - response = urlopen(url) - if hasattr(response, 'getheader'): - file_size = int(response.getheader('Content-Length').strip()) - else: - file_size = int(response.info().getheader('Content-Length').strip()) - downloaded = 0 - download_file_name = write_file_name + ".part" - with open(download_file_name, 'wb') as f: - while True: - chunk = response.read(chunk_size) - downloaded += len(chunk) - if not chunk: - break - f.write(chunk) - _print_progress(downloaded, file_size) - - os.rename(download_file_name, write_file_name) - - -def _download_gz_model(gz_file_name, if_exists): - if os.path.isfile(gz_file_name): - if if_exists == 'ignore': - return True - elif if_exists == 'strict': - print("gzip File exists. Use --overwrite to download anyway.") - return False - elif if_exists == 'overwrite': - pass - - url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/%s" % gz_file_name - _download_file(url, gz_file_name) - - return True - - -def download_model(lang_id, if_exists='strict', dimension=None): - """ - Download pre-trained common-crawl vectors from fastText's website - https://fasttext.cc/docs/en/crawl-vectors.html - """ - if lang_id not in valid_lang_ids: - raise Exception("Invalid lang id. Please select among %s" % - repr(valid_lang_ids)) - - file_name = "cc.%s.300.bin" % lang_id - gz_file_name = "%s.gz" % file_name - - if os.path.isfile(file_name): - if if_exists == 'ignore': - return file_name - elif if_exists == 'strict': - print("File exists. 
Use --overwrite to download anyway.") - return - elif if_exists == 'overwrite': - pass - - if _download_gz_model(gz_file_name, if_exists): - with gzip.open(gz_file_name, 'rb') as f: - with open(file_name, 'wb') as f_out: - shutil.copyfileobj(f, f_out) - - return file_name diff --git a/quantization-example.sh b/quantization-example.sh deleted file mode 100755 index 570a5999e..000000000 --- a/quantization-example.sh +++ /dev/null @@ -1,40 +0,0 @@ -myshuf() { - perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@"; -} - -normalize_text() { - tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \ - sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/
/ /g' \ - -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \ - -e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf -} - -RESULTDIR=result -DATADIR=data - -mkdir -p "${RESULTDIR}" -mkdir -p "${DATADIR}" - -if [ ! -f "${DATADIR}/dbpedia.train" ] -then - wget -c "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" -O "${DATADIR}/dbpedia_csv.tar.gz" - tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}" - cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "${DATADIR}/dbpedia.train" - cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "${DATADIR}/dbpedia.test" -fi - -make - -echo "Training..." -./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4 - -echo "Quantizing..." -./fasttext quantize -output "${RESULTDIR}/dbpedia" -input "${DATADIR}/dbpedia.train" -qnorm -retrain -epoch 1 -cutoff 100000 - -echo "Testing original model..." -./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" -echo "Testing quantized model..." -./fasttext test "${RESULTDIR}/dbpedia.ftz" "${DATADIR}/dbpedia.test" - -wc -c < "${RESULTDIR}/dbpedia.bin" | awk '{print "Size of the original model:\t",$1;}' -wc -c < "${RESULTDIR}/dbpedia.ftz" | awk '{print "Size of the quantized model:\t",$1;}' diff --git a/reduce_model.py b/reduce_model.py deleted file mode 100755 index ecf37b913..000000000 --- a/reduce_model.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import argparse -import os -import re -import sys - -import fasttext -import fasttext.util - -args = None - - -def eprint(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - - -def guess_target_name(model_file, initial_dim, target_dim): - """ - Given a model name with the convention a..b, this function - returns the model's name with `target_dim` value. - For example model_file name `cc.en.300.bin` with initial dim 300 becomes - `cc.en.100.bin` when the `target_dim` is 100. - """ - prg = re.compile("(.*).%s.(.*)" % initial_dim) - m = prg.match(model_file) - if m: - return "%s.%d.%s" % (m.group(1), target_dim, m.group(2)) - - sp_ext = os.path.splitext(model_file) - return "%s.%d%s" % (sp_ext[0], target_dim, sp_ext[1]) - - -def command_reduce(model_file, target_dim, if_exists): - """ - Given a `model_file`, this function reduces its dimension to `target_dim` - by applying a PCA. - """ - eprint("Loading model") - - ft = fasttext.load_model(model_file) - initial_dim = ft.get_dimension() - if target_dim >= initial_dim: - raise Exception("Target dimension (%d) should be less than initial dimension (%d)." % ( - target_dim, initial_dim)) - - result_filename = guess_target_name(model_file, initial_dim, target_dim) - if os.path.isfile(result_filename): - if if_exists == 'overwrite': - pass - elif if_exists == 'strict': - raise Exception( - "File already exists. 
Use --overwrite to overwrite.") - elif if_exists == 'ignore': - return result_filename - - eprint("Reducing matrix dimensions") - fasttext.util.reduce_model(ft, target_dim) - - eprint("Saving model") - ft.save_model(result_filename) - eprint("%s saved" % result_filename) - - return result_filename - - -def main(): - global args - - parser = argparse.ArgumentParser( - description='fastText helper tool to reduce model dimensions.') - parser.add_argument("model", type=str, - help="model file to reduce. model.bin") - parser.add_argument("dim", type=int, - help="targeted dimension of word vectors.") - parser.add_argument("--overwrite", action="store_true", - help="overwrite if file exists.") - - args = parser.parse_args() - - command_reduce(args.model, args.dim, if_exists=( - 'overwrite' if args.overwrite else 'strict')) - - -if __name__ == '__main__': - main() diff --git a/runtests.py b/runtests.py deleted file mode 100644 index 6e20ddc69..000000000 --- a/runtests.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (c) 2016-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# - -# To run the integration tests you must first fetch all the required test data. -# Have a look at tests/fetch_test_data.sh -# You will then need to point this script to the corresponding folder - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import unittest -import argparse -from fasttext.tests import gen_tests -from fasttext.tests import gen_unit_tests - - -def run_tests(tests): - suite = unittest.TestLoader().loadTestsFromTestCase(tests) - unittest.TextTestRunner(verbosity=3).run(suite) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-u", "--unit-tests", help="run unit tests", action="store_true" - ) - parser.add_argument( - "-i", - "--integration-tests", - help="run integration tests", - action="store_true" - ) - parser.add_argument( - "-v", - "--verbose", - default=1, - help="verbosity level (default 1)", - type=int, - ) - parser.add_argument("--data-dir", help="Full path to data directory") - args = parser.parse_args() - if args.unit_tests: - run_tests(gen_unit_tests(verbose=args.verbose)) - if args.integration_tests: - if args.data_dir is None: - raise ValueError( - "Need data directory! Consult tests/fetch_test_data.sh" - ) - run_tests(gen_tests(args.data_dir, verbose=args.verbose)) - if not args.unit_tests and not args.integration_tests: - print("Ran no tests") diff --git a/scripts/kbcompletion/README.md b/scripts/kbcompletion/README.md deleted file mode 100644 index a16358d3b..000000000 --- a/scripts/kbcompletion/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Fast Linear Model for Knowledge Graph Embeddings - -## Knowledge base completion - -These scripts require the [fastText library](https://github.com/facebookresearch/fastText). - -Run the data.sh script to download and format the datasets. Then run any of the scripts to train and test on a given dataset. 
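For example, to reproduce the FB15k numbers (the training scripts build fastText themselves via `make opt` in the repository root):

```sh
./data.sh    # download and format WN18, FB15k, FB15k-237 and SVO
./fb15k.sh   # train on FB15k and report raw and filtered hit@10
```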
- -## Reference - -If you use this code please cite: - -@article{joulin2017fast, - title={Fast Linear Model for Knowledge Graph Embeddings}, - author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Nickel, Maximilian and Mikolov, Tomas}, - journal={arXiv preprint arXiv:1710.10881}, - year={2017} -} - diff --git a/scripts/kbcompletion/data.sh b/scripts/kbcompletion/data.sh deleted file mode 100755 index 61aeff929..000000000 --- a/scripts/kbcompletion/data.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# -set -e -DATADIR=data/ - -if [ ! -d "$DATADIR" ]; then - mkdir $DATADIR -fi - -cd $DATADIR -echo "preparing WN18" -#wget -P . https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:wordnet-mlj12.tar.gz -#mv fetch.php\?media\=en\:wordnet-mlj12.tar.gz wordnet-mlj12.tar.gz -wget -P . https://github.com/mana-ysh/knowledge-graph-embeddings/raw/master/dat/wordnet-mlj12.tar.gz -tar -xzvf wordnet-mlj12.tar.gz -DIR=wordnet-mlj12 -for f in ${DIR}/wordnet-ml*.txt; -do - fn=${DIR}/ft_$(basename $f) - awk '{print "__label__"$1,"0_"$2, $3;print $1,"1_"$2," __label__"$3}' < ${f} > ${fn}; -done -cat ${DIR}/ft_* > ${DIR}/ft_wordnet-mlj12-full.txt -cat ${DIR}/ft_*train.txt ${DIR}/ft_*valid.txt > ${DIR}/ft_wordnet-mlj12-valid+train.txt - -echo "preparing FB15K" -#wget https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:fb15k.tgz -#mv fetch.php\?media\=en\:fb15k.tgz fb15k.tgz -wget https://github.com/mana-ysh/knowledge-graph-embeddings/raw/master/dat/fb15k.tgz -tar -xzvf fb15k.tgz -DIR=FB15k/ -for f in ${DIR}/freebase*.txt; -do - fn=${DIR}/ft_$(basename $f) - echo $f " --> " $fn - awk '{print "__label__"$1,"0_"$2, $3;print $1,"1_"$2," __label__"$3}' < ${f} > ${fn}; -done -cat ${DIR}/ft_* > ${DIR}/ft_freebase_mtr100_mte100-full.txt -cat ${DIR}/ft_*train.txt ${DIR}/ft_*valid.txt > ${DIR}/ft_freebase_mtr100_mte100-valid+train.txt - -echo "preparing FB15K-237" -wget https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.2.zip -unzip FB15K-237.2.zip -DIR=Release/ -for f in train.txt test.txt valid.txt -do - fn=${DIR}/ft_$(basename $f) - echo $f " --> " $fn - awk -F "\t" '{print "__label__"$1,"0_"$2, $3;print $1,"1_"$2," __label__"$3}' < ${DIR}/${f} > ${fn}; -done -cat ${DIR}/ft_*.txt > ${DIR}/ft_full.txt -cat ${DIR}/ft_train.txt ${DIR}/ft_valid.txt > ${DIR}/ft_valid+train.txt - -echo "preparing SVO" -wget . https://everest.hds.utc.fr/lib/exe/fetch.php?media=en:svo-tensor-dataset.tar.gz -mv fetch.php?media=en:svo-tensor-dataset.tar.gz svo-tensor-dataset.tar.gz -tar -xzvf svo-tensor-dataset.tar.gz -DIR=SVO-tensor-dataset -for f in ${DIR}/svo_data*.dat; -do - fn=${DIR}/ft_$(basename $f) - awk '{print "0_"$1,"1_"$3,"__label__"$2;}' < ${f} > ${fn}; -done -cat ${DIR}/ft_*train*.dat ${DIR}/ft_*valid*.dat > ${DIR}/ft_svo_data-valid+train.dat diff --git a/scripts/kbcompletion/eval.cpp b/scripts/kbcompletion/eval.cpp deleted file mode 100644 index 4ecfd6fce..000000000 --- a/scripts/kbcompletion/eval.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/** - * Copyright (c) 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
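 *
 * The ft_* files consumed here are produced by data.sh above: each triple
 * "h r t" is emitted twice, once per prediction direction, e.g.
 * "__label__h 0_r t" and "h 1_r __label__t", so the model learns to
 * predict the missing head or tail entity as a fastText label given the
 * rest of the triple.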
- */ - -#include -#include -#include -#include -#include - -std::string EOS = ""; - -bool readWord(std::istream& in, std::string& word) -{ - char c; - std::streambuf& sb = *in.rdbuf(); - word.clear(); - while ((c = sb.sbumpc()) != EOF) { - if (c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '\v' || - c == '\f' || c == '\0') { - if (word.empty()) { - if (c == '\n') { - word += EOS; - return true; - } - continue; - } else { - if (c == '\n') - sb.sungetc(); - return true; - } - } - word.push_back(c); - } - in.get(); - return !word.empty(); -} - -int main(int argc, char** argv) { - int k = 10; - if (argc < 4) { - std::cerr<<"eval []"< > KB; - - while (kbf.peek() != EOF) { - std::string label, key, word; - while (readWord(kbf, word)) { - if (word == EOS) {break;} - if (word.find("__label__") == 0) {label = word;} - else {key += "|" + word;} - } - KB[key][label] = true; - } - kbf.close(); - - double precision = 0.0; - int32_t nexamples = 0; - while (predf.peek() != EOF || gtf.peek() != EOF) { - if (predf.peek() == EOF || gtf.peek() == EOF) { - std::cerr<<"pred / gt files have diff sizes"< /dev/null | awk '{if(NR==3) print "raw hit@10="$2}' - -echo "computing filtered hit@10..." -$ft predict ${model}.bin $DIR/ft_freebase_mtr100_mte100-test.txt 20000 > $pred -./eval $pred ${DIR}/ft_freebase_mtr100_mte100-test.txt $DIR/ft_freebase_mtr100_mte100-full.txt 10 | awk '{if(NR==2) print "filtered hit@10="$2}' - -echo "---- train+val ----" - -$ft supervised -input $DIR/ft_freebase_mtr100_mte100-valid+train.txt \ - -dim ${dim} -epoch ${dim} -output ${model} -lr .2 -thread 20 -loss ns -neg ${neg} -minCount 0 - -echo "computing raw hits@10..." -$ft test ${model}.bin $DIR/ft_freebase_mtr100_mte100-test.txt 10 2> /dev/null | awk '{if(NR==3) print "raw hit@10="$2}' - -echo "computing filtered hit@10..." -$ft predict ${model}.bin $DIR/ft_freebase_mtr100_mte100-test.txt 20000 > $pred -./eval $pred ${DIR}/ft_freebase_mtr100_mte100-test.txt $DIR/ft_freebase_mtr100_mte100-full.txt 10 | awk '{if(NR==2) print "filtered hit@10="$2}' diff --git a/scripts/kbcompletion/fb15k237.sh b/scripts/kbcompletion/fb15k237.sh deleted file mode 100755 index e06f24a33..000000000 --- a/scripts/kbcompletion/fb15k237.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash -# -# copyright (c) 2017-present, facebook, inc. -# all rights reserved. -# -# this source code is licensed under the MIT license found in the -# license file in the root directory of this source tree. -# -# script for FB15k237 -DIR=data/Release/ -FASTTEXTDIR=../../ - -# compile - -pushd $FASTTEXTDIR -make opt -popd -ft=${FASTTEXTDIR}/fasttext - -g++ -std=c++0x eval.cpp -o eval - -## Train model and test it on validation: - -pred=data/fb237pred -model=data/fb15k237 -dim=50 -epoch=10 -neg=500 - -echo "---- train ----" -$ft supervised -input $DIR/ft_train.txt \ - -dim $dim -epoch $epoch -output ${model} -lr .2 -thread 20 -loss ns -neg $neg -minCount 0 - -echo "computing filtered hit@10..." -$ft predict ${model}.bin $DIR/ft_test.txt 20000 > $pred -./eval $pred ${DIR}/ft_test.txt $DIR/ft_full.txt 10 | awk '{if(NR==2) print "filtered hit@10="$2}' - -echo "---- train+val ----" - -$ft supervised -input $DIR/ft_valid+train.txt \ - -dim ${dim} -epoch ${dim} -output ${model} -lr .2 -thread 20 -loss ns -neg ${neg} -minCount 0 - -echo "computing filtered hit@10..." 
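# "Filtered" hit@10 (the protocol of Bordes et al.): before checking whether
# the true entity is in the top 10, eval discards predicted entities that
# form some other true triple in the full KB (ft_full.txt), so the model is
# not penalized for ranking a different correct answer first.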
-$ft predict ${model}.bin $DIR/ft_test.txt 20000 > $pred -./eval $pred ${DIR}/ft_test.txt $DIR/ft_full.txt 10 | awk '{if(NR==2) print "filtered hit@10="$2}' diff --git a/scripts/kbcompletion/svo.sh b/scripts/kbcompletion/svo.sh deleted file mode 100755 index 61e4636ef..000000000 --- a/scripts/kbcompletion/svo.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash -# -# copyright (c) 2017-present, facebook, inc. -# all rights reserved. -# -# this source code is licensed under the MIT license found in the -# license file in the root directory of this source tree. -# -# script for SVO -DIR=data/SVO-tensor-dataset -FASTTEXTDIR=../../ - -# compile -pushd $FASTTEXTDIR -make opt -popd -ft=${FASTTEXTDIR}/fasttext - -## Train model and test it on validation: - -dim=200 -epoch=3 -model=svo - -echo "---- train ----" -time $ft supervised -input ${DIR}/ft_svo_data_train_1000000.dat \ - -dim $dim -epoch $epoch -output ${model} -lr .2 -thread 20 - -echo "computing raw hit@5%..." -$ft test ${model}.bin ${DIR}/ft_svo_data_test_250000.dat 227 2> /dev/null | awk '{if(NR==3) print "raw hit@5%="$2}' - - -echo "---- train + valid ----" -time $ft supervised -input ${DIR}/ft_svo_data-valid+train.dat \ - -dim $dim -epoch $epoch -output ${model} -lr .2 -thread 20 - -echo "computing raw hit@5%..." -$ft test ${model}.bin ${DIR}/ft_svo_data_test_250000.dat 227 2> /dev/null | awk '{if(NR==3) print "raw hit@5%="$2}' diff --git a/scripts/kbcompletion/wn18.sh b/scripts/kbcompletion/wn18.sh deleted file mode 100755 index fac7bb660..000000000 --- a/scripts/kbcompletion/wn18.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash -# -# copyright (c) 2017-present, facebook, inc. -# all rights reserved. -# -# this source code is licensed under the MIT license found in the -# license file in the root directory of this source tree. -# -# script for WN11 -DIR=data/wordnet-mlj12/ -FASTTEXTDIR=../../ - -# compile - -pushd $FASTTEXTDIR -make opt -popd -ft=${FASTTEXTDIR}/fasttext - -g++ -std=c++0x eval.cpp -o eval - -# Train model and test it: -dim=100 -epoch=100 -neg=500 -model=data/wn -pred=data/wnpred - -echo "---- train ----" -$ft supervised -input ${DIR}/ft_wordnet-mlj12-train.txt \ - -dim $dim -epoch $epoch -output ${model} -lr .2 -thread 20 -loss ns -neg $neg - -echo "computing raw hits@10..." -$ft test ${model}.bin ${DIR}/ft_wordnet-mlj12-test.txt 10 2> /dev/null | awk '{if(NR==3) print "raw hit@10 = "$2}' - -echo "computing filtered hit@10..." -$ft predict ${model}.bin ${DIR}/ft_wordnet-mlj12-test.txt 20000 > $pred -./eval $pred ${DIR}/ft_wordnet-mlj12-test.txt $DIR/ft_wordnet-mlj12-full.txt 10 | awk '{if(NR==2) print "filtered hit@10 = "$2}' - -echo "---- train+val ----" -$ft supervised -input ${DIR}/ft_wordnet-mlj12-valid+train.txt \ - -dim $dim -epoch $epoch -output ${model} -lr .2 -thread 20 -loss ns -neg $neg - -echo "computing raw hits@10..." -$ft test ${model}.bin ${DIR}/ft_wordnet-mlj12-test.txt 10 2> /dev/null | awk '{if(NR==3) print "raw hit@10 = "$2}' - -echo "computing filtered hit@10..." -$ft predict ${model}.bin ${DIR}/ft_wordnet-mlj12-test.txt 20000 > $pred -./eval $pred ${DIR}/ft_wordnet-mlj12-test.txt $DIR/ft_wordnet-mlj12-full.txt 10 | awk '{if(NR==2) print "filtered hit@10 = "$2}' diff --git a/scripts/quantization/quantization-results.sh b/scripts/quantization/quantization-results.sh deleted file mode 100644 index 980491897..000000000 --- a/scripts/quantization/quantization-results.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright (c) 2016-present, Facebook, Inc. -# All rights reserved. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# - -# This script applies quantization to the models from Table 1 in: -# Bag of Tricks for Efficient Text Classification, arXiv 1607.01759, 2016 - -set -e - -DATASET=( - ag_news - sogou_news - dbpedia - yelp_review_polarity - yelp_review_full - yahoo_answers - amazon_review_full - amazon_review_polarity -) - -# These learning rates were chosen by validation on a subset of the training set. -LR=( 0.25 0.5 0.5 0.1 0.1 0.1 0.05 0.05 ) - -RESULTDIR=result -DATADIR=data - -echo 'Warning! Make sure you run the classification-results.sh script before this one' -echo 'Otherwise you can expect the commands in this script to fail' - -for i in {0..7} -do - echo "Working on dataset ${DATASET[i]}" - ../../fasttext quantize -input "${DATADIR}/${DATASET[i]}.train" \ - -output "${RESULTDIR}/${DATASET[i]}" -lr "${LR[i]}" \ - -thread 4 -qnorm -retrain -epoch 5 -cutoff 100000 > /dev/null - ../../fasttext test "${RESULTDIR}/${DATASET[i]}.ftz" \ - "${DATADIR}/${DATASET[i]}.test" -done diff --git a/src/autotune.cc b/src/autotune.cc deleted file mode 100644 index 567731bb2..000000000 --- a/src/autotune.cc +++ /dev/null @@ -1,477 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "autotune.h" - -#include -#include -#include -#include -#include -#include -#include - -#define LOG_VAL(name, val) \ - if (autotuneArgs.verbose > 2) { \ - std::cout << #name " = " << val << std::endl; \ - } -#define LOG_VAL_NAN(name, val) \ - if (autotuneArgs.verbose > 2) { \ - if (std::isnan(val)) { \ - std::cout << #name " = NaN" << std::endl; \ - } else { \ - std::cout << #name " = " << val << std::endl; \ - } \ - } - -namespace { - -std::function interruptSignalHandler; - -void signalHandler(int signal) { - if (signal == SIGINT) { - interruptSignalHandler(); - } -} - -class ElapsedTimeMarker { - std::chrono::steady_clock::time_point start_; - - public: - ElapsedTimeMarker() { - start_ = std::chrono::steady_clock::now(); - } - double getElapsed() { - return fasttext::utils::getDuration( - start_, std::chrono::steady_clock::now()); - } -}; - -} // namespace - -namespace fasttext { - -constexpr double kUnknownBestScore = -1.0; -constexpr int kCutoffLimit = 256; - -template -T getArgGauss( - T val, - std::minstd_rand& rng, - double startSigma, - double endSigma, - double t, - bool linear) { - T returnValue; - const double stddev = startSigma - - ((startSigma - endSigma) / 0.5) * - std::min(0.5, std::max((t - 0.25), 0.0)); - - std::normal_distribution normal(0.0, stddev); - - const double coeff = normal(rng); - double updateCoeff = 0.0; - - if (linear) { - updateCoeff = coeff; - returnValue = static_cast(updateCoeff + val); - } else { - updateCoeff = std::pow(2.0, coeff); - returnValue = static_cast(updateCoeff * val); - } - - return returnValue; -} - -template -T updateArgGauss( - T val, - T min, - T max, - double startSigma, - double endSigma, - double t, - bool linear, - std::minstd_rand& rng) { - T retVal = getArgGauss(val, rng, startSigma, endSigma, t, linear); - if (retVal > max) { - retVal = max; - } - if (retVal < min) { - retVal = min; - } - return retVal; -} - -AutotuneStrategy::AutotuneStrategy( - const Args& originalArgs, - std::minstd_rand::result_type seed) - : bestArgs_(), - 
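      // (On getArgGauss above: the sampling stddev anneals from startSigma
      // to endSigma as normalized elapsed time t moves from 0.25 to 0.75;
      // parameters flagged "linear", such as wordNgrams, get additive
      // jitter, while the rest (lr, dim, epoch) are scaled by 2^coeff so
      // the search explores them on a log scale around the best value.)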
maxDuration_(originalArgs.autotuneDuration), - rng_(seed), - trials_(0), - bestMinnIndex_(0), - bestDsubExponent_(1), - bestNonzeroBucket_(2000000), - originalBucket_(originalArgs.bucket) { - minnChoices_ = {0, 2, 3}; - updateBest(originalArgs); -} - -Args AutotuneStrategy::ask(double elapsed) { - const double t = std::min(1.0, elapsed / maxDuration_); - trials_++; - - if (trials_ == 1) { - return bestArgs_; - } - - Args args = bestArgs_; - - if (!args.isManual("epoch")) { - args.epoch = updateArgGauss(args.epoch, 1, 100, 2.8, 2.5, t, false, rng_); - } - if (!args.isManual("lr")) { - args.lr = updateArgGauss(args.lr, 0.01, 5.0, 1.9, 1.0, t, false, rng_); - }; - if (!args.isManual("dim")) { - args.dim = updateArgGauss(args.dim, 1, 1000, 1.4, 0.3, t, false, rng_); - } - if (!args.isManual("wordNgrams")) { - args.wordNgrams = - updateArgGauss(args.wordNgrams, 1, 5, 4.3, 2.4, t, true, rng_); - } - if (!args.isManual("dsub")) { - int dsubExponent = - updateArgGauss(bestDsubExponent_, 1, 4, 2.0, 1.0, t, true, rng_); - args.dsub = (1 << dsubExponent); - } - if (!args.isManual("minn")) { - int minnIndex = updateArgGauss( - bestMinnIndex_, - 0, - static_cast(minnChoices_.size() - 1), - 4.0, - 1.4, - t, - true, - rng_); - args.minn = minnChoices_[minnIndex]; - } - if (!args.isManual("maxn")) { - if (args.minn == 0) { - args.maxn = 0; - } else { - args.maxn = args.minn + 3; - } - } - if (!args.isManual("bucket")) { - int nonZeroBucket = updateArgGauss( - bestNonzeroBucket_, 10000, 10000000, 2.0, 1.5, t, false, rng_); - args.bucket = nonZeroBucket; - } else { - args.bucket = originalBucket_; - } - if (args.wordNgrams <= 1 && args.maxn == 0) { - args.bucket = 0; - } - if (!args.isManual("loss")) { - args.loss = loss_name::softmax; - } - - return args; -} - -int AutotuneStrategy::getIndex(int val, const std::vector& choices) { - auto found = std::find(choices.begin(), choices.end(), val); - int ind = 0; - if (found != choices.end()) { - ind = std::distance(choices.begin(), found); - } - return ind; -} - -void AutotuneStrategy::updateBest(const Args& args) { - bestArgs_ = args; - bestMinnIndex_ = getIndex(args.minn, minnChoices_); - bestDsubExponent_ = log2(args.dsub); - if (args.bucket != 0) { - bestNonzeroBucket_ = args.bucket; - } -} - -Autotune::Autotune(const std::shared_ptr& fastText) - : fastText_(fastText), - elapsed_(0.), - bestScore_(0.), - trials_(0), - sizeConstraintFailed_(0), - continueTraining_(false), - strategy_(), - timer_() {} - -void Autotune::printInfo(double maxDuration) { - double progress = elapsed_ * 100 / maxDuration; - progress = std::min(progress, 100.0); - - std::cerr << "\r"; - std::cerr << std::fixed; - std::cerr << "Progress: "; - std::cerr << std::setprecision(1) << std::setw(5) << progress << "%"; - std::cerr << " Trials: " << std::setw(4) << trials_; - std::cerr << " Best score: " << std::setw(9) << std::setprecision(6); - if (bestScore_ == kUnknownBestScore) { - std::cerr << "unknown"; - } else { - std::cerr << bestScore_; - } - std::cerr << " ETA: " - << utils::ClockPrint(std::max(maxDuration - elapsed_, 0.0)); - std::cerr << std::flush; -} - -void Autotune::timer( - const std::chrono::steady_clock::time_point& start, - double maxDuration) { - elapsed_ = 0.0; - while (keepTraining(maxDuration)) { - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - elapsed_ = utils::getDuration(start, std::chrono::steady_clock::now()); - printInfo(maxDuration); - } - abort(); -} - -bool Autotune::keepTraining(double maxDuration) const { - return continueTraining_ && 
elapsed_ < maxDuration; -} - -void Autotune::abort() { - if (continueTraining_) { - continueTraining_ = false; - fastText_->abort(); - } -} - -void Autotune::startTimer(const Args& args) { - std::chrono::steady_clock::time_point start = - std::chrono::steady_clock::now(); - timer_ = std::thread([=]() { timer(start, args.autotuneDuration); }); - bestScore_ = kUnknownBestScore; - trials_ = 0; - continueTraining_ = true; - - auto previousSignalHandler = std::signal(SIGINT, signalHandler); - interruptSignalHandler = [&]() { - std::signal(SIGINT, previousSignalHandler); - std::cerr << std::endl << "Aborting autotune..." << std::endl; - abort(); - }; -} - -double Autotune::getMetricScore( - Meter& meter, - const metric_name& metricName, - const double metricValue, - const std::string& metricLabel) const { - double score = 0.0; - int32_t labelId = -1; - if (!metricLabel.empty()) { - labelId = fastText_->getLabelId(metricLabel); - if (labelId == -1) { - throw std::runtime_error("Unknown autotune metric label"); - } - } - if (metricName == metric_name::f1score) { - score = meter.f1Score(); - } else if (metricName == metric_name::f1scoreLabel) { - score = meter.f1Score(labelId); - } else if (metricName == metric_name::precisionAtRecall) { - score = meter.precisionAtRecall(metricValue); - } else if (metricName == metric_name::precisionAtRecallLabel) { - score = meter.precisionAtRecall(labelId, metricValue); - } else if (metricName == metric_name::recallAtPrecision) { - score = meter.recallAtPrecision(metricValue); - } else if (metricName == metric_name::recallAtPrecisionLabel) { - score = meter.recallAtPrecision(labelId, metricValue); - } else { - throw std::runtime_error("Unknown metric"); - } - return score; -} - -void Autotune::printArgs(const Args& args, const Args& autotuneArgs) { - LOG_VAL(epoch, args.epoch) - LOG_VAL(lr, args.lr) - LOG_VAL(dim, args.dim) - LOG_VAL(minCount, args.minCount) - LOG_VAL(wordNgrams, args.wordNgrams) - LOG_VAL(minn, args.minn) - LOG_VAL(maxn, args.maxn) - LOG_VAL(bucket, args.bucket) - LOG_VAL(dsub, args.dsub) - LOG_VAL(loss, args.lossToString(args.loss)) -} - -int Autotune::getCutoffForFileSize( - bool qout, - bool qnorm, - int dsub, - int64_t fileSize) const { - int64_t outModelSize = 0; - const int64_t outM = fastText_->getOutputMatrix()->size(0); - const int64_t outN = fastText_->getOutputMatrix()->size(1); - if (qout) { - const int64_t outputPqSize = 16 + 4 * (outN * (1 << 8)); - outModelSize = - 21 + (outM * ((outN + 2 - 1) / 2)) + outputPqSize + (qnorm ? outM : 0); - } else { - outModelSize = 16 + 4 * (outM * outN); - } - const int64_t dim = fastText_->getInputMatrix()->size(1); - - int target = (fileSize - (107) - 4 * (1 << 8) * dim - outModelSize); - int cutoff = target / ((dim + dsub - 1) / dsub + (qnorm ? 
1 : 0) + 10); - - return std::max(cutoff, kCutoffLimit); -} - -bool Autotune::quantize(Args& args, const Args& autotuneArgs) { - if (autotuneArgs.getAutotuneModelSize() == Args::kUnlimitedModelSize) { - return true; - } - auto outputSize = fastText_->getOutputMatrix()->size(0); - - args.qnorm = true; - args.qout = (outputSize >= kCutoffLimit); - args.retrain = true; - args.cutoff = getCutoffForFileSize( - args.qout, args.qnorm, args.dsub, autotuneArgs.getAutotuneModelSize()); - LOG_VAL(cutoff, args.cutoff); - if (args.cutoff == kCutoffLimit) { - return false; - } - fastText_->quantize(args); - - return true; -} - -void Autotune::printSkippedArgs(const Args& autotuneArgs) { - std::unordered_set argsToCheck = {"epoch", - "lr", - "dim", - "wordNgrams", - "loss", - "bucket", - "minn", - "maxn", - "dsub"}; - for (const auto& arg : argsToCheck) { - if (autotuneArgs.isManual(arg)) { - std::cerr << "Warning : " << arg - << " is manually set to a specific value. " - << "It will not be automatically optimized." << std::endl; - } - } -} - -void Autotune::train(const Args& autotuneArgs) { - std::ifstream validationFileStream(autotuneArgs.autotuneValidationFile); - if (!validationFileStream.is_open()) { - throw std::invalid_argument("Validation file cannot be opened!"); - } - printSkippedArgs(autotuneArgs); - - bool sizeConstraintWarning = false; - int verbose = autotuneArgs.verbose; - Args bestTrainArgs(autotuneArgs); - Args trainArgs(autotuneArgs); - trainArgs.verbose = 0; - strategy_ = std::unique_ptr( - new AutotuneStrategy(trainArgs, autotuneArgs.seed)); - startTimer(autotuneArgs); - - while (keepTraining(autotuneArgs.autotuneDuration)) { - trials_++; - - trainArgs = strategy_->ask(elapsed_); - LOG_VAL(Trial, trials_) - printArgs(trainArgs, autotuneArgs); - ElapsedTimeMarker elapsedTimeMarker; - double currentScore = std::numeric_limits::quiet_NaN(); - try { - fastText_->train(trainArgs); - bool sizeConstraintOK = quantize(trainArgs, autotuneArgs); - if (sizeConstraintOK) { - const auto& metricLabel = autotuneArgs.getAutotuneMetricLabel(); - Meter meter(!metricLabel.empty()); - fastText_->test( - validationFileStream, autotuneArgs.autotunePredictions, 0.0, meter); - - currentScore = getMetricScore( - meter, - autotuneArgs.getAutotuneMetric(), - autotuneArgs.getAutotuneMetricValue(), - metricLabel); - - if (bestScore_ == kUnknownBestScore || (currentScore > bestScore_)) { - bestTrainArgs = trainArgs; - bestScore_ = currentScore; - strategy_->updateBest(bestTrainArgs); - } - } else { - sizeConstraintFailed_++; - if (!sizeConstraintWarning && trials_ > 10 && - sizeConstraintFailed_ > (trials_ / 2)) { - sizeConstraintWarning = true; - std::cerr << std::endl - << "Warning : requested model size is probably too small. " - "You may want to increase `autotune-modelsize`." 
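              // (getCutoffForFileSize above inverts the .ftz size formula:
              // it subtracts the header and output-matrix bytes from the
              // requested file size, then divides by the per-row cost of a
              // quantized vector, (dim + dsub - 1) / dsub codes, plus one
              // norm code when qnorm is set, plus bookkeeping, and clamps
              // at kCutoffLimit; a clamped result means the budget cannot
              // be met and the trial counts as a size-constraint failure.)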
- << std::endl; - } - } - } catch (DenseMatrix::EncounteredNaNError&) { - // ignore diverging loss and go on - } catch (std::bad_alloc&) { - // ignore parameter samples asking too much memory - } catch (TimeoutError&) { - break; - } catch (FastText::AbortError&) { - break; - } - LOG_VAL_NAN(currentScore, currentScore) - LOG_VAL(train took, elapsedTimeMarker.getElapsed()) - } - if (timer_.joinable()) { - timer_.join(); - } - - if (bestScore_ == kUnknownBestScore) { - std::string errorMessage; - if (sizeConstraintWarning) { - errorMessage = - "Couldn't fulfil model size constraint: please increase " - "`autotune-modelsize`."; - } else { - errorMessage = - "Didn't have enough time to train once: please increase " - "`autotune-duration`."; - } - throw std::runtime_error(errorMessage); - } else { - std::cerr << std::endl; - std::cerr << "Training again with best arguments" << std::endl; - bestTrainArgs.verbose = verbose; - LOG_VAL(Best selected args, 0) - printArgs(bestTrainArgs, autotuneArgs); - fastText_->train(bestTrainArgs); - quantize(bestTrainArgs, autotuneArgs); - } -} - -} // namespace fasttext diff --git a/src/autotune.h b/src/autotune.h deleted file mode 100644 index 8b300ae5d..000000000 --- a/src/autotune.h +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "args.h" -#include "fasttext.h" - -namespace fasttext { - -class AutotuneStrategy { - private: - Args bestArgs_; - int maxDuration_; - std::minstd_rand rng_; - int trials_; - int bestMinnIndex_; - int bestDsubExponent_; - int bestNonzeroBucket_; - int originalBucket_; - std::vector minnChoices_; - int getIndex(int val, const std::vector& choices); - - public: - explicit AutotuneStrategy( - const Args& args, - std::minstd_rand::result_type seed); - Args ask(double elapsed); - void updateBest(const Args& args); -}; - -class Autotune { - protected: - std::shared_ptr fastText_; - double elapsed_; - double bestScore_; - int32_t trials_; - int32_t sizeConstraintFailed_; - std::atomic continueTraining_; - std::unique_ptr strategy_; - std::thread timer_; - - bool keepTraining(double maxDuration) const; - void printInfo(double maxDuration); - void timer( - const std::chrono::steady_clock::time_point& start, - double maxDuration); - void abort(); - void startTimer(const Args& args); - double getMetricScore( - Meter& meter, - const metric_name& metricName, - const double metricValue, - const std::string& metricLabel) const; - void printArgs(const Args& args, const Args& autotuneArgs); - void printSkippedArgs(const Args& autotuneArgs); - bool quantize(Args& args, const Args& autotuneArgs); - int getCutoffForFileSize(bool qout, bool qnorm, int dsub, int64_t fileSize) - const; - - class TimeoutError : public std::runtime_error { - public: - TimeoutError() : std::runtime_error("Autotune timed out.") {} - }; - - public: - Autotune() = delete; - explicit Autotune(const std::shared_ptr& fastText); - Autotune(const Autotune&) = delete; - Autotune(Autotune&&) = delete; - Autotune& operator=(const Autotune&) = delete; - Autotune& operator=(Autotune&&) = delete; - ~Autotune() noexcept = default; - - void train(const Args& args); -}; - -} // namespace fasttext diff --git a/src/loss.cc b/src/loss.cc index 285eb9f16..04da9777c 100644 --- a/src/loss.cc +++ b/src/loss.cc @@ -95,25 +95,6 
@@ void Loss::findKBest( BinaryLogisticLoss::BinaryLogisticLoss(std::shared_ptr& wo) : Loss(wo) {} -real BinaryLogisticLoss::binaryLogistic( - int32_t target, - Model::State& state, - bool labelIsPositive, - real lr, - bool backprop) const { - real score = sigmoid(wo_->dotRow(state.hidden, target)); - if (backprop) { - real alpha = lr * (real(labelIsPositive) - score); - state.grad.addRow(*wo_, target, alpha); - wo_->addVectorToRow(state.hidden, target, alpha); - } - if (labelIsPositive) { - return -log(score); - } else { - return -log(1.0 - score); - } -} - void BinaryLogisticLoss::computeOutput(Model::State& state) const { Vector& output = state.output; output.mul(*wo_, state.hidden); @@ -126,22 +107,6 @@ void BinaryLogisticLoss::computeOutput(Model::State& state) const { OneVsAllLoss::OneVsAllLoss(std::shared_ptr& wo) : BinaryLogisticLoss(wo) {} -real OneVsAllLoss::forward( - const std::vector& targets, - int32_t /* we take all targets here */, - Model::State& state, - real lr, - bool backprop) { - real loss = 0.0; - int32_t osz = state.output.size(); - for (int32_t i = 0; i < osz; i++) { - bool isMatch = utils::contains(targets, i); - loss += binaryLogistic(i, state, isMatch, lr, backprop); - } - - return loss; -} - NegativeSamplingLoss::NegativeSamplingLoss( std::shared_ptr& wo, int neg, @@ -161,34 +126,6 @@ NegativeSamplingLoss::NegativeSamplingLoss( uniform_ = std::uniform_int_distribution(0, negatives_.size() - 1); } -real NegativeSamplingLoss::forward( - const std::vector& targets, - int32_t targetIndex, - Model::State& state, - real lr, - bool backprop) { - assert(targetIndex >= 0); - assert(targetIndex < targets.size()); - int32_t target = targets[targetIndex]; - real loss = binaryLogistic(target, state, true, lr, backprop); - - for (int32_t n = 0; n < neg_; n++) { - auto negativeTarget = getNegative(target, state.rng); - loss += binaryLogistic(negativeTarget, state, false, lr, backprop); - } - return loss; -} - -int32_t NegativeSamplingLoss::getNegative( - int32_t target, - std::minstd_rand& rng) { - int32_t negative; - do { - negative = negatives_[uniform_(rng)]; - } while (target == negative); - return negative; -} - HierarchicalSoftmaxLoss::HierarchicalSoftmaxLoss( std::shared_ptr& wo, const std::vector& targetCounts) @@ -244,22 +181,6 @@ void HierarchicalSoftmaxLoss::buildTree(const std::vector& counts) { } } -real HierarchicalSoftmaxLoss::forward( - const std::vector& targets, - int32_t targetIndex, - Model::State& state, - real lr, - bool backprop) { - real loss = 0.0; - int32_t target = targets[targetIndex]; - const std::vector& binaryCode = codes_[target]; - const std::vector& pathToRoot = paths_[target]; - for (int32_t i = 0; i < pathToRoot.size(); i++) { - loss += binaryLogistic(pathToRoot[i], state, binaryCode[i], lr, backprop); - } - return loss; -} - void HierarchicalSoftmaxLoss::predict( int32_t k, real threshold, @@ -319,28 +240,4 @@ void SoftmaxLoss::computeOutput(Model::State& state) const { } } -real SoftmaxLoss::forward( - const std::vector& targets, - int32_t targetIndex, - Model::State& state, - real lr, - bool backprop) { - computeOutput(state); - - assert(targetIndex >= 0); - assert(targetIndex < targets.size()); - int32_t target = targets[targetIndex]; - - if (backprop) { - int32_t osz = wo_->size(0); - for (int32_t i = 0; i < osz; i++) { - real label = (i == target) ? 
1.0 : 0.0; - real alpha = lr * (label - state.output[i]); - state.grad.addRow(*wo_, i, alpha); - wo_->addVectorToRow(state.hidden, i, alpha); - } - } - return -log(state.output[target]); -}; - } // namespace fasttext diff --git a/src/loss.h b/src/loss.h index 3aea72f87..ad73a7917 100644 --- a/src/loss.h +++ b/src/loss.h @@ -40,12 +40,6 @@ class Loss { explicit Loss(std::shared_ptr& wo); virtual ~Loss() = default; - virtual real forward( - const std::vector& targets, - int32_t targetIndex, - Model::State& state, - real lr, - bool backprop) = 0; virtual void computeOutput(Model::State& state) const = 0; virtual void predict( @@ -56,14 +50,6 @@ class Loss { }; class BinaryLogisticLoss : public Loss { - protected: - real binaryLogistic( - int32_t target, - Model::State& state, - bool labelIsPositive, - real lr, - bool backprop) const; - public: explicit BinaryLogisticLoss(std::shared_ptr& wo); virtual ~BinaryLogisticLoss() noexcept override = default; @@ -74,12 +60,6 @@ class OneVsAllLoss : public BinaryLogisticLoss { public: explicit OneVsAllLoss(std::shared_ptr& wo); ~OneVsAllLoss() noexcept override = default; - real forward( - const std::vector& targets, - int32_t targetIndex, - Model::State& state, - real lr, - bool backprop) override; }; class NegativeSamplingLoss : public BinaryLogisticLoss { @@ -89,7 +69,6 @@ class NegativeSamplingLoss : public BinaryLogisticLoss { int neg_; std::vector negatives_; std::uniform_int_distribution uniform_; - int32_t getNegative(int32_t target, std::minstd_rand& rng); public: explicit NegativeSamplingLoss( @@ -98,12 +77,6 @@ class NegativeSamplingLoss : public BinaryLogisticLoss { const std::vector& targetCounts); ~NegativeSamplingLoss() noexcept override = default; - real forward( - const std::vector& targets, - int32_t targetIndex, - Model::State& state, - real lr, - bool backprop) override; }; class HierarchicalSoftmaxLoss : public BinaryLogisticLoss { @@ -134,12 +107,6 @@ class HierarchicalSoftmaxLoss : public BinaryLogisticLoss { std::shared_ptr& wo, const std::vector& counts); ~HierarchicalSoftmaxLoss() noexcept override = default; - real forward( - const std::vector& targets, - int32_t targetIndex, - Model::State& state, - real lr, - bool backprop) override; void predict( int32_t k, real threshold, @@ -151,12 +118,6 @@ class SoftmaxLoss : public Loss { public: explicit SoftmaxLoss(std::shared_ptr& wo); ~SoftmaxLoss() noexcept override = default; - real forward( - const std::vector& targets, - int32_t targetIndex, - Model::State& state, - real lr, - bool backprop) override; void computeOutput(Model::State& state) const override; }; diff --git a/src/main.cc b/src/main.cc deleted file mode 100644 index 7e039d08a..000000000 --- a/src/main.cc +++ /dev/null @@ -1,454 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
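 *
 * On the loss.cc hunks above: binaryLogistic computes
 * score = sigmoid(wo . hidden) for one output row; with backprop enabled
 * it applies alpha = lr * (label - score) to both the accumulated input
 * gradient and the output row, which is the derivative of the log loss,
 * -log(score) for a positive label or -log(1 - score) for a negative one,
 * with respect to that row's logit.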
- */
-
-#include <iostream>
-#include <queue>
-#include <string>
-#include <vector>
-#include "args.h"
-#include "autotune.h"
-#include "fasttext.h"
-
-using namespace fasttext;
-
-void printUsage() {
-  std::cerr
-      << "usage: fasttext <command> <args>\n\n"
-      << "The commands supported by fasttext are:\n\n"
-      << "  supervised              train a supervised classifier\n"
-      << "  quantize                quantize a model to reduce the memory "
-         "usage\n"
-      << "  test                    evaluate a supervised classifier\n"
-      << "  test-label              print labels with precision and recall "
-         "scores\n"
-      << "  predict                 predict most likely labels\n"
-      << "  predict-prob            predict most likely labels with "
-         "probabilities\n"
-      << "  skipgram                train a skipgram model\n"
-      << "  cbow                    train a cbow model\n"
-      << "  print-word-vectors      print word vectors given a trained model\n"
-      << "  print-sentence-vectors  print sentence vectors given a trained "
-         "model\n"
-      << "  print-ngrams            print ngrams given a trained model and "
-         "word\n"
-      << "  nn                      query for nearest neighbors\n"
-      << "  analogies               query for analogies\n"
-      << "  dump                    dump arguments,dictionary,input/output "
-         "vectors\n"
-      << std::endl;
-}
-
-void printQuantizeUsage() {
-  std::cerr << "usage: fasttext quantize <args>" << std::endl;
-}
-
-void printTestUsage() {
-  std::cerr
-      << "usage: fasttext test <model> <test-data> [<k>] [<th>]\n\n"
-      << "  <model>      model filename\n"
-      << "  <test-data>  test data filename (if -, read from stdin)\n"
-      << "  <k>          (optional; 1 by default) predict top k labels\n"
-      << "  <th>         (optional; 0.0 by default) probability threshold\n"
-      << std::endl;
-}
-
-void printPredictUsage() {
-  std::cerr
-      << "usage: fasttext predict[-prob] <model> <test-data> [<k>] [<th>]\n\n"
-      << "  <model>      model filename\n"
-      << "  <test-data>  test data filename (if -, read from stdin)\n"
-      << "  <k>          (optional; 1 by default) predict top k labels\n"
-      << "  <th>         (optional; 0.0 by default) probability threshold\n"
-      << std::endl;
-}
-
-void printTestLabelUsage() {
-  std::cerr
-      << "usage: fasttext test-label <model> <test-data> [<k>] [<th>]\n\n"
-      << "  <model>      model filename\n"
-      << "  <test-data>  test data filename\n"
-      << "  <k>          (optional; 1 by default) predict top k labels\n"
-      << "  <th>         (optional; 0.0 by default) probability threshold\n"
-      << std::endl;
-}
-
-void printPrintWordVectorsUsage() {
-  std::cerr << "usage: fasttext print-word-vectors <model>\n\n"
-            << "  <model>      model filename\n"
-            << std::endl;
-}
-
-void printPrintSentenceVectorsUsage() {
-  std::cerr << "usage: fasttext print-sentence-vectors <model>\n\n"
-            << "  <model>      model filename\n"
-            << std::endl;
-}
-
-void printPrintNgramsUsage() {
-  std::cerr << "usage: fasttext print-ngrams <model> <word>\n\n"
-            << "  <model>      model filename\n"
-            << "  <word>       word to print\n"
-            << std::endl;
-}
-
-void quantize(const std::vector<std::string>& args) {
-  Args a = Args();
-  if (args.size() < 3) {
-    printQuantizeUsage();
-    a.printHelp();
-    exit(EXIT_FAILURE);
-  }
-  a.parseArgs(args);
-  FastText fasttext;
-  // parseArgs checks if a->output is given.
-  fasttext.loadModel(a.output + ".bin");
-  fasttext.quantize(a);
-  fasttext.saveModel(a.output + ".ftz");
-  exit(0);
-}
-
-void printNNUsage() {
-  std::cout << "usage: fasttext nn <model> <k>\n\n"
-            << "  <model>      model filename\n"
-            << "  <k>          (optional; 10 by default) predict top k labels\n"
-            << std::endl;
-}
-
-void printAnalogiesUsage() {
-  std::cout << "usage: fasttext analogies <model> <k>\n\n"
-            << "  <model>      model filename\n"
-            << "  <k>          (optional; 10 by default) predict top k labels\n"
-            << std::endl;
-}
-
-void printDumpUsage() {
-  std::cout << "usage: fasttext dump