diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fd5bee..6613604 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,9 +7,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) find_package(SparseHash) if(SPARSEHASH_FOUND) add_definitions(-DHAVE_SPARSEHASH) + include_directories(${SPARSEHASH_INCLUDE_DIR}) endif(SPARSEHASH_FOUND) -find_package(OpenMP QUIET) +find_package(OpenMP) if (OPENMP_FOUND) set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif(OPENMP_FOUND) diff --git a/README.md b/README.md index 5f1d0ec..e4329ff 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,22 @@ These can be symmetrized using the included `atools` command using a variety of ./atools -i forward.align -j reverse.align -c grow-diag-final-and + +## Compiling for Windows + +To build 64-bit binaries with libsparsehash on Windows using Visual Studio 2017, the following recipe works: + + + cd d:\src + git clone https://github.com/clab/fast_align.git + git clone https://github.com/sparsehash/sparsehash.git + cd d:\src\fast_align + mkdir build64 + cd build64 + cmake -G "Visual Studio 15 2017 Win64" -D SPARSEHASH_INCLUDE_DIR=d:\src\sparsehash\src .. + cmake --build . --config Release + + ## Output `fast_align` produces outputs in the widely-used `i-j` “Pharaoh format,” where a pair `i-j` indicates that the ith word (zero-indexed) of the left language (by convention, the *source* language) is aligned to the jth word of the right sentence (by convention, the *target* language). 
For example, a good alignment of the above German–English corpus would be: diff --git a/src/atools.cc b/src/atools.cc index ef8967f..628994f 100644 --- a/src/atools.cc +++ b/src/atools.cc @@ -6,7 +6,11 @@ #include #include #include +#ifndef _MSC_VER #include +#else +#include "getopt.h" +#endif #include "alignment_io.h" @@ -16,22 +20,27 @@ struct option options[] = { {"input_1", required_argument, 0, 'i'}, {"input_2", required_argument, 0, 'j'}, {"command", required_argument, 0, 'c'}, + {"outfile", required_argument, 0, 'o'}, {0,0,0,0} }; string input_1; string input_2; string command; +ofstream outfileStream; +ostream* outstream = &cout; + bool InitCommandLine(int argc, char** argv) { while (1) { int oi; - int c = getopt_long(argc, argv, "i:j:c:", options, &oi); + int c = getopt_long(argc, argv, "i:j:c:o:", options, &oi); if (c == -1) break; switch(c) { case 'i': input_1 = optarg; break; case 'j': input_2 = optarg; break; case 'c': command = optarg; break; + case 'o': outfileStream.open(optarg); outstream = &outfileStream; break; default: return false; } } @@ -308,7 +317,7 @@ int main(int argc, char **argv) { AddCommand(); AddCommand(); if (!InitCommandLine(argc, argv)) { - cerr << "Usage: " << argv[0] << " -c COMMAND -i FILE1.AL [-j FILE2.AL]\n"; + cerr << "Usage: " << argv[0] << " -c COMMAND -i FILE1.AL [-j FILE2.AL] [-o OUTPATH]\n"; cerr << "Valid options for COMMAND:"; for (auto it : commands) cerr << ' ' << it.first; @@ -368,7 +377,7 @@ int main(int argc, char **argv) { } if (cmd.Result() == 1) { - AlignmentIO::SerializePharaohFormat(*out, &cout); + AlignmentIO::SerializePharaohFormat(*out, outstream); } } if (cmd.Result() == 2) diff --git a/src/corpus.h b/src/corpus.h index fc98e0d..71ef41f 100644 --- a/src/corpus.h +++ b/src/corpus.h @@ -22,7 +22,7 @@ class Dict { return (x == ' ' || x == '\t'); } - inline void ConvertWhitespaceDelimitedLine(const std::string& line, std::vector* out) { + inline void ConvertWhitespaceDelimitedLine(const std::string& line, 
std::vector* out, bool frozen=false) { size_t cur = 0; size_t last = 0; int state = 0; @@ -30,7 +30,7 @@ class Dict { while(cur < line.size()) { if (is_ws(line[cur++])) { if (state == 0) continue; - out->push_back(Convert(line.substr(last, cur - last - 1))); + out->push_back(Convert(line.substr(last, cur - last - 1), frozen)); state = 0; } else { if (state == 1) continue; diff --git a/src/fast_align.cc b/src/fast_align.cc index 637af26..9d72d83 100644 --- a/src/fast_align.cc +++ b/src/fast_align.cc @@ -14,12 +14,20 @@ // #include +#include #include #include #include #include +#ifndef _MSC_VER #include +#else +#define NOMINMAX +#include "getopt.h" +#endif #include +#include + #include "src/corpus.h" #include "src/ttables.h" @@ -37,12 +45,13 @@ Dict d; // integerization map void ParseLine(const string& line, vector* src, - vector* trg) { + vector* trg, + bool frozen=false) { static const unsigned kDIV = d.Convert("|||"); vector tmp; src->clear(); trg->clear(); - d.ConvertWhitespaceDelimitedLine(line, &tmp); + d.ConvertWhitespaceDelimitedLine(line, &tmp, frozen); unsigned i = 0; while (i < tmp.size() && tmp[i] != kDIV) { src->push_back(tmp[i]); @@ -56,10 +65,15 @@ void ParseLine(const string& line, } string input; +ostream* outputStream = &cout; +//used if user specifies -O option to output to file +ofstream outputFileStream; + string conditional_probability_filename = ""; string input_model_file = ""; double mean_srclen_multiplier = 1.0; int is_reverse = 0; +int print_alignments_only = 0; int ITERATIONS = 5; int favor_diagonal = 0; double beam_threshold = -4.0; @@ -88,16 +102,20 @@ struct option options[] = { {"no_null_word", no_argument, &no_null_word, 1 }, {"conditional_probabilities", required_argument, 0, 'p'}, {"thread_buffer_size", required_argument, 0, 'b'}, + {"output_file", required_argument, 0, 'O'}, + {"num_threads", required_argument, 0, 'n'}, + {"print_alignments_only",no_argument, &print_alignments_only,'A'}, {0,0,0,0} }; bool InitCommandLine(int argc, 
char** argv) { while (1) { int oi; - int c = getopt_long(argc, argv, "i:rI:df:m:t:q:T:ova:Np:b:s", options, &oi); + int c = getopt_long(argc, argv, "i:rI:df:m:t:q:T:ova:Np:b:sO:n:A", options, &oi); if (c == -1) break; cerr << "ARG=" << (char)c << endl; switch(c) { + case 'A': print_alignments_only = 1; break; case 'i': input = optarg; break; case 'r': is_reverse = 1; break; case 'I': ITERATIONS = atoi(optarg); break; @@ -108,9 +126,11 @@ bool InitCommandLine(int argc, char** argv) { case 'q': prob_align_null = atof(optarg); break; case 'T': favor_diagonal = 1; diagonal_tension = atof(optarg); break; case 'o': optimize_tension = 1; break; + case 'O': outputFileStream.open(optarg); outputStream = &outputFileStream; break; case 'v': variational_bayes = 1; break; case 'a': alpha = atof(optarg); break; case 'N': no_null_word = 1; break; + case 'n': omp_set_num_threads(atoi(optarg)); break; case 'p': conditional_probability_filename = optarg; break; case 'b': thread_buffer_size = atoi(optarg); break; case 's': print_scores = 1; break; @@ -225,7 +245,11 @@ inline void AddTranslationOptions(vector >& insert_buffer, TTable* s2t) { s2t->SetMaxE(insert_buffer.size()-1); #pragma omp parallel for schedule(dynamic) +#ifndef _MSC_VER for (unsigned e = 0; e < insert_buffer.size(); ++e) { +#else + for (int e = 0; e < insert_buffer.size(); ++e) { +#endif for (unsigned f : insert_buffer[e]) { s2t->Insert(e, f); } @@ -295,23 +319,142 @@ void InitialPass(const unsigned kNULL, const bool use_null, TTable* s2t, cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; }
+// Force-aligns one "source ||| target" line against the loaded translation
+// table and returns the log-probability of the sentence pair.
+//
+//   line                 raw input line; parsed via ParseLine(..., frozen=true)
+//   lc                   input line number, used only in the error message
+//   prob_align_not_null  divisor applied to the diagonal normaliser Z
+//   use_null             true if target words may align to the NULL word
+//   ret                  out-parameter the result text is appended to: the
+//                        "src ||| trg |||" echo (unless print_alignments_only
+//                        is set) followed by the space-separated i-j pairs
+//   kNULL                dictionary id used for the NULL word
+//   s2t                  translation table queried through safe_prob()
+//
+// Nothing is written to outputStream here. AlignBuffer calls this function
+// from an OpenMP parallel loop, where writes to the shared stream would race
+// and the echoed sentence would be separated from its alignment.
+double ForceAlign(const string& line, int lc, double prob_align_not_null, bool use_null, string& ret, const unsigned kNULL, const TTable& s2t)
+{
+  vector<unsigned> src, trg;
+  // Capacity hint only; the number of spaces approximates the token count.
+  const auto totalSpaces = std::count(line.begin(), line.end(), ' ');
+  src.reserve(totalSpaces);
+  trg.reserve(totalSpaces);
+  ParseLine(line, &src, &trg, true);
+  ret.reserve(1024);
+  if (!print_alignments_only)
+  {
+    for (auto s : src) { ret.append(d.Convert(s)); ret.append(" "); }
+    ret.append("|||");
+    for (auto t : trg) { ret.append(" "); ret.append(d.Convert(t)); }
+    ret.append(" |||");
+  }
+  if (is_reverse)
+    swap(src, trg);
+  if (src.size() == 0 || trg.size() == 0) {
+    // Reported but not fatal: the remaining code still runs for this line.
+    cerr << "Error in line " << lc << endl;
+  }
+  double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier);
+  // The first pair needs a leading space only when it follows the echoed sentence.
+  bool need_space = !print_alignments_only;
+
+  // compute likelihood
+  for (unsigned j = 0; j < trg.size(); ++j) {
+    unsigned f_j = trg[j];
+    double sum = 0;
+    int a_j = 0;
+    double max_pat = 0;
+    double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1)
+    if (use_null) {
+      if (favor_diagonal) prob_a_i = prob_align_null;
+      max_pat = s2t.safe_prob(kNULL, f_j) * prob_a_i;
+      sum += max_pat;
+    }
+    double az = 0;
+    if (favor_diagonal)
+      az = DiagonalAlignment::ComputeZ(j + 1, trg.size(), src.size(), diagonal_tension) / prob_align_not_null;
+    for (unsigned i = 1; i <= src.size(); ++i) {
+      if (favor_diagonal)
+        prob_a_i = DiagonalAlignment::UnnormalizedProb(j + 1, i, trg.size(), src.size(), diagonal_tension) / az;
+      double pat = s2t.safe_prob(src[i - 1], f_j) * prob_a_i;
+      if (pat > max_pat) { max_pat = pat; a_j = i; }
+      sum += pat;
+    }
+    log_prob += log(sum);
+    // a_j stays 0 when no source word beat the NULL/initial score; no pair is emitted then.
+    if (a_j > 0) {
+      if (need_space) ret.append(" ");
+      need_space = true;
+      // std::to_string replaces itoa(), which is not part of standard C or C++.
+      const string i_txt = std::to_string(a_j - 1);
+      const string j_txt = std::to_string(j);
+      ret.append(is_reverse ? j_txt : i_txt);
+      ret.append("-");
+      ret.append(is_reverse ? i_txt : j_txt);
+    }
+  }
+  return log_prob;
+}
+
+// Force-aligns every line of buffer in parallel, writes one record per line
+// to outputStream in input order and returns the summed log-probability.
+// lc must be the line number of the last entry in buffer; it is only used to
+// derive the per-line numbers that ForceAlign reports in error messages.
+double AlignBuffer(const vector<string>& buffer, int& lc, double prob_align_not_null, bool use_null, const unsigned int kNULL, TTable& s2t)
+{
+  // Signed index and bound: MSVC's OpenMP 2.0 requires a signed loop variable.
+  const int count = static_cast<int>(buffer.size());
+  const int first_line = lc - count + 1;
+  vector<double> logprobs(buffer.size());
+  vector<string> outputs(buffer.size());
+#pragma omp parallel for schedule(dynamic)
+  for (int i = 0; i < count; ++i)
+  {
+    // Every iteration writes only its own logprobs/outputs slot.
+    logprobs[i] = ForceAlign(buffer[i], first_line + i, prob_align_not_null, use_null, outputs[i], kNULL, s2t);
+  }
+  double tlp = 0.0;
+  for (int i = 0; i < count; ++i)
+  {
+    *outputStream << outputs[i];
+    if (!print_alignments_only)
+    {
+      *outputStream << " ||| " << logprobs[i];
+    }
+    // Terminate the record after the optional score so both share one line.
+    *outputStream << '\n';
+    tlp += logprobs[i];
+  }
+  return tlp;
+}
+
 int main(int argc, char** argv) { if (!InitCommandLine(argc, argv)) { - cerr << "Usage: " << argv[0] << " -i file.fr-en\n" + cerr << "Usage: " << argv[0] << " -i file.fr-en\n" << " Standard options ([USE] = strongly recommended):\n" << " -i: [REQ] Input parallel corpus\n" << " -v: [USE] Use Dirichlet prior on lexical translation distributions\n" << " -d: [USE] Favor alignment points close to the monotonic diagonoal\n" << " -o: [USE] Optimize how close to the diagonal alignment points should be\n" << " -r: Run alignment in reverse (condition on target and predict source)\n" - << " -c: Output conditional probability table\n" + << " -p: Output conditional probability table\n" + << " -O: Output to path instead of stdout\n" + << " -n: Use this many threads\n" << " Advanced options:\n" << " -I: number of iterations in EM training (default = 5)\n" << " -q: p_null parameter (default = 0.08)\n" << " -N: No null word\n" << " -a: alpha parameter for optional Dirichlet prior (default = 0.01)\n" << " -T: starting lambda for diagonal distance parameter (default = 4)\n" - << " -s: print alignment scores (alignment ||| score, disabled by default)\n"; + << " -s: print alignment scores (alignment ||| score, disabled by default)\n" + << " -f: force align, using specified input probability table (obtained via training with -p switch)\n" + << " -A: print alignments only (only applies to forced align, where default is to dump src|||tgt|||align|||p(align))\n" + << " -m: set mean source length multiplier\n" + << " -t: set beam threshold\n" + << " -b: set thread buffer size\n"; return 1; } const bool use_null = 
!no_null_word; @@ -365,7 +495,7 @@ int main(int argc, char** argv) { prob_align_not_null, &c0, &emp_feat, &likelihood, &s2t, &outputs); if (final_iteration) { for (const string& output : outputs) { - cout << output; + *outputStream << output; } } buffer.clear(); @@ -376,7 +506,7 @@ int main(int argc, char** argv) { prob_align_not_null, &c0, &emp_feat, &likelihood, &s2t, &outputs); if (final_iteration) { for (const string& output : outputs) { - cout << output; + *outputStream << output; } } buffer.clear(); @@ -402,7 +532,11 @@ int main(int argc, char** argv) { for (int ii = 0; ii < 8; ++ii) { double mod_feat = 0; #pragma omp parallel for reduction(+:mod_feat) +#ifndef _MSC_VER for(size_t i = 0; i < size_counts.size(); ++i) { +#else + for (int i = 0; i < size_counts.size(); ++i) { +#endif const pair& p = size_counts[i].first; for (short j = 1; j <= p.first; ++j) mod_feat += size_counts[i].second * DiagonalAlignment::ComputeDLogZ(j, p.first, p.second, diagonal_tension); @@ -434,58 +568,26 @@ int main(int argc, char** argv) { vector src, trg; int lc = 0; double tlp = 0; - while(getline(in, line)) { - ++lc; - ParseLine(line, &src, &trg); - for (auto s : src) cout << d.Convert(s) << ' '; - cout << "|||"; - for (auto t : trg) cout << ' ' << d.Convert(t); - cout << " |||"; - if (is_reverse) - swap(src, trg); - if (src.size() == 0 || trg.size() == 0) { - cerr << "Error in line " << lc << endl; - return 1; - } - double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier); - // compute likelihood - for (unsigned j = 0; j < trg.size(); ++j) { - unsigned f_j = trg[j]; - double sum = 0; - int a_j = 0; - double max_pat = 0; - double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) - if (use_null) { - if (favor_diagonal) prob_a_i = prob_align_null; - max_pat = s2t.safe_prob(kNULL, f_j) * prob_a_i; - sum += max_pat; - } - double az = 0; - if (favor_diagonal) - az = DiagonalAlignment::ComputeZ(j+1, trg.size(), src.size(), diagonal_tension) 
/ prob_align_not_null; - for (unsigned i = 1; i <= src.size(); ++i) { - if (favor_diagonal) - prob_a_i = DiagonalAlignment::UnnormalizedProb(j + 1, i, trg.size(), src.size(), diagonal_tension) / az; - double pat = s2t.safe_prob(src[i-1], f_j) * prob_a_i; - if (pat > max_pat) { max_pat = pat; a_j = i; } - sum += pat; - } - log_prob += log(sum); - if (true) { - if (a_j > 0) { - cout << ' '; - if (is_reverse) - cout << j << '-' << (a_j - 1); - else - cout << (a_j - 1) << '-' << j; - } - } - } - tlp += log_prob; - cout << " ||| " << log_prob << endl << flush; - } // loop over test set sentences + vector buffer; + + while (true) { + getline(in, line); + if (!in) break; + ++lc; + + buffer.push_back(line); + if (buffer.size() >= thread_buffer_size) { + tlp += AlignBuffer(buffer, lc, prob_align_not_null, use_null, kNULL, s2t); + buffer.clear(); + } + } + if (buffer.size() > 0) + { + tlp+= AlignBuffer(buffer, lc, prob_align_not_null, use_null, kNULL, s2t); + } + *outputStream << flush; cerr << "TOTAL LOG PROB " << tlp << endl; - } + } // loop over test set sentences return 0; } diff --git a/src/getopt.h b/src/getopt.h new file mode 100644 index 0000000..21a42a9 --- /dev/null +++ b/src/getopt.h @@ -0,0 +1,655 @@ +#ifdef _MSC_VER //MSVC does not provide a getopt.h, so use the one from mingw-w64. +#ifndef __GETOPT_H__ +/** + * DISCLAIMER + * This file is part of the mingw-w64 runtime package. + * + * The mingw-w64 runtime package and its code is distributed in the hope that it + * will be useful but WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESSED OR + * IMPLIED ARE HEREBY DISCLAIMED. This includes but is not limited to + * warranties of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + /* + * Copyright (c) 2002 Todd C. Miller + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F39502-99-1-0512. + */ +/*- + * Copyright (c) 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron and Thomas Klausner. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma warning(disable:4996) + +#define __GETOPT_H__ + +/* All the headers include this file. */ +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define REPLACE_GETOPT /* use this getopt as the system getopt(3) */ + +#ifdef REPLACE_GETOPT +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ +#undef optreset /* see getopt.h */ +#define optreset __mingw_optreset +int optreset; /* reset getopt */ +char *optarg; /* argument associated with option */ +#endif + +//extern int optind; /* index of first non-option in argv */ +//extern int optopt; /* single option character, as parsed */ +//extern int opterr; /* flag to enable built-in diagnostics... */ +// /* (user may set to zero, to suppress) */ +// +//extern char *optarg; /* pointer to argument of current option */ + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH (int)'?' +#define BADARG ((*options == ':') ? 
(int)':' : (int)'?') +#define INORDER (int)1 + +#ifndef __CYGWIN__ +#define __progname __argv[0] +#else +extern char __declspec(dllimport) *__progname; +#endif + +#ifdef __CYGWIN__ +static char EMSG[] = ""; +#else +#define EMSG "" +#endif + +static int getopt_internal(int, char * const *, const char *, + const struct option *, int *, int); +static int parse_long_options(char * const *, const char *, + const struct option *, int *, int); +static int gcd(int, int); +static void permute_args(int, int, int, char * const *); + +static char *place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static const char recargchar[] = "option requires an argument -- %c"; +static const char recargstring[] = "option requires an argument -- %s"; +static const char ambig[] = "ambiguous option -- %.*s"; +static const char noarg[] = "option doesn't take an argument -- %.*s"; +static const char illoptchar[] = "unknown option -- %c"; +static const char illoptstring[] = "unknown option -- %s"; + +static void +_vwarnx(const char *fmt,va_list ap) +{ + (void)fprintf(stderr,"%s: ",__progname); + if (fmt != NULL) + (void)vfprintf(stderr,fmt,ap); + (void)fprintf(stderr,"\n"); +} + +static void +warnx(const char *fmt,...) +{ + va_list ap; + va_start(ap,fmt); + _vwarnx(fmt,ap); + va_end(ap); +} + +/* + * Compute the greatest common divisor of a and b. + */ +static int +gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). 
+ */ +static void +permute_args(int panonopt_start, int panonopt_end, int opt_end, + char * const *nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char *swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) { + cstart = panonopt_end+i; + pos = cstart; + for (j = 0; j < cyclelen; j++) { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char **) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char **)nargv)[cstart] = swap; + } + } +} + +#ifdef REPLACE_GETOPT +/* + * getopt -- + * Parse argc/argv argument vector. + * + * [eventually this will replace the BSD getopt] + */ +int +getopt(int nargc, char * const *nargv, const char *options) +{ + + /* + * We don't pass FLAG_PERMUTE to getopt_internal() since + * the BSD getopt(3) (unlike GNU) has never done this. + * + * Furthermore, since many privileged programs call getopt() + * before dropping privileges it makes sense to keep things + * as simple (and bug-free) as possible. + */ + return (getopt_internal(nargc, nargv, options, NULL, NULL, 0)); +} +#endif /* REPLACE_GETOPT */ + +//extern int getopt(int nargc, char * const *nargv, const char *options); + +#ifdef _BSD_SOURCE +/* + * BSD adds the non-standard `optreset' feature, for reinitialisation + * of `getopt' parsing. We support this feature, for applications which + * proclaim their BSD heritage, before including this header; however, + * to maintain portability, developers are advised to avoid it. + */ +# define optreset __mingw_optreset +extern int optreset; +#endif +#ifdef __cplusplus +} +#endif +/* + * POSIX requires the `getopt' API to be specified in `unistd.h'; + * thus, `unistd.h' includes this header. 
However, we do not want + * to expose the `getopt_long' or `getopt_long_only' APIs, when + * included in this manner. Thus, close the standard __GETOPT_H__ + * declarations block, and open an additional __GETOPT_LONG_H__ + * specific block, only when *not* __UNISTD_H_SOURCED__, in which + * to declare the extended API. + */ +#endif /* !defined(__GETOPT_H__) */ + +#if !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) +#define __GETOPT_LONG_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +struct option /* specification for a long form option... */ +{ + const char *name; /* option name, without leading hyphens */ + int has_arg; /* does it take an argument? */ + int *flag; /* where to save its status, or NULL */ + int val; /* its associated status value */ +}; + +enum /* permitted values for its `has_arg' field... */ +{ + no_argument = 0, /* option never takes an argument */ + required_argument, /* option always requires an argument */ + optional_argument /* option may take an argument */ +}; + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. 
+ */ +static int +parse_long_options(char * const *nargv, const char *options, + const struct option *long_options, int *idx, int short_too) +{ + char *current_argv, *has_equal; + size_t current_argv_len; + int i, ambiguous, match; + +#define IDENTICAL_INTERPRETATION(_x, _y) \ + (long_options[(_x)].has_arg == long_options[(_y)].has_arg && \ + long_options[(_x)].flag == long_options[(_y)].flag && \ + long_options[(_x)].val == long_options[(_y)].val) + + current_argv = place; + match = -1; + ambiguous = 0; + + optind++; + + if ((has_equal = strchr(current_argv, '=')) != NULL) { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, + current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) { + /* exact match */ + match = i; + ambiguous = 0; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. 
+ */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else if (!IDENTICAL_INTERPRETATION(i, match)) + ambiguous = 1; + } + if (ambiguous) { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int)current_argv_len, + current_argv); + optopt = 0; + return (BADCH); + } + if (match != -1) { /* option found */ + if (long_options[match].has_arg == no_argument + && has_equal) { + if (PRINT_ERROR) + warnx(noarg, (int)current_argv_len, + current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return (BADARG); + } + if (long_options[match].has_arg == required_argument || + long_options[match].has_arg == optional_argument) { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == + required_argument) { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) + && (optarg == NULL)) { + /* + * Missing argument; leading ':' indicates no error + * should be generated. + */ + if (PRINT_ERROR) + warnx(recargstring, + current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return (BADARG); + } + } else { /* unknown option */ + if (short_too) { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return (BADCH); + } + if (idx) + *idx = match; + if (long_options[match].flag) { + *long_options[match].flag = long_options[match].val; + return (0); + } else + return (long_options[match].val); +#undef IDENTICAL_INTERPRETATION +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. 
+ */ +static int +getopt_internal(int nargc, char * const *nargv, const char *options, + const struct option *long_options, int *idx, int flags) +{ + char *oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + + if (options == NULL) + return (-1); + + /* + * XXX Some GNU programs (like cvs) set optind to 0 instead of + * XXX using optreset. Work around this braindamage. + */ + if (optind == 0) + optind = optreset = 1; + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + * + * CV, 2009-12-14: Check POSIXLY_CORRECT anew if optind == 0 or + * optreset != 0 for GNU compatibility. + */ + if (posixly_correct == -1 || optreset != 0) + posixly_correct = (getenv("POSIXLY_CORRECT") != NULL); + if (*options == '-') + flags |= FLAG_ALLARGS; + else if (posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + if (*options == '+' || *options == '-') + options++; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } + else if (nonopt_start != -1) { + /* + * If we skipped non-options, set optind + * to the first of them. + */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + if (*(place = nargv[optind]) != '-' || + (place[1] == '\0' && strchr(options, '-') == NULL)) { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return (INORDER); + } + if (!(flags & FLAG_PERMUTE)) { + /* + * If no permutation wanted, stop parsing + * at first non-option. 
+ */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + nonopt_start = optind - + (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. + */ + if (nonopt_end != -1) { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && + (*place == '-' || (flags & FLAG_LONGONLY))) { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, + idx, short_too); + if (optchar != -1) { + place = EMSG; + return (optchar); + } + } + + if ((optchar = (int)*place++) == (int)':' || + (optchar == (int)'-' && *place != '\0') || + (oli = (char*)strchr(options, optchar)) == NULL) { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). 
+ */ + if (optchar == (int)'-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return (BADCH); + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') { + /* -W long-option */ + if (*place) /* no space */ + /* NOTHING */; + else if (++optind >= nargc) { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } else /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, + idx, 0); + place = EMSG; + return (optchar); + } + if (*++oli != ':') { /* doesn't take argument */ + if (!*place) + ++optind; + } else { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') { /* arg not optional */ + if (++optind >= nargc) { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return (BADARG); + } else + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return (optchar); +} + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int +getopt_long(int nargc, char * const *nargv, const char *options, + const struct option *long_options, int *idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. 
+ */ +int +getopt_long_only(int nargc, char * const *nargv, const char *options, + const struct option *long_options, int *idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE|FLAG_LONGONLY)); +} + +//extern int getopt_long(int nargc, char * const *nargv, const char *options, +// const struct option *long_options, int *idx); +//extern int getopt_long_only(int nargc, char * const *nargv, const char *options, +// const struct option *long_options, int *idx); +/* + * Previous MinGW implementation had... + */ +#ifndef HAVE_DECL_GETOPT +/* + * ...for the long form API only; keep this for compatibility. + */ +# define HAVE_DECL_GETOPT 1 +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* !defined(__UNISTD_H_SOURCED__) && !defined(__GETOPT_LONG_H__) */ +#endif diff --git a/src/ttables.h b/src/ttables.h index 7dc43b3..90f3f09 100644 --- a/src/ttables.h +++ b/src/ttables.h @@ -84,7 +84,11 @@ class TTable { void NormalizeVB(const double alpha) { ttable.swap(counts); #pragma omp parallel for schedule(dynamic) +#ifndef _MSC_VER for (unsigned i = 0; i < ttable.size(); ++i) { +#else + for (int i = 0; i < ttable.size(); ++i) { // MSVC only supports OpenMP 2.0, which doesn't allow unsigned types here +#endif double tot = 0; Word2Double& cpd = ttable[i]; for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) @@ -101,7 +105,11 @@ class TTable { void Normalize() { ttable.swap(counts); #pragma omp parallel for schedule(dynamic) +#ifndef _MSC_VER for (unsigned i = 0; i < ttable.size(); ++i) { +#else + for (int i = 0; i < ttable.size(); ++i) { +#endif double tot = 0; Word2Double& cpd = ttable[i]; for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) @@ -159,7 +167,11 @@ class TTable { private: void ClearCounts() { #pragma omp parallel for schedule(dynamic) +#ifndef _MSC_VER for (size_t i=0; i