diff --git a/dprog.c b/dprog.c index e30598a..a718e8c 100644 --- a/dprog.c +++ b/dprog.c @@ -28,7 +28,7 @@ coding, RBS scores, etc. *******************************************************************************/ -int dprog(struct _node *nod, int nn, struct _training *tinf, int flag) { +int dprog(struct _node *nod, int nn, struct _training *tinf, int flag, int max_node_dist, int rapid) { int i, j, min, max_ndx = -1, path, nxt, tmp; double max_sc = -1.0; @@ -42,15 +42,22 @@ int dprog(struct _node *nod, int nn, struct _training *tinf, int flag) { /* Set up distance constraints for making connections, */ /* but make exceptions for giant ORFS. */ - if(i < MAX_NODE_DIST) min = 0; else min = i-MAX_NODE_DIST; + if(i < max_node_dist) min = 0; else min = i-max_node_dist; if(nod[i].strand == -1 && nod[i].type != STOP && nod[min].ndx >= nod[i].stop_val) while(min >= 0 && nod[i].ndx != nod[i].stop_val) min--; if(nod[i].strand == 1 && nod[i].type == STOP && nod[min].ndx >= nod[i].stop_val) while(min >= 0 && nod[i].ndx != nod[i].stop_val) min--; - if(min < MAX_NODE_DIST) min = 0; - else min = min-MAX_NODE_DIST; + + /* Rapid mode is 50% faster producing the same result, */ + /* when tested with E. coli genome (GCF_000008865.2). 
*/ + if(rapid){ + if(min < 0) min = 0; + } else { + if(min < max_node_dist) min = 0; else min = min-max_node_dist; + } + for(j = min; j < i; j++) { score_connection(nod, j, i, tinf, flag); } diff --git a/dprog.h b/dprog.h index d729f4c..6235108 100644 --- a/dprog.h +++ b/dprog.h @@ -28,9 +28,9 @@ #define MAX_SAM_OVLP 60 #define MAX_OPP_OVLP 200 -#define MAX_NODE_DIST 500 +// #define MAX_NODE_DIST 500 -int dprog(struct _node *, int, struct _training *, int); +int dprog(struct _node *, int, struct _training *, int, int, int); void score_connection(struct _node *, int, int, struct _training *, int); void eliminate_bad_genes(struct _node *, int, struct _training *); diff --git a/main.c b/main.c index 0834a07..95d6d2f 100644 --- a/main.c +++ b/main.c @@ -43,7 +43,7 @@ int copy_standard_input_to_file(char *, int); int main(int argc, char *argv[]) { int rv, slen, nn, ng, i, ipath, *gc_frame, do_training, output, max_phase; - int closed, do_mask, nmask, force_nonsd, user_tt, is_meta, num_seq, quiet; + int closed, do_mask, nmask, force_nonsd, user_tt, is_meta, num_seq, quiet, max_node_dist, rapid; int piped, max_slen, fnum; double max_score, gc, low, high; unsigned char *seq, *rseq, *useq; @@ -87,7 +87,7 @@ int main(int argc, char *argv[]) { memset(meta[i].tinf, 0, sizeof(struct _training)); } nn = 0; slen = 0; ipath = 0; ng = 0; nmask = 0; - user_tt = 0; is_meta = 0; num_seq = 0; quiet = 0; + user_tt = 0; is_meta = 0; num_seq = 0; quiet = 0; max_node_dist = 500; rapid = 0; max_phase = 0; max_score = -100.0; train_file = NULL; do_training = 0; start_file = NULL; trans_file = NULL; nuc_file = NULL; @@ -113,15 +113,17 @@ int main(int argc, char *argv[]) { /* Parse the command line arguments */ for(i = 1; i < argc; i++) { - if(i == argc-1 && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "-T") == 0 - || strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "-A") == 0 || + if(i == argc-1 && + (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "-T") == 0 || + strcmp(argv[i], "-a") == 0 || 
strcmp(argv[i], "-A") == 0 || strcmp(argv[i], "-g") == 0 || strcmp(argv[i], "-g") == 0 || strcmp(argv[i], "-f") == 0 || strcmp(argv[i], "-F") == 0 || strcmp(argv[i], "-s") == 0 || strcmp(argv[i], "-S") == 0 || strcmp(argv[i], "-i") == 0 || strcmp(argv[i], "-I") == 0 || strcmp(argv[i], "-o") == 0 || strcmp(argv[i], "-O") == 0 || - strcmp(argv[i], "-p") == 0 || strcmp(argv[i], "-P") == 0)) - usage("-a/-f/-g/-i/-o/-p/-s options require parameters."); + strcmp(argv[i], "-p") == 0 || strcmp(argv[i], "-P") == 0 || + strcmp(argv[i], "-x") == 0 || strcmp(argv[i], "-X") == 0 )) + usage("-a/-f/-g/-i/-o/-p/-s/-x options require parameters."); else if(strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "-C") == 0) closed = 1; else if(strcmp(argv[i], "-q") == 0 || strcmp(argv[i], "-Q") == 0) @@ -189,6 +191,12 @@ int main(int argc, char *argv[]) { else usage("Invalid output format specified."); i++; } + else if(strcmp(argv[i], "-x") == 0 || strcmp(argv[i], "-X") == 0) { + max_node_dist = atoi(argv[i+1]); + i++; + } + else if(strcmp(argv[i], "-r") == 0 || strcmp(argv[i], "-R") == 0) + rapid = 1; else usage("Unknown option."); } @@ -382,7 +390,7 @@ int main(int argc, char *argv[]) { fprintf(stderr, "Building initial set of genes to train from..."); } record_overlapping_starts(nodes, nn, &tinf, 0); - ipath = dprog(nodes, nn, &tinf, 0); + ipath = dprog(nodes, nn, &tinf, 0, max_node_dist, rapid); if(quiet == 0) { fprintf(stderr, "done!\n"); } @@ -514,7 +522,7 @@ int main(int argc, char *argv[]) { write_start_file(start_ptr, nodes, nn, &tinf, num_seq, slen, 0, NULL, VERSION, cur_header); record_overlapping_starts(nodes, nn, &tinf, 1); - ipath = dprog(nodes, nn, &tinf, 1); + ipath = dprog(nodes, nn, &tinf, 1, max_node_dist, rapid); eliminate_bad_genes(nodes, ipath, &tinf); ng = add_genes(genes, nodes, ipath); tweak_final_starts(genes, ng, nodes, nn, &tinf); @@ -555,7 +563,7 @@ int main(int argc, char *argv[]) { reset_node_scores(nodes, nn); score_nodes(seq, rseq, slen, nodes, nn, 
meta[i].tinf, closed, is_meta); record_overlapping_starts(nodes, nn, meta[i].tinf, 1); - ipath = dprog(nodes, nn, meta[i].tinf, 1); + ipath = dprog(nodes, nn, meta[i].tinf, 1, max_node_dist, rapid); if(nodes[ipath].score > max_score) { max_phase = i; max_score = nodes[ipath].score; @@ -645,7 +653,7 @@ void usage(char *msg) { fprintf(stderr, " [-g tr_table] [-h] [-i input_file] [-m]"); fprintf(stderr, " [-n] [-o output_file]\n"); fprintf(stderr, " [-p mode] [-q] [-s start_file]"); - fprintf(stderr, " [-t training_file] [-v]\n"); + fprintf(stderr, " [-t training_file] [-x max_node_dist] [-r] [-v]\n"); fprintf(stderr, "\nDo 'prodigal -h' for more information.\n\n"); exit(15); } @@ -684,6 +692,10 @@ void help() { fprintf(stderr, " -t: Write a training file (if none exists); "); fprintf(stderr, "otherwise, read and use\n"); fprintf(stderr, " the specified training file.\n"); + fprintf(stderr, " -x: Specify the number of neighbor nodes to score connection."); + fprintf(stderr, " Default is 500.\n"); + fprintf(stderr, " -r: Rapid mode. When tested with the E. coli genome, "); + fprintf(stderr, "the same result was obtained in 2/3 of the original time.\n"); fprintf(stderr, " -v: Print version number and exit.\n\n"); exit(0); }