From 26f4ae74bcb7ea1d6c6cd254fccbfaf6c0869db4 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Tue, 11 Nov 2025 00:41:45 +0000 Subject: [PATCH 1/5] License query improvement: search also using latest component version --- inc/debug.h | 1 + inc/scanoss.h | 2 +- src/license.c | 28 ++++++++++++++++++++++++---- src/scan.c | 4 ++++ src/scanlog.c | 18 +++++++++++++++++- 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/inc/debug.h b/inc/debug.h index 12d8a2e..ee2caa7 100644 --- a/inc/debug.h +++ b/inc/debug.h @@ -32,6 +32,7 @@ extern bool debug_on; //= false; //set debug mode from main. extern bool quiet; +bool scanlog_init(void); void scanlog(const char *fmt, ...); void map_dump(scan_data_t *scan); long microseconds_now(void); diff --git a/inc/scanoss.h b/inc/scanoss.h index 23bd905..855a99c 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -39,7 +39,7 @@ #define WFP_LN 4 #define WFP_REC_LN 18 -#define SCANOSS_VERSION "5.4.18" +#define SCANOSS_VERSION "5.4.19" /* Log files */ #define SCAN_LOG "/tmp/scanoss_scan.log" diff --git a/src/license.c b/src/license.c index f33a671..3035e12 100644 --- a/src/license.c +++ b/src/license.c @@ -51,7 +51,7 @@ 4 = Scancode detection 5 = Scancode detection at mining time 6 = osslot */ -const char *license_sources[] = {"component_declared", "file_spdx_tag", "file_header", "license_file", "scancode", "scancode-file", "osselot"}; +const char *license_sources[] = {"component_declared", "file_spdx_tag", "file_header", "license_file", "scancode-file", "scancode", "osselot"}; bool full_license_report = false; @@ -337,8 +337,6 @@ bool print_licenses_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * int src = atoi(source); - scanlog("Fetched license %s\n", license); - if (strlen(license) > 2 && (src < (sizeof(license_sources) / sizeof(license_sources[0])))) license_add_to_list(&licenses[src], license); @@ -401,8 +399,30 @@ void print_licenses(component_data_t *comp) scanlog("License for %s@%s license returns %d 
hits\n", comp->purls[i], comp->version, records); if (records) - break; + { + //Look if someone of the prefered liceses ids already has a match + for (int i = 0; i < 4; i++) + { + if (licenses_by_type[i].count > 0) + { + scanlog("Stop searching for licenses\n"); + break; + } + } + } + if (strcmp(comp->version, comp->latest_version) != 0) + { + /* Calculate purl@latest_version md5 */ + purl_version_md5(purlversion_md5, comp->purls[i], comp->latest_version); + + records = ldb_fetch_recordset(NULL, oss_license, purlversion_md5, false, print_licenses_item, &licenses_by_type); + scanlog("License for %s@%s license returns %d hits\n", comp->purls[i], comp->latest_version, records); + + if (records) + break; + } + /* Unversioned purl license */ records = ldb_fetch_recordset(NULL, oss_license, comp->purls_md5[i], false, print_licenses_item, &licenses_by_type); scanlog("License for %s license returns %d hits\n", comp->purls[i], records); diff --git a/src/scan.c b/src/scan.c index 0123ab0..84401fa 100644 --- a/src/scan.c +++ b/src/scan.c @@ -449,6 +449,10 @@ void ldb_scan(scan_data_t *scan) exit(EXIT_FAILURE); } + // Clean up the log file + if (debug_on) + scanlog_init(); + scan->matchmap_size = 0; scan->match_type = MATCH_NONE; scan->timer = microseconds_now(); diff --git a/src/scanlog.c b/src/scanlog.c index 01b73bc..adef2da 100644 --- a/src/scanlog.c +++ b/src/scanlog.c @@ -47,9 +47,25 @@ long microseconds_now() return (now.tv_sec*(int)1e6+now.tv_usec); } +/** + * @brief Initialize the log file as blank + * @return true if successful, false otherwise + */ +bool scanlog_init() +{ + FILE *log = fopen(SCAN_LOG, "w"); + if (!log) + { + fprintf(stderr, "Warning: Cannot create/initialize the log file\n"); + return false; + } + fclose(log); + return true; +} + /** * @brief Print the logs in stderr - * @param fmt string to be printed + * @param fmt string to be printed * @param ... //TODO */ void scanlog(const char *fmt, ...) 
From b59d3cf26ca0f64d763dae96624f1fc00d5049fc Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Fri, 14 Nov 2025 21:51:05 +0000 Subject: [PATCH 2/5] Change matched percent type. Change final selection on snippet matching logic --- inc/match.h | 2 +- src/component.c | 6 ++---- src/license.c | 6 +++--- src/match.c | 29 ++++++++++++++++------------- src/report.c | 2 +- src/snippet_selection.c | 2 +- src/util.c | 2 +- 7 files changed, 25 insertions(+), 24 deletions(-) diff --git a/inc/match.h b/inc/match.h index 31278f3..a807919 100644 --- a/inc/match.h +++ b/inc/match.h @@ -16,7 +16,7 @@ typedef struct match_data_t int lines_matched; /*number of matched lines*/ char * line_ranges; /*input snippet line ranges */ char * oss_ranges; /* kb snippet line ranges */ - char * matched_percent; /* matched percent */ + int matched_percent; /* matched percent */ int path_ln; /*file path lenght*/ //TODO check if this is needed. uint8_t file_md5[MD5_LEN]; /* file md5 */ char source_md5[MD5_LEN * 2 + 1]; /*matched file md5 in hex format */ diff --git a/src/component.c b/src/component.c index 40b4eeb..8906313 100644 --- a/src/component.c +++ b/src/component.c @@ -209,8 +209,9 @@ static char * look_for_version(char *in) void fill_component_path(component_data_t *component, char *file_path) { component->file = strdup(look_for_version(file_path)); - component->path_ln = strlen(file_path); + component->path_ln = strlen(component->file); flip_slashes(component->file); + component->path_depth = path_depth(component->file); } /** @@ -298,9 +299,6 @@ bool fill_component(component_data_t *component, uint8_t *url_key, char *file_pa } else component->rank = COMPONENT_DEFAULT_RANK; - - component->path_depth = path_depth(component->file); - return true; } diff --git a/src/license.c b/src/license.c index 3035e12..83433f2 100644 --- a/src/license.c +++ b/src/license.c @@ -220,7 +220,7 @@ static char *json_from_license(uint32_t *crclist, char *buffer, char *license, i string_clean(license); int len 
= 0; - if (strlen(license) < 2) + if (!*license || strlen(license) < 2) return buffer; /* Calculate CRC to avoid duplicates */ uint32_t CRC = string_crc32c(license); @@ -337,7 +337,7 @@ bool print_licenses_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * int src = atoi(source); - if (strlen(license) > 2 && (src < (sizeof(license_sources) / sizeof(license_sources[0])))) + if (src < (sizeof(license_sources) / sizeof(license_sources[0]))) license_add_to_list(&licenses[src], license); free(source); @@ -431,7 +431,7 @@ void print_licenses(component_data_t *comp) } /* Open licenses structure */ - char * result = calloc(MAX_FIELD_LN * 100, 1); + char * result = calloc(MAX_FIELD_LN * 1024, 1); char * buffer = result; int len = 0; diff --git a/src/match.c b/src/match.c index 1b1a41d..f50e962 100644 --- a/src/match.c +++ b/src/match.c @@ -68,7 +68,6 @@ void match_data_free(match_data_t *data) free_and_null((void **)&data->snippet_ids); free_and_null((void **)&data->line_ranges); free_and_null((void **)&data->oss_ranges); - free_and_null((void **)&data->matched_percent); free_and_null((void **)&data->crytography_text); free_and_null((void **)&data->quality_text); component_list_destroy(&data->component_list); @@ -91,7 +90,7 @@ match_data_t * match_data_copy(match_data_t * in) out->type = in->type; out->line_ranges = strdup(in->line_ranges); out->oss_ranges = strdup(in->oss_ranges); - out->matched_percent = strdup(in->matched_percent); + out->matched_percent = in->matched_percent; out->snippet_ids = strdup(in->snippet_ids); strcpy(out->source_md5, in->source_md5); return out; @@ -339,7 +338,7 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ if (b->rank < COMPONENT_DEFAULT_RANK || a->rank < COMPONENT_DEFAULT_RANK) { //shorter path lenght are prefered - if (b->rank < a->rank &&b->path_depth < a->path_depth/2) + if (b->rank < a->rank && b->path_depth < a->path_depth/2) return true; else if (b->rank > a->rank && a->path_depth < 
b->path_depth/2) return false; @@ -450,8 +449,8 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, if (!new_comp) return false; - fill_component_path(new_comp, path); /* Create a new component and fill it from the url record */ + fill_component_path(new_comp, path); new_comp->file_md5_ref = component_list->match_ref->file_md5; /* If the component is valid add it to the component list */ @@ -543,13 +542,13 @@ bool load_matches(match_data_t *match) { asprintf(&match->line_ranges, "n/a"); asprintf(&match->oss_ranges, "n/a"); - asprintf(&match->matched_percent, "%d functions matched", match->hits); + match->matched_percent = -1; } else if (match->type == MATCH_FILE) { asprintf(&match->line_ranges, "all"); asprintf(&match->oss_ranges, "all"); - asprintf(&match->matched_percent, "100%%"); + match->matched_percent = 100; } uint32_t records = 0; @@ -745,7 +744,7 @@ void match_select_best(scan_data_t *scan) } //If best match has 20% more of hits do nothing. - if (best_match->hits >= match->hits * 1.2) + if (best_match->hits >= match->hits * 1.2 && best_match_component->path_depth < match_component->path_depth) continue; //if cantidate has 10% more of hits do not consider dates and switch @@ -753,13 +752,18 @@ void match_select_best(scan_data_t *scan) { scanlog("Replacing best match due to big hits difference\n"); scan->matches_list_array[i]->best_match = item->match; + continue; } // if the hit numbers are close, select the oldest. 
- else if (abs(scan->matches_list_array[i]->best_match->hits - item->match->hits) <= 2 && - find_oldest_match(scan->matches_list_array[i]->best_match, item->match)) + int diff = abs(best_match->matched_percent - match->matched_percent); + + if (diff <= 10) { - scanlog("Replacing best match for an older version with equal hits\n"); - scan->matches_list_array[i]->best_match = item->match; + if (component_date_comparation(best_match_component, match_component)) + { + scanlog("Replacing best match for an older version with similar hits\n"); + scan->matches_list_array[i]->best_match = item->match; + } } } } @@ -792,10 +796,9 @@ void match_select_best(scan_data_t *scan) { free(scan->matches_list_array[i]->best_match->line_ranges); free(scan->matches_list_array[i]->best_match->oss_ranges); - free(scan->matches_list_array[i]->best_match->matched_percent); scan->matches_list_array[i]->best_match->line_ranges = r.local; scan->matches_list_array[i]->best_match->oss_ranges = r.remote; - scan->matches_list_array[i]->best_match->matched_percent = r.matched; + scan->matches_list_array[i]->best_match->matched_percent = atoi(r.matched); max_hits = scan->matches_list_array[i]->best_match->hits; index = i; diff --git a/src/report.c b/src/report.c index 5a9e3b8..190f8d5 100644 --- a/src/report.c +++ b/src/report.c @@ -321,7 +321,7 @@ bool print_json_match(struct match_data_t * match) printf("\"id\": \"%s\"", matchtypes[match->type]); printf(",\"lines\": \"%s\"", match->line_ranges); printf(",\"oss_lines\": \"%s\"", match->oss_ranges); - printf(",\"matched\": \"%s\"", match->matched_percent); + printf(",\"matched\": \"%d%%\"", match->matched_percent); if ((engine_flags & ENABLE_SNIPPET_IDS) && match->type == MATCH_SNIPPET) { diff --git a/src/snippet_selection.c b/src/snippet_selection.c index 388705a..adeb307 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -151,7 +151,7 @@ void biggest_snippet(scan_data_t *scan) matched_percent = 99; if (matched_percent < 1) 
matched_percent = 1; - asprintf(&match_new->matched_percent, "%u%%", matched_percent); + match_new->matched_percent = matched_percent; match_new->lines_matched = matched_lines; //match_new->hits = hits; diff --git a/src/util.c b/src/util.c index 820f216..2c6d509 100644 --- a/src/util.c +++ b/src/util.c @@ -380,7 +380,7 @@ int path_is_third_party(const char* path) "contrib", // Contributed/third-party code "plugin", // Plugins (often third-party) - "lib", "components", "modules", "ext", + "utils","lib", "components", "modules", "ext", "test", "fixtures", "examples", "files", "assets", "runtime", "subprojects", "managed", "local_packages", "published", From a9b110dd25eaa13f4571a8ab00767beeb69bb1e9 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Fri, 14 Nov 2025 22:25:30 +0000 Subject: [PATCH 3/5] fix multiple licenses parsing bug --- src/license.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/license.c b/src/license.c index 83433f2..2651123 100644 --- a/src/license.c +++ b/src/license.c @@ -272,7 +272,7 @@ static char *split_in_json_array(uint32_t *crclist, char *buffer, char *license, } while (next_lic); - return buffer; + return r; // Return the updated buffer pointer, not the original } char * license_to_json(uint32_t *crclist, char *buffer, char *license, int src, bool *first_record) From 87f4b6a181a4115f5d4c4c512edb47a539d30841 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Wed, 19 Nov 2025 11:24:43 +0000 Subject: [PATCH 4/5] third party filter update. Best component selection logic tune up for ranked urls. 
Performance optimization --- inc/component.h | 1 + inc/scanoss.h | 2 +- inc/util.h | 2 +- src/license.c | 1 + src/limits.c | 11 +----- src/match.c | 90 +++++++++++++++++++++++++++++++++++++++---------- src/util.c | 16 ++++++--- 7 files changed, 89 insertions(+), 34 deletions(-) diff --git a/inc/component.h b/inc/component.h index 3a3c122..5c73f46 100644 --- a/inc/component.h +++ b/inc/component.h @@ -59,6 +59,7 @@ typedef struct component_data_t int health_stats[3]; /* health stats: forks, watchers, contributors */ int rank; /* purl ranking - optional*/ int path_depth; /* depth of the matched file path*/ + int third_party_rank; /* Saves third party ranking*/ } component_data_t; component_data_t * component_init(void); diff --git a/inc/scanoss.h b/inc/scanoss.h index 855a99c..e32c313 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -31,7 +31,7 @@ #include "limits.h" #define MAX_FILE_PATH 1024 -#define FETCH_MAX_FILES 20000 +#define FETCH_MAX_FILES 12000 #define MIN_FILE_SIZE 256 // files below this size will be ignored #define CRC_LIST_LEN 1024 // list of crc checksums to avoid metadata duplicates #define SNIPPET_LINE_TOLERANCE 10 diff --git a/inc/util.h b/inc/util.h index 6445c0e..c0f89a2 100644 --- a/inc/util.h +++ b/inc/util.h @@ -63,7 +63,7 @@ char * str_cat_realloc(char **a, char * b); void free_and_null(void ** pr); -int path_is_third_party(const char* path); +int path_is_third_party(component_data_t *comp); /* Counts the number of '/' characters in a path string */ int path_depth(char* path); diff --git a/src/license.c b/src/license.c index 2651123..13a8592 100644 --- a/src/license.c +++ b/src/license.c @@ -336,6 +336,7 @@ bool print_licenses_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * free(CSV); int src = atoi(source); + scanlog("Fetched License %s - source ID %d\n", license, src); if (src < (sizeof(license_sources) / sizeof(license_sources[0]))) license_add_to_list(&licenses[src], license); diff --git a/src/limits.c b/src/limits.c index 
3e27b69..8471794 100644 --- a/src/limits.c +++ b/src/limits.c @@ -9,17 +9,8 @@ * @see https://github.com/scanoss/engine/blob/master/src/limits.c */ - -int consecutive_score = 4000; /** Maximumm sUsed for snippet selection */ - -/* During snippet scanning, when a wfp (with more than consecutive_threshold wfps) produces a score higher - than consecutive_score by consecutive_hits in a row, the scan will skip consecutive_jump snippets */ -int consecutive_hits = 4; -int consecutive_jump = 5; -int consecutive_threshold = 50; - int range_tolerance = 5; /** A maximum number of non-matched lines tolerated inside a matching range */ int min_match_lines = 10; /** Minimum number of lines matched for a match range to be acepted */ int min_match_hits = 4; /** Minimum number of snippet ID hits to produce a snippet match*/ -const int max_vulnerabilities = 50; /** Show only the first N vulnerabilities */ +const int max_vulnerabilities = 50; /** Show only the first N vulnerabilities */ \ No newline at end of file diff --git a/src/match.c b/src/match.c index f50e962..d0d1f07 100644 --- a/src/match.c +++ b/src/match.c @@ -312,37 +312,41 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ } } else if (a->path_rank < PATH_LEVEL_COMP_REF / 3 + 1) + { + scanlog("%s rejected, %s wins by path rank %d\n", b->purls[0], a->purls[0], a->path_rank); return false; + } } if (!*b->release_date) + { + scanlog("%s rejected due to empty release date\n", b->purls[0]); return false; + } if (!*a->release_date) + { + scanlog("%s accepted, %s has empty release date\n", b->purls[0], a->purls[0]); return true; + } // Third-party path evaluation - int tp_a = path_is_third_party(a->file); - int tp_b = path_is_third_party(b->file); + int tp_a = path_is_third_party(a); + int tp_b = path_is_third_party(b); - if (tp_a > tp_b) + if (tp_a - tp_b > 4) { - scanlog("Component rejected by third party path filter (%s=%d > %s=%d)\n", a->purls[0], tp_a, b->purls[0], tp_b); + 
scanlog("Component rejected by third party path filter (%s=%d=%s > %s=%d=%s)\n", a->purls[0], tp_a,a->file, b->purls[0], tp_b, b->file); return false; } - else if (tp_a < tp_b) + else if (tp_b - tp_a > 4) { scanlog("Component accepted by third party path filter (%s=%d < %s=%d)\n", a->purls[0], tp_a, b->purls[0], tp_b); return true; } + //when the url ranking is enabled if (b->rank < COMPONENT_DEFAULT_RANK || a->rank < COMPONENT_DEFAULT_RANK) - { - //shorter path lenght are prefered - if (b->rank < a->rank && b->path_depth < a->path_depth/2) - return true; - else if (b->rank > a->rank && a->path_depth < b->path_depth/2) - return false; - + { bool good_purl_a = binary_file_to_purl(a); bool good_purl_b = binary_file_to_purl(b); if (good_purl_b && !good_purl_a) @@ -355,11 +359,35 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ scanlog("Component %s rejected by binary purl match\n", b->purls[0]); return false; } + + if (b->rank >= COMPONENT_RANK_SELECTION_MAX && a->rank < COMPONENT_RANK_SELECTION_MAX) + { + scanlog("%s rejected by rank threshold %d >= %d\n", b->purls[0], b->rank, COMPONENT_RANK_SELECTION_MAX); + return false; + } //lower rank selection logic - if (b->rank < COMPONENT_RANK_SELECTION_MAX && b->path_depth <= a->path_depth) + if (b->rank <= COMPONENT_RANK_SELECTION_MAX) { scanlog("path lenght: %s - %d vs %s - %d\n", b->file, b->path_depth, a->file, a->path_depth); + //shorter path lenght are prefered + if (b->path_depth < a->path_depth/2) + { + scanlog("%s accepted by shorter path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); + return true; + } + else if (a->path_depth < b->path_depth/2) + { + scanlog("%s rejected by longer path depth %d vs %d\n", b->purls[0], b->path_depth, a->path_depth); + return false; + } + + if(b->path_depth > a->path_depth+1) + { + scanlog("%s rejected by deeper path in rank selection %d > %d\n", b->purls[0], b->path_depth, a->path_depth); + return false; + } + if (b->rank < 
a->rank) { scanlog("%s wins %s by rank %d/%d\n", b->purls[0], a->purls[0], b->rank, a->rank); @@ -377,9 +405,14 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ { if (purl_source_check(a) > purl_source_check(b)) { - scanlog("Component prefered by source\n"); + scanlog("%s accepted over %s by source check\n", b->purls[0], a->purls[0]); return true; } + else if (purl_source_check(b) > purl_source_check(a)) + { + scanlog("%s rejected by source check\n", b->purls[0]); + return false; + } //Look for available health information print_health(a); @@ -404,6 +437,11 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ scanlog("Component %s prefered over %s by vendor+component=purl\n", b->purls[0], a->purls[0]); return true; } + else if (purl_vendor_component_check(a) && !purl_vendor_component_check(b)) + { + scanlog("Component %s rejected, %s wins by vendor+component=purl\n", b->purls[0], a->purls[0]); + return false; + } if (!a->purls_md5[0] && a->purls[0]) { @@ -421,23 +459,39 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ if ((!a->age && b->age) || b->age > a->age) { - scanlog("Component %s prefered over %s by purl date\n", b->purls[0], a->purls[0]); + scanlog("Component %s prefered over %s by purl date (age: %ld vs %ld)\n", b->purls[0], a->purls[0], b->age, a->age); return true; } + else if ((!b->age && a->age) || a->age > b->age) + { + scanlog("Component %s rejected by purl date (age: %ld vs %ld)\n", b->purls[0], b->age, a->age); + return false; + } if (b->age == a->age && !strcmp(a->component, b->component) && strcmp(a->version, b->version) > 0) { scanlog("Component %s prefered over %s by version\n", b->purls[0], a->purls[0]); return true; } + else if (b->age == a->age && !strcmp(a->component, b->component) && strcmp(b->version, a->version) > 0) + { + scanlog("Component %s rejected by version comparison\n", b->purls[0]); + return false; + } } /*select the oldest 
release date */ if (strcmp(b->release_date, a->release_date) < 0) { - scanlog("Component %s prefered over %s by release date\n", b->purls[0], a->purls[0]); + scanlog("Component %s (rank %d) prefered over %s (rank %d) by release date\n", b->purls[0],b->rank, a->purls[0], a->rank); return true; } + else if (strcmp(b->release_date, a->release_date) > 0) + { + scanlog("Component %s (rank %d) rejected, %s (rank %d) wins by older release date\n", b->purls[0], b->rank, a->purls[0], a->rank); + return false; + } + scanlog("Component %s rejected, no criteria matched\n", b->purls[0]); return false; } @@ -718,7 +772,7 @@ void match_select_best(scan_data_t *scan) if (match_component == best_match_component) continue; - if (path_is_third_party(match_component->file) < path_is_third_party(best_match_component->file) || !strcmp(match_component->release_date, "9999-99-99")) + if (path_is_third_party(match_component) < path_is_third_party(best_match_component) || !strcmp(match_component->release_date, "9999-99-99")) continue; scanlog("%s - %s - %d - %d VS %s - %s - %d - %d\n", @@ -736,7 +790,7 @@ void match_select_best(scan_data_t *scan) //If the best match is not good or is not identified be prefer the candidate. 
if ((!best_match_component->identified && match_component->identified) || - (path_is_third_party(best_match_component->file) < path_is_third_party(match_component->file))) + (path_is_third_party(best_match_component) < path_is_third_party(match_component))) { scanlog("Replacing best match for a prefered component\n"); scan->matches_list_array[i]->best_match = item->match; diff --git a/src/util.c b/src/util.c index 2c6d509..8033fe5 100644 --- a/src/util.c +++ b/src/util.c @@ -332,9 +332,17 @@ void free_and_null(void ** pr) } } -int path_is_third_party(const char* path) +int path_is_third_party(component_data_t *comp) { - const char* patterns[] = { + if (comp->third_party_rank > 0) + return comp->third_party_rank; + + if (!comp->file) + return 0; + + char * path = comp->file; + + const char* patterns[] = { // Explicit third-party naming "third_party", // Covers third_party, ThirdParty, third-party via strcasestr "thirdparty", // Alternative spelling @@ -381,10 +389,10 @@ int path_is_third_party(const char* path) "plugin", // Plugins (often third-party) "utils","lib", "components", "modules", "ext", - "test", "fixtures", "examples", + "fixtures", "examples", "files", "assets", "runtime", "subprojects", "managed", "local_packages", "published", - "driver", "libresources", "offloading" + "driver", "libresources", "offloading","documentation", "test" }; const int numPatterns = sizeof(patterns) / sizeof(patterns[0]); From dcf5c9e93eb31aad8eb5bcc21131510721b800e8 Mon Sep 17 00:00:00 2001 From: mariano scasso Date: Wed, 19 Nov 2025 17:04:39 +0000 Subject: [PATCH 5/5] remove snippets below the min_match_lines limit --- src/match.c | 2 +- src/snippet_selection.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/match.c b/src/match.c index d0d1f07..6b0770f 100644 --- a/src/match.c +++ b/src/match.c @@ -798,7 +798,7 @@ void match_select_best(scan_data_t *scan) } //If best match has 20% more of hits do nothing. 
- if (best_match->hits >= match->hits * 1.2 && best_match_component->path_depth < match_component->path_depth) + if (best_match->hits >= match->hits * 1.2 && best_match_component->path_depth <= match_component->path_depth) continue; //if cantidate has 10% more of hits do not consider dates and switch diff --git a/src/snippet_selection.c b/src/snippet_selection.c index adeb307..0d29172 100644 --- a/src/snippet_selection.c +++ b/src/snippet_selection.c @@ -262,6 +262,9 @@ int ranges_assemble(matchmap_range *ranges, char *line_ranges, char *oss_ranges) strcat(line_ranges, ","); if (*oss_ranges) strcat(oss_ranges, ","); + //discard snippets below the limit of detection + if (to - from < min_match_lines) + continue; /* Add from-to values */ sprintf(line_ranges + strlen(line_ranges), "%d-%d", from, to);