From 18bb9fa2db3caf5b13eeaa9a4984a690bd7f23c3 Mon Sep 17 00:00:00 2001 From: Mariano Scasso <75589700+mscasso-scanoss@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:13:38 -0300 Subject: [PATCH 01/19] fix bug with scanning test. Add -T scanning parameter. Change default snippets scanning tolerace (#83) Co-authored-by: core software devel --- inc/match_list.h | 3 ++- src/debug.c | 2 +- src/help.c | 58 ++++++++++++++++++++++++------------------------ src/main.c | 5 ++++- src/match_list.c | 12 +++++++++- src/util.c | 3 ++- 6 files changed, 49 insertions(+), 34 deletions(-) diff --git a/inc/match_list.h b/inc/match_list.h index 91b3b32..d9915e8 100644 --- a/inc/match_list.h +++ b/inc/match_list.h @@ -78,7 +78,7 @@ #define SCAN_MAX_SNIPPETS_DEFAULT 1 #define SCAN_MAX_COMPONENTS_DEFAULT 3 -#define MATCH_LIST_TOLERANCE 98.5 +#define MATCH_LIST_TOLERANCE 97.5 typedef struct match_data_t match_data_t; /* Forward declaration */ /** @@ -145,5 +145,6 @@ bool component_list_add(component_list_t * list, component_data_t * new_comp, bo void component_list_print(component_list_t * list, bool (*printer) (component_data_t * fpa), char * separator); void component_list_destroy(component_list_t *list); bool component_list_add_binary(component_list_t *list, component_data_t *new_comp, bool (*val)(component_data_t *a, component_data_t *b), bool remove_a); +void match_list_tolerance_set(float in); #endif diff --git a/src/debug.c b/src/debug.c index c6ffea5..0b1e9f8 100644 --- a/src/debug.c +++ b/src/debug.c @@ -210,7 +210,7 @@ void scan_benchmark() scan->hashes[i] = rand() % 256 + (rand() % 256) * 256 + (rand() % 256) * 256 * 256 + (rand() % 256) * 256 * 256 * 256; } scan->hash_count = total_hashes; - + scan->total_lines = 10; ldb_scan_snippets(scan); scan_data_free(scan); } diff --git a/src/help.c b/src/help.c index 8916f1a..8b15dcc 100644 --- a/src/help.c +++ b/src/help.c @@ -38,41 +38,42 @@ */ void help () { - printf ("ScanOSS Engine v%s\n", SCANOSS_VERSION); - printf ("\n\ -This 
program performs an OSS inventory for the given TARGET comparing against the ScanOSS Knowledgebase.\n\ -Results are printed in STDOUT in JSON format\n\ + printf ("ScanOSS Engine v%s\n", SCANOSS_VERSION); + printf ("\n\ +This program performs an OSS inventory scan of the specified TARGET by comparing it against the ScanOSS Knowledgebase.\n\ +Results are displayed in JSON format through STDOUT.\n\ \n\ Syntax: scanoss [parameters] [TARGET]\n\ \n\ Configuration:\n\ --w Treats TARGET as a .wfp file regardless of the actual file extension.\n\ --H High Precision Snippet Match mode, 'libhpsm.so' must be present in the system.\n\ --e Expect matching extensions to equal the file extension being scanned (default: off).\n\ --M NUMBER Looks for NUMBER of different components in a file (MAX 9).\n\ --s SBOM Use assets specified in JSON SBOM (CycloneDX/SPDX2.2 JSON format) as input to identification.\n\ --b SBOM Ignore matches to assets specified in JSON SBOM (CycloneDX/SPDX2.2 JSON format).\n\ --B SBOM Same than \"-b\" but forcing snippet scan.\n\ --a SBOM Displays attribution notices for provided SBOM.json.\n\ --c HINT Provide a component HINT to influence scan results.\n\ --k KEY Displays contents of file KEY from MZ sources archive.\n\ --l LICENSE Displays OSADL metadata for the provided SPDX license ID.\n\ +-w Process TARGET as a .wfp file, regardless of its actual extension.\n\ +-H Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system).\n\ +-e Match only files with identical extensions as the scanned file (default: off).\n\ +-M NUMBER Search for up to NUMBER different components in each file (maximum: 9).\n\ +-T NUMBER Set snippet scanning tolerance percentage (default: 3.5).\n\ +-s SBOM Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ +-b SBOM Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ +-B SBOM Same as \"-b\" but with forced snippet scanning.\n\ +-a SBOM Show attribution 
notices for the provided SBOM.json file.\n\ +-c HINT Add a component HINT to guide scan results.\n\ +-k KEY Show contents of the specified KEY file from MZ sources archive.\n\ +-l LICENSE Display OSADL metadata for the given SPDX license ID.\n\ \n\ Options:\n\ --t Tests engine performance.\n\ --v Display version and exit.\n\ --n Specify DB name (default: oss).\n\ --h Display this help and exit.\n\ --d Save debugging information to disk (/tmp).\n\ --q Produces no JSON output. Only debugging info via STDERR.\n\ +-t Run engine performance tests.\n\ +-v Show version information and exit.\n\ +-n Set database name (default: oss).\n\ +-h Display this help information and exit.\n\ +-d Store debugging information to disk (/tmp).\n\ +-q Suppress JSON output (show only debugging info via STDERR).\n\ \n\ -Enviroment variables:\n\ -SCANOSS_MATCHMAP_MAX: define the snippet scanning match map size, %d by default.\n\ -SCANOSS_API_URL: defines the API url, %s by default.\n\ +Environment variables:\n\ +SCANOSS_MATCHMAP_MAX: Set the snippet scanning match map size (default: %d).\n\ +SCANOSS_API_URL: Define the API endpoint URL (default: %s).\n\ \n\ Engine scanning flags:\n\ -The scanning engine can be configured by passing configuration flags with the -F parameter.\n\ -Alternatively, these value can be written in %s\n\ +Configure the scanning engine using flags with the -F parameter.\n\ +These settings can also be specified in %s\n\ +-------+-------------------------------------------------------+\n\ | Flag | Setting |\n\ +-------+-------------------------------------------------------+\n\ @@ -83,7 +84,7 @@ Alternatively, these value can be written in %s\n\ | 16 | Disable copyrights (default: enabled) |\n\ | 32 | Disable vulnerabilities (default: enabled) |\n\ | 64 | Disable quality (default: enabled) |\n\ -| 128 | Disable cryptography (defalt: enabled) |\n\ +| 128 | Disable cryptography (default: enabled) |\n\ | 256 | Disable best match only (default: enabled) |\n\ | 512 | Hide 
identified files (default: disabled) |\n\ | 1024 | Enable download_url (default: disabled) |\n\ @@ -92,8 +93,7 @@ Alternatively, these value can be written in %s\n\ | 8192 | Disable health layer (default: enabled) |\n\ | 16384 | Enable high accuracy, slower scan (default: disabled) |\n\ +-------+-------------------------------------------------------+\n\ -Example: scanoss -F 12 DIRECTORY (scans DIRECTORY disabling license and dependency data)\n\ +Example: scanoss -F 12 DIRECTORY (scan DIRECTORY without license and dependency data)\n\ \n\ Copyright (C) 2018-2022 SCANOSS.COM\n", DEFAULT_MATCHMAP_FILES, API_URL, ENGINE_FLAGS_FILE); - } diff --git a/src/main.c b/src/main.c index 9a488aa..337a466 100644 --- a/src/main.c +++ b/src/main.c @@ -291,7 +291,7 @@ int main(int argc, char **argv) int option; bool invalid_argument = false; char * ldb_db_name = NULL; - while ((option = getopt(argc, argv, ":f:s:b:B:c:k:a:F:l:n:M:N:wtvhedqH")) != -1) + while ((option = getopt(argc, argv, ":T:s:b:B:c:k:a:F:l:n:M:N:wtvhedqH")) != -1) { /* Check valid alpha is entered */ if (optarg) @@ -355,6 +355,9 @@ int main(int argc, char **argv) case 'N': scan_max_components = atol(optarg); break; + case 'T': + match_list_tolerance_set(atof(optarg)); + break; case 'w': force_wfp = true; break; diff --git a/src/match_list.c b/src/match_list.c index ff51e68..276fa7d 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -9,6 +9,7 @@ #include "component.h" int list_size = 0; +static float match_list_tolerance = MATCH_LIST_TOLERANCE; void component_list_destroy(component_list_t *list) { @@ -210,10 +211,19 @@ bool component_list_add_binary(component_list_t *list, component_data_t *new_com return false; } +void match_list_tolerance_set(float in) +{ + if (in > 99) + in = 99; + + match_list_tolerance = 100.0-in; + scanlog("setting match list tolerance to %.1f\n", match_list_tolerance); +} + bool tolerance_eval(int a, int b) { int relative_error = (abs(a - b) * 100) / ((a + b) / 2); - if (100 - 
relative_error >= MATCH_LIST_TOLERANCE) + if (100 - relative_error >= match_list_tolerance) return true; else return false; diff --git a/src/util.c b/src/util.c index 4a7f1b4..cfbcb99 100644 --- a/src/util.c +++ b/src/util.c @@ -372,7 +372,8 @@ bool path_is_third_party(const char* path) "local_packages", "managed", "3rd", - "thirdparty" + "thirdparty", + "LibResources" }; // Número de patrones a verificar From ca293fb43a65bc291883dea4b023561158f5ea75 Mon Sep 17 00:00:00 2001 From: Jeronimo Ortiz <166400360+ortizjeronimo@users.noreply.github.com> Date: Mon, 20 Jan 2025 09:40:56 -0300 Subject: [PATCH 02/19] updated documentation theme --- docs/requirements-docs.txt | 2 +- docs/source/conf.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index a95ae18..483a4e9 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1 +1 @@ -furo +sphinx_rtd_theme diff --git a/docs/source/conf.py b/docs/source/conf.py index 3b9402e..9fc9dc7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,7 +13,7 @@ # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration -extensions = [] +extensions = ['sphinx_rtd_theme'] templates_path = ['_templates'] exclude_patterns = [] @@ -23,6 +23,6 @@ # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'furo' +html_theme = 'sphinx_rtd_theme' html_logo = 'scanosslogo.jpg' html_static_path = ['_static'] From e41cf0b5ab81bc7f961aadaffeaf4a3669868729 Mon Sep 17 00:00:00 2001 From: Mariano Scasso <75589700+mscasso-scanoss@users.noreply.github.com> Date: Mon, 10 Mar 2025 19:18:17 -0300 Subject: [PATCH 03/19] 5.4.10 (#85) * fix memory bug processing versions * change sources env-variables * tune up scanning limits * 
fix ranges assembling bug. Improve snippets and component selection * change on report, file content url will be returned empty if the env-var wasn't defined --- inc/limits.h | 4 +- inc/match.h | 1 + inc/match_list.h | 3 +- inc/scan.h | 2 +- inc/scanoss.h | 3 +- src/help.c | 6 +-- src/match.c | 40 ++++------------ src/match_list.c | 16 ++++++- src/report.c | 10 ++-- src/scan.c | 2 +- src/snippets.c | 120 +++++++++++++++++++++++++---------------------- src/util.c | 4 +- src/versions.c | 4 +- 13 files changed, 106 insertions(+), 109 deletions(-) diff --git a/inc/limits.h b/inc/limits.h index b91dedd..69122a4 100644 --- a/inc/limits.h +++ b/inc/limits.h @@ -37,9 +37,9 @@ /* Snippets */ #define DEFAULT_MATCHMAP_FILES 10000 // Default number of files evaluated in snippet matching -#define MAX_MATCHMAP_FILES (DEFAULT_MATCHMAP_FILES * 5) // Max number of files evaluated in snippet matching to prevent performance issues +#define MAX_MATCHMAP_FILES (DEFAULT_MATCHMAP_FILES * 10) // Max number of files evaluated in snippet matching to prevent performance issues +#define MIN_LINES_COVERAGE 0.8 #define SKIP_SNIPPETS_IF_FILE_BIGGER (1024 * 1024 * 4) -#define SKIP_SNIPPETS_IF_STARTS_WITH (const char*[3]) {"{", "release_date) return true; - if (!path_is_third_party(a->file) && path_is_third_party(b->file) && !(engine_flags & ENABLE_PATH_HINT)) + if (!path_is_third_party(a->file) && path_is_third_party(b->file)) { scanlog("Component rejected by third party filter\n"); return false; @@ -331,7 +331,7 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ { if (purl_source_check(a) > purl_source_check(b)) { - scanlog("Component prefered by vsource\n"); + scanlog("Component prefered by source\n"); return true; } @@ -471,39 +471,15 @@ bool load_matches(match_data_t *match) { scanlog("Load matches\n"); - /* Compile match ranges and fill up matched percent */ - int hits = 100; - int matched_percent = 100; - /* Get matching line ranges (snippet match) */ - if 
(match->type == MATCH_SNIPPET) - { - hits = compile_ranges(match); - scanlog("compile_ranges returns %d hits\n", hits); - - if (hits < min_match_hits) - { - match->type = MATCH_NONE; - return false; - } - - float percent = (hits * 100) / match->scan_ower->total_lines; - if (hits) - matched_percent = floor(percent); - if (matched_percent > 99) - matched_percent = 99; - if (matched_percent < 1) - matched_percent = 1; - - asprintf(&match->matched_percent, "%u%%", matched_percent); - } - else if (match->type == MATCH_BINARY) + + if (match->type == MATCH_BINARY) { asprintf(&match->line_ranges, "n/a"); asprintf(&match->oss_ranges, "n/a"); asprintf(&match->matched_percent, "%d functions matched", match->hits); } - else + else if (match->type == MATCH_FILE) { asprintf(&match->line_ranges, "all"); asprintf(&match->oss_ranges, "all"); @@ -696,11 +672,13 @@ void match_select_best(scan_data_t *scan) break; } - if (!best_match_component->identified && match_component->identified) + if ((!best_match_component->identified && match_component->identified) || + (strcmp(best_match_component->vendor,best_match_component->component) && !strcmp(match_component->vendor, match_component->component)) || + (path_is_third_party(best_match_component->file) && !path_is_third_party(match_component->file))) { scanlog("Replacing best match for a prefered component\n"); scan->matches_list_array[i]->best_match = item->match; - } + } } } diff --git a/src/match_list.c b/src/match_list.c index 276fa7d..9df35af 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -311,11 +311,11 @@ bool match_list_add(match_list_t *list, match_data_t *new_match, bool (*val)(mat } /* in autolimit mode the list doesnt have a fix size, it will accept all the matchest until a 75% of the fist element (the biggest) */ //TODO: this part of the code should be in the function pointer or I need to re-evaluate the archtecture of this function */ - if (list->autolimit && !tolerance_eval(list->headp.lh_first->match->hits, 
list->last_element->match->hits)) + if (list->autolimit && !tolerance_eval(list->headp.lh_first->match->lines_matched, list->last_element->match->lines_matched)) { np = list->headp.lh_first; /*We have to find and remove the unwanted elements */ - for (; np->entries.le_next != NULL && tolerance_eval(list->headp.lh_first->match->hits, np->entries.le_next->match->hits); np = np->entries.le_next) + for (; np->entries.le_next != NULL && tolerance_eval(list->headp.lh_first->match->lines_matched, np->entries.le_next->match->lines_matched); np = np->entries.le_next) { } @@ -403,6 +403,18 @@ bool match_list_print(match_list_t *list, bool (*printer)(match_data_t *fpa), ch return true; } +bool match_list_eval(match_list_t *list, match_data_t * in, bool (*eval)(match_data_t *fpa, match_data_t *fpb)) +{ + int i = 0; + for (struct entry *np = list->headp.lh_first; np != NULL && iitems; np = np->entries.le_next) + { + if(eval(np->match, in)) + return true; + i++; + } + return false; +} + void component_list_print(component_list_t *list, bool (*printer)(component_data_t *fpa), char *separator) { for (struct comp_entry *np = list->headp.lh_first; np != NULL; np = np->entries.le_next) diff --git a/src/report.c b/src/report.c index 99c058c..c650328 100644 --- a/src/report.c +++ b/src/report.c @@ -334,17 +334,19 @@ bool print_json_match(struct match_data_t * match) printf(",\"source_hash\": \"%s\"", match->source_md5); /* Output file_url (same as url when match type = url) */ - char * file_url_enabled = getenv("SCANOSS_FILE_CONTENTS"); - if (!file_url_enabled || strcmp(file_url_enabled, "false")) + char * file_contents_url = getenv("SCANOSS_FILE_CONTENTS_URL"); + if (file_contents_url && *file_contents_url && strcmp(file_contents_url, "false")) { if (!match->component_list.headp.lh_first->component->url_match) { - char *custom_url = getenv("SCANOSS_API_URL"); - printf(",\"file_url\": \"%s/file_contents/%s\"", custom_url ? 
custom_url : API_URL, file_id); + printf(",\"file_url\": \"%s/%s\"", file_contents_url, file_id); } else printf(",\"file_url\": \"%s\"", match->component_list.headp.lh_first->component->url); } + else //return an empty string + printf(",\"file_url\": \" \""); + free(file_id); diff --git a/src/scan.c b/src/scan.c index 608fdfd..8d586d0 100644 --- a/src/scan.c +++ b/src/scan.c @@ -59,7 +59,7 @@ scan_data_t * scan_data_init(char *target, int max_snippets, int max_components) scan_data_t * scan = calloc(1, sizeof(*scan)); scan->file_path = strdup(target); scan->file_size = malloc(32); - scan->hashes = malloc(MAX_FILE_SIZE); + scan->hashes = calloc(MAX_FILE_SIZE,1); scan->lines = malloc(MAX_FILE_SIZE); scan->match_type = MATCH_NONE; diff --git a/src/snippets.c b/src/snippets.c index 6bb4fb2..1c3717c 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -40,6 +40,7 @@ #include "match.h" #include "match_list.h" #include "stdlib.h" +#include "snippets.h" int matchmap_max_files = DEFAULT_MATCHMAP_FILES; /** @@ -91,6 +92,20 @@ static bool hit_test(match_data_t *a, match_data_t *b) else return false; } + +bool ranges_intersection(match_data_t *a, match_data_t *b) +{ + for (int i = 0; i < a->matchmap_reg->ranges_number; i++) + { + for (int j = 0; j < b->matchmap_reg->ranges_number; j++) + { + if (a->matchmap_reg->range[i].from <= b->matchmap_reg->range[j].to && + b->matchmap_reg->range[j].from <= a->matchmap_reg->range[i].to) + return true; + } + } + return false; +} /** * @brief Fill the matches list array based on the matchmap. The possible matches will be sorted by hits number. 
* @@ -103,9 +118,8 @@ void biggest_snippet(scan_data_t *scan) for (int i = 0; i < scan->max_snippets_to_process; i++) scan->matches_list_array_indirection[i] = -1; - int snippet_tolerance = range_tolerance / scan->max_snippets_to_process + min_match_lines; /* Used to define bounds between two possible snippets */ /*Fill the matches list with the files from the matchmap */ - for (int sector = 0; sector < 255; sector++) + for (int sector = 0; sector < 256; sector++) { int j = scan->matchmap_rank_by_sector[sector]; @@ -115,48 +129,65 @@ void biggest_snippet(scan_data_t *scan) if (scan->matchmap[j].hits >= min_match_hits) /* Only consider file with more than min_match_hits */ { match_data_t *match_new = calloc(1, sizeof(match_data_t)); /* Create a match object */ - memcpy(match_new->file_md5, scan->matchmap[j].md5, MD5_LEN); + memcpy(match_new->file_md5, scan->matchmap[j].md5, oss_file.key_ln); match_new->hits = scan->matchmap[j].hits; match_new->matchmap_reg = &scan->matchmap[j]; match_new->type = scan->match_type; match_new->from = scan->matchmap[j].range->from; strcpy(match_new->source_md5, scan->source_md5); match_new->scan_ower = scan; - bool found = false; int i = 0; - for (; i < scan->matches_list_array_index; i++) /*Check if there is already a list for this line ranges */ + + if (snippet_extension_discard(match_new)) { - if (scan->matches_list_array_indirection[i] > -1 && - abs(scan->matches_list_array_indirection[i] - match_new->from) < snippet_tolerance) - { - found = true; - break; - } + match_data_free(match_new); + continue; + } + + int matched_lines = compile_ranges(match_new); + if (matched_lines < min_match_lines) { + match_data_free(match_new); + continue; } - if (!found) /*If there is no list for the snippet range we have to create a new one */ + float percent = (matched_lines * 100) / match_new->scan_ower->total_lines; + int matched_percent = floor(percent); + if (matched_percent > 99) + matched_percent = 99; + if (matched_percent < 1) + 
matched_percent = 1; + asprintf(&match_new->matched_percent, "%u%%", matched_percent); + match_new->lines_matched = matched_lines; + //match_new->hits = hits; + + do /*Check if there is already a list for this line ranges */ { - if (scan->matches_list_array_index < scan->max_snippets_to_process) /* Check for the list limit */ + if (!scan->matches_list_array[scan->matches_list_array_index] && scan->matches_list_array_index < scan->max_snippets_to_process) { - scan->matches_list_array_indirection[scan->matches_list_array_index] = match_new->from; /*update indirection*/ - scan->matches_list_array[scan->matches_list_array_index] = match_list_init(true, 1); /*create the list*/ - i = scan->matches_list_array_index; /* update index*/ + scan->matches_list_array[scan->matches_list_array_index] = match_list_init(true, 1); /*create the list if it doesnt exist*/ scan->matches_list_array_index++; + if(!match_list_add(scan->matches_list_array[i], match_new, hit_test, true)) + { + match_data_free(match_new); + } + break; } - else - i = scan->max_snippets_to_process - 1; /*add in the last available list if there is no more space for new lists*/ - } - - if (snippet_extension_discard(match_new) || !match_list_add(scan->matches_list_array[i], match_new, hit_test, true)) /*Add the match in the selected list */ - { - scanlog("Rejected match with %d hits\n", match_new->hits); - match_data_free(match_new); /* the the memory if the match was not accepted in the list */ - } + if (match_list_eval(scan->matches_list_array[i], match_new, ranges_intersection) || i == scan->max_snippets_to_process -1) + { + if(!match_list_add(scan->matches_list_array[i], match_new, hit_test, true)) + { + match_data_free(match_new); + } + break; + } + i++; + } while(i < scan->matches_list_array_index); /*Check if there is already a list for this line ranges */ } } /*just for loging*/ if (debug_on) { + scanlog("Match list array index: %d\n", scan->matches_list_array_index); for (int i = 0; i < 
scan->matches_list_array_index; i++) { scanlog("Match list N %d, with %d matches. %d <= HITS <= %d \n", i, scan->matches_list_array[i]->items, @@ -165,8 +196,8 @@ void biggest_snippet(scan_data_t *scan) struct entry *item = NULL; LIST_FOREACH(item, &scan->matches_list_array[i]->headp, entries) { - char md5_hex[MD5_LEN * 2 + 1]; - ldb_bin_to_hex(item->match->file_md5, MD5_LEN, md5_hex); + char md5_hex[oss_file.key_ln * 2 + 1]; + ldb_bin_to_hex(item->match->file_md5, oss_file.key_ln, md5_hex); scanlog("%s - %d\n", md5_hex, item->match->hits); } } @@ -348,8 +379,11 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) { if(out_ranges_index >= 0 && (ranges[i].from - tolerance <= out_ranges[out_ranges_index].to)) { + if (out_ranges[out_ranges_index].to > ranges[i].to) + continue; + out_ranges[out_ranges_index].to = ranges[i].to; - scanlog("join range %d with %d\n", i, out_ranges_index); + //scanlog("join range %d with %d: %d - %d\n", i, out_ranges_index, out_ranges[out_ranges_index].from, out_ranges[out_ranges_index].to); } else { @@ -397,35 +431,7 @@ uint32_t compile_ranges(match_data_t *match) return 0; } - uint16_t reported_hits = match->matchmap_reg->hits; int hits = 0; - /* Revise hits and decrease if needed */ - for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) - { - long from = match->matchmap_reg->range[i].from; //uint16_read(match->matchmap_reg + MD5_LEN + 2 + i * 6); - long to = match->matchmap_reg->range[i].to; //uint16_read(match->matchmap_reg + MD5_LEN + 2 + i * 6 + 2); - long delta = to - from; - - if (to < 1) - break; - - /* Ranges to be ignored (under min_match_lines) should decrease hits counter */ - if (delta < min_match_lines) - { - /* Single-line range decreases by 1, otherwise decrease by 2 (from and to) */ - reported_hits -= ((delta == 0) ? 
1 : 2); - } - - /* Exit if hits is below two */ - if (reported_hits < min_match_hits) - { - scanlog("Discarted ranges brings hits count to %u\n", reported_hits); - return 0; - } - - scanlog("compile_ranges #%d = %ld to %ld - OSS from: %d\n", i, from, to, match->matchmap_reg->range[i].oss_line); - } - /* Add tolerances and assemble line ranges */ ranges_sort(match->matchmap_reg->range, match->matchmap_reg->ranges_number); @@ -800,7 +806,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) } if (cat_limit > scan->max_matchmap_size) { - if ((hashes_to_process < scan->hash_count / 10 || (float) lines_coverage / scan->hash_count < 0.6) && cat_limit < MAX_MATCHMAP_FILES) + if ((hashes_to_process < scan->hash_count / 10 || (float) lines_coverage / scan->hash_count < MIN_LINES_COVERAGE) && cat_limit < MAX_MATCHMAP_FILES) { scan->max_matchmap_size += map[map_indirection[i][j]].size; } diff --git a/src/util.c b/src/util.c index cfbcb99..4151d66 100644 --- a/src/util.c +++ b/src/util.c @@ -330,10 +330,10 @@ void free_and_null(void * pr) bool path_is_third_party(const char* path) { - // Array de patrones comunes const char* patterns[] = { "third_party", "3rdparty", + "site-packages", "vendor", "external", "dependencies", @@ -376,10 +376,8 @@ bool path_is_third_party(const char* path) "LibResources" }; - // Número de patrones a verificar const int numPatterns = sizeof(patterns) / sizeof(patterns[0]); - // Verificar cada patrón for (int i = 0; i < numPatterns; i++) { if (strstr(path, patterns[i]) != NULL) diff --git a/src/versions.c b/src/versions.c index 405beed..2639bf8 100644 --- a/src/versions.c +++ b/src/versions.c @@ -55,10 +55,10 @@ void normalise_version(char *version, char *component) return; char aux[MAX_FIELD_LN] = "\0"; + int compt_len = strlen(component); /* Remove leading component name from version */ - if ((version && component) && stristart(version, component)) + if ((version && component) && stristart(version, component) && strlen(version) > compt_len + 1) { - int 
compt_len = strlen(component); sprintf(aux, "%s",version + compt_len + 1); } From e39a4d9299cca189c38708c813d39ef13ead148a Mon Sep 17 00:00:00 2001 From: core software devel Date: Mon, 5 Aug 2024 12:16:40 +0000 Subject: [PATCH 04/19] major change on mutiple snippet detection --- src/snippets.c | 49 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/src/snippets.c b/src/snippets.c index 1c3717c..ae8a89b 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -144,20 +144,14 @@ void biggest_snippet(scan_data_t *scan) continue; } - int matched_lines = compile_ranges(match_new); - if (matched_lines < min_match_lines) { - match_data_free(match_new); - continue; - } - - float percent = (matched_lines * 100) / match_new->scan_ower->total_lines; + int hits = compile_ranges(match_new); + float percent = (hits * 100) / match_new->scan_ower->total_lines; int matched_percent = floor(percent); if (matched_percent > 99) matched_percent = 99; if (matched_percent < 1) matched_percent = 1; asprintf(&match_new->matched_percent, "%u%%", matched_percent); - match_new->lines_matched = matched_lines; //match_new->hits = hits; do /*Check if there is already a list for this line ranges */ @@ -372,7 +366,7 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) processed = 0; out_ranges[0] = ranges[0]; memset(out_ranges, 0, sizeof(matchmap_range) * MATCHMAP_RANGES); - scanlog("Range tolerance: %d\n", tolerance); + //scanlog("Range tolerance: %d\n", tolerance); for (int i = 0; i < size; i++) { if (ranges[i].from && ranges[i].to) @@ -383,7 +377,7 @@ matchmap_range * ranges_join_overlapping(matchmap_range *ranges, int size) continue; out_ranges[out_ranges_index].to = ranges[i].to; - //scanlog("join range %d with %d: %d - %d\n", i, out_ranges_index, out_ranges[out_ranges_index].from, out_ranges[out_ranges_index].to); + //scanlog("join range %d with %d\n", i, out_ranges_index); } else { @@ -432,10 +426,37 @@ 
uint32_t compile_ranges(match_data_t *match) } int hits = 0; + /* Revise hits and decrease if needed */ + for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) + { + long from = match->matchmap_reg->range[i].from; + long to = match->matchmap_reg->range[i].to; + long delta = to - from; + + if (to < 1) + break; + + /* Ranges to be ignored (under min_match_lines) should decrease hits counter */ + if (delta < min_match_lines) + { + /* Single-line range decreases by 1, otherwise decrease by 2 (from and to) */ + reported_hits -= ((delta == 0) ? 1 : 2); + } + + /* Exit if hits is below two */ + if (reported_hits < min_match_hits) + { + scanlog("Discarted ranges brings hits count to %u\n", reported_hits); + return 0; + } + + //scanlog("compile_ranges #%d = %ld to %ld - OSS from: %d\n", i, from, to, match->matchmap_reg->range[i].oss_line); + } + /* Add tolerances and assemble line ranges */ ranges_sort(match->matchmap_reg->range, match->matchmap_reg->ranges_number); - if (debug_on) + /*if (debug_on) { scanlog("Accepted ranges (min lines range = %d):\n", min_match_lines); for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) @@ -444,7 +465,7 @@ uint32_t compile_ranges(match_data_t *match) scanlog(" %d = %ld to %ld - OSS from: %d\n", i, match->matchmap_reg->range[i].from,match->matchmap_reg->range[i].to, match->matchmap_reg->range[i].oss_line); } - } + }*/ matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number); @@ -459,7 +480,7 @@ uint32_t compile_ranges(match_data_t *match) } } - if (debug_on) + /*if (debug_on) { scanlog("Final ranges:\n"); for (uint32_t i = 0; i < MATCHMAP_RANGES; i++) @@ -467,7 +488,7 @@ uint32_t compile_ranges(match_data_t *match) if ( ranges[i].from && ranges[i].to) scanlog(" %d = %ld to %ld - OSS from: %d\n", i, ranges[i].from, ranges[i].to, ranges[i].oss_line); } - } + }*/ hits = ranges_assemble(ranges, line_ranges, oss_ranges); match->line_ranges = strdup(line_ranges); 
match->oss_ranges = strdup(oss_ranges); From 4ed8fac3fa5eef289711d61b96d66622f068503a Mon Sep 17 00:00:00 2001 From: core software devel Date: Wed, 7 Aug 2024 11:15:14 +0000 Subject: [PATCH 05/19] force the engine to pick different components. Add component_list_update function --- inc/match_list.h | 9 ++++++++- src/match.c | 51 +++++++++++++++++++++++++++++++++++++----------- src/match_list.c | 18 +++++++++++++++++ 3 files changed, 66 insertions(+), 12 deletions(-) diff --git a/inc/match_list.h b/inc/match_list.h index 6520724..6773972 100644 --- a/inc/match_list.h +++ b/inc/match_list.h @@ -81,6 +81,13 @@ #define MATCH_LIST_TOLERANCE 99.9 typedef struct match_data_t match_data_t; /* Forward declaration */ +typedef enum +{ + LIST_ITEM_NOT_FOUND = 0, + LIST_ITEM_FOUND, + LIST_ITEM_UPDATE +} list_update_t; + /** * @brief Define a list of component_data_t * @@ -147,5 +154,5 @@ void component_list_destroy(component_list_t *list); bool component_list_add_binary(component_list_t *list, component_data_t *new_comp, bool (*val)(component_data_t *a, component_data_t *b), bool remove_a); bool match_list_eval(match_list_t *list, match_data_t * in, bool (*eval)(match_data_t *fpa, match_data_t *fpb)); void match_list_tolerance_set(float in); - +bool component_list_update(component_list_t *list, component_data_t * in, list_update_t (*eval)(component_data_t *fpa, component_data_t *fpb)); #endif diff --git a/src/match.c b/src/match.c index 56f9427..a3ad3c8 100644 --- a/src/match.c +++ b/src/match.c @@ -381,6 +381,19 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ return false; } +list_update_t component_update(component_data_t *a, component_data_t *b) +{ + if (a && b && a->purls[0] && b->purls[0] && a->release_date && b->release_date && !strcmp(a->purls[0], b->purls[0])) + { + if (strcmp(b->release_date, a->release_date) < 0) + return LIST_ITEM_UPDATE; + else + return LIST_ITEM_FOUND; + } + else + return LIST_ITEM_NOT_FOUND; +} + bool 
add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, char *path) { component_data_t *new_comp = NULL; @@ -391,18 +404,34 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, fill_component_path(new_comp, path); /* Create a new component and fill it from the url record */ - - new_comp->file_md5_ref = component_list->match_ref->file_md5; - /* If the component is valid add it to the component list */ - /* The component list is a fixed size list, of size 3 by default, this means the list will keep the free oldest components*/ - /* The oldest component will be the first in the list, if two components have the same age the purl date will untie */ - new_comp->file_path_ref = component_list->match_ref->scan_ower->file_path; - new_comp->path_rank = PATH_LEVEL_COMP_INIT_VALUE; - - scanlog("--- new comp: %s@%s %s %d---\n", new_comp->purls[0], new_comp->version, new_comp->release_date, new_comp->identified); - if (!component_list_add(component_list, new_comp, component_hint_date_comparation, true)) + component_data_t *new_comp = calloc(1, sizeof(*new_comp)); + bool result = fill_component(new_comp, url_id, path, (uint8_t *)url_rec); + if (result) { - component_data_free(new_comp); /* Free if the componet was rejected */ + new_comp->file_md5_ref = component_list->match_ref->file_md5; + /* If the component is valid add it to the component list */ + /* The component list is a fixed size list, of size 3 by default, this means the list will keep the free oldest components*/ + /* The oldest component will be the first in the list, if two components have the same age the purl date will untie */ + new_comp->identified = IDENTIFIED_NONE; + asset_declared(new_comp); + new_comp->file_path_ref = component_list->match_ref->scan_ower->file_path; + new_comp->path_rank = PATH_LEVEL_COMP_INIT_VALUE; + if (!component_list_update(component_list, new_comp, component_update)) + { + scanlog("--- new comp %s---\n", new_comp->component); + if 
(!component_list_add(component_list, new_comp, component_hint_date_comparation, true)) + { + scanlog("component rejected: %s\n", new_comp->purls[0]); + component_data_free(new_comp); /* Free if the componet was rejected */ + } + else + scanlog("component accepted: %s - pathrank: %d\n", new_comp->purls[0], new_comp->path_rank); + } + else if (debug_on) + { + scanlog("--- Componen already exist: %s---\n", new_comp->component); + } + } else { diff --git a/src/match_list.c b/src/match_list.c index 9df35af..09981e5 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -427,6 +427,24 @@ void component_list_print(component_list_t *list, bool (*printer)(component_data } } +bool component_list_update(component_list_t *list, component_data_t * in, list_update_t (*eval)(component_data_t *fpa, component_data_t *fpb)) +{ + for (struct comp_entry *np = list->headp.lh_first; np != NULL; np = np->entries.le_next) + { + list_update_t r = eval(np->component, in); + if (r == LIST_ITEM_UPDATE) + { + component_data_t * aux = np->component; + np->component = in; + component_data_free(aux); + return true; + } + else if (r == LIST_ITEM_FOUND) + return true; + } + return false; +} + void match_list_process(match_list_t *list, bool (*funct_p)(match_data_t *fpa)) { for (struct entry *np = list->headp.lh_first; np != NULL; np = np->entries.le_next) From 3164bcb08d38a5c31dda5e7eb6e5f2cbc2f0a39e Mon Sep 17 00:00:00 2001 From: core software devel Date: Wed, 7 Aug 2024 13:56:54 +0000 Subject: [PATCH 06/19] solve memory leaks --- src/license.c | 7 +++---- src/match.c | 9 ++++----- src/quality.c | 19 +++++++++---------- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/license.c b/src/license.c index 93c5e5a..3281ce6 100644 --- a/src/license.c +++ b/src/license.c @@ -80,19 +80,18 @@ void normalize_license(char *license) { char def[MAX_ARGLN]; strcpy(def, license_normalization[i]); - char *token; /* get the first token */ - token = strtok(def, ","); + char * token = strtok(def, 
","); - char *spdx = token; + //char *spdx = token; /* walk through other tokens */ while (token != NULL) { if (stricmp(license, token)) { - strcpy(license, spdx); + strcpy(license, token); return; } token = strtok(NULL, ","); diff --git a/src/match.c b/src/match.c index a3ad3c8..dd68605 100644 --- a/src/match.c +++ b/src/match.c @@ -388,7 +388,11 @@ list_update_t component_update(component_data_t *a, component_data_t *b) if (strcmp(b->release_date, a->release_date) < 0) return LIST_ITEM_UPDATE; else + { + scanlog("--- Componen already exist: %s---\n", b->component); + component_data_free(b); return LIST_ITEM_FOUND; + } } else return LIST_ITEM_NOT_FOUND; @@ -427,11 +431,6 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, else scanlog("component accepted: %s - pathrank: %d\n", new_comp->purls[0], new_comp->path_rank); } - else if (debug_on) - { - scanlog("--- Componen already exist: %s---\n", new_comp->component); - } - } else { diff --git a/src/quality.c b/src/quality.c index bc9fcb6..5f9a8db 100644 --- a/src/quality.c +++ b/src/quality.c @@ -56,13 +56,13 @@ const char *quality_sources[] = {"best_practices"}; bool print_quality_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { - match_data_t * match = (match_data_t*) ptr; - char *CSV = (char*) data; + char ** out = ptr; + char *csv = (char*) data; char *source = calloc(MAX_JSON_VALUE_LEN, 1); char *quality = calloc(MAX_JSON_VALUE_LEN, 1); - extract_csv(source, CSV, 1, MAX_JSON_VALUE_LEN); - extract_csv(quality, CSV, 2, MAX_JSON_VALUE_LEN); + extract_csv(source, csv, 1, MAX_JSON_VALUE_LEN); + extract_csv(quality, csv, 2, MAX_JSON_VALUE_LEN); int src = atoi(source); @@ -85,7 +85,7 @@ bool print_quality_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *d len += sprintf(result+len,"\"score\": \"%s\",", quality); len += sprintf(result+len,"\"source\": \"%s\"", quality_sources[atoi(source)]); len += 
sprintf(result+len,"}"); - match->quality_text = strdup(result); + *out = strdup(result); } @@ -105,17 +105,16 @@ void print_quality(match_data_t * match) return; char result[MAX_FIELD_LN] = "\0"; - match->quality_text = NULL; + char * aux = NULL; sprintf(result,"\"quality\": ["); - ldb_fetch_recordset(NULL, oss_quality, match->file_md5, false, print_quality_item, match); + ldb_fetch_recordset(NULL, oss_quality, match->file_md5, false, print_quality_item, &aux); - char * aux = NULL; - asprintf(&aux, "%s%s]", result, match->quality_text ? match->quality_text : ""); free(match->quality_text); - match->quality_text = aux; + asprintf(&match->quality_text, "%s%s]", result, aux ? aux : ""); + free(aux); } From c96d28c58e6921a21a35a2a78329592b103b3398 Mon Sep 17 00:00:00 2001 From: core software devel Date: Thu, 22 Aug 2024 12:35:49 +0000 Subject: [PATCH 07/19] replace MD5_LEN by key_ln --- inc/util.h | 5 ----- src/attributions.c | 16 ++++++++-------- src/binary_scan.c | 20 ++++++++++---------- src/component.c | 12 ++++++------ src/debug.c | 4 ++-- src/dependency.c | 12 ++++++------ src/file.c | 8 ++++---- src/hpsm.c | 6 +++--- src/license.c | 2 +- src/match.c | 16 ++++++++-------- src/match_list.c | 4 ++-- src/mz.c | 6 +++--- src/query.c | 4 ++-- src/report.c | 13 +++++-------- src/scan.c | 19 +++++++------------ src/snippets.c | 4 ++-- src/url.c | 6 +++--- src/util.c | 21 ++++++--------------- 18 files changed, 78 insertions(+), 100 deletions(-) diff --git a/inc/util.h b/inc/util.h index 7741e6b..5f1077e 100644 --- a/inc/util.h +++ b/inc/util.h @@ -26,11 +26,6 @@ char *datestamp(void); /* Prints a "created" JSON element with the current datestamp */ void print_datestamp(void); -//void file_md5(char *filepath, uint8_t *md5_result); - -/* Returns a string with a hex representation of md5 */ -char *md5_hex(uint8_t *md5); - /* Removes chr from str */ void remove_char(char *str, char chr); diff --git a/src/attributions.c b/src/attributions.c index e9a85b4..ac67673 100644 --- 
a/src/attributions.c +++ b/src/attributions.c @@ -54,10 +54,10 @@ bool notices_handler(uint8_t *key, uint8_t *subkey, int subkey_ln, \ uint8_t *data, uint32_t datalen, int iteration, void *ptr) { - if (datalen != 2 * MD5_LEN) return false; - char hexkey[MD5_LEN * 2 + 1]; - memcpy(hexkey, data, MD5_LEN * 2); - hexkey[MD5_LEN * 2] = 0; + if (datalen != 2 * oss_attribution.key_ln) return false; + char hexkey[oss_attribution.key_ln * 2 + 1]; + memcpy(hexkey, data, oss_attribution.key_ln * 2); + hexkey[oss_attribution.key_ln * 2] = 0; /* Print attribution notice header */ char *component = (char *) ptr; @@ -86,11 +86,11 @@ uint8_t *data, uint32_t datalen, int iteration, void *ptr) { bool *valid = (bool *) ptr; - if (datalen != MD5_LEN) return false; + if (datalen != oss_attribution.key_ln) return false; /* Convert key */ uint8_t attr_id[16]; - ldb_hex_to_bin((char *) data, MD5_LEN * 2, attr_id); + ldb_hex_to_bin((char *) data, oss_attribution.key_ln * 2, attr_id); /* Define mz_job values */ struct mz_job job; @@ -100,7 +100,7 @@ uint8_t *data, uint32_t datalen, int iteration, void *ptr) job.mz_ln = 0; job.id = NULL; job.ln = 0; - job.md5[MD5_LEN] = 0; + job.md5[oss_attribution.key_ln] = 0; job.key = NULL; /* If file does not exist, exit with valid = false */ @@ -288,7 +288,7 @@ int attribution_notices(char * components) char * licenses_json = notices_load_file(); /* Validate SBOM */ declared_components = get_components(components); - if (check_purl_attributions(oss_attribution, licenses_json) && !debug_on) + if (check_purl_attributions(oss_attribution, licenses_json)) /* Print attribution notices */ print_purl_attribution_notices(oss_attribution, licenses_json); diff --git a/src/binary_scan.c b/src/binary_scan.c index a7f7a88..9e72ed8 100644 --- a/src/binary_scan.c +++ b/src/binary_scan.c @@ -68,7 +68,7 @@ static bool add_purl_from_urlid(uint8_t *key, uint8_t *subkey, int subkey_ln, ui if (iteration > MAX_URLS) return true; /* Ignore path lengths over the limit */ - if 
(!datalen || datalen >= (MD5_LEN + MAX_FILE_PATH)) return false; + if (!datalen || datalen >= (oss_file.key_ln + MAX_FILE_PATH)) return false; /* Decrypt data */ char * decrypted = decrypt_data(raw_data, datalen, oss_file, key, subkey); @@ -77,8 +77,8 @@ static bool add_purl_from_urlid(uint8_t *key, uint8_t *subkey, int subkey_ln, ui component_list_t * component_list = (component_list_t*) ptr; /* Copy data to memory */ - uint8_t url_id[MD5_LEN]; - memcpy(url_id, raw_data, MD5_LEN); + uint8_t url_id[oss_url.key_ln]; + memcpy(url_id, raw_data, oss_url.key_ln); char path[MAX_FILE_PATH+1]; strncpy(path, decrypted, MAX_FILE_PATH); @@ -136,7 +136,7 @@ static bool get_all_file_ids(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8 { if (iteration < max_files_to_process * 2) { - memcpy(files[iteration].url_id, data, MD5_LEN); + memcpy(files[iteration].url_id, data, oss_url.key_ln); return false; } return true; @@ -253,16 +253,16 @@ extern bool first_file; int binary_scan(char * input) { /* Get file MD5 */ - char * hexmd5 = strndup(input, MD5_LEN * 2); + char * hexmd5 = strndup(input, oss_file.key_ln * 2); scanlog("Bin File md5 to be scanned: %s\n", hexmd5); - uint8_t bin_md5[MD5_LEN]; - ldb_hex_to_bin(hexmd5, MD5_LEN * 2, bin_md5); + uint8_t bin_md5[oss_file.key_ln]; + ldb_hex_to_bin(hexmd5, oss_file.key_ln * 2, bin_md5); free(hexmd5); - uint8_t zero_md5[MD5_LEN] = {0xd4,0x1d,0x8c,0xd9,0x8f,0x00,0xb2,0x04,0xe9,0x80,0x09,0x98,0xec,0xf8,0x42,0x7e}; //empty string md5 + /*uint8_t zero_md5[oss_file.key_ln] = {0xd4,0x1d,0x8c,0xd9,0x8f,0x00,0xb2,0x04,0xe9,0x80,0x09,0x98,0xec,0xf8,0x42,0x7e}; //empty string md5 if (!memcmp(zero_md5,bin_md5, MD5_LEN)) //the md5 key of an empty string must be skipped. 
- return -1; + return -1;*/ if (ldb_key_exists(oss_file, bin_md5)) { @@ -272,7 +272,7 @@ int binary_scan(char * input) char * target = strndup(file_name, target_len); scan_data_t * scan = scan_data_init(target, 1, 1); free(target); - memcpy(scan->md5, bin_md5, MD5_LEN); + memcpy(scan->md5, bin_md5, oss_file.key_ln); scan->match_type = MATCH_FILE; compile_matches(scan); diff --git a/src/component.c b/src/component.c index 6861ec3..1e2734d 100644 --- a/src/component.c +++ b/src/component.c @@ -95,7 +95,7 @@ component_data_t *component_data_copy(component_data_t *in) out->latest_version = strdup(in->latest_version); out->license = strdup(in->license); out->url_match = in->url_match; - memcpy(out->url_md5, in->url_md5, MD5_LEN); + memcpy(out->url_md5, in->url_md5, oss_url.key_ln); if (in->main_url) out->main_url = strdup(in->main_url); out->url = strdup(in->url); @@ -109,8 +109,8 @@ component_data_t *component_data_copy(component_data_t *in) if (in->purls_md5[i]) { - out->purls_md5[i] = malloc(MD5_LEN); - memcpy(out->purls_md5[i], in->purls_md5[i], MD5_LEN); + out->purls_md5[i] = malloc(oss_purl.key_ln); + memcpy(out->purls_md5[i], in->purls_md5[i], oss_purl.key_ln); } } @@ -236,7 +236,7 @@ bool fill_component(component_data_t *component, uint8_t *url_key, char *file_pa /* Extract fields from file record */ if (url_key) { - memcpy(component->url_md5, url_key, MD5_LEN); + memcpy(component->url_md5, url_key, oss_url.key_ln); if (file_path) { fill_component_path(component, file_path); @@ -301,14 +301,14 @@ bool component_date_comparation(component_data_t *a, component_data_t *b) if (!a->purls_md5[0] && a->purls[0]) { - a->purls_md5[0] = malloc(MD5_LEN); + a->purls_md5[0] = malloc(oss_url.key_ln); MD5((uint8_t *)a->purls[0], strlen(a->purls[0]), a->purls_md5[0]); a->age = get_component_age(a->purls_md5[0]); } if (!b->purls_md5[0] && b->purls[0]) { - b->purls_md5[0] = malloc(MD5_LEN); + b->purls_md5[0] = malloc(oss_purl.key_ln); MD5((uint8_t *)b->purls[0], 
strlen(b->purls[0]), b->purls_md5[0]); b->age = get_component_age(b->purls_md5[0]); } diff --git a/src/debug.c b/src/debug.c index 0b1e9f8..040a267 100644 --- a/src/debug.c +++ b/src/debug.c @@ -162,7 +162,7 @@ void map_dump(scan_data_t *scan) /* Print matching MD5 */ uint8_t *md5 = scan->matchmap[i].md5; - for (int j = 0; j < MD5_LEN; j++) fprintf(map, "%02x", md5[j]); + for (int j = 0; j < oss_file.key_ln; j++) fprintf(map, "%02x", md5[j]); /* Print hits */ fprintf(map, " %04x ", scan->matchmap[i].hits); @@ -198,7 +198,7 @@ void scan_benchmark() { scan_data_t * scan = scan_data_init("pseudo_file", 0, 0); scan->preload = true; - memcpy(scan->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", MD5_LEN); + memset(scan->md5, 0, oss_file.key_ln); strcpy(scan->file_size, "1024"); progress ("Scanning: ", f + 1, total_files, false); diff --git a/src/dependency.c b/src/dependency.c index e3dae5f..1334694 100644 --- a/src/dependency.c +++ b/src/dependency.c @@ -123,10 +123,10 @@ int print_dependencies(component_data_t * comp) if (!records) for (int i = 0; i < MAX_PURLS && comp->purls[i]; i++) { - uint8_t md5[MD5_LEN]; - purl_version_md5(md5, comp->purls[i], comp->version); + uint8_t hash[oss_purl.key_ln]; + purl_version_md5(hash, comp->purls[i], comp->version); - records = ldb_fetch_recordset(NULL, oss_dependency, md5, false, print_dependencies_item, comp); + records = ldb_fetch_recordset(NULL, oss_dependency, hash, false, print_dependencies_item, comp); if (records) { scanlog("Dependency matches (%d) reported for %s@%s\n", records, comp->purls[i],comp->version); @@ -139,10 +139,10 @@ int print_dependencies(component_data_t * comp) if (!records) for (int i = 0; i < MAX_PURLS && comp->purls[i]; i++) { - uint8_t md5[MD5_LEN]; - purl_version_md5(md5, comp->purls[i], comp->latest_version); + uint8_t hash[oss_purl.key_ln]; + purl_version_md5(hash, comp->purls[i], comp->latest_version); - records = ldb_fetch_recordset(NULL, oss_dependency, md5, false, print_dependencies_item, comp); + 
records = ldb_fetch_recordset(NULL, oss_dependency, hash, false, print_dependencies_item, comp); if (records) { scanlog("Dependency matches (%d) reported for %s@%s\n", records, comp->purls[i],comp->latest_version); diff --git a/src/file.c b/src/file.c index 204915e..acd9338 100644 --- a/src/file.c +++ b/src/file.c @@ -196,7 +196,7 @@ bool collect_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra if (iteration >= FETCH_MAX_FILES) return true; /* Ignore path lengths over the limit */ - if (!datalen || datalen >= (MD5_LEN + MAX_FILE_PATH)) return false; + if (!datalen || datalen >= (oss_file.key_ln + MAX_FILE_PATH)) return false; /* Decrypt data */ char * decrypted = decrypt_data(raw_data, datalen, oss_file, key, subkey); @@ -205,7 +205,7 @@ bool collect_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra /* Copy data to memory */ file_recordset *files = ptr; - memcpy(files[iteration].url_id, raw_data, MD5_LEN); + memcpy(files[iteration].url_id, raw_data, oss_url.key_ln); strncpy(files[iteration].path, decrypted, MAX_FILE_PATH); free(decrypted); @@ -227,7 +227,7 @@ bool collect_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra bool count_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { /* Ignore path lengths over the limit */ - if (!datalen || datalen >= (MD5_LEN + MAX_FILE_PATH)) return false; + if (!datalen || datalen >= (oss_file.key_ln + MAX_FILE_PATH)) return false; int * count = ptr; *count = iteration; @@ -273,7 +273,7 @@ bool get_first_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, return false; *(char *)ptr = 0; - char *ext = file_extension((char *)file_data + MD5_LEN); + char *ext = file_extension((char *)file_data + oss_file.key_ln); if (ext) strcpy((char *) ptr, ext); diff --git a/src/hpsm.c b/src/hpsm.c index c116774..aef202e 100644 --- a/src/hpsm.c +++ b/src/hpsm.c @@ -105,9 +105,9 @@ struct ranges 
hpsm_calc(uint8_t *file_md5) return r; } scanlog("Running HPSM\n"); - char *file = md5_hex(file_md5); - struct ranges result = hpsm(hpsm_crc_lines, file); - free(file); + char file_hex[oss_file.key_ln * 2 + 1]; + ldb_bin_to_hex(file_md5, oss_file.key_ln, file_hex); + struct ranges result = hpsm(hpsm_crc_lines, file_hex); return result; } diff --git a/src/license.c b/src/license.c index 3281ce6..e203cbf 100644 --- a/src/license.c +++ b/src/license.c @@ -372,7 +372,7 @@ void print_licenses(component_data_t *comp) for (int i = 0; i < MAX_PURLS && comp->purls[i]; i++) { /* Calculate purl@version md5 */ - uint8_t purlversion_md5[MD5_LEN]; + uint8_t purlversion_md5[oss_purl.key_ln]; purl_version_md5(purlversion_md5, comp->purls[i], comp->version); records = ldb_fetch_recordset(NULL, oss_license, purlversion_md5, false, print_licenses_item, comp); diff --git a/src/match.c b/src/match.c index dd68605..421018c 100644 --- a/src/match.c +++ b/src/match.c @@ -86,7 +86,7 @@ void match_data_free(match_data_t *data) match_data_t * match_data_copy(match_data_t * in) { match_data_t * out = calloc(1, sizeof(*out)); - memcpy(out->file_md5,in->file_md5,MD5_LEN); + memcpy(out->file_md5,in->file_md5,oss_file.key_ln); out->hits = in->hits; out->type = in->type; out->line_ranges = strdup(in->line_ranges); @@ -354,14 +354,14 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ if (!a->purls_md5[0] && a->purls[0]) { - a->purls_md5[0] = malloc(MD5_LEN); + a->purls_md5[0] = malloc(oss_purl.key_ln); MD5((uint8_t *)a->purls[0], strlen(a->purls[0]), a->purls_md5[0]); a->age = get_component_age(a->purls_md5[0]); } if (!b->purls_md5[0] && b->purls[0]) { - b->purls_md5[0] = malloc(MD5_LEN); + b->purls_md5[0] = malloc(oss_purl.key_ln); MD5((uint8_t *)b->purls[0], strlen(b->purls[0]), b->purls_md5[0]); b->age = get_component_age(b->purls_md5[0]); } @@ -464,7 +464,7 @@ bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * return true; /* Ignore 
path lengths over the limit */ - if (!datalen || datalen >= (MD5_LEN + MAX_FILE_PATH)) return false; + if (!datalen || datalen >= (oss_file.key_ln + MAX_FILE_PATH)) return false; /* Decrypt data */ char * decrypted = decrypt_data(raw_data, datalen, oss_file, key, subkey); @@ -474,12 +474,12 @@ bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * component_list_t * component_list = (component_list_t*) ptr; /* Copy data to memory */ - uint8_t url_id[MD5_LEN] = {0xd4,0x1d,0x8c,0xd9,0x8f,0x00,0xb2,0x04,0xe9,0x80,0x09,0x98,0xec,0xf8,0x42,0x7e}; //empty string md5 + uint8_t url_id[oss_url.key_ln]; /*= {0xd4,0x1d,0x8c,0xd9,0x8f,0x00,0xb2,0x04,0xe9,0x80,0x09,0x98,0xec,0xf8,0x42,0x7e}; //empty string md5 if (!memcmp(raw_data,url_id, MD5_LEN)) //the md5 key of an empty string must be skipped. - return false; + return false;*/ - memcpy(url_id, raw_data, MD5_LEN); + memcpy(url_id, raw_data, oss_url.key_ln); char path[MAX_FILE_PATH+1]; strncpy(path, decrypted, MAX_FILE_PATH); //check the ignore list only if the match type is MATCH_SNIPPET. TODO: remove this after remine everything. 
@@ -824,7 +824,7 @@ void compile_matches(scan_data_t *scan) match_data_t *match_new = calloc(1, sizeof(match_data_t)); match_new->type = scan->match_type; strcpy(match_new->source_md5, scan->source_md5); - memcpy(match_new->file_md5, scan->match_ptr, MD5_LEN); + memcpy(match_new->file_md5, scan->match_ptr, oss_file.key_ln); match_new->scan_ower = scan; if (!match_list_add(scan->matches_list_array[0], match_new, NULL, false)) { diff --git a/src/match_list.c b/src/match_list.c index 09981e5..89d6ff1 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -370,8 +370,8 @@ void match_list_debug(match_list_t *list) scanlog("Print list\n"); for (struct entry *np = list->headp.lh_first; np != NULL; np = np->entries.le_next) { - char md5_hex[MD5_LEN * 2 + 1]; - ldb_bin_to_hex(np->match->matchmap_reg->md5, MD5_LEN, md5_hex); + char md5_hex[oss_file.key_ln * 2 + 1]; + ldb_bin_to_hex(np->match->matchmap_reg->md5, oss_file.key_ln, md5_hex); // printf("Item: %d - hits: %d - md5: %s - file: %s - release_date: %s - ranges: %s - purl:%s\n", // i, np->match->hits, md5_hex, np->match->file, np->match->release_date, np->match->line_ranges, np->match->purls[0]); printf("\nItem: %d - hits: %d - md5: %s - release: %s \n", i, np->match->hits, md5_hex, np->match->component_list.headp.lh_first->component->release_date); diff --git a/src/mz.c b/src/mz.c index c3ab15a..b9969d2 100644 --- a/src/mz.c +++ b/src/mz.c @@ -47,7 +47,7 @@ void mz_get_key(struct ldb_table kb, char *key) { /* Calculate mz file path */ - char mz_path[LDB_MAX_PATH + MD5_LEN] = "\0"; + char mz_path[LDB_MAX_PATH + kb.key_ln]; char mz_file_id[5] = "\0\0\0\0\0"; struct mz_job job; memcpy(mz_file_id, key, 4); @@ -66,8 +66,8 @@ void mz_get_key(struct ldb_table kb, char *key) scanlog("MZ path: %s \n", mz_path); /* Save path and key on job */ - job.key = calloc(MD5_LEN, 1); - ldb_hex_to_bin(key, MD5_LEN * 2, job.key); + job.key = calloc(kb.key_ln, 1); + ldb_hex_to_bin(key, kb.key_ln * 2, job.key); /* Read source mz file into 
memory */ job.mz = file_read(mz_path, &job.mz_ln); diff --git a/src/query.c b/src/query.c index b85809b..84f048a 100644 --- a/src/query.c +++ b/src/query.c @@ -47,8 +47,8 @@ char *get_filename(char *md5) { /* Convert md5 to bin */ - uint8_t md5bin[MD5_LEN]; - ldb_hex_to_bin(md5, MD5_LEN * 2, md5bin); + uint8_t md5bin[oss_file.key_ln]; + ldb_hex_to_bin(md5, oss_file.key_ln * 2, md5bin); /* Init record */ uint8_t *record = calloc(LDB_MAX_REC_LN + 1, 1); diff --git a/src/report.c b/src/report.c index c650328..96620c1 100644 --- a/src/report.c +++ b/src/report.c @@ -203,7 +203,7 @@ bool print_json_component(component_data_t * component) { if (component->purls[i] && !component->purls_md5[i]) { - component->purls_md5[i] = malloc(MD5_LEN); + component->purls_md5[i] = malloc(oss_purl.key_ln); MD5((uint8_t *)component->purls[i], strlen(component->purls[i]), component->purls_md5[i]); } } @@ -245,7 +245,8 @@ bool print_json_component(component_data_t * component) if (engine_flags & ENABLE_PATH_HINT) printf("\"path_rank\": %d,", component->path_rank); - char *url_id = md5_hex(component->url_md5); + char url_id[oss_url.key_ln * 2 + 1]; + ldb_bin_to_hex(component->url_md5, oss_url.key_ln, url_id); printf("\"url_hash\": \"%s\"", url_id); free(url_id); @@ -315,7 +316,8 @@ bool print_json_match(struct match_data_t * match) scanlog("Match with no components ignored: %s", match->source_md5); return false; } - char *file_id = md5_hex(match->file_md5); + char file_id[oss_file.key_ln * 2 +1]; + ldb_bin_to_hex(match->file_md5, oss_file.key_ln, file_id); if (engine_flags & DISABLE_BEST_MATCH) printf("{"); @@ -344,11 +346,6 @@ bool print_json_match(struct match_data_t * match) else printf(",\"file_url\": \"%s\"", match->component_list.headp.lh_first->component->url); } - else //return an empty string - printf(",\"file_url\": \" \""); - - - free(file_id); if (!(engine_flags & DISABLE_QUALITY)) { diff --git a/src/scan.c b/src/scan.c index 8d586d0..d1f18f6 100644 --- a/src/scan.c +++ 
b/src/scan.c @@ -19,7 +19,6 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ - #include "debug.h" #include "file.h" #include "ignorelist.h" @@ -195,7 +194,7 @@ int hash_scan(char *path, int scan_max_snippets, int scan_max_components) scan->preload = true; /* Get file MD5 */ - ldb_hex_to_bin(scan->file_path, MD5_LEN * 2, scan->md5); + ldb_hex_to_bin(scan->file_path, oss_file.key_ln * 2, scan->md5); /* Fake file length */ strcpy(scan->file_size, "999"); @@ -236,7 +235,6 @@ int wfp_scan(char * path, int scan_max_snippets, int scan_max_components) /* Get wfp MD5 hash */ uint8_t tmp_md5[16]; get_file_md5(path, tmp_md5); - char *tmp_md5_hex = md5_hex(tmp_md5); /* Read line by line */ while ((lineln = getline(&line, &len, fp)) != -1) @@ -268,25 +266,25 @@ int wfp_scan(char * path, int scan_max_snippets, int scan_max_components) const int tagln = 5; // len of 'file=' /* Get file MD5 */ - char * hexmd5 = strndup(line + tagln, MD5_LEN * 2); - if (strlen(hexmd5) < MD5_LEN * 2) + char * hexmd5 = strndup(line + tagln, oss_file.key_ln * 2); + if (strlen(hexmd5) < oss_file.key_ln * 2) { scanlog("Incorrect md5 len in line %s. 
Skipping\n", line); free(hexmd5); continue; } - rec = (uint8_t*) strdup(line + tagln + (MD5_LEN * 2) + 1); + rec = (uint8_t*) strdup(line + tagln + (oss_file.key_ln * 2) + 1); char * target = field_n(2, (char *)rec); /*Init a new scan object for the next file to be scanned */ scan = scan_data_init(target, scan_max_snippets, scan_max_components); - strcpy(scan->source_md5, tmp_md5_hex); + ldb_bin_to_hex(tmp_md5, oss_file.key_ln, scan->source_md5); extract_csv(scan->file_size, (char *)rec, 1, LDB_MAX_REC_LN); scan->preload = true; free(rec); scanlog("File md5 to be scanned: %s\n", hexmd5); - ldb_hex_to_bin(hexmd5, MD5_LEN * 2, scan->md5); + ldb_hex_to_bin(hexmd5, oss_file.key_ln * 2, scan->md5); free(hexmd5); } @@ -331,7 +329,6 @@ int wfp_scan(char * path, int scan_max_snippets, int scan_max_components) fclose(fp); if (line) free(line); - free(tmp_md5_hex); return EXIT_SUCCESS; } @@ -445,9 +442,7 @@ void ldb_scan(scan_data_t *scan) get_file_md5(scan->file_path, scan->md5); /* Scan full file */ - char *tmp_md5_hex = md5_hex(scan->md5); - strcpy(scan->source_md5, tmp_md5_hex); - free(tmp_md5_hex); + ldb_bin_to_hex(scan->md5, oss_file.key_ln, scan->source_md5); /* Look for full file match or url match in ldb */ scan->match_type = ldb_scan_file(scan); diff --git a/src/snippets.c b/src/snippets.c index ae8a89b..601c3d0 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -650,14 +650,14 @@ int add_file_to_matchmap(scan_data_t *scan, matchmap_entry_t *item, uint8_t *md5 found = scan->matchmap_size; /* Write MD5 */ - memcpy(scan->matchmap[found].md5, md5, MD5_LEN); + memcpy(scan->matchmap[found].md5, md5, oss_file.key_ln); scan->matchmap[found].ranges_number = 0; } /* Search for the right range */ uint32_t from = 0; - uint16_t oss_line = uint16_read(md5 + MD5_LEN); + uint16_t oss_line = uint16_read(md5 + oss_file.key_ln); bool range_found = false; for (uint32_t t = 0; t < scan->matchmap[found].ranges_number; t++) diff --git a/src/url.c b/src/url.c index 677f675..5a1fbe6 
100644 --- a/src/url.c +++ b/src/url.c @@ -213,7 +213,7 @@ bool handle_purl_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *d { scanlog("Related PURL: %s\n", purl); component->purls[i] = purl; - component->purls_md5[i] = malloc(MD5_LEN); + component->purls_md5[i] = malloc(oss_purl.key_ln); MD5((uint8_t *)purl, strlen(purl), component->purls_md5[i]); return false; } @@ -246,7 +246,7 @@ void fetch_related_purls(component_data_t *component) /* add main purl md5 if it is not ready */ if (!component->purls_md5[0] && component->purls[0]) { - component->purls_md5[0] = malloc(MD5_LEN); + component->purls_md5[0] = malloc(oss_purl.key_ln); MD5((uint8_t *)component->purls[0], strlen(component->purls[0]), component->purls_md5[0]); } @@ -308,7 +308,7 @@ void purl_release_date(char *purl, char *date) if (!ldb_table_exists(oss_purl.db, oss_purl.table)) //skip purl if the table is not present return; - uint8_t purl_md5[MD5_LEN]; + uint8_t purl_md5[oss_purl.key_ln]; MD5((uint8_t *)purl, strlen(purl), purl_md5); ldb_fetch_recordset(NULL, oss_purl, purl_md5, false, get_purl_first_release, (void *) date); diff --git a/src/util.c b/src/util.c index 4151d66..eeef02f 100644 --- a/src/util.c +++ b/src/util.c @@ -143,9 +143,12 @@ void vendor_component_md5(char *component, char *vendor, uint8_t *out) MD5((uint8_t *)pair, strlen(pair), out); /* Log pair_md5 */ - char hex[MD5_LEN * 2 + 1] = "\0"; - ldb_bin_to_hex(out, MD5_LEN, hex); - scanlog("vendor/component: %s = %s\n", pair, hex); + if (debug_on) + { + char hex[oss_purl.key_ln * 2 + 1]; + ldb_bin_to_hex(out, oss_purl.key_ln, hex); + scanlog("vendor/component: %s = %s\n", pair, hex); + } } /** @@ -227,18 +230,6 @@ void print_datestamp() free(stamp); } -/** - * @brief Returns a string with a hex representation of md5 - * @param md5 input md5 - * @return pointer to string - */ -char *md5_hex(uint8_t *md5) -{ - char *out = calloc(2 * MD5_LEN + 1, 1); - for (int i = 0; i < MD5_LEN; i++) sprintf(out + strlen(out), "%02x", md5[i]); 
- return out; -} - /** * @brief Returns the CRC32C for a string * @param str input string From f5f4d6572fd4a40ffc5892dc58fb6774ac947e54 Mon Sep 17 00:00:00 2001 From: core software devel Date: Sun, 25 Aug 2024 21:30:21 +0000 Subject: [PATCH 08/19] update fetch_recordset function handlers to new definition --- inc/match.h | 2 +- inc/url.h | 4 ++-- src/attributions.c | 8 +++----- src/binary_scan.c | 6 +++--- src/copyright.c | 4 ++-- src/cryptography.c | 4 ++-- src/dependency.c | 4 ++-- src/file.c | 4 ++-- src/health.c | 4 ++-- src/license.c | 4 ++-- src/match.c | 4 ++-- src/quality.c | 2 +- src/query.c | 9 ++++----- src/snippets.c | 2 +- src/url.c | 18 +++++++++--------- src/vulnerability.c | 4 ++-- 16 files changed, 40 insertions(+), 43 deletions(-) diff --git a/inc/match.h b/inc/match.h index 31278f3..d36c8cf 100644 --- a/inc/match.h +++ b/inc/match.h @@ -35,6 +35,6 @@ void output_matches_json(scan_data_t *scan); void compile_matches(scan_data_t *scan); match_list_t * match_select_m_best(scan_data_t * scan); match_list_t * match_select_m_component_best(scan_data_t * scan); -bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr); +bool component_from_file(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr); #endif diff --git a/inc/url.h b/inc/url.h index 28ca4ac..abb62fe 100644 --- a/inc/url.h +++ b/inc/url.h @@ -4,7 +4,7 @@ #include "scanoss.h" #include "match_list.h" -bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr); +bool handle_url_record(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr); /* Calculates a main project URL from the PURL */ void fill_main_url(component_data_t *match); @@ -13,7 +13,7 @@ void fill_main_url(component_data_t *match); void 
fetch_related_purls(component_data_t *component); /* Handler function for getting the oldest URL */ -bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr); +bool get_oldest_url(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr); bool get_purl_first_release(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr); diff --git a/src/attributions.c b/src/attributions.c index ac67673..d265574 100644 --- a/src/attributions.c +++ b/src/attributions.c @@ -51,13 +51,12 @@ * @param ptr //TODO * @return //TODO */ -bool notices_handler(uint8_t *key, uint8_t *subkey, int subkey_ln, \ -uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool notices_handler(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { if (datalen != 2 * oss_attribution.key_ln) return false; char hexkey[oss_attribution.key_ln * 2 + 1]; memcpy(hexkey, data, oss_attribution.key_ln * 2); - hexkey[oss_attribution.key_ln * 2] = 0; + hexkey[table->key_ln * 2] = 0; /* Print attribution notice header */ char *component = (char *) ptr; @@ -81,8 +80,7 @@ uint8_t *data, uint32_t datalen, int iteration, void *ptr) * @param ptr //TODO * @return return true or false if the atribution exist or not. 
*/ -bool attribution_handler(uint8_t *key, uint8_t *subkey, int subkey_ln, \ -uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool attribution_handler(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { bool *valid = (bool *) ptr; diff --git a/src/binary_scan.c b/src/binary_scan.c index 9e72ed8..efc0b67 100644 --- a/src/binary_scan.c +++ b/src/binary_scan.c @@ -62,7 +62,7 @@ static bool sort_by_hits(component_data_t *a, component_data_t *b) #define MAX_URLS 100 -static bool add_purl_from_urlid(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) +static bool add_purl_from_urlid(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { if (iteration > MAX_URLS) @@ -71,7 +71,7 @@ static bool add_purl_from_urlid(uint8_t *key, uint8_t *subkey, int subkey_ln, ui if (!datalen || datalen >= (oss_file.key_ln + MAX_FILE_PATH)) return false; /* Decrypt data */ - char * decrypted = decrypt_data(raw_data, datalen, oss_file, key, subkey); + char * decrypted = decrypt_data(raw_data, datalen, *table, key, subkey); if (!decrypted) return NULL; @@ -128,7 +128,7 @@ int max_files_to_process = 4; * @param ptr //TODO * @return //TODO */ -static bool get_all_file_ids(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +static bool get_all_file_ids(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { //component_list_t * comp_list = (component_list_t *) ptr; file_recordset * files = (file_recordset *) ptr; diff --git a/src/copyright.c b/src/copyright.c index 74c1f5f..5139c3b 100644 --- a/src/copyright.c +++ b/src/copyright.c @@ -85,10 +85,10 @@ static void clean_copyright(char *out, char *copyright) * @param subkey //TODO * @return //TODO */ -static bool 
print_copyrights_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +static bool print_copyrights_item(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { component_data_t * comp = ptr; - char * CSV = decrypt_data(data, datalen, oss_copyright, key, subkey); + char * CSV = decrypt_data(data, datalen, *table, key, subkey); char *source = calloc(MAX_JSON_VALUE_LEN + 1, 1); char *copyright = calloc(MAX_COPYRIGHT + 1, 1); diff --git a/src/cryptography.c b/src/cryptography.c index 29fa8b9..10e85fe 100644 --- a/src/cryptography.c +++ b/src/cryptography.c @@ -50,12 +50,12 @@ * @param ptr //TODO * @return //TODO */ -bool print_crypto_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool print_crypto_item(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { match_data_t *match = ptr; if (!datalen) return false; - char * CSV = decrypt_data(data, datalen, oss_cryptography, key, subkey); + char * CSV = decrypt_data(data, datalen, *table, key, subkey); char *algorithm = calloc(MAX_JSON_VALUE_LEN, 1); char *strength = calloc(MAX_JSON_VALUE_LEN, 1); diff --git a/src/dependency.c b/src/dependency.c index 1334694..534c4eb 100644 --- a/src/dependency.c +++ b/src/dependency.c @@ -54,9 +54,9 @@ const char *dependency_sources[] = {"component_declared"}; * @param ptr //TODO * @return //TODO */ -bool print_dependencies_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool print_dependencies_item(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { - char *CSV = decrypt_data(data, datalen, oss_dependency, key, subkey); + char *CSV = decrypt_data(data, datalen, *table, key, subkey); component_data_t * comp = 
(component_data_t *) ptr; char *source = calloc(MAX_JSON_VALUE_LEN, 1); char *vendor = calloc(MAX_JSON_VALUE_LEN, 1); diff --git a/src/file.c b/src/file.c index acd9338..6c94e77 100644 --- a/src/file.c +++ b/src/file.c @@ -263,11 +263,11 @@ char *file_extension(char *path) * @param ptr //TODO * @return //TODO */ -bool get_first_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool get_first_file(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { if (!datalen) return false; - char * file_data = decrypt_data(data, datalen, oss_file, key, subkey); + char * file_data = decrypt_data(data, datalen, *table, key, subkey); if (!file_data || !*file_data) return false; diff --git a/src/health.c b/src/health.c index 393c97f..bede8df 100644 --- a/src/health.c +++ b/src/health.c @@ -43,11 +43,11 @@ * @brief Prints information about statistics of a component comming from GitHub or gitee * */ -bool print_health_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool print_health_item(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { component_data_t *match = ptr; - char * decrypted = decrypt_data(data, datalen, oss_purl, key, subkey); + char * decrypted = decrypt_data(data, datalen, *table, key, subkey); /* Expect at least a date or a pkg:*/ if (strlen(decrypted) < 9) diff --git a/src/license.c b/src/license.c index e203cbf..28d5bbe 100644 --- a/src/license.c +++ b/src/license.c @@ -277,14 +277,14 @@ bool get_first_license_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_ * @param ptr //TODO * @return */ -bool print_licenses_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool print_licenses_item(struct ldb_table * table, uint8_t *key, uint8_t *subkey, 
uint8_t *data, uint32_t datalen, int iteration, void *ptr) { component_data_t *comp = ptr; if (!datalen) return false; - char *CSV = decrypt_data(data, datalen, oss_license, key, subkey); + char *CSV = decrypt_data(data, datalen, *table, key, subkey); if (!CSV) return false; diff --git a/src/match.c b/src/match.c index 421018c..27246f2 100644 --- a/src/match.c +++ b/src/match.c @@ -465,9 +465,9 @@ bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * /* Ignore path lengths over the limit */ if (!datalen || datalen >= (oss_file.key_ln + MAX_FILE_PATH)) return false; - + /* Decrypt data */ - char * decrypted = decrypt_data(raw_data, datalen, oss_file, key, subkey); + char * decrypted = decrypt_data(raw_data, datalen, *table, key, subkey); if (!decrypted) return false; diff --git a/src/quality.c b/src/quality.c index 5f9a8db..ee93ac1 100644 --- a/src/quality.c +++ b/src/quality.c @@ -53,7 +53,7 @@ const char *quality_sources[] = {"best_practices"}; * @param ptr //TODO * @return //TODO */ -bool print_quality_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool print_quality_item(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { char ** out = ptr; diff --git a/src/query.c b/src/query.c index 84f048a..5e9a428 100644 --- a/src/query.c +++ b/src/query.c @@ -80,9 +80,9 @@ char *get_filename(char *md5) * @param ptr //TODO * @return //TODO */ -bool ldb_get_first_url_not_ignored(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool ldb_get_first_url_not_ignored(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { - char * decrypted = decrypt_data(data, datalen, oss_url, key, subkey); + char * decrypted = decrypt_data(data, datalen, *table, key, subkey); char *record = (char *) ptr; @@ -123,12 +123,11 
@@ void get_url_record(uint8_t *md5, uint8_t *record) * @param ptr //TODO * @return //TODO */ -bool handle_get_component_age(uint8_t *key, uint8_t *subkey, int subkey_ln, \ -uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool handle_get_component_age(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { long *age = (long *) ptr; - char * decrypted = decrypt_data(data, datalen, oss_purl, key, subkey); + char * decrypted = decrypt_data(data, datalen, *table, key, subkey); /* Expect at least a date*/ if (strlen(decrypted) < 9) diff --git a/src/snippets.c b/src/snippets.c index 601c3d0..961a65a 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -211,7 +211,7 @@ void biggest_snippet(scan_data_t *scan) * @return //TODO */ #define MATCHMAP_ITEM_SIZE (matchmap_max_files * 2) -static bool get_all_file_ids(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +static bool get_all_file_ids(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { uint8_t *record = (uint8_t *)ptr; diff --git a/src/url.c b/src/url.c index 5a1fbe6..751ef3b 100644 --- a/src/url.c +++ b/src/url.c @@ -51,11 +51,11 @@ * @param ptr //TODO * @return //TODO */ -bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) +bool handle_url_record(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { if (!datalen && datalen >= MAX_PATH) return false; - char * data = decrypt_data(raw_data, datalen, oss_url, key, subkey); + char * data = decrypt_data(raw_data, datalen, *table, key, subkey); if (!data) return false; @@ -65,7 +65,7 @@ bool handle_url_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *ra free(data); return false; } - + int subkey_ln = table->key_ln - 
LDB_KEY_LN; component_list_t * component_list = (component_list_t*) ptr; component_data_t * new_comp = calloc(1, sizeof(*new_comp)); @@ -177,7 +177,7 @@ bool purl_type_matches(char *purl1, char *purl2) * Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details. **/ -bool handle_purl_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool handle_purl_record(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { component_data_t *component = (component_data_t *) ptr; @@ -213,7 +213,7 @@ bool handle_purl_record(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *d { scanlog("Related PURL: %s\n", purl); component->purls[i] = purl; - component->purls_md5[i] = malloc(oss_purl.key_ln); + component->purls_md5[i] = malloc(table->key_ln); MD5((uint8_t *)purl, strlen(purl), component->purls_md5[i]); return false; } @@ -273,11 +273,11 @@ void fetch_related_purls(component_data_t *component) * Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details. **/ -bool get_purl_first_release(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool get_purl_first_release(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { if (!datalen) return false; - char * purl = decrypt_data(data, datalen, oss_purl, key, subkey); + char * purl = decrypt_data(data, datalen, *table, key, subkey); uint8_t *oldest = (uint8_t *) ptr; if (!purl) @@ -320,9 +320,9 @@ void purl_release_date(char *purl, char *date) * @brief Handler function for getting the oldest URL. * Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details. 
**/ -bool get_oldest_url(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +bool get_oldest_url(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { - char * url = decrypt_data(data, datalen, oss_url, key, subkey); + char * url = decrypt_data(data, datalen, *table, key, subkey); if (!url) return false; diff --git a/src/vulnerability.c b/src/vulnerability.c index 3ec53ed..02c87e5 100644 --- a/src/vulnerability.c +++ b/src/vulnerability.c @@ -141,14 +141,14 @@ static bool vulnerability_version_matches(char * version, char *introduced, char * @param ptr //TODO * @return //TODO */ -static bool print_vulnerability_item(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +static bool print_vulnerability_item(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { component_data_t *comp = ptr; /* Set a limit to the amount of vulnerabilities returned */ if (comp->vulnerabilities > max_vulnerabilities) return true; - char * CSV = decrypt_data(data, datalen, oss_vulnerability, key, subkey); + char * CSV = decrypt_data(data, datalen, *table, key, subkey); if (!CSV) return false; From 4c481bfa5792653e7542d215155a92fff3d01712 Mon Sep 17 00:00:00 2001 From: core software devel Date: Mon, 26 Aug 2024 21:51:40 +0000 Subject: [PATCH 09/19] add path table --- inc/file.h | 1 - inc/scan.h | 1 + inc/scanoss.h | 1 + src/file.c | 35 ----------------------------------- src/main.c | 10 ++++++++++ src/match.c | 41 +++++++++++++++++++++++++++++++++-------- src/scan.c | 7 ++++++- 7 files changed, 51 insertions(+), 45 deletions(-) diff --git a/inc/file.h b/inc/file.h index 9b29814..0c9d35f 100644 --- a/inc/file.h +++ b/inc/file.h @@ -10,7 +10,6 @@ bool is_file(char *path); bool is_dir(char *path); void get_file_md5(char *filepath, uint8_t *md5_result); -bool 
collect_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr); bool count_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr); char *get_file_extension(uint8_t *md5); diff --git a/inc/scan.h b/inc/scan.h index e3e768d..ce1a64b 100644 --- a/inc/scan.h +++ b/inc/scan.h @@ -67,6 +67,7 @@ typedef struct scan_data_t } scan_data_t; extern bool force_snippet_scan; +extern bool path_table_present; scan_data_t * scan_data_init(char *target, int max_snippets, int max_components); void scan_data_free (scan_data_t * scan); diff --git a/inc/scanoss.h b/inc/scanoss.h index 841b545..66c3b3c 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -126,6 +126,7 @@ extern char * component_hint; /* DB tables */ extern struct ldb_table oss_url; extern struct ldb_table oss_file; +extern struct ldb_table oss_path; extern struct ldb_table oss_wfp; extern struct ldb_table oss_purl; extern struct ldb_table oss_copyright; diff --git a/src/file.c b/src/file.c index 6c94e77..499c606 100644 --- a/src/file.c +++ b/src/file.c @@ -178,41 +178,6 @@ int dir_count(char *path) return count; } -/** - * @brief Collect all files function pointer. Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details. 
- * @param key //TODO - * @param subkey //TODO - * @param subkey_ln //TODO - * @param raw_data //TODO - * @param datalen //TODO - * @param iteration //TODO - * @param ptr //TODO - * @return //TODO - */ -bool collect_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) -{ - - /* Leave if FETCH_MAX_FILES is reached */ - if (iteration >= FETCH_MAX_FILES) return true; - - /* Ignore path lengths over the limit */ - if (!datalen || datalen >= (oss_file.key_ln + MAX_FILE_PATH)) return false; - - /* Decrypt data */ - char * decrypted = decrypt_data(raw_data, datalen, oss_file, key, subkey); - if (!decrypted) - return NULL; - /* Copy data to memory */ - file_recordset *files = ptr; - - memcpy(files[iteration].url_id, raw_data, oss_url.key_ln); - strncpy(files[iteration].path, decrypted, MAX_FILE_PATH); - free(decrypted); - - files[iteration].path_ln = dir_count(files[iteration].path); - return false; -} - /** * @brief Count all entries for a given md5. Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details. 
* @param key //TODO diff --git a/src/main.c b/src/main.c index 337a466..e2a2d5a 100644 --- a/src/main.c +++ b/src/main.c @@ -49,6 +49,7 @@ struct ldb_table oss_url; struct ldb_table oss_file; +struct ldb_table oss_path; struct ldb_table oss_wfp; struct ldb_table oss_purl; struct ldb_table oss_copyright; @@ -135,6 +136,15 @@ void initialize_ldb_tables(char *name) snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "file"); oss_file = ldb_read_cfg(dbtable); + ldb_hash_mode_select(oss_file.key_ln); + + if (ldb_table_exists(oss_db_name, "path")) + { + path_table_present = true; + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "path"); + oss_path = ldb_read_cfg(dbtable); + } + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "wfp"); oss_wfp = ldb_read_cfg(dbtable); diff --git a/src/match.c b/src/match.c index 27246f2..24102e3 100644 --- a/src/match.c +++ b/src/match.c @@ -51,8 +51,8 @@ #include "health.h" const char *matchtypes[] = {"none", "file", "snippet", "binary"}; /** describe the availables kinds of match */ -bool match_extensions = false; /** global match extension flag */ - +bool match_extensions = false; /** global match extension flag */ +bool path_table_present = false; char *component_hint = NULL; /** @@ -442,6 +442,24 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, return true; } +bool path_query_handler(struct ldb_table * table, uint8_t * key, uint8_t * subkey, uint8_t * data, uint32_t datalen, int record_number, void * ptr) +{ + char **path = ptr; + /* Decrypt data */ + char * decrypted = decrypt_data(data, datalen, *table, key, subkey); + if (!decrypted || !*decrypted) + return false; + + *path = decrypted; + return true; +} +static char * path_query(uint8_t * file_id) +{ + char * path = NULL; + ldb_fetch_recordset(NULL, oss_path, file_id, false, path_query_handler, (void *) &path); + return path; +} + /** * @brief Load componentes for a match processing the file recordset list. 
* For each file in the recordset we will query for the oldest url in the url table. @@ -462,24 +480,31 @@ bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t * /*Return we high accuracy it is not enabled*/ if (iteration > iteration_max * 2 && !(engine_flags & ENABLE_HIGH_ACCURACY)) return true; - /* Ignore path lengths over the limit */ - if (!datalen || datalen >= (oss_file.key_ln + MAX_FILE_PATH)) return false; + if (!datalen || datalen >= (table->key_ln + MAX_FILE_PATH)) return false; + char * decrypted = NULL; + if (path_table_present) + { + decrypted = path_query(&raw_data[table->key_ln]); + } + else + { + /* Decrypt data */ + decrypted = decrypt_data(raw_data, datalen, *table, key, subkey); + } - /* Decrypt data */ - char * decrypted = decrypt_data(raw_data, datalen, *table, key, subkey); if (!decrypted) return false; component_list_t * component_list = (component_list_t*) ptr; /* Copy data to memory */ - uint8_t url_id[oss_url.key_ln]; /*= {0xd4,0x1d,0x8c,0xd9,0x8f,0x00,0xb2,0x04,0xe9,0x80,0x09,0x98,0xec,0xf8,0x42,0x7e}; //empty string md5 + uint8_t url_id[table->key_ln]; /*= {0xd4,0x1d,0x8c,0xd9,0x8f,0x00,0xb2,0x04,0xe9,0x80,0x09,0x98,0xec,0xf8,0x42,0x7e}; //empty string md5 if (!memcmp(raw_data,url_id, MD5_LEN)) //the md5 key of an empty string must be skipped. return false;*/ - memcpy(url_id, raw_data, oss_url.key_ln); + memcpy(url_id, raw_data, table->key_ln); char path[MAX_FILE_PATH+1]; strncpy(path, decrypted, MAX_FILE_PATH); //check the ignore list only if the match type is MATCH_SNIPPET. TODO: remove this after remine everything. 
diff --git a/src/scan.c b/src/scan.c index d1f18f6..017f679 100644 --- a/src/scan.c +++ b/src/scan.c @@ -114,7 +114,12 @@ static bool zero_bytes (uint8_t *md5) */ static match_t ldb_scan_file(scan_data_t * scan) { - scanlog("Checking entire file %s\n", scan->file_path); + if (debug_on) + { + char hex_hash[oss_file.key_ln * 2 +1]; + ldb_bin_to_hex(scan->md5, oss_file.key_ln, hex_hash); + scanlog("Checking entire file %s - hash: %s\n", scan->file_path, hex_hash); + } if (zero_bytes(scan->md5)) return MATCH_NONE; From 042aeb373448f99a769d12692b2ee4615fadf72d Mon Sep 17 00:00:00 2001 From: core software devel Date: Mon, 9 Sep 2024 14:09:37 +0000 Subject: [PATCH 10/19] update hash calculation to the new format --- inc/decrypt.h | 2 +- inc/scanoss.h | 2 -- inc/util.h | 5 +---- src/attributions.c | 4 ++-- src/component.c | 4 ++-- src/decrypt.c | 4 ++-- src/file.c | 6 +++--- src/main.c | 25 +++++++++++++++++++++++-- src/match.c | 4 ++-- src/mz.c | 11 ++++++----- src/query.c | 2 +- src/report.c | 2 +- src/scan.c | 2 +- src/snippets.c | 28 ++++++++++++++++++++-------- src/url.c | 6 +++--- src/util.c | 29 ++--------------------------- src/vulnerability.c | 2 +- 17 files changed, 71 insertions(+), 67 deletions(-) diff --git a/inc/decrypt.h b/inc/decrypt.h index bceeb47..9fecbdf 100644 --- a/inc/decrypt.h +++ b/inc/decrypt.h @@ -4,7 +4,7 @@ #include "scanoss.h" extern char * (*decrypt_data) (uint8_t *data, uint32_t size, struct ldb_table table, uint8_t *key, uint8_t *subkey); -extern void (*decrypt_mz) (uint8_t *data, uint32_t len); +extern void (*decrypt_mz) (int key_ln, uint8_t *data, uint32_t len); extern void (*encoder_version) (char * version); char * standalone_decrypt_data(uint8_t *data, uint32_t size,struct ldb_table table, uint8_t *key, uint8_t *subkey); diff --git a/inc/scanoss.h b/inc/scanoss.h index 66c3b3c..f85e5be 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -37,8 +37,6 @@ #define SNIPPET_LINE_TOLERANCE 10 #define WFP_LN 4 -#define WFP_REC_LN 18 - /* Log 
files */ #define SCANOSS_VERSION "5.4.10" #define SCAN_LOG "/tmp/scanoss_scan.log" diff --git a/inc/util.h b/inc/util.h index 5f1077e..5d7c6c4 100644 --- a/inc/util.h +++ b/inc/util.h @@ -12,14 +12,11 @@ void uint32_reverse(uint8_t *data); void hex_to_bin(char *hex, uint32_t len, uint8_t *out); /* Compares two MD5 checksums */ -bool md5cmp(uint8_t *md51, uint8_t *md52); +bool hashcmp(int hash_len, uint8_t *md51, uint8_t *md52); /* Trim str */ void trim(char *str); -/* Returns the pair md5 of "component/vendor" */ -void vendor_component_md5(char *component, char *vendor, uint8_t *out); - /* Returns the current date stamp */ char *datestamp(void); diff --git a/src/attributions.c b/src/attributions.c index d265574..425831a 100644 --- a/src/attributions.c +++ b/src/attributions.c @@ -220,7 +220,7 @@ bool check_purl_attributions(struct ldb_table oss_attributions, char * licenses_ { /* Get purl md5 */ uint8_t md5[16]; - MD5((uint8_t *)purl, strlen(purl), md5); + oss_attribution.hash_calc((uint8_t *)purl, strlen(purl), md5); if (declared_components[i].license && licenses_json && license_search_on_licenses_json(declared_components[i].license, licenses_json)) { @@ -269,7 +269,7 @@ void print_purl_attribution_notices(struct ldb_table oss_attributions, char * li { /* Get purl md5 */ uint8_t md5[16]; - MD5((uint8_t *)purl, strlen(purl), md5); + oss_attribution.hash_calc((uint8_t *)purl, strlen(purl), md5); print_notices(oss_attributions, md5, purl); } } diff --git a/src/component.c b/src/component.c index 1e2734d..8afda27 100644 --- a/src/component.c +++ b/src/component.c @@ -302,14 +302,14 @@ bool component_date_comparation(component_data_t *a, component_data_t *b) if (!a->purls_md5[0] && a->purls[0]) { a->purls_md5[0] = malloc(oss_url.key_ln); - MD5((uint8_t *)a->purls[0], strlen(a->purls[0]), a->purls_md5[0]); + oss_purl.hash_calc((uint8_t *)a->purls[0], strlen(a->purls[0]), a->purls_md5[0]); a->age = get_component_age(a->purls_md5[0]); } if (!b->purls_md5[0] && 
b->purls[0]) { b->purls_md5[0] = malloc(oss_purl.key_ln); - MD5((uint8_t *)b->purls[0], strlen(b->purls[0]), b->purls_md5[0]); + oss_purl.hash_calc((uint8_t *)b->purls[0], strlen(b->purls[0]), b->purls_md5[0]); b->age = get_component_age(b->purls_md5[0]); } diff --git a/src/decrypt.c b/src/decrypt.c index a91372d..985e812 100644 --- a/src/decrypt.c +++ b/src/decrypt.c @@ -36,7 +36,7 @@ #include "decrypt.h" char * (*decrypt_data) (uint8_t *data, uint32_t size, struct ldb_table table, uint8_t *key, uint8_t *subkey); -void (*decrypt_mz) (uint8_t *data, uint32_t len); +void (*decrypt_mz) (int key_ln, uint8_t *data, uint32_t len); void (*encoder_version) (char * version); /** * @brief Decrypt data function pointer. Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details. @@ -51,7 +51,7 @@ char * standalone_decrypt_data(uint8_t *data, uint32_t size, struct ldb_table ta char * msg = NULL; if (!strcmp(table.table, "file")) - msg = strndup((char*) data + 16, size - 16); + msg = strndup((char*) data + table.key_ln, size - table.key_ln); else msg = strndup((char*) data, size); diff --git a/src/file.c b/src/file.c index 499c606..563db22 100644 --- a/src/file.c +++ b/src/file.c @@ -141,7 +141,7 @@ void get_file_md5(char *filepath, uint8_t *md5_result) if (!in) { - MD5(NULL, 0, md5_result); + oss_file.hash_calc(NULL, 0, md5_result); return; } @@ -149,7 +149,7 @@ void get_file_md5(char *filepath, uint8_t *md5_result) long filesize = ftell(in); if (!filesize) { - MD5(NULL, 0, md5_result); + oss_file.hash_calc(NULL, 0, md5_result); } else { @@ -160,7 +160,7 @@ void get_file_md5(char *filepath, uint8_t *md5_result) fprintf(stderr, "Warning: cannot open file %s\n", filepath); /* Calculate MD5sum */ - MD5(buffer, filesize, md5_result); + oss_file.hash_calc(buffer, filesize, md5_result); free(buffer); fclose(in); } diff --git a/src/main.c b/src/main.c index e2a2d5a..486ce6a 100644 --- a/src/main.c +++ b/src/main.c @@ -108,10 
+108,17 @@ bool lib_encoder_load() #endif } +static hash_calc_t hash_function_select(int key_ln) +{ + if (key_ln == 8) + return ldb_crc64; + + return md5_string; +} + /* Initialize tables for the DB name indicated (defaults to oss) */ void initialize_ldb_tables(char *name) { - char * ldb_ver = NULL; ldb_version(&ldb_ver); scanlog("ldb version: %s\n", ldb_ver); @@ -132,51 +139,65 @@ void initialize_ldb_tables(char *name) scanlog("Loading tables definitions\n"); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "url"); oss_url = ldb_read_cfg(dbtable); + oss_url.hash_calc = hash_function_select(oss_url.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "file"); oss_file = ldb_read_cfg(dbtable); + oss_file.hash_calc = hash_function_select(oss_file.key_ln); - ldb_hash_mode_select(oss_file.key_ln); + //ldb_hash_mode_select(oss_file.key_ln); if (ldb_table_exists(oss_db_name, "path")) { path_table_present = true; snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "path"); oss_path = ldb_read_cfg(dbtable); + oss_path.hash_calc = hash_function_select(oss_path.key_ln); } snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "wfp"); oss_wfp = ldb_read_cfg(dbtable); + oss_wfp.hash_calc = hash_function_select(oss_wfp.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "purl"); oss_purl = ldb_read_cfg(dbtable); + oss_purl.hash_calc = hash_function_select(oss_purl.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "copyright"); oss_copyright = ldb_read_cfg(dbtable); + oss_copyright.hash_calc = hash_function_select(oss_copyright.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "quality"); oss_quality = ldb_read_cfg(dbtable); + oss_quality.hash_calc = hash_function_select(oss_quality.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "vulnerability"); oss_vulnerability = ldb_read_cfg(dbtable); + oss_vulnerability.hash_calc = hash_function_select(oss_vulnerability.key_ln); snprintf(dbtable, MAX_ARGLN * 2, 
"%s/%s", oss_db_name, "dependency"); oss_dependency = ldb_read_cfg(dbtable); + oss_dependency.hash_calc = hash_function_select(oss_dependency.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "license"); oss_license = ldb_read_cfg(dbtable); + oss_license.hash_calc = hash_function_select(oss_license.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "attribution"); oss_attribution = ldb_read_cfg(dbtable); + oss_attribution.hash_calc = hash_function_select(oss_attribution.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "cryptography"); oss_cryptography = ldb_read_cfg(dbtable); + oss_cryptography.hash_calc = hash_function_select(oss_cryptography.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "sources"); oss_sources = ldb_read_cfg(dbtable); + oss_sources.hash_calc = hash_function_select(oss_sources.key_ln); snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "notices"); oss_notices = ldb_read_cfg(dbtable); + oss_notices.hash_calc = hash_function_select(oss_notices.key_ln); kb_version_get(); osadl_load_file(); diff --git a/src/match.c b/src/match.c index 24102e3..a3d6c65 100644 --- a/src/match.c +++ b/src/match.c @@ -355,14 +355,14 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ if (!a->purls_md5[0] && a->purls[0]) { a->purls_md5[0] = malloc(oss_purl.key_ln); - MD5((uint8_t *)a->purls[0], strlen(a->purls[0]), a->purls_md5[0]); + oss_purl.hash_calc((uint8_t *)a->purls[0], strlen(a->purls[0]), a->purls_md5[0]); a->age = get_component_age(a->purls_md5[0]); } if (!b->purls_md5[0] && b->purls[0]) { b->purls_md5[0] = malloc(oss_purl.key_ln); - MD5((uint8_t *)b->purls[0], strlen(b->purls[0]), b->purls_md5[0]); + oss_purl.hash_calc((uint8_t *)b->purls[0], strlen(b->purls[0]), b->purls_md5[0]); b->age = get_component_age(b->purls_md5[0]); } diff --git a/src/mz.c b/src/mz.c index b9969d2..d4b2de1 100644 --- a/src/mz.c +++ b/src/mz.c @@ -50,6 +50,7 @@ void mz_get_key(struct ldb_table 
kb, char *key) char mz_path[LDB_MAX_PATH + kb.key_ln]; char mz_file_id[5] = "\0\0\0\0\0"; struct mz_job job; + job.key_ln = kb.key_ln -2; memcpy(mz_file_id, key, 4); sprintf(mz_path, "%s/%s/%s/%s.mz", ldb_root, kb.db, kb.table,mz_file_id); @@ -79,7 +80,7 @@ void mz_get_key(struct ldb_table kb, char *key) { /* Position pointers */ job.id = job.mz + ptr; - uint8_t *file_ln = job.id + MZ_MD5; + uint8_t *file_ln = job.id + job.key_ln; job.zdata = file_ln + MZ_SIZE; /* Get compressed data size */ @@ -88,19 +89,19 @@ void mz_get_key(struct ldb_table kb, char *key) job.zdata_ln = tmpln; /* Get total mz record length */ - job.ln = MZ_MD5 + MZ_SIZE + job.zdata_ln; + job.ln = job.key_ln + MZ_SIZE + job.zdata_ln; /* Pass job to handler */ - if (!memcmp(job.id, job.key + 2, MZ_MD5)) + if (!memcmp(job.id, job.key + 2, job.key_ln)) { if (kb.definitions & LDB_TABLE_DEFINITION_ENCRYPTED) { - decrypt_mz(job.id, job.zdata_ln); + decrypt_mz(kb.key_ln, job.id, job.zdata_ln); } /* Decompress */ MZ_DEFLATE(&job); - job.data[job.data_ln] = 0; + //job.data[job.data_ln] = 0; printf("%s", job.data); return; } diff --git a/src/query.c b/src/query.c index 5e9a428..8d42f8d 100644 --- a/src/query.c +++ b/src/query.c @@ -215,6 +215,6 @@ void purl_version_md5(uint8_t *out, char *purl, char *version) { char purl_version[MAX_ARGLN] = "\0"; sprintf(purl_version, "%s@%s", purl, version); - MD5((uint8_t *)purl_version, strlen(purl_version), out); + oss_purl.hash_calc((uint8_t *)purl_version, strlen(purl_version), out); } diff --git a/src/report.c b/src/report.c index 96620c1..46ffd06 100644 --- a/src/report.c +++ b/src/report.c @@ -204,7 +204,7 @@ bool print_json_component(component_data_t * component) if (component->purls[i] && !component->purls_md5[i]) { component->purls_md5[i] = malloc(oss_purl.key_ln); - MD5((uint8_t *)component->purls[i], strlen(component->purls[i]), component->purls_md5[i]); + oss_purl.hash_calc((uint8_t *)component->purls[i], strlen(component->purls[i]), 
component->purls_md5[i]); } } diff --git a/src/scan.c b/src/scan.c index 017f679..32633ea 100644 --- a/src/scan.c +++ b/src/scan.c @@ -260,7 +260,7 @@ int wfp_scan(char * path, int scan_max_snippets, int scan_max_components) if (is_bin) binary_scan(&line[4]); - /* Parse file information with format: file=MD5(32),file_size,file_path */ + /* Parse file information with format: file=HASH(16/32),file_size,file_path */ if (is_file) { /* A scan data was fullfilled and is ready to be scanned */ diff --git a/src/snippets.c b/src/snippets.c index 961a65a..d2ebe5b 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -226,7 +226,7 @@ static bool get_all_file_ids(struct ldb_table * table, uint8_t *key, uint8_t *su { uint32_t size = uint32_read(record); /* End recordset fetch if MAX_QUERY_RESPONSE is reached */ - if (size + datalen + 4 >= WFP_REC_LN * MATCHMAP_ITEM_SIZE) + if (size + datalen + 4 >= table->rec_ln * MATCHMAP_ITEM_SIZE) { return true; } @@ -615,7 +615,7 @@ int add_file_to_matchmap(scan_data_t *scan, matchmap_entry_t *item, uint8_t *md5 return -1; } - if (md5cmp(scan->matchmap[t].md5, md5)) + if (hashcmp(oss_file.key_ln, scan->matchmap[t].md5, md5)) { lastwfp = scan->matchmap[t].lastwfp; found = t; @@ -746,13 +746,13 @@ match_t ldb_scan_snippets(scan_data_t *scan) for (long i = 0; i < scan->hash_count; i++) { /* Get all file IDs for given wfp */ - map[i].md5_set = malloc(WFP_REC_LN * MATCHMAP_ITEM_SIZE); + map[i].md5_set = malloc(oss_wfp.rec_ln * MATCHMAP_ITEM_SIZE); wfp_invert(scan->hashes[i], map[i].wfp); - //scanlog(" Add wfp %02x%02x%02x%02x to map\n",map[i].wfp[0], map[i].wfp[1],map[i].wfp[2],map[i].wfp[3]); + scanlog(" Add wfp %02x%02x%02x%02x to map\n",map[i].wfp[0], map[i].wfp[1],map[i].wfp[2],map[i].wfp[3]); uint32_write(map[i].md5_set, 0); map[i].line = scan->lines[i]; ldb_fetch_recordset(NULL, oss_wfp, map[i].wfp, false, get_all_file_ids, (void *)map[i].md5_set); - map[i].size = uint32_read(map[i].md5_set) / WFP_REC_LN; + map[i].size = 
uint32_read(map[i].md5_set) / oss_wfp.rec_ln; //Initializate the lines indirection when a wfp from a line has at least one md5 linked if (map[i].size) map_lines_indirection[scan->lines[i]] = 0; @@ -761,6 +761,18 @@ match_t ldb_scan_snippets(scan_data_t *scan) map_max_size = map[i].size; } + + /*for (long i = 0; i < scan->hash_count; i++) + { + printf("%02x%02x%02x%02x: ", map[i].wfp[0], map[i].wfp[1],map[i].wfp[2],map[i].wfp[3]); + for (int j=0; j < map[i].size; j++) + { + char hex[MD5_LEN_HEX] = "\0"; + ldb_bin_to_hex(map[i].md5_set + 4 + j * oss_wfp.rec_ln, oss_file.key_ln, hex); + printf(" %s", hex); + } + printf("\n"); + }*/ /* Classify the WFPs in cathegories depending on popularity Each cathegoy will contain a sub set of index refered to map rows*/ #define MAP_INDIRECTION_CAT_NUMBER 1000 @@ -890,7 +902,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) /* Add each item to the matchmap*/ for (int wfp_index = map_indexes[i]; wfp_index < map[i].size; wfp_index++) { - int wfp_p = wfp_index * WFP_REC_LN; + int wfp_p = wfp_index * oss_wfp.rec_ln; /*Stop when a new sector appers*/ if (md5s[wfp_p] != sector) { @@ -948,7 +960,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) /* Add each item to the matchmap*/ for (int wfp_index = map_indexes[i]; wfp_index < map[i].size; wfp_index++) { - int wfp_p = wfp_index * WFP_REC_LN; + int wfp_p = wfp_index * oss_wfp.rec_ln; int sector = md5s[wfp_p]; int sector_max = min_match_hits; @@ -957,7 +969,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) else sector_max = scan->matchmap[scan->matchmap_rank_by_sector[sector]].hits; - if (md5cmp(&md5s[wfp_p], scan->matchmap[scan->matchmap_rank_by_sector[sector]].md5)) + if (hashcmp(oss_file.key_ln, &md5s[wfp_p], scan->matchmap[scan->matchmap_rank_by_sector[sector]].md5)) { add_file_to_matchmap(scan, &map[i], &md5s[wfp_p], 0, &sector_max, &scan->matchmap_rank_by_sector[sector]); md5_proceced++; diff --git a/src/url.c b/src/url.c index 751ef3b..2fe9467 100644 --- a/src/url.c +++ b/src/url.c @@
-214,7 +214,7 @@ bool handle_purl_record(struct ldb_table * table, uint8_t *key, uint8_t *subkey, scanlog("Related PURL: %s\n", purl); component->purls[i] = purl; component->purls_md5[i] = malloc(table->key_ln); - MD5((uint8_t *)purl, strlen(purl), component->purls_md5[i]); + oss_purl.hash_calc((uint8_t *)purl, strlen(purl), component->purls_md5[i]); return false; } /* Already exists, exit */ @@ -247,7 +247,7 @@ void fetch_related_purls(component_data_t *component) if (!component->purls_md5[0] && component->purls[0]) { component->purls_md5[0] = malloc(oss_purl.key_ln); - MD5((uint8_t *)component->purls[0], strlen(component->purls[0]), component->purls_md5[0]); + oss_purl.hash_calc((uint8_t *)component->purls[0], strlen(component->purls[0]), component->purls_md5[0]); } /* Fill purls */ @@ -309,7 +309,7 @@ void purl_release_date(char *purl, char *date) return; uint8_t purl_md5[oss_purl.key_ln]; - MD5((uint8_t *)purl, strlen(purl), purl_md5); + oss_purl.hash_calc((uint8_t *)purl, strlen(purl), purl_md5); ldb_fetch_recordset(NULL, oss_purl, purl_md5, false, get_purl_first_release, (void *) date); } diff --git a/src/util.c b/src/util.c index eeef02f..be218e6 100644 --- a/src/util.c +++ b/src/util.c @@ -99,9 +99,9 @@ void uint32_reverse(uint8_t *data) * @param md52 md5 2 * @return true for equal */ -bool md5cmp(uint8_t *md51, uint8_t *md52) +bool hashcmp(int hash_len, uint8_t *md51, uint8_t *md52) { - for (int i = 0; i < 16; i++) + for (int i = 0; i < hash_len; i++) if (md51[i] != md52[i]) return false; return true; @@ -126,31 +126,6 @@ void trim(char *str) str[i + 1] = 0; } -/** - * @brief Returns the pair md5 of "component/vendor" - * @param component component string - * @param vendor vendor sting - * @param out[out] pointer ot md5 - */ -void vendor_component_md5(char *component, char *vendor, uint8_t *out) -{ - char pair[1024] = "\0"; - if (strlen(component) + strlen(vendor) + 2 >= 1024) return; - - /* Calculate pair_md5 */ - sprintf(pair, "%s/%s", component, 
vendor); - for (int i = 0; i < strlen(pair); i++) pair[i] = tolower(pair[i]); - MD5((uint8_t *)pair, strlen(pair), out); - - /* Log pair_md5 */ - if (debug_on) - { - char hex[oss_purl.key_ln * 2 + 1]; - ldb_bin_to_hex(out, oss_purl.key_ln, hex); - scanlog("vendor/component: %s = %s\n", pair, hex); - } -} - /** * @brief Removes chr from str * @param str input string diff --git a/src/vulnerability.c b/src/vulnerability.c index 02c87e5..defc37a 100644 --- a/src/vulnerability.c +++ b/src/vulnerability.c @@ -244,7 +244,7 @@ void version_md5(uint8_t *out, char *vendor, char *component, char *version) { char triplet[MAX_ARGLN]; sprintf(triplet, "%s/%s/%s", vendor, component, version); - MD5((uint8_t *)triplet, strlen(triplet), out); + oss_vulnerability.hash_calc((uint8_t *)triplet, strlen(triplet), out); } /** From 939f6f78bdcfc55cb5c0f63a438efa190c84a015 Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Tue, 3 Dec 2024 11:44:13 +0100 Subject: [PATCH 11/19] update multi-snippet, add fetch_recordset local wrapper, update selection logic using vulnerabilities, solve memory leaks --- inc/component.h | 1 + inc/match_list.h | 4 +++- inc/scanoss.h | 3 +++ src/attributions.c | 6 +++--- src/binary_scan.c | 7 ++++--- src/component.c | 4 ++-- src/copyright.c | 6 +++--- src/cryptography.c | 2 +- src/dependency.c | 6 +++--- src/file.c | 2 +- src/health.c | 2 +- src/license.c | 8 ++++---- src/match.c | 49 +++++++++++++++++++++++++++++++++------------ src/match_list.c | 47 +++++++++++++++++++++++++++++++++++++++---- src/quality.c | 2 +- src/query.c | 14 ++++++++++--- src/report.c | 13 ++---------- src/snippets.c | 20 +++++++++++++----- src/url.c | 4 ++-- src/vulnerability.c | 13 +++++++----- 20 files changed, 147 insertions(+), 66 deletions(-) diff --git a/inc/component.h b/inc/component.h index 9af3519..5069983 100644 --- a/inc/component.h +++ b/inc/component.h @@ -55,5 +55,6 @@ bool component_date_comparation(component_data_t * a, component_data_t * b); component_data_t * 
component_data_copy(component_data_t * in); int asset_declared(component_data_t * comp); void component_item_free(component_item * comp_item); +void component_purl_md5(component_data_t * component); void fill_component_path(component_data_t *component, char *file_path); #endif \ No newline at end of file diff --git a/inc/match_list.h b/inc/match_list.h index 6773972..7d81be1 100644 --- a/inc/match_list.h +++ b/inc/match_list.h @@ -154,5 +154,7 @@ void component_list_destroy(component_list_t *list); bool component_list_add_binary(component_list_t *list, component_data_t *new_comp, bool (*val)(component_data_t *a, component_data_t *b), bool remove_a); bool match_list_eval(match_list_t *list, match_data_t * in, bool (*eval)(match_data_t *fpa, match_data_t *fpb)); void match_list_tolerance_set(float in); -bool component_list_update(component_list_t *list, component_data_t * in, list_update_t (*eval)(component_data_t *fpa, component_data_t *fpb)); +list_update_t component_list_update(component_list_t *list, component_data_t * in, list_update_t (*eval)(component_data_t *fpa, component_data_t *fpb)); +void component_list_sort(struct comp_entry *np, bool (*val)(component_data_t *a, component_data_t *b)); + #endif diff --git a/inc/scanoss.h b/inc/scanoss.h index f85e5be..6008b40 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -155,4 +155,7 @@ bool ignored_asset_match(uint8_t *url_record); void ldb_get_first_record(struct ldb_table table, uint8_t* key, void *void_ptr); int binary_scan(char * bfp); + +uint32_t fetch_recordset(struct ldb_table table, uint8_t *key, ldb_record_handler_t handler, void *ptr); + #endif diff --git a/src/attributions.c b/src/attributions.c index 425831a..773ce56 100644 --- a/src/attributions.c +++ b/src/attributions.c @@ -39,7 +39,7 @@ #include "parse.h" #include "util.h" #include "mz.h" - +#include "query.h" /** * @brief Notices LDB function pointer. Will be executed for the ldb_fetch_recordset function in each iteration. 
See LDB documentation for more details. * @param key ldb key looking for @@ -120,7 +120,7 @@ bool attribution_handler(struct ldb_table * table, uint8_t *key, uint8_t *subkey bool purl_notices_exist(struct ldb_table oss_attribution, uint8_t *key) { bool validated = true; - ldb_fetch_recordset(NULL, oss_attribution, key, false, attribution_handler, &validated); + fetch_recordset(oss_attribution, key, attribution_handler, &validated); return validated; } @@ -134,7 +134,7 @@ bool purl_notices_exist(struct ldb_table oss_attribution, uint8_t *key) bool print_notices(struct ldb_table oss_attribution, uint8_t *key, char *component) { bool validated = true; - ldb_fetch_recordset(NULL, oss_attribution, key, false, notices_handler, component); + fetch_recordset(oss_attribution, key, notices_handler, component); return validated; } diff --git a/src/binary_scan.c b/src/binary_scan.c index efc0b67..54848c9 100644 --- a/src/binary_scan.c +++ b/src/binary_scan.c @@ -38,6 +38,7 @@ #include "url.h" #include "decrypt.h" #include "report.h" +#include "query.h" component_data_t comp_max_hits = {.hits=-1}; static bool component_hits_comparation(component_data_t *a, component_data_t *b) @@ -83,7 +84,7 @@ static bool add_purl_from_urlid(struct ldb_table * table, uint8_t *key, uint8_t strncpy(path, decrypted, MAX_FILE_PATH); uint8_t *url_rec = calloc(LDB_MAX_REC_LN, 1); /*Alloc memory for url records */ - ldb_fetch_recordset(NULL, oss_url, url_id, false, get_oldest_url, (void *)url_rec); + fetch_recordset(oss_url, url_id, get_oldest_url, (void *)url_rec); /* Create a new component and fill it from the url record */ component_data_t *new_comp = calloc(1, sizeof(*new_comp)); @@ -168,12 +169,12 @@ static void fhash_process(char * hash, component_list_t * comp_list) ldb_hex_to_bin(hash, 32, fhash); /* Get all file IDs for given wfp */ file_recordset *files = calloc(1001, sizeof(file_recordset));; - int records = ldb_fetch_recordset(NULL, oss_fhash, fhash, false, get_all_file_ids, (void *) 
files); + int records = fetch_recordset( oss_fhash, fhash, get_all_file_ids, (void *) files); if (records < max_files_to_process) { for (int i = 0; i < records; i++) { - ldb_fetch_recordset(NULL, oss_file, files[i].url_id, false, add_purl_from_urlid,(void *)comp_list); + fetch_recordset( oss_file, files[i].url_id, add_purl_from_urlid,(void *)comp_list); } } free(files); diff --git a/src/component.c b/src/component.c index 8afda27..5c20c09 100644 --- a/src/component.c +++ b/src/component.c @@ -298,7 +298,7 @@ bool component_date_comparation(component_data_t *a, component_data_t *b) return false; if (!*a->release_date) return true; - +scanlog("%s - %s vs %s - %s\n", a->purls[0], a->release_date, b->purls[0], b->release_date); if (!a->purls_md5[0] && a->purls[0]) { a->purls_md5[0] = malloc(oss_url.key_ln); @@ -348,7 +348,7 @@ void component_purl_md5(component_data_t * component) if (component->purls[i] && !component->purls_md5[i]) { component->purls_md5[i] = malloc(oss_purl.key_ln); - MD5((uint8_t *)component->purls[i], strlen(component->purls[i]), component->purls_md5[i]); + oss_purl.hash_calc((uint8_t *)component->purls[i], strlen(component->purls[i]), component->purls_md5[i]); } } } \ No newline at end of file diff --git a/src/copyright.c b/src/copyright.c index 5139c3b..11f5499 100644 --- a/src/copyright.c +++ b/src/copyright.c @@ -145,17 +145,17 @@ void print_copyrights(component_data_t * comp) uint32_t records = 0; - records = ldb_fetch_recordset(NULL, oss_copyright, comp->file_md5_ref, false, print_copyrights_item, comp); + records = fetch_recordset( oss_copyright, comp->file_md5_ref, print_copyrights_item, comp); scanlog("File md5 copyright records %d\n", records); if (!records) { - records = ldb_fetch_recordset(NULL, oss_copyright, comp->url_md5, false, print_copyrights_item, comp); + records = fetch_recordset( oss_copyright, comp->url_md5, print_copyrights_item, comp); scanlog("URL md5 copyright records %d\n", records); } if (!records) for (int i = 0; i < 
MAX_PURLS && comp->purls[i]; i++) - if (ldb_fetch_recordset(NULL, oss_copyright, comp->purls_md5[i], false, print_copyrights_item, comp)) break; + if (fetch_recordset( oss_copyright, comp->purls_md5[i], print_copyrights_item, comp)) break; char * aux = NULL; if (comp->copyright_text && *comp->copyright_text) diff --git a/src/cryptography.c b/src/cryptography.c index 10e85fe..1348049 100644 --- a/src/cryptography.c +++ b/src/cryptography.c @@ -108,7 +108,7 @@ void print_cryptography(match_data_t * match) memset(crclist, 0, sizeof(crclist)); match->crclist = crclist; - ldb_fetch_recordset(NULL, oss_cryptography, match->file_md5, false, print_crypto_item, match); + fetch_recordset(oss_cryptography, match->file_md5, print_crypto_item, match); char * aux = NULL; asprintf(&aux, "%s%s]", result, (match->crytography_text && *match->crytography_text) ? match->crytography_text : "" ); diff --git a/src/dependency.c b/src/dependency.c index 534c4eb..91e14b0 100644 --- a/src/dependency.c +++ b/src/dependency.c @@ -113,7 +113,7 @@ int print_dependencies(component_data_t * comp) uint32_t records = 0; /* Pull URL dependencies */ - records = ldb_fetch_recordset(NULL, oss_dependency, comp->url_md5, false, print_dependencies_item, NULL); + records = fetch_recordset( oss_dependency, comp->url_md5, print_dependencies_item, NULL); if (records) scanlog("Dependency matches (%d) reported for url_hash\n", records); else @@ -126,7 +126,7 @@ int print_dependencies(component_data_t * comp) uint8_t hash[oss_purl.key_ln]; purl_version_md5(hash, comp->purls[i], comp->version); - records = ldb_fetch_recordset(NULL, oss_dependency, hash, false, print_dependencies_item, comp); + records = fetch_recordset( oss_dependency, hash, print_dependencies_item, comp); if (records) { scanlog("Dependency matches (%d) reported for %s@%s\n", records, comp->purls[i],comp->version); @@ -142,7 +142,7 @@ int print_dependencies(component_data_t * comp) uint8_t hash[oss_purl.key_ln]; purl_version_md5(hash, 
comp->purls[i], comp->latest_version); - records = ldb_fetch_recordset(NULL, oss_dependency, hash, false, print_dependencies_item, comp); + records = fetch_recordset( oss_dependency, hash, print_dependencies_item, comp); if (records) { scanlog("Dependency matches (%d) reported for %s@%s\n", records, comp->purls[i],comp->latest_version); diff --git a/src/file.c b/src/file.c index 563db22..35a9e0d 100644 --- a/src/file.c +++ b/src/file.c @@ -257,6 +257,6 @@ char *get_file_extension(uint8_t *md5) char *out = malloc(MAX_ARGLN + 1); *out = 0; - ldb_fetch_recordset(NULL, oss_file, md5, false, get_first_file, out); + fetch_recordset(oss_file, md5, get_first_file, out); return out; } diff --git a/src/health.c b/src/health.c index bede8df..b03f7ed 100644 --- a/src/health.c +++ b/src/health.c @@ -111,6 +111,6 @@ void print_health(component_data_t *component) { if (!ldb_table_exists(oss_purl.db, oss_purl.table)) //skip crypto if the table is not present return; - ldb_fetch_recordset(NULL, oss_purl, component->purls_md5[0], false, print_health_item, component); + fetch_recordset(oss_purl, component->purls_md5[0], print_health_item, component); } diff --git a/src/license.c b/src/license.c index 28d5bbe..5dae43e 100644 --- a/src/license.c +++ b/src/license.c @@ -363,10 +363,10 @@ void print_licenses(component_data_t *comp) /* Look for component or file license */ - records = ldb_fetch_recordset(NULL, oss_license, comp->file_md5_ref, false, print_licenses_item, comp); + records = fetch_recordset(oss_license, comp->file_md5_ref, print_licenses_item, comp); scanlog("License for file_id license returns %d hits\n", records); - records = ldb_fetch_recordset(NULL, oss_license, comp->url_md5, false, print_licenses_item, comp); + records = fetch_recordset(oss_license, comp->url_md5, print_licenses_item, comp); scanlog("License for url_id license returns %d hits\n", records); for (int i = 0; i < MAX_PURLS && comp->purls[i]; i++) @@ -375,13 +375,13 @@ void print_licenses(component_data_t 
*comp) uint8_t purlversion_md5[oss_purl.key_ln]; purl_version_md5(purlversion_md5, comp->purls[i], comp->version); - records = ldb_fetch_recordset(NULL, oss_license, purlversion_md5, false, print_licenses_item, comp); + records = fetch_recordset(oss_license, purlversion_md5, print_licenses_item, comp); scanlog("License for %s@%s license returns %d hits\n", comp->purls[i], comp->version, records); if (records) break; - records = ldb_fetch_recordset(NULL, oss_license, comp->purls_md5[i], false, print_licenses_item, comp); + records = fetch_recordset(oss_license, comp->purls_md5[i], print_licenses_item, comp); scanlog("License for %s license returns %d hits\n", comp->purls[i], records); if (records) diff --git a/src/match.c b/src/match.c index a3d6c65..ca82a33 100644 --- a/src/match.c +++ b/src/match.c @@ -289,9 +289,14 @@ static bool component_hint_date_comparation(component_data_t *a, component_data_ if (result < 0) return false; } + + /*if (strstr(a->file, "contrib") && !strstr(b->file, "contrib")) + return true; + if (!strstr(a->file, "contrib") && strstr(b->file, "contrib")) + return false;*/ if ((engine_flags & ENABLE_PATH_HINT) && a->file_path_ref && b->file_path_ref) - { + { //evalute path rank for component a evaluate_path_rank(a); @@ -389,7 +394,7 @@ list_update_t component_update(component_data_t *a, component_data_t *b) return LIST_ITEM_UPDATE; else { - scanlog("--- Componen already exist: %s---\n", b->component); + scanlog("--- Componen already exist: %s--- %s vs %s\n", b->purls[0], b->release_date, a->release_date); component_data_free(b); return LIST_ITEM_FOUND; } @@ -402,11 +407,11 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, { component_data_t *new_comp = NULL; - ldb_fetch_recordset(NULL, oss_url, url_id, false, get_oldest_url, (void *)&new_comp); - if (!new_comp) - return false; - - fill_component_path(new_comp, path); + ldb_fetch_recordset(NULL, oss_url, url_id, false, get_oldest_url, (void *)url_rec); + + /* 
Extract date from url_rec */ + char date[MAX_ARGLN] = "0"; + extract_csv(date, (char *)url_rec, 4, MAX_ARGLN); /* Create a new component and fill it from the url record */ component_data_t *new_comp = calloc(1, sizeof(*new_comp)); bool result = fill_component(new_comp, url_id, path, (uint8_t *)url_rec); @@ -420,16 +425,21 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, asset_declared(new_comp); new_comp->file_path_ref = component_list->match_ref->scan_ower->file_path; new_comp->path_rank = PATH_LEVEL_COMP_INIT_VALUE; - if (!component_list_update(component_list, new_comp, component_update)) + list_update_t r = component_list_update(component_list, new_comp, component_update); + if (r == LIST_ITEM_NOT_FOUND) { scanlog("--- new comp %s---\n", new_comp->component); if (!component_list_add(component_list, new_comp, component_hint_date_comparation, true)) { - scanlog("component rejected: %s\n", new_comp->purls[0]); + scanlog("component rejected: %s - %s\n", new_comp->purls[0], new_comp->release_date); component_data_free(new_comp); /* Free if the componet was rejected */ } else - scanlog("component accepted: %s - pathrank: %d\n", new_comp->purls[0], new_comp->path_rank); + scanlog("component accepted: %s - %s - pathrank: %d\n", new_comp->purls[0], new_comp->release_date, new_comp->path_rank); + } + else if (r == LIST_ITEM_UPDATE && component_list->headp.lh_first) + { + component_list_sort(component_list->headp.lh_first, component_hint_date_comparation); } } else @@ -456,7 +466,7 @@ bool path_query_handler(struct ldb_table * table, uint8_t * key, uint8_t * subke static char * path_query(uint8_t * file_id) { char * path = NULL; - ldb_fetch_recordset(NULL, oss_path, file_id, false, path_query_handler, (void *) &path); + fetch_recordset(oss_path, file_id, path_query_handler, (void *) &path); return path; } @@ -542,11 +552,11 @@ bool load_matches(match_data_t *match) uint32_t records = 0; /*Query to url table looking for a url match, will 
add the components to component list */ - records = ldb_fetch_recordset(NULL, oss_url, match->file_md5, false, handle_url_record, (void *)&match->component_list); + records = fetch_recordset(oss_url, match->file_md5, handle_url_record, (void *)&match->component_list); scanlog("URL recordset contains %u records\n", records); /*Collect all files from the files table matching with the match md5 being processed */ - records = ldb_fetch_recordset(NULL, oss_file, match->file_md5, false, component_from_file,(void *)&match->component_list); + records = fetch_recordset(oss_file, match->file_md5, component_from_file,(void *)&match->component_list); scanlog("Found %d file entries\n", records); /* Final optimization based on the available information for a component */ @@ -557,6 +567,10 @@ bool load_matches(match_data_t *match) struct comp_entry *item = NULL; LIST_FOREACH(item, &match->component_list.headp, entries) { + /*Check if there are some purl's md5 missing. We could do this earlier, but this is a performance optimization*/ + component_purl_md5(item->component); + + scanlog("Tiebreak\n"); if (!item->entries.le_next || !item->entries.le_next->component) break; @@ -678,6 +692,15 @@ void match_select_best(scan_data_t *scan) struct entry *item = NULL; LIST_FOREACH(item, &scan->matches_list_array[i]->headp, entries) { + if (debug_on) + { + struct comp_entry *comp = NULL; + int comp_n = 0; + LIST_FOREACH(comp, &item->match->component_list.headp, entries) + { + scanlog("<<<%d %s - %s>>>\n", comp_n, comp->component->purls[0], comp->component->release_date); + } + } if (find_oldest_match(scan->matches_list_array[i]->best_match, item->match)) scan->matches_list_array[i]->best_match = item->match; } diff --git a/src/match_list.c b/src/match_list.c index 89d6ff1..60ef8ef 100644 --- a/src/match_list.c +++ b/src/match_list.c @@ -137,6 +137,11 @@ bool component_list_add(component_list_t *list, component_data_t *new_comp, bool if(list->last_element_aux) { + if (debug_on) + { + 
scanlog(">>> component %s-%s is replaced by %s-%s <<<\n", list->last_element->component->purls[0], list->last_element->component->release_date, + new_comp->purls[0], new_comp->release_date); + } component_data_free(list->last_element->component); LIST_REMOVE(list->last_element_aux->entries.le_next, entries); free(list->last_element); @@ -427,33 +432,67 @@ void component_list_print(component_list_t *list, bool (*printer)(component_data } } -bool component_list_update(component_list_t *list, component_data_t * in, list_update_t (*eval)(component_data_t *fpa, component_data_t *fpb)) +static void component_switch(struct comp_entry * na, struct comp_entry * nb) +{ + component_data_t * aux = na->component; + na->component = nb->component; + nb->component = aux; +} +void component_list_sort(struct comp_entry *np, bool (*val)(component_data_t *a, component_data_t *b)) +{ + struct comp_entry *next = np->entries.le_next; + if (!next) + return; + if (next->entries.le_next) + { + component_list_sort(next, val); + } + if (val(np->component, next->component)) + component_switch(np, next); +} + + +list_update_t component_list_update(component_list_t *list, component_data_t * in, list_update_t (*eval)(component_data_t *fpa, component_data_t *fpb)) { for (struct comp_entry *np = list->headp.lh_first; np != NULL; np = np->entries.le_next) { list_update_t r = eval(np->component, in); if (r == LIST_ITEM_UPDATE) { + scanlog("update component %s with release date %s by %s\n", np->component->purls[0], np->component->release_date, in->release_date); component_data_t * aux = np->component; np->component = in; component_data_free(aux); - return true; + return r; } else if (r == LIST_ITEM_FOUND) - return true; + return r; } - return false; + return LIST_ITEM_NOT_FOUND; } void match_list_process(match_list_t *list, bool (*funct_p)(match_data_t *fpa)) { for (struct entry *np = list->headp.lh_first; np != NULL; np = np->entries.le_next) { + if (debug_on) + { + char md5_hex[MD5_LEN * 2 + 1]; + 
ldb_bin_to_hex(np->match->file_md5, MD5_LEN, md5_hex); + scanlog("-------- looking matches for md5: %s --------\n", md5_hex); + } bool result = funct_p(np->match); + if (debug_on) + { + scanlog("<<>>\n", np->match->component_list.headp.lh_first->component->purls[0], + np->match->component_list.headp.lh_first->component->release_date); + } + if (result) break; } + } bool match_list_is_empty(match_list_t *list) diff --git a/src/quality.c b/src/quality.c index ee93ac1..e4281e8 100644 --- a/src/quality.c +++ b/src/quality.c @@ -110,7 +110,7 @@ void print_quality(match_data_t * match) sprintf(result,"\"quality\": ["); - ldb_fetch_recordset(NULL, oss_quality, match->file_md5, false, print_quality_item, &aux); + fetch_recordset(oss_quality, match->file_md5, print_quality_item, &aux); free(match->quality_text); asprintf(&match->quality_text, "%s%s]", result, aux ? aux : ""); diff --git a/src/query.c b/src/query.c index 8d42f8d..e29fd24 100644 --- a/src/query.c +++ b/src/query.c @@ -28,7 +28,7 @@ * //TODO Long description * @see https://github.com/scanoss/engine/blob/master/src/quality.c */ - +#include "scanoss.h" #include "query.h" #include "parse.h" #include "util.h" @@ -108,7 +108,7 @@ void get_url_record(uint8_t *md5, uint8_t *record) *record = 0; /* Fetch record */ - ldb_fetch_recordset(NULL, oss_url, md5, false, ldb_get_first_url_not_ignored, (void *) record); + fetch_recordset(oss_url, md5, ldb_get_first_url_not_ignored, (void *) record); } /** @@ -200,7 +200,7 @@ int get_component_age(uint8_t *md5) long age = 0; if (ldb_table_exists(oss_purl.db, oss_purl.table)) //skip purl if the table is not present - ldb_fetch_recordset(NULL, oss_purl, md5, false, handle_get_component_age, &age); + fetch_recordset(oss_purl, md5, handle_get_component_age, &age); return age; } @@ -218,3 +218,11 @@ void purl_version_md5(uint8_t *out, char *purl, char *version) oss_purl.hash_calc((uint8_t *)purl_version, strlen(purl_version), out); } +uint32_t fetch_recordset(struct ldb_table table, 
uint8_t *key, ldb_record_handler_t handler, void *ptr) + { + if (!key) + return 0; + ldb_sector_t sector = {.data = NULL, .size= 0, .id = *key}; + return ldb_fetch_recordset(&sector, table, key, false, handler, ptr); +} + diff --git a/src/report.c b/src/report.c index 46ffd06..391e9e9 100644 --- a/src/report.c +++ b/src/report.c @@ -197,16 +197,6 @@ bool print_json_component(component_data_t * component) { if (!component) return true; - - /*Check if there are some purl's md5 missing. We could do this earlier, but this is a performance optimization*/ - for (int i = 0; i < MAX_PURLS; i++) - { - if (component->purls[i] && !component->purls_md5[i]) - { - component->purls_md5[i] = malloc(oss_purl.key_ln); - oss_purl.hash_calc((uint8_t *)component->purls[i], strlen(component->purls[i]), component->purls_md5[i]); - } - } scanlog("print component\n"); if (engine_flags & DISABLE_BEST_MATCH) @@ -298,7 +288,8 @@ bool print_json_component(component_data_t * component) if (!(engine_flags & DISABLE_VULNERABILITIES)) { - print_vulnerabilities(component); + if (!component->vulnerabilities_text) + print_vulnerabilities(component); if (component->vulnerabilities_text) printf(",%s", json_remove_invalid_char(component->vulnerabilities_text)); } diff --git a/src/snippets.c b/src/snippets.c index d2ebe5b..870669e 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -145,6 +145,9 @@ void biggest_snippet(scan_data_t *scan) } int hits = compile_ranges(match_new); + if (hits < min_match_hits) + continue; + float percent = (hits * 100) / match_new->scan_ower->total_lines; int matched_percent = floor(percent); if (matched_percent > 99) @@ -446,8 +449,8 @@ uint32_t compile_ranges(match_data_t *match) /* Exit if hits is below two */ if (reported_hits < min_match_hits) { - scanlog("Discarted ranges brings hits count to %u\n", reported_hits); - return 0; + scanlog("Discarted ranges brings hits count to %u (MIN MATCH HITS: %d)\n", reported_hits, min_match_hits); + return reported_hits; }
//scanlog("compile_ranges #%d = %ld to %ld - OSS from: %d\n", i, from, to, match->matchmap_reg->range[i].oss_line); @@ -711,7 +714,7 @@ int add_file_to_matchmap(scan_data_t *scan, matchmap_entry_t *item, uint8_t *md5 if (found == scan->matchmap_size) scan->matchmap_size++; - return 0; + return found; } /** @@ -751,7 +754,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) scanlog(" Add wfp %02x%02x%02x%02x to map\n",map[i].wfp[0], map[i].wfp[1],map[i].wfp[2],map[i].wfp[3]); uint32_write(map[i].md5_set, 0); map[i].line = scan->lines[i]; - ldb_fetch_recordset(NULL, oss_wfp, map[i].wfp, false, get_all_file_ids, (void *)map[i].md5_set); + fetch_recordset(oss_wfp, map[i].wfp, get_all_file_ids, (void *)map[i].md5_set); map[i].size = uint32_read(map[i].md5_set) / oss_wfp.rec_ln; //Initializate the lines indirection when a wfp from a line has at least one md5 linked if (map[i].size) @@ -910,7 +913,14 @@ match_t ldb_scan_snippets(scan_data_t *scan) break; } - add_file_to_matchmap(scan, &map[i], &md5s[wfp_p], last_sector_aux, &sector_max, &scan->matchmap_rank_by_sector[sector]); + int pos = add_file_to_matchmap(scan, &map[i], &md5s[wfp_p], last_sector_aux, &sector_max, &scan->matchmap_rank_by_sector[sector]); + /*if (pos >= 0 && debug_on) + { + char key_hex[(MD5_LEN+2)*2 + 1]; + ldb_bin_to_hex(&md5s[wfp_p], MD5_LEN+2, key_hex); + printf("%02x%02x%02x%02x,%s\n", map[i].wfp[0], map[i].wfp[1], map[i].wfp[2], map[i].wfp[3], key_hex); + }*/ + } } } diff --git a/src/url.c b/src/url.c index 2fe9467..aa3a4eb 100644 --- a/src/url.c +++ b/src/url.c @@ -260,7 +260,7 @@ void fetch_related_purls(component_data_t *component) uint32_t CRC = string_crc32c(purl_type); add_CRC(component->crclist, CRC); - int purls = ldb_fetch_recordset(NULL, oss_purl, component->purls_md5[i], false, handle_purl_record, component); + int purls = fetch_recordset( oss_purl, component->purls_md5[i], handle_purl_record, component); if (purls) scanlog("Finding related PURLs for %s returned %d matches\n", component->purls[i],
purls); else @@ -311,7 +311,7 @@ void purl_release_date(char *purl, char *date) uint8_t purl_md5[oss_purl.key_ln]; oss_purl.hash_calc((uint8_t *)purl, strlen(purl), purl_md5); - ldb_fetch_recordset(NULL, oss_purl, purl_md5, false, get_purl_first_release, (void *) date); + fetch_recordset( oss_purl, purl_md5, get_purl_first_release, (void *) date); } diff --git a/src/vulnerability.c b/src/vulnerability.c index defc37a..b7b3f16 100644 --- a/src/vulnerability.c +++ b/src/vulnerability.c @@ -252,9 +252,11 @@ void version_md5(uint8_t *out, char *vendor, char *component, char *version) * @param match match structure */ int print_vulnerabilities(component_data_t *component) +int print_vulnerabilities(component_data_t *component) { if (!ldb_table_exists(oss_vulnerability.db, oss_vulnerability.table)) // skip purl if the table is not present return 0; + return 0; scanlog("Process vulnerabilities\n"); uint32_t crclist[CRC_LIST_LEN]; memset(crclist, 0, sizeof(crclist)); @@ -265,9 +267,10 @@ int print_vulnerabilities(component_data_t *component) component->vulnerabilities = 0; component->crclist = crclist; int records = 0; + int records = 0; /* Search for purl */ for (int i = 0; i < MAX_PURLS && component->purls[i]; i++) - records += ldb_fetch_recordset(NULL, oss_vulnerability, component->purls_md5[i], false, print_vulnerability_item, component); + records += fetch_recordset( oss_vulnerability, component->purls_md5[i], print_vulnerability_item, component); /* Search for purl@version in NVD */ @@ -275,7 +278,7 @@ int print_vulnerabilities(component_data_t *component) { uint8_t md5[MD5_LEN]; purl_version_md5(md5, component->purls[i], comp.version); - records += ldb_fetch_recordset(NULL, oss_vulnerability, md5, false, print_vulnerability_item, component); + records += fetch_recordset( oss_vulnerability, md5, print_vulnerability_item, component); } /* Search for for purl@latest_version in NVD */ @@ -285,21 +288,21 @@ int print_vulnerabilities(component_data_t *component) { uint8_t 
md5[MD5_LEN]; purl_version_md5(md5, component->purls[i], comp.latest_version); - records += ldb_fetch_recordset(NULL, oss_vulnerability, md5, false, print_vulnerability_item, component); + records += fetch_recordset( oss_vulnerability, md5, print_vulnerability_item, component); } } /* Search for vendor/component/version in NVD */ uint8_t md5[MD5_LEN]; version_md5(md5, component->vendor, component->component, comp.version); - records += ldb_fetch_recordset(NULL, oss_vulnerability, md5, false, print_vulnerability_item, component); + records += fetch_recordset( oss_vulnerability, md5, print_vulnerability_item, component); /* Search for vendor/component/latest_version in NVD */ if (strcmp(comp.version, comp.latest_version)) { uint8_t md5[MD5_LEN]; version_md5(md5, component->vendor, comp.component, comp.latest_version); - records += ldb_fetch_recordset(NULL, oss_vulnerability, md5, false, print_vulnerability_item, component); + records += fetch_recordset( oss_vulnerability, md5, print_vulnerability_item, component); } char * aux = NULL; From 950dcd26e46d07fc3bdbe0032a2c86c9ea06d01b Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Fri, 3 Jan 2025 00:03:22 +0100 Subject: [PATCH 12/19] remove match extensions flag --- inc/match_list.h | 2 +- inc/scanoss.h | 1 - src/help.c | 22 ++++++++++------------ src/main.c | 4 ---- src/match.c | 1 - 5 files changed, 11 insertions(+), 19 deletions(-) diff --git a/inc/match_list.h b/inc/match_list.h index 7d81be1..6fece78 100644 --- a/inc/match_list.h +++ b/inc/match_list.h @@ -78,7 +78,7 @@ #define SCAN_MAX_SNIPPETS_DEFAULT 1 #define SCAN_MAX_COMPONENTS_DEFAULT 3 -#define MATCH_LIST_TOLERANCE 99.9 +#define MATCH_LIST_TOLERANCE 95 typedef struct match_data_t match_data_t; /* Forward declaration */ typedef enum diff --git a/inc/scanoss.h b/inc/scanoss.h index 6008b40..63646a4 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -114,7 +114,6 @@ typedef struct component_item extern long microseconds_start; extern int map_rec_len; 
-extern bool match_extensions; /*component hint hold the last component matched/guessed */ extern char * component_hint; diff --git a/src/help.c b/src/help.c index f36ebe5..15772c7 100644 --- a/src/help.c +++ b/src/help.c @@ -46,18 +46,16 @@ Results are displayed in JSON format through STDOUT.\n\ Syntax: scanoss [parameters] [TARGET]\n\ \n\ Configuration:\n\ --w Process TARGET as a .wfp file, regardless of its actual extension.\n\ --H Enable High Precision Snippet Match mode (requires 'libhpsm.so' in the system).\n\ --e Match only files with identical extensions as the scanned file (default: off).\n\ --M NUMBER Search for up to NUMBER different components in each file (maximum: 9).\n\ --T NUMBER Set snippet scanning tolerance percentage (default: 0.1).\n\ --s SBOM Include assets from a JSON SBOM file (CycloneDX/SPDX2.2 format) in identification.\n\ --b SBOM Exclude matches from assets listed in JSON SBOM file (CycloneDX/SPDX2.2 format).\n\ --B SBOM Same as \"-b\" but with forced snippet scanning.\n\ --a SBOM Show attribution notices for the provided SBOM.json file.\n\ --c HINT Add a component HINT to guide scan results.\n\ --k KEY Show contents of the specified KEY file from MZ sources archive.\n\ --l LICENSE Display OSADL metadata for the given SPDX license ID.\n\ +-w Treats TARGET as a .wfp file regardless of the actual file extension.\n\ +-H High Precision Snippet Match mode, 'libhpsm.so' must be present in the system.\n\ +-M NUMBER Looks for NUMBER of different components in a file (MAX 9).\n\ +-s SBOM Use assets specified in JSON SBOM (CycloneDX/SPDX2.2 JSON format) as input to identification.\n\ +-b SBOM Ignore matches to assets specified in JSON SBOM (CycloneDX/SPDX2.2 JSON format).\n\ +-B SBOM Same than \"-b\" but forcing snippet scan.\n\ +-a SBOM Displays attribution notices for provided SBOM.json.\n\ +-c HINT Provide a component HINT to influence scan results.\n\ +-k KEY Displays contents of file KEY from MZ sources archive.\n\ +-l LICENSE Displays OSADL 
metadata for the provided SPDX license ID.\n\ \n\ Options:\n\ -t Run engine performance tests.\n\ diff --git a/src/main.c b/src/main.c index 486ce6a..30b0f6b 100644 --- a/src/main.c +++ b/src/main.c @@ -412,10 +412,6 @@ int main(int argc, char **argv) exit(EXIT_SUCCESS); break; - case 'e': - match_extensions = true; - break; - case 'q': engine_flags = engine_flags_cmd_line; debug_on = true; diff --git a/src/match.c b/src/match.c index ca82a33..a4456f3 100644 --- a/src/match.c +++ b/src/match.c @@ -51,7 +51,6 @@ #include "health.h" const char *matchtypes[] = {"none", "file", "snippet", "binary"}; /** describe the availables kinds of match */ -bool match_extensions = false; /** global match extension flag */ bool path_table_present = false; char *component_hint = NULL; From 2967bd4bac2e17db7e75032eb4e913f51d3587e9 Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Fri, 3 Jan 2025 02:42:08 +0100 Subject: [PATCH 13/19] solve errors after rebase from main --- src/match.c | 57 +++++++++++++++++---------------------------- src/report.c | 1 - src/util.c | 4 +--- src/vulnerability.c | 4 +--- 4 files changed, 23 insertions(+), 43 deletions(-) diff --git a/src/match.c b/src/match.c index a4456f3..deb8fe2 100644 --- a/src/match.c +++ b/src/match.c @@ -406,46 +406,31 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, { component_data_t *new_comp = NULL; - ldb_fetch_recordset(NULL, oss_url, url_id, false, get_oldest_url, (void *)url_rec); - - /* Extract date from url_rec */ - char date[MAX_ARGLN] = "0"; - extract_csv(date, (char *)url_rec, 4, MAX_ARGLN); - /* Create a new component and fill it from the url record */ - component_data_t *new_comp = calloc(1, sizeof(*new_comp)); - bool result = fill_component(new_comp, url_id, path, (uint8_t *)url_rec); - if (result) + fetch_recordset(oss_url, url_id, get_oldest_url, (void *)&new_comp); + if (!new_comp) + return false; + fill_component_path(new_comp, path); + new_comp->file_md5_ref = 
component_list->match_ref->file_md5; + /* If the component is valid add it to the component list */ + /* The component list is a fixed size list, of size 3 by default, this means the list will keep the free oldest components*/ + /* The oldest component will be the first in the list, if two components have the same age the purl date will untie */ + new_comp->file_path_ref = component_list->match_ref->scan_ower->file_path; + new_comp->path_rank = PATH_LEVEL_COMP_INIT_VALUE; + list_update_t r = component_list_update(component_list, new_comp, component_update); + if (r == LIST_ITEM_NOT_FOUND) { - new_comp->file_md5_ref = component_list->match_ref->file_md5; - /* If the component is valid add it to the component list */ - /* The component list is a fixed size list, of size 3 by default, this means the list will keep the free oldest components*/ - /* The oldest component will be the first in the list, if two components have the same age the purl date will untie */ - new_comp->identified = IDENTIFIED_NONE; - asset_declared(new_comp); - new_comp->file_path_ref = component_list->match_ref->scan_ower->file_path; - new_comp->path_rank = PATH_LEVEL_COMP_INIT_VALUE; - list_update_t r = component_list_update(component_list, new_comp, component_update); - if (r == LIST_ITEM_NOT_FOUND) + scanlog("--- new comp %s---\n", new_comp->component); + if (!component_list_add(component_list, new_comp, component_hint_date_comparation, true)) { - scanlog("--- new comp %s---\n", new_comp->component); - if (!component_list_add(component_list, new_comp, component_hint_date_comparation, true)) - { - scanlog("component rejected: %s - %s\n", new_comp->purls[0], new_comp->release_date); - component_data_free(new_comp); /* Free if the componet was rejected */ - } - else - scanlog("component accepted: %s - %s - pathrank: %d\n", new_comp->purls[0], new_comp->release_date, new_comp->path_rank); - } - else if (r == LIST_ITEM_UPDATE && component_list->headp.lh_first) - { - 
component_list_sort(component_list->headp.lh_first, component_hint_date_comparation); + scanlog("component rejected: %s - %s\n", new_comp->purls[0], new_comp->release_date); + component_data_free(new_comp); /* Free if the componet was rejected */ } + else + scanlog("component accepted: %s - %s - pathrank: %d\n", new_comp->purls[0], new_comp->release_date, new_comp->path_rank); } - else + else if (r == LIST_ITEM_UPDATE && component_list->headp.lh_first) { - char hex_url[MD5_LEN * 2 + 1]; - ldb_bin_to_hex(new_comp->url_md5, MD5_LEN, hex_url); - scanlog("component accepted: %s@%s - pathrank: %d - %s - %s\n", new_comp->purls[0], new_comp->version, new_comp->path_rank, new_comp->file, hex_url); + component_list_sort(component_list->headp.lh_first, component_hint_date_comparation); } return true; @@ -481,7 +466,7 @@ static char * path_query(uint8_t * file_id) * @return false */ -bool component_from_file(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) +bool component_from_file(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr) { /*Iterations must be doubled if high accuracy is enabled*/ int iteration_max = ((engine_flags & ENABLE_HIGH_ACCURACY) ? 
FETCH_MAX_FILES * 4 : FETCH_MAX_FILES); diff --git a/src/report.c b/src/report.c index 391e9e9..36409b0 100644 --- a/src/report.c +++ b/src/report.c @@ -238,7 +238,6 @@ bool print_json_component(component_data_t * component) char url_id[oss_url.key_ln * 2 + 1]; ldb_bin_to_hex(component->url_md5, oss_url.key_ln, url_id); printf("\"url_hash\": \"%s\"", url_id); - free(url_id); if (!(engine_flags & DISABLE_LICENSES)) { diff --git a/src/util.c b/src/util.c index be218e6..a162359 100644 --- a/src/util.c +++ b/src/util.c @@ -294,7 +294,7 @@ void free_and_null(void * pr) pr = NULL; } -bool path_is_third_party(const char* path) +bool path_is_third_party(const char *path) { const char* patterns[] = { "third_party", @@ -354,5 +354,3 @@ bool path_is_third_party(const char* path) return false; } - - diff --git a/src/vulnerability.c b/src/vulnerability.c index b7b3f16..b526ccd 100644 --- a/src/vulnerability.c +++ b/src/vulnerability.c @@ -252,11 +252,10 @@ void version_md5(uint8_t *out, char *vendor, char *component, char *version) * @param match match structure */ int print_vulnerabilities(component_data_t *component) -int print_vulnerabilities(component_data_t *component) { if (!ldb_table_exists(oss_vulnerability.db, oss_vulnerability.table)) // skip purl if the table is not present return 0; - return 0; + scanlog("Process vulnerabilities\n"); uint32_t crclist[CRC_LIST_LEN]; memset(crclist, 0, sizeof(crclist)); @@ -267,7 +266,6 @@ int print_vulnerabilities(component_data_t *component) component->vulnerabilities = 0; component->crclist = crclist; int records = 0; - int records = 0; /* Search for purl */ for (int i = 0; i < MAX_PURLS && component->purls[i]; i++) records += fetch_recordset( oss_vulnerability, component->purls_md5[i], print_vulnerability_item, component); From 119c49b52b71e5a9b8810849546a86197ac42103 Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Wed, 7 May 2025 04:31:52 +0200 Subject: [PATCH 14/19] add build project structure functionality --- 
inc/scanoss.h | 3 ++- src/component.c | 2 +- src/main.c | 10 +++++++ src/pivot.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 src/pivot.c diff --git a/inc/scanoss.h b/inc/scanoss.h index 63646a4..59ad37c 100644 --- a/inc/scanoss.h +++ b/inc/scanoss.h @@ -122,6 +122,7 @@ extern char * component_hint; /* DB tables */ extern struct ldb_table oss_url; +extern struct ldb_table oss_pivot; extern struct ldb_table oss_file; extern struct ldb_table oss_path; extern struct ldb_table oss_wfp; @@ -156,5 +157,5 @@ void ldb_get_first_record(struct ldb_table table, uint8_t* key, void *void_ptr); int binary_scan(char * bfp); uint32_t fetch_recordset(struct ldb_table table, uint8_t *key, ldb_record_handler_t handler, void *ptr); - +void get_project_files(char * url_key); #endif diff --git a/src/component.c b/src/component.c index 5c20c09..ab57494 100644 --- a/src/component.c +++ b/src/component.c @@ -286,7 +286,7 @@ bool fill_component(component_data_t *component, uint8_t *url_key, char *file_pa { component->purls[0] = strdup(purl); component->purls_md5[0] = malloc(MD5_LEN); - MD5((uint8_t *)component->purls[0], strlen(component->purls[0]), component->purls_md5[0]); + oss_purl.hash_calc( (unsigned char *) component->purls[0], strlen(component->purls[0]), component->purls_md5[0]); } component->age = -1; return true; diff --git a/src/main.c b/src/main.c index 30b0f6b..5328ec9 100644 --- a/src/main.c +++ b/src/main.c @@ -48,6 +48,7 @@ #include struct ldb_table oss_url; +struct ldb_table oss_pivot; struct ldb_table oss_file; struct ldb_table oss_path; struct ldb_table oss_wfp; @@ -199,6 +200,10 @@ void initialize_ldb_tables(char *name) oss_notices = ldb_read_cfg(dbtable); oss_notices.hash_calc = hash_function_select(oss_notices.key_ln); + snprintf(dbtable, MAX_ARGLN * 2, "%s/%s", oss_db_name, "pivot"); + oss_pivot = ldb_read_cfg(dbtable); + oss_pivot.hash_calc = hash_function_select(oss_pivot.key_ln); + 
kb_version_get(); osadl_load_file(); @@ -401,6 +406,11 @@ int main(int argc, char **argv) scan_benchmark(); exit(EXIT_SUCCESS); break; + case 'p': + initialize_ldb_tables(ldb_db_name); + get_project_files(optarg); + exit(EXIT_SUCCESS); + break; case 'v': printf("scanoss-%s\n", SCANOSS_VERSION); diff --git a/src/pivot.c b/src/pivot.c new file mode 100644 index 0000000..be26b89 --- /dev/null +++ b/src/pivot.c @@ -0,0 +1,70 @@ +#include "scanoss.h" +#include +#include "decrypt.h" +struct out_buffer_s { + char * buffer; + int pos; +}; + +struct get_path_s { + char * path; + uint8_t * url_key; +}; + +bool get_path(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +{ + char * path = decrypt_data(data, datalen, *table, key, subkey); + if (!path) { + return false; + } + char ** out = (char**) ptr; + *out = path; + return true; +} + +bool get_file_path_hash(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +{ + if (datalen < table->key_ln) + return false; + struct get_path_s * get_path_url = ptr; + //if the url key is not the same is not a useful match + if (memcmp(get_path_url->url_key, data, table->key_ln)) + return false; + + uint8_t * path_key = &data[table->key_ln]; + char * path = NULL; + fetch_recordset(oss_path, path_key, get_path, (void *)&path); + get_path_url->path = path; + return true; +} + + +bool get_project_hashes(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) +{ + uint8_t * file_key = data; + struct get_path_s get_path = {.url_key = key, .path = NULL}; + char key_hex[17]; + ldb_bin_to_hex(file_key,table->key_ln,key_hex); + + fetch_recordset(oss_file, file_key, get_file_path_hash, (void *)&get_path); + char * output = ptr; + char * line = NULL; + if (get_path.path) + { + asprintf(&line, "%s,%s\n", key_hex, get_path.path); + strcat(output, line); + free(line); + 
} + + free(get_path.path); + return false; +} + +void get_project_files(char * url_key_hex) +{ + uint8_t url_key[8]; + ldb_hex_to_bin(url_key_hex, 16, url_key); + char * out = calloc(1,1024*1024*500); + fetch_recordset(oss_pivot, url_key, get_project_hashes, (void *)out); + printf("%s", out); +} \ No newline at end of file From fb2bec89deef1bdb78f487efd5e9bf67f6369922 Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Wed, 7 May 2025 12:53:07 +0200 Subject: [PATCH 15/19] fix rebase issues --- inc/match_list.h | 2 +- src/report.c | 3 +++ src/snippets.c | 53 +++++++++++------------------------------------- 3 files changed, 16 insertions(+), 42 deletions(-) diff --git a/inc/match_list.h b/inc/match_list.h index 6fece78..7d81be1 100644 --- a/inc/match_list.h +++ b/inc/match_list.h @@ -78,7 +78,7 @@ #define SCAN_MAX_SNIPPETS_DEFAULT 1 #define SCAN_MAX_COMPONENTS_DEFAULT 3 -#define MATCH_LIST_TOLERANCE 95 +#define MATCH_LIST_TOLERANCE 99.9 typedef struct match_data_t match_data_t; /* Forward declaration */ typedef enum diff --git a/src/report.c b/src/report.c index 36409b0..292b609 100644 --- a/src/report.c +++ b/src/report.c @@ -336,6 +336,9 @@ bool print_json_match(struct match_data_t * match) else printf(",\"file_url\": \"%s\"", match->component_list.headp.lh_first->component->url); } + else //return an empty string + printf(",\"file_url\": \" \""); + if (!(engine_flags & DISABLE_QUALITY)) { diff --git a/src/snippets.c b/src/snippets.c index 870669e..c5ca57a 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -144,17 +144,20 @@ void biggest_snippet(scan_data_t *scan) continue; } - int hits = compile_ranges(match_new); - if (hits < min_match_hits) + int matched_lines = compile_ranges(match_new); + if (matched_lines < min_match_lines) { + match_data_free(match_new); continue; + } - float percent = (hits * 100) / match_new->scan_ower->total_lines; + float percent = (matched_lines * 100) / match_new->scan_ower->total_lines; int matched_percent = floor(percent); if 
(matched_percent > 99) matched_percent = 99; if (matched_percent < 1) matched_percent = 1; asprintf(&match_new->matched_percent, "%u%%", matched_percent); + match_new->lines_matched = matched_lines; //match_new->hits = hits; do /*Check if there is already a list for this line ranges */ @@ -201,6 +204,7 @@ void biggest_snippet(scan_data_t *scan) } } + /** * @brief Handler function to collect all file ids. * Will be executed for the ldb_fetch_recordset function in each iteration. See LDB documentation for more details. @@ -429,37 +433,10 @@ uint32_t compile_ranges(match_data_t *match) } int hits = 0; - /* Revise hits and decrease if needed */ - for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) - { - long from = match->matchmap_reg->range[i].from; - long to = match->matchmap_reg->range[i].to; - long delta = to - from; - - if (to < 1) - break; - - /* Ranges to be ignored (under min_match_lines) should decrease hits counter */ - if (delta < min_match_lines) - { - /* Single-line range decreases by 1, otherwise decrease by 2 (from and to) */ - reported_hits -= ((delta == 0) ? 
1 : 2); - } - - /* Exit if hits is below two */ - if (reported_hits < min_match_hits) - { - scanlog("Discarted ranges brings hits count to %u (MIN MATCH HITS: %d)\n", reported_hits, min_match_hits); - return reported_hits; - } - - //scanlog("compile_ranges #%d = %ld to %ld - OSS from: %d\n", i, from, to, match->matchmap_reg->range[i].oss_line); - } - /* Add tolerances and assemble line ranges */ ranges_sort(match->matchmap_reg->range, match->matchmap_reg->ranges_number); - /*if (debug_on) + if (debug_on) { scanlog("Accepted ranges (min lines range = %d):\n", min_match_lines); for (uint32_t i = 0; i < match->matchmap_reg->ranges_number; i++) @@ -468,7 +445,7 @@ uint32_t compile_ranges(match_data_t *match) scanlog(" %d = %ld to %ld - OSS from: %d\n", i, match->matchmap_reg->range[i].from,match->matchmap_reg->range[i].to, match->matchmap_reg->range[i].oss_line); } - }*/ + } matchmap_range *ranges = ranges_join_overlapping(match->matchmap_reg->range, match->matchmap_reg->ranges_number); @@ -483,7 +460,7 @@ uint32_t compile_ranges(match_data_t *match) } } - /*if (debug_on) + if (debug_on) { scanlog("Final ranges:\n"); for (uint32_t i = 0; i < MATCHMAP_RANGES; i++) @@ -491,7 +468,7 @@ uint32_t compile_ranges(match_data_t *match) if ( ranges[i].from && ranges[i].to) scanlog(" %d = %ld to %ld - OSS from: %d\n", i, ranges[i].from, ranges[i].to, ranges[i].oss_line); } - }*/ + } hits = ranges_assemble(ranges, line_ranges, oss_ranges); match->line_ranges = strdup(line_ranges); match->oss_ranges = strdup(oss_ranges); @@ -913,13 +890,7 @@ match_t ldb_scan_snippets(scan_data_t *scan) break; } - int pos = add_file_to_matchmap(scan, &map[i], &md5s[wfp_p], last_sector_aux, §or_max, &scan->matchmap_rank_by_sector[sector]); - /*if (pos >= 0 && debug_on) - { - char key_hex[(MD5_LEN+2)*2 + 1]; - ldb_bin_to_hex(&md5s[wfp_p], MD5_LEN+2, key_hex); - printf("%02x%02x%02x%02x,%s\n", map[i].wfp[0], map[i].wfp[1], map[i].wfp[2], map[i].wfp[3], key_hex); - }*/ + add_file_to_matchmap(scan, 
&map[i], &md5s[wfp_p], last_sector_aux, §or_max, &scan->matchmap_rank_by_sector[sector]); } } From 034701759e595f026662624d612d5e20fb4557e1 Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Wed, 2 Jul 2025 00:05:33 +0200 Subject: [PATCH 16/19] add p flag --- src/main.c | 2 +- src/snippets.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main.c b/src/main.c index 5328ec9..37c4c6d 100644 --- a/src/main.c +++ b/src/main.c @@ -327,7 +327,7 @@ int main(int argc, char **argv) int option; bool invalid_argument = false; char * ldb_db_name = NULL; - while ((option = getopt(argc, argv, ":T:s:b:B:c:k:a:F:l:n:M:N:wtvhedqH")) != -1) + while ((option = getopt(argc, argv, ":p:T:s:b:B:c:k:a:F:l:n:M:N:wtvhedqH")) != -1) { /* Check valid alpha is entered */ if (optarg) diff --git a/src/snippets.c b/src/snippets.c index c5ca57a..3109445 100644 --- a/src/snippets.c +++ b/src/snippets.c @@ -701,6 +701,8 @@ int add_file_to_matchmap(scan_data_t *scan, matchmap_entry_t *item, uint8_t *md5 */ match_t ldb_scan_snippets(scan_data_t *scan) { + if (!ldb_table_exists(oss_wfp.db, oss_wfp.table)) //skip purl if the table is not present + return MATCH_NONE; scanlog("ldb_scan_snippets\n"); if (!scan->hash_count) From 41feee1a110fc3b930a0f85c99d2137687f1defe Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Tue, 5 Aug 2025 13:51:06 +0200 Subject: [PATCH 17/19] accept more than one path for file md5 when rebuilding a project --- src/pivot.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/pivot.c b/src/pivot.c index be26b89..39b1fbb 100644 --- a/src/pivot.c +++ b/src/pivot.c @@ -7,8 +7,9 @@ struct out_buffer_s { }; struct get_path_s { - char * path; + char **paths; uint8_t * url_key; + int paths_index; }; bool get_path(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) @@ -34,7 +35,9 @@ bool get_file_path_hash(struct ldb_table * table, uint8_t *key, uint8_t *subkey, 
uint8_t * path_key = &data[table->key_ln]; char * path = NULL; fetch_recordset(oss_path, path_key, get_path, (void *)&path); - get_path_url->path = path; + get_path_url->paths = realloc(get_path_url->paths, (get_path_url->paths_index + 1) * sizeof(char*)); + get_path_url->paths[get_path_url->paths_index] = path; + get_path_url->paths_index++; return true; } @@ -42,21 +45,22 @@ bool get_file_path_hash(struct ldb_table * table, uint8_t *key, uint8_t *subkey, bool get_project_hashes(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { uint8_t * file_key = data; - struct get_path_s get_path = {.url_key = key, .path = NULL}; + struct get_path_s get_path = {.url_key = key, .paths = NULL, .paths_index = 0}; char key_hex[17]; ldb_bin_to_hex(file_key,table->key_ln,key_hex); fetch_recordset(oss_file, file_key, get_file_path_hash, (void *)&get_path); char * output = ptr; char * line = NULL; - if (get_path.path) + for (int i = 0; i < get_path.paths_index; i++) { - asprintf(&line, "%s,%s\n", key_hex, get_path.path); + asprintf(&line, "%s,%s\n", key_hex, get_path.paths[i]); + free(get_path.paths[i]); strcat(output, line); free(line); } - free(get_path.path); + free(get_path.paths); return false; } From 2292e5a8f4f5f6223ec06e6cbb749f215389fe27 Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Wed, 6 Aug 2025 02:57:31 +0200 Subject: [PATCH 18/19] support MD5 as url hash. 
Update help --- inc/file.h | 2 +- inc/match.h | 1 - src/file.c | 29 +++++++++++++++++++++++++++++ src/help.c | 1 + src/match.c | 18 ------------------ src/pivot.c | 42 +++++++++++++++++++++++++----------------- 6 files changed, 56 insertions(+), 37 deletions(-) diff --git a/inc/file.h b/inc/file.h index 0c9d35f..0c06300 100644 --- a/inc/file.h +++ b/inc/file.h @@ -12,5 +12,5 @@ bool is_dir(char *path); void get_file_md5(char *filepath, uint8_t *md5_result); bool count_all_files(uint8_t *key, uint8_t *subkey, int subkey_ln, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr); char *get_file_extension(uint8_t *md5); - +char * path_query(uint8_t * file_id); #endif diff --git a/inc/match.h b/inc/match.h index d36c8cf..af920dc 100644 --- a/inc/match.h +++ b/inc/match.h @@ -36,5 +36,4 @@ void compile_matches(scan_data_t *scan); match_list_t * match_select_m_best(scan_data_t * scan); match_list_t * match_select_m_component_best(scan_data_t * scan); bool component_from_file(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *raw_data, uint32_t datalen, int iteration, void *ptr); - #endif diff --git a/src/file.c b/src/file.c index 35a9e0d..7525237 100644 --- a/src/file.c +++ b/src/file.c @@ -260,3 +260,32 @@ char *get_file_extension(uint8_t *md5) fetch_recordset(oss_file, md5, get_first_file, out); return out; } + +static bool path_query_handler(struct ldb_table * table, uint8_t * key, uint8_t * subkey, uint8_t * data, uint32_t datalen, int record_number, void * ptr) +{ + char **path = ptr; + /* Decrypt data */ + char * decrypted = decrypt_data(data, datalen, *table, key, subkey); + if (!decrypted || !*decrypted) + return false; + + *path = decrypted; + return true; +} +/** + * @brief Get the file path from the path table. 
+ * @param md5 input path md5 + * @return string with the path + */ +char * path_query(uint8_t * file_id) +{ + char * path = NULL; + if (!path_table_present) + { + scanlog("path_query: path table must be present to use this query\n"); + return NULL; + } + + fetch_recordset(oss_path, file_id, path_query_handler, (void *) &path); + return path; +} \ No newline at end of file diff --git a/src/help.c b/src/help.c index 15772c7..712fc7c 100644 --- a/src/help.c +++ b/src/help.c @@ -56,6 +56,7 @@ Configuration:\n\ -c HINT Provide a component HINT to influence scan results.\n\ -k KEY Displays contents of file KEY from MZ sources archive.\n\ -l LICENSE Displays OSADL metadata for the provided SPDX license ID.\n\ +-p URL_HASH Returns a list with the md5 and path for each project file (pivot table is requeried).\n\ \n\ Options:\n\ -t Run engine performance tests.\n\ diff --git a/src/match.c b/src/match.c index deb8fe2..b2de677 100644 --- a/src/match.c +++ b/src/match.c @@ -436,24 +436,6 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, return true; } -bool path_query_handler(struct ldb_table * table, uint8_t * key, uint8_t * subkey, uint8_t * data, uint32_t datalen, int record_number, void * ptr) -{ - char **path = ptr; - /* Decrypt data */ - char * decrypted = decrypt_data(data, datalen, *table, key, subkey); - if (!decrypted || !*decrypted) - return false; - - *path = decrypted; - return true; -} -static char * path_query(uint8_t * file_id) -{ - char * path = NULL; - fetch_recordset(oss_path, file_id, path_query_handler, (void *) &path); - return path; -} - /** * @brief Load componentes for a match processing the file recordset list. * For each file in the recordset we will query for the oldest url in the url table. 
diff --git a/src/pivot.c b/src/pivot.c index 39b1fbb..ecc2584 100644 --- a/src/pivot.c +++ b/src/pivot.c @@ -1,6 +1,8 @@ #include "scanoss.h" #include #include "decrypt.h" +#include "debug.h" +#include "file.h" struct out_buffer_s { char * buffer; int pos; @@ -12,16 +14,6 @@ struct get_path_s { int paths_index; }; -bool get_path(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) -{ - char * path = decrypt_data(data, datalen, *table, key, subkey); - if (!path) { - return false; - } - char ** out = (char**) ptr; - *out = path; - return true; -} bool get_file_path_hash(struct ldb_table * table, uint8_t *key, uint8_t *subkey, uint8_t *data, uint32_t datalen, int iteration, void *ptr) { @@ -32,11 +24,21 @@ bool get_file_path_hash(struct ldb_table * table, uint8_t *key, uint8_t *subkey, if (memcmp(get_path_url->url_key, data, table->key_ln)) return false; - uint8_t * path_key = &data[table->key_ln]; - char * path = NULL; - fetch_recordset(oss_path, path_key, get_path, (void *)&path); + char * decrypted = NULL; + + if (path_table_present) + { + decrypted = path_query(&data[table->key_ln]); + } + else + { + /* Decrypt data */ + decrypted = decrypt_data(data, datalen, *table, key, subkey); + } + + get_path_url->paths = realloc(get_path_url->paths, (get_path_url->paths_index + 1) * sizeof(char*)); - get_path_url->paths[get_path_url->paths_index] = path; + get_path_url->paths[get_path_url->paths_index] = decrypted; get_path_url->paths_index++; return true; } @@ -46,7 +48,7 @@ bool get_project_hashes(struct ldb_table * table, uint8_t *key, uint8_t *subkey, { uint8_t * file_key = data; struct get_path_s get_path = {.url_key = key, .paths = NULL, .paths_index = 0}; - char key_hex[17]; + char key_hex[oss_url.key_ln*2+1]; ldb_bin_to_hex(file_key,table->key_ln,key_hex); fetch_recordset(oss_file, file_key, get_file_path_hash, (void *)&get_path); @@ -66,8 +68,14 @@ bool get_project_hashes(struct ldb_table * table, 
uint8_t *key, uint8_t *subkey, void get_project_files(char * url_key_hex) { - uint8_t url_key[8]; - ldb_hex_to_bin(url_key_hex, 16, url_key); + uint8_t url_key[oss_url.key_ln]; + scanlog("Reconstructing project structure for url %s\n",url_key_hex); + if (!ldb_table_exists(oss_pivot.db, oss_pivot.table)) + { + printf("the pivot table must be present to use this functionality\n"); + exit(EXIT_FAILURE); + } + ldb_hex_to_bin(url_key_hex, oss_url.key_ln*2, url_key); char * out = calloc(1,1024*1024*500); fetch_recordset(oss_pivot, url_key, get_project_hashes, (void *)out); printf("%s", out); From c9ea2323246bdf77d621acc200c10397a47bbeff Mon Sep 17 00:00:00 2001 From: coresoftware dev Date: Wed, 6 Aug 2025 04:03:35 +0200 Subject: [PATCH 19/19] remove duplicated code --- src/match.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/match.c b/src/match.c index f0b30ee..83baefa 100644 --- a/src/match.c +++ b/src/match.c @@ -436,24 +436,6 @@ bool add_component_from_urlid(component_list_t *component_list, uint8_t *url_id, return true; } -bool path_query_handler(struct ldb_table * table, uint8_t * key, uint8_t * subkey, uint8_t * data, uint32_t datalen, int record_number, void * ptr) -{ - char **path = ptr; - /* Decrypt data */ - char * decrypted = decrypt_data(data, datalen, *table, key, subkey); - if (!decrypted || !*decrypted) - return false; - - *path = decrypted; - return true; -} -static char * path_query(uint8_t * file_id) -{ - char * path = NULL; - fetch_recordset(oss_path, file_id, path_query_handler, (void *) &path); - return path; -} - /** * @brief Load componentes for a match processing the file recordset list. * For each file in the recordset we will query for the oldest url in the url table.