From b8c2ca4e271e3ddc413e33553c81d147154849d9 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 14 Jul 2025 15:40:59 -0400 Subject: [PATCH 01/67] deduplicate convert_type_of_value Signed-off-by: Adrian Edwards --- augur/application/config.py | 30 +----------------------------- augur/application/db/lib.py | 31 +------------------------------ augur/application/util.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 59 deletions(-) diff --git a/augur/application/config.py b/augur/application/config.py index e3e93302eb..014c8ae6a0 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -5,6 +5,7 @@ import os from augur.application.db.models import Config from augur.application.db.util import execute_session_query +from augur.application.util import convert_type_of_value def get_development_flag_from_config(): @@ -109,35 +110,6 @@ def get_development_flag(): } -def convert_type_of_value(config_dict, logger=None): - - data_type = config_dict["type"] - - if data_type == "str" or data_type is None: - return config_dict - - elif data_type == "int": - config_dict["value"] = int(config_dict["value"]) - - elif data_type == "bool": - value = config_dict["value"] - - if value.lower() == "false": - config_dict["value"] = False - else: - config_dict["value"] = True - - elif data_type == "float": - config_dict["value"] = float(config_dict["value"]) - - else: - if logger: - logger.error(f"Need to add support for {data_type} types to config") - else: - print(f"Need to add support for {data_type} types to config") - - return config_dict - class AugurConfig(): from augur.application.db.session import DatabaseSession diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index b4004d7734..cb6bc283e5 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -14,40 +14,11 @@ from augur.tasks.util.collection_state import CollectionState from augur.application.db import get_session, get_engine from 
augur.application.db.util import execute_session_query +from augur.application.util import convert_type_of_value from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts logger = logging.getLogger("db_lib") -def convert_type_of_value(config_dict, logger=None): - - - data_type = config_dict["type"] - - if data_type == "str" or data_type is None: - return config_dict - - if data_type == "int": - config_dict["value"] = int(config_dict["value"]) - - elif data_type == "bool": - value = config_dict["value"] - - if value.lower() == "false": - config_dict["value"] = False - else: - config_dict["value"] = True - - elif data_type == "float": - config_dict["value"] = float(config_dict["value"]) - - else: - if logger: - logger.error(f"Need to add support for {data_type} types to config") - else: - print(f"Need to add support for {data_type} types to config") - - return config_dict - def get_section(section_name) -> dict: """Get a section of data from the config. 
diff --git a/augur/application/util.py b/augur/application/util.py index 03e591df98..fa5a63d13d 100644 --- a/augur/application/util.py +++ b/augur/application/util.py @@ -25,3 +25,33 @@ def get_all_repos_count(**kwargs): result = controller.get_repo_count(source="all", **kwargs) return result + + +def convert_type_of_value(config_dict, logger=None): + + data_type = config_dict["type"] + + if data_type == "str" or data_type is None: + return config_dict + + elif data_type == "int": + config_dict["value"] = int(config_dict["value"]) + + elif data_type == "bool": + value = config_dict["value"] + + if value.lower() == "false": + config_dict["value"] = False + else: + config_dict["value"] = True + + elif data_type == "float": + config_dict["value"] = float(config_dict["value"]) + + else: + if logger: + logger.error(f"Need to add support for {data_type} types to config") + else: + print(f"Need to add support for {data_type} types to config") + + return config_dict \ No newline at end of file From b289d662ab08588a022b141b091d79b2cb9d1de3 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 23 Jul 2025 16:22:05 -0400 Subject: [PATCH 02/67] move to db.util Signed-off-by: Adrian Edwards --- augur/application/config.py | 3 +-- augur/application/db/lib.py | 3 +-- augur/application/db/util.py | 30 ++++++++++++++++++++++++++++++ augur/application/util.py | 32 +------------------------------- 4 files changed, 33 insertions(+), 35 deletions(-) diff --git a/augur/application/config.py b/augur/application/config.py index 014c8ae6a0..7ace2befd3 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -4,8 +4,7 @@ from typing import List, Any, Optional import os from augur.application.db.models import Config -from augur.application.db.util import execute_session_query -from augur.application.util import convert_type_of_value +from augur.application.db.util import execute_session_query, convert_type_of_value def get_development_flag_from_config(): diff --git 
a/augur/application/db/lib.py b/augur/application/db/lib.py index cb6bc283e5..5bec1dc8ad 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -13,8 +13,7 @@ from augur.application.db.models import Config, Repo, Commit, WorkerOauth, Issue, PullRequest, PullRequestReview, ContributorsAlias,UnresolvedCommitEmail, Contributor, CollectionStatus, UserGroup, RepoGroup from augur.tasks.util.collection_state import CollectionState from augur.application.db import get_session, get_engine -from augur.application.db.util import execute_session_query -from augur.application.util import convert_type_of_value +from augur.application.db.util import execute_session_query, convert_type_of_value from augur.application.db.session import remove_duplicates_by_uniques, remove_null_characters_from_list_of_dicts logger = logging.getLogger("db_lib") diff --git a/augur/application/db/util.py b/augur/application/db/util.py index 9fa49ab00d..81f24ea6dd 100644 --- a/augur/application/db/util.py +++ b/augur/application/db/util.py @@ -58,3 +58,33 @@ def convert_orm_list_to_dict_list(result): return new_list + + +def convert_type_of_value(config_dict, logger=None): + + data_type = config_dict["type"] + + if data_type == "str" or data_type is None: + return config_dict + + elif data_type == "int": + config_dict["value"] = int(config_dict["value"]) + + elif data_type == "bool": + value = config_dict["value"] + + if value.lower() == "false": + config_dict["value"] = False + else: + config_dict["value"] = True + + elif data_type == "float": + config_dict["value"] = float(config_dict["value"]) + + else: + if logger: + logger.error(f"Need to add support for {data_type} types to config") + else: + print(f"Need to add support for {data_type} types to config") + + return config_dict \ No newline at end of file diff --git a/augur/application/util.py b/augur/application/util.py index fa5a63d13d..af11d7d367 100644 --- a/augur/application/util.py +++ b/augur/application/util.py @@ 
-24,34 +24,4 @@ def get_all_repos_count(**kwargs): result = controller.get_repo_count(source="all", **kwargs) - return result - - -def convert_type_of_value(config_dict, logger=None): - - data_type = config_dict["type"] - - if data_type == "str" or data_type is None: - return config_dict - - elif data_type == "int": - config_dict["value"] = int(config_dict["value"]) - - elif data_type == "bool": - value = config_dict["value"] - - if value.lower() == "false": - config_dict["value"] = False - else: - config_dict["value"] = True - - elif data_type == "float": - config_dict["value"] = float(config_dict["value"]) - - else: - if logger: - logger.error(f"Need to add support for {data_type} types to config") - else: - print(f"Need to add support for {data_type} types to config") - - return config_dict \ No newline at end of file + return result \ No newline at end of file From c3b08cb870286c5ace0ffb94fd4d4daa13bd1f19 Mon Sep 17 00:00:00 2001 From: mohsinm-dev Date: Sun, 27 Jul 2025 19:04:50 +0500 Subject: [PATCH 03/67] fix: resolve UniqueViolation error in GitHub releases collection Fix for GitHub Issue #3194 where releases collection was failing with psycopg2.errors.UniqueViolation on releases_pkey constraint. Root cause: GitHub API returns release IDs with trailing spaces that don't match existing trimmed database records. 
Changes: - Add str().strip() to release_id processing in get_release_inf() - Enhance duplicate detection in insert_release() with proper trimming - Add early duplicate detection to prevent unnecessary database operations Signed-off-by: mohsinm-dev --- augur/tasks/github/releases/core.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index 239b83dce9..255b34cf89 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -23,7 +23,7 @@ def get_release_inf(repo_id, release, tag_only): release_inf = { - 'release_id': release['id'], + 'release_id': str(release['id']).strip(), 'repo_id': repo_id, 'release_name': release['name'], 'release_description': release['description'] if release['description'] is not None else '', @@ -51,7 +51,7 @@ def get_release_inf(repo_id, release, tag_only): author = "nobody" date = "" release_inf = { - 'release_id': release['id'], + 'release_id': str(release['id']).strip(), 'repo_id': repo_id, 'release_name': release['name'], 'release_description': 'tag_only', @@ -67,17 +67,23 @@ def get_release_inf(repo_id, release, tag_only): def insert_release(session, logger, repo_id, owner, release, tag_only = False): - # Get current table values + # Get current table values with proper trimming logger.info('Getting release table values\n') query = session.query(Release.release_id).filter(Release.repo_id == repo_id) - release_id_data = execute_session_query(query, 'all')#pd.read_sql(release_id_data_sql, self.db, params={'repo_id': repo_id}) - release_id_data = [str(r_id).strip() for r_id in release_id_data]#release_id_data.apply(lambda x: x.str.strip()) + release_id_data = execute_session_query(query, 'all') + existing_release_ids = {str(r_id).strip() for r_id in release_id_data} # Put all data together in format of the table logger.info(f'Inserting release for repo with id:{repo_id}, owner:{owner}, release 
name:{release["name"]}\n') release_inf = get_release_inf(repo_id, release, tag_only) + + # Check if release already exists (with proper trimming) + new_release_id = str(release_inf['release_id']).strip() + if new_release_id in existing_release_ids: + logger.info(f"Release {new_release_id} already exists for repo {repo_id}, skipping insertion\n") + return - #Do an upsert + #Do an upsert with string field cleaning string_fields = ["release_name", "release_description", "release_author", "release_tag_name"] bulk_insert_dicts(logger, release_inf,Release,['release_id'], string_fields=string_fields) From 8539825bb217c388735dfa1bc43d25dc4cee0d51 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 27 Aug 2025 10:26:01 -0500 Subject: [PATCH 04/67] add date filter to contributer resolution logic queries Signed-off-by: Isaac Milarsky --- augur/tasks/git/facade_tasks.py | 13 ++----------- .../git/util/facade_worker/facade_worker/config.py | 11 +++++++++++ .../facade_worker/facade_worker/utilitymethods.py | 4 ++-- augur/tasks/github/facade_github/tasks.py | 13 ++++++++++--- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index ce03524e0f..f087c9272a 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -112,8 +112,6 @@ def trim_commits_post_analysis_facade_task(repo_git): repo = repo = get_repo_by_repo_git(repo_git) repo_id = repo.repo_id - start_date = facade_helper.get_setting('start_date') - logger.info(f"Generating sequence for repo {repo_id}") repo = get_repo_by_repo_git(repo_git) @@ -123,7 +121,7 @@ def trim_commits_post_analysis_facade_task(repo_git): repo_loc = (f"{absolute_path}/.git") # Grab the parents of HEAD - parent_commits = get_parent_commits_set(repo_loc, start_date) + parent_commits = get_parent_commits_set(repo_loc) # Grab the existing commits from the database existing_commits = get_existing_commits_set(repo_id) @@ -237,7 +235,7 @@ def 
analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: repo = get_repo_by_repo_git(repo_git) repo_id = repo.repo_id - start_date = facade_helper.get_setting('start_date') + start_date = facade_helper.get_last_collected_commit_date(repo_id)#.get_setting('start_date') logger.info(f"Generating sequence for repo {repo_id}") @@ -438,11 +436,6 @@ def generate_analysis_sequence(logger,repo_git, facade_helper): analysis_sequence = [] - #repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) - #repos = fetchall_data_from_sql_text(repo_list) - - start_date = facade_helper.get_setting('start_date') - #repo_ids = [repo['repo_id'] for repo in repos] #repo_id = repo_ids.pop(0) @@ -473,8 +466,6 @@ def facade_phase(repo_git, full_collection): #repo_list = s.sql.text("""SELECT repo_id,repo_group_id,repo_path,repo_name FROM repo WHERE repo_git=:value""").bindparams(value=repo_git) #repos = fetchall_data_from_sql_text(repo_list) - start_date = facade_helper.get_setting('start_date') - #repo_ids = [repo['repo_id'] for repo in repos] #repo_id = repo_ids.pop(0) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index c62034a94e..b65ff7bb69 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -244,6 +244,17 @@ def insert_or_update_data(self, query, **bind_args)-> None: return def inc_repos_processed(self): self.repos_processed += 1 + + def get_last_collected_commit_date(self,repo_id): + commit_date_query = s.sql.text(""" + SELECT cmt_committer_timestamp FROM commits + WHERE repo_id=:repo_id + ORDER BY data_collection_date DESC + LIMIT 1; + """).bindparams(repo_id=repo_id) + + result = execute_sql(commit_date_query).fetchone() + return result[0] """ class FacadeConfig: diff --git 
a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py index caae6c02ba..c06614ac7d 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/utilitymethods.py @@ -105,10 +105,10 @@ def get_absolute_repo_path(repo_base_dir, repo_id, repo_path,repo_name): return f"{repo_base_dir}{repo_id}-{repo_path}/{repo_name}" -def get_parent_commits_set(absolute_repo_path, start_date): +def get_parent_commits_set(absolute_repo_path): parents = subprocess.Popen(["git --git-dir %s log --ignore-missing " - "--pretty=format:'%%H' --since=%s" % (absolute_repo_path,start_date)], + "--pretty=format:'%%H'" % (absolute_repo_path)], stdout=subprocess.PIPE, shell=True) parent_commits = set(parents.stdout.read().decode("utf-8",errors="ignore").split(os.linesep)) diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 1b11f98223..26a01f21ae 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -198,6 +198,10 @@ def insert_facade_contributors(self, repo_git): logger = logging.getLogger(insert_facade_contributors.__name__) repo = get_repo_by_repo_git(repo_git) repo_id = repo.repo_id + facade_helper = FacadeHelper(logger) + + collection_status = repo.collection_status[0] + last_collected_date = collection_status.facade_data_last_collected # Get all of the commit data's emails and names from the commit table that do not appear # in the contributors table or the contributors_aliases table. 
@@ -214,6 +218,7 @@ def insert_facade_contributors(self, repo_git): commits WHERE commits.repo_id = :repo_id + AND (:since_date is NULL OR commits.data_collection_date > :since_date) AND (NOT EXISTS ( SELECT contributors.cntrb_canonical FROM contributors WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email ) or NOT EXISTS ( SELECT contributors_aliases.alias_email from contributors_aliases where contributors_aliases.alias_email = commits.cmt_author_raw_email) AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name )) @@ -231,6 +236,7 @@ def insert_facade_contributors(self, repo_git): commits WHERE commits.repo_id = :repo_id + AND (:since_date is NULL OR commits.data_collection_date > :since_date) AND EXISTS ( SELECT unresolved_commit_emails.email FROM unresolved_commit_emails WHERE unresolved_commit_emails.email = commits.cmt_author_raw_email ) AND ( commits.cmt_author_name ) IN ( SELECT C.cmt_author_name FROM commits AS C WHERE C.repo_id = :repo_id GROUP BY C.cmt_author_name ) GROUP BY @@ -239,7 +245,7 @@ def insert_facade_contributors(self, repo_git): commits.cmt_author_raw_email ORDER BY hash - """).bindparams(repo_id=repo_id) + """).bindparams(repo_id=repo_id,since_date=last_collected_date) #Execute statement with session. result = execute_sql(new_contrib_sql) @@ -257,7 +263,6 @@ def insert_facade_contributors(self, repo_git): logger.debug("DEBUG: Got through the new_contribs") - facade_helper = FacadeHelper(logger) # sql query used to find corresponding cntrb_id's of emails found in the contributor's table # i.e., if a contributor already exists, we use it! 
resolve_email_to_cntrb_id_sql = s.sql.text(""" @@ -271,6 +276,7 @@ def insert_facade_contributors(self, repo_git): commits WHERE contributors.cntrb_canonical = commits.cmt_author_raw_email + AND (:since_date is NULL OR commits.data_collection_date > :since_date) AND commits.repo_id = :repo_id UNION SELECT DISTINCT @@ -286,7 +292,8 @@ def insert_facade_contributors(self, repo_git): contributors_aliases.alias_email = commits.cmt_author_raw_email AND contributors.cntrb_id = contributors_aliases.cntrb_id AND commits.repo_id = :repo_id - """).bindparams(repo_id=repo_id) + AND (:since_date is NULL OR commits.data_collection_date > :since_date) + """).bindparams(repo_id=repo_id,since_date=last_collected_date) result = execute_sql(resolve_email_to_cntrb_id_sql) From fe85b3f5a27298eb807d42e0711bf5a51c045c3a Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 27 Aug 2025 10:28:28 -0500 Subject: [PATCH 05/67] dont use start date Signed-off-by: Isaac Milarsky --- augur/tasks/git/facade_tasks.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index f087c9272a..c049b4b831 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -235,8 +235,6 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: repo = get_repo_by_repo_git(repo_git) repo_id = repo.repo_id - start_date = facade_helper.get_last_collected_commit_date(repo_id)#.get_setting('start_date') - logger.info(f"Generating sequence for repo {repo_id}") repo = get_repo_by_repo_id(repo_id) @@ -246,7 +244,7 @@ def analyze_commits_in_parallel(repo_git, multithreaded: bool)-> None: repo_loc = (f"{absolute_path}/.git") # Grab the parents of HEAD - parent_commits = get_parent_commits_set(repo_loc, start_date) + parent_commits = get_parent_commits_set(repo_loc) # Grab the existing commits from the database existing_commits = get_existing_commits_set(repo_id) From 
b17d8f977dd80d994b4ffff1bb938584ef84226f Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Tue, 22 Jul 2025 18:30:47 -0500 Subject: [PATCH 06/67] Try a new version of the TZdata fix Signed-off-by: Ulincsys --- augur/application/db/lib.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 5bec1dc8ad..5fe0443967 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -225,18 +225,37 @@ def facade_bulk_insert_commits(logger, records): facade_bulk_insert_commits(logger, firsthalfRecords) facade_bulk_insert_commits(logger, secondhalfRecords) - elif len(records) == 1 and isinstance(e,DataError) and "time zone displacement" in f"{e}": + elif len(records) == 1: commit_record = records[0] #replace incomprehensible dates with epoch. #2021-10-11 11:57:46 -0500 # placeholder_date = "1970-01-01 00:00:15 -0500" placeholder_date = commit_record['author_timestamp'] + + postgres_valid_timezones = { + -1200, -1100, -1000, -930, -900, -800, -700, + -600, -500, -400, -300, -230, -200, -100, 000, + 100, 200, 300, 330, 400, 430, 500, 530, 545, 600, + 630, 700, 800, 845, 900, 930, 1000, 1030, 1100, 1200, + 1245, 1300, 1400 + } # Reconstruct timezone portion of the date string to UTC - placeholder_date = re.split("[-+]", placeholder_date) - placeholder_date.pop() - placeholder_date = "-".join(placeholder_date) + "+0000" + placeholder_date_segments = re.split(" ", placeholder_date) + tzdata = placeholder_date_segments.pop() + + if ":" in tzdata: + tzdata = tzdata.replace(":", "") + + if int(tzdata) not in postgres_valid_timezones: + tzdata = "+0000" + else: + raise e + + placeholder_date_segments.append(tzdata) + + placeholder_date = " ".join(placeholder_date_segments) #Check for improper utc timezone offset #UTC timezone offset should be between -14:00 and +14:00 From aeaf09f5d6debfcc9f4cbc5bae2cf04fa9ab55b5 Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Sat, 16 Aug 
2025 11:25:31 -0500 Subject: [PATCH 07/67] Fix KeyError in invalid timezone handling The keys on this dictionary are defined in: analyzecommit.generate_commit_record() - Update reference to use proper 'cmt_author_timestamp' key - Add warning log when replacing TZdata to show commit hash Signed-off-by: Ulincsys --- augur/application/db/lib.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 5fe0443967..a82c97dd66 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -231,7 +231,7 @@ def facade_bulk_insert_commits(logger, records): #2021-10-11 11:57:46 -0500 # placeholder_date = "1970-01-01 00:00:15 -0500" - placeholder_date = commit_record['author_timestamp'] + placeholder_date = commit_record['cmt_author_timestamp'] postgres_valid_timezones = { -1200, -1100, -1000, -930, -900, -800, -700, @@ -260,8 +260,11 @@ def facade_bulk_insert_commits(logger, records): #Check for improper utc timezone offset #UTC timezone offset should be between -14:00 and +14:00 - commit_record['author_timestamp'] = placeholder_date - commit_record['committer_timestamp'] = placeholder_date + # analyzecommit.generate_commit_record() defines the keys on the commit_record dictionary + commit_record['cmt_author_timestamp'] = placeholder_date + commit_record['cmt_committer_timestamp'] = placeholder_date + + logger.warning(f"commit with invalid timezone set to UTC: {commit_record['cmt_commit_hash']}") session.execute( s.insert(Commit), From ca922adc2e93b22f64b0705a9599c71df40edc2d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 4 Sep 2025 11:45:21 -0400 Subject: [PATCH 08/67] add a dockerfile that can build the empty DB container Signed-off-by: Adrian Edwards --- docker/empty_database/Dockerfile | 66 ++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 docker/empty_database/Dockerfile diff --git a/docker/empty_database/Dockerfile 
b/docker/empty_database/Dockerfile new file mode 100644 index 0000000000..d4e3122450 --- /dev/null +++ b/docker/empty_database/Dockerfile @@ -0,0 +1,66 @@ +from postgres:16 AS builder + +ENV DEBIAN_FRONTEND=noninteractive + +# Install uv (https://docs.astral.sh/uv/guides/integration/docker/#installing-uv) +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ +ENV UV_COMPILE_BYTECODE=1 +# The uv package cache will be on a cache volume, so can't be linked +ENV UV_LINK_MODE=copy +# Assert that the lockfile (uv.lock) is up-to-date. Use `uv lock` to update it +# manually if this fails the container build. +ENV UV_LOCKED=1 + +WORKDIR /augur + +COPY pyproject.toml . +COPY uv.lock . +COPY .python-version . + +# Install augur's dependencies early to take advantage of build cache +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --no-install-project --no-dev + +# Copy in the actual code +# The RUN lines below ensure that permissions are set correctly. +# This is the equivalent of the following docker --chmod flags, but done in a way that's compatible with podman. +# This can be removed once https://github.com/containers/buildah/issues/6066 or relevant equivalent is fixed +# - u=rw,u+X: user can read and write all files/dirs and execute directories +# - go=r,go+X: group and others can read all files/dirs and execute directories +COPY README.md . +COPY LICENSE . +COPY alembic.ini . +COPY augur/ augur/ +COPY metadata.py . 
+COPY scripts/ scripts/ + +RUN find augur -type d -exec chmod u=rwx,go=rx {} + && find augur -type f -exec chmod u=rw,go=r {} + + +RUN find scripts -exec chmod u=rwx,go=rx {} + + +# Install the main project +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --no-dev + +# We aren't going to activate the virtualenv (manually or via uv run), so we +# need to adjust the PATH +ENV PATH="/augur/.venv/bin:${PATH}" + +ENV POSTGRES_DB="augur" +ENV POSTGRES_USER="augur" +ENV POSTGRES_PASSWORD="augur" +ENV AUGUR_DB="postgresql+psycopg2://augur:augur@localhost:5432/augur" +# ENV PGDATA="/var/lib/postgresql/data" + +RUN set -e && \ + gosu postgres initdb && \ + gosu postgres pg_ctl -D "$PGDATA" -o "-c listen_addresses='localhost'" -w start && \ + gosu postgres psql -c "CREATE USER ${POSTGRES_USER} WITH SUPERUSER PASSWORD '${POSTGRES_PASSWORD}';" && \ + gosu postgres psql -c "CREATE DATABASE ${POSTGRES_DB} OWNER ${POSTGRES_USER};" && \ + augur db create-schema && \ + gosu postgres pg_ctl -D "$PGDATA" -m fast -w stop + + +FROM postgres:16 + +COPY --from=builder /var/lib/postgresql/data /var/lib/postgresql/data From 4631d9005f574040a248dc5f38e33e68e977a037 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 4 Sep 2025 11:47:46 -0400 Subject: [PATCH 09/67] add empty db to GHCR container builds Signed-off-by: Adrian Edwards --- .github/workflows/build_docker.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index d37d205da4..e7fa4b262c 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -300,6 +300,7 @@ jobs: - database - keyman - rabbitmq + - empty_database runs-on: ubuntu-latest steps: - name: Checkout repository From 6a6c76014c64a3a201bddcf4ab41a4a17c1f140a Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Tue, 9 Sep 2025 17:30:55 -0500 Subject: [PATCH 10/67] remove unused method Signed-off-by: Isaac Milarsky --- Makefile | 2 +- 
.../facade_worker/facade_worker/config.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 4fe926edc4..485ac13732 100644 --- a/Makefile +++ b/Makefile @@ -129,7 +129,7 @@ test-api: # .PHONY: uv uv: - @ command -v uv >/dev/null 2>&1 || { echo "Installing uv..."; pip install --user uv; } + @ command -v uv >/dev/null 2>&1 || { echo "Installing uv..."; pip3 install uv; } # # Documentation diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index b65ff7bb69..c75329aaff 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -245,16 +245,16 @@ def insert_or_update_data(self, query, **bind_args)-> None: def inc_repos_processed(self): self.repos_processed += 1 - def get_last_collected_commit_date(self,repo_id): - commit_date_query = s.sql.text(""" - SELECT cmt_committer_timestamp FROM commits - WHERE repo_id=:repo_id - ORDER BY data_collection_date DESC - LIMIT 1; - """).bindparams(repo_id=repo_id) - - result = execute_sql(commit_date_query).fetchone() - return result[0] +# def get_last_collected_commit_date(self,repo_id): +# commit_date_query = s.sql.text(""" +# SELECT cmt_committer_timestamp FROM commits +# WHERE repo_id=:repo_id +# ORDER BY data_collection_date DESC +# LIMIT 1; +# """).bindparams(repo_id=repo_id) +# +# result = execute_sql(commit_date_query).fetchone() +# return result[0] """ class FacadeConfig: From 33debfd1ba515c899608b29e6fb31bb2cd524c5c Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 15 Sep 2025 13:11:06 -0400 Subject: [PATCH 11/67] refactor send_messages to remove a almost entirely duplicate code path Signed-off-by: Adrian Edwards --- augur/tasks/util/collection_util.py | 62 ++++++++++++++--------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/augur/tasks/util/collection_util.py 
b/augur/tasks/util/collection_util.py index bed73bd120..44effbbf78 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -597,37 +597,33 @@ def send_messages(self): for repo_git, full_collection in col_hook.repo_list: repo = get_repo_by_repo_git(repo_git) + platform_name = "github" + # this needs to be here and not up a level since it should be set/reset for each repo. + # otherwise a gitlab repo would reset it and cause subsequent github repos to use gitlab phases. + phases = None if "github" in repo.repo_git: - augur_collection_sequence = [] - for job in col_hook.phases: - #Add the phase to the sequence in order as a celery task. - #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git, full_collection)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id - - self.logger.info(f"Setting github repo {col_hook.name} status to collecting for repo: {repo_git}") - - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id, col_hook.name - else: - if col_hook.gitlab_phases is not None: - - augur_collection_sequence = [] - for job in col_hook.gitlab_phases: - #Add the phase to the sequence in order as a celery task. 
- #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git, full_collection)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id - - self.logger.info(f"Setting gitlab repo {col_hook.name} status to collecting for repo: {repo_git}") - - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id, col_hook.name + phases = col_hook.phases + # use default platform name + + elif "gitlab" in repo.repo_git: + platform_name = "gitlab" + if col_hook.gitlab_phases is None: + return + phases = col_hook.gitlab_phases + + augur_collection_sequence = [] + for job in phases: + #Add the phase to the sequence in order as a celery task. + #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git, full_collection)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting {platform_name} repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name + From 83d0db1c76d4837d1fde84750692595d3a15c4e1 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 15 Sep 2025 16:37:03 -0400 Subject: [PATCH 12/67] refactor facade_phase to return the group itself so that celery can properly convert things into a chord Signed-off-by: Adrian Edwards --- augur/tasks/git/facade_tasks.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git 
a/augur/tasks/git/facade_tasks.py b/augur/tasks/git/facade_tasks.py index ce03524e0f..15537a4d1a 100644 --- a/augur/tasks/git/facade_tasks.py +++ b/augur/tasks/git/facade_tasks.py @@ -491,7 +491,6 @@ def facade_phase(repo_git, full_collection): #force_analysis = session.force_analysis run_facade_contributors = facade_helper.run_facade_contributors - facade_sequence = [] facade_core_collection = [] if not limited_run or (limited_run and pull_repos): @@ -509,14 +508,12 @@ def facade_phase(repo_git, full_collection): #These tasks need repos to be cloned by facade before they can work. - facade_sequence.append( - group( - chain(*facade_core_collection), - process_dependency_metrics.si(repo_git), - process_libyear_dependency_metrics.si(repo_git), - process_scc_value_metrics.si(repo_git) - ) + facade_sequence = group( + chain(*facade_core_collection), + process_dependency_metrics.si(repo_git), + process_libyear_dependency_metrics.si(repo_git), + process_scc_value_metrics.si(repo_git) ) logger.info(f"Facade sequence: {facade_sequence}") - return chain(*facade_sequence) \ No newline at end of file + return facade_sequence \ No newline at end of file From bc35b3802538656d451c0a4b6cdd017e1e550073 Mon Sep 17 00:00:00 2001 From: Sajal-Kulshreshtha Date: Tue, 9 Sep 2025 20:05:11 +0530 Subject: [PATCH 13/67] fix(cli): display timezone in output for GitHub key expiry (#3251) Signed-off-by: Sajal-Kulshreshtha --- augur/application/cli/github.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/augur/application/cli/github.py b/augur/application/cli/github.py index 4896bf05fb..bcc29597b6 100644 --- a/augur/application/cli/github.py +++ b/augur/application/cli/github.py @@ -88,9 +88,12 @@ def update_api_key(): engine.dispose() + + def epoch_to_local_time_with_am_pm(epoch): - local_time = datetime.fromtimestamp(epoch) - formatted_time = local_time.strftime('%I:%M %p') # This format includes the date as well + # Convert epoch to local time with timezone 
awareness + local_time = datetime.fromtimestamp(epoch).astimezone() + formatted_time = local_time.strftime('%Y-%m-%d %I:%M %p %Z (UTC%z)') return formatted_time From 60d3036d8d6b8b54d5f3668c892955fde60f190a Mon Sep 17 00:00:00 2001 From: Sajal-Kulshreshtha Date: Fri, 12 Sep 2025 03:15:36 +0530 Subject: [PATCH 14/67] removed day from formatted time Signed-off-by: Sajal-Kulshreshtha --- augur/application/cli/github.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/cli/github.py b/augur/application/cli/github.py index bcc29597b6..6dfca0f7d5 100644 --- a/augur/application/cli/github.py +++ b/augur/application/cli/github.py @@ -93,7 +93,7 @@ def update_api_key(): def epoch_to_local_time_with_am_pm(epoch): # Convert epoch to local time with timezone awareness local_time = datetime.fromtimestamp(epoch).astimezone() - formatted_time = local_time.strftime('%Y-%m-%d %I:%M %p %Z (UTC%z)') + formatted_time = local_time.strftime('%I:%M %p %Z (UTC%z)') return formatted_time From 259869925d161c94881f921230d0d2f599596058 Mon Sep 17 00:00:00 2001 From: Sajal-Kulshreshtha Date: Tue, 16 Sep 2025 02:27:21 +0530 Subject: [PATCH 15/67] removed extra lines Signed-off-by: Sajal-Kulshreshtha --- augur/application/cli/github.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/augur/application/cli/github.py b/augur/application/cli/github.py index 6dfca0f7d5..8716a1f623 100644 --- a/augur/application/cli/github.py +++ b/augur/application/cli/github.py @@ -87,16 +87,12 @@ def update_api_key(): engine.dispose() - - - def epoch_to_local_time_with_am_pm(epoch): # Convert epoch to local time with timezone awareness local_time = datetime.fromtimestamp(epoch).astimezone() formatted_time = local_time.strftime('%I:%M %p %Z (UTC%z)') return formatted_time - def find_duplicates(lst): counter = Counter(lst) return [item for item, count in counter.items() if count > 1] From 139d5bcc326210ab3f61e6dd94e6cc7f45d1b556 Mon Sep 17 00:00:00 2001 From: Sajal-Kulshreshtha 
Date: Tue, 16 Sep 2025 20:25:51 +0530 Subject: [PATCH 16/67] Align timestamp formatting with header layout Signed-off-by: Sajal-Kulshreshtha --- augur/application/cli/github.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/application/cli/github.py b/augur/application/cli/github.py index 8716a1f623..0fa1f2967c 100644 --- a/augur/application/cli/github.py +++ b/augur/application/cli/github.py @@ -56,7 +56,7 @@ def update_api_key(): core_reset_header = "Core Reset Time" graphql_request_header = "Graphql Requests Left" graphql_reset_header = "Graphql Reset Time" - print(f"{'Key'.center(40)} {core_request_header} {core_reset_header} {graphql_request_header} {graphql_reset_header}") + print(f"{'Key'.center(40)} {core_request_header} {core_reset_header.center(24)} {graphql_request_header} {graphql_reset_header.center(24)}") for key, core_key_data, graphql_key_data in valid_key_data: core_requests = str(core_key_data['requests_remaining']).center(len(core_request_header)) core_reset_time = str(epoch_to_local_time_with_am_pm(core_key_data["reset_epoch"])).center(len(core_reset_header)) @@ -90,7 +90,7 @@ def update_api_key(): def epoch_to_local_time_with_am_pm(epoch): # Convert epoch to local time with timezone awareness local_time = datetime.fromtimestamp(epoch).astimezone() - formatted_time = local_time.strftime('%I:%M %p %Z (UTC%z)') + formatted_time = local_time.strftime('%I:%M %p %Z (UTC%z)').center(24) return formatted_time def find_duplicates(lst): From b8e1c79072039ecbbdbf43815907a1844b4901c9 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 16 Sep 2025 15:08:02 -0400 Subject: [PATCH 17/67] bump rabbit image to 4.1 Signed-off-by: Adrian Edwards --- docker/rabbitmq/Dockerfile | 2 +- docker/rabbitmq/definitions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index ad86dfebb7..b4afc5345d 100644 --- a/docker/rabbitmq/Dockerfile +++ 
b/docker/rabbitmq/Dockerfile @@ -1,4 +1,4 @@ -FROM rabbitmq:3.12-management-alpine +FROM rabbitmq:4.1-management-alpine LABEL maintainer="574/augur@simplelogin.com" LABEL version="0.90.0" diff --git a/docker/rabbitmq/definitions.json b/docker/rabbitmq/definitions.json index 1cd8cc172e..d5fd9faef2 100644 --- a/docker/rabbitmq/definitions.json +++ b/docker/rabbitmq/definitions.json @@ -1,5 +1,5 @@ { - "rabbit_version": "3.12", + "rabbit_version": "4.1", "users": [ { "name": "", From db8843dacfec7dd251dafe88126ef42fc34bbc27 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 23 Sep 2025 11:21:38 -0500 Subject: [PATCH 18/67] commented out abused API endpoint Signed-off-by: Sean P. Goggins --- augur/api/routes/contributor_reports.py | 536 ++++++++++++------------ 1 file changed, 274 insertions(+), 262 deletions(-) diff --git a/augur/api/routes/contributor_reports.py b/augur/api/routes/contributor_reports.py index 711f321b3e..a72e4478c6 100644 --- a/augur/api/routes/contributor_reports.py +++ b/augur/api/routes/contributor_reports.py @@ -39,283 +39,295 @@ def new_contributor_data_collection(repo_id, required_contributions): rank_list.append(num) rank_tuple = tuple(rank_list) - contributor_query = salc.sql.text(f""" +##### + +## Commented out due to abuse. 
+ +##### + + + # contributor_query = salc.sql.text(f""" - SELECT * FROM ( - SELECT ID AS - cntrb_id, - A.created_at AS created_at, - date_part('month', A.created_at::DATE) AS month, - date_part('year', A.created_at::DATE) AS year, - A.repo_id, - repo_name, - full_name, - login, - ACTION, - rank() OVER ( - PARTITION BY id - ORDER BY A.created_at ASC - ) - FROM - ( - ( - SELECT - canonical_id AS ID, - created_at AS created_at, - repo_id, - 'issue_opened' AS ACTION, - contributors.cntrb_full_name AS full_name, - contributors.cntrb_login AS login - FROM - augur_data.issues - LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issues.reporter_id - LEFT OUTER JOIN ( - SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - cntrb_canonical AS canonical_email, - data_collection_date, - cntrb_id AS canonical_id - FROM augur_data.contributors - WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - WHERE - repo_id = {repo_id} - AND pull_request IS NULL - GROUP BY - canonical_id, - repo_id, - issues.created_at, - contributors.cntrb_full_name, - contributors.cntrb_login - ) UNION ALL - ( - SELECT - canonical_id AS ID, - TO_TIMESTAMP( cmt_author_date, 'YYYY-MM-DD' ) AS created_at, - repo_id, - 'commit' AS ACTION, - contributors.cntrb_full_name AS full_name, - contributors.cntrb_login AS login - FROM - augur_data.commits - LEFT OUTER JOIN augur_data.contributors ON cntrb_email = cmt_author_email - LEFT OUTER JOIN ( - SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - cntrb_canonical AS canonical_email, - data_collection_date, cntrb_id AS canonical_id - FROM augur_data.contributors - WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - WHERE - repo_id = {repo_id} - GROUP BY - repo_id, - canonical_email, - canonical_id, - commits.cmt_author_date, - 
contributors.cntrb_full_name, - contributors.cntrb_login - ) UNION ALL - ( - SELECT - message.cntrb_id AS ID, - created_at AS created_at, - commits.repo_id, - 'commit_comment' AS ACTION, - contributors.cntrb_full_name AS full_name, - contributors.cntrb_login AS login + # SELECT * FROM ( + # SELECT ID AS + # cntrb_id, + # A.created_at AS created_at, + # date_part('month', A.created_at::DATE) AS month, + # date_part('year', A.created_at::DATE) AS year, + # A.repo_id, + # repo_name, + # full_name, + # login, + # ACTION, + # rank() OVER ( + # PARTITION BY id + # ORDER BY A.created_at ASC + # ) + # FROM + # ( + # ( + # SELECT + # canonical_id AS ID, + # created_at AS created_at, + # repo_id, + # 'issue_opened' AS ACTION, + # contributors.cntrb_full_name AS full_name, + # contributors.cntrb_login AS login + # FROM + # augur_data.issues + # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issues.reporter_id + # LEFT OUTER JOIN ( + # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + # cntrb_canonical AS canonical_email, + # data_collection_date, + # cntrb_id AS canonical_id + # FROM augur_data.contributors + # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + # WHERE + # repo_id = {repo_id} + # AND pull_request IS NULL + # GROUP BY + # canonical_id, + # repo_id, + # issues.created_at, + # contributors.cntrb_full_name, + # contributors.cntrb_login + # ) UNION ALL + # ( + # SELECT + # canonical_id AS ID, + # TO_TIMESTAMP( cmt_author_date, 'YYYY-MM-DD' ) AS created_at, + # repo_id, + # 'commit' AS ACTION, + # contributors.cntrb_full_name AS full_name, + # contributors.cntrb_login AS login + # FROM + # augur_data.commits + # LEFT OUTER JOIN augur_data.contributors ON cntrb_email = cmt_author_email + # LEFT OUTER JOIN ( + # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + # cntrb_canonical AS canonical_email, + # data_collection_date, 
cntrb_id AS canonical_id + # FROM augur_data.contributors + # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + # WHERE + # repo_id = {repo_id} + # GROUP BY + # repo_id, + # canonical_email, + # canonical_id, + # commits.cmt_author_date, + # contributors.cntrb_full_name, + # contributors.cntrb_login + # ) UNION ALL + # ( + # SELECT + # message.cntrb_id AS ID, + # created_at AS created_at, + # commits.repo_id, + # 'commit_comment' AS ACTION, + # contributors.cntrb_full_name AS full_name, + # contributors.cntrb_login AS login - FROM - augur_data.commit_comment_ref, - augur_data.commits, - augur_data.message - LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id - LEFT OUTER JOIN ( - SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - cntrb_canonical AS canonical_email, - data_collection_date, cntrb_id AS canonical_id - FROM augur_data.contributors - WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - WHERE - commits.cmt_id = commit_comment_ref.cmt_id - AND commits.repo_id = {repo_id} - AND commit_comment_ref.msg_id = message.msg_id + # FROM + # augur_data.commit_comment_ref, + # augur_data.commits, + # augur_data.message + # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + # LEFT OUTER JOIN ( + # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + # cntrb_canonical AS canonical_email, + # data_collection_date, cntrb_id AS canonical_id + # FROM augur_data.contributors + # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + # WHERE + # commits.cmt_id = commit_comment_ref.cmt_id + # AND commits.repo_id = {repo_id} + # AND commit_comment_ref.msg_id = message.msg_id - GROUP 
BY - ID, - commits.repo_id, - commit_comment_ref.created_at, - contributors.cntrb_full_name, - contributors.cntrb_login - ) UNION ALL - ( - SELECT - issue_events.cntrb_id AS ID, - issue_events.created_at AS created_at, - issues.repo_id, - 'issue_closed' AS ACTION, - contributors.cntrb_full_name AS full_name, - contributors.cntrb_login AS login - FROM - augur_data.issues, - augur_data.issue_events - LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id - LEFT OUTER JOIN ( - SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - cntrb_canonical AS canonical_email, - data_collection_date, - cntrb_id AS canonical_id - FROM augur_data.contributors - WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - WHERE - issues.repo_id = {repo_id} - AND issues.issue_id = issue_events.issue_id - AND issues.pull_request IS NULL - AND issue_events.cntrb_id IS NOT NULL - AND ACTION = 'closed' - GROUP BY - issue_events.cntrb_id, - issues.repo_id, - issue_events.created_at, - contributors.cntrb_full_name, - contributors.cntrb_login - ) UNION ALL - ( - SELECT - pr_augur_contributor_id AS ID, - pr_created_at AS created_at, - pull_requests.repo_id, - 'open_pull_request' AS ACTION, - contributors.cntrb_full_name AS full_name, - contributors.cntrb_login AS login - FROM - augur_data.pull_requests - LEFT OUTER JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id - LEFT OUTER JOIN ( - SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - cntrb_canonical AS canonical_email, - data_collection_date, - cntrb_id AS canonical_id - FROM augur_data.contributors - WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - WHERE - pull_requests.repo_id = {repo_id} - GROUP BY - pull_requests.pr_augur_contributor_id, - 
pull_requests.repo_id, - pull_requests.pr_created_at, - contributors.cntrb_full_name, - contributors.cntrb_login - ) UNION ALL - ( - SELECT - message.cntrb_id AS ID, - msg_timestamp AS created_at, - pull_requests.repo_id as repo_id, - 'pull_request_comment' AS ACTION, - contributors.cntrb_full_name AS full_name, - contributors.cntrb_login AS login - FROM - augur_data.pull_requests, - augur_data.pull_request_message_ref, - augur_data.message - LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id - LEFT OUTER JOIN ( - SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - cntrb_canonical AS canonical_email, - data_collection_date, - cntrb_id AS canonical_id - FROM augur_data.contributors - WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - WHERE - pull_requests.repo_id = {repo_id} - AND pull_request_message_ref.pull_request_id = pull_requests.pull_request_id - AND pull_request_message_ref.msg_id = message.msg_id - GROUP BY - message.cntrb_id, - pull_requests.repo_id, - message.msg_timestamp, - contributors.cntrb_full_name, - contributors.cntrb_login - ) UNION ALL - ( - SELECT - issues.reporter_id AS ID, - msg_timestamp AS created_at, - issues.repo_id as repo_id, - 'issue_comment' AS ACTION, - contributors.cntrb_full_name AS full_name, - contributors.cntrb_login AS login - FROM - issues, - issue_message_ref, - message - LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id - LEFT OUTER JOIN ( - SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - cntrb_canonical AS canonical_email, - data_collection_date, - cntrb_id AS canonical_id - FROM augur_data.contributors - WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - WHERE - issues.repo_id = {repo_id} - AND issue_message_ref.msg_id = 
message.msg_id - AND issues.issue_id = issue_message_ref.issue_id - AND issues.pull_request_id = NULL - GROUP BY - issues.reporter_id, - issues.repo_id, - message.msg_timestamp, - contributors.cntrb_full_name, - contributors.cntrb_login - ) - ) A, - repo - WHERE - ID IS NOT NULL - AND A.repo_id = repo.repo_id - GROUP BY - A.ID, - A.repo_id, - A.ACTION, - A.created_at, - repo.repo_name, - A.full_name, - A.login - ORDER BY - cntrb_id - ) b - WHERE RANK IN {rank_tuple} - - """) + # GROUP BY + # ID, + # commits.repo_id, + # commit_comment_ref.created_at, + # contributors.cntrb_full_name, + # contributors.cntrb_login + # ) UNION ALL + # ( + # SELECT + # issue_events.cntrb_id AS ID, + # issue_events.created_at AS created_at, + # issues.repo_id, + # 'issue_closed' AS ACTION, + # contributors.cntrb_full_name AS full_name, + # contributors.cntrb_login AS login + # FROM + # augur_data.issues, + # augur_data.issue_events + # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id + # LEFT OUTER JOIN ( + # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + # cntrb_canonical AS canonical_email, + # data_collection_date, + # cntrb_id AS canonical_id + # FROM augur_data.contributors + # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + # WHERE + # issues.repo_id = {repo_id} + # AND issues.issue_id = issue_events.issue_id + # AND issues.pull_request IS NULL + # AND issue_events.cntrb_id IS NOT NULL + # AND ACTION = 'closed' + # GROUP BY + # issue_events.cntrb_id, + # issues.repo_id, + # issue_events.created_at, + # contributors.cntrb_full_name, + # contributors.cntrb_login + # ) UNION ALL + # ( + # SELECT + # pr_augur_contributor_id AS ID, + # pr_created_at AS created_at, + # pull_requests.repo_id, + # 'open_pull_request' AS ACTION, + # contributors.cntrb_full_name AS full_name, + # contributors.cntrb_login AS login + # FROM + # 
augur_data.pull_requests + # LEFT OUTER JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id + # LEFT OUTER JOIN ( + # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + # cntrb_canonical AS canonical_email, + # data_collection_date, + # cntrb_id AS canonical_id + # FROM augur_data.contributors + # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + # WHERE + # pull_requests.repo_id = {repo_id} + # GROUP BY + # pull_requests.pr_augur_contributor_id, + # pull_requests.repo_id, + # pull_requests.pr_created_at, + # contributors.cntrb_full_name, + # contributors.cntrb_login + # ) UNION ALL + # ( + # SELECT + # message.cntrb_id AS ID, + # msg_timestamp AS created_at, + # pull_requests.repo_id as repo_id, + # 'pull_request_comment' AS ACTION, + # contributors.cntrb_full_name AS full_name, + # contributors.cntrb_login AS login + # FROM + # augur_data.pull_requests, + # augur_data.pull_request_message_ref, + # augur_data.message + # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + # LEFT OUTER JOIN ( + # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + # cntrb_canonical AS canonical_email, + # data_collection_date, + # cntrb_id AS canonical_id + # FROM augur_data.contributors + # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + # WHERE + # pull_requests.repo_id = {repo_id} + # AND pull_request_message_ref.pull_request_id = pull_requests.pull_request_id + # AND pull_request_message_ref.msg_id = message.msg_id + # GROUP BY + # message.cntrb_id, + # pull_requests.repo_id, + # message.msg_timestamp, + # contributors.cntrb_full_name, + # contributors.cntrb_login + # ) UNION ALL + # ( + # SELECT + # issues.reporter_id AS ID, + # msg_timestamp AS created_at, + # 
issues.repo_id as repo_id, + # 'issue_comment' AS ACTION, + # contributors.cntrb_full_name AS full_name, + # contributors.cntrb_login AS login + # FROM + # issues, + # issue_message_ref, + # message + # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id + # LEFT OUTER JOIN ( + # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, + # cntrb_canonical AS canonical_email, + # data_collection_date, + # cntrb_id AS canonical_id + # FROM augur_data.contributors + # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical + # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical + # WHERE + # issues.repo_id = {repo_id} + # AND issue_message_ref.msg_id = message.msg_id + # AND issues.issue_id = issue_message_ref.issue_id + # AND issues.pull_request_id = NULL + # GROUP BY + # issues.reporter_id, + # issues.repo_id, + # message.msg_timestamp, + # contributors.cntrb_full_name, + # contributors.cntrb_login + # ) + # ) A, + # repo + # WHERE + # ID IS NOT NULL + # AND A.repo_id = repo.repo_id + # GROUP BY + # A.ID, + # A.repo_id, + # A.ACTION, + # A.created_at, + # repo.repo_name, + # A.full_name, + # A.login + # ORDER BY + # cntrb_id + # ) b + # WHERE RANK IN {rank_tuple} + + # """) + # contributor_query2 = (""" + + # select count(*) from augur_data.repo; + # """) with current_app.engine.connect() as conn: - df = pd.read_sql(contributor_query, conn) + df = pd.read_sql(contributor_query2, conn) - df = df.loc[~df['full_name'].str.contains('bot', na=False)] - df = df.loc[~df['login'].str.contains('bot', na=False)] + #df = df.loc[~df['full_name'].str.contains('bot', na=False)] + #df = df.loc[~df['login'].str.contains('bot', na=False)] - df = df.loc[~df['cntrb_id'].isin(df[df.duplicated(['cntrb_id', 'created_at', 'repo_id', 'rank'])]['cntrb_id'])] + #df = df.loc[~df['cntrb_id'].isin(df[df.duplicated(['cntrb_id', 'created_at', 'repo_id', 'rank'])]['cntrb_id'])] # add yearmonths to contributor - 
df[['month', 'year']] = df[['month', 'year']].astype(int).astype(str) - df['yearmonth'] = df['month'] + '/' + df['year'] - df['yearmonth'] = pd.to_datetime(df['yearmonth']) + #df[['month', 'year']] = df[['month', 'year']].astype(int).astype(str) + #df['yearmonth'] = df['month'] + '/' + df['year'] + #df['yearmonth'] = pd.to_datetime(df['yearmonth']) # add column with every value being one, so when the contributor df is concatenated # with the months df, the filler months won't be counted in the sums - df['new_contributors'] = 1 + #df['new_contributors'] = 1 # add quarters to contributor dataframe - df['month'] = df['month'].astype(int) - df['quarter'] = df.apply(lambda x: quarters(x['month'], x['year']), axis=1, result_type='reduce') - df['quarter'] = pd.to_datetime(df['quarter']) + #df['month'] = df['month'].astype(int) + #df['quarter'] = df.apply(lambda x: quarters(x['month'], x['year']), axis=1, result_type='reduce') + #df['quarter'] = pd.to_datetime(df['quarter']) + df = [1] return df def months_data_collection(start_date, end_date): From 5899e01112c18cd6880fd3a1f1a70d303fca1c0e Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 23 Sep 2025 13:04:52 -0400 Subject: [PATCH 19/67] bump celery and a related dependency Signed-off-by: Adrian Edwards --- pyproject.toml | 4 ++-- uv.lock | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6e78b2118f..c086babe25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,8 @@ dependencies = [ "bokeh==2.0.2", "boto3==1.17.57", "bs4==0.0.1", - "celery==5.2.7", - "click==8.0.3", + "celery~=5.5", + "click~=8.1", "cloudpickle>=0.2.2", "coloredlogs==15.0", "dask>=2021.6.2", diff --git a/uv.lock b/uv.lock index 66352a8f2e..1c74a61de7 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12'", @@ -244,8 +244,8 @@ requires-dist = 
[ { name = "bokeh", specifier = "==2.0.2" }, { name = "boto3", specifier = "==1.17.57" }, { name = "bs4", specifier = "==0.0.1" }, - { name = "celery", specifier = "==5.2.7" }, - { name = "click", specifier = "==8.0.3" }, + { name = "celery", specifier = "~=5.5" }, + { name = "click", specifier = "~=8.1" }, { name = "cloudpickle", specifier = ">=0.2.2" }, { name = "coloredlogs", specifier = "==15.0" }, { name = "dask", specifier = ">=2021.6.2" }, @@ -364,11 +364,11 @@ wheels = [ [[package]] name = "billiard" -version = "3.6.4.0" +version = "4.2.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/92/91/40de1901da8ec9eeb7c6a22143ba5d55d8aaa790761ca31342cedcd5c793/billiard-3.6.4.0.tar.gz", hash = "sha256:299de5a8da28a783d51b197d496bef4f1595dd023a93a4f59dde1886ae905547", size = 155303, upload-time = "2021-04-01T09:23:50.092Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/6a/1405343016bce8354b29d90aad6b0bf6485b5e60404516e4b9a3a9646cf0/billiard-4.2.2.tar.gz", hash = "sha256:e815017a062b714958463e07ba15981d802dc53d41c5b69d28c5a7c238f8ecf3", size = 155592, upload-time = "2025-09-20T14:44:40.456Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2b/89/0c43de91d4e52eaa7bd748771d417f6ac9e51e66b2f61928c2151bf65878/billiard-3.6.4.0-py3-none-any.whl", hash = "sha256:87103ea78fa6ab4d5c751c4909bcff74617d985de7fa8b672cf8618afd5a875b", size = 89472, upload-time = "2021-04-01T09:23:42.019Z" }, + { url = "https://files.pythonhosted.org/packages/a6/80/ef8dff49aae0e4430f81842f7403e14e0ca59db7bbaf7af41245b67c6b25/billiard-4.2.2-py3-none-any.whl", hash = "sha256:4bc05dcf0d1cc6addef470723aac2a6232f3c7ed7475b0b580473a9145829457", size = 86896, upload-time = "2025-09-20T14:44:39.157Z" }, ] [[package]] @@ -441,7 +441,7 @@ wheels = [ [[package]] name = "celery" -version = "5.2.7" +version = "5.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "billiard" }, @@ -450,12 
+450,12 @@ dependencies = [ { name = "click-plugins" }, { name = "click-repl" }, { name = "kombu" }, - { name = "pytz" }, + { name = "python-dateutil" }, { name = "vine" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ce/21/41a0028f6d610987c0839250357c1a00f351790b8a448c2eb323caa719ac/celery-5.2.7.tar.gz", hash = "sha256:fafbd82934d30f8a004f81e8f7a062e31413a23d444be8ee3326553915958c6d", size = 1474243, upload-time = "2022-05-29T12:58:03.046Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/7d/6c289f407d219ba36d8b384b42489ebdd0c84ce9c413875a8aae0c85f35b/celery-5.5.3.tar.gz", hash = "sha256:6c972ae7968c2b5281227f01c3a3f984037d21c5129d07bf3550cc2afc6b10a5", size = 1667144, upload-time = "2025-06-01T11:08:12.563Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/1d/99/21fe9d1829cab4fc77d18f89d0c4cbcfe754e95f8b8f4af64fe4997c442f/celery-5.2.7-py3-none-any.whl", hash = "sha256:138420c020cd58d6707e6257b6beda91fd39af7afde5d36c6334d175302c0e14", size = 405637, upload-time = "2022-05-29T12:57:59.911Z" }, + { url = "https://files.pythonhosted.org/packages/c9/af/0dcccc7fdcdf170f9a1585e5e96b6fb0ba1749ef6be8c89a6202284759bd/celery-5.5.3-py3-none-any.whl", hash = "sha256:0b5761a07057acee94694464ca482416b959568904c9dfa41ce8413a7d65d525", size = 438775, upload-time = "2025-06-01T11:08:09.94Z" }, ] [[package]] @@ -530,14 +530,14 @@ wheels = [ [[package]] name = "click" -version = "8.0.3" +version = "8.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f4/09/ad003f1e3428017d1c3da4ccc9547591703ffea548626f47ec74509c5824/click-8.0.3.tar.gz", hash = "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b", size = 329034, upload-time = "2021-10-10T18:07:33.001Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/46/61/de6cd827efad202d7057d93e0fed9294b96952e188f7384832791c7b2254/click-8.3.0.tar.gz", hash = "sha256:e7b8232224eba16f4ebe410c25ced9f7875cb5f3263ffc93cc3e8da705e229c4", size = 276943, upload-time = "2025-09-18T17:32:23.696Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/48/58/c8aa6a8e62cc75f39fee1092c45d6b6ba684122697d7ce7d53f64f98a129/click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3", size = 97516, upload-time = "2021-10-10T18:07:30.752Z" }, + { url = "https://files.pythonhosted.org/packages/db/d3/9dcc0f5797f070ec8edf30fbadfb200e71d9db6b84d211e3b2085a7589a0/click-8.3.0-py3-none-any.whl", hash = "sha256:9b9f285302c6e3064f4330c05f05b81945b2a39544279343e6e7c5f27a9baddc", size = 107295, upload-time = "2025-09-18T17:32:22.42Z" }, ] [[package]] From 493ddddfc2d063a2e2743281a2bb20889cba07a5 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 23 Sep 2025 17:46:03 -0500 Subject: [PATCH 20/67] updated Signed-off-by: Sean P. 
Goggins --- augur/api/routes/contributor_reports.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/api/routes/contributor_reports.py b/augur/api/routes/contributor_reports.py index a72e4478c6..6c107ed603 100644 --- a/augur/api/routes/contributor_reports.py +++ b/augur/api/routes/contributor_reports.py @@ -305,8 +305,8 @@ def new_contributor_data_collection(repo_id, required_contributions): # select count(*) from augur_data.repo; # """) - with current_app.engine.connect() as conn: - df = pd.read_sql(contributor_query2, conn) + #with current_app.engine.connect() as conn: + # df = pd.read_sql(contributor_query2, conn) #df = df.loc[~df['full_name'].str.contains('bot', na=False)] #df = df.loc[~df['login'].str.contains('bot', na=False)] From bcd58f56b4df646804accff4f0c7f516d138879b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 23 Sep 2025 18:46:19 -0400 Subject: [PATCH 21/67] Revert "Merge pull request #3271 from MoralCode/collection_util-duplicate-code" This reverts commit c8f22bba099e7538b102edda26369092b2c4fd23, reversing changes made to fefb48a171001a032ccd55a8706befb045f35c91. --- augur/tasks/util/collection_util.py | 62 +++++++++++++++-------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index 44effbbf78..bed73bd120 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -597,33 +597,37 @@ def send_messages(self): for repo_git, full_collection in col_hook.repo_list: repo = get_repo_by_repo_git(repo_git) - platform_name = "github" - # this needs to be here and not up a level since it should be set/reset for each repo. - # otherwise a gitlab repo would reset it and cause subsequent github repos to use gitlab phases. 
- phases = None if "github" in repo.repo_git: - phases = col_hook.phases - # use default platform name - - elif "gitlab" in repo.repo_git: - platform_name = "gitlab" - if col_hook.gitlab_phases is None: - return - phases = col_hook.gitlab_phases - - augur_collection_sequence = [] - for job in phases: - #Add the phase to the sequence in order as a celery task. - #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git, full_collection)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id - - self.logger.info(f"Setting {platform_name} repo {col_hook.name} status to collecting for repo: {repo_git}") - - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id, col_hook.name - + augur_collection_sequence = [] + for job in col_hook.phases: + #Add the phase to the sequence in order as a celery task. + #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git, full_collection)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting github repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name + else: + if col_hook.gitlab_phases is not None: + + augur_collection_sequence = [] + for job in col_hook.gitlab_phases: + #Add the phase to the sequence in order as a celery task. 
+ #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git, full_collection)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting gitlab repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name From b0bb3b80402ee5fcd84bec7334e58a41f9f5ec8a Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 23 Sep 2025 17:56:25 -0500 Subject: [PATCH 22/67] updated metadata for new version Signed-off-by: Sean P. Goggins --- README.md | 4 ++-- docker/backend/Dockerfile | 2 +- docker/database/Dockerfile | 2 +- docker/keyman/Dockerfile | 2 +- metadata.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9c7acddc65..bac449c3d8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.90.0 +# Augur NEW Release v0.90.3 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data - less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot), a public instance of 8Knot is available [here](https://metrix.chaoss.io) - this is tied to a public instance of [Augur](https://ai.chaoss.io). @@ -11,7 +11,7 @@ We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy o ## NEW RELEASE ALERT! **If you want to jump right in, the updated docker, docker-compose and bare metal installation instructions are available [here](docs/new-install.md)**. -Augur is now releasing a dramatically improved new version. 
It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.90.0). +Augur is now releasing a dramatically improved new version. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.90.3). - The `release` branch is a stable version of our new architecture, which features: diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index e627dc6681..0a05daf848 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -20,7 +20,7 @@ RUN go install github.com/ossf/scorecard/v5@v5.1.1 \ FROM python:3.11-slim-bullseye LABEL maintainer="outdoors@acm.org" -LABEL version="0.90.0" +LABEL version="0.90.3" ENV DEBIAN_FRONTEND=noninteractive ENV PATH="/usr/bin/:/usr/local/bin:/usr/lib:${PATH}" diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index 50e7653af2..6558fe44ec 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -2,7 +2,7 @@ FROM postgres:16 LABEL maintainer="outdoors@acm.org" -LABEL version="0.90.0" +LABEL version="0.90.3" ENV POSTGRES_DB="test" ENV POSTGRES_USER="augur" diff --git a/docker/keyman/Dockerfile b/docker/keyman/Dockerfile index ed77ef18d4..72c46ba225 100644 --- a/docker/keyman/Dockerfile +++ b/docker/keyman/Dockerfile @@ -1,7 +1,7 @@ FROM python:3.11.12-alpine LABEL maintainer="outdoors@acm.org" -LABEL version="0.90.0" +LABEL version="0.90.3" RUN pip install --no-cache-dir --upgrade pip diff --git a/metadata.py b/metadata.py index da181f6526..bc38e3fe34 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.90.0" -__release__ = "v0.90.0 (Trade Deadline)" +__version__ = "0.90.3" +__release__ = "v0.90.3 (Trade Deadline)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Sean Goggins, Brian Warner & Augurlabs 2025, Red Hat Software" From 
9ede8af92cf6b23d61d2b963189e3b57531954e7 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Wed, 1 Oct 2025 16:32:27 -0500 Subject: [PATCH 23/67] revert Signed-off-by: Isaac Milarsky --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 485ac13732..4fe926edc4 100644 --- a/Makefile +++ b/Makefile @@ -129,7 +129,7 @@ test-api: # .PHONY: uv uv: - @ command -v uv >/dev/null 2>&1 || { echo "Installing uv..."; pip3 install uv; } + @ command -v uv >/dev/null 2>&1 || { echo "Installing uv..."; pip install --user uv; } # # Documentation From f41e8f812f6966271a3d863ee84c77f3218da242 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 2 Oct 2025 12:38:06 -0500 Subject: [PATCH 24/67] add toggle to toggle full collection of contributor resolution Signed-off-by: Isaac Milarsky --- .../34_add_facade_config_full_recollect.py | 37 +++++++++++++++++++ .../facade_worker/facade_worker/config.py | 1 + augur/tasks/github/facade_github/tasks.py | 2 +- 3 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 augur/application/schema/alembic/versions/34_add_facade_config_full_recollect.py diff --git a/augur/application/schema/alembic/versions/34_add_facade_config_full_recollect.py b/augur/application/schema/alembic/versions/34_add_facade_config_full_recollect.py new file mode 100644 index 0000000000..b17a75eaa9 --- /dev/null +++ b/augur/application/schema/alembic/versions/34_add_facade_config_full_recollect.py @@ -0,0 +1,37 @@ +"""change config table to add toggle to force facade to recollect through all commits + +Revision ID: 34 +Revises: 33 +Create Date: 2025-10-02 12:45:57.486871 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql +from sqlalchemy.sql import text + +# revision identifiers, used by Alembic. +revision = '34' +down_revision = '33' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + #Add toggle for facade collection. + conn = op.get_bind() + + conn.execute(text(f""" + INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES ('Facade', 'facade_contributor_full_recollect', '{0}', 'int'); + """)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + conn = op.get_bind() + conn.execute(text(f""" + DELETE FROM "augur_operations"."config" WHERE section_name = 'Facade' AND setting_name = 'facade_contributor_full_recollect' AND type = 'int'; + """)) + # ### end Alembic commands ### diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index c75329aaff..488cdac3f4 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -127,6 +127,7 @@ def __init__(self,logger: Logger): self.rebuild_caches = worker_options["rebuild_caches"] self.multithreaded = worker_options["multithreaded"] self.create_xlsx_summary_files = worker_options["create_xlsx_summary_files"] + self.facade_contributor_full_recollect = worker_options["facade_contributor_full_recollect"] self.tool_source = "Facade" self.data_source = "Git Log" diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 26a01f21ae..3658ef7957 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -201,7 +201,7 @@ def insert_facade_contributors(self, repo_git): facade_helper = FacadeHelper(logger) collection_status = repo.collection_status[0] - last_collected_date = collection_status.facade_data_last_collected + last_collected_date = collection_status.facade_data_last_collected if not facade_helper.facade_contributor_full_recollect else None # Get all of the commit data's emails and names from the commit table that do not appear # in the 
contributors table or the contributors_aliases table. From c2e87ffd10605ae22a295524c95b102f4ba173f9 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Thu, 2 Oct 2025 13:27:29 -0500 Subject: [PATCH 25/67] amend method of getting collection status record Signed-off-by: Isaac Milarsky --- augur/tasks/github/facade_github/tasks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/augur/tasks/github/facade_github/tasks.py b/augur/tasks/github/facade_github/tasks.py index 3658ef7957..eff64df6ee 100644 --- a/augur/tasks/github/facade_github/tasks.py +++ b/augur/tasks/github/facade_github/tasks.py @@ -8,6 +8,7 @@ from augur.application.db.models import Contributor from augur.tasks.github.facade_github.core import * from augur.application.db.lib import execute_sql, get_contributor_aliases_by_email, get_unresolved_commit_emails_by_name, get_contributors_by_full_name, get_repo_by_repo_git, batch_insert_contributors +from augur.application.db.lib import get_session, execute_session_query from augur.tasks.git.util.facade_worker.facade_worker.facade00mainprogram import * @@ -200,8 +201,10 @@ def insert_facade_contributors(self, repo_git): repo_id = repo.repo_id facade_helper = FacadeHelper(logger) - collection_status = repo.collection_status[0] - last_collected_date = collection_status.facade_data_last_collected if not facade_helper.facade_contributor_full_recollect else None + with get_session() as session: + query = session.query(CollectionStatus).filter(CollectionStatus.repo_id == repo.repo_id) + collection_status = execute_session_query(query,'one') + last_collected_date = collection_status.facade_data_last_collected if not facade_helper.facade_contributor_full_recollect else None # Get all of the commit data's emails and names from the commit table that do not appear # in the contributors table or the contributors_aliases table. 
From 8ae5002c6e43588b19b9da37509c45201d326e22 Mon Sep 17 00:00:00 2001 From: Isaac Milarsky Date: Fri, 3 Oct 2025 14:59:36 -0500 Subject: [PATCH 26/67] add config option in proper place Signed-off-by: Isaac Milarsky --- augur/application/config.py | 3 +- .../34_add_facade_config_full_recollect.py | 37 ------------------- 2 files changed, 2 insertions(+), 38 deletions(-) delete mode 100644 augur/application/schema/alembic/versions/34_add_facade_config_full_recollect.py diff --git a/augur/application/config.py b/augur/application/config.py index 7ace2befd3..009adb69f1 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -50,7 +50,8 @@ def get_development_flag(): "pull_repos": 1, "rebuild_caches": 1, "run_analysis": 1, - "run_facade_contributors": 1 + "run_facade_contributors": 1, + "facade_contributor_full_recollect": 1 }, "Server": { "cache_expire": "3600", diff --git a/augur/application/schema/alembic/versions/34_add_facade_config_full_recollect.py b/augur/application/schema/alembic/versions/34_add_facade_config_full_recollect.py deleted file mode 100644 index b17a75eaa9..0000000000 --- a/augur/application/schema/alembic/versions/34_add_facade_config_full_recollect.py +++ /dev/null @@ -1,37 +0,0 @@ -"""change config table to add toggle to force facade to recollect through all commits - -Revision ID: 34 -Revises: 33 -Create Date: 2025-10-02 12:45:57.486871 - -""" -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql -from sqlalchemy.sql import text - -# revision identifiers, used by Alembic. -revision = '34' -down_revision = '33' -branch_labels = None -depends_on = None - - -def upgrade(): - # ### commands auto generated by Alembic - please adjust! ### - #Add toggle for facade collection. 
- conn = op.get_bind() - - conn.execute(text(f""" - INSERT INTO "augur_operations"."config" ("section_name", "setting_name", "value", "type") VALUES ('Facade', 'facade_contributor_full_recollect', '{0}', 'int'); - """)) - # ### end Alembic commands ### - - -def downgrade(): - # ### commands auto generated by Alembic - please adjust! ### - conn = op.get_bind() - conn.execute(text(f""" - DELETE FROM "augur_operations"."config" WHERE section_name = 'Facade' AND setting_name = 'facade_contributor_full_recollect' AND type = 'int'; - """)) - # ### end Alembic commands ### From 4bb65549b89407884c3392591f7efd4c99a6b37c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Mon, 6 Oct 2025 21:31:05 +0100 Subject: [PATCH 27/67] Fix examples of repos CSV files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code which parses the repos files interprets the first column as URL and the second one as repo group ID, but all the examples were doing the opposite. 
Signed-off-by: Mosè Giordano --- augur/application/schema/repo_load_sample.csv | 16 ++++++++-------- .../command-line-interface/db.rst | 16 ++++++++-------- .../test_repos.csv | 16 ++++++++-------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/augur/application/schema/repo_load_sample.csv b/augur/application/schema/repo_load_sample.csv index b04519f30f..fb537d4949 100644 --- a/augur/application/schema/repo_load_sample.csv +++ b/augur/application/schema/repo_load_sample.csv @@ -1,8 +1,8 @@ -10,https://github.com/chaoss/augur.git -10,https://github.com/chaoss/grimoirelab.git -20,https://github.com/chaoss/wg-evolution.git -20,https://github.com/chaoss/wg-risk.git -20,https://github.com/chaoss/wg-common.git -20,https://github.com/chaoss/wg-value.git -20,https://github.com/chaoss/wg-diversity-inclusion.git -20,https://github.com/chaoss/wg-app-ecosystem.git +https://github.com/chaoss/augur.git,10 +https://github.com/chaoss/grimoirelab.git,10 +https://github.com/chaoss/wg-evolution.git,20 +https://github.com/chaoss/wg-risk.git,20 +https://github.com/chaoss/wg-common.git,20 +https://github.com/chaoss/wg-value.git,20 +https://github.com/chaoss/wg-diversity-inclusion.git,20 +https://github.com/chaoss/wg-app-ecosystem.git,20 diff --git a/docs/source/getting-started/command-line-interface/db.rst b/docs/source/getting-started/command-line-interface/db.rst index b754f2e067..a810f1b9d7 100644 --- a/docs/source/getting-started/command-line-interface/db.rst +++ b/docs/source/getting-started/command-line-interface/db.rst @@ -78,14 +78,14 @@ Example usage\: .. 
code-block:: bash # contents of repos.csv - 10,https://github.com/chaoss/augur.git - 10,https://github.com/chaoss/grimoirelab.git - 20,https://github.com/chaoss/wg-evolution.git - 20,https://github.com/chaoss/wg-risk.git - 20,https://github.com/chaoss/wg-common.git - 20,https://github.com/chaoss/wg-value.git - 20,https://github.com/chaoss/wg-diversity-inclusion.git - 20,https://github.com/chaoss/wg-app-ecosystem.git + https://github.com/chaoss/augur.git,10 + https://github.com/chaoss/grimoirelab.git,10 + https://github.com/chaoss/wg-evolution.git,20 + https://github.com/chaoss/wg-risk.git,20 + https://github.com/chaoss/wg-common.git,20 + https://github.com/chaoss/wg-value.git,20 + https://github.com/chaoss/wg-diversity-inclusion.git,20 + https://github.com/chaoss/wg-app-ecosystem.git,20 # to add repos to the database $ augur db add-repos repos.csv diff --git a/tests/test_workers/test_facade/test_facade_contributor_interface/test_repos.csv b/tests/test_workers/test_facade/test_facade_contributor_interface/test_repos.csv index 8967ae2142..fb537d4949 100644 --- a/tests/test_workers/test_facade/test_facade_contributor_interface/test_repos.csv +++ b/tests/test_workers/test_facade/test_facade_contributor_interface/test_repos.csv @@ -1,8 +1,8 @@ -10,https://github.com/chaoss/augur.git -10,https://github.com/chaoss/grimoirelab.git -20,https://github.com/chaoss/wg-evolution.git -20,https://github.com/chaoss/wg-risk.git -20,https://github.com/chaoss/wg-common.git -20,https://github.com/chaoss/wg-value.git -20,https://github.com/chaoss/wg-diversity-inclusion.git -20,https://github.com/chaoss/wg-app-ecosystem.git \ No newline at end of file +https://github.com/chaoss/augur.git,10 +https://github.com/chaoss/grimoirelab.git,10 +https://github.com/chaoss/wg-evolution.git,20 +https://github.com/chaoss/wg-risk.git,20 +https://github.com/chaoss/wg-common.git,20 +https://github.com/chaoss/wg-value.git,20 +https://github.com/chaoss/wg-diversity-inclusion.git,20 
+https://github.com/chaoss/wg-app-ecosystem.git,20 From 49fe2066ecae2e1fa5e2c734117323cd7fe89476 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Mon, 6 Oct 2025 21:56:34 +0100 Subject: [PATCH 28/67] docker compose: Make it crystal clear that all the GitHub/GitLab user/keys must be set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The non-interactive Docker compose workflow requires all the variables `AUGUR_GIT{LA,HU}B_{USERNAME,API_KEY}` to be set even if they are going to be unused, otherwise you get stuck waiting for [the prompt](https://github.com/chaoss/augur/blob/b0bb3b80402ee5fcd84bec7334e58a41f9f5ec8a/scripts/install/config.sh#L18-L29) ``` You entered a blank line, are you sure? ``` Signed-off-by: Mosè Giordano --- docs/source/docker/docker-compose.rst | 2 +- docs/source/docker/getting-started.rst | 2 +- docs/source/docker/quick-start.rst | 2 +- docs/source/getting-started/using-docker.rst | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/docker/docker-compose.rst b/docs/source/docker/docker-compose.rst index d96476c392..1bc3a25a6f 100644 --- a/docs/source/docker/docker-compose.rst +++ b/docs/source/docker/docker-compose.rst @@ -26,7 +26,7 @@ This section of the documentation details how to use Augur's Docker Compose conf .. warning:: - Don't forget to provide your external database credentials in a file called ``.env`` file. Make sure the following environment variables are specified. + Don't forget to provide your external database credentials in a file called ``.env`` file. Make sure all the following environment variables are specified, keep placeholder values if you don't need some of them. Don't specify AUGUR_DB if you want the docker database to be used. 
Example .env: diff --git a/docs/source/docker/getting-started.rst b/docs/source/docker/getting-started.rst index e747bbb304..c413d9ed45 100644 --- a/docs/source/docker/getting-started.rst +++ b/docs/source/docker/getting-started.rst @@ -31,7 +31,7 @@ the following resources (or more): - 10 GB RAM Clone the Augur repository and create a .env file in the top level directory -with the following fields: +with the following fields (don't remove any variable, keep placeholder values if you don't need some of them): .. code:: python diff --git a/docs/source/docker/quick-start.rst b/docs/source/docker/quick-start.rst index b7d7b7cc7e..c7530f6ae9 100644 --- a/docs/source/docker/quick-start.rst +++ b/docs/source/docker/quick-start.rst @@ -9,7 +9,7 @@ Before you get off to such a quick start, go ahead and git checkout main - 4. Create a .env file in the top level directory with the following fields: + 4. Create a .env file in the top level directory with the following fields (don't remove any variable, keep placeholder values if you don't need some of them): .. code:: python diff --git a/docs/source/getting-started/using-docker.rst b/docs/source/getting-started/using-docker.rst index cc5e23896c..c1c693eda2 100644 --- a/docs/source/getting-started/using-docker.rst +++ b/docs/source/getting-started/using-docker.rst @@ -10,7 +10,7 @@ the following resources (or more). 1. Clone the Augur repository https://github.com/chaoss/augur -2. Create a .env file in the top level directory with the following fields: +2. Create a ``.env`` file in the top level directory with the following fields (don't remove any variable, keep placeholder values if you don't need some of them): .. code:: python @@ -35,7 +35,7 @@ or podman compose up --build -And augur should be up and running! Over time, you may decide that you want to download and run newer releases of Augur. 
It is critical that your `.env` file remains configured to use the same database name and password; though you can change the password if you understand how to connect to a database running inside a Docker container on your computer. +And augur should be up and running! Over time, you may decide that you want to download and run newer releases of Augur. It is critical that your ``.env`` file remains configured to use the same database name and password; though you can change the password if you understand how to connect to a database running inside a Docker container on your computer. Rebuilding Augur in Docker ---------------------------- From a87c6a0e60fcdd3ee919aa14e4d5f77bf540dd69 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Thu, 9 Oct 2025 09:14:52 -0500 Subject: [PATCH 29/67] Change facade_contributor_full_recollect to 0 Set default to 0. Signed-off-by: Sean P. Goggins --- augur/application/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/config.py b/augur/application/config.py index 009adb69f1..ee3c33dc8b 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -51,7 +51,7 @@ def get_development_flag(): "rebuild_caches": 1, "run_analysis": 1, "run_facade_contributors": 1, - "facade_contributor_full_recollect": 1 + "facade_contributor_full_recollect": 0 }, "Server": { "cache_expire": "3600", From fdc00056fafa8bc25800c0f40a5c9c3363f4a534 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Thu, 9 Oct 2025 09:57:53 -0500 Subject: [PATCH 30/67] update schema Signed-off-by: Sean P. 
Goggins --- .../versions/34_add_contrib_to_config.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 augur/application/schema/alembic/versions/34_add_contrib_to_config.py diff --git a/augur/application/schema/alembic/versions/34_add_contrib_to_config.py b/augur/application/schema/alembic/versions/34_add_contrib_to_config.py new file mode 100644 index 0000000000..273265be2f --- /dev/null +++ b/augur/application/schema/alembic/versions/34_add_contrib_to_config.py @@ -0,0 +1,57 @@ +"""Add extra celery options to the config if they do not exist + +Revision ID: 34 +Revises: 33 +Create Date: 2025-10-09 12:03:57.171011 + +""" +from alembic import op +from augur.application.db.session import DatabaseSession +from augur.application.config import * +from sqlalchemy.sql import text +import logging + +# revision identifiers, used by Alembic. +revision = '34' +down_revision = '33' +branch_labels = None +depends_on = None + +logger = logging.getLogger(__name__) + +def upgrade(): + + with DatabaseSession(logger) as session: + config = AugurConfig(logger,session) + config_dict = config.load_config() + + #Update the missing fields of the facade section in the config + section = config_dict.get("Facade") + + #Just copy the default if section doesn't exist. 
+ if section: + if 'facade_contributor_full_recollect' not in section.keys(): + section['facade_contributor_full_recollect'] = 0 + + else: + section = config.default_config["Facade"] + + config.add_section_from_json("Facade", section) + + +def downgrade(): + + conn = op.get_bind() + + conn.execute(text(f""" + DELETE FROM augur_operations.config + WHERE section_name='Facade' AND (setting_name='facade_contributor_full_recollect'); + """)) + + try: + conn.execute(text(f""" + DELETE FROM augur_operations.config + WHERE section_name='Facade' AND (setting_name='facade_contributor_full_recollect'); + """)) + except: + pass \ No newline at end of file From 512ff818eedc8fee1deb627af1cc2c515aea0212 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Thu, 9 Oct 2025 10:26:16 -0500 Subject: [PATCH 31/67] fixing description Signed-off-by: Sean P. Goggins --- .../schema/alembic/versions/34_add_contrib_to_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/schema/alembic/versions/34_add_contrib_to_config.py b/augur/application/schema/alembic/versions/34_add_contrib_to_config.py index 273265be2f..1a87be365e 100644 --- a/augur/application/schema/alembic/versions/34_add_contrib_to_config.py +++ b/augur/application/schema/alembic/versions/34_add_contrib_to_config.py @@ -1,4 +1,4 @@ -"""Add extra celery options to the config if they do not exist +"""Add Facade contributor full recollect to config, default to off (0) Revision ID: 34 Revises: 33 From 7f58fbc8075d228c9d427f86e577af31e4a49520 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Mon, 13 Oct 2025 12:24:50 -0500 Subject: [PATCH 32/67] Update date-released format in CITATION.cff Year not recognized without month and date. Signed-off-by: Sean P. 
Goggins --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index e26f3d8a86..01514fb22f 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -10,5 +10,5 @@ authors: given-names: Matt title: "Open Source Community Health: Analytical Metrics and Their Corresponding Narratives" doi: 10.1109/SoHeal52568.2021.00010 -date-released: 2021 +date-released: 2021-01-01 url: https://www.seangoggins.net/wp-content/plugins/zotpress/lib/request/request.dl.php?api_user_id=655145&dlkey=HNG22ZSU&content_type=application/pdf From e4a618f50951f82488623ec1c5c4405e0df6235b Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 14 Oct 2025 20:46:44 +0100 Subject: [PATCH 33/67] Log facade messages based on the type requested Signed-off-by: Adrian Edwards --- .../tasks/git/util/facade_worker/facade_worker/config.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/config.py b/augur/tasks/git/util/facade_worker/facade_worker/config.py index 488cdac3f4..f060b34390 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/config.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/config.py @@ -167,7 +167,13 @@ def log_activity(self, level, status): # Log an activity based upon urgency and user's preference. If the log level is # "Debug", then just print it and don't save it in the database. 
log_options = ('Error','Quiet','Info','Verbose','Debug') - self.logger.info(f"* {status}\n") + logmsg = f"* {status}\n" + if level == "Error": + self.logger.error(logmsg) + elif level == "Debug" or level == "Verbose": + self.logger.debug(logmsg) + else: + self.logger.info(logmsg) #Return if only debug if level == 'Debug': From 16fb4a038bedecc2f33322980d989a4457cc4138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <765740+giordano@users.noreply.github.com> Date: Wed, 15 Oct 2025 11:57:27 +0200 Subject: [PATCH 34/67] Fix `git reset` command to include remote default branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `git reset --hard ` isn't a correct command, the argument of `git reset` must be a tree-ish, which a remote name alone isn't. In some cases this command fails with ``` fatal: ambiguous argument 'origin': unknown revision or path not in the working tree. Use '--' to separate paths from revisions, like this: 'git [...] 
-- [...]' ``` Signed-off-by: Mosè Giordano <765740+giordano@users.noreply.github.com> --- augur/tasks/git/util/facade_worker/facade_worker/repofetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py index 874f338902..f754f4e098 100644 --- a/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py +++ b/augur/tasks/git/util/facade_worker/facade_worker/repofetch.py @@ -435,7 +435,7 @@ def git_repo_updates(facade_helper, repo_git): cmdpull2 = (f"git -C {absolute_path} pull") - cmd_reset = (f"git -C {absolute_path} reset --hard origin") + cmd_reset = (f"git -C {absolute_path} reset --hard origin/{remotedefault}") cmd_reset_wait = subprocess.Popen( [cmd_reset], shell=True).wait() From 9ebc4eb08ba14dc26ea20d9b56a74e87b0189508 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Mon, 15 Sep 2025 13:11:06 -0400 Subject: [PATCH 35/67] refactor send_messages to remove a almost entirely duplicate code path Signed-off-by: Adrian Edwards --- augur/tasks/util/collection_util.py | 62 ++++++++++++++--------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/augur/tasks/util/collection_util.py b/augur/tasks/util/collection_util.py index bed73bd120..28489d63c8 100644 --- a/augur/tasks/util/collection_util.py +++ b/augur/tasks/util/collection_util.py @@ -597,37 +597,33 @@ def send_messages(self): for repo_git, full_collection in col_hook.repo_list: repo = get_repo_by_repo_git(repo_git) + platform_name = "github" + # this needs to be here and not up a level since it should be set/reset for each repo. + # otherwise a gitlab repo would reset it and cause subsequent github repos to use gitlab phases. + phases = None if "github" in repo.repo_git: - augur_collection_sequence = [] - for job in col_hook.phases: - #Add the phase to the sequence in order as a celery task. 
- #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git, full_collection)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id - - self.logger.info(f"Setting github repo {col_hook.name} status to collecting for repo: {repo_git}") - - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id, col_hook.name - else: - if col_hook.gitlab_phases is not None: - - augur_collection_sequence = [] - for job in col_hook.gitlab_phases: - #Add the phase to the sequence in order as a celery task. - #The preliminary task creates the larger task chain - augur_collection_sequence.append(job(repo_git, full_collection)) - - #augur_collection_sequence.append(core_task_success_util.si(repo_git)) - #Link all phases in a chain and send to celery - augur_collection_chain = chain(*augur_collection_sequence) - task_id = augur_collection_chain.apply_async().task_id - - self.logger.info(f"Setting gitlab repo {col_hook.name} status to collecting for repo: {repo_git}") - - #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated - yield repo_git, task_id, col_hook.name + phases = col_hook.phases + # use default platform name + + elif "gitlab" in repo.repo_git: + platform_name = "gitlab" + if col_hook.gitlab_phases is None: + continue + phases = col_hook.gitlab_phases + + augur_collection_sequence = [] + for job in phases: + #Add the phase to the sequence in order as a celery task. 
+ #The preliminary task creates the larger task chain + augur_collection_sequence.append(job(repo_git, full_collection)) + + #augur_collection_sequence.append(core_task_success_util.si(repo_git)) + #Link all phases in a chain and send to celery + augur_collection_chain = chain(*augur_collection_sequence) + task_id = augur_collection_chain.apply_async().task_id + + self.logger.info(f"Setting {platform_name} repo {col_hook.name} status to collecting for repo: {repo_git}") + + #yield the value of the task_id to the calling method so that the proper collectionStatus field can be updated + yield repo_git, task_id, col_hook.name + From b1c04dc5cf701ab2e873f8f5b2d1d481073be440 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 30 Sep 2025 19:41:33 -0400 Subject: [PATCH 36/67] remove from template Signed-off-by: Adrian Edwards --- augur/templates/repo-info.j2 | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/augur/templates/repo-info.j2 b/augur/templates/repo-info.j2 index 311daa45f7..2738d70e2a 100644 --- a/augur/templates/repo-info.j2 +++ b/augur/templates/repo-info.j2 @@ -5,21 +5,7 @@ {% if repo.repo_id %}

Report for: {{ repo.repo_name|title }}

{{ repo.repo_git }}

- {% for report in reports %} -

{{ report|replace("_", " ")|title }}

- {% for image in images[report] %} -
-
-
-
-
- -
-
- {% endfor %} - {% endfor %} + {% else %}

Repository {{ repo_id }} not found

{% endif %} From 9a92e17b18b550dd5febf693ae311af2355b4e8d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 30 Sep 2025 19:41:49 -0400 Subject: [PATCH 37/67] remove from main route Signed-off-by: Adrian Edwards --- augur/api/view/routes.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/augur/api/view/routes.py b/augur/api/view/routes.py index 00d456733f..91d23531b4 100644 --- a/augur/api/view/routes.py +++ b/augur/api/view/routes.py @@ -221,13 +221,9 @@ def user_settings(): """ @app.route('/repos/views/repo/') def repo_repo_view(id): - # For some reason, there is no reports definition (shouldn't be possible) - if reports is None: - return render_message("Report Definitions Missing", "You requested a report for a repo on this instance, but a definition for the report layout was not found.") - repo = Repo.get_by_id(db_session, id) - return render_module("repo-info", reports=reports.keys(), images=reports, title="Repo", repo=repo, repo_id=id) + return render_module("repo-info", title="Repo", repo=repo, repo_id=id) """ ---------------------------------------------------------------- default: From f07aa9288282c2c97466f2985371e70b846af184 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 30 Sep 2025 19:41:56 -0400 Subject: [PATCH 38/67] remove the report routes Signed-off-by: Adrian Edwards --- augur/api/routes/__init__.py | 2 - augur/api/routes/contributor_reports.py | 1284 --------------- augur/api/routes/pull_request_reports.py | 1922 ---------------------- 3 files changed, 3208 deletions(-) delete mode 100644 augur/api/routes/contributor_reports.py delete mode 100644 augur/api/routes/pull_request_reports.py diff --git a/augur/api/routes/__init__.py b/augur/api/routes/__init__.py index 03c2e2fa71..8176dad94b 100644 --- a/augur/api/routes/__init__.py +++ b/augur/api/routes/__init__.py @@ -4,10 +4,8 @@ from .batch import * from .collection_status import * from .config import * -from .contributor_reports import * from .manager import 
* from .nonstandard_metrics import * -from .pull_request_reports import * from .user import * from .dei import * from .util import * diff --git a/augur/api/routes/contributor_reports.py b/augur/api/routes/contributor_reports.py deleted file mode 100644 index 6c107ed603..0000000000 --- a/augur/api/routes/contributor_reports.py +++ /dev/null @@ -1,1284 +0,0 @@ -import psycopg2 -import psycopg2 -import sqlalchemy as salc -import numpy as np -import warnings -import datetime -import pandas as pd -from math import pi -from flask import request, send_file, Response, current_app - -# import visualization libraries -from bokeh.io import export_png -from bokeh.embed import json_item -from bokeh.plotting import figure -from bokeh.models import Label, LabelSet, ColumnDataSource, Legend -from bokeh.palettes import Colorblind -from bokeh.layouts import gridplot -from bokeh.transform import cumsum - -from augur.api.routes import AUGUR_API_VERSION -from ..server import app - -warnings.filterwarnings('ignore') - -def quarters(month, year): - if 1 <= month <= 3: - return '01' + '/' + year - elif 4 <= month <= 6: - return '04' + '/' + year - elif 5 <= month <= 9: - return '07' + '/' + year - elif 10 <= month <= 12: - return '10' + '/' + year - -def new_contributor_data_collection(repo_id, required_contributions): - - rank_list = [] - for num in range(1, required_contributions + 1): - rank_list.append(num) - rank_tuple = tuple(rank_list) - -##### - -## Commented out due to abuse. 
- -##### - - - # contributor_query = salc.sql.text(f""" - - # SELECT * FROM ( - # SELECT ID AS - # cntrb_id, - # A.created_at AS created_at, - # date_part('month', A.created_at::DATE) AS month, - # date_part('year', A.created_at::DATE) AS year, - # A.repo_id, - # repo_name, - # full_name, - # login, - # ACTION, - # rank() OVER ( - # PARTITION BY id - # ORDER BY A.created_at ASC - # ) - # FROM - # ( - # ( - # SELECT - # canonical_id AS ID, - # created_at AS created_at, - # repo_id, - # 'issue_opened' AS ACTION, - # contributors.cntrb_full_name AS full_name, - # contributors.cntrb_login AS login - # FROM - # augur_data.issues - # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issues.reporter_id - # LEFT OUTER JOIN ( - # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - # cntrb_canonical AS canonical_email, - # data_collection_date, - # cntrb_id AS canonical_id - # FROM augur_data.contributors - # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - # WHERE - # repo_id = {repo_id} - # AND pull_request IS NULL - # GROUP BY - # canonical_id, - # repo_id, - # issues.created_at, - # contributors.cntrb_full_name, - # contributors.cntrb_login - # ) UNION ALL - # ( - # SELECT - # canonical_id AS ID, - # TO_TIMESTAMP( cmt_author_date, 'YYYY-MM-DD' ) AS created_at, - # repo_id, - # 'commit' AS ACTION, - # contributors.cntrb_full_name AS full_name, - # contributors.cntrb_login AS login - # FROM - # augur_data.commits - # LEFT OUTER JOIN augur_data.contributors ON cntrb_email = cmt_author_email - # LEFT OUTER JOIN ( - # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - # cntrb_canonical AS canonical_email, - # data_collection_date, cntrb_id AS canonical_id - # FROM augur_data.contributors - # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - # ) canonical_full_names ON canonical_full_names.canonical_email 
=contributors.cntrb_canonical - # WHERE - # repo_id = {repo_id} - # GROUP BY - # repo_id, - # canonical_email, - # canonical_id, - # commits.cmt_author_date, - # contributors.cntrb_full_name, - # contributors.cntrb_login - # ) UNION ALL - # ( - # SELECT - # message.cntrb_id AS ID, - # created_at AS created_at, - # commits.repo_id, - # 'commit_comment' AS ACTION, - # contributors.cntrb_full_name AS full_name, - # contributors.cntrb_login AS login - - # FROM - # augur_data.commit_comment_ref, - # augur_data.commits, - # augur_data.message - # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id - # LEFT OUTER JOIN ( - # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - # cntrb_canonical AS canonical_email, - # data_collection_date, cntrb_id AS canonical_id - # FROM augur_data.contributors - # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - # WHERE - # commits.cmt_id = commit_comment_ref.cmt_id - # AND commits.repo_id = {repo_id} - # AND commit_comment_ref.msg_id = message.msg_id - - # GROUP BY - # ID, - # commits.repo_id, - # commit_comment_ref.created_at, - # contributors.cntrb_full_name, - # contributors.cntrb_login - # ) UNION ALL - # ( - # SELECT - # issue_events.cntrb_id AS ID, - # issue_events.created_at AS created_at, - # issues.repo_id, - # 'issue_closed' AS ACTION, - # contributors.cntrb_full_name AS full_name, - # contributors.cntrb_login AS login - # FROM - # augur_data.issues, - # augur_data.issue_events - # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = issue_events.cntrb_id - # LEFT OUTER JOIN ( - # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - # cntrb_canonical AS canonical_email, - # data_collection_date, - # cntrb_id AS canonical_id - # FROM augur_data.contributors - # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - # ) canonical_full_names ON 
canonical_full_names.canonical_email =contributors.cntrb_canonical - # WHERE - # issues.repo_id = {repo_id} - # AND issues.issue_id = issue_events.issue_id - # AND issues.pull_request IS NULL - # AND issue_events.cntrb_id IS NOT NULL - # AND ACTION = 'closed' - # GROUP BY - # issue_events.cntrb_id, - # issues.repo_id, - # issue_events.created_at, - # contributors.cntrb_full_name, - # contributors.cntrb_login - # ) UNION ALL - # ( - # SELECT - # pr_augur_contributor_id AS ID, - # pr_created_at AS created_at, - # pull_requests.repo_id, - # 'open_pull_request' AS ACTION, - # contributors.cntrb_full_name AS full_name, - # contributors.cntrb_login AS login - # FROM - # augur_data.pull_requests - # LEFT OUTER JOIN augur_data.contributors ON pull_requests.pr_augur_contributor_id = contributors.cntrb_id - # LEFT OUTER JOIN ( - # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - # cntrb_canonical AS canonical_email, - # data_collection_date, - # cntrb_id AS canonical_id - # FROM augur_data.contributors - # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - # WHERE - # pull_requests.repo_id = {repo_id} - # GROUP BY - # pull_requests.pr_augur_contributor_id, - # pull_requests.repo_id, - # pull_requests.pr_created_at, - # contributors.cntrb_full_name, - # contributors.cntrb_login - # ) UNION ALL - # ( - # SELECT - # message.cntrb_id AS ID, - # msg_timestamp AS created_at, - # pull_requests.repo_id as repo_id, - # 'pull_request_comment' AS ACTION, - # contributors.cntrb_full_name AS full_name, - # contributors.cntrb_login AS login - # FROM - # augur_data.pull_requests, - # augur_data.pull_request_message_ref, - # augur_data.message - # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id - # LEFT OUTER JOIN ( - # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - # cntrb_canonical AS canonical_email, - # data_collection_date, 
- # cntrb_id AS canonical_id - # FROM augur_data.contributors - # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - # WHERE - # pull_requests.repo_id = {repo_id} - # AND pull_request_message_ref.pull_request_id = pull_requests.pull_request_id - # AND pull_request_message_ref.msg_id = message.msg_id - # GROUP BY - # message.cntrb_id, - # pull_requests.repo_id, - # message.msg_timestamp, - # contributors.cntrb_full_name, - # contributors.cntrb_login - # ) UNION ALL - # ( - # SELECT - # issues.reporter_id AS ID, - # msg_timestamp AS created_at, - # issues.repo_id as repo_id, - # 'issue_comment' AS ACTION, - # contributors.cntrb_full_name AS full_name, - # contributors.cntrb_login AS login - # FROM - # issues, - # issue_message_ref, - # message - # LEFT OUTER JOIN augur_data.contributors ON contributors.cntrb_id = message.cntrb_id - # LEFT OUTER JOIN ( - # SELECT DISTINCT ON ( cntrb_canonical ) cntrb_full_name, - # cntrb_canonical AS canonical_email, - # data_collection_date, - # cntrb_id AS canonical_id - # FROM augur_data.contributors - # WHERE cntrb_canonical = cntrb_email ORDER BY cntrb_canonical - # ) canonical_full_names ON canonical_full_names.canonical_email =contributors.cntrb_canonical - # WHERE - # issues.repo_id = {repo_id} - # AND issue_message_ref.msg_id = message.msg_id - # AND issues.issue_id = issue_message_ref.issue_id - # AND issues.pull_request_id = NULL - # GROUP BY - # issues.reporter_id, - # issues.repo_id, - # message.msg_timestamp, - # contributors.cntrb_full_name, - # contributors.cntrb_login - # ) - # ) A, - # repo - # WHERE - # ID IS NOT NULL - # AND A.repo_id = repo.repo_id - # GROUP BY - # A.ID, - # A.repo_id, - # A.ACTION, - # A.created_at, - # repo.repo_name, - # A.full_name, - # A.login - # ORDER BY - # cntrb_id - # ) b - # WHERE RANK IN {rank_tuple} - - # """) - # contributor_query2 = (""" - - # select count(*) from 
augur_data.repo; - # """) - - #with current_app.engine.connect() as conn: - # df = pd.read_sql(contributor_query2, conn) - - #df = df.loc[~df['full_name'].str.contains('bot', na=False)] - #df = df.loc[~df['login'].str.contains('bot', na=False)] - - #df = df.loc[~df['cntrb_id'].isin(df[df.duplicated(['cntrb_id', 'created_at', 'repo_id', 'rank'])]['cntrb_id'])] - - # add yearmonths to contributor - #df[['month', 'year']] = df[['month', 'year']].astype(int).astype(str) - #df['yearmonth'] = df['month'] + '/' + df['year'] - #df['yearmonth'] = pd.to_datetime(df['yearmonth']) - - # add column with every value being one, so when the contributor df is concatenated - # with the months df, the filler months won't be counted in the sums - #df['new_contributors'] = 1 - - # add quarters to contributor dataframe - #df['month'] = df['month'].astype(int) - #df['quarter'] = df.apply(lambda x: quarters(x['month'], x['year']), axis=1, result_type='reduce') - #df['quarter'] = pd.to_datetime(df['quarter']) - - df = [1] - return df - -def months_data_collection(start_date, end_date): - - # months_query makes a df of years and months, this is used to fill - # the months with no data in the visualizations - months_query = salc.sql.text(f""" - SELECT * - FROM - ( - SELECT - date_part( 'year', created_month :: DATE ) AS year, - date_part( 'month', created_month :: DATE ) AS MONTH - FROM - (SELECT * - FROM ( - SELECT created_month :: DATE - FROM generate_series (TIMESTAMP '{start_date}', TIMESTAMP '{end_date}', INTERVAL '1 month' ) created_month ) d ) x - ) y - """) - - with current_app.engine.connect() as conn: - months_df = pd.read_sql(months_query, conn) - - # add yearmonths to months_df - months_df[['year', 'month']] = months_df[['year', 'month']].astype(float).astype(int).astype(str) - months_df['yearmonth'] = months_df['month'] + '/' + months_df['year'] - months_df['yearmonth'] = pd.to_datetime(months_df['yearmonth']) - - # filter months_df with start_date and end_date, the contributor 
df is filtered in the visualizations - months_df = months_df.set_index(months_df['yearmonth']) - months_df = months_df.loc[start_date: end_date].reset_index(drop=True) - - # add quarters to months dataframe - months_df['month'] = months_df['month'].astype(int) - months_df['quarter'] = months_df.apply(lambda x: quarters(x['month'], x['year']), axis=1) - months_df['quarter'] = pd.to_datetime(months_df['quarter']) - - return months_df - -def get_repo_id_start_date_and_end_date(): - - now = datetime.datetime.now() - - repo_id = request.args.get('repo_id') - start_date = str(request.args.get('start_date', "{}-01-01".format(now.year - 1))) - end_date = str(request.args.get('end_date', "{}-{}-{}".format(now.year, now.month, now.day))) - - if repo_id: - - if start_date < end_date: - return int(repo_id), start_date, end_date, None - else: - - error = { - "message": "Invalid end_date. end_date is before the start_date", - "status_code": 400 - } - - return int(repo_id), None, None, error - - else: - error = { - "message": "repo_id not specified. 
Use this endpoint to get a list of available repos: http:///api/unstable/repos", - "status_code": 400 - } - return None, None, None, error - -def filter_out_repeats_without_required_contributions_in_required_time(repeat_list, repeats_df, required_time, - first_list): - - differences = [] - for i in range(0, len(repeat_list)): - time_difference = repeat_list[i] - first_list[i] - total = time_difference.days * 86400 + time_difference.seconds - differences.append(total) - repeats_df['differences'] = differences - - # remove contributions who made enough contributions, but not in a short enough time - repeats_df = repeats_df.loc[repeats_df['differences'] <= required_time * 86400] - - return repeats_df - -def compute_fly_by_and_returning_contributors_dfs(input_df, required_contributions, required_time, start_date): - - # create a copy of contributor dataframe - driver_df = input_df.copy() - - # remove first time contributors before begin date, along with their second contribution - mask = (driver_df['yearmonth'] < start_date) - driver_df = driver_df[~driver_df['cntrb_id'].isin(driver_df.loc[mask]['cntrb_id'])] - - # determine if contributor is a drive by by finding all the cntrb_id's that do not have a second contribution - repeats_df = driver_df.copy() - - repeats_df = repeats_df.loc[repeats_df['rank'].isin([1, required_contributions])] - - # removes all the contributors that only have a first contirbution - repeats_df = repeats_df[ - repeats_df['cntrb_id'].isin(repeats_df.loc[driver_df['rank'] == required_contributions]['cntrb_id'])] - - repeat_list = repeats_df.loc[driver_df['rank'] == required_contributions]['created_at'].tolist() - first_list = repeats_df.loc[driver_df['rank'] == 1]['created_at'].tolist() - - repeats_df = repeats_df.loc[driver_df['rank'] == 1] - repeats_df['type'] = 'repeat' - - repeats_df = filter_out_repeats_without_required_contributions_in_required_time( - repeat_list, repeats_df, required_time, first_list) - - repeats_df = 
repeats_df.loc[repeats_df['differences'] <= required_time * 86400] - - repeat_cntrb_ids = repeats_df['cntrb_id'].to_list() - - drive_by_df = driver_df.loc[~driver_df['cntrb_id'].isin(repeat_cntrb_ids)] - - drive_by_df = drive_by_df.loc[driver_df['rank'] == 1] - drive_by_df['type'] = 'drive_by' - - return drive_by_df, repeats_df - -def add_caption_to_visualizations(caption, required_contributions, required_time, plot_width): - - caption_plot = figure(width=plot_width, height=200, margin=(0, 0, 0, 0)) - - caption_plot.add_layout(Label( - x=0, - y=160, - x_units='screen', - y_units='screen', - text='{}'.format(caption.format(required_contributions, required_time)), - text_font='times', - text_font_size='15pt', - render_mode='css' - )) - caption_plot.outline_line_color = None - - return caption_plot - -def format_new_cntrb_bar_charts(plot, rank, group_by_format_string): - - plot.xgrid.grid_line_color = None - plot.y_range.start = 0 - plot.axis.minor_tick_line_color = None - plot.outline_line_color = None - - plot.title.align = "center" - plot.title.text_font_size = "18px" - - plot.yaxis.axis_label = 'Second Time Contributors' if rank == 2 else 'New Contributors' - plot.xaxis.axis_label = group_by_format_string - - plot.xaxis.axis_label_text_font_size = "18px" - plot.yaxis.axis_label_text_font_size = "16px" - - plot.xaxis.major_label_text_font_size = "16px" - plot.xaxis.major_label_orientation = 45.0 - - plot.yaxis.major_label_text_font_size = "16px" - - return plot - -def add_charts_and_captions_to_correct_positions(chart_plot, caption_plot, rank, contributor_type, - row_1, row_2, row_3, row_4): - - if rank == 1 and (contributor_type == 'All' or contributor_type == 'repeat'): - row_1.append(chart_plot) - row_2.append(caption_plot) - elif rank == 2 or contributor_type == 'drive_by': - row_3.append(chart_plot) - row_4.append(caption_plot) - -def get_new_cntrb_bar_chart_query_params(): - - group_by = str(request.args.get('group_by', "quarter")) - required_contributions = 
int(request.args.get('required_contributions', 4)) - required_time = int(request.args.get('required_time', 365)) - - return group_by, required_contributions, required_time - -def remove_rows_before_start_date(df, start_date): - - mask = (df['yearmonth'] < start_date) - result_df = df[~df['cntrb_id'].isin(df.loc[mask]['cntrb_id'])] - - return result_df - -def remove_rows_with_null_values(df, not_null_columns=[]): - """Remove null data from pandas df - - Parameters - -- df - description: the dataframe that will be modified - type: Pandas Dataframe - - -- list_of_columns - description: columns that are searched for NULL values - type: list - default: [] (means all columns will be checked for NULL values) - IMPORTANT: if an empty list is passed or nothing is passed it will check all columns for NULL values - - Return Value - -- Modified Pandas Dataframe - """ - - if len(not_null_columns) == 0: - not_null_columns = df.columns.to_list() - - total_rows_removed = 0 - for col in not_null_columns: - rows_removed = len(df.loc[df[col].isnull() is True]) - - if rows_removed > 0: - print(f"{rows_removed} rows have been removed because of null values in column {col}") - total_rows_removed += rows_removed - - df = df.loc[df[col].isnull() is False] - - if total_rows_removed > 0: - print(f"\nTotal rows removed because of null data: {total_rows_removed}"); - else: - print("No null data found") - - return df - -def get_needed_columns(df, list_of_columns): - """Get only a specific list of columns from a Pandas Dataframe - - Parameters - -- df - description: the dataframe that will be modified - type: Pandas Dataframe - - -- list_of_columns - description: columns that will be kept in dataframe - type: list - - Return Value - -- Modified Pandas Dataframe - """ - return df[list_of_columns] - -def filter_data(df, needed_columns, not_null_columns=[]): - """Filters out the unneeded rows in the df, and removed NULL data from df - - Parameters - -- df - description: the dataframe that will be 
modified - type: Pandas Dataframe - - -- needed_columns - description: the columns to keep in the dataframe - - -- not_null_columns - description: columns that will be searched for NULL data, - if NULL values are found those rows will be removed - default: [] (means all columns in needed_columns list will be checked for NULL values) - IMPORTANT: if an empty list is passed or nothing is passed it will check - all columns in needed_columns list for NULL values - Return Value - -- Modified Pandas Dataframe - """ - - if all(x in needed_columns for x in not_null_columns): - - df = get_needed_columns(df, needed_columns) - #Use the pandas method bc the other method was erroring on boolean index. - #IM - 9/23/22 - df = df.dropna(subset=not_null_columns)#remove_rows_with_null_values(df, not_null_columns) - - return df - else: - print("Developer error, not null columns should be a subset of needed columns") - return df - -@app.route('/{}/contributor_reports/new_contributors_bar/'.format(AUGUR_API_VERSION), methods=["GET"]) -def new_contributors_bar(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - group_by, required_contributions, required_time = get_new_cntrb_bar_chart_query_params() - - input_df = new_contributor_data_collection(repo_id=repo_id, required_contributions=required_contributions) - months_df = months_data_collection(start_date=start_date, end_date=end_date) - - # TODO remove full_name from data for all charts since it is not needed in vis generation - not_null_columns = ['cntrb_id', 'created_at', 'month', 'year', 'repo_id', 'repo_name', 'login', 'action', - 'rank', 'yearmonth', 'new_contributors', 'quarter'] - - #Use the pandas method bc the other method was erroring on boolean index. 
- #IM - 9/23/22 - input_df = input_df.dropna(subset=not_null_columns)#remove_rows_with_null_values(input_df, not_null_columns) - - if len(input_df) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: input_df.loc[input_df['repo_id'] == repo_id].iloc[0]['repo_name']} - - contributor_types = ['All', 'repeat', 'drive_by'] - ranks = [1, 2] - - row_1, row_2, row_3, row_4 = [], [], [], [] - - all_df = remove_rows_before_start_date(input_df, start_date) - - drive_by_df, repeats_df = compute_fly_by_and_returning_contributors_dfs(input_df, required_contributions, - required_time, start_date) - - for rank in ranks: - for contributor_type in contributor_types: - - # do not display these visualizations since drive-by's do not have second contributions, and the - # second contribution of a repeat contributor is the same thing as the all the second time contributors - if (rank == 2 and contributor_type == 'drive_by') or (rank == 2 and contributor_type == 'repeat'): - continue - - if contributor_type == 'repeat': - driver_df = repeats_df - - caption = """This graph shows repeat contributors in the specified time period. Repeat contributors - are contributors who have made {} or more contributions in {} days and their first contribution is - in the specified time period. New contributors are individuals who make their first contribution - in the specified time period.""" - - elif contributor_type == 'drive_by': - - driver_df = drive_by_df - - caption = """This graph shows fly by contributors in the specified time period. Fly by contributors - are contributors who make less than the required {} contributions in {} days. New contributors are - individuals who make their first contribution in the specified time period. Of course, then, “All - fly-by’s are by definition first time contributors”. 
However, not all first time contributors are - fly-by’s.""" - - elif contributor_type == 'All': - - if rank == 1: - driver_df = all_df - # makes df with all first time contributors - driver_df = driver_df.loc[driver_df['rank'] == 1] - caption = """This graph shows all the first time contributors, whether they contribute once, or - contribute multiple times. New contributors are individuals who make their first contribution - in the specified time period.""" - - if rank == 2: - - driver_df = all_df - - # creates df with all second time contributors - driver_df = driver_df.loc[driver_df['rank'] == 2] - caption = """This graph shows the second contribution of all - first time contributors in the specified time period.""" - # y_axis_label = 'Second Time Contributors' - - # filter by end_date, this is not done with the begin date filtering because a repeat contributor - # will look like drive-by if the second contribution is removed by end_date filtering - mask = (driver_df['yearmonth'] < end_date) - driver_df = driver_df.loc[mask] - - # adds all months to driver_df so the lists of dates will include all months and years - driver_df = pd.concat([driver_df, months_df]) - - data = pd.DataFrame() - if group_by == 'year': - - data['dates'] = driver_df[group_by].unique() - - # new contributor counts for y-axis - data['new_contributor_counts'] = driver_df.groupby([group_by]).sum().reset_index()[ - 'new_contributors'] - - # used to format x-axis and title - group_by_format_string = "Year" - - elif group_by == 'quarter' or group_by == 'month': - - # set variables to group the data by quarter or month - if group_by == 'quarter': - date_column = 'quarter' - group_by_format_string = "Quarter" - - elif group_by == 'month': - date_column = 'yearmonth' - group_by_format_string = "Month" - - # modifies the driver_df[date_column] to be a string with year and month, - # then finds all the unique values - data['dates'] = np.unique(np.datetime_as_string(driver_df[date_column], unit='M')) 
- - # new contributor counts for y-axis - data['new_contributor_counts'] = driver_df.groupby([date_column]).sum().reset_index()[ - 'new_contributors'] - - # if the data set is large enough it will dynamically assign the width, if the data set is - # too small it will by default set to 870 pixel so the title fits - if len(data['new_contributor_counts']) >= 15: - plot_width = 46 * len(data['new_contributor_counts']) - else: - plot_width = 870 - - # create a dict convert an integer number into a word - # used to turn the rank into a word, so it is nicely displayed in the title - numbers = ['Zero', 'First', 'Second'] - num_conversion_dict = {} - for i in range(1, len(numbers)): - num_conversion_dict[i] = numbers[i] - number = '{}'.format(num_conversion_dict[rank]) - - # define pot for bar chart - p = figure(x_range=data['dates'], plot_height=400, plot_width=plot_width, - title="{}: {} {} Time Contributors Per {}".format(repo_dict[repo_id], - contributor_type.capitalize(), number, - group_by_format_string), - y_range=(0, max(data['new_contributor_counts']) * 1.15), margin=(0, 0, 10, 0)) - - p.vbar(x=data['dates'], top=data['new_contributor_counts'], width=0.8) - - source = ColumnDataSource( - data=dict(dates=data['dates'], new_contributor_counts=data['new_contributor_counts'])) - - # add contributor_count labels to chart - p.add_layout(LabelSet(x='dates', y='new_contributor_counts', text='new_contributor_counts', y_offset=4, - text_font_size="13pt", text_color="black", - source=source, text_align='center')) - - plot = format_new_cntrb_bar_charts(p, rank, group_by_format_string) - - caption_plot = add_caption_to_visualizations(caption, required_contributions, required_time, plot_width) - - add_charts_and_captions_to_correct_positions(plot, caption_plot, rank, contributor_type, row_1, - row_2, row_3, row_4) - - # puts plots together into a grid - grid = gridplot([row_1, row_2, row_3, row_4]) - - filename = export_png(grid) - - return send_file(filename) - 
-@app.route('/{}/contributor_reports/new_contributors_stacked_bar/'.format(AUGUR_API_VERSION), - methods=["GET"]) -def new_contributors_stacked_bar(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - group_by, required_contributions, required_time = get_new_cntrb_bar_chart_query_params() - - input_df = new_contributor_data_collection(repo_id=repo_id, required_contributions=required_contributions) - months_df = months_data_collection(start_date=start_date, end_date=end_date) - - needed_columns = ['cntrb_id', 'created_at', 'month', 'year', 'repo_id', 'repo_name', 'login', 'action', - 'rank', 'yearmonth', 'new_contributors', 'quarter'] - - input_df = filter_data(input_df, needed_columns) - - if len(input_df) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: input_df.loc[input_df['repo_id'] == repo_id].iloc[0]['repo_name']} - - contributor_types = ['All', 'repeat', 'drive_by'] - ranks = [1, 2] - - row_1, row_2, row_3, row_4 = [], [], [], [] - - all_df = remove_rows_before_start_date(input_df, start_date) - - drive_by_df, repeats_df = compute_fly_by_and_returning_contributors_dfs(input_df, required_contributions, - required_time, start_date) - - for rank in ranks: - for contributor_type in contributor_types: - # do not display these visualizations since drive-by's do not have second contributions, - # and the second contribution of a repeat contributor is the same thing as the all the - # second time contributors - if (rank == 2 and contributor_type == 'drive_by') or (rank == 2 and contributor_type == 'repeat'): - continue - - if contributor_type == 'repeat': - driver_df = repeats_df - - caption = """This graph shows repeat contributors in the specified time period. 
Repeat contributors - are contributors who have made {} or more contributions in {} days and their first contribution is - in the specified time period. New contributors are individuals who make their first contribution in - the specified time period.""" - - elif contributor_type == 'drive_by': - - driver_df = drive_by_df - - caption = """This graph shows fly by contributors in the specified time period. Fly by contributors - are contributors who make less than the required {} contributions in {} days. New contributors are - individuals who make their first contribution in the specified time period. Of course, then, “All - fly-by’s are by definition first time contributors”. However, not all first time contributors are - fly-by’s.""" - - elif contributor_type == 'All': - if rank == 1: - driver_df = all_df - - # makes df with all first time contributors - driver_df = driver_df.loc[driver_df['rank'] == 1] - - caption = """This graph shows all the first time contributors, whether they contribute once, or - contribute multiple times. 
New contributors are individuals who make their first contribution in - the specified time period.""" - - if rank == 2: - driver_df = all_df - - # creates df with all second time contributor - driver_df = driver_df.loc[driver_df['rank'] == 2] - caption = """This graph shows the second contribution of all first time - contributors in the specified time period.""" - # y_axis_label = 'Second Time Contributors' - - # filter by end_date, this is not done with the begin date filtering because a repeat contributor will - # look like drive-by if the second contribution is removed by end_date filtering - mask = (driver_df['yearmonth'] < end_date) - driver_df = driver_df.loc[mask] - - # adds all months to driver_df so the lists of dates will include all months and years - driver_df = pd.concat([driver_df, months_df]) - - actions = ['open_pull_request', 'pull_request_comment', 'commit', 'issue_closed', 'issue_opened', - 'issue_comment'] - - data = pd.DataFrame() - if group_by == 'year': - - # x-axis dates - data['dates'] = driver_df[group_by].unique() - - for contribution_type in actions: - data[contribution_type] = \ - pd.concat([driver_df.loc[driver_df['action'] == contribution_type], months_df]).groupby( - group_by).sum().reset_index()['new_contributors'] - - # new contributor counts for all actions - data['new_contributor_counts'] = driver_df.groupby([group_by]).sum().reset_index()[ - 'new_contributors'] - - # used to format x-axis and graph title - group_by_format_string = "Year" - - elif group_by == 'quarter' or group_by == 'month': - - # set variables to group the data by quarter or month - if group_by == 'quarter': - date_column = 'quarter' - group_by_format_string = "Quarter" - - elif group_by == 'month': - date_column = 'yearmonth' - group_by_format_string = "Month" - - # modifies the driver_df[date_column] to be a string with year and month, - # then finds all the unique values - data['dates'] = np.unique(np.datetime_as_string(driver_df[date_column], unit='M')) - - 
# new_contributor counts for each type of action - for contribution_type in actions: - data[contribution_type] = \ - pd.concat([driver_df.loc[driver_df['action'] == contribution_type], months_df]).groupby( - date_column).sum().reset_index()['new_contributors'] - - print(data.to_string()) - - # new contributor counts for all actions - data['new_contributor_counts'] = driver_df.groupby([date_column]).sum().reset_index()[ - 'new_contributors'] - - # if the data set is large enough it will dynamically assign the width, if the data set is too small it - # will by default set to 870 pixel so the title fits - if len(data['new_contributor_counts']) >= 15: - plot_width = 46 * len(data['new_contributor_counts']) + 200 - else: - plot_width = 870 - - # create list of values for data source dict - actions_df_references = [] - for action in actions: - actions_df_references.append(data[action]) - - # created dict with the actions as the keys, and the values as the values from the df - data_source = {actions[i]: actions_df_references[i] for i in range(len(actions))} - data_source.update({'dates': data['dates'], 'New Contributor Counts': data['new_contributor_counts']}) - - colors = Colorblind[len(actions)] - - source = ColumnDataSource(data=data_source) - - # create a dict convert an integer number into a word - # used to turn the rank into a word, so it is nicely displayed in the title - numbers = ['Zero', 'First', 'Second'] - num_conversion_dict = {} - for i in range(1, len(numbers)): - num_conversion_dict[i] = numbers[i] - number = '{}'.format(num_conversion_dict[rank]) - - # y_max = 20 - # creates plot to hold chart - p = figure(x_range=data['dates'], plot_height=400, plot_width=plot_width, - title='{}: {} {} Time Contributors Per {}'.format(repo_dict[repo_id], - contributor_type.capitalize(), number, - group_by_format_string), - toolbar_location=None, y_range=(0, max(data['new_contributor_counts']) * 1.15)) - # max(data['new_contributor_counts'])* 1.15), margin = (0, 0, 0, 
0)) - - vbar = p.vbar_stack(actions, x='dates', width=0.8, color=colors, source=source) - - # add total count labels - p.add_layout(LabelSet(x='dates', y='New Contributor Counts', text='New Contributor Counts', y_offset=4, - text_font_size="14pt", - text_color="black", source=source, text_align='center')) - - # add legend - legend = Legend(items=[(date, [action]) for (date, action) in zip(actions, vbar)], location=(0, 120), - label_text_font_size="16px") - p.add_layout(legend, 'right') - - plot = format_new_cntrb_bar_charts(p, rank, group_by_format_string) - - caption_plot = add_caption_to_visualizations(caption, required_contributions, required_time, plot_width) - - add_charts_and_captions_to_correct_positions(plot, caption_plot, rank, contributor_type, row_1, - row_2, row_3, row_4) - - # puts plots together into a grid - grid = gridplot([row_1, row_2, row_3, row_4]) - - filename = export_png(grid) - - return send_file(filename) - -@app.route('/{}/contributor_reports/returning_contributors_pie_chart/'.format(AUGUR_API_VERSION), - methods=["GET"]) -def returning_contributors_pie_chart(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - required_contributions = int(request.args.get('required_contributions', 4)) - required_time = int(request.args.get('required_time', 365)) - - input_df = new_contributor_data_collection(repo_id=repo_id, required_contributions=required_contributions) - - needed_columns = ['cntrb_id', 'created_at', 'month', 'year', 'repo_id', 'repo_name', 'login', 'action', - 'rank', 'yearmonth', 'new_contributors', 'quarter'] - - input_df = filter_data(input_df, needed_columns) - - if len(input_df) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: 
input_df.loc[input_df['repo_id'] == repo_id].iloc[0]['repo_name']} - - drive_by_df, repeats_df = compute_fly_by_and_returning_contributors_dfs(input_df, required_contributions, - required_time, start_date) - - print(repeats_df.to_string()) - - driver_df = pd.concat([drive_by_df, repeats_df]) - - # filter df by end date - mask = (driver_df['yearmonth'] < end_date) - driver_df = driver_df.loc[mask] - - # first and second time contributor counts - drive_by_contributors = driver_df.loc[driver_df['type'] == 'drive_by'].count()['new_contributors'] - repeat_contributors = driver_df.loc[driver_df['type'] == 'repeat'].count()['new_contributors'] - - # create a dict with the # of drive-by and repeat contributors - x = {'Drive_By': drive_by_contributors, - 'Repeat': repeat_contributors} - - # turn dict 'x' into a dataframe with columns 'contributor_type', and 'counts' - data = pd.Series(x).reset_index(name='counts').rename(columns={'index': 'contributor_type'}) - - data['angle'] = data['counts'] / data['counts'].sum() * 2 * pi - data['color'] = ('#0072B2', '#E69F00') - data['percentage'] = ((data['angle'] / (2 * pi)) * 100).round(2) - - # format title - title = "{}: Number of Returning " \ - "Contributors out of {} from {} to {}" \ - .format(repo_dict[repo_id], drive_by_contributors + repeat_contributors, start_date, end_date) - - title_text_font_size = 18 - - plot_width = 850 - - # sets plot_width to width of title if title is wider than 850 pixels - if len(title) * title_text_font_size / 2 > plot_width: - plot_width = int(len(title) * title_text_font_size / 2) - - # creates plot for chart - p = figure(plot_height=450, plot_width=plot_width, title=title, - toolbar_location=None, x_range=(-0.5, 1.3), tools='hover', tooltips="@contributor_type", - margin=(0, 0, 0, 0)) - - p.wedge(x=0.87, y=1, radius=0.4, start_angle=cumsum('angle', include_zero=True), - end_angle=cumsum('angle'), line_color=None, fill_color='color', - legend_field='contributor_type', source=data) - - 
start_point = 0.88 - for i in range(0, len(data['percentage'])): - # percentages - p.add_layout(Label(x=-0.17, y=start_point + 0.13 * (len(data['percentage']) - 1 - i), - text='{}%'.format(data.iloc[i]['percentage']), - render_mode='css', text_font_size='15px', text_font_style='bold')) - - # contributors - p.add_layout(Label(x=0.12, y=start_point + 0.13 * (len(data['percentage']) - 1 - i), - text='{}'.format(data.iloc[i]['counts']), - render_mode='css', text_font_size='15px', text_font_style='bold')) - - # percentages header - p.add_layout( - Label(x=-0.22, y=start_point + 0.13 * (len(data['percentage'])), text='Percentages', render_mode='css', - text_font_size='15px', text_font_style='bold')) - - # legend header - p.add_layout( - Label(x=-0.43, y=start_point + 0.13 * (len(data['percentage'])), text='Category', render_mode='css', - text_font_size='15px', text_font_style='bold')) - - # contributors header - p.add_layout( - Label(x=0, y=start_point + 0.13 * (len(data['percentage'])), text='# Contributors', render_mode='css', - text_font_size='15px', text_font_style='bold')) - - p.axis.axis_label = None - p.axis.visible = False - p.grid.grid_line_color = None - - p.title.align = "center" - p.title.text_font_size = "{}px".format(title_text_font_size) - - p.legend.location = "center_left" - p.legend.border_line_color = None - p.legend.label_text_font_style = 'bold' - p.legend.label_text_font_size = "15px" - - plot = p - - caption = """This pie chart shows the percentage of new contributors who were fly-by or repeat contributors. - Fly by contributors are contributors who make less than the required {0} contributions in {1} days. - New contributors are individuals who make their first contribution in the specified time period. 
- Repeat contributors are contributors who have made {0} or more contributions in {1} days and their - first contribution is in the specified time period.""" - - caption_plot = add_caption_to_visualizations(caption, required_contributions, required_time, plot_width) - - # put graph and caption plot together into one grid - grid = gridplot([[plot], [caption_plot]]) - - filename = export_png(grid) - - return send_file(filename) - -@app.route('/{}/contributor_reports/returning_contributors_stacked_bar/'.format(AUGUR_API_VERSION), - methods=["GET"]) -def returning_contributors_stacked_bar(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - group_by = str(request.args.get('group_by', "quarter")) - required_contributions = int(request.args.get('required_contributions', 4)) - required_time = int(request.args.get('required_time', 365)) - - input_df = new_contributor_data_collection(repo_id=repo_id, required_contributions=required_contributions) - months_df = months_data_collection(start_date=start_date, end_date=end_date) - - needed_columns = ['cntrb_id', 'created_at', 'month', 'year', 'repo_id', 'repo_name', 'login', 'action', - 'rank', 'yearmonth', 'new_contributors', 'quarter'] - - input_df = filter_data(input_df, needed_columns) - - if len(input_df) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: input_df.loc[input_df['repo_id'] == repo_id].iloc[0]['repo_name']} - - drive_by_df, repeats_df = compute_fly_by_and_returning_contributors_dfs(input_df, required_contributions, - required_time, start_date) - - driver_df = pd.concat([drive_by_df, repeats_df, months_df]) - - # filter by end_date - mask = (driver_df['yearmonth'] < end_date) - driver_df = driver_df.loc[mask] - - # create df to hold data 
needed for chart - data = pd.DataFrame() - if group_by == 'year': - - # x-axis dates - data['dates'] = driver_df[group_by].unique() - - data['repeat_counts'] = \ - driver_df.loc[driver_df['type'] == 'repeat'].groupby(group_by).count().reset_index()['new_contributors'] - data['drive_by_counts'] = \ - driver_df.loc[driver_df['type'] == 'drive_by'].groupby(group_by).count().reset_index()[ - 'new_contributors'] - - # new contributor counts for all contributor counts - total_counts = [] - for i in range(0, len(data['drive_by_counts'])): - total_counts.append(data.iloc[i]['drive_by_counts'] + data.iloc[i]['repeat_counts']) - data['total_counts'] = total_counts - - # used to format x-axis and graph title - group_by_format_string = "Year" - - # font size of drive by and repeat labels - label_text_font_size = "14pt" - - elif group_by == 'quarter' or group_by == 'month': - - # set variables to group the data by quarter or month - if group_by == 'quarter': - date_column = 'quarter' - group_by_format_string = "Quarter" - - elif group_by == 'month': - date_column = 'yearmonth' - group_by_format_string = "Month" - - # modifies the driver_df[date_column] to be a string with year and month, then finds all the unique values - data['dates'] = np.unique(np.datetime_as_string(driver_df[date_column], unit='M')) - data['drive_by_counts'] = pd.concat([driver_df.loc[driver_df['type'] == 'drive_by'], months_df]).groupby( - date_column).sum().reset_index()['new_contributors'] - data['repeat_counts'] = pd.concat([driver_df.loc[driver_df['type'] == 'repeat'], months_df]).groupby( - date_column).sum().reset_index()['new_contributors'] - - # new contributor counts for all contributor types - total_counts = [] - for i in range(0, len(data['drive_by_counts'])): - total_counts.append(data.iloc[i]['drive_by_counts'] + data.iloc[i]['repeat_counts']) - data['total_counts'] = total_counts - - # font size of drive by and repeat labels - label_text_font_size = "13pt" - - data_source = {'Dates': 
data['dates'], - 'Fly By': data['drive_by_counts'], - 'Repeat': data['repeat_counts'], - 'All': data['total_counts']} - - groups = ["Fly By", "Repeat"] - - colors = ['#56B4E9', '#E69F00'] - - source = ColumnDataSource(data=data_source) - - # format title - title_text_font_size = 18 - - # if the data set is large enough it will dynamically assign the width, if the data set - # is too small it will by default set to 780 pixel so the title fits - if len(data['total_counts']) >= 13: - plot_width = 46 * len(data['total_counts']) + 210 - else: - plot_width = 780 - - p = figure(x_range=data['dates'], plot_height=500, plot_width=plot_width, - title="{}: Fly By and Repeat Contributor Counts per {}".format(repo_dict[repo_id], - group_by_format_string), - toolbar_location=None, y_range=(0, max(total_counts) * 1.15), margin=(0, 0, 0, 0)) - - vbar = p.vbar_stack(groups, x='Dates', width=0.8, color=colors, source=source) - - # add total counts above bars - p.add_layout(LabelSet(x='Dates', y='All', text='All', y_offset=8, text_font_size="14pt", - text_color="black", source=source, text_align='center')) - - # add drive by count labels - p.add_layout(LabelSet(x='Dates', y='Fly By', text='Fly By', y_offset=-22, text_font_size=label_text_font_size, - text_color="black", source=source, text_align='center')) - - # add repeat count labels - p.add_layout(LabelSet(x='Dates', y='All', text='Repeat', y_offset=-22, text_font_size=label_text_font_size, - text_color="black", source=source, text_align='center')) - - # add legend - legend = Legend(items=[(date, [group]) for (date, group) in zip(groups, vbar)], location=(0, 200), - label_text_font_size="16px") - p.add_layout(legend, 'right') - - p.xgrid.grid_line_color = None - p.y_range.start = 0 - p.axis.minor_tick_line_color = None - p.outline_line_color = None - - p.title.align = "center" - p.title.text_font_size = "{}px".format(title_text_font_size) - - p.yaxis.axis_label = '# Contributors' - p.xaxis.axis_label = group_by_format_string - - 
p.xaxis.axis_label_text_font_size = "18px" - p.yaxis.axis_label_text_font_size = "16px" - - p.xaxis.major_label_text_font_size = "16px" - p.xaxis.major_label_orientation = 45.0 - - p.yaxis.major_label_text_font_size = "16px" - - p.legend.label_text_font_size = "20px" - - plot = p - - caption = """This graph shows the number of new contributors in the specified time period, and indicates how - many were fly-by and repeat contributors. Fly by contributors are contributors who make less than the required - {0} contributions in {1} days. New contributors are individuals who make their first contribution in the - specified time period. Repeat contributors are contributors who have made {0} or more contributions in {1} - days and their first contribution is in the specified time period.""" - - caption_plot = add_caption_to_visualizations(caption, required_contributions, required_time, plot_width) - - # put graph and caption plot together into one grid - grid = gridplot([[plot], [caption_plot]]) - - filename = export_png(grid) - - return send_file(filename) diff --git a/augur/api/routes/pull_request_reports.py b/augur/api/routes/pull_request_reports.py deleted file mode 100644 index 13aea31e8d..0000000000 --- a/augur/api/routes/pull_request_reports.py +++ /dev/null @@ -1,1922 +0,0 @@ -# import psycopg2 -import pandas as pd -import sqlalchemy as salc -import numpy as np -import warnings -import datetime -import json -# from scipy import stats -from flask import request, send_file, Response, current_app -import math - -from bokeh.palettes import Colorblind, mpl, Category20 -from bokeh.layouts import gridplot, column -from bokeh.models.annotations import Title -from bokeh.io import export_png, show # get_screenshot_as_png -# from bokeh.io.export import get_screenshot_as_png -from bokeh.embed import json_item -from bokeh.models import ColumnDataSource, Legend, LabelSet, Range1d, Label, FactorRange, BasicTicker, ColorBar, \ - LinearColorMapper, PrintfTickFormatter -from 
bokeh.plotting import figure -from bokeh.models.glyphs import Rect -from bokeh.transform import dodge, factor_cmap, transform - -# from selenium.webdriver import Firefox, FirefoxOptions -# options = FirefoxOptions() -# options.headless = True -# webdriver = Firefox(options=options) -#export_png(item, path, webdriver=webdriver) - -warnings.filterwarnings('ignore') - -from augur.api.routes import AUGUR_API_VERSION -from ..server import app - -def pull_request_data_collection(repo_id, start_date, end_date): - - pr_query = salc.sql.text(f""" - SELECT - repo.repo_id AS repo_id, - pull_requests.pr_src_id AS pr_src_id, - repo.repo_name AS repo_name, - pr_src_author_association, - repo_groups.rg_name AS repo_group, - pull_requests.pr_src_state, - pull_requests.pr_merged_at, - pull_requests.pr_created_at AS pr_created_at, - pull_requests.pr_closed_at AS pr_closed_at, - date_part( 'year', pr_created_at :: DATE ) AS CREATED_YEAR, - date_part( 'month', pr_created_at :: DATE ) AS CREATED_MONTH, - date_part( 'year', pr_closed_at :: DATE ) AS CLOSED_YEAR, - date_part( 'month', pr_closed_at :: DATE ) AS CLOSED_MONTH, - pr_src_meta_label, - pr_head_or_base, - ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_close, - ( EXTRACT ( EPOCH FROM pull_requests.pr_closed_at ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_close, - ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_first_response, - ( EXTRACT ( EPOCH FROM first_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_first_response, - ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 3600 AS hours_to_last_response, - ( EXTRACT ( EPOCH FROM last_response_time ) - EXTRACT ( EPOCH FROM pull_requests.pr_created_at ) ) / 86400 AS days_to_last_response, - first_response_time, - 
last_response_time, - EXTRACT ( EPOCH FROM average_time_between_responses), - assigned_count, - review_requested_count, - labeled_count, - subscribed_count, - mentioned_count, - referenced_count, - closed_count, - head_ref_force_pushed_count, - merged_count::INT, - milestoned_count, - unlabeled_count, - head_ref_deleted_count, - comment_count, - COALESCE(lines_added, 0) as lines_added, - COALESCE(lines_removed, 0) as lines_removed, - commit_count, - COALESCE(file_count, 0) as file_count - FROM - repo, - repo_groups, - pull_requests LEFT OUTER JOIN ( - SELECT pull_requests.pull_request_id, - count(*) FILTER (WHERE action = 'assigned') AS assigned_count, - count(*) FILTER (WHERE action = 'review_requested') AS review_requested_count, - count(*) FILTER (WHERE action = 'labeled') AS labeled_count, - count(*) FILTER (WHERE action = 'unlabeled') AS unlabeled_count, - count(*) FILTER (WHERE action = 'subscribed') AS subscribed_count, - count(*) FILTER (WHERE action = 'mentioned') AS mentioned_count, - count(*) FILTER (WHERE action = 'referenced') AS referenced_count, - count(*) FILTER (WHERE action = 'closed') AS closed_count, - count(*) FILTER (WHERE action = 'head_ref_force_pushed') AS head_ref_force_pushed_count, - count(*) FILTER (WHERE action = 'head_ref_deleted') AS head_ref_deleted_count, - count(*) FILTER (WHERE action = 'milestoned') AS milestoned_count, - COALESCE(count(*) FILTER (WHERE action = 'merged'), 0) AS merged_count, - COALESCE(MIN(message.msg_timestamp), pull_requests.pr_merged_at, pull_requests.pr_closed_at) AS first_response_time, - COALESCE(COUNT(DISTINCT message.msg_timestamp), 0) AS comment_count, - COALESCE(MAX(message.msg_timestamp), pull_requests.pr_closed_at) AS last_response_time, - COALESCE((MAX(message.msg_timestamp) - MIN(message.msg_timestamp)) / COUNT(DISTINCT message.msg_timestamp), pull_requests.pr_created_at - pull_requests.pr_closed_at) AS average_time_between_responses - FROM pull_requests - LEFT OUTER JOIN pull_request_events on 
pull_requests.pull_request_id = pull_request_events.pull_request_id - JOIN repo on repo.repo_id = pull_requests.repo_id - LEFT OUTER JOIN pull_request_message_ref on pull_requests.pull_request_id = pull_request_message_ref.pull_request_id - LEFT OUTER JOIN message on pull_request_message_ref.msg_id = message.msg_id - WHERE repo.repo_id = {repo_id} - GROUP BY pull_requests.pull_request_id - ) response_times - ON pull_requests.pull_request_id = response_times.pull_request_id - LEFT JOIN ( - SELECT pull_request_commits.pull_request_id, count(DISTINCT pr_cmt_sha) AS commit_count - FROM pull_request_commits, pull_requests, pull_request_meta - WHERE pull_requests.pull_request_id = pull_request_commits.pull_request_id - AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} - AND pr_cmt_sha <> pull_requests.pr_merge_commit_sha - AND pr_cmt_sha <> pull_request_meta.pr_sha - GROUP BY pull_request_commits.pull_request_id - ) all_commit_counts - ON pull_requests.pull_request_id = all_commit_counts.pull_request_id - LEFT JOIN ( - SELECT MAX(pr_repo_meta_id), pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label - FROM pull_requests, pull_request_meta - WHERE pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = {repo_id} - AND pr_head_or_base = 'base' - GROUP BY pull_request_meta.pull_request_id, pr_head_or_base, pr_src_meta_label - ) base_labels - ON base_labels.pull_request_id = all_commit_counts.pull_request_id - LEFT JOIN ( - SELECT sum(cmt_added) AS lines_added, sum(cmt_removed) AS lines_removed, pull_request_commits.pull_request_id, count(DISTINCT cmt_filename) AS file_count - FROM pull_request_commits, commits, pull_requests, pull_request_meta - WHERE cmt_commit_hash = pr_cmt_sha - AND pull_requests.pull_request_id = pull_request_commits.pull_request_id - AND pull_requests.pull_request_id = pull_request_meta.pull_request_id - AND pull_requests.repo_id = 
{repo_id} - AND commits.repo_id = pull_requests.repo_id - AND commits.cmt_commit_hash <> pull_requests.pr_merge_commit_sha - AND commits.cmt_commit_hash <> pull_request_meta.pr_sha - GROUP BY pull_request_commits.pull_request_id - ) master_merged_counts - ON base_labels.pull_request_id = master_merged_counts.pull_request_id - WHERE - repo.repo_group_id = repo_groups.repo_group_id - AND repo.repo_id = pull_requests.repo_id - AND repo.repo_id = {repo_id} - ORDER BY - merged_count DESC - """) - - with current_app.engine.connect() as conn: - pr_all = pd.read_sql(pr_query, conn) - - pr_all[['assigned_count', - 'review_requested_count', - 'labeled_count', - 'subscribed_count', - 'mentioned_count', - 'referenced_count', - 'closed_count', - 'head_ref_force_pushed_count', - 'merged_count', - 'milestoned_count', - 'unlabeled_count', - 'head_ref_deleted_count', - 'comment_count', - 'commit_count', - 'file_count', - 'lines_added', - 'lines_removed' - ]] = pr_all[['assigned_count', - 'review_requested_count', - 'labeled_count', - 'subscribed_count', - 'mentioned_count', - 'referenced_count', - 'closed_count', - 'head_ref_force_pushed_count', - 'merged_count', - 'milestoned_count', - 'unlabeled_count', - 'head_ref_deleted_count', - 'comment_count', - 'commit_count', - 'file_count', - 'lines_added', - 'lines_removed' - ]].astype(float) - # Change years to int so that doesn't display as 2019.0 for example - pr_all[['created_year', 'closed_year']] = pr_all[['created_year', 'closed_year']].fillna(-1).astype(int).astype( - str) - - start_date = pd.to_datetime(start_date) - # end_date = pd.to_datetime('2020-02-01 09:00:00') - end_date = pd.to_datetime(end_date) - pr_all = pr_all[(pr_all['pr_created_at'] > start_date) & (pr_all['pr_closed_at'] < end_date)] - - pr_all['created_year'] = pr_all['created_year'].map(int) - pr_all['created_month'] = pr_all['created_month'].map(int) - pr_all['created_month'] = pr_all['created_month'].map(lambda x: '{0:0>2}'.format(x)) - 
pr_all['created_yearmonth'] = pd.to_datetime( - pr_all['created_year'].map(str) + '-' + pr_all['created_month'].map(str) + '-01') - - # getting the number of days of (today - created at) for the PRs that are still open - # and putting this in the days_to_close column - - # get timedeltas of creation time to todays date/time - days_to_close_open_pr = datetime.datetime.now() - pr_all.loc[pr_all['pr_src_state'] == 'open']['pr_created_at'] - - # get num days from above timedelta - days_to_close_open_pr = days_to_close_open_pr.apply(lambda x: x.days).astype(int) - - # for only OPEN pr's, set the days_to_close column equal to above dataframe - pr_all.loc[pr_all['pr_src_state'] == 'open'] = pr_all.loc[pr_all['pr_src_state'] == 'open'].assign( - days_to_close=days_to_close_open_pr) - - pr_all.loc[pr_all['pr_src_state'] == 'open'].head() - - # initiate column by setting all null datetimes - pr_all['closed_yearmonth'] = pd.to_datetime(np.nan) - - # Fill column with prettified string of year/month closed that looks like: 2019-07-01 - pr_all.loc[pr_all['pr_src_state'] == 'closed'] = pr_all.loc[pr_all['pr_src_state'] == 'closed'].assign( - closed_yearmonth=pd.to_datetime(pr_all.loc[pr_all['pr_src_state'] == 'closed']['closed_year'].astype(int - ).map( - str) + '-' + pr_all.loc[pr_all['pr_src_state'] == 'closed']['closed_month'].astype(int).map( - str) + '-01')) - - """ Merged flag """ - if 'pr_merged_at' in pr_all.columns.values: - pr_all['pr_merged_at'] = pr_all['pr_merged_at'].fillna(0) - pr_all['merged_flag'] = 'Not Merged / Rejected' - pr_all['merged_flag'].loc[pr_all['pr_merged_at'] != 0] = 'Merged / Accepted' - pr_all['merged_flag'].loc[pr_all['pr_src_state'] == 'open'] = 'Still Open' - del pr_all['pr_merged_at'] - - # Isolate the different state PRs for now - pr_open = pr_all.loc[pr_all['pr_src_state'] == 'open'] - pr_closed = pr_all.loc[pr_all['pr_src_state'] == 'closed'] - pr_merged = pr_all.loc[pr_all['merged_flag'] == 'Merged / Accepted'] - pr_not_merged = 
pr_all.loc[pr_all['merged_flag'] == 'Not Merged / Rejected'] - - # Filtering the 80th percentile slowest PRs - def filter_20_per_slowest(input_df): - pr_slow20_filtered = pd.DataFrame() - pr_slow20_x = pd.DataFrame() - pr_slow20_filtered = input_df.copy() - pr_slow20_filtered['percentile_rank_local'] = pr_slow20_filtered.days_to_close.rank(pct=True) - pr_slow20_filtered = pr_slow20_filtered.query('percentile_rank_local >= .8', ) - - return pr_slow20_filtered - - pr_slow20_open = filter_20_per_slowest(pr_open) - pr_slow20_closed = filter_20_per_slowest(pr_closed) - pr_slow20_merged = filter_20_per_slowest(pr_merged) - pr_slow20_not_merged = filter_20_per_slowest(pr_not_merged) - pr_slow20_all = filter_20_per_slowest(pr_all) - - return pr_all, pr_open, pr_closed, pr_merged, pr_not_merged, pr_slow20_all, pr_slow20_open, pr_slow20_closed, pr_slow20_merged, pr_slow20_not_merged - -def remove_outliers(input_df, field, num_outliers_repo_map): - df_no_outliers = input_df.copy() - for repo_name, num_outliers in num_outliers_repo_map.items(): - indices_to_drop = input_df.loc[input_df['repo_name'] == repo_name].nlargest(num_outliers, field).index - df_no_outliers = df_no_outliers.drop(index=indices_to_drop) - return df_no_outliers - -def remove_outliers_by_standard_deviation(input_df, column): - '''Takes a dataframe and a numeric column name. - Then removes all rows thare are than 3 standard deviations from the mean. 
- Returns a df without outliers, the # of outliers removed, outlier cutoff value''' - - # finds rows that are more than 3 standard deviations from the mean - outlier_cutoff = input_df[column].mean() + (3 * input_df[column].std()) - outlier_mask = input_df[column] > outlier_cutoff - - # determine number of outliers - outliers_removed = len(input_df.loc[outlier_mask]) - - df_no_outliers = input_df.loc[~outlier_mask] - - return df_no_outliers, outliers_removed, outlier_cutoff - -def hex_to_RGB(hex): - ''' "#FFFFFF" -> [255,255,255] ''' - # Pass 16 to the integer function for change of base - return [int(hex[i:i + 2], 16) for i in range(1, 6, 2)] - -def color_dict(gradient): - ''' Takes in a list of RGB sub-lists and returns dictionary of - colors in RGB and hex form for use in a graphing function - defined later on ''' - return {"hex": [RGB_to_hex(RGB) for RGB in gradient], - "r": [RGB[0] for RGB in gradient], - "g": [RGB[1] for RGB in gradient], - "b": [RGB[2] for RGB in gradient]} - -def RGB_to_hex(RGB): - ''' [255,255,255] -> "#FFFFFF" ''' - # Components need to be integers for hex to make sense - RGB = [int(x) for x in RGB] - return "#" + "".join(["0{0:x}".format(v) if v < 16 else - "{0:x}".format(v) for v in RGB]) - -def linear_gradient(start_hex, finish_hex="#FFFFFF", n=10): - ''' returns a gradient list of (n) colors between - two hex colors. 
start_hex and finish_hex - should be the full six-digit color string, - inlcuding the number sign ("#FFFFFF") ''' - # Starting and ending colors in RGB form - s = hex_to_RGB(start_hex) - f = hex_to_RGB(finish_hex) - # Initilize a list of the output colors with the starting color - RGB_list = [s] - # Calcuate a color at each evenly spaced value of t from 1 to n - for t in range(1, n): - # Interpolate RGB vector for color at the current value of t - curr_vector = [ - int(s[j] + (float(t) / (n - 1)) * (f[j] - s[j])) - for j in range(3) - ] - # Add it to our list of output colors - RGB_list.append(curr_vector) - - return color_dict(RGB_list) - -# dict of df types, and their locaiton in the tuple that the function pull_request_data_collection returns -def get_df_tuple_locations(): - return {"pr_all": 0, "pr_open": 1, "pr_closed": 2, "pr_merged": 3, "pr_not_merged": 4, "pr_slow20_all": 5, - "pr_slow20_open": 6, "pr_slow20_closed": 7, "pr_slow20_merged": 8, "pr_slow20_not_merged": 9} - -def add_caption_to_plot(caption_plot, caption): - - caption_plot.add_layout(Label( - x=0, # Change to shift caption left or right - y=160, - x_units='screen', - y_units='screen', - text='{}'.format(caption), - text_font='times', # Use same font as paper - text_font_size='15pt', - render_mode='css' - )) - caption_plot.outline_line_color = None - - return caption_plot - -def remove_rows_with_null_values(df, not_null_columns=[]): - """Remove null data from pandas df - - Parameters - -- df - description: the dataframe that will be modified - type: Pandas Dataframe - - -- list_of_columns - description: columns that are searched for NULL values - type: list - default: [] (means all columns will be checked for NULL values) - IMPORTANT: if an empty list is passed or nothing is passed it will check all columns for NULL values - - Return Value - -- Modified Pandas Dataframe - """ - - if len(not_null_columns) == 0: - not_null_columns = df.columns.to_list() - - total_rows_removed = 0 - for col in 
not_null_columns: - rows_removed = len(df.loc[df[col].isnull()]) - #rows_removed = len(df.loc[df[col].isnull() is True]) - - if rows_removed > 0: - print(f"{rows_removed} rows have been removed because of null values in column {col}") - total_rows_removed += rows_removed - - df = df.loc[df[col].isnull() is False] - - if total_rows_removed > 0: - print(f"\nTotal rows removed because of null data: {total_rows_removed}"); - else: - print("No null data found") - - return df - -def get_needed_columns(df, list_of_columns): - """Get only a specific list of columns from a Pandas Dataframe - - Parameters - -- df - description: the dataframe that will be modified - type: Pandas Dataframe - - -- list_of_columns - description: columns that will be kept in dataframe - type: list - - Return Value - -- Modified Pandas Dataframe - """ - return df[list_of_columns] - -def filter_data(df, needed_columns, not_null_columns=[]): - """Filters out the unneeded rows in the df, and removed NULL data from df - - Parameters - -- df - description: the dataframe that will be modified - type: Pandas Dataframe - - -- needed_columns - description: the columns to keep in the dataframe - - -- not_null_columns - description: columns that will be searched for NULL data, - if NULL values are found those rows will be removed - default: [] (means all columns in needed_columns list will be checked for NULL values) - IMPORTANT: if an empty list is passed or nothing is passed it will check - all columns in needed_columns list for NULL values - Return Value - -- Modified Pandas Dataframe - """ - - if all(x in needed_columns for x in not_null_columns): - - df = get_needed_columns(df, needed_columns) - #Use the pandas method bc the other method was erroring on boolean index. 
- #IM - 9/23/22 - df = df.dropna(subset=not_null_columns)#remove_rows_with_null_values(df, not_null_columns) - - return df - else: - print("Developer error, not null columns should be a subset of needed columns") - return df - -def get_repo_id_start_date_and_end_date(): - - """ Gets the repo_id, start_date, and end_date from the GET requests array - - :return: repo_id - id of the repo data is being retrieved for - :return: start_date - earliest time on visualization. Defaults to the January 1st of last year - :return: end_date - latest time on visualization. Defaults to current date - """ - - now = datetime.datetime.now() - - repo_id = request.args.get('repo_id') - start_date = str(request.args.get('start_date', "{}-01-01".format(now.year - 1))) - end_date = str(request.args.get('end_date', "{}-{}-{}".format(now.year, now.month, now.day))) - - if repo_id: - - if start_date < end_date: - return int(repo_id), start_date, end_date, None - else: - - error = { - "message": "Invalid end_date. end_date is before the start_date", - "status_code": 400 - } - - return int(repo_id), None, None, error - - else: - error = { - "message": "repo_id not specified. 
Use this endpoint to get a list of available repos: http:///api/unstable/repos", - "status_code": 400 - } - return None, None, None, error - -@app.route('/{}/pull_request_reports/average_commits_per_PR/'.format(AUGUR_API_VERSION), methods=["GET"]) -def average_commits_per_PR(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - group_by = str(request.args.get('group_by', "month")) - return_json = request.args.get('return_json', "false") - - df_type = get_df_tuple_locations() - - df_tuple = pull_request_data_collection(repo_id=repo_id, start_date=start_date, end_date=end_date) - - y_axis = 'num_commits' - group_by_bars = 'merged_flag' - description = 'All' - - # gets pr_all data - # selects only need columns (pr_closed_needed_columns) - # removes columns that cannot be NULL (pr_closed_not_null_columns) - input_df = df_tuple[df_type["pr_all"]] - needed_columns = ['repo_id', 'repo_name', 'closed_year', 'closed_yearmonth', group_by_bars, 'commit_count'] - input_df = filter_data(input_df, needed_columns) - - if len(input_df) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - # print(input_df.to_string()) - - repo_dict = {repo_id: input_df.loc[input_df['repo_id'] == repo_id].iloc[0]['repo_name']} - - driver_df = input_df.copy() # deep copy input data so we do not change the external dataframe - - # Change closed year to int so that doesn't display as 2019.0 for example - driver_df['closed_year'] = driver_df['closed_year'].astype(int).astype(str) - - # defaults to year - x_axis = 'closed_year' - x_groups = sorted(list(driver_df[x_axis].unique())) - - if group_by == 'month': - x_axis = "closed_yearmonth" - x_groups = np.unique(np.datetime_as_string(input_df[x_axis], unit='M')) - - # inner groups on x_axis they are merged 
and not_merged - groups = list(driver_df[group_by_bars].unique()) - - # setup color pallete - try: - colors = mpl['Plasma'][len(groups)] - except: - colors = [mpl['Plasma'][3][0]] + [mpl['Plasma'][3][1]] - - merged_avg_values = list(driver_df.loc[driver_df[group_by_bars] == 'Merged / Accepted'].groupby([x_axis], - as_index=False).mean().round( - 1)['commit_count']) - not_merged_avg_values = list( - driver_df.loc[driver_df[group_by_bars] == 'Not Merged / Rejected'].groupby([x_axis], - as_index=False).mean().round(1)[ - 'commit_count']) - - # Setup data in format for grouped bar chart - data = { - 'years': x_groups, - 'Merged / Accepted': merged_avg_values, - 'Not Merged / Rejected': not_merged_avg_values, - } - - x = [(year, pr_state) for year in x_groups for pr_state in groups] - counts = sum(zip(data['Merged / Accepted'], data['Not Merged / Rejected']), ()) - - source = ColumnDataSource(data=dict(x=x, counts=counts)) - - title_beginning = '{}: '.format(repo_dict[repo_id]) - title = "{}Average Commit Counts Per Year for {} Pull Requests".format(title_beginning, description) - - plot_width = len(x_groups) * 300 - title_text_font_size = 16 - - if (len(title) * title_text_font_size / 2) > plot_width: - plot_width = int(len(title) * title_text_font_size / 2) + 40 - - p = figure(x_range=FactorRange(*x), plot_height=450, plot_width=plot_width, title=title, - y_range=(0, max(merged_avg_values + not_merged_avg_values) * 1.15), toolbar_location=None) - - # Vertical bar glyph - p.vbar(x='x', top='counts', width=0.9, source=source, line_color="white", - fill_color=factor_cmap('x', palette=colors, factors=groups, start=1, end=2)) - - # Data label - labels = LabelSet(x='x', y='counts', text='counts', # y_offset=-8, x_offset=34, - text_font_size="12pt", text_color="black", - source=source, text_align='center') - p.add_layout(labels) - - p.y_range.start = 0 - p.x_range.range_padding = 0.1 - p.xaxis.major_label_orientation = 1 - p.xgrid.grid_line_color = None - - 
p.yaxis.axis_label = 'Average Commits / Pull Request' - p.xaxis.axis_label = 'Year Closed' - - p.title.align = "center" - p.title.text_font_size = "{}px".format(title_text_font_size) - - p.xaxis.axis_label_text_font_size = "16px" - p.xaxis.major_label_text_font_size = "15px" - - p.yaxis.axis_label_text_font_size = "15px" - p.yaxis.major_label_text_font_size = "15px" - - plot = p - - p = figure(width=plot_width, height=200, margin=(0, 0, 0, 0)) - caption = "This graph shows the average commits per pull requests over an entire year," \ - " for merged and not merged pull requests." - p = add_caption_to_plot(p, caption) - - caption_plot = p - - grid = gridplot([[plot], [caption_plot]]) - - if return_json == "true": - var = Response(response=json.dumps(json_item(grid, "average_commits_per_PR")), - mimetype='application/json', - status=200) - - var.headers["Access-Control-Allow-Orgin"] = "*" - - return var - - # opts = FirefoxOptions() - # opts.add_argument("--headless") - # driver = webdriver.Firefox(firefox_options=opts) - # filename = export_png(grid, timeout=180, webdriver=webdriver) - filename = export_png(grid, timeout=180) - - return send_file(filename) - -@app.route('/{}/pull_request_reports/average_comments_per_PR/'.format(AUGUR_API_VERSION), methods=["GET"]) -def average_comments_per_PR(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - return_json = request.args.get('return_json', "false") - - df_type = get_df_tuple_locations() - - df_tuple = pull_request_data_collection(repo_id=repo_id, start_date=start_date, end_date=end_date) - - group_by = 'merged_flag' - x_axis = 'comment_count' - description = "All Closed" - y_axis = 'closed_year' - - # gets pr_closed data - # selects only need columns (pr_closed_needed_columns) - # removes columns that cannot be NULL (pr_closed_not_null_columns) - input_df = 
df_tuple[df_type["pr_closed"]] - needed_columns = ['repo_id', 'repo_name', y_axis, group_by, x_axis] - not_null_columns = needed_columns - input_df = filter_data(input_df, needed_columns) - - if len(input_df) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: input_df.loc[input_df['repo_id'] == repo_id].iloc[0]['repo_name']} - - driver_df = input_df.copy() - - try: - y_groups = sorted(list(driver_df[y_axis].unique())) - except: - y_groups = [repo_id] - - groups = driver_df[group_by].unique() - try: - colors = mpl['Plasma'][len(groups)] - except: - colors = [mpl['Plasma'][3][0]] + [mpl['Plasma'][3][1]] - - len_not_merged = len(driver_df.loc[driver_df['merged_flag'] == 'Not Merged / Rejected']) - len_merged = len(driver_df.loc[driver_df['merged_flag'] == 'Merged / Accepted']) - - title_beginning = '{}: '.format(repo_dict[repo_id]) - plot_width = 650 - p = figure(y_range=y_groups, plot_height=450, plot_width=plot_width, - # y_range=y_groups,#(pr_all[y_axis].min(),pr_all[y_axis].max()) #y_axis_type="datetime", - title='{} {}'.format(title_beginning, "Mean Comments for {} Pull Requests".format(description)), - toolbar_location=None) - - possible_maximums = [] - for y_value in y_groups: - - y_merged_data = driver_df.loc[ - (driver_df[y_axis] == y_value) & (driver_df['merged_flag'] == 'Merged / Accepted')] - y_not_merged_data = driver_df.loc[ - (driver_df[y_axis] == y_value) & (driver_df['merged_flag'] == 'Not Merged / Rejected')] - - if len(y_merged_data) > 0: - y_merged_data_mean = y_merged_data[x_axis].mean() - - if (math.isnan(y_merged_data_mean)): - return Response( - response="There is no message data for this repo, in the database you are accessing", - mimetype='application/json', status=200) - else: - y_merged_data[x_axis + '_mean'] = y_merged_data_mean.round(1) - - else: - y_merged_data[x_axis + '_mean'] = 0 - - if len(y_not_merged_data) > 0: 
- y_not_merged_data_mean = y_not_merged_data[x_axis].mean() - - if math.isnan(y_not_merged_data_mean): - return Response( - response="There is no message data for this repo, in the database you are accessing", - mimetype='application/json', status=200) - else: - y_not_merged_data[x_axis + '_mean'] = y_not_merged_data_mean.round(1) - - else: - y_not_merged_data[x_axis + '_mean'] = 0 - - not_merged_source = ColumnDataSource(y_not_merged_data) - merged_source = ColumnDataSource(y_merged_data) - - possible_maximums.append(max(y_not_merged_data[x_axis + '_mean'])) - possible_maximums.append(max(y_merged_data[x_axis + '_mean'])) - - # mean comment count for merged - merged_comment_count_glyph = p.hbar(y=dodge(y_axis, -0.1, range=p.y_range), left=0, right=x_axis + '_mean', - height=0.04 * len(driver_df[y_axis].unique()), - source=merged_source, - fill_color="black") # ,legend_label="Mean Days to Close", - # Data label - labels = LabelSet(x=x_axis + '_mean', y=dodge(y_axis, -0.1, range=p.y_range), text=x_axis + '_mean', - y_offset=-8, x_offset=34, - text_font_size="12pt", text_color="black", - source=merged_source, text_align='center') - p.add_layout(labels) - # mean comment count For nonmerged - not_merged_comment_count_glyph = p.hbar(y=dodge(y_axis, 0.1, range=p.y_range), left=0, - right=x_axis + '_mean', - height=0.04 * len(driver_df[y_axis].unique()), - source=not_merged_source, - fill_color="#e84d60") # legend_label="Mean Days to Close", - # Data label - labels = LabelSet(x=x_axis + '_mean', y=dodge(y_axis, 0.1, range=p.y_range), text=x_axis + '_mean', - y_offset=-8, x_offset=34, - text_font_size="12pt", text_color="#e84d60", - source=not_merged_source, text_align='center') - p.add_layout(labels) - - # p.y_range.range_padding = 0.1 - p.ygrid.grid_line_color = None - p.legend.location = "bottom_right" - p.axis.minor_tick_line_color = None - p.outline_line_color = None - p.xaxis.axis_label = 'Average Comments / Pull Request' - p.yaxis.axis_label = 'Repository' if y_axis 
== 'repo_name' else 'Year Closed' if y_axis == 'closed_year' else '' - - legend = Legend( - items=[ - ("Merged Pull Request Mean Comment Count", [merged_comment_count_glyph]), - ("Rejected Pull Request Mean Comment Count", [not_merged_comment_count_glyph]) - ], - - location='center', - orientation='vertical', - border_line_color="black" - ) - p.add_layout(legend, "below") - - p.title.text_font_size = "16px" - p.title.align = "center" - - p.xaxis.axis_label_text_font_size = "16px" - p.xaxis.major_label_text_font_size = "16px" - - p.yaxis.axis_label_text_font_size = "16px" - p.yaxis.major_label_text_font_size = "16px" - - p.x_range = Range1d(0, max(possible_maximums) * 1.15) - - plot = p - - p = figure(width=plot_width, height=200, margin=(0, 0, 0, 0)) - caption = "This graph shows the average number of comments per merged or not merged pull request." - - p = add_caption_to_plot(p, caption) - - caption_plot = p - - grid = gridplot([[plot], [caption_plot]]) - - if return_json == "true": - var = Response(response=json.dumps(json_item(grid, "average_comments_per_PR")), - mimetype='application/json', - status=200) - - var.headers["Access-Control-Allow-Orgin"] = "*" - - return var - - # opts = FirefoxOptions() - # opts.add_argument("--headless") - # driver = webdriver.Firefox(firefox_options=opts) - filename = export_png(grid, timeout=180) - - return send_file(filename) - -@app.route('/{}/pull_request_reports/PR_counts_by_merged_status/'.format(AUGUR_API_VERSION), - methods=["GET"]) -def PR_counts_by_merged_status(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - return_json = request.args.get('return_json', "false") - - x_axis = 'closed_year' - description = 'All Closed' - - df_type = get_df_tuple_locations() - - df_tuple = pull_request_data_collection(repo_id=repo_id, start_date=start_date, end_date=end_date) - - 
# gets pr_closed data - # selects only need columns (pr_closed_needed_columns) - # removes columns that cannot be NULL (pr_closed_not_null_columns) - pr_closed = df_tuple[df_type["pr_closed"]] - pr_closed_needed_columns = ['repo_id', 'repo_name', x_axis, 'merged_flag'] - pr_closed = filter_data(pr_closed, pr_closed_needed_columns) - - # gets pr_slow20_not_merged data - # selects only need columns (pr_slow20_not_merged_needed_columns) - # removes columns that cannot be NULL (pr_slow20_not_merged_not_null_columns) - pr_slow20_not_merged = df_tuple[df_type["pr_slow20_not_merged"]] - pr_slow20_not_merged_needed_columns = ['repo_id', 'repo_name', x_axis, 'merged_flag'] - pr_slow20_not_merged = filter_data(pr_slow20_not_merged, pr_slow20_not_merged_needed_columns,) - - # gets pr_slow20_merged data - # selects only need columns (pr_slow20_not_merged_needed_columns) - # removes columns that cannot be NULL (pr_slow20_not_merged_not_null_columns) - pr_slow20_merged = df_tuple[df_type["pr_slow20_merged"]] - pr_slow20_merged_needed_columns = ['repo_id', 'repo_name', x_axis, 'merged_flag'] - pr_slow20_merged = filter_data(pr_slow20_merged, pr_slow20_merged_needed_columns) - - if len(pr_closed) == 0 or len(pr_slow20_not_merged) == 0 or len(pr_slow20_merged) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: pr_closed.loc[pr_closed['repo_id'] == repo_id].iloc[0]['repo_name']} - - data_dict = {'All': pr_closed, 'Slowest 20%': pr_slow20_not_merged.append(pr_slow20_merged, ignore_index=True)} - - colors = mpl['Plasma'][6] - - for data_desc, input_df in data_dict.items(): - x_groups = sorted(list(input_df[x_axis].astype(str).unique())) - break - - plot_width = 315 * len(x_groups) - - if plot_width < 900: - plot_width = 900 - title_beginning = repo_dict[repo_id] - p = figure(x_range=x_groups, plot_height=350, plot_width=plot_width, - title='{}: 
{}'.format(title_beginning, - "Count of {} Pull Requests by Merged Status".format(description)), - toolbar_location=None) - - dodge_amount = 0.12 - color_index = 0 - x_offset = 60 - - all_totals = [] - for data_desc, input_df in data_dict.items(): - driver_df = input_df.copy() - - driver_df[x_axis] = driver_df[x_axis].astype(str) - - groups = sorted(list(driver_df['merged_flag'].unique())) - - driver_df = driver_df.loc[driver_df['repo_id'] == repo_id] - - len_merged = [] - zeros = [] - len_not_merged = [] - totals = [] - - for x_group in x_groups: - len_merged_entry = len( - driver_df.loc[(driver_df['merged_flag'] == 'Merged / Accepted') & (driver_df[x_axis] == x_group)]) - totals += [len(driver_df.loc[(driver_df['merged_flag'] == 'Not Merged / Rejected') & ( - driver_df[x_axis] == x_group)]) + len_merged_entry] - len_not_merged += [len(driver_df.loc[(driver_df['merged_flag'] == 'Not Merged / Rejected') & ( - driver_df[x_axis] == x_group)])] - len_merged += [len_merged_entry] - zeros.append(0) - - data = {'X': x_groups} - for group in groups: - data[group] = [] - for x_group in x_groups: - data[group] += [ - len(driver_df.loc[(driver_df['merged_flag'] == group) & (driver_df[x_axis] == x_group)])] - - data['len_merged'] = len_merged - data['len_not_merged'] = len_not_merged - data['totals'] = totals - data['zeros'] = zeros - - if data_desc == "All": - all_totals = totals - - source = ColumnDataSource(data) - - stacked_bar = p.vbar_stack(groups, x=dodge('X', dodge_amount, range=p.x_range), width=0.2, source=source, - color=colors[1:3], legend_label=[f"{data_desc} " + "%s" % x for x in groups]) - # Data label for merged - - p.add_layout( - LabelSet(x=dodge('X', dodge_amount, range=p.x_range), y='zeros', text='len_merged', y_offset=2, - x_offset=x_offset, - text_font_size="12pt", text_color=colors[1:3][0], - source=source, text_align='center') - ) - if min(data['totals']) < 400: - y_offset = 15 - else: - y_offset = 0 - # Data label for not merged - p.add_layout( - 
LabelSet(x=dodge('X', dodge_amount, range=p.x_range), y='totals', text='len_not_merged', - y_offset=y_offset, x_offset=x_offset, - text_font_size="12pt", text_color=colors[1:3][1], - source=source, text_align='center') - ) - # Data label for total - p.add_layout( - LabelSet(x=dodge('X', dodge_amount, range=p.x_range), y='totals', text='totals', y_offset=0, x_offset=0, - text_font_size="12pt", text_color='black', - source=source, text_align='center') - ) - dodge_amount *= -1 - colors = colors[::-1] - x_offset *= -1 - - p.y_range = Range1d(0, max(all_totals) * 1.4) - - p.xgrid.grid_line_color = None - p.legend.location = "top_center" - p.legend.orientation = "horizontal" - p.axis.minor_tick_line_color = None - p.outline_line_color = None - p.yaxis.axis_label = 'Count of Pull Requests' - p.xaxis.axis_label = 'Repository' if x_axis == 'repo_name' else 'Year Closed' if x_axis == 'closed_year' else '' - - p.title.align = "center" - p.title.text_font_size = "16px" - - p.xaxis.axis_label_text_font_size = "16px" - p.xaxis.major_label_text_font_size = "16px" - - p.yaxis.axis_label_text_font_size = "16px" - p.yaxis.major_label_text_font_size = "16px" - - p.outline_line_color = None - - plot = p - - p = figure(width=plot_width, height=200, margin=(0, 0, 0, 0)) - caption = "This graph shows the number of closed pull requests per year in " \ - "four different categories. These four categories are All Merged, All Not Merged," \ - " Slowest 20% Merged, and Slowest 20% Not Merged." 
- p = add_caption_to_plot(p, caption) - - caption_plot = p - - grid = gridplot([[plot], [caption_plot]]) - - if return_json == "true": - var = Response(response=json.dumps(json_item(grid, "PR_counts_by_merged_status")), - mimetype='application/json', - status=200) - - var.headers["Access-Control-Allow-Orgin"] = "*" - - return var - - # opts = FirefoxOptions() - # opts.add_argument("--headless") - # driver = webdriver.Firefox(firefox_options=opts) - filename = export_png(grid, timeout=180) - - return send_file(filename) - -@app.route('/{}/pull_request_reports/mean_response_times_for_PR/'.format(AUGUR_API_VERSION), - methods=["GET"]) -def mean_response_times_for_PR(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - return_json = request.args.get('return_json', "false") - - df_type = get_df_tuple_locations() - - df_tuple = pull_request_data_collection(repo_id=repo_id, start_date=start_date, end_date=end_date) - - time_unit = 'days' - x_max = 95 - y_axis = 'closed_year' - description = "All Closed" - legend_position = (410, 10) - - # gets pr_closed data - # selects only need columns (pr_closed_needed_columns) - # removes columns that cannot be NULL (pr_closed_not_null_columns) - input_df = df_tuple[df_type["pr_closed"]] - needed_columns = ['repo_id', 'repo_name', y_axis, 'merged_flag', time_unit + '_to_first_response', - time_unit + '_to_last_response', time_unit + '_to_close'] - input_df = filter_data(input_df, needed_columns) - - if len(input_df) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: input_df.loc[input_df['repo_id'] == repo_id].iloc[0]['repo_name']} - - driver_df = input_df.copy() # deep copy input data so we do not alter the external dataframe - - title_beginning = '{}: 
'.format(repo_dict[repo_id]) - plot_width = 950 - p = figure(toolbar_location=None, y_range=sorted(driver_df[y_axis].unique()), plot_width=plot_width, - plot_height=450, # 75*len(driver_df[y_axis].unique()), - title="{}Mean Response Times for Pull Requests {}".format(title_beginning, description)) - - first_response_glyphs = [] - last_response_glyphs = [] - merged_days_to_close_glyphs = [] - not_merged_days_to_close_glyphs = [] - - possible_maximums = [] - - # FIXME repo_set is not defined - # setup color pallete - try: - colors = Colorblind[len(repo_set)] - except: - colors = Colorblind[3] - - y_merged_data_list = [] - y_not_merged_data_list = [] - - # calculate data frist time to obtain the maximum and make sure there is message data - for y_value in driver_df[y_axis].unique(): - - y_merged_data = driver_df.loc[ - (driver_df[y_axis] == y_value) & (driver_df['merged_flag'] == 'Merged / Accepted')] - y_not_merged_data = driver_df.loc[ - (driver_df[y_axis] == y_value) & (driver_df['merged_flag'] == 'Not Merged / Rejected')] - - if len(y_merged_data) > 0: - - y_merged_data_first_response_mean = y_merged_data[time_unit + '_to_first_response'].mean() - y_merged_data_last_response_mean = y_merged_data[time_unit + '_to_last_response'].mean() - y_merged_data_to_close_mean = y_merged_data[time_unit + '_to_close'].mean() - - if (math.isnan(y_merged_data_first_response_mean) or math.isnan( - y_merged_data_last_response_mean) or math.isnan(y_merged_data_to_close_mean)): - return Response( - response="There is no message data for this repo, in the database you are accessing", - mimetype='application/json', status=200) - else: - y_merged_data[time_unit + '_to_first_response_mean'] = y_merged_data_first_response_mean.round(1) - y_merged_data[time_unit + '_to_last_response_mean'] = y_merged_data_last_response_mean.round(1) - y_merged_data[time_unit + '_to_close_mean'] = y_merged_data_to_close_mean.round(1) - else: - y_merged_data[time_unit + '_to_first_response_mean'] = 0.00 - 
y_merged_data[time_unit + '_to_last_response_mean'] = 0.00 - y_merged_data[time_unit + '_to_close_mean'] = 0.00 - - if len(y_not_merged_data) > 0: - - y_not_merged_data_first_response_mean = y_not_merged_data[time_unit + '_to_first_response'].mean() - y_not_merged_data_last_response_mean = y_not_merged_data[time_unit + '_to_last_response'].mean() - y_not_merged_data_to_close_mean = y_not_merged_data[time_unit + '_to_close'].mean() - - if (math.isnan(y_not_merged_data_first_response_mean) or math.isnan( - y_not_merged_data_last_response_mean) or math.isnan(y_not_merged_data_to_close_mean)): - return Response( - response="There is no message data for this repo, in the database you are accessing", - mimetype='application/json', status=200) - else: - y_not_merged_data[ - time_unit + '_to_first_response_mean'] = y_not_merged_data_first_response_mean.round(1) - y_not_merged_data[ - time_unit + '_to_last_response_mean'] = y_not_merged_data_last_response_mean.round(1) - y_not_merged_data[time_unit + '_to_close_mean'] = y_not_merged_data_to_close_mean.round(1) - else: - y_not_merged_data[time_unit + '_to_first_response_mean'] = 0.00 - y_not_merged_data[time_unit + '_to_last_response_mean'] = 0.00 - y_not_merged_data[time_unit + '_to_close_mean'] = 0.00 - - possible_maximums.append(max(y_merged_data[time_unit + '_to_close_mean'])) - possible_maximums.append(max(y_not_merged_data[time_unit + '_to_close_mean'])) - - maximum = max(possible_maximums) * 1.15 - ideal_difference = maximum * 0.064 - - y_merged_data_list.append(y_merged_data) - y_not_merged_data_list.append(y_not_merged_data) - - # loop through data and add it to the plot - for index in range(0, len(y_merged_data_list)): - - y_merged_data = y_merged_data_list[index] - y_not_merged_data = y_not_merged_data_list[index] - - not_merged_source = ColumnDataSource(y_not_merged_data) - merged_source = ColumnDataSource(y_merged_data) - - # mean PR length for merged - merged_days_to_close_glyph = p.hbar(y=dodge(y_axis, -0.1, 
range=p.y_range), left=0, - right=time_unit + '_to_close_mean', - height=0.04 * len(driver_df[y_axis].unique()), - source=merged_source, - fill_color="black") # ,legend_label="Mean Days to Close", - merged_days_to_close_glyphs.append(merged_days_to_close_glyph) - # Data label - labels = LabelSet(x=time_unit + '_to_close_mean', y=dodge(y_axis, -0.1, range=p.y_range), - text=time_unit + '_to_close_mean', y_offset=-8, x_offset=34, # 34 - text_font_size="12pt", text_color="black", - source=merged_source, text_align='center') - p.add_layout(labels) - - # mean PR length For nonmerged - not_merged_days_to_close_glyph = p.hbar(y=dodge(y_axis, 0.1, range=p.y_range), left=0, - right=time_unit + '_to_close_mean', - height=0.04 * len(driver_df[y_axis].unique()), - source=not_merged_source, - fill_color="#e84d60") # legend_label="Mean Days to Close", - not_merged_days_to_close_glyphs.append(not_merged_days_to_close_glyph) - # Data label - labels = LabelSet(x=time_unit + '_to_close_mean', y=dodge(y_axis, 0.1, range=p.y_range), - text=time_unit + '_to_close_mean', y_offset=-8, x_offset=44, - text_font_size="12pt", text_color="#e84d60", - source=not_merged_source, text_align='center') - p.add_layout(labels) - - # if the difference between two values is less than 6.4 percent move the second one to the right 30 pixels - if (max(y_merged_data[time_unit + '_to_last_response_mean']) - max( - y_merged_data[time_unit + '_to_first_response_mean'])) < ideal_difference: - merged_x_offset = 30 - else: - merged_x_offset = 0 - - # if the difference between two values is less than 6.4 percent move the second one to the right 30 pixels - if (max(y_not_merged_data[time_unit + '_to_last_response_mean']) - max( - y_not_merged_data[time_unit + '_to_first_response_mean'])) < ideal_difference: - not_merged_x_offset = 30 - else: - not_merged_x_offset = 0 - - # if there is only one bar set the y_offsets so the labels will not overlap the bars - if len(driver_df[y_axis].unique()) == 1: - merged_y_offset 
= -65 - not_merged_y_offset = 45 - else: - merged_y_offset = -45 - not_merged_y_offset = 25 - - # mean time to first response - glyph = Rect(x=time_unit + '_to_first_response_mean', y=dodge(y_axis, -0.1, range=p.y_range), - width=x_max / 100, height=0.08 * len(driver_df[y_axis].unique()), fill_color=colors[0]) - first_response_glyph = p.add_glyph(merged_source, glyph) - first_response_glyphs.append(first_response_glyph) - # Data label - labels = LabelSet(x=time_unit + '_to_first_response_mean', y=dodge(y_axis, 0, range=p.y_range), - text=time_unit + '_to_first_response_mean', x_offset=0, y_offset=merged_y_offset, # -60, - text_font_size="12pt", text_color=colors[0], - source=merged_source, text_align='center') - p.add_layout(labels) - - # for nonmerged - glyph = Rect(x=time_unit + '_to_first_response_mean', y=dodge(y_axis, 0.1, range=p.y_range), - width=x_max / 100, height=0.08 * len(driver_df[y_axis].unique()), fill_color=colors[0]) - first_response_glyph = p.add_glyph(not_merged_source, glyph) - first_response_glyphs.append(first_response_glyph) - # Data label - labels = LabelSet(x=time_unit + '_to_first_response_mean', y=dodge(y_axis, 0, range=p.y_range), - text=time_unit + '_to_first_response_mean', x_offset=0, y_offset=not_merged_y_offset, - # 40, - text_font_size="12pt", text_color=colors[0], - source=not_merged_source, text_align='center') - p.add_layout(labels) - - # mean time to last response - glyph = Rect(x=time_unit + '_to_last_response_mean', y=dodge(y_axis, -0.1, range=p.y_range), - width=x_max / 100, height=0.08 * len(driver_df[y_axis].unique()), fill_color=colors[1]) - last_response_glyph = p.add_glyph(merged_source, glyph) - last_response_glyphs.append(last_response_glyph) - # Data label - labels = LabelSet(x=time_unit + '_to_last_response_mean', y=dodge(y_axis, 0, range=p.y_range), - text=time_unit + '_to_last_response_mean', x_offset=merged_x_offset, - y_offset=merged_y_offset, # -60, - text_font_size="12pt", text_color=colors[1], - 
source=merged_source, text_align='center') - p.add_layout(labels) - - # for nonmerged - glyph = Rect(x=time_unit + '_to_last_response_mean', y=dodge(y_axis, 0.1, range=p.y_range), - width=x_max / 100, height=0.08 * len(driver_df[y_axis].unique()), fill_color=colors[1]) - last_response_glyph = p.add_glyph(not_merged_source, glyph) - last_response_glyphs.append(last_response_glyph) - # Data label - labels = LabelSet(x=time_unit + '_to_last_response_mean', y=dodge(y_axis, 0, range=p.y_range), - text=time_unit + '_to_last_response_mean', x_offset=not_merged_x_offset, - y_offset=not_merged_y_offset, # 40, - text_font_size="12pt", text_color=colors[1], - source=not_merged_source, text_align='center') - p.add_layout(labels) - - p.title.align = "center" - p.title.text_font_size = "16px" - - p.xaxis.axis_label = "Days to Close" - p.xaxis.axis_label_text_font_size = "16px" - p.xaxis.major_label_text_font_size = "16px" - - # adjust the starting point and ending point based on the maximum of maximum of the graph - p.x_range = Range1d(maximum / 30 * -1, maximum * 1.15) - - p.yaxis.axis_label = "Repository" if y_axis == 'repo_name' else 'Year Closed' if y_axis == 'closed_year' else '' - p.yaxis.axis_label_text_font_size = "16px" - p.yaxis.major_label_text_font_size = "16px" - p.ygrid.grid_line_color = None - p.y_range.range_padding = 0.15 - - p.outline_line_color = None - p.toolbar.logo = None - p.toolbar_location = None - - def add_legend(location, orientation, side): - legend = Legend( - items=[ - ("Mean Days to First Response", first_response_glyphs), - ("Mean Days to Last Response", last_response_glyphs), - ("Merged Mean Days to Close", merged_days_to_close_glyphs), - ("Not Merged Mean Days to Close", not_merged_days_to_close_glyphs) - ], - - location=location, - orientation=orientation, - border_line_color="black" - # title='Example Title' - ) - p.add_layout(legend, side) - - # add_legend((150, 50), "horizontal", "center") - add_legend((10, 135), "vertical", "right") - - 
plot = p - - p = figure(width=plot_width, height=200, margin=(0, 0, 0, 0)) - caption = "This graph shows the average number of days between comments for all closed pull requests per month " \ - "in four categories. These four categories are All Merged, All Not Merged, Slowest 20% Merged, " \ - "and Slowest 20% Not Merged." - p = add_caption_to_plot(p, caption) - - caption_plot = p - - grid = gridplot([[plot], [caption_plot]]) - - if return_json == "true": - var = Response(response=json.dumps(json_item(grid, "mean_response_times_for_PR")), - mimetype='application/json', - status=200) - - var.headers["Access-Control-Allow-Orgin"] = "*" - - return var - - # opts = FirefoxOptions() - # opts.add_argument("--headless") - # driver = webdriver.Firefox(firefox_options=opts) - filename = export_png(grid, timeout=180) - - return send_file(filename) - -@app.route('/{}/pull_request_reports/mean_days_between_PR_comments/'.format(AUGUR_API_VERSION), - methods=["GET"]) -def mean_days_between_PR_comments(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - return_json = request.args.get('return_json', "false") - - time_unit = 'Days' - x_axis = 'closed_yearmonth' - y_axis = 'average_days_between_responses' - description = "All Closed" - line_group = 'merged_flag' - num_outliers_repo_map = {} - - df_type = get_df_tuple_locations() - - df_tuple = pull_request_data_collection(repo_id=repo_id, start_date=start_date, end_date=end_date) - - # gets pr_closed data - # selects only need columns (pr_closed_needed_columns) - # removes columns that cannot be NULL (pr_closed_not_null_columns) - pr_closed = df_tuple[df_type["pr_closed"]] - pr_closed_needed_columns = ['repo_id', 'repo_name', x_axis, 'average_time_between_responses', line_group] - pr_closed = filter_data(pr_closed, pr_closed_needed_columns) - - # gets pr_slow20_not_merged data - 
# selects only need columns (pr_slow20_not_merged_needed_columns) - # removes columns that cannot be NULL (pr_slow20_not_merged_not_null_columns) - pr_slow20_not_merged = df_tuple[df_type["pr_slow20_not_merged"]] - pr_slow20_not_merged_needed_columns = ['repo_id', 'repo_name', x_axis, 'average_time_between_responses', line_group] - pr_slow20_not_merged = filter_data(pr_slow20_not_merged, pr_slow20_not_merged_needed_columns) - - # gets pr_slow20_merged data - # selects only need columns (pr_slow20_not_merged_needed_columns) - # removes columns that cannot be NULL (pr_slow20_not_merged_not_null_columns) - pr_slow20_merged = df_tuple[df_type["pr_slow20_merged"]] - pr_slow20_merged_needed_columns = ['repo_id', 'repo_name', x_axis, 'average_time_between_responses', line_group] - pr_slow20_merged = filter_data(pr_slow20_merged, pr_slow20_merged_needed_columns) - - if len(pr_closed) == 0 or len(pr_slow20_not_merged) == 0 or len(pr_slow20_merged) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - try: - pr_closed['average_days_between_responses'] = pr_closed['average_time_between_responses'].map( - lambda x: x.days).astype(float) - pr_slow20_not_merged['average_days_between_responses'] = pr_slow20_not_merged[ - 'average_time_between_responses'].map(lambda x: x.days).astype(float) - pr_slow20_merged['average_days_between_responses'] = pr_slow20_merged['average_time_between_responses'].map( - lambda x: x.days).astype(float) - except: - return Response(response="There is no message data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: pr_closed.loc[pr_closed['repo_id'] == repo_id].iloc[0]['repo_name']} - - data_dict = {'All': pr_closed, 'Slowest 20%': pr_slow20_not_merged.append(pr_slow20_merged, ignore_index=True)} - - plot_width = 950 - p1 = figure(x_axis_type="datetime", - title="{}: Mean {} Between 
Comments by Month Closed for {} Pull Requests".format(repo_dict[repo_id], time_unit, description), - plot_width=plot_width, x_range=(data_dict["All"][x_axis].min(), data_dict["All"][x_axis].max()), plot_height=500, - toolbar_location=None) - colors = Category20[10][6:] - color_index = 0 - - glyphs = [] - - possible_maximums = [] - for data_desc, input_df in data_dict.items(): - - driver_df = input_df.copy() - - driver_df = remove_outliers(driver_df, y_axis, num_outliers_repo_map) - - driver_df = driver_df.loc[driver_df['repo_id'] == repo_id] - index = 0 - - driver_df_mean = driver_df.groupby(['repo_id', line_group, x_axis], as_index=False).mean() - - title_ending = '' - if repo_id: - title_ending += ' for Repo: {}'.format(repo_id) - - for group_num, line_group_value in enumerate(driver_df[line_group].unique(), color_index): - glyphs.append(p1.line(driver_df_mean.loc[driver_df_mean[line_group] == line_group_value][x_axis], - driver_df_mean.loc[driver_df_mean[line_group] == line_group_value][y_axis], - color=colors[group_num], line_width=3)) - color_index += 1 - possible_maximums.append( - max(driver_df_mean.loc[driver_df_mean[line_group] == line_group_value][y_axis].dropna())) - for repo, num_outliers in num_outliers_repo_map.items(): - p1.add_layout( - Title(text="** {} outliers for {} were removed".format(num_outliers, repo), align="center"), - "below") - - p1.grid.grid_line_alpha = 0.3 - p1.xaxis.axis_label = 'Month Closed' - p1.xaxis.ticker.desired_num_ticks = 15 - p1.yaxis.axis_label = 'Mean {} Between Responses'.format(time_unit) - p1.legend.location = "top_left" - - legend = Legend( - items=[ - ("All Not Merged / Rejected", [glyphs[0]]), - ("All Merged / Accepted", [glyphs[1]]), - ("Slowest 20% Not Merged / Rejected", [glyphs[2]]), - ("Slowest 20% Merged / Accepted", [glyphs[3]]) - ], - - location='center_right', - orientation='vertical', - border_line_color="black" - ) - - p1.add_layout(legend, 'right') - - p1.title.text_font_size = "16px" - - 
p1.xaxis.axis_label_text_font_size = "16px" - p1.xaxis.major_label_text_font_size = "16px" - - p1.yaxis.axis_label_text_font_size = "16px" - p1.yaxis.major_label_text_font_size = "16px" - p1.xaxis.major_label_orientation = 45.0 - - p1.y_range = Range1d(0, max(possible_maximums) * 1.15) - - plot = p1 - - p = figure(width=plot_width, height=200, margin=(0, 0, 0, 0)) - caption = "This graph shows the average number of days between comments for all" \ - " closed pull requests per month in four categories. These four categories" \ - " are All Merged, All Not Merged, Slowest 20% Merged, and Slowest 20% Not Merged." - p = add_caption_to_plot(p, caption) - - caption_plot = p - - grid = gridplot([[plot], [caption_plot]]) - - if return_json == "true": - var = Response(response=json.dumps(json_item(grid, "mean_days_between_PR_comments")), - mimetype='application/json', - status=200) - - var.headers["Access-Control-Allow-Orgin"] = "*" - - return var - - # opts = FirefoxOptions() - # opts.add_argument("--headless") - # driver = webdriver.Firefox(firefox_options=opts) - filename = export_png(grid, timeout=180) - - return send_file(filename) - -@app.route('/{}/pull_request_reports/PR_time_to_first_response/'.format(AUGUR_API_VERSION), methods=["GET"]) -def PR_time_to_first_response(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - return_json = request.args.get('return_json', "false") - remove_outliers = str(request.args.get('remove_outliers', "true")) - - x_axis = 'pr_closed_at' - y_axis = 'days_to_first_response' - description = 'All' - group_by = 'merged_flag' - legend_position = 'top_right' - - df_type = get_df_tuple_locations() - - df_tuple = pull_request_data_collection(repo_id=repo_id, start_date=start_date, end_date=end_date) - - pr_closed = df_tuple[df_type["pr_closed"]] - needed_columns = ['repo_id', 'repo_name', 
x_axis, group_by, y_axis] - pr_closed = filter_data(pr_closed, needed_columns) - - if len(pr_closed) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = {repo_id: pr_closed.loc[pr_closed['repo_id'] == repo_id].iloc[0]['repo_name']} - - driver_df = pr_closed.copy() - - outliers_removed = 0 - - if remove_outliers == "true": - driver_df, outliers_removed, outlier_cutoff = remove_outliers_by_standard_deviation(driver_df, 'days_to_first_response') - - group_by_groups = sorted(driver_df[group_by].unique()) - - # setup color pallete - try: - # FIXME repo_set is not defined - colors = Colorblind[len(repo_set)] - except: - colors = Colorblind[3] - - title_beginning = '{}: '.format(repo_dict[repo_id]) - plot_width = 180 * 5 - p = figure(x_range=( - driver_df[x_axis].min() - datetime.timedelta(days=30), driver_df[x_axis].max() + datetime.timedelta(days=25)), - # (driver_df[y_axis].min(), driver_df[y_axis].max()), - toolbar_location=None, - title='{}Days to First Response for {} Closed Pull Requests'.format(title_beginning, description), - plot_width=plot_width, - plot_height=400, x_axis_type='datetime') - - for index, group_by_group in enumerate(group_by_groups): - p.scatter(x_axis, y_axis, color=colors[index], marker="square", - source=driver_df.loc[driver_df[group_by] == group_by_group], legend_label=group_by_group) - - if group_by_group == "Merged / Accepted": - merged_values = driver_df.loc[driver_df[group_by] == group_by_group][y_axis].dropna().values.tolist() - else: - not_merged_values = driver_df.loc[driver_df[group_by] == group_by_group][ - y_axis].dropna().values.tolist() - - values = not_merged_values + merged_values - - if outliers_removed > 0: - if repo_id: - p.add_layout(Title( - text="** Outliers cut off at {} days: {} outlier(s) for {} were removed **".format(outlier_cutoff, - outliers_removed, - repo_dict[ - repo_id]), - align="center"), "below") - 
else: - p.add_layout(Title( - text="** Outliers cut off at {} days: {} outlier(s) were removed **".format(outlier_cutoff, - outliers_removed), - align="center"), "below") - - p.xaxis.axis_label = 'Date Closed' if x_axis == 'pr_closed_at' else 'Date Created' if x_axis == 'pr_created_at' else 'Date' - p.yaxis.axis_label = 'Days to First Response' - p.legend.location = legend_position - - p.title.align = "center" - p.title.text_font_size = "16px" - - p.xaxis.axis_label_text_font_size = "16px" - p.xaxis.major_label_text_font_size = "16px" - - p.yaxis.axis_label_text_font_size = "16px" - p.yaxis.major_label_text_font_size = "16px" - - if len(values) == 0: - return Response(response="There is no message data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - # determine y_max by finding the max of the values and scaling it up a small amoutn - y_max = max(values) * 1.015 - - p.y_range = Range1d(0, y_max) - - plot = p - - p = figure(width=plot_width, height=200, margin=(0, 0, 0, 0)) - caption = "This graph shows the days to first reponse for individual pull requests, either Merged or Not Merged." 
- p = add_caption_to_plot(p, caption) - - caption_plot = p - - grid = gridplot([[plot], [caption_plot]]) - - if return_json == "true": - var = Response(response=json.dumps(json_item(grid, "PR_time_to_first_response")), - mimetype='application/json', - status=200) - - var.headers["Access-Control-Allow-Orgin"] = "*" - - return var - - # opts = FirefoxOptions() - # opts.add_argument("--headless") - # driver = webdriver.Firefox(firefox_options=opts) - filename = export_png(grid, timeout=180) - - return send_file(filename) - -@app.route('/{}/pull_request_reports/average_PR_events_for_closed_PRs/'.format(AUGUR_API_VERSION), - methods=["GET"]) -def average_PR_events_for_closed_PRs(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - return_json = request.args.get('return_json', "false") - include_comments = str(request.args.get('include_comments', True)) - - x_axis = 'closed_year' - facet = 'merged_flag' - columns = 2 - x_max = 1100 - y_axis = 'repo_name' - description = 'All Closed' - optional_comments = ['comment_count'] if include_comments else [] - - df_type = get_df_tuple_locations() - - df_tuple = pull_request_data_collection(repo_id=repo_id, start_date=start_date, end_date=end_date) - - pr_closed = df_tuple[df_type["pr_closed"]] - needed_columns = ['repo_id', 'repo_name', x_axis, 'assigned_count', - 'review_requested_count', - 'labeled_count', - 'subscribed_count', - 'mentioned_count', - 'referenced_count', - 'closed_count', - 'head_ref_force_pushed_count', - 'merged_count', - 'milestoned_count', - 'unlabeled_count', - 'head_ref_deleted_count', facet] + optional_comments - pr_closed = filter_data(pr_closed, needed_columns) - - if len(pr_closed) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - repo_dict = 
{repo_id: pr_closed.loc[pr_closed['repo_id'] == repo_id].iloc[0]['repo_name']} - - colors = linear_gradient('#f5f5dc', '#fff44f', 150)['hex'] - - driver_df = pr_closed.copy() - driver_df[x_axis] = driver_df[x_axis].astype(str) - - if facet == 'closed_year' or y_axis == 'closed_year': - driver_df['closed_year'] = driver_df['closed_year'].astype(int).astype(str) - - y_groups = [ - 'review_requested_count', - 'labeled_count', - 'subscribed_count', - 'referenced_count', - 'closed_count', - # 'milestoned_count', - ] + optional_comments - - optional_group_comments = ['comment'] if include_comments else [] - # y_groups = ['subscribed', 'mentioned', 'labeled', 'review_requested', 'head_ref_force_pushed', - # 'referenced', 'closed', 'merged', 'unlabeled', 'head_ref_deleted', 'milestoned', 'assigned'] - # + optional_group_comments - - x_groups = sorted(list(driver_df[x_axis].unique())) - - grid_array = [] - grid_row = [] - - for index, facet_group in enumerate(sorted(driver_df[facet].unique())): - - facet_data = driver_df.loc[driver_df[facet] == facet_group] - # display(facet_data.sort_values('merged_count', ascending=False).head(50)) - driver_df_mean = facet_data.groupby(['repo_id', 'repo_name', x_axis], as_index=False).mean().round(1) - - # if a record is field in a record is Nan then it is not counted by count() so when it is not - # 2 meaning both rows have a value, there is not enough data - if (driver_df_mean['assigned_count'].count() != 2 or driver_df_mean[ - 'review_requested_count'].count() != 2 or driver_df_mean['labeled_count'].count() != 2 or - driver_df_mean['subscribed_count'].count() != 2 or driver_df_mean['mentioned_count'].count() != 2 or - driver_df_mean['referenced_count'].count() != 2 or - driver_df_mean['closed_count'].count() != 2 or driver_df_mean[ - 'head_ref_force_pushed_count'].count() != 2 or driver_df_mean['merged_count'].count() != 2 or - driver_df_mean['milestoned_count'].count() != 2 or driver_df_mean['unlabeled_count'].count() != 2 or - 
driver_df_mean['head_ref_deleted_count'].count() != 2 or - driver_df_mean['comment_count'].count() != 2): - return Response(response="There is not enough data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - # print(driver_df_mean.to_string()) - # data = {'Y' : y_groups} - # for group in y_groups: - # data[group] = driver_df_mean[group].tolist() - plot_width = 700 - p = figure(y_range=y_groups, plot_height=500, plot_width=plot_width, x_range=x_groups, - title='{}'.format(format(facet_group))) - - for y_group in y_groups: - driver_df_mean['field'] = y_group - source = ColumnDataSource(driver_df_mean) - mapper = LinearColorMapper(palette=colors, low=driver_df_mean[y_group].min(), - high=driver_df_mean[y_group].max()) - - p.rect(y='field', x=x_axis, width=1, height=1, source=source, - line_color=None, fill_color=transform(y_group, mapper)) - # Data label - labels = LabelSet(x=x_axis, y='field', text=y_group, y_offset=-8, - text_font_size="12pt", text_color='black', - source=source, text_align='center') - p.add_layout(labels) - - color_bar = ColorBar(color_mapper=mapper, location=(0, 0), - ticker=BasicTicker(desired_num_ticks=9), - formatter=PrintfTickFormatter(format="%d")) - # p.add_layout(color_bar, 'right') - - p.y_range.range_padding = 0.1 - p.ygrid.grid_line_color = None - - p.legend.location = "bottom_right" - p.axis.minor_tick_line_color = None - p.outline_line_color = None - - p.xaxis.axis_label = 'Year Closed' - p.yaxis.axis_label = 'Event Type' - - p.title.align = "center" - p.title.text_font_size = "15px" - - p.xaxis.axis_label_text_font_size = "16px" - p.xaxis.major_label_text_font_size = "16px" - - p.yaxis.axis_label_text_font_size = "16px" - p.yaxis.major_label_text_font_size = "16px" - - grid_row.append(p) - if index % columns == columns - 1: - grid_array.append(grid_row) - grid_row = [] - grid = gridplot(grid_array) - - # create caption plot - caption_plot = figure(width=plot_width, height=200, 
margin=(0, 0, 0, 0)) - caption = "This graph shows the average count of several different event types for " \ - "closed pull requests per year. It spilits the pull requests into two categories, " \ - "Merged / Accepted, and Not Merged / Rejected, so the similarities and differences are clear." - - caption_plot.add_layout(Label(x=0, y=380, x_units='screen', y_units='screen', text='{}'.format(caption), - text_font='times', text_font_size='15pt', render_mode='css')) - - # caption_plot.outline_line_color = None - caption_plot.toolbar_location = None - - # create title plot - title_plot = figure(width=plot_width, height=50, margin=(0, 0, 0, 0)) - title = '{}: Average Pull Request Event Types for {} Pull Requests'.format(repo_dict[repo_id], description) - - title_plot.add_layout(Label(x=550, y=0, x_units='screen', y_units='screen', text='{}'.format(title), - text_font='times', text_font_size='17px', - text_font_style='bold', render_mode='css')) - - # title_plot.outline_line_color = None - title_plot.toolbar_location = None - - layout = column([title_plot, grid, caption_plot], sizing_mode='scale_width') - - if return_json == "true": - var = Response(response=json.dumps(json_item(layout, "average_PR_events_for_closed_PRs")), - mimetype='application/json', - status=200) - - var.headers["Access-Control-Allow-Orgin"] = "*" - - return var - - # opts = FirefoxOptions() - # opts.add_argument("--headless") - # driver = webdriver.Firefox(firefox_options=opts) - filename = export_png(layout, timeout=181) # , webdriver=selenium.webdriver.firefox.webdriver) - - return send_file(filename) - -@app.route('/{}/pull_request_reports/Average_PR_duration/'.format(AUGUR_API_VERSION), methods=["GET"]) -def Average_PR_duration(): - - repo_id, start_date, end_date, error = get_repo_id_start_date_and_end_date() - - if error: - return Response(response=error["message"], - mimetype='application/json', - status=error["status_code"]) - - group_by = str(request.args.get('group_by', "month")) - 
return_json = request.args.get('return_json', "false") - remove_outliers = str(request.args.get('remove_outliers', "true")) - - x_axis = 'repo_name' - group_by = 'merged_flag' - y_axis = 'closed_yearmonth' - description = "All Closed" - heat_field = 'pr_duration_days' - columns = 2 - - df_type = get_df_tuple_locations() - - df_tuple = pull_request_data_collection(repo_id=repo_id, start_date=start_date, end_date=end_date) - - pr_closed = df_tuple[df_type["pr_closed"]] - needed_columns = ['repo_id', y_axis, group_by, x_axis, 'pr_closed_at', 'pr_created_at'] - pr_closed = filter_data(pr_closed, needed_columns) - - if len(pr_closed) == 0: - return Response(response="There is no data for this repo, in the database you are accessing", - mimetype='application/json', - status=200) - - pr_duration_frame = pr_closed.assign(pr_duration=(pr_closed['pr_closed_at'] - pr_closed['pr_created_at'])) - pr_duration_frame = pr_duration_frame.assign( - pr_duration_days=(pr_duration_frame['pr_duration'] / datetime.timedelta(minutes=1)) / 60 / 24) - - repo_dict = {repo_id: pr_duration_frame.loc[pr_duration_frame['repo_id'] == repo_id].iloc[0]['repo_name']} - - red_green_gradient = linear_gradient('#0080FF', '#DC143C', 150)['hex'] # 32CD32 - - driver_df = pr_duration_frame.copy() - - driver_df[y_axis] = driver_df[y_axis].astype(str) - - # add new group by + xaxis column - driver_df['grouped_x'] = driver_df[x_axis] + ' - ' + driver_df[group_by] - - driver_df_mean = driver_df.groupby(['grouped_x', y_axis], as_index=False).mean() - - colors = red_green_gradient - y_groups = driver_df_mean[y_axis].unique() - x_groups = sorted(driver_df[x_axis].unique()) - grouped_x_groups = sorted(driver_df_mean['grouped_x'].unique()) - - # defualt outliers removed to 0 - outliers_removed = 0 - - if remove_outliers == "true": - driver_df_mean, outliers_removed, outlier_cutoff = remove_outliers_by_standard_deviation(driver_df_mean, - heat_field) - - values = driver_df_mean[heat_field].values.tolist() - - 
heat_max = max(values) * 1.02 - - mapper = LinearColorMapper(palette=colors, low=driver_df_mean[heat_field].min(), - high=heat_max) # driver_df_mean[heat_field].max()) - - source = ColumnDataSource(driver_df_mean) - title_beginning = repo_dict[repo_id] + ':' - plot_width = 1100 - p = figure(plot_width=plot_width, plot_height=300, - title="{} Mean Duration (Days) {} Pull Requests".format(title_beginning, description), - y_range=grouped_x_groups[::-1], x_range=y_groups, - toolbar_location=None, tools="") # , x_axis_location="above") - - for x_group in x_groups: - outliers = driver_df_mean.loc[ - (driver_df_mean[heat_field] > heat_max) & (driver_df_mean['grouped_x'].str.contains(x_group))] - - if outliers_removed > 0: - p.add_layout(Title( - text="** Outliers capped at {} days: {} outlier(s) for {} were capped at {} **".format( - outlier_cutoff, outliers_removed, x_group, outlier_cutoff), align="center"), "below") - - p.rect(x=y_axis, y='grouped_x', width=1, height=1, source=source, - line_color=None, fill_color=transform(heat_field, mapper)) - - color_bar = ColorBar(color_mapper=mapper, location=(0, 0), - ticker=BasicTicker(desired_num_ticks=9), - formatter=PrintfTickFormatter(format="%d")) - - p.add_layout(color_bar, 'right') - - p.title.align = "center" - p.title.text_font_size = "16px" - - p.axis.axis_line_color = None - p.axis.major_tick_line_color = None - p.axis.major_label_text_font_size = "11pt" - p.axis.major_label_standoff = 0 - p.xaxis.major_label_orientation = 1.0 - p.xaxis.axis_label = 'Month Closed' if y_axis[0:6] == 'closed' else 'Date Created' if y_axis[ - 0:7] == 'created' else 'Repository' if y_axis == 'repo_name' else '' - # p.yaxis.axis_label = 'Merged Status' - - p.title.text_font_size = "16px" - - p.xaxis.axis_label_text_font_size = "16px" - p.xaxis.major_label_text_font_size = "14px" - - p.yaxis.major_label_text_font_size = "15px" - - plot = p - - p = figure(width=plot_width, height=200, margin=(0, 0, 0, 0)) - caption = "This graph shows the 
average duration of all closed pull requests. " \ - "Red represents a slow response relative to the others, while blue a light blue " \ - "represents a fast response relative to the others. Blank cells represents months " \ - "without pull requests." - p = add_caption_to_plot(p, caption) - caption_plot = p - - grid = gridplot([[plot], [caption_plot]]) - - if return_json == "true": - var = Response(response=json.dumps(json_item(grid, "Average_PR_duration")), - mimetype='application/json', - status=200) - - var.headers["Access-Control-Allow-Orgin"] = "*" - - return var - - # opts = FirefoxOptions() - # opts.add_argument("--headless") - # driver = webdriver.Firefox(firefox_options=opts) - # newt = get_screenshot_as_png(grid, timeout=180, webdriver=selenium.webdriver.firefox.webdriver) - # filename = export_png(grid, timeout=180, webdriver=selenium.webdriver.firefox.webdriver) - filename = export_png(grid, timeout=180) - - # return sendfile(newt) - return send_file(filename) From 2e60ffb4fc331ab0f603628984ed784f53d584b3 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 30 Sep 2025 19:42:22 -0400 Subject: [PATCH 39/67] remove reports definition Signed-off-by: Adrian Edwards --- augur/api/view/init.py | 57 ------------------------------------------ 1 file changed, 57 deletions(-) diff --git a/augur/api/view/init.py b/augur/api/view/init.py index 869b383a62..2a4ce44191 100644 --- a/augur/api/view/init.py +++ b/augur/api/view/init.py @@ -33,63 +33,6 @@ def write_settings(current_settings): with open(configFile, 'w') as file: yaml.dump(current_settings, file) -# default reports definition -reports = { - "pull_request_reports":[ - { - "url":"average_commits_per_PR", - "description":"Average commits per pull request" - }, - { - "url":"average_comments_per_PR", - "description":"Average comments per pull request" - }, - { - "url":"PR_counts_by_merged_status", - "description":"Pull request counts by merged status" - }, - { - "url":"mean_response_times_for_PR", - 
"description":"Mean response times for pull requests" - }, - { - "url":"mean_days_between_PR_comments", - "description":"Mean days between pull request comments" - }, - { - "url":"PR_time_to_first_response", - "description":"Pull request time until first response" - }, - { - "url":"average_PR_events_for_closed_PRs", - "description":"Average pull request events for closed pull requests" - }, - { - "url":"Average_PR_duration", - "description":"Average pull request duration" - } - ], - "contributor_reports":[ - { - "url":"new_contributors_bar", - "description":"New contributors bar graph" - }, - { - "url":"returning_contributors_pie_chart", - "description":"Returning contributors pie chart" - } - ], - "contributor_reports_stacked":[ - { - "url":"new_contributors_stacked_bar", - "description":"New contributors stacked bar chart" - }, - { - "url":"returning_contributors_stacked_bar", - "description":"Returning contributors stacked bar chart" - } - ] -} # Initialize logging def init_logging(): From a28c7b4187cfca644a926cbb59c14a5749ad4d9d Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 30 Sep 2025 19:43:02 -0400 Subject: [PATCH 40/67] additional reports related things Signed-off-by: Adrian Edwards --- augur/api/view/init.py | 3 - augur/api/view/utils.py | 80 ------------------- .../create-a-metric/api-development.rst | 4 - 3 files changed, 87 deletions(-) diff --git a/augur/api/view/init.py b/augur/api/view/init.py index 2a4ce44191..1737131352 100644 --- a/augur/api/view/init.py +++ b/augur/api/view/init.py @@ -19,9 +19,6 @@ def init_settings(): settings["cache_expiry"] = 604800 settings["serving"] = "http://augur.chaoss.io/api/unstable" settings["pagination_offset"] = 25 - # Put reports.yml in the same directory as the config file - config_dir = configFile.parent - settings["reports"] = os.path.join(config_dir, "reports.yml") settings["session_key"] = secrets.token_hex() def write_settings(current_settings): diff --git a/augur/api/view/utils.py 
b/augur/api/view/utils.py index aae5140cd7..dbfdd1b121 100644 --- a/augur/api/view/utils.py +++ b/augur/api/view/utils.py @@ -70,34 +70,6 @@ def getSetting(key, section = "View"): #version_check(settings) -""" ---------------------------------------------------------------- -""" -def loadReports(): - global reports - try: - with open(getSetting("reports")) as file: - reports = yaml.load(file, Loader=yaml.FullLoader) - id = -1 - for report in reports: - for image in reports[report]: - image['id'] = id = id + 1 - return True - except Exception as err: - logger.error(f"An exception occurred reading reports endpoints from [{getSetting('reports')}]:") - logger.error(err) - try: - with open(getSetting("reports"), 'w') as file: - logger.info("Attempting to generate default reports.yml") - yaml.dump(reports, file) - logger.info("Default reports file successfully generated.") - except Exception as ioErr: - logger.error("Error creating default report configuration:") - logger.error(ioErr) - return False - -if not loadReports(): - loadReports() - cache_files_requested = [] """ ---------------------------------------------------------------- @@ -160,58 +132,6 @@ def download(url, cmanager, filename, image_cache, image_id, repo_id = None): logger.error("An exception occurred writing a cache file to disk") logger.error(err) -""" ---------------------------------------------------------------- -""" -def requestReports(repo_id): - # If this request has already been fulfilled, no need to process it again - if(repo_id in report_requests.keys()): - return - - # initialize a new request entry to hold the resulting data - report_requests[repo_id] = {} - report_requests[repo_id]['complete'] = False - - host = getSetting("host", "Server") - port = getSetting("port", "Server") - - """ ---------- - If the report definition could not be loaded, we cannot determine what - files to request from the backend to compose the report. 
Returning here - causes the completion status of the request to be False, which will - display an error message when sent to the frontend. - """ - if reports is None: - return - - threadPools = [] - reportImages = {} - for report in reports: - # Reports is a dictionary of lists, so we get the size of each list - size = len(reports[report]) - - # Set up various threading components to manage image downloading - connection_mgr = urllib3.PoolManager(maxsize=size) - thread_pool = ThreadPoolExecutor(size) - threadPools.append(thread_pool) - - for image in reports[report]: - # Where should the downloaded image be stored (in cache) - filename = toCacheFilename(f"{image['url']}?repo_id={repo_id}") - # Where are we downloading the image from - image_url = f"{host}:{port}" + url_for(image['url'], repo_id = repo_id) - # f"{getSetting('serving')}/{image['url']}?repo_id={repo_id}" - - # Add a request for this image to the thread pool using the download function - thread_pool.submit(download, image_url, connection_mgr, filename, reportImages, image['id'], repo_id) - - # Wait for all connections to resolve, then clean up - for thread_pool in threadPools: - thread_pool.shutdown() - - report_requests[repo_id]['images'] = reportImages - - # Remove the request from the queue when completed - report_requests[repo_id]['complete'] = True """ ---------------------------------------------------------------- renderRepos: diff --git a/docs/source/development-guide/create-a-metric/api-development.rst b/docs/source/development-guide/create-a-metric/api-development.rst index 834b42e8e0..05e1ebb977 100644 --- a/docs/source/development-guide/create-a-metric/api-development.rst +++ b/docs/source/development-guide/create-a-metric/api-development.rst @@ -133,10 +133,6 @@ There is also, generally, a block in a standard metric for pulling data by a rep 'begin_date': begin_date, 'end_date': end_date}) return results -Existing Visualization Metrics Files: --------------------------------------------- 
-1. augur/routes/contributor_reports.py -2. augur/routes/pull_request_reports.py Existing Metrics Files: -------------------------------------------- From 148680b3a581e29199a3277082cc41ba9dacf7ac Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Tue, 7 Oct 2025 08:15:33 +0100 Subject: [PATCH 41/67] basic implementation Signed-off-by: Adrian Edwards --- augur/application/config.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/augur/application/config.py b/augur/application/config.py index ee3c33dc8b..ab8bb4e93d 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -5,6 +5,7 @@ import os from augur.application.db.models import Config from augur.application.db.util import execute_session_query, convert_type_of_value +from pathlib import Path def get_development_flag_from_config(): @@ -122,7 +123,11 @@ def __init__(self, logger, session: DatabaseSession): self.logger = logger self.accepted_types = ["str", "bool", "int", "float", "NoneType"] - self.default_config = default_config + config_path = Path("./augur.json") + if config_path.exists(): + self.default_config = json.loads(config_path.read_text(encoding="UTF-8")) + else: + self.default_config = default_config def get_section(self, section_name) -> dict: """Get a section of data from the config. 
From d13aca8cbed396ff26b0f021c0761e675d673e93 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 15 Oct 2025 20:23:47 +0100 Subject: [PATCH 42/67] introduce a config datadir item and use that so that config can be in a standard location going forward Signed-off-by: Adrian Edwards --- augur/application/config.py | 3 ++- docker-compose.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/augur/application/config.py b/augur/application/config.py index ab8bb4e93d..2cc6f65cdb 100644 --- a/augur/application/config.py +++ b/augur/application/config.py @@ -123,7 +123,8 @@ def __init__(self, logger, session: DatabaseSession): self.logger = logger self.accepted_types = ["str", "bool", "int", "float", "NoneType"] - config_path = Path("./augur.json") + config_dir = Path(os.getenv("CONFIG_DATADIR", "./")) + config_path = config_dir.joinpath("augur.json") if config_path.exists(): self.default_config = json.loads(config_path.read_text(encoding="UTF-8")) else: diff --git a/docker-compose.yml b/docker-compose.yml index b32f0a1696..f0ef41015b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -73,6 +73,7 @@ services: - REDIS_CONN_STRING=redis://redis:6379 - RABBITMQ_CONN_STRING=amqp://${AUGUR_RABBITMQ_USERNAME:-augur}:${AUGUR_RABBITMQ_PASSWORD:-password123}@rabbitmq:5672/${AUGUR_RABBITMQ_VHOST:-augur_vhost} - CONFIG_LOCATION=/config/config.yml + - CONFIG_DATADIR=/config - CACHE_DATADIR=/cache - CACHE_LOCKDIR=/cache - CELERYBEAT_SCHEDULE_DB=/tmp/celerybeat-schedule.db From 4887e3ebd5d1483e1febdbee9940cd8367396048 Mon Sep 17 00:00:00 2001 From: Sajal-Kulshreshtha Date: Sat, 11 Oct 2025 17:55:54 +0530 Subject: [PATCH 43/67] Centralize versioning Signed-off-by: Sajal-Kulshreshtha --- .github/workflows/build_docker.yml | 11 +++++++++++ docker/backend/Dockerfile | 4 +++- docker/database/Dockerfile | 4 +++- docker/keyman/Dockerfile | 4 +++- docker/rabbitmq/Dockerfile | 4 +++- scripts/ci/get_version.py | 7 +++++++ 6 files changed, 30 insertions(+), 4 
deletions(-) create mode 100644 scripts/ci/get_version.py diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index e7fa4b262c..7828019514 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -58,6 +58,13 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Extract project version + id: version + run: | + VERSION=$(python -c "import re; exec(open('metadata.py').read()); print(__version__)") + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "Using version: $VERSION" + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 id: setup-buildx @@ -67,6 +74,7 @@ jobs: with: context: . file: ./docker/database/Dockerfile + build-args: VERSION=${{ steps.version.outputs.version }} platforms: linux/amd64 tags: ghcr.io/${{ github.repository_owner }}/augur_database:test cache-from: type=gha,scope=container-database @@ -78,6 +86,7 @@ jobs: with: context: . file: ./docker/keyman/Dockerfile + build-args: VERSION=${{ steps.version.outputs.version }} platforms: linux/amd64 tags: ghcr.io/${{ github.repository_owner }}/augur_keyman:test cache-from: type=gha,scope=container-keyman @@ -89,6 +98,7 @@ jobs: with: context: . file: ./docker/rabbitmq/Dockerfile + build-args: VERSION=${{ steps.version.outputs.version }} platforms: linux/amd64 tags: ghcr.io/${{ github.repository_owner }}/augur_rabbitmq:test cache-from: type=gha,scope=container-rabbitmq @@ -100,6 +110,7 @@ jobs: with: context: . 
file: ./docker/backend/Dockerfile + build-args: VERSION=${{ steps.version.outputs.version }} platforms: linux/amd64 tags: ghcr.io/${{ github.repository_owner }}/augur_backend:test cache-from: type=gha,scope=container-backend diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 0a05daf848..5a8bfaaa3e 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -20,7 +20,9 @@ RUN go install github.com/ossf/scorecard/v5@v5.1.1 \ FROM python:3.11-slim-bullseye LABEL maintainer="outdoors@acm.org" -LABEL version="0.90.3" + +ARG VERSION +LABEL version=${VERSION} ENV DEBIAN_FRONTEND=noninteractive ENV PATH="/usr/bin/:/usr/local/bin:/usr/lib:${PATH}" diff --git a/docker/database/Dockerfile b/docker/database/Dockerfile index 6558fe44ec..e4393fe0b3 100644 --- a/docker/database/Dockerfile +++ b/docker/database/Dockerfile @@ -2,7 +2,9 @@ FROM postgres:16 LABEL maintainer="outdoors@acm.org" -LABEL version="0.90.3" + +ARG VERSION +LABEL version=${VERSION} ENV POSTGRES_DB="test" ENV POSTGRES_USER="augur" diff --git a/docker/keyman/Dockerfile b/docker/keyman/Dockerfile index 72c46ba225..3fe1996223 100644 --- a/docker/keyman/Dockerfile +++ b/docker/keyman/Dockerfile @@ -1,7 +1,9 @@ FROM python:3.11.12-alpine LABEL maintainer="outdoors@acm.org" -LABEL version="0.90.3" + +ARG VERSION +LABEL version=${VERSION} RUN pip install --no-cache-dir --upgrade pip diff --git a/docker/rabbitmq/Dockerfile b/docker/rabbitmq/Dockerfile index ad86dfebb7..387eb9ae77 100644 --- a/docker/rabbitmq/Dockerfile +++ b/docker/rabbitmq/Dockerfile @@ -1,7 +1,9 @@ FROM rabbitmq:3.12-management-alpine LABEL maintainer="574/augur@simplelogin.com" -LABEL version="0.90.0" + +ARG VERSION +LABEL version=${VERSION} ARG RABBIT_MQ_DEFAULT_USER=augur ARG RABBIT_MQ_DEFAULT_PASSWORD=password123 diff --git a/scripts/ci/get_version.py b/scripts/ci/get_version.py new file mode 100644 index 0000000000..e98d520602 --- /dev/null +++ b/scripts/ci/get_version.py @@ -0,0 +1,7 @@ +import sys +import 
os + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +from metadata import __version__ + +print(__version__) From 3f2768df10052997709a3c757b4a1be07dd3bb00 Mon Sep 17 00:00:00 2001 From: Sajal-Kulshreshtha Date: Wed, 15 Oct 2025 01:36:59 +0530 Subject: [PATCH 44/67] docs: update release process to reflect centralized version management Signed-off-by: Sajal-Kulshreshtha --- docs/source/procedures/creating-releases.rst | 100 +++++++++++++++---- 1 file changed, 80 insertions(+), 20 deletions(-) diff --git a/docs/source/procedures/creating-releases.rst b/docs/source/procedures/creating-releases.rst index 007db72441..7ef2b32a9c 100644 --- a/docs/source/procedures/creating-releases.rst +++ b/docs/source/procedures/creating-releases.rst @@ -2,49 +2,109 @@ The Augur Release Process ========================= The first step to releasing any changes is to have changes in the first place. -Augur's `CONTRIBUTING.md `__ file contains all the information that is needed to get started with topics like reporting issues, contributing code, and understanding the code review process. +Augur's `CONTRIBUTING.md `__ file +contains all the information that is needed to get started with topics like +reporting issues, contributing code, and understanding the code review process. This document outlines how these changes end up in an Augur release after they are merged into the `main` branch. +Release Workflow +---------------- - -Release workflow: -Starting after version 0.89.3, Augur follows a workflow similar to those you may already be familiar with (such as github flow and git flow). The Augur workflow has two long-lived branches, `main` and `release` and is designed such that changes only flow in one direction - from main into release. +Starting after version **0.89.3**, Augur follows a workflow similar to those you may already +be familiar with (such as GitHub Flow and Git Flow). 
The Augur workflow has two long-lived branches, +`main` and `release`, and is designed such that changes only flow in one direction — from `main` into `release`. Branches +-------- + +**main** -`main` -The `main` branch is the primary development branch that is the target for all new pull requests. At any given point in time, this branch represents the best approximation of what the next upcoming release will look like. Since this is the active development branch, changes happen more frequently and this branch should be considered to be less stable than the `release` branch due to the possibility of breaking changes being made (and potentially reverted) between releases. It is not recommended for production deployment and is primarily intended for use by Augur contributors running their own copies against test data for development purposes. +The `main` branch is the primary development branch that is the target for all new pull requests. +At any given point in time, this branch represents the best approximation of what the next upcoming +release will look like. Since this is the active development branch, changes happen more frequently +and this branch should be considered to be less stable than the `release` branch due to the possibility +of breaking changes being made (and potentially reverted) between releases. It is not recommended for +production deployment and is primarily intended for use by Augur contributors running their own copies +against test data for development purposes. -`release` -The `release` branch is where all augur versions (after 0.89.3) are tagged. Each commit on this branch represents either a hotfix to the prior release or a new major or minor version. +**release** -Currently, Augur only officially supports the last-released version represented by the latest **release** tag. 
In most cases, the latest commit on the `release` branch is made immediately prior to a release, but always rely on the latest tagged release, not the release branch in production. +The `release` branch is where all Augur versions (after 0.89.3) are tagged. Each commit on this branch +represents either a hotfix to the prior release or a new major or minor version. + +Currently, Augur only officially supports the last-released version represented by the latest **release** tag. +In most cases, the latest commit on the `release` branch is made immediately prior to a release, but always rely +on the latest tagged release, not the `release` branch in production. .. note:: - If future needs require supporting multiple Augur versions concurrently, individual numbered release branches may be made from this central `release`` branch to allow any hotfixes to be applied to each supported version independently of the others. + If future needs require supporting multiple Augur versions concurrently, individual numbered + release branches may be made from this central `release` branch to allow any hotfixes to be applied + to each supported version independently of the others. The Release Process +------------------- + +When the next release is set to be cut, some preparation steps need to take place first. These include: + +- Ensuring all features planned for that release are merged, and any unrelated changes are delayed (as appropriate) until after the release. +- Creating a Pull Request to update any applicable metadata (such as version information and changelogs) on the `main` branch. + +Version Management (Updated) +---------------------------- + +Starting from version **0.90.0**, Augur now uses a **single source of truth** for its version information, +defined in `metadata.py`. 
+ +Previously, the version number needed to be manually updated in several different places during a release, including: -When the next release is set to be cut, some preparation steps need to take place first, these include: -- Ensuring all features planned for that release are merged and any unrelated changes are delayed (as appropriate) until after the release. -- Creating a Pull Request to update any applicable metadata (such as version information and Changelogs) on the `main` branch. +- ``pyproject.toml`` (for Python packaging) +- Dockerfiles (used for building and tagging images) +- GitHub Actions workflow files (e.g., ``.github/workflows/build_docker.yml``) +- Any scripts or documentation pages referencing specific versions -Once all release preparation has been completed, a new Pull Request can be created to merge the main branch into the `release 'branch. This creates a final review opportunity and allows for another run of (potentially more stringent) CI jobs compared to those run on `main`, catching issues that may have come up throughout the various merges or in the process of preparing for release. +This manual process increased the chance of version mismatches between code, Docker images, and releases. -After this PR is merged, a tag is created that points to the commit on the `release` branch, effectively labeling it so that it can be returned to later if needed. This labeling process can also be the basis for additional CI jobs that build and upload the released code to distribution platforms such as Docker Hub or the GitHub Container Registry +Now, this has been **fully centralized**: + +- The version number is declared once in ``metadata.py`` as ``__version__``. +- A helper script ``get_version.py`` reads this value and dynamically injects it into Docker builds via a build argument. +- The CI/CD pipeline (GitHub Actions) also reads the same version from ``metadata.py`` when tagging builds and Docker images. 
+ +This ensures that all parts of Augur — including Python packaging, Docker images, and release artifacts — +use the **exact same version**, automatically. + +Therefore, before tagging a new release, only the version in ``metadata.py`` needs to be updated. +All other build and deployment steps automatically consume this version during the release process. + +Once all release preparation has been completed, a new Pull Request can be created to merge the `main` +branch into the `release` branch. This creates a final review opportunity and allows for another run of +(potentially more stringent) CI jobs compared to those run on `main`, catching issues that may have come up +throughout the various merges or during the process of preparing for release. + +After this PR is merged, a tag is created that points to the commit on the `release` branch, +effectively labeling it so that it can be returned to later if needed. This labeling process can +also be the basis for additional CI jobs that build and upload the released code to distribution +platforms such as Docker Hub or the GitHub Container Registry. Why? +---- This is done to solve a number of problems: -- having changes moving in two directions at once (i.e. features coming from main, and hotfixes coming from release) was often confusing and increased the odds that a change would be missed, such as being shipped as a hotfix but not merged into the main codebase - leading to a regression in the next release. +- Having changes moving in two directions at once (i.e. features coming from `main`, and hotfixes coming from `release`) + was often confusing and increased the odds that a change would be missed, such as being shipped as a hotfix + but not merged into the main codebase — leading to a regression in the next release. 
+ + +Special Case: Hotfixes +---------------------- +If the fix is a hotfix: -Special case: Hotfixes -if the fix was a hotfix: -- changelog updates and other metadata changes should be included as part of the PR -- this is where mergeify or something helps re-create the PR targeting the release branch directly. at which point the release process is followed +- Changelog updates and other metadata changes should be included as part of the PR. +- This is where tools like **Mergeify** can help re-create the PR targeting the `release` branch directly, + at which point the regular release process is followed. From fb88a929ed16ab5540eb9b6f67fd2c9c931e0ab0 Mon Sep 17 00:00:00 2001 From: Sajal-Kulshreshtha Date: Wed, 15 Oct 2025 23:43:15 +0530 Subject: [PATCH 45/67] removed unused import and script Signed-off-by: Sajal-Kulshreshtha --- .github/workflows/build_docker.yml | 2 +- scripts/ci/get_version.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 scripts/ci/get_version.py diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index 7828019514..75590ac0ad 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -61,7 +61,7 @@ jobs: - name: Extract project version id: version run: | - VERSION=$(python -c "import re; exec(open('metadata.py').read()); print(__version__)") + VERSION=$(python -c "exec(open('metadata.py').read()); print(__version__)") echo "version=$VERSION" >> $GITHUB_OUTPUT echo "Using version: $VERSION" diff --git a/scripts/ci/get_version.py b/scripts/ci/get_version.py deleted file mode 100644 index e98d520602..0000000000 --- a/scripts/ci/get_version.py +++ /dev/null @@ -1,7 +0,0 @@ -import sys -import os - -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) -from metadata import __version__ - -print(__version__) From cd87f5cbe421e216ee152f489ad470518747ae2d Mon Sep 17 00:00:00 2001 From: saksham23467 
<142910439+saksham23467@users.noreply.github.com> Date: Thu, 31 Jul 2025 17:57:26 +0530 Subject: [PATCH 46/67] Add Clones metric API (#2604) Signed-off-by: saksham23467 <142910439+saksham23467@users.noreply.github.com> --- augur/api/metrics/repo_meta.py | 56 ++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/augur/api/metrics/repo_meta.py b/augur/api/metrics/repo_meta.py index ffc8fc84ef..c39922e17b 100644 --- a/augur/api/metrics/repo_meta.py +++ b/augur/api/metrics/repo_meta.py @@ -1240,3 +1240,59 @@ def aggregate_summary(repo_group_id, repo_id=None, begin_date=None, end_date=Non results = pd.read_sql(summarySQL, conn, params={'repo_id': repo_id, 'begin_date': begin_date, 'end_date': end_date}) return results + +@register_metric() +def clones(repo_group_id, repo_id=None, begin_date=None, end_date=None): + """ + Returns the number of repository clones (total and unique) for a given repo or repo group. + :param repo_group_id: The repository's repo_group_id + :param repo_id: The repository's repo_id, defaults to None + :param begin_date: Start date for filtering clone data (optional) + :param end_date: End date for filtering clone data (optional) + :return: DataFrame of clone counts (total and unique) per day + """ + if not begin_date: + begin_date = '1970-1-1 00:00:00' + if not end_date: + end_date = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + + if repo_id: + clones_sql = s.sql.text(""" + SELECT + repo_id, + clone_data_timestamp AS date, + count_clones AS total_clones, + unique_clones + FROM augur_data.repo_clones_data + WHERE repo_id = :repo_id + AND clone_data_timestamp BETWEEN :begin_date AND :end_date + ORDER BY clone_data_timestamp + """) + with current_app.engine.connect() as conn: + results = pd.read_sql(clones_sql, conn, params={ + 'repo_id': repo_id, + 'begin_date': begin_date, + 'end_date': end_date + }) + return results + else: + clones_sql = s.sql.text(""" + SELECT + repo_id, + clone_data_timestamp AS date, + 
count_clones AS total_clones, + unique_clones + FROM augur_data.repo_clones_data + WHERE repo_id IN ( + SELECT repo_id FROM augur_data.repo WHERE repo_group_id = :repo_group_id + ) + AND clone_data_timestamp BETWEEN :begin_date AND :end_date + ORDER BY repo_id, clone_data_timestamp + """) + with current_app.engine.connect() as conn: + results = pd.read_sql(clones_sql, conn, params={ + 'repo_group_id': repo_group_id, + 'begin_date': begin_date, + 'end_date': end_date + }) + return results From c088b8118634fa8a2094f2088179161e202f7aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= <765740+giordano@users.noreply.github.com> Date: Fri, 17 Oct 2025 19:47:58 +0200 Subject: [PATCH 47/67] Format formatting of wait until time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Current formatting of time in "sleeping until" message doesn't including leading zeros for single digits minutes (and hours), which is odd: ```python >>> import time >>> wait_until_time = time.localtime(1760684520) >>> f"sleeping until {wait_until_time.tm_hour}:{wait_until_time.tm_min}" 'sleeping until 8:2' >>> f"sleeping until {wait_until_time.tm_hour:02d}:{wait_until_time.tm_min:02d}" 'sleeping until 08:02' ``` Signed-off-by: Mosè Giordano <765740+giordano@users.noreply.github.com> --- augur/application/db/models/augur_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 9212bcc5e9..9bb8ef9104 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -939,7 +939,7 @@ def is_valid_github_repo(gh_session, url: str) -> bool: ) wait_until_time = localtime(wait_until) logger.error(f"rate limited fetching {url}") - logger.error(f"sleeping until {wait_until_time.tm_hour}:{wait_until_time.tm_min} ({wait_in_seconds} seconds)") + logger.error(f"sleeping until 
{wait_until_time.tm_hour:02d}:{wait_until_time.tm_min:02d} ({wait_in_seconds} seconds)") sleep(wait_in_seconds) attempts+=1 continue @@ -3600,4 +3600,4 @@ class RepoClone(Base): count_clones = Column(BigInteger) clone_data_timestamp = Column(TIMESTAMP(precision=6)) - repo = relationship("Repo") \ No newline at end of file + repo = relationship("Repo") From d5bb5391ee7b6b42534062c59b6fa7b2b622ed9a Mon Sep 17 00:00:00 2001 From: Ulincsys Date: Tue, 21 Oct 2025 18:39:17 -0500 Subject: [PATCH 48/67] Remove extraneous log statement Signed-off-by: Ulincsys --- augur/application/db/lib.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index a82c97dd66..52efee87ee 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -217,8 +217,6 @@ def facade_bulk_insert_commits(logger, records): session.rollback() if len(records) > 1: - logger.error(f"Ran into issue when trying to insert commits \n Error: {e}") - #split list into halves and retry insert until we isolate offending record firsthalfRecords = records[:len(records)//2] secondhalfRecords = records[len(records)//2:] From 058120b296f5fb8b3a3ec3508fa80350df9e2c92 Mon Sep 17 00:00:00 2001 From: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> Date: Wed, 22 Oct 2025 07:39:00 -0400 Subject: [PATCH 49/67] copy podman test cleanup step to docker build as well Add step to remove unnecessary files from Docker image Signed-off-by: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> --- .github/workflows/build_docker.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index 75590ac0ad..2f82922617 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -55,6 +55,11 @@ jobs: name: End-to-end test (Docker) runs-on: ubuntu-latest steps: + - name: Remove unnecessary files from the base image + run: | + sudo rm -rf 
/usr/share/dotnet + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Checkout repository uses: actions/checkout@v4 From ca257b63314653f59927355f5b20515d9d127eaa Mon Sep 17 00:00:00 2001 From: Mahmoud Abdelrazek Date: Tue, 21 Oct 2025 22:01:40 +0100 Subject: [PATCH 50/67] fix typo Signed-off-by: Mahmoud Abdelrazek --- augur/application/db/models/augur_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/augur/application/db/models/augur_data.py b/augur/application/db/models/augur_data.py index 9bb8ef9104..c80077d9b6 100644 --- a/augur/application/db/models/augur_data.py +++ b/augur/application/db/models/augur_data.py @@ -2885,7 +2885,7 @@ class PullRequestAssignee(Base): @classmethod def from_github(cls, assignee, repo_id, tool_source, tool_version, data_source): - pr_assignee_ojb = cls() + pr_assignee_obj = cls() # store the pr_url data on in the pr assignee data for now so we can relate it back to a pr later pr_assignee_obj.contrib_id = assignee["cntrb_id"] From 548f4298366d98b4eb3904031f6e719cd628001c Mon Sep 17 00:00:00 2001 From: PredictiveManish Date: Wed, 22 Oct 2025 21:39:25 +0530 Subject: [PATCH 51/67] Fixing the API Visibility issue in debug mode w/signoff Signed-off-by: PredictiveManish --- augur/tasks/util/random_key_auth.py | 68 ++++++++++++++++------------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/augur/tasks/util/random_key_auth.py b/augur/tasks/util/random_key_auth.py index f2fea35b36..d1f6bd3435 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/augur/tasks/util/random_key_auth.py @@ -3,23 +3,31 @@ from httpx import Auth, Request, Response from random import choice +import hashlib +def mask_key(key: str, first: int = 6, last: int = 3, stars: int = 6) -> str: + """Mask key except for the first and last few characters.""" + if not isinstance(key, str) or len(key) <= (first + last): + return "*" * stars + return f"{key[:first]}{'*' * stars}{key[-last:]}" + + +def key_fingerprint(key: str, length: int = 12) -> 
str: + """Return a short non-reversible fingerprint of the key for correlation.""" + h = hashlib.sha256(key.encode("utf-8")).hexdigest() + return h[:length] + class RandomKeyAuth(Auth): - """Custom Auth class for httpx that randomly assigns an api key to each request + """Custom Auth class for httpx that randomly assigns an API key to each request. Attributes: - list_of_keys ([str]): list of keys which are randomly selected from on each request - header_name (str): name of header that the keys need to be set to - key_format (str): format string that defines the structure of the key and leaves a {} for the key to be inserted + list_of_keys (List[str]): list of keys to choose from + header_name (str): name of header to set the key into + key_format (str): optional format string with {0} placeholder for key """ - - # pass a list of keys that are strings - # pass the name of the header that you would like to be set on the request - # Optionally pass the key_format. This is a string that contains a {} so the key can be added and applied to the header in the correct way. - # For example on github the keys are formatted like "token asdfasfdasf" where asdfasfdasf is the key. 
So for github - # the key_format="token {0}" + def __init__(self, list_of_keys: List[str], header_name: str, logger, key_format: Optional[str] = None): self.list_of_keys = list_of_keys self.header_name = header_name @@ -27,27 +35,27 @@ def __init__(self, list_of_keys: List[str], header_name: str, logger, key_format self.logger = logger def auth_flow(self, request: Request) -> Generator[Request, Response, None]: - - # the choice function is from the random library, and gets a random value from a list - # this gets a random key from the list - - if self.list_of_keys: - key_value = choice(self.list_of_keys) - self.logger.debug(f'Key value used in request: {key_value}') - # formats the key string into a format GitHub will accept - - if self.key_format: - key_string = self.key_format.format(key_value) - else: - key_string = key_value - - # set the headers of the request with the new key - request.headers[self.header_name] = key_string - #self.logger.info(f"List of Keys: {self.list_of_keys}") - + """Attach a randomly selected API key to the request headers.""" + if not self.list_of_keys: + self.logger.error("No valid keys available to make a request.") + yield request + return + + key_value = choice(self.list_of_keys) + + # Log only masked or hashed form, never the full key + masked = mask_key(key_value) + fingerprint = key_fingerprint(key_value) + self.logger.debug(f"Key used for request (masked): {masked} | fingerprint: {fingerprint}") + + # Apply formatting if needed + if self.key_format: + key_string = self.key_format.format(key_value) else: - self.logger.error(f"There are no valid keys to make a request with: {self.list_of_keys}") + key_string = key_value - # sends the request back with modified headers + # Set header + request.headers[self.header_name] = key_string + # sends the request back with modified headers # basically it saves our changes to the request object yield request From 17ec742e5dcc6d0df9a03fc46be155adcef69330 Mon Sep 17 00:00:00 2001 From: 
PredictiveManish Date: Wed, 22 Oct 2025 21:54:37 +0530 Subject: [PATCH 52/67] Fixing the warnings in #3183 w/signoff Signed-off-by: PredictiveManish --- docs/source/conf.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 45966f19ec..e925a59ffd 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,13 +19,15 @@ # import os import sys -import sphinx_rtd_theme + here = os.path.abspath(os.path.dirname(__file__)) -exec(open(os.path.join(here, "../../metadata.py")).read()) +# Add the project root (two levels up: docs/source → augur) +sys.path.insert(0, os.path.abspath(os.path.join(here, '../..'))) -sys.path.insert(0, os.path.abspath('../../../augur')) +# Now import metadata +from metadata import __copyright__, __release__, __version__ # -- General configuration ------------------------------------------------ From 6d45241abe72d1386809f19f0dc6945674fe40a1 Mon Sep 17 00:00:00 2001 From: PredictiveManish Date: Thu, 23 Oct 2025 09:40:49 +0530 Subject: [PATCH 53/67] Fixing warnings in #3183 w/signoff Signed-off-by: PredictiveManish --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index e925a59ffd..cf9c6ec7a8 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -26,7 +26,7 @@ # Add the project root (two levels up: docs/source → augur) sys.path.insert(0, os.path.abspath(os.path.join(here, '../..'))) -# Now import metadata +# Now importing variables from metadata.py from metadata import __copyright__, __release__, __version__ # -- General configuration ------------------------------------------------ From 0b86eff350fc1de76608cea49c46327865419592 Mon Sep 17 00:00:00 2001 From: PredictiveManish Date: Fri, 24 Oct 2025 12:20:40 +0530 Subject: [PATCH 54/67] Reverting changes Signed-off-by: PredictiveManish --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/docs/source/conf.py b/docs/source/conf.py index cf9c6ec7a8..94921bd5d1 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -19,7 +19,7 @@ # import os import sys - +import sphinx_rtd_theme here = os.path.abspath(os.path.dirname(__file__)) From 27049bb0c309af4cbf9cf2766e79e0ec98b43170 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Fri, 24 Oct 2025 11:00:15 -0400 Subject: [PATCH 55/67] allow materialized view refresh to be disabled Signed-off-by: Adrian Edwards --- augur/tasks/init/celery_app.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/augur/tasks/init/celery_app.py b/augur/tasks/init/celery_app.py index db8d2239d4..d1209fadd0 100644 --- a/augur/tasks/init/celery_app.py +++ b/augur/tasks/init/celery_app.py @@ -241,8 +241,11 @@ def setup_periodic_tasks(sender, **kwargs): sender.add_periodic_task(thirty_days_in_seconds, non_repo_domain_tasks.s()) mat_views_interval = int(config.get_value('Celery', 'refresh_materialized_views_interval_in_days')) - logger.info(f"Scheduling refresh materialized view every night at 1am CDT") - sender.add_periodic_task(datetime.timedelta(days=mat_views_interval), refresh_materialized_views.s()) + if mat_views_interval > 0: + logger.info(f"Scheduling refresh materialized view every night at 1am CDT") + sender.add_periodic_task(datetime.timedelta(days=mat_views_interval), refresh_materialized_views.s()) + else: + logger.info(f"Refresh materialized view task is disabled.") # logger.info(f"Scheduling update of collection weights on midnight each day") # sender.add_periodic_task(crontab(hour=0, minute=0),augur_collection_update_weights.s()) From 8ad15bd833917944544345c88b2d6076279bc4a0 Mon Sep 17 00:00:00 2001 From: PredictiveManish Date: Sat, 25 Oct 2025 14:06:09 +0530 Subject: [PATCH 56/67] Reverting the unnecessary changes Signed-off-by: PredictiveManish --- augur/tasks/util/random_key_auth.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/augur/tasks/util/random_key_auth.py b/augur/tasks/util/random_key_auth.py index d1f6bd3435..7af17d81b9 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/augur/tasks/util/random_key_auth.py @@ -20,12 +20,12 @@ def key_fingerprint(key: str, length: int = 12) -> str: class RandomKeyAuth(Auth): - """Custom Auth class for httpx that randomly assigns an API key to each request. + """Custom Auth class for httpx that randomly assigns an api key to each request. Attributes: - list_of_keys (List[str]): list of keys to choose from - header_name (str): name of header to set the key into - key_format (str): optional format string with {0} placeholder for key + list_of_keys (List[str]): list of keys which are randomly selected from on each request + header_name (str): name of header that the keys need to be set to + key_format (str): format string that defines the structure of the key and leaves a {} for the key to be inserted """ def __init__(self, list_of_keys: List[str], header_name: str, logger, key_format: Optional[str] = None): From 67000888c7d61a2d369bd9cf5a543d4b90cf5ee1 Mon Sep 17 00:00:00 2001 From: PredictiveManish Date: Sat, 25 Oct 2025 14:11:22 +0530 Subject: [PATCH 57/67] Moving mask_key() to augur.util Signed-off-by: PredictiveManish --- augur/tasks/util/random_key_auth.py | 8 +------- augur/util/keys.py | 5 +++++ 2 files changed, 6 insertions(+), 7 deletions(-) create mode 100644 augur/util/keys.py diff --git a/augur/tasks/util/random_key_auth.py b/augur/tasks/util/random_key_auth.py index 7af17d81b9..5d075f8202 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/augur/tasks/util/random_key_auth.py @@ -4,13 +4,7 @@ from httpx import Auth, Request, Response from random import choice import hashlib - - -def mask_key(key: str, first: int = 6, last: int = 3, stars: int = 6) -> str: - """Mask key except for the first and last few characters.""" - if not isinstance(key, str) or len(key) <= (first + last): - return "*" * stars - return f"{key[:first]}{'*' * 
stars}{key[-last:]}" +from augur.util.keys import mask_key def key_fingerprint(key: str, length: int = 12) -> str: diff --git a/augur/util/keys.py b/augur/util/keys.py new file mode 100644 index 0000000000..31ef63d0cb --- /dev/null +++ b/augur/util/keys.py @@ -0,0 +1,5 @@ +def mask_key(key: str, first: int = 6, last: int = 3, stars: int = 6) -> str: + """Mask key except for the first and last few characters.""" + if not isinstance(key, str) or len(key) <= (first + last): + return "*" * stars + return f"{key[:first]}{'*' * stars}{key[-last:]}" \ No newline at end of file From bc9bd96969a67a05f09b7b47323a7f2f9d73a36b Mon Sep 17 00:00:00 2001 From: PredictiveManish Date: Sat, 25 Oct 2025 14:15:02 +0530 Subject: [PATCH 58/67] Removed key_fingerprint for easy interpretation Signed-off-by: PredictiveManish --- augur/tasks/util/random_key_auth.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/augur/tasks/util/random_key_auth.py b/augur/tasks/util/random_key_auth.py index 5d075f8202..dc59544aef 100644 --- a/augur/tasks/util/random_key_auth.py +++ b/augur/tasks/util/random_key_auth.py @@ -7,12 +7,6 @@ from augur.util.keys import mask_key -def key_fingerprint(key: str, length: int = 12) -> str: - """Return a short non-reversible fingerprint of the key for correlation.""" - h = hashlib.sha256(key.encode("utf-8")).hexdigest() - return h[:length] - - class RandomKeyAuth(Auth): """Custom Auth class for httpx that randomly assigns an api key to each request. 
@@ -39,8 +33,7 @@ def auth_flow(self, request: Request) -> Generator[Request, Response, None]: # Log only masked or hashed form, never the full key masked = mask_key(key_value) - fingerprint = key_fingerprint(key_value) - self.logger.debug(f"Key used for request (masked): {masked} | fingerprint: {fingerprint}") + self.logger.debug(f"Key used for request (masked): {masked}") # Apply formatting if needed if self.key_format: From f3f4065d8adf5c79c0b011016a83fee44967f4a3 Mon Sep 17 00:00:00 2001 From: PredictiveManish Date: Sun, 26 Oct 2025 10:08:30 +0530 Subject: [PATCH 59/67] Fix: python3.9 compatibility message in docs #3266 Signed-off-by: PredictiveManish --- docs/source/getting-started/installation.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index d2a79c4f71..9fa00dc291 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -31,8 +31,6 @@ Required: -**Python 3.9 is not yet supported because TensorFlow, which we use in our machine learning workers, does not yet support Python 3.9.** - Our REST API & data collection workers write in Python 3.6. We query the GitHub & GitLab API to collect data about issues, pull requests, contributors, and other information about a repository, so GitLab and GitHub access tokens are **required** for data collection. 
Optional: From c615eb0720ba60c64afd60e9305e92439081c778 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 15 Oct 2025 20:41:19 +0100 Subject: [PATCH 60/67] install mypy Signed-off-by: Adrian Edwards --- pyproject.toml | 1 + uv.lock | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index c086babe25..c3c9b98552 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,6 +99,7 @@ dev = [ "pytest==6.2.5", "toml>=0.10.2", "ipdb==0.13.9", + "mypy>=1.18.2", {include-group = "docs"}, ] docs = [ diff --git a/uv.lock b/uv.lock index 1c74a61de7..8604dfbb4c 100644 --- a/uv.lock +++ b/uv.lock @@ -218,6 +218,7 @@ dependencies = [ dev = [ { name = "docutils" }, { name = "ipdb" }, + { name = "mypy" }, { name = "pytest" }, { name = "setuptools" }, { name = "sphinx" }, @@ -316,6 +317,7 @@ requires-dist = [ dev = [ { name = "docutils", specifier = "==0.20.1" }, { name = "ipdb", specifier = "==0.13.9" }, + { name = "mypy", specifier = ">=1.18.2" }, { name = "pytest", specifier = "==6.2.5" }, { name = "setuptools" }, { name = "sphinx", specifier = "==7.2.6" }, @@ -2016,6 +2018,60 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ca/91/7dc28d5e2a11a5ad804cf2b7f7a5fcb1eb5a4966d66a5d2b41aee6376543/msgpack-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:6d489fba546295983abd142812bda76b57e33d0b9f5d5b71c09a583285506f69", size = 72341, upload-time = "2025-06-13T06:52:27.835Z" }, ] +[[package]] +name = "mypy" +version = "1.18.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mypy-extensions" }, + { name = "pathspec" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c0/77/8f0d0001ffad290cef2f7f216f96c814866248a0b92a722365ed54648e7e/mypy-1.18.2.tar.gz", hash = "sha256:06a398102a5f203d7477b2923dda3634c36727fa5c237d8f859ef90c42a9924b", size = 3448846, 
upload-time = "2025-09-19T00:11:10.519Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/6f/657961a0743cff32e6c0611b63ff1c1970a0b482ace35b069203bf705187/mypy-1.18.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1eab0cf6294dafe397c261a75f96dc2c31bffe3b944faa24db5def4e2b0f77c", size = 12807973, upload-time = "2025-09-19T00:10:35.282Z" }, + { url = "https://files.pythonhosted.org/packages/10/e9/420822d4f661f13ca8900f5fa239b40ee3be8b62b32f3357df9a3045a08b/mypy-1.18.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7a780ca61fc239e4865968ebc5240bb3bf610ef59ac398de9a7421b54e4a207e", size = 11896527, upload-time = "2025-09-19T00:10:55.791Z" }, + { url = "https://files.pythonhosted.org/packages/aa/73/a05b2bbaa7005f4642fcfe40fb73f2b4fb6bb44229bd585b5878e9a87ef8/mypy-1.18.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448acd386266989ef11662ce3c8011fd2a7b632e0ec7d61a98edd8e27472225b", size = 12507004, upload-time = "2025-09-19T00:11:05.411Z" }, + { url = "https://files.pythonhosted.org/packages/4f/01/f6e4b9f0d031c11ccbd6f17da26564f3a0f3c4155af344006434b0a05a9d/mypy-1.18.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f9e171c465ad3901dc652643ee4bffa8e9fef4d7d0eece23b428908c77a76a66", size = 13245947, upload-time = "2025-09-19T00:10:46.923Z" }, + { url = "https://files.pythonhosted.org/packages/d7/97/19727e7499bfa1ae0773d06afd30ac66a58ed7437d940c70548634b24185/mypy-1.18.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:592ec214750bc00741af1f80cbf96b5013d81486b7bb24cb052382c19e40b428", size = 13499217, upload-time = "2025-09-19T00:09:39.472Z" }, + { url = "https://files.pythonhosted.org/packages/9f/4f/90dc8c15c1441bf31cf0f9918bb077e452618708199e530f4cbd5cede6ff/mypy-1.18.2-cp310-cp310-win_amd64.whl", hash = "sha256:7fb95f97199ea11769ebe3638c29b550b5221e997c63b14ef93d2e971606ebed", size = 9766753, upload-time = "2025-09-19T00:10:49.161Z" 
}, + { url = "https://files.pythonhosted.org/packages/88/87/cafd3ae563f88f94eec33f35ff722d043e09832ea8530ef149ec1efbaf08/mypy-1.18.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:807d9315ab9d464125aa9fcf6d84fde6e1dc67da0b6f80e7405506b8ac72bc7f", size = 12731198, upload-time = "2025-09-19T00:09:44.857Z" }, + { url = "https://files.pythonhosted.org/packages/0f/e0/1e96c3d4266a06d4b0197ace5356d67d937d8358e2ee3ffac71faa843724/mypy-1.18.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:776bb00de1778caf4db739c6e83919c1d85a448f71979b6a0edd774ea8399341", size = 11817879, upload-time = "2025-09-19T00:09:47.131Z" }, + { url = "https://files.pythonhosted.org/packages/72/ef/0c9ba89eb03453e76bdac5a78b08260a848c7bfc5d6603634774d9cd9525/mypy-1.18.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1379451880512ffce14505493bd9fe469e0697543717298242574882cf8cdb8d", size = 12427292, upload-time = "2025-09-19T00:10:22.472Z" }, + { url = "https://files.pythonhosted.org/packages/1a/52/ec4a061dd599eb8179d5411d99775bec2a20542505988f40fc2fee781068/mypy-1.18.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1331eb7fd110d60c24999893320967594ff84c38ac6d19e0a76c5fd809a84c86", size = 13163750, upload-time = "2025-09-19T00:09:51.472Z" }, + { url = "https://files.pythonhosted.org/packages/c4/5f/2cf2ceb3b36372d51568f2208c021870fe7834cf3186b653ac6446511839/mypy-1.18.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3ca30b50a51e7ba93b00422e486cbb124f1c56a535e20eff7b2d6ab72b3b2e37", size = 13351827, upload-time = "2025-09-19T00:09:58.311Z" }, + { url = "https://files.pythonhosted.org/packages/c8/7d/2697b930179e7277529eaaec1513f8de622818696857f689e4a5432e5e27/mypy-1.18.2-cp311-cp311-win_amd64.whl", hash = "sha256:664dc726e67fa54e14536f6e1224bcfce1d9e5ac02426d2326e2bb4e081d1ce8", size = 9757983, upload-time = "2025-09-19T00:10:09.071Z" }, + { url = 
"https://files.pythonhosted.org/packages/07/06/dfdd2bc60c66611dd8335f463818514733bc763e4760dee289dcc33df709/mypy-1.18.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:33eca32dd124b29400c31d7cf784e795b050ace0e1f91b8dc035672725617e34", size = 12908273, upload-time = "2025-09-19T00:10:58.321Z" }, + { url = "https://files.pythonhosted.org/packages/81/14/6a9de6d13a122d5608e1a04130724caf9170333ac5a924e10f670687d3eb/mypy-1.18.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a3c47adf30d65e89b2dcd2fa32f3aeb5e94ca970d2c15fcb25e297871c8e4764", size = 11920910, upload-time = "2025-09-19T00:10:20.043Z" }, + { url = "https://files.pythonhosted.org/packages/5f/a9/b29de53e42f18e8cc547e38daa9dfa132ffdc64f7250e353f5c8cdd44bee/mypy-1.18.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5d6c838e831a062f5f29d11c9057c6009f60cb294fea33a98422688181fe2893", size = 12465585, upload-time = "2025-09-19T00:10:33.005Z" }, + { url = "https://files.pythonhosted.org/packages/77/ae/6c3d2c7c61ff21f2bee938c917616c92ebf852f015fb55917fd6e2811db2/mypy-1.18.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01199871b6110a2ce984bde85acd481232d17413868c9807e95c1b0739a58914", size = 13348562, upload-time = "2025-09-19T00:10:11.51Z" }, + { url = "https://files.pythonhosted.org/packages/4d/31/aec68ab3b4aebdf8f36d191b0685d99faa899ab990753ca0fee60fb99511/mypy-1.18.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a2afc0fa0b0e91b4599ddfe0f91e2c26c2b5a5ab263737e998d6817874c5f7c8", size = 13533296, upload-time = "2025-09-19T00:10:06.568Z" }, + { url = "https://files.pythonhosted.org/packages/9f/83/abcb3ad9478fca3ebeb6a5358bb0b22c95ea42b43b7789c7fb1297ca44f4/mypy-1.18.2-cp312-cp312-win_amd64.whl", hash = "sha256:d8068d0afe682c7c4897c0f7ce84ea77f6de953262b12d07038f4d296d547074", size = 9828828, upload-time = "2025-09-19T00:10:28.203Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/04/7f462e6fbba87a72bc8097b93f6842499c428a6ff0c81dd46948d175afe8/mypy-1.18.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:07b8b0f580ca6d289e69209ec9d3911b4a26e5abfde32228a288eb79df129fcc", size = 12898728, upload-time = "2025-09-19T00:10:01.33Z" }, + { url = "https://files.pythonhosted.org/packages/99/5b/61ed4efb64f1871b41fd0b82d29a64640f3516078f6c7905b68ab1ad8b13/mypy-1.18.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ed4482847168439651d3feee5833ccedbf6657e964572706a2adb1f7fa4dfe2e", size = 11910758, upload-time = "2025-09-19T00:10:42.607Z" }, + { url = "https://files.pythonhosted.org/packages/3c/46/d297d4b683cc89a6e4108c4250a6a6b717f5fa96e1a30a7944a6da44da35/mypy-1.18.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c3ad2afadd1e9fea5cf99a45a822346971ede8685cc581ed9cd4d42eaf940986", size = 12475342, upload-time = "2025-09-19T00:11:00.371Z" }, + { url = "https://files.pythonhosted.org/packages/83/45/4798f4d00df13eae3bfdf726c9244bcb495ab5bd588c0eed93a2f2dd67f3/mypy-1.18.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a431a6f1ef14cf8c144c6b14793a23ec4eae3db28277c358136e79d7d062f62d", size = 13338709, upload-time = "2025-09-19T00:11:03.358Z" }, + { url = "https://files.pythonhosted.org/packages/d7/09/479f7358d9625172521a87a9271ddd2441e1dab16a09708f056e97007207/mypy-1.18.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7ab28cc197f1dd77a67e1c6f35cd1f8e8b73ed2217e4fc005f9e6a504e46e7ba", size = 13529806, upload-time = "2025-09-19T00:10:26.073Z" }, + { url = "https://files.pythonhosted.org/packages/71/cf/ac0f2c7e9d0ea3c75cd99dff7aec1c9df4a1376537cb90e4c882267ee7e9/mypy-1.18.2-cp313-cp313-win_amd64.whl", hash = "sha256:0e2785a84b34a72ba55fb5daf079a1003a34c05b22238da94fcae2bbe46f3544", size = 9833262, upload-time = "2025-09-19T00:10:40.035Z" }, + { url = 
"https://files.pythonhosted.org/packages/5a/0c/7d5300883da16f0063ae53996358758b2a2df2a09c72a5061fa79a1f5006/mypy-1.18.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:62f0e1e988ad41c2a110edde6c398383a889d95b36b3e60bcf155f5164c4fdce", size = 12893775, upload-time = "2025-09-19T00:10:03.814Z" }, + { url = "https://files.pythonhosted.org/packages/50/df/2cffbf25737bdb236f60c973edf62e3e7b4ee1c25b6878629e88e2cde967/mypy-1.18.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:8795a039bab805ff0c1dfdb8cd3344642c2b99b8e439d057aba30850b8d3423d", size = 11936852, upload-time = "2025-09-19T00:10:51.631Z" }, + { url = "https://files.pythonhosted.org/packages/be/50/34059de13dd269227fb4a03be1faee6e2a4b04a2051c82ac0a0b5a773c9a/mypy-1.18.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ca1e64b24a700ab5ce10133f7ccd956a04715463d30498e64ea8715236f9c9c", size = 12480242, upload-time = "2025-09-19T00:11:07.955Z" }, + { url = "https://files.pythonhosted.org/packages/5b/11/040983fad5132d85914c874a2836252bbc57832065548885b5bb5b0d4359/mypy-1.18.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d924eef3795cc89fecf6bedc6ed32b33ac13e8321344f6ddbf8ee89f706c05cb", size = 13326683, upload-time = "2025-09-19T00:09:55.572Z" }, + { url = "https://files.pythonhosted.org/packages/e9/ba/89b2901dd77414dd7a8c8729985832a5735053be15b744c18e4586e506ef/mypy-1.18.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:20c02215a080e3a2be3aa50506c67242df1c151eaba0dcbc1e4e557922a26075", size = 13514749, upload-time = "2025-09-19T00:10:44.827Z" }, + { url = "https://files.pythonhosted.org/packages/25/bc/cc98767cffd6b2928ba680f3e5bc969c4152bf7c2d83f92f5a504b92b0eb/mypy-1.18.2-cp314-cp314-win_amd64.whl", hash = "sha256:749b5f83198f1ca64345603118a6f01a4e99ad4bf9d103ddc5a3200cc4614adf", size = 9982959, upload-time = "2025-09-19T00:10:37.344Z" }, + { url = 
"https://files.pythonhosted.org/packages/87/e3/be76d87158ebafa0309946c4a73831974d4d6ab4f4ef40c3b53a385a66fd/mypy-1.18.2-py3-none-any.whl", hash = "sha256:22a1748707dd62b58d2ae53562ffc4d7f8bcc727e8ac7cbc69c053ddc874d47e", size = 2352367, upload-time = "2025-09-19T00:10:15.489Z" }, +] + +[[package]] +name = "mypy-extensions" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, +] + [[package]] name = "networkx" version = "3.4.2" @@ -2168,6 +2224,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/71/e7/40fb618334dcdf7c5a316c0e7343c5cd82d3d866edc100d98e29bc945ecd/partd-1.4.2-py3-none-any.whl", hash = "sha256:978e4ac767ec4ba5b86c6eaa52e5a2a3bc748a2ca839e8cc798f1cc6ce6efb0f", size = 18905, upload-time = "2024-05-06T19:51:39.271Z" }, ] +[[package]] +name = "pathspec" +version = "0.12.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = 
"sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, +] + [[package]] name = "pexpect" version = "4.9.0" From 718034accfb87c9ef07221d085c677e1246a7236 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 16 Oct 2025 20:08:15 +0100 Subject: [PATCH 61/67] add mypy config to toml file Signed-off-by: Adrian Edwards --- pyproject.toml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index c3c9b98552..4ed15deea2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,3 +150,10 @@ constraint-dependencies = [ # https://docs.python.org/3.10/whatsnew/3.10.html#removed "graphql-server-core>1.1.1", ] + +[tool.mypy] +files = ['augur/application/db/util.py'] +ignore_missing_imports = true +follow_imports = "skip" +disallow_untyped_defs = false +exclude_gitignore = true From 2fbffd1e966e971726e7000ab2dec7b6b41e48cc Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Thu, 16 Oct 2025 20:08:43 +0100 Subject: [PATCH 62/67] add missing types packages as suggested by mypy Signed-off-by: Adrian Edwards --- pyproject.toml | 6 +++++- uv.lock | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4ed15deea2..1829b4e7d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,7 +100,11 @@ dev = [ "toml>=0.10.2", "ipdb==0.13.9", "mypy>=1.18.2", - {include-group = "docs"}, + "types-requests>=2.31.0.6", + "types-pyyaml>=6.0.12.20250915", + "types-python-dateutil>=2.9.0.20251008", + "types-toml>=0.10.8.20240310", + { include-group = "docs" }, ] docs = [ "docutils==0.20.1", diff --git a/uv.lock b/uv.lock index 8604dfbb4c..18681966a7 100644 --- a/uv.lock +++ b/uv.lock @@ -227,6 +227,10 @@ dev = [ { name = "sphinxcontrib-redoc" }, { name = "toml" }, { name = "tox" }, + { name = "types-python-dateutil" }, + { name = "types-pyyaml" }, + { name = "types-requests" }, + { name = 
"types-toml" }, ] docs = [ { name = "docutils" }, @@ -326,6 +330,10 @@ dev = [ { name = "sphinxcontrib-redoc", specifier = "==1.6.0" }, { name = "toml", specifier = ">=0.10.2" }, { name = "tox", specifier = "==3.24.4" }, + { name = "types-python-dateutil", specifier = ">=2.9.0.20251008" }, + { name = "types-pyyaml", specifier = ">=6.0.12.20250915" }, + { name = "types-requests", specifier = ">=2.31.0.6" }, + { name = "types-toml", specifier = ">=0.10.8.20240310" }, ] docs = [ { name = "docutils", specifier = "==0.20.1" }, @@ -3793,6 +3801,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, ] +[[package]] +name = "types-python-dateutil" +version = "2.9.0.20251008" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fc/83/24ed25dd0c6277a1a170c180ad9eef5879ecc9a4745b58d7905a4588c80d/types_python_dateutil-2.9.0.20251008.tar.gz", hash = "sha256:c3826289c170c93ebd8360c3485311187df740166dbab9dd3b792e69f2bc1f9c", size = 16128, upload-time = "2025-10-08T02:51:34.93Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/af/5d24b8d49ef358468ecfdff5c556adf37f4fd28e336b96f923661a808329/types_python_dateutil-2.9.0.20251008-py3-none-any.whl", hash = "sha256:b9a5232c8921cf7661b29c163ccc56055c418ab2c6eabe8f917cbcc73a4c4157", size = 17934, upload-time = "2025-10-08T02:51:33.55Z" }, +] + +[[package]] +name = "types-pyyaml" +version = "6.0.12.20250915" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/69/3c51b36d04da19b92f9e815be12753125bd8bc247ba0470a982e6979e71c/types_pyyaml-6.0.12.20250915.tar.gz", hash = "sha256:0f8b54a528c303f0e6f7165687dd33fafa81c807fcac23f632b63aa624ced1d3", size = 17522, 
upload-time = "2025-09-15T03:01:00.728Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bd/e0/1eed384f02555dde685fff1a1ac805c1c7dcb6dd019c916fe659b1c1f9ec/types_pyyaml-6.0.12.20250915-py3-none-any.whl", hash = "sha256:e7d4d9e064e89a3b3cae120b4990cd370874d2bf12fa5f46c97018dd5d3c9ab6", size = 20338, upload-time = "2025-09-15T03:00:59.218Z" }, +] + +[[package]] +name = "types-requests" +version = "2.31.0.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "types-urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f9/b8/c1e8d39996b4929b918aba10dba5de07a8b3f4c8487bb61bb79882544e69/types-requests-2.31.0.6.tar.gz", hash = "sha256:cd74ce3b53c461f1228a9b783929ac73a666658f223e28ed29753771477b3bd0", size = 15535, upload-time = "2023-09-27T06:19:38.443Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5c/a1/6f8dc74d9069e790d604ddae70cb46dcbac668f1bb08136e7b0f2f5cd3bf/types_requests-2.31.0.6-py3-none-any.whl", hash = "sha256:a2db9cb228a81da8348b49ad6db3f5519452dd20a9c1e1a868c83c5fe88fd1a9", size = 14516, upload-time = "2023-09-27T06:19:36.373Z" }, +] + +[[package]] +name = "types-toml" +version = "0.10.8.20240310" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/86/47/3e4c75042792bff8e90d7991aa5c51812cc668828cc6cce711e97f63a607/types-toml-0.10.8.20240310.tar.gz", hash = "sha256:3d41501302972436a6b8b239c850b26689657e25281b48ff0ec06345b8830331", size = 4392, upload-time = "2024-03-10T02:18:37.518Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/da/a2/d32ab58c0b216912638b140ab2170ee4b8644067c293b170e19fba340ccc/types_toml-0.10.8.20240310-py3-none-any.whl", hash = "sha256:627b47775d25fa29977d9c70dc0cbab3f314f32c8d8d0c012f2ef5de7aaec05d", size = 4777, upload-time = "2024-03-10T02:18:36.568Z" }, +] + +[[package]] +name = "types-urllib3" +version = "1.26.25.14" +source = { registry = "https://pypi.org/simple" } +sdist = { 
url = "https://files.pythonhosted.org/packages/73/de/b9d7a68ad39092368fb21dd6194b362b98a1daeea5dcfef5e1adb5031c7e/types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f", size = 11239, upload-time = "2023-07-20T15:19:31.307Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/7b/3fc711b2efea5e85a7a0bbfe269ea944aa767bbba5ec52f9ee45d362ccf3/types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e", size = 15377, upload-time = "2023-07-20T15:19:30.379Z" }, +] + [[package]] name = "typing-extensions" version = "4.7.1" From 7f5056e066f16574539a0c40821efd256234ac44 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 15 Oct 2025 22:04:13 +0100 Subject: [PATCH 63/67] broaden checking to other application db files too Signed-off-by: Adrian Edwards --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1829b4e7d1..a3866d86e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,7 +156,7 @@ constraint-dependencies = [ ] [tool.mypy] -files = ['augur/application/db/util.py'] +files = ['augur/application/db/*.py'] ignore_missing_imports = true follow_imports = "skip" disallow_untyped_defs = false From 85e317cd5af49d02b95e8bdf784367bfc88729eb Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 15 Oct 2025 22:23:39 +0100 Subject: [PATCH 64/67] type fixes for Application DB files Signed-off-by: Adrian Edwards --- augur/application/db/data_parse.py | 8 ++++---- augur/application/db/engine.py | 2 +- augur/application/db/lib.py | 22 ++++++++++++---------- augur/application/db/session.py | 17 ++++++++++------- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/augur/application/db/data_parse.py b/augur/application/db/data_parse.py index de0d9aaa81..eaa99fd394 100644 --- a/augur/application/db/data_parse.py +++ b/augur/application/db/data_parse.py @@ 
-457,7 +457,7 @@ def extract_needed_gitlab_issue_label_data(labels: List[dict], repo_id: int, too -def extract_needed_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: +def extract_needed_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: """ Retrieve only the needed data for pr labels from the api response @@ -487,7 +487,7 @@ def extract_needed_issue_message_ref_data(message: dict, issue_id: int, repo_id: return message_ref_dict # retrieve only the needed data for pr labels from the api response -def extract_needed_pr_message_ref_data(comment: dict, pull_request_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: +def extract_needed_pr_message_ref_data(comment: dict, pull_request_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: message_ref_dict = { 'pull_request_id': pull_request_id, @@ -1128,7 +1128,7 @@ def extract_needed_mr_metadata(mr_dict, repo_id, pull_request_id, tool_source, t return all_meta -def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: +def extract_needed_gitlab_issue_message_ref_data(message: dict, issue_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: """ Extract the message id for a given message on an issue from an api response and connect it to the relevant repo id. 
@@ -1190,7 +1190,7 @@ def extract_needed_gitlab_message_data(comment: dict, platform_id: int, repo_id: return comment_dict -def extract_needed_gitlab_mr_message_ref_data(comment: dict, pull_request_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> List[dict]: +def extract_needed_gitlab_mr_message_ref_data(comment: dict, pull_request_id: int, repo_id: int, tool_source: str, tool_version: str, data_source: str) -> dict: """ Retrieve only the needed data for pr labels from the api response diff --git a/augur/application/db/engine.py b/augur/application/db/engine.py index 2870909093..0ea2bc1730 100644 --- a/augur/application/db/engine.py +++ b/augur/application/db/engine.py @@ -10,7 +10,7 @@ from augur.application.db.util import catch_operational_error -def parse_database_string(db_string: str) -> str: +def parse_database_string(db_string: str) -> tuple[str,str, str, str, str]: """Parse database string into the following components: username, password, host, port, database """ diff --git a/augur/application/db/lib.py b/augur/application/db/lib.py index 52efee87ee..09820168fc 100644 --- a/augur/application/db/lib.py +++ b/augur/application/db/lib.py @@ -271,31 +271,32 @@ def facade_bulk_insert_commits(logger, records): session.commit() else: raise e - - -def batch_insert_contributors(logger, data: Union[List[dict], dict]) -> Optional[List[dict]]: - batch_size = 1000 +def batch_insert_contributors(logger, data: Union[List[dict], dict], batch_size = 1000) -> Optional[List[dict]]: for i in range(0, len(data), batch_size): batch = data[i:i + batch_size] bulk_insert_dicts(logger, batch, Contributor, ['cntrb_id']) + + return None -def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: +def bulk_insert_dicts(logger, data_input: Union[List[dict], dict], table, 
natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: - if isinstance(data, list) is False: + if isinstance(data_input, list) is False: # if a dict is passed to data then # convert it to a list with one value - if isinstance(data, dict) is True: - data = [data] + if isinstance(data_input, dict) is True: + data = [data_input] else: logger.error("Data must be a list or a dict") return None + else: + data = list(data_input) if len(data) == 0: # self.logger.info("Gave no data to insert, returning...") @@ -397,8 +398,9 @@ def bulk_insert_dicts(logger, data: Union[List[dict], dict], table, natural_keys if deadlock_detected is True: logger.error("Made it through even though Deadlock was detected") - - return "success" + + # success + return None # othewise it gets the requested return columns and returns them as a list of dicts diff --git a/augur/application/db/session.py b/augur/application/db/session.py index a26fc172b7..661e989dd4 100644 --- a/augur/application/db/session.py +++ b/augur/application/db/session.py @@ -93,18 +93,20 @@ def fetchall_data_from_sql_text(self,sql_text): result = connection.execute(sql_text) return [dict(row) for row in result.mappings()] - def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: + def insert_data(self, data_input: Union[List[dict], dict], table, natural_keys: List[str], return_columns: Optional[List[str]] = None, string_fields: Optional[List[str]] = None, on_conflict_update:bool = True) -> Optional[List[dict]]: - if isinstance(data, list) is False: + if isinstance(data_input, list) is False: # if a dict is passed to data then # convert it to a list with one value - if isinstance(data, dict) is True: - data = [data] + if isinstance(data_input, 
dict) is True: + data = [data_input] else: self.logger.info("Data must be a list or a dict") return None + else: + data = list(data_input) if len(data) == 0: # self.logger.info("Gave no data to insert, returning...") @@ -166,7 +168,7 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s # if there is no data to return then it executes the insert then returns nothing if not return_columns: - + # TODO: duplicate-looking code alert while attempts < 10: try: #begin keyword is needed for sqlalchemy 2.x @@ -205,8 +207,9 @@ def insert_data(self, data: Union[List[dict], dict], table, natural_keys: List[s if deadlock_detected is True: self.logger.error("Made it through even though Deadlock was detected") - - return "success" + + # success + return None # othewise it gets the requested return columns and returns them as a list of dicts From b902e0d1ada0d9d2a95228d0a23c8e2f4c1af22f Mon Sep 17 00:00:00 2001 From: Manish Tiwari Date: Tue, 28 Oct 2025 13:00:52 +0530 Subject: [PATCH 65/67] Update docs/source/getting-started/installation.rst Co-authored-by: Adrian Edwards <17362949+MoralCode@users.noreply.github.com> Signed-off-by: Manish Tiwari --- docs/source/getting-started/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index 9fa00dc291..41bc1be4dc 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -31,7 +31,7 @@ Required: -Our REST API & data collection workers write in Python 3.6. We query the GitHub & GitLab API to collect data about issues, pull requests, contributors, and other information about a repository, so GitLab and GitHub access tokens are **required** for data collection. +Our REST API & data collection workers query the GitHub & GitLab API to collect data about issues, pull requests, contributors, and other information about a repository. 
Values for GitLab and GitHub access tokens are **required** for data collection and must be provided (an invalid token can be provided if you don't plan to use one platform) . Optional: From 15f1f27df6e7a62868511139b38003b838de3203 Mon Sep 17 00:00:00 2001 From: "Sean P. Goggins" Date: Tue, 28 Oct 2025 17:53:56 -0500 Subject: [PATCH 66/67] testing Signed-off-by: Sean P. Goggins --- docker/empty_database/Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker/empty_database/Dockerfile b/docker/empty_database/Dockerfile index d4e3122450..cc375a0f84 100644 --- a/docker/empty_database/Dockerfile +++ b/docker/empty_database/Dockerfile @@ -1,5 +1,9 @@ from postgres:16 AS builder +RUN apt-get update && \ + apt-get install -y gcc python3-dev && \ + rm -rf /var/lib/apt/lists/* + ENV DEBIAN_FRONTEND=noninteractive # Install uv (https://docs.astral.sh/uv/guides/integration/docker/#installing-uv) From 5acb58b5d3dd43e7becb2896bec4f2971bfd0f95 Mon Sep 17 00:00:00 2001 From: Adrian Edwards Date: Wed, 29 Oct 2025 12:41:13 -0400 Subject: [PATCH 67/67] remove empty db image build Signed-off-by: Adrian Edwards --- .github/workflows/build_docker.yml | 1 - docker/empty_database/Dockerfile | 70 ------------------------------ 2 files changed, 71 deletions(-) delete mode 100644 docker/empty_database/Dockerfile diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index 2f82922617..3a0e3f953a 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -316,7 +316,6 @@ jobs: - database - keyman - rabbitmq - - empty_database runs-on: ubuntu-latest steps: - name: Checkout repository diff --git a/docker/empty_database/Dockerfile b/docker/empty_database/Dockerfile deleted file mode 100644 index cc375a0f84..0000000000 --- a/docker/empty_database/Dockerfile +++ /dev/null @@ -1,70 +0,0 @@ -from postgres:16 AS builder - -RUN apt-get update && \ - apt-get install -y gcc python3-dev && \ - rm -rf /var/lib/apt/lists/* - 
-ENV DEBIAN_FRONTEND=noninteractive - -# Install uv (https://docs.astral.sh/uv/guides/integration/docker/#installing-uv) -COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ -ENV UV_COMPILE_BYTECODE=1 -# The uv package cache will be on a cache volume, so can't be linked -ENV UV_LINK_MODE=copy -# Assert that the lockfile (uv.lock) is up-to-date. Use `uv lock` to update it -# manually if this fails the container build. -ENV UV_LOCKED=1 - -WORKDIR /augur - -COPY pyproject.toml . -COPY uv.lock . -COPY .python-version . - -# Install augur's dependencies early to take advantage of build cache -RUN --mount=type=cache,target=/root/.cache/uv \ - uv sync --no-install-project --no-dev - -# Copy in the actual code -# The RUN line below ensure that permissions are set correctly. -# This is the equivalent of the following docker --chmod flags, but done in a way thats compatible with podman. -# This can be removed once https://github.com/containers/buildah/issues/6066 or relevant equivalent is fixed -# - u=rw,u+X: user can read and write all files/dirs and execute directories -# - go=r,go+X: group and others can read all files/dirs and execute directories -COPY README.md . -COPY LICENSE . -COPY alembic.ini . -COPY augur/ augur/ -COPY metadata.py . 
-COPY scripts/ scripts/ - -RUN find augur -type d -exec chmod u=rwx,go=rx {} + && find augur -type f -exec chmod u=rw,go=r {} + - -RUN find scripts -exec chmod u=rwx,go=rx {} + - -# Install the main project -RUN --mount=type=cache,target=/root/.cache/uv \ - uv sync --no-dev - -# We aren't going to activate the virtualenv (manually or via uv run), so we -# need adjust the PATH -ENV PATH="/augur/.venv/bin:${PATH}" - -ENV POSTGRES_DB="augur" -ENV POSTGRES_USER="augur" -ENV POSTGRES_PASSWORD="augur" -ENV AUGUR_DB="postgresql+psycopg2://augur:augur@localhost:5432/augur" -# ENV PGDATA="/var/lib/postgresql/data" - -RUN set -e && \ - gosu postgres initdb && \ - gosu postgres pg_ctl -D "$PGDATA" -o "-c listen_addresses='localhost'" -w start && \ - gosu postgres psql -c "CREATE USER ${POSTGRES_USER} WITH SUPERUSER PASSWORD '${POSTGRES_PASSWORD}';" && \ - gosu postgres psql -c "CREATE DATABASE ${POSTGRES_DB} OWNER ${POSTGRES_USER};" && \ - augur db create-schema && \ - gosu postgres pg_ctl -D "$PGDATA" -m fast -w stop - - -FROM postgres:16 - -COPY --from=builder /var/lib/postgresql/data /var/lib/postgresql/data