diff --git a/README.md b/README.md index 230e2074f8..9567757aad 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Augur NEW Release v0.89.0 +# Augur NEW Release v0.89.3 Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data - less data carpentry for everyone else! The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot), a public instance of 8Knot is available [here](https://metrix.chaoss.io) - this is tied to a public instance of [Augur](https://ai.chaoss.io). @@ -11,7 +11,7 @@ We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy o ## NEW RELEASE ALERT! **If you want to jump right in, the updated docker, docker-compose and bare metal installation instructions are available [here](docs/new-install.md)**. -Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.89.0). +Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.89.3). - The `main` branch is a stable version of our new architecture, which features: diff --git a/augur/application/schema/alembic/versions/32_update_openssf_deps.py b/augur/application/schema/alembic/versions/32_update_openssf_deps.py new file mode 100644 index 0000000000..50343067f9 --- /dev/null +++ b/augur/application/schema/alembic/versions/32_update_openssf_deps.py @@ -0,0 +1,45 @@ +"""Remove old OpenSSF Scorecard Unique Constraint; Add new one. + +Revision ID: 32 +Revises: 31 +Create Date: 2025-06-06 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy import text +from augur.application.db import create_database_engine, get_database_string + + +# revision identifiers, used by Alembic. +revision = '32' +down_revision = '31' +branch_labels = None +depends_on = None +def upgrade(): + op.drop_constraint( + 'deps-scorecard-insert-unique', + 'repo_deps_scorecard', + schema='augur_data', + type_='unique' + ) + op.create_unique_constraint( + 'deps_scorecard_new_unique', + 'repo_deps_scorecard', + ['repo_id', 'repo_deps_scorecard_id'], + schema='augur_data' + ) + +def downgrade(): + op.drop_constraint( + 'deps_scorecard_new_unique', + 'repo_deps_scorecard', + schema='augur_data', + type_='unique' + ) + op.create_unique_constraint( + 'deps-scorecard-insert-unique', + 'repo_deps_scorecard', + ['repo_id', 'name'], + schema='augur_data' + ) \ No newline at end of file diff --git a/augur/application/schema/alembic/versions/33_update_openssf_deps.py b/augur/application/schema/alembic/versions/33_update_openssf_deps.py new file mode 100644 index 0000000000..c10d041a7b --- /dev/null +++ b/augur/application/schema/alembic/versions/33_update_openssf_deps.py @@ -0,0 +1,45 @@ +"""Fix OpenSSF Scorecard Unique Constraint + +Revision ID: 33 +Revises: 32 +Create Date: 2025-06-06 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy import text +from augur.application.db import create_database_engine, get_database_string + + +# revision identifiers, used by Alembic. +revision = '33' +down_revision = '32' +branch_labels = None +depends_on = None +def upgrade(): + op.drop_constraint( + 'deps_scorecard_new_unique', + 'repo_deps_scorecard', + schema='augur_data', + type_='unique' + ) + op.create_unique_constraint( + 'deps_scorecard_new_unique', + 'repo_deps_scorecard', + ['repo_id', 'name', 'data_collection_date'], + schema='augur_data' + ) + +def downgrade(): + op.drop_constraint( + 'deps_scorecard_new_unique', + 'repo_deps_scorecard', + schema='augur_data', + type_='unique' + ) + op.create_unique_constraint( + 'deps-scorecard-insert-unique', + 'repo_deps_scorecard', + ['repo_id', 'name'], + schema='augur_data' + ) \ No newline at end of file diff --git a/augur/tasks/db/refresh_materialized_views.py b/augur/tasks/db/refresh_materialized_views.py index 8a06ac7a61..37e3ef561a 100644 --- a/augur/tasks/db/refresh_materialized_views.py +++ b/augur/tasks/db/refresh_materialized_views.py @@ -88,6 +88,12 @@ def refresh_materialized_views(self): COMMIT; """) + mv14_refresh = s.sql.text(""" + + REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_repo_languages with data; + COMMIT; + """) + try: execute_sql(mv1_refresh) except Exception as e: @@ -166,6 +172,12 @@ def refresh_materialized_views(self): logger.info(f"error is {e}") pass + try: + execute_sql(mv14_refresh) + except Exception as e: + logger.info(f"error is {e}") + pass + #Now refresh facade tables #Use this class to get all the settings and #utility functions for facade diff --git a/augur/tasks/git/dependency_tasks/core.py b/augur/tasks/git/dependency_tasks/core.py index 9262f241b4..0f9b08d7d3 100644 --- a/augur/tasks/git/dependency_tasks/core.py +++ b/augur/tasks/git/dependency_tasks/core.py @@ -39,7 +39,7 @@ def generate_deps_data(logger, repo_git): 'dep_count' : dep.count, 'dep_language' : dep.language, 'tool_source': 'deps_model', - 'tool_version': '0.43.9', + 'tool_version': '0.89.1', 'data_source': 'Git', 'data_collection_date': scan_date } @@ -47,7 +47,7 @@ def generate_deps_data(logger, repo_git): to_insert.append(repo_deps) bulk_insert_dicts(logger, to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"]) - + logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}") """ @@ -76,7 +76,7 @@ def generate_scorecard(logger, repo_git): path = repo_git[8:] if path[-4:] == '.git': path = path.replace(".git", "") - command = '--local=' + path + command = '--repo=' + path #this is path where our scorecard project is located path_to_scorecard = os.getenv('SCORECARD_DIR', os.environ['HOME'] + '/scorecard') @@ -111,7 +111,7 @@ def generate_scorecard(logger, repo_git): 'scorecard_check_details': required_output['repo'], 'score': required_output['score'], 'tool_source': 'scorecard_model', - 'tool_version': '0.43.9', + 'tool_version': '0.89.1', 'data_source': 'Git', 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') } @@ -126,16 +126,19 @@ def generate_scorecard(logger, repo_git): 'scorecard_check_details': check, 'score': check['score'], 'tool_source': 'scorecard_model', - 'tool_version': '0.43.9', + 'tool_version': '0.89.1', 'data_source': 'Git', 'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') } to_insert.append(repo_deps_scorecard) + + bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name","data_collection_date"]) - bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"]) + logger.info(f"Inserted {len(to_insert)} scorecard entries for repo {repo_id}") logger.info(f"Done generating scorecard for repo {repo_id} from path {path}") except Exception as e: - raise MetadataException(e, f"required_output: {required_output}") + logger.exception("Error generating scorecard", exc_info=e) + raise MetadataException(e, f"required_output: {required_output}; error {e}") diff --git a/augur/tasks/github/events.py b/augur/tasks/github/events.py index 08efb35d92..38a5e9e9c6 100644 --- a/augur/tasks/github/events.py +++ b/augur/tasks/github/events.py @@ -325,7 +325,7 @@ def _collect_and_process_issue_events(self, owner, repo, repo_id, key_auth, sinc self._tool_source, self._tool_version, self._data_source) ) except UrlNotFoundException as e: - self._logger.warning(f"{self.repo_identifier}: Url not found for {event_url}") + self._logger.info(f"{self.repo_identifier}: Issue with number of {issue_number} returned 404 on event data. Skipping.") if len(events) > 500: self._insert_contributors(contributors) @@ -386,7 +386,7 @@ def _collect_and_process_pr_events(self, owner, repo, repo_id, key_auth, since): self._tool_source, self._tool_version, self._data_source) ) except UrlNotFoundException: - self._logger.warning(f"{self.repo_identifier}: Url not found for {event_url}") + self._logger.info(f"{self.repo_identifier}: PR with number of {pr_number} returned 404 on event data. Skipping.") continue if len(events) > 500: diff --git a/augur/tasks/github/messages.py b/augur/tasks/github/messages.py index 40b35e942d..812af0fada 100644 --- a/augur/tasks/github/messages.py +++ b/augur/tasks/github/messages.py @@ -120,12 +120,10 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger try: messages = list(github_data_access.paginate_resource(comment_url)) all_data += messages - except UrlNotFoundException as e: - logger.warning(f"{task_name}: 404 on comment URL {comment_url}. Skipping. Reason: {e}") + except UrlNotFoundException: + logger.info(f"{task_name}: PR or issue comment url of {comment_url} returned 404. Skipping.") skipped_urls += 1 - except Exception as e: - logger.error(f"{task_name}: Unexpected error on comment URL {comment_url}: {e}", exc_info=True) - + if len(all_data) >= 20: process_messages(all_data, task_name, repo_id, logger, augur_db) all_data.clear() diff --git a/augur/tasks/github/pull_requests/commits_model/core.py b/augur/tasks/github/pull_requests/commits_model/core.py index 3867f94733..2df6d66f5d 100644 --- a/augur/tasks/github/pull_requests/commits_model/core.py +++ b/augur/tasks/github/pull_requests/commits_model/core.py @@ -71,10 +71,8 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collecti } all_data.append(pr_commit_row) except UrlNotFoundException: - logger.warning(f"{task_name}: PR #{index + 1} returned 404 on commit data. Skipping.") - except Exception as e: - logger.error(f"{task_name}: Unexpected error while processing PR #{index + 1}: {e}", exc_info=True) - + logger.info(f"{task_name}: PR with url of {pr_info['pr_url']} returned 404 on commit data. Skipping.") + continue if len(all_data) > 0: logger.info(f"{task_name}: Inserting {len(all_data)} rows") diff --git a/augur/tasks/github/pull_requests/files_model/core.py b/augur/tasks/github/pull_requests/files_model/core.py index badc86cd38..cbecb44d6d 100644 --- a/augur/tasks/github/pull_requests/files_model/core.py +++ b/augur/tasks/github/pull_requests/files_model/core.py @@ -36,6 +36,8 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection repo = execute_session_query(query, 'one') owner, name = get_owner_repo(repo.repo_git) + task_name = f"{owner}/{name} Pr files" + github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger) pr_file_rows = [] @@ -90,10 +92,14 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection } pr_file_rows.append(data) - except (NotFoundException, InvalidDataException) as e: - logger.warning(e) + except NotFoundException as e: + logger.info(f"{task_name}: PR with number of {pr_info['pr_src_number']} returned 404 on file data. Skipping.") + continue + except InvalidDataException as e: + logger.warning(f"{task_name}: PR with number of {pr_info['pr_src_number']} returned null for file data. Skipping.") continue + if len(pr_file_rows) > 0: # Execute a bulk upsert with sqlalchemy pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"] diff --git a/augur/tasks/github/releases/core.py b/augur/tasks/github/releases/core.py index 3192401ae3..239b83dce9 100644 --- a/augur/tasks/github/releases/core.py +++ b/augur/tasks/github/releases/core.py @@ -26,7 +26,7 @@ def get_release_inf(repo_id, release, tag_only): 'release_id': release['id'], 'repo_id': repo_id, 'release_name': release['name'], - 'release_description': release['description'], + 'release_description': release['description'] if release['description'] is not None else '', 'release_author': author, 'release_created_at': release['createdAt'], 'release_published_at': release['publishedAt'], @@ -54,6 +54,7 @@ def get_release_inf(repo_id, release, tag_only): 'release_id': release['id'], 'repo_id': repo_id, 'release_name': release['name'], + 'release_description': 'tag_only', 'release_author': author, 'release_tag_name': release['name'], 'tag_only': tag_only diff --git a/augur/tasks/github/repo_info/core.py b/augur/tasks/github/repo_info/core.py index e782fde5a5..57cd970bc0 100644 --- a/augur/tasks/github/repo_info/core.py +++ b/augur/tasks/github/repo_info/core.py @@ -1,7 +1,7 @@ #SPDX-License-Identifier: MIT import json import sqlalchemy as s -from augur.tasks.github.util.github_data_access import GithubDataAccess +from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess from augur.tasks.github.util.github_paginator import hit_api from augur.tasks.github.util.util import get_owner_repo @@ -55,60 +55,38 @@ def get_repo_data(logger, url, response): return data """ -def get_repo_data(logger, url, response): - if response is None: - logger.error(f"Failed to retrieve data from {url}. Response is None.") - raise Exception(f"Failed API request to {url}. Got None response.") - data = {} +def get_repo_data(logger, owner, repo): + try: - data = response.json() + url = f'https://api.github.com/repos/{owner}/{repo}' + github_data_access = GithubDataAccess(None, logger) + result = github_data_access.get_resource(url) + return result + except UrlNotFoundException as e: + message = f"GitHub repo was not found or does not exist for endpoint: {url}" + logger.error(message) + raise Exception(message) from e except Exception as e: - logger.warning(f"Failed to parse JSON from {url}: {e}") - try: - data = json.loads(json.dumps(response.text)) # This is effectively a no-op - except Exception as inner_e: - logger.error(f"Completely failed to parse response from {url}: {inner_e}") - raise Exception(f"Unparseable response from {url}") - - if 'errors' in data: - logger.error(f"GitHub API returned errors: {data['errors']}") - raise Exception(f"GitHub returned error response! {data['errors']}") - - if 'id' not in data and 'message' in data: - logger.warning(f"Unexpected response structure from {url}: {data}") - if data['message'] == 'Not Found': - raise Exception(f"GitHub repo was not found or does not exist for endpoint: {url}") - - return data - -def is_forked(key_auth, logger, owner, repo): #/repos/:owner/:repo parent - logger.info('Querying parent info to verify if the repo is forked\n') - url = f'https://api.github.com/repos/{owner}/{repo}' - - r = hit_api(key_auth, url, logger)#requests.get(url, headers=self.headers) + logger.error(e) + raise e - data = get_repo_data(logger, url, r) - - if 'fork' in data: - if 'parent' in data: - return data['parent']['full_name'] +def is_forked(logger, repo_data): #/repos/:owner/:repo parent + logger.info('Determining if the repo is forked\n') + + if 'fork' in repo_data: + if 'parent' in repo_data: + return repo_data['parent']['full_name'] return 'Parent not available' return False -def is_archived(key_auth, logger, owner, repo): - logger.info('Querying committers count\n') - url = f'https://api.github.com/repos/{owner}/{repo}' - - r = hit_api(key_auth, url, logger)#requests.get(url, headers=self.headers) - #self.update_gh_rate_limit(r) +def is_archived(logger, repo_data): + logger.info('Determining if the repo is archived\n') - data = get_repo_data(logger, url, r) - - if 'archived' in data: - if data['archived']: - if 'updated_at' in data: - return data['updated_at'] + if 'archived' in repo_data: + if repo_data['archived']: + if 'updated_at' in repo_data: + return repo_data['updated_at'] return 'Date not available' return False @@ -267,8 +245,10 @@ def repo_info_model(key_auth, repo_orm_obj, logger): execute_sql(insert_statement) # Note that the addition of information about where a repository may be forked from, and whether a repository is archived, updates the `repo` table, not the `repo_info` table. - forked = is_forked(key_auth, logger, owner, repo) - archived = is_archived(key_auth, logger, owner, repo) + repo_data = get_repo_data(logger, owner, repo) + + forked = is_forked(logger, repo_data) + archived = is_archived(logger, repo_data) archived_date_collected = None if archived is not False: archived_date_collected = archived diff --git a/augur/tasks/github/util/github_data_access.py b/augur/tasks/github/util/github_data_access.py index 890165a8db..3e8a8accc6 100644 --- a/augur/tasks/github/util/github_data_access.py +++ b/augur/tasks/github/util/github_data_access.py @@ -119,18 +119,7 @@ def make_request(self, url, method="GET", timeout=100): # This is not an issue that is really an Exception. It is more of a nominal signal. if response.status_code == 404: - parsed = urlparse(url) - path = parsed.path.lower() - query = parsed.query.lower() - - if any(k in path for k in ["commits", "files", "comments"]) or \ - any(k in query for k in ["commits", "files", "comments"]): - self.logger.warning( - f"Github response with 404 for PR files, PR commits or messages. " - f"This is a data anomaly in the platform API, not an error. URL: {url}. Response: {response.text}" - ) - else: - raise UrlNotFoundException(f"Could not find {url}") + raise UrlNotFoundException(f"Could not find {url}") if response.status_code == 401: raise NotAuthorizedException(f"Could not authorize with the github api") diff --git a/docker/backend/Dockerfile b/docker/backend/Dockerfile index 38e7ca0b1c..34ef8ed788 100644 --- a/docker/backend/Dockerfile +++ b/docker/backend/Dockerfile @@ -113,5 +113,6 @@ RUN ln -s /cache /augur/augur/static/cache COPY --chmod=u=rwx,go=rx ./docker/backend/entrypoint.sh / COPY --chmod=u=rwx,go=rx ./docker/backend/init.sh / +RUN chmod +x /entrypoint.sh /init.sh ENTRYPOINT ["/bin/bash", "/entrypoint.sh"] CMD ["/init.sh"] diff --git a/metadata.py b/metadata.py index acd1292d10..601b018631 100644 --- a/metadata.py +++ b/metadata.py @@ -5,8 +5,8 @@ __short_description__ = "Python 3 package for free/libre and open-source software community metrics, models & data collection" -__version__ = "0.89.0" -__release__ = "v0.89.0 (Midnight Sun)" +__version__ = "0.89.3" +__release__ = "v0.89.3 (Midnight Sun)" __license__ = "MIT" __copyright__ = "University of Missouri, University of Nebraska-Omaha, CHAOSS, Sean Goggins, Brian Warner & Augurlabs 2025"