Status: Closed

Changes from all commits (36 commits)
2e923f5  line missed in bug fix to allow non root user (cdolfi, Jun 2, 2025)
44cdb74  Merge pull request #3164 from cdolfi/main_init_permission (sgoggins, Jun 3, 2025)
d000e49  remove generic exception handling and make missing pr and issue logs … (ABrain7710, Jun 3, 2025)
5bab213  remove logic that ignored 404s in github data access (ABrain7710, Jun 3, 2025)
f7e1600  simplify is_forked and is_archived logic (ABrain7710, Jun 3, 2025)
78f6b0a  add refresh for explorer_repo_languages (cdolfi, Jun 6, 2025)
68de2d4  updated (sgoggins, Jun 6, 2025)
acebc89  updating OpenSSF Scorecard insert (sgoggins, Jun 7, 2025)
c81bf94  task collection id update (sgoggins, Jun 7, 2025)
a67bce2  alembic upgrade (sgoggins, Jun 7, 2025)
e49c373  alembic finalization (sgoggins, Jun 7, 2025)
9c11f63  updating upgrade text message (sgoggins, Jun 7, 2025)
f9cb944  updated alembic (sgoggins, Jun 7, 2025)
cb6abc6  udpated alembic (sgoggins, Jun 7, 2025)
032a62a  aligning code with db (sgoggins, Jun 7, 2025)
98079de  update test (sgoggins, Jun 7, 2025)
d335f4e  another try (sgoggins, Jun 7, 2025)
c8ee156  typo (sgoggins, Jun 7, 2025)
b26fb37  OpenSSF Scorecard double fixer (sgoggins, Jun 7, 2025)
e364065  db typo fix (sgoggins, Jun 7, 2025)
6c18ed4  bug fix? (sgoggins, Jun 7, 2025)
b4dbfea  added error messages (sgoggins, Jun 7, 2025)
ce0135c  typo fix 2 (sgoggins, Jun 7, 2025)
217ed9f  Update augur/tasks/git/dependency_tasks/core.py (sgoggins, Jun 7, 2025)
26e4c66  Update augur/tasks/git/dependency_tasks/core.py (sgoggins, Jun 7, 2025)
287d027  Merge pull request #3175 from chaoss/scorecard-check-spg1 (sgoggins, Jun 8, 2025)
a098ed6  handling NULL descriptions in relesaes (sgoggins, Jun 8, 2025)
fe696f7  fixing releases NULL handling (sgoggins, Jun 8, 2025)
4a17efa  fix: key error (ABrain7710, Jun 8, 2025)
acf1811  Merge remote-tracking branch 'origin/main-fixes' into main-fixes (ABrain7710, Jun 8, 2025)
210969c  Merge pull request #3169 from chaoss/main-fixes (sgoggins, Jun 8, 2025)
fe4edc7  Update README.md (sgoggins, Jun 8, 2025)
8e3fff7  Update metadata.py (sgoggins, Jun 8, 2025)
ac94225  Merge pull request #3174 from cdolfi/main_materialized_view_update (sgoggins, Jun 8, 2025)
f7679d4  Update README.md (sgoggins, Jun 8, 2025)
7f8f049  Update metadata.py (sgoggins, Jun 8, 2025)
4 changes: 2 additions & 2 deletions README.md
@@ -1,4 +1,4 @@
# Augur NEW Release v0.89.0
# Augur NEW Release v0.89.3

Augur is primarily a data engineering tool that makes it possible for data scientists to gather open source software community data - less data carpentry for everyone else!
The primary way of looking at Augur data is through [8Knot](https://github.com/oss-aspen/8knot), a public instance of 8Knot is available [here](https://metrix.chaoss.io) - this is tied to a public instance of [Augur](https://ai.chaoss.io).
@@ -11,7 +11,7 @@ We follow the [First Timers Only](https://www.firsttimersonly.com/) philosophy o
## NEW RELEASE ALERT!
**If you want to jump right in, the updated docker, docker-compose and bare metal installation instructions are available [here](docs/new-install.md)**.

Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.89.0).
Augur is now releasing a dramatically improved new version to the ```main``` branch. It is also available [here](https://github.com/chaoss/augur/releases/tag/v0.89.3).


- The `main` branch is a stable version of our new architecture, which features:

45 changes: 45 additions & 0 deletions (new Alembic migration, revision 32)
@@ -0,0 +1,45 @@
"""Remove old OpenSSF Scorecard Unique Constraint; Add new one.

Revision ID: 32
Revises: 31
Create Date: 2025-06-06

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy import text
from augur.application.db import create_database_engine, get_database_string


# revision identifiers, used by Alembic.
revision = '32'
down_revision = '31'
branch_labels = None
depends_on = None
def upgrade():
op.drop_constraint(
'deps-scorecard-insert-unique',
'repo_deps_scorecard',
schema='augur_data',
type_='unique'
)
op.create_unique_constraint(
'deps_scorecard_new_unique',
'repo_deps_scorecard',
['repo_id', 'repo_deps_scorecard_id'],
schema='augur_data'
)

def downgrade():
op.drop_constraint(
'deps_scorecard_new_unique',
'repo_deps_scorecard',
schema='augur_data',
type_='unique'
)
op.create_unique_constraint(
'deps-scorecard-insert-unique',
'repo_deps_scorecard',
['repo_id', 'name'],
schema='augur_data'
)
45 changes: 45 additions & 0 deletions (new Alembic migration, revision 33)
@@ -0,0 +1,45 @@
"""Fix OpenSSF Scorecard Unique Constraint

Revision ID: 33
Revises: 32
Create Date: 2025-06-06

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy import text
from augur.application.db import create_database_engine, get_database_string


# revision identifiers, used by Alembic.
revision = '33'
down_revision = '32'
branch_labels = None
depends_on = None
def upgrade():
op.drop_constraint(
'deps_scorecard_new_unique',
'repo_deps_scorecard',
schema='augur_data',
type_='unique'
)
op.create_unique_constraint(
'deps_scorecard_new_unique',
'repo_deps_scorecard',
['repo_id', 'name', 'data_collection_date'],
schema='augur_data'
)

def downgrade():
op.drop_constraint(
'deps_scorecard_new_unique',
'repo_deps_scorecard',
schema='augur_data',
type_='unique'
)
op.create_unique_constraint(
'deps-scorecard-insert-unique',
'repo_deps_scorecard',
['repo_id', 'name'],
schema='augur_data'
)
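
The two migrations chain 31 -> 32 -> 33, so a single upgrade applies both. A minimal sketch, not part of this PR, of driving that chain programmatically; the `alembic.ini` path is an assumption about the project layout:

```python
# Hypothetical helper, not Augur code: applies revisions 32 and 33 in order.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")   # assumed config location; Augur's actual layout may differ
command.upgrade(cfg, "33")    # runs any pending revisions up to 33
# command.downgrade(cfg, "32")  # revision 33's downgrade restores the ['repo_id', 'name'] constraint
```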
12 changes: 12 additions & 0 deletions augur/tasks/db/refresh_materialized_views.py
@@ -88,6 +88,12 @@ def refresh_materialized_views(self):
COMMIT;
""")

mv14_refresh = s.sql.text("""

REFRESH MATERIALIZED VIEW concurrently augur_data.explorer_repo_languages with data;
COMMIT;
""")

try:
execute_sql(mv1_refresh)
except Exception as e:
@@ -166,6 +172,12 @@ def refresh_materialized_views(self):
logger.info(f"error is {e}")
pass

try:
execute_sql(mv14_refresh)
except Exception as e:
logger.info(f"error is {e}")
pass

#Now refresh facade tables
#Use this class to get all the settings and
#utility functions for facade
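The new mv14 block repeats the pattern this task already uses for the other thirteen views. A hedged sketch of that pattern in isolation; `execute_sql` stands in for Augur's own helper, and note that `REFRESH ... CONCURRENTLY` requires a unique index on the materialized view:

```python
# Illustrative only: the refresh-and-continue pattern from the diff, generalized.
import logging
import sqlalchemy as s

logger = logging.getLogger(__name__)

def refresh_view(execute_sql, view_name):
    stmt = s.sql.text(f"""
        REFRESH MATERIALIZED VIEW CONCURRENTLY {view_name} WITH DATA;
        COMMIT;
    """)
    try:
        execute_sql(stmt)
    except Exception as e:
        logger.info(f"error is {e}")  # same log-and-continue behavior as the task

# refresh_view(execute_sql, "augur_data.explorer_repo_languages")  # the view this PR adds
```

Swallowing the exception keeps one failed view from blocking the rest of the refresh run, at the cost of surfacing failures only in the logs.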
17 changes: 10 additions & 7 deletions augur/tasks/git/dependency_tasks/core.py
@@ -6,7 +6,7 @@
from augur.tasks.git.dependency_tasks.dependency_util import dependency_calculator as dep_calc
from augur.tasks.util.worker_util import parse_json_from_subprocess_call
from augur.tasks.git.util.facade_worker.facade_worker.utilitymethods import get_absolute_repo_path
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth

augur/tasks/git/dependency_tasks/core.py:9 [pylint W0611]: Unused GithubRandomKeyAuth imported from augur.tasks.github.util.github_random_key_auth (unused-import)
from augur.tasks.util.metadata_exception import MetadataException


@@ -39,15 +39,15 @@
'dep_count' : dep.count,
'dep_language' : dep.language,
'tool_source': 'deps_model',
'tool_version': '0.43.9',
'tool_version': '0.89.1',
'data_source': 'Git',
'data_collection_date': scan_date
}

to_insert.append(repo_deps)

bulk_insert_dicts(logger, to_insert,RepoDependency,["repo_id","dep_name","data_collection_date"])

logger.info(f"Inserted {len(deps)} dependencies for repo {repo_id}")

"""
@@ -76,7 +76,7 @@
path = repo_git[8:]
if path[-4:] == '.git':
path = path.replace(".git", "")
command = '--local=' + path
command = '--repo=' + path

#this is path where our scorecard project is located
path_to_scorecard = os.getenv('SCORECARD_DIR', os.environ['HOME'] + '/scorecard')
@@ -111,7 +111,7 @@
'scorecard_check_details': required_output['repo'],
'score': required_output['score'],
'tool_source': 'scorecard_model',
'tool_version': '0.43.9',
'tool_version': '0.89.1',
'data_source': 'Git',
'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
}
@@ -126,16 +126,19 @@
'scorecard_check_details': check,
'score': check['score'],
'tool_source': 'scorecard_model',
'tool_version': '0.43.9',
'tool_version': '0.89.1',
'data_source': 'Git',
'data_collection_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
}
to_insert.append(repo_deps_scorecard)

bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name","data_collection_date"])

bulk_insert_dicts(logger, to_insert, RepoDepsScorecard, ["repo_id","name"])
logger.info(f"Inserted {len(to_insert)} scorecard entries for repo {repo_id}")

logger.info(f"Done generating scorecard for repo {repo_id} from path {path}")

except Exception as e:

raise MetadataException(e, f"required_output: {required_output}")
logger.exception("Error generating scorecard", exc_info=e)
raise MetadataException(e, f"required_output: {required_output}; error {e}")
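
The `--local=` to `--repo=` switch changes how the scorecard binary is pointed at the target. A hedged sketch of an invocation under that flag; the direct `subprocess` wiring and `--format=json` are assumptions, since the PR actually routes through `parse_json_from_subprocess_call`:

```python
# Illustrative scorecard call, not Augur's exact code path.
import json
import os
import subprocess

def run_scorecard(path: str) -> dict:
    # SCORECARD_DIR fallback matches the diff
    scorecard_dir = os.getenv('SCORECARD_DIR', os.environ['HOME'] + '/scorecard')
    result = subprocess.run(
        ['./scorecard', '--repo=' + path, '--format=json'],  # '--repo=' is the flag this PR switches to
        cwd=scorecard_dir,
        capture_output=True, text=True, check=True,
    )
    return json.loads(result.stdout)
```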
4 changes: 2 additions & 2 deletions augur/tasks/github/events.py
@@ -1,5 +1,5 @@
import logging
import traceback

augur/tasks/github/events.py:2 [pylint W0611]: Unused import traceback (unused-import)
import sqlalchemy as s

augur/tasks/github/events.py:3 [pylint W0611]: Unused sqlalchemy imported as s (unused-import)
from sqlalchemy.sql import text
from abc import ABC, abstractmethod
@@ -10,11 +10,11 @@
from augur.application.db.data_parse import *
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException
from augur.tasks.github.util.github_random_key_auth import GithubRandomKeyAuth
from augur.tasks.github.util.github_task_session import GithubTaskManifest

augur/tasks/github/events.py:13 [pylint W0611]: Unused GithubTaskManifest imported from augur.tasks.github.util.github_task_session (unused-import)
from augur.tasks.github.util.util import get_owner_repo
from augur.tasks.util.worker_util import remove_duplicate_dicts
from augur.application.db.models import PullRequestEvent, IssueEvent, Contributor, Repo

augur/tasks/github/events.py:16 [pylint W0611]: Unused Repo imported from augur.application.db.models (unused-import)
augur/tasks/github/events.py:16 [pylint W0611]: Unused Contributor imported from augur.application.db.models (unused-import)
from augur.application.db.lib import get_repo_by_repo_git, bulk_insert_dicts, get_issues_by_repo_id, get_pull_requests_by_repo_id, update_issue_closed_cntrbs_by_repo_id, get_session, get_engine, get_core_data_last_collected, batch_insert_contributors

augur/tasks/github/events.py:17 [pylint W0611]: Unused get_session imported from augur.application.db.lib (unused-import)


platform_id = 1
@@ -325,7 +325,7 @@
self._tool_source, self._tool_version, self._data_source)
)
except UrlNotFoundException as e:
self._logger.warning(f"{self.repo_identifier}: Url not found for {event_url}")
self._logger.info(f"{self.repo_identifier}: Issue with number of {issue_number} returned 404 on event data. Skipping.")

if len(events) > 500:
self._insert_contributors(contributors)
@@ -386,7 +386,7 @@
self._tool_source, self._tool_version, self._data_source)
)
except UrlNotFoundException:
self._logger.warning(f"{self.repo_identifier}: Url not found for {event_url}")
self._logger.info(f"{self.repo_identifier}: PR with number of {pr_number} returned 404 on event data. Skipping.")
continue

if len(events) > 500:
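The same 404-as-skip pattern recurs here and in messages.py and the PR commit/file tasks below: a missing sub-resource is logged at info level and the record is skipped rather than treated as a collection failure. A minimal sketch of the pattern, using the `GithubDataAccess` names that appear in the diff; the wrapper function itself is illustrative:

```python
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException

def collect_event_pages(github_data_access: GithubDataAccess, logger, event_urls):
    events = []
    for url in event_urls:
        try:
            events.extend(github_data_access.paginate_resource(url))
        except UrlNotFoundException:
            # a deleted issue or PR is expected data loss, not an error
            logger.info(f"{url} returned 404 on event data. Skipping.")
            continue
    return events
```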
8 changes: 3 additions & 5 deletions augur/tasks/github/messages.py
@@ -120,12 +120,10 @@ def process_large_issue_and_pr_message_collection(repo_id, repo_git: str, logger
try:
messages = list(github_data_access.paginate_resource(comment_url))
all_data += messages
except UrlNotFoundException as e:
logger.warning(f"{task_name}: 404 on comment URL {comment_url}. Skipping. Reason: {e}")
except UrlNotFoundException:
logger.info(f"{task_name}: PR or issue comment url of {comment_url} returned 404. Skipping.")
skipped_urls += 1
except Exception as e:
logger.error(f"{task_name}: Unexpected error on comment URL {comment_url}: {e}", exc_info=True)


if len(all_data) >= 20:
process_messages(all_data, task_name, repo_id, logger, augur_db)
all_data.clear()
6 changes: 2 additions & 4 deletions augur/tasks/github/pull_requests/commits_model/core.py
@@ -71,10 +71,8 @@ def pull_request_commits_model(repo_id,logger, augur_db, key_auth, full_collecti
}
all_data.append(pr_commit_row)
except UrlNotFoundException:
logger.warning(f"{task_name}: PR #{index + 1} returned 404 on commit data. Skipping.")
except Exception as e:
logger.error(f"{task_name}: Unexpected error while processing PR #{index + 1}: {e}", exc_info=True)

logger.info(f"{task_name}: PR with url of {pr_info['pr_url']} returned 404 on commit data. Skipping.")
continue

if len(all_data) > 0:
logger.info(f"{task_name}: Inserting {len(all_data)} rows")
10 changes: 8 additions & 2 deletions augur/tasks/github/pull_requests/files_model/core.py
@@ -36,6 +36,8 @@ def pull_request_files_model(repo_id,logger, augur_db, key_auth, full_collection
repo = execute_session_query(query, 'one')
owner, name = get_owner_repo(repo.repo_git)

task_name = f"{owner}/{name} Pr files"

github_graphql_data_access = GithubGraphQlDataAccess(key_auth, logger)

pr_file_rows = []
@@ -90,10 +92,14 @@
}

pr_file_rows.append(data)
except (NotFoundException, InvalidDataException) as e:
logger.warning(e)
except NotFoundException as e:
logger.info(f"{task_name}: PR with number of {pr_info['pr_src_number']} returned 404 on file data. Skipping.")
continue
except InvalidDataException as e:
logger.warning(f"{task_name}: PR with number of {pr_info['pr_src_number']} returned null for file data. Skipping.")
continue


if len(pr_file_rows) > 0:
# Execute a bulk upsert with sqlalchemy
pr_file_natural_keys = ["pull_request_id", "repo_id", "pr_file_path"]
3 changes: 2 additions & 1 deletion augur/tasks/github/releases/core.py
@@ -26,7 +26,7 @@ def get_release_inf(repo_id, release, tag_only):
'release_id': release['id'],
'repo_id': repo_id,
'release_name': release['name'],
'release_description': release['description'],
'release_description': release['description'] if release['description'] is not None else '',
'release_author': author,
'release_created_at': release['createdAt'],
'release_published_at': release['publishedAt'],
@@ -54,6 +54,7 @@ def get_release_inf(repo_id, release, tag_only):
'release_id': release['id'],
'repo_id': repo_id,
'release_name': release['name'],
'release_description': 'tag_only',
'release_author': author,
'release_tag_name': release['name'],
'tag_only': tag_only
78 changes: 29 additions & 49 deletions augur/tasks/github/repo_info/core.py
@@ -1,7 +1,7 @@
#SPDX-License-Identifier: MIT
import json
import sqlalchemy as s
from augur.tasks.github.util.github_data_access import GithubDataAccess
from augur.tasks.github.util.github_data_access import GithubDataAccess, UrlNotFoundException
from augur.tasks.github.util.github_graphql_data_access import GithubGraphQlDataAccess
from augur.tasks.github.util.github_paginator import hit_api
from augur.tasks.github.util.util import get_owner_repo
@@ -55,60 +55,38 @@

return data
"""
def get_repo_data(logger, url, response):
if response is None:
logger.error(f"Failed to retrieve data from {url}. Response is None.")
raise Exception(f"Failed API request to {url}. Got None response.")
data = {}
def get_repo_data(logger, owner, repo):

try:
data = response.json()
url = f'https://api.github.com/repos/{owner}/{repo}'
github_data_access = GithubDataAccess(None, logger)
result = github_data_access.get_resource(url)
return result
except UrlNotFoundException as e:
message = f"GitHub repo was not found or does not exist for endpoint: {url}"
logger.error(message)
raise Exception(message) from e
except Exception as e:
logger.warning(f"Failed to parse JSON from {url}: {e}")
try:
data = json.loads(json.dumps(response.text)) # This is effectively a no-op
except Exception as inner_e:
logger.error(f"Completely failed to parse response from {url}: {inner_e}")
raise Exception(f"Unparseable response from {url}")

if 'errors' in data:
logger.error(f"GitHub API returned errors: {data['errors']}")
raise Exception(f"GitHub returned error response! {data['errors']}")

if 'id' not in data and 'message' in data:
logger.warning(f"Unexpected response structure from {url}: {data}")
if data['message'] == 'Not Found':
raise Exception(f"GitHub repo was not found or does not exist for endpoint: {url}")

return data

def is_forked(key_auth, logger, owner, repo): #/repos/:owner/:repo parent
logger.info('Querying parent info to verify if the repo is forked\n')
url = f'https://api.github.com/repos/{owner}/{repo}'

r = hit_api(key_auth, url, logger)#requests.get(url, headers=self.headers)
logger.error(e)
raise e

data = get_repo_data(logger, url, r)

if 'fork' in data:
if 'parent' in data:
return data['parent']['full_name']
def is_forked(logger, repo_data): #/repos/:owner/:repo parent
logger.info('Determining if the repo is forked\n')

if 'fork' in repo_data:
if 'parent' in repo_data:
return repo_data['parent']['full_name']
return 'Parent not available'

return False

def is_archived(key_auth, logger, owner, repo):
logger.info('Querying committers count\n')
url = f'https://api.github.com/repos/{owner}/{repo}'

r = hit_api(key_auth, url, logger)#requests.get(url, headers=self.headers)
#self.update_gh_rate_limit(r)
def is_archived(logger, repo_data):
logger.info('Determining if the repo is archived\n')

data = get_repo_data(logger, url, r)

if 'archived' in data:
if data['archived']:
if 'updated_at' in data:
return data['updated_at']
if 'archived' in repo_data:
if repo_data['archived']:
if 'updated_at' in repo_data:
return repo_data['updated_at']
return 'Date not available'
return False

@@ -267,8 +245,10 @@
execute_sql(insert_statement)

# Note that the addition of information about where a repository may be forked from, and whether a repository is archived, updates the `repo` table, not the `repo_info` table.
forked = is_forked(key_auth, logger, owner, repo)
archived = is_archived(key_auth, logger, owner, repo)
repo_data = get_repo_data(logger, owner, repo)

forked = is_forked(logger, repo_data)
archived = is_archived(logger, repo_data)
archived_date_collected = None
if archived is not False:
archived_date_collected = archived
@@ -304,7 +284,7 @@

try:
response_data = response.json()
except:

augur/tasks/github/repo_info/core.py:287 [pylint W0702]: No exception type(s) specified (bare-except)
response_data = json.loads(json.dumps(response.text))

#Insert any data that was returned
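
The repo_info refactor replaces two per-helper API calls with a single fetch whose result is shared; `is_forked` and `is_archived` become pure functions of that payload. A sketch of the resulting call flow; the function names come from the diff, but the import path and wrapper are assumptions:

```python
# Call flow after the refactor (illustrative wrapper, not PR code).
from augur.tasks.github.repo_info.core import get_repo_data, is_forked, is_archived

def fork_and_archive_status(logger, owner, repo):
    repo_data = get_repo_data(logger, owner, repo)  # one API hit; raises if the repo 404s
    forked = is_forked(logger, repo_data)           # parent full name, 'Parent not available', or False
    archived = is_archived(logger, repo_data)       # 'updated_at' date, 'Date not available', or False
    return forked, archived
```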