137 changes: 137 additions & 0 deletions minecode_pipelines/miners/cargo.py
@@ -0,0 +1,137 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
from datetime import datetime

from minecode_pipelines.pipes import get_checkpoint_from_file
from minecode_pipelines.pipes import get_commit_at_distance_ahead
from minecode_pipelines.pipes import update_checkpoints_in_github
from minecode_pipelines.pipes import get_changed_files
from minecode_pipelines.pipes.cargo import store_cargo_packages
from scanpipe.pipes.federatedcode import commit_changes
from scanpipe.pipes.federatedcode import push_changes
from minecode_pipelines import VERSION

import json
from pathlib import Path


PACKAGE_BATCH_SIZE = 500
COMMIT_BATCH_SIZE = 10

CARGO_CHECKPOINT_PATH = "cargo/checkpoints.json"


def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logger):
"""
Process Cargo index files commit by commit.
Push changes to fed_repo after:
- every `commit_batch` commits, OR when reaching HEAD.
"""

    base_path = Path(cargo_index_repo.working_tree_dir)

    while True:
        cargo_checkpoints = get_checkpoint_from_file(
            cloned_repo=config_repo, path=CARGO_CHECKPOINT_PATH
        )

        checkpoints_last_commit = cargo_checkpoints.get("last_commit")

        try:
            next_commit = get_commit_at_distance_ahead(
                cargo_index_repo,
                checkpoints_last_commit,
                num_commits_ahead=COMMIT_BATCH_SIZE,
                branch_name="master",
            )
        except ValueError as e:
            logger(str(e))
            break

        if next_commit == checkpoints_last_commit:
            logger("No new commits to mine")
            break

        changed_files = get_changed_files(
            cargo_index_repo, commit_x=checkpoints_last_commit, commit_y=next_commit
        )
        logger(f"Found {len(changed_files)} changed files in Cargo index.")

        file_counter = 0
        purl_files = []
        purls = []
        for idx, rel_path in enumerate(changed_files):
            file_path = base_path / rel_path
            logger(f"Found {file_path}.")

            if not file_path.is_file() or file_path.name in {
                "config.json",
                "README.md",
                "update-dl-url.yml",
            }:
                continue

            packages = []
            with open(file_path, encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        try:
                            packages.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            logger(f"Skipping invalid JSON in {file_path}: {e}")

            file_counter += 1

            # Commit and push after each full batch or when processing the last file
            commit_and_push = (file_counter % PACKAGE_BATCH_SIZE == 0) or (
                idx == len(changed_files) - 1
            )

            result_store = store_cargo_packages(packages, cloned_data_repo)
            if result_store:
                purl_file, base_purl = result_store
                logger(f"writing packageURLs for package: {base_purl} at: {purl_file}")

                purl_files.append(purl_file)
                purls.append(str(base_purl))

            if not commit_and_push:
                continue

            commit_changes(
                repo=cloned_data_repo,
                files_to_commit=purl_files,
                purls=purls,
                mine_type="packageURL",
                tool_name="pkg:pypi/minecode-pipelines",
                tool_version=VERSION,
            )

            push_changes(repo=cloned_data_repo)
            purl_files = []
            purls = []

        if logger:
            logger(
                f"Updating checkpoint at: {CARGO_CHECKPOINT_PATH} with last commit: {next_commit}"
            )

        if next_commit != checkpoints_last_commit:
            settings_data = {
                "date": str(datetime.now()),
                "last_commit": next_commit,
            }

            update_checkpoints_in_github(
                checkpoint=settings_data,
                cloned_repo=config_repo,
                path=CARGO_CHECKPOINT_PATH,
            )

        logger(f"Pushed batch for commit range {checkpoints_last_commit}:{next_commit}.")
84 changes: 84 additions & 0 deletions minecode_pipelines/pipelines/mine_cargo.py
@@ -0,0 +1,84 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import os
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode
from minecode_pipelines.miners import cargo
from minecode_pipelines import pipes
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO

MINECODE_DATA_CARGO_REPO = os.environ.get(
    "MINECODE_DATA_CARGO_REPO", "https://github.com/aboutcode-data/minecode-data-cargo-test"
)
MINECODE_CARGO_INDEX_REPO = "https://github.com/rust-lang/crates.io-index"


class MineCargo(Pipeline):
"""Pipeline to mine Cargo (crates.io) packages and publish them to FederatedCode."""

@classmethod
def steps(cls):
return (
cls.check_federatedcode_eligibility,
cls.clone_cargo_repos,
cls.mine_and_publish_cargo_packageurls,
cls.delete_cloned_repos,
)

    def check_federatedcode_eligibility(self):
        """
        Check that the project is configured and eligible for pushing
        the pipeline results to FederatedCode.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def clone_cargo_repos(self):
        """
        Clone the Cargo-related repositories (index, data, and pipelines config)
        and store their Repo objects in the corresponding instance variables.
        """
        self.cargo_index_repo = federatedcode.clone_repository(MINECODE_CARGO_INDEX_REPO)
        self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CARGO_REPO)
        self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)

        if self.log:
            self.log(
                f"{MINECODE_CARGO_INDEX_REPO} repo cloned at: {self.cargo_index_repo.working_dir}"
            )
            self.log(
                f"{MINECODE_DATA_CARGO_REPO} repo cloned at: {self.cloned_data_repo.working_dir}"
            )
            self.log(
                f"{MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {self.cloned_config_repo.working_dir}"
            )

    def mine_and_publish_cargo_packageurls(self):
        """Mine packageURLs from the Cargo index and publish them to FederatedCode."""
        cargo.process_cargo_packages(
            self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo, self.log
        )

    def delete_cloned_repos(self):
        """Delete the local clones of the index, data, and config repositories."""
        pipes.delete_cloned_repos(
            repos=[self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo],
            logger=self.log,
        )
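
Since MINECODE_DATA_CARGO_REPO is read from the environment at import time, it has to be set before the pipeline module is imported. A minimal sketch, assuming a fork of the data repo (the URL is illustrative):

import os

# Must happen before importing the pipeline module, since the module-level
# os.environ.get() call resolves the value at import time.
os.environ["MINECODE_DATA_CARGO_REPO"] = "https://github.com/example-org/minecode-data-cargo"

from minecode_pipelines.pipelines.mine_cargo import MINECODE_DATA_CARGO_REPO

print(MINECODE_DATA_CARGO_REPO)  # the overridden URL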
67 changes: 66 additions & 1 deletion minecode_pipelines/pipes/__init__.py
@@ -15,6 +15,8 @@
import saneyaml

from aboutcode.hashid import PURLS_FILENAME
from git import Repo

from scanpipe.pipes.federatedcode import delete_local_clone
from scanpipe.pipes.federatedcode import commit_and_push_changes

@@ -34,7 +36,7 @@ def fetch_checkpoint_from_github(config_repo, checkpoint_path):
    )
    response = requests.get(checkpoints_file)
    if not response.ok:
-        return
+        return {}

    checkpoint_data = json.loads(response.text)
    return checkpoint_data
@@ -112,3 +114,66 @@ def delete_cloned_repos(repos, logger=None):
        if logger:
            logger(f"Deleting local clone at: {repo.working_dir}")
        delete_local_clone(repo)


def get_changed_files(repo: Repo, commit_x: str = None, commit_y: str = None):
    """
    Return a list of files changed between two commits using GitPython.
    Includes added, modified, deleted, and renamed files.
    - commit_x: base commit (or the empty tree hash for the first commit)
    - commit_y: target commit (defaults to HEAD if not provided)
    """
    EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

    if commit_y is None:
        commit_y = repo.head.commit.hexsha
    commit_y_obj = repo.commit(commit_y)

    if commit_x is None or commit_x == EMPTY_TREE_HASH:
        # First commit case: diff against the empty tree
        diff_index = commit_y_obj.diff(EMPTY_TREE_HASH, R=True)
    else:
        commit_x_obj = repo.commit(commit_x)
        diff_index = commit_x_obj.diff(commit_y_obj, R=True)

    changed_files = {item.a_path or item.b_path for item in diff_index}
    return list(changed_files)


def get_last_commit(repo, ecosystem):
    """
    Retrieve the last mined commit for a given ecosystem.

    This function reads a JSON checkpoint file from the repository, which stores
    mining progress. Each checkpoint contains the "last_commit" from the package
    index (e.g., PyPI) that was previously mined. For example:
    https://github.com/AyanSinhaMahapatra/minecode-test/blob/main/minecode_checkpoints/pypi.json
    https://github.com/ziadhany/cargo-test/blob/main/minecode_checkpoints/cargo.json
    """

    last_commit_file_path = (
        Path(repo.working_tree_dir) / "minecode_checkpoints" / f"{ecosystem}.json"
    )
    try:
        with open(last_commit_file_path) as f:
            settings_data = json.load(f)
    except FileNotFoundError:
        return
    return settings_data.get("last_commit")


def get_commit_at_distance_ahead(
    repo: Repo,
    current_commit: str,
    num_commits_ahead: int = 10,
    branch_name: str = "master",
) -> str:
    """
    Return the commit hash that is `num_commits_ahead` commits ahead of `current_commit`
    on the given branch.
    """
    if not current_commit:
        current_commit = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
    revs = repo.git.rev_list(f"^{current_commit}", branch_name).splitlines()
    if len(revs) < num_commits_ahead:
        raise ValueError(f"Not enough commits ahead; only {len(revs)} available.")
    return revs[-num_commits_ahead]
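
A minimal sketch of how these two helpers compose outside the pipeline, assuming a local clone of the crates.io index (the clone path and the simulated checkpoint are illustrative):

from git import Repo

from minecode_pipelines.pipes import get_changed_files
from minecode_pipelines.pipes import get_commit_at_distance_ahead

index_repo = Repo("/tmp/crates.io-index")  # illustrative path to a local clone

# For demonstration, pretend the checkpoint is 15 commits behind master.
last_mined = index_repo.commit("master~15").hexsha

try:
    # Walk at most 10 commits past the checkpoint on master.
    target = get_commit_at_distance_ahead(index_repo, last_mined, num_commits_ahead=10)
except ValueError as e:
    target = None  # fewer than 10 commits ahead of the checkpoint
    print(e)

if target:
    for path in get_changed_files(index_repo, commit_x=last_mined, commit_y=target):
        print(path)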
32 changes: 32 additions & 0 deletions minecode_pipelines/pipes/cargo.py
@@ -0,0 +1,32 @@
from pathlib import Path

from aboutcode import hashid
from packageurl import PackageURL
from aboutcode.hashid import get_core_purl

from minecode_pipelines.pipes import write_data_to_yaml_file


def store_cargo_packages(packages, repo):
    """Collect Cargo package versions into purls and write them to the repo."""

    if not packages:
        return

    first_pkg = packages[0]
    name = first_pkg.get("name")
    version = first_pkg.get("vers")
    purl = PackageURL(type="cargo", name=name, version=version)
    base_purl = get_core_purl(purl)

    updated_purls = []
    for package in packages:
        name = package.get("name")
        version = package.get("vers")
        purl = PackageURL(type="cargo", name=name, version=version).to_string()
        updated_purls.append(purl)

    ppath = hashid.get_package_purls_yml_file_path(base_purl)
    purl_file_full_path = Path(repo.working_dir) / ppath
    write_data_to_yaml_file(path=purl_file_full_path, data=updated_purls)
    return purl_file_full_path, base_purl
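
For illustration, the file written here is a YAML list of every version purl for one crate, at a path derived from the base purl by aboutcode.hashid. A minimal usage sketch, assuming a local clone of the data repo (the clone path and package data are illustrative):

from git import Repo

from minecode_pipelines.pipes.cargo import store_cargo_packages

data_repo = Repo("/tmp/minecode-data-cargo")  # illustrative clone of the data repo

packages = [
    {"name": "serde", "vers": "1.0.199"},
    {"name": "serde", "vers": "1.0.200"},
]

result = store_cargo_packages(packages, data_repo)
if result:
    purl_file, base_purl = result
    print(purl_file)              # purls.yml path derived from pkg:cargo/serde
    print(purl_file.read_text())  # expected: pkg:cargo/serde@1.0.199 and @1.0.200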
1 change: 1 addition & 0 deletions minecode_pipelines/pipes/pypi.py
@@ -45,6 +45,7 @@
from aboutcode.hashid import get_package_base_dir
from packageurl import PackageURL
from scanpipe.pipes.federatedcode import clone_repository

from scanpipe.pipes.federatedcode import commit_changes
from scanpipe.pipes.federatedcode import push_changes
