# Seconds to wait for Packagist before giving up; shared by every request in
# this module so that a stalled connection cannot hang the pipeline.
_PACKAGIST_TIMEOUT = 10


def get_composer_packages():
    """
    Fetch the names of all Composer packages from Packagist and save the
    response to a temporary JSON file.

    Return the path of that file, or None when Packagist does not answer with
    a successful status.

    Response example:
        {
            "packageNames": ["0.0.0/composer-include-files", "0.0.0/laravel-env-shim"]
        }
    """
    response = requests.get(
        "https://packagist.org/packages/list.json",
        timeout=_PACKAGIST_TIMEOUT,
    )
    if not response.ok:
        return None

    packages = response.json()
    temp_file = get_temp_file("ComposerPackages", "json")
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(packages, f, indent=4)

    return temp_file


def get_composer_purl(vendor, package):
    """
    Return the list of Package URL strings, one per released version, for the
    Composer package ``vendor/package`` as listed by Packagist.

    Return an empty list when the request fails, when the body is not valid
    JSON, or when the payload does not have the expected shape.

    The code reads the response as a mapping of this shape:
        {
            "packages": {
                "monolog/monolog": [
                    {"name": "monolog/monolog", "version": "3.9.0"},
                    {"name": "monolog/monolog", "version": "3.8.1"}
                ]
            }
        }

    get_composer_purl("monolog", "monolog")
    -> ["pkg:composer/monolog/monolog@3.9.0", "pkg:composer/monolog/monolog@3.8.1", ...]
    """
    purls = []
    url = f"https://repo.packagist.org/p2/{vendor}/{package}.json"

    try:
        response = requests.get(url, timeout=_PACKAGIST_TIMEOUT)
        response.raise_for_status()
        # Decoding is part of the guarded section: a non-JSON body must not
        # abort the whole mining run.
        data = response.json()
    except (requests.RequestException, ValueError):
        return purls

    packages = data.get("packages") if isinstance(data, dict) else None
    if not isinstance(packages, dict):
        return purls

    releases = packages.get(f"{vendor}/{package}") or []

    for release in releases:
        # Entries without a usable "version" cannot yield a versioned purl.
        version = release.get("version") if isinstance(release, dict) else None
        if version:
            purl = PackageURL(
                type="composer",
                namespace=vendor,
                name=package,
                version=version,
            )
            purls.append(purl.to_string())

    return purls


def load_composer_packages(packages_file):
    """
    Load and return a list of (vendor, package) tuples from the JSON file at
    ``packages_file``.

    Names without a "/" separator are skipped. Return an empty list when
    ``packages_file`` is empty or None, which is what get_composer_packages()
    returns when the download failed.
    """
    if not packages_file:
        return []

    with open(packages_file, encoding="utf-8") as f:
        packages_data = json.load(f)

    package_names = packages_data.get("packageNames") or []

    # Split on the first "/" only: everything after it is the package name.
    return [tuple(name.split("/", 1)) for name in package_names if "/" in name]
class MineComposer(Pipeline):
    """
    Mine all packageURLs from a composer index and publish them to a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.check_federatedcode_eligibility,
            cls.clone_composer_repo,
            cls.mine_and_publish_composer_purls,
            # Clean up the local clones once mining and publishing are done.
            cls.delete_cloned_repos,
        )

    def check_federatedcode_eligibility(self):
        """
        Check if the project fulfills the following criteria for
        pushing the project result to FederatedCode.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def clone_composer_repo(self):
        """
        Clone the Composer data repository and the pipelines config repository,
        and keep both clones on the pipeline instance for the later steps.
        """
        self.cloned_data_repo = federatedcode.clone_repository(MINECODE_COMPOSER_GIT_URL)
        self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)

    def mine_and_publish_composer_purls(self):
        """
        Mine Composer package names from Composer indexes and generate
        package URLs (pURLs) for all mined Composer packages.
        """
        composer_packages = mine_composer_packages()
        # Resolves to the module-level helper imported from
        # minecode_pipelines.pipes.composer, not to this method.
        mine_and_publish_composer_purls(
            packages=composer_packages,
            cloned_data_repo=self.cloned_data_repo,
            cloned_config_repo=self.cloned_config_repo,
            logger=self.log,
        )

    def delete_cloned_repos(self):
        """Delete the local clones created by clone_composer_repo."""
        # NOTE(review): `pipes` is imported from `matchcode_pipeline` in this
        # file while every other helper comes from `minecode_pipelines` --
        # confirm that module really exposes `delete_cloned_repos`.
        pipes.delete_cloned_repos(
            repos=[self.cloned_data_repo, self.cloned_config_repo],
            logger=self.log,
        )
def mine_composer_packages():
    """Download the Packagist package index and return its (vendor, package) tuples."""
    return load_composer_packages(get_composer_packages())


def mine_and_publish_composer_purls(packages, cloned_data_repo, cloned_config_repo, logger):
    """
    Fetch the PURLs of each package in ``packages``, write them as YAML files
    in the data repository clone, then commit and push them batch by batch.

    The walk starts at the "start_index" read from the checkpoint file of the
    config repository clone (0 when absent). After every batch the checkpoint
    is rewritten with the current date and the next start index.
    """
    checkpoint = get_checkpoint_from_file(
        cloned_repo=cloned_config_repo, path=COMPOSER_CHECKPOINT_PATH
    )
    start_index = checkpoint.get("start_index", 0)

    batches = grouper(
        n=PACKAGE_BATCH_SIZE,
        iterable=cycle_from_index(packages, start_index),
    )

    # Batches are numbered from 1 so the number of packages consumed so far is
    # simply batch_number * PACKAGE_BATCH_SIZE.
    for batch_number, batch in enumerate(batches, start=1):
        written_files = []
        published_purls = []

        for entry in batch:
            # Falsy entries carry no (vendor, package) pair to unpack.
            if not entry:
                continue

            vendor, package = entry
            logger(f"getting packageURLs for package: {vendor}/{package}")

            package_purls = get_composer_purl(vendor=vendor, package=package)
            if package_purls:
                # The first PURL identifies the package for paths and logs.
                first_purl = package_purls[0]
                base_dir = get_package_base_dir(purl=first_purl)

                logger(f"writing packageURLs for package: {first_purl} at: {base_dir}")
                logger(f"packageURLs: {' '.join(package_purls)}")

                repo_root = Path(cloned_data_repo.working_dir)
                yaml_path = repo_root / hashid.get_package_purls_yml_file_path(first_purl)

                write_data_to_yaml_file(path=yaml_path, data=package_purls)

                written_files.append(yaml_path)
                published_purls.append(str(first_purl))

        # Only commit and push when this batch produced at least one file.
        if written_files:
            commit_changes(
                repo=cloned_data_repo,
                files_to_commit=written_files,
                purls=published_purls,
                mine_type="packageURL",
                tool_name="pkg:composer/minecode-pipelines",
                tool_version=VERSION,
            )
            push_changes(repo=cloned_data_repo)

        update_checkpoints_in_github(
            checkpoint={
                "date": str(datetime.now()),
                "start_index": start_index + batch_number * PACKAGE_BATCH_SIZE,
            },
            cloned_repo=cloned_config_repo,
            path=COMPOSER_CHECKPOINT_PATH,
        )
class ComposerPipelineTests(SimpleTestCase):
    @patch("requests.get")
    def test_generate_purls_from_composer(self, mock_get):
        """
        Run the miners end to end against mocked Packagist responses built from
        the JSON fixtures in test_data/composer, and compare the generated
        PURLs with the expected output fixture.
        """

        def read_fixture(filename):
            with open(DATA_DIR / filename, encoding="utf-8") as fixture:
                return json.load(fixture)

        def fake_response(payload):
            response = MagicMock()
            response.ok = True
            response.json.return_value = payload
            return response

        fake_packages_list = read_fixture("packages_list.json")
        # NOTE(review): this change adds package_details.json with the same
        # blob id as expected_output.json (a flat list of purl strings), while
        # get_composer_purl() reads data.get("packages") -- confirm the fixture
        # holds an actual Packagist package payload.
        fake_package_details = read_fixture("package_details.json")
        expected_output = read_fixture("expected_output.json")

        # First call serves the package index, second call the package details.
        mock_get.side_effect = [
            fake_response(fake_packages_list),
            fake_response(fake_package_details),
        ]

        packages = load_composer_packages(get_composer_packages())

        all_purls = [
            purl
            for vendor, package in packages
            for purl in get_composer_purl(vendor, package)
        ]

        assert len(all_purls) == 85
        assert all_purls == expected_output
"pkg:composer/monolog/monolog@3.8.0", + "pkg:composer/monolog/monolog@3.7.0", + "pkg:composer/monolog/monolog@3.6.0", + "pkg:composer/monolog/monolog@3.5.0", + "pkg:composer/monolog/monolog@3.4.0", + "pkg:composer/monolog/monolog@3.3.1", + "pkg:composer/monolog/monolog@3.3.0", + "pkg:composer/monolog/monolog@3.2.0", + "pkg:composer/monolog/monolog@3.1.0", + "pkg:composer/monolog/monolog@3.0.0", + "pkg:composer/monolog/monolog@3.0.0-RC1", + "pkg:composer/monolog/monolog@2.10.0", + "pkg:composer/monolog/monolog@2.9.3", + "pkg:composer/monolog/monolog@2.9.2", + "pkg:composer/monolog/monolog@2.9.1", + "pkg:composer/monolog/monolog@2.9.0", + "pkg:composer/monolog/monolog@2.8.0", + "pkg:composer/monolog/monolog@2.7.0", + "pkg:composer/monolog/monolog@2.6.0", + "pkg:composer/monolog/monolog@2.5.0", + "pkg:composer/monolog/monolog@2.4.0", + "pkg:composer/monolog/monolog@2.3.5", + "pkg:composer/monolog/monolog@2.3.4", + "pkg:composer/monolog/monolog@2.3.3", + "pkg:composer/monolog/monolog@2.3.2", + "pkg:composer/monolog/monolog@2.3.1", + "pkg:composer/monolog/monolog@2.3.0", + "pkg:composer/monolog/monolog@2.2.0", + "pkg:composer/monolog/monolog@2.1.1", + "pkg:composer/monolog/monolog@2.1.0", + "pkg:composer/monolog/monolog@2.0.2", + "pkg:composer/monolog/monolog@2.0.1", + "pkg:composer/monolog/monolog@2.0.0", + "pkg:composer/monolog/monolog@2.0.0-beta2", + "pkg:composer/monolog/monolog@2.0.0-beta1", + "pkg:composer/monolog/monolog@1.27.1", + "pkg:composer/monolog/monolog@1.27.0", + "pkg:composer/monolog/monolog@1.26.1", + "pkg:composer/monolog/monolog@1.26.0", + "pkg:composer/monolog/monolog@1.25.5", + "pkg:composer/monolog/monolog@1.25.4", + "pkg:composer/monolog/monolog@1.25.3", + "pkg:composer/monolog/monolog@1.25.2", + "pkg:composer/monolog/monolog@1.25.1", + "pkg:composer/monolog/monolog@1.25.0", + "pkg:composer/monolog/monolog@1.24.0", + "pkg:composer/monolog/monolog@1.23.0", + "pkg:composer/monolog/monolog@1.22.1", + "pkg:composer/monolog/monolog@1.22.0", + 
"pkg:composer/monolog/monolog@1.21.0", + "pkg:composer/monolog/monolog@1.20.0", + "pkg:composer/monolog/monolog@1.19.0", + "pkg:composer/monolog/monolog@1.18.2", + "pkg:composer/monolog/monolog@1.18.1", + "pkg:composer/monolog/monolog@1.18.0", + "pkg:composer/monolog/monolog@1.17.2", + "pkg:composer/monolog/monolog@1.17.1", + "pkg:composer/monolog/monolog@1.17.0", + "pkg:composer/monolog/monolog@1.16.0", + "pkg:composer/monolog/monolog@1.15.0", + "pkg:composer/monolog/monolog@1.14.0", + "pkg:composer/monolog/monolog@1.13.1", + "pkg:composer/monolog/monolog@1.13.0", + "pkg:composer/monolog/monolog@1.12.0", + "pkg:composer/monolog/monolog@1.11.0", + "pkg:composer/monolog/monolog@1.10.0", + "pkg:composer/monolog/monolog@1.9.1", + "pkg:composer/monolog/monolog@1.9.0", + "pkg:composer/monolog/monolog@1.8.0", + "pkg:composer/monolog/monolog@1.7.0", + "pkg:composer/monolog/monolog@1.6.0", + "pkg:composer/monolog/monolog@1.5.0", + "pkg:composer/monolog/monolog@1.4.1", + "pkg:composer/monolog/monolog@1.4.0", + "pkg:composer/monolog/monolog@1.3.1", + "pkg:composer/monolog/monolog@1.3.0", + "pkg:composer/monolog/monolog@1.2.1", + "pkg:composer/monolog/monolog@1.2.0", + "pkg:composer/monolog/monolog@1.1.0", + "pkg:composer/monolog/monolog@1.0.2", + "pkg:composer/monolog/monolog@1.0.1", + "pkg:composer/monolog/monolog@1.0.0", + "pkg:composer/monolog/monolog@1.0.0-RC1" +] \ No newline at end of file diff --git a/minecode_pipelines/tests/test_data/composer/package_details.json b/minecode_pipelines/tests/test_data/composer/package_details.json new file mode 100644 index 00000000..facc6851 --- /dev/null +++ b/minecode_pipelines/tests/test_data/composer/package_details.json @@ -0,0 +1,87 @@ +[ + "pkg:composer/monolog/monolog@3.9.0", + "pkg:composer/monolog/monolog@3.8.1", + "pkg:composer/monolog/monolog@3.8.0", + "pkg:composer/monolog/monolog@3.7.0", + "pkg:composer/monolog/monolog@3.6.0", + "pkg:composer/monolog/monolog@3.5.0", + "pkg:composer/monolog/monolog@3.4.0", + 
"pkg:composer/monolog/monolog@3.3.1", + "pkg:composer/monolog/monolog@3.3.0", + "pkg:composer/monolog/monolog@3.2.0", + "pkg:composer/monolog/monolog@3.1.0", + "pkg:composer/monolog/monolog@3.0.0", + "pkg:composer/monolog/monolog@3.0.0-RC1", + "pkg:composer/monolog/monolog@2.10.0", + "pkg:composer/monolog/monolog@2.9.3", + "pkg:composer/monolog/monolog@2.9.2", + "pkg:composer/monolog/monolog@2.9.1", + "pkg:composer/monolog/monolog@2.9.0", + "pkg:composer/monolog/monolog@2.8.0", + "pkg:composer/monolog/monolog@2.7.0", + "pkg:composer/monolog/monolog@2.6.0", + "pkg:composer/monolog/monolog@2.5.0", + "pkg:composer/monolog/monolog@2.4.0", + "pkg:composer/monolog/monolog@2.3.5", + "pkg:composer/monolog/monolog@2.3.4", + "pkg:composer/monolog/monolog@2.3.3", + "pkg:composer/monolog/monolog@2.3.2", + "pkg:composer/monolog/monolog@2.3.1", + "pkg:composer/monolog/monolog@2.3.0", + "pkg:composer/monolog/monolog@2.2.0", + "pkg:composer/monolog/monolog@2.1.1", + "pkg:composer/monolog/monolog@2.1.0", + "pkg:composer/monolog/monolog@2.0.2", + "pkg:composer/monolog/monolog@2.0.1", + "pkg:composer/monolog/monolog@2.0.0", + "pkg:composer/monolog/monolog@2.0.0-beta2", + "pkg:composer/monolog/monolog@2.0.0-beta1", + "pkg:composer/monolog/monolog@1.27.1", + "pkg:composer/monolog/monolog@1.27.0", + "pkg:composer/monolog/monolog@1.26.1", + "pkg:composer/monolog/monolog@1.26.0", + "pkg:composer/monolog/monolog@1.25.5", + "pkg:composer/monolog/monolog@1.25.4", + "pkg:composer/monolog/monolog@1.25.3", + "pkg:composer/monolog/monolog@1.25.2", + "pkg:composer/monolog/monolog@1.25.1", + "pkg:composer/monolog/monolog@1.25.0", + "pkg:composer/monolog/monolog@1.24.0", + "pkg:composer/monolog/monolog@1.23.0", + "pkg:composer/monolog/monolog@1.22.1", + "pkg:composer/monolog/monolog@1.22.0", + "pkg:composer/monolog/monolog@1.21.0", + "pkg:composer/monolog/monolog@1.20.0", + "pkg:composer/monolog/monolog@1.19.0", + "pkg:composer/monolog/monolog@1.18.2", + "pkg:composer/monolog/monolog@1.18.1", + 
"pkg:composer/monolog/monolog@1.18.0", + "pkg:composer/monolog/monolog@1.17.2", + "pkg:composer/monolog/monolog@1.17.1", + "pkg:composer/monolog/monolog@1.17.0", + "pkg:composer/monolog/monolog@1.16.0", + "pkg:composer/monolog/monolog@1.15.0", + "pkg:composer/monolog/monolog@1.14.0", + "pkg:composer/monolog/monolog@1.13.1", + "pkg:composer/monolog/monolog@1.13.0", + "pkg:composer/monolog/monolog@1.12.0", + "pkg:composer/monolog/monolog@1.11.0", + "pkg:composer/monolog/monolog@1.10.0", + "pkg:composer/monolog/monolog@1.9.1", + "pkg:composer/monolog/monolog@1.9.0", + "pkg:composer/monolog/monolog@1.8.0", + "pkg:composer/monolog/monolog@1.7.0", + "pkg:composer/monolog/monolog@1.6.0", + "pkg:composer/monolog/monolog@1.5.0", + "pkg:composer/monolog/monolog@1.4.1", + "pkg:composer/monolog/monolog@1.4.0", + "pkg:composer/monolog/monolog@1.3.1", + "pkg:composer/monolog/monolog@1.3.0", + "pkg:composer/monolog/monolog@1.2.1", + "pkg:composer/monolog/monolog@1.2.0", + "pkg:composer/monolog/monolog@1.1.0", + "pkg:composer/monolog/monolog@1.0.2", + "pkg:composer/monolog/monolog@1.0.1", + "pkg:composer/monolog/monolog@1.0.0", + "pkg:composer/monolog/monolog@1.0.0-RC1" +] \ No newline at end of file diff --git a/minecode_pipelines/tests/test_data/composer/packages_list.json b/minecode_pipelines/tests/test_data/composer/packages_list.json new file mode 100644 index 00000000..9682c428 --- /dev/null +++ b/minecode_pipelines/tests/test_data/composer/packages_list.json @@ -0,0 +1,5 @@ + { + "packageNames": [ + "monolog/monolog" + ] + } \ No newline at end of file diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index d245cc9f..f5472a91 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -57,6 +57,7 @@ mine_alpine = "minecode_pipelines.pipelines.mine_alpine:MineAlpine" mine_conan = "minecode_pipelines.pipelines.mine_conan:MineConan" mine_cran = "minecode_pipelines.pipelines.mine_cran:MineCran" mine_swift = 
"minecode_pipelines.pipelines.mine_swift:MineSwift" +mine_composer = "minecode_pipelines.pipelines.mine_composer:MineComposer" [tool.bumpversion] current_version = "0.0.1b21"