Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions minecode_pipelines/miners/composer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
from minecode_pipelines.utils import get_temp_file
import requests
from packageurl import PackageURL


def get_composer_packages():
    """
    Fetch the list of all Composer package names from Packagist and save the
    JSON response to a temporary file.

    Return the path of the temporary JSON file, or None when the list could
    not be fetched (network error, timeout or non-success HTTP status).

    Response example:
        {
            "packageNames": ["0.0.0/composer-include-files", "0.0.0/laravel-env-shim"]
        }
    """
    try:
        # A timeout prevents the pipeline from hanging forever on a stalled
        # connection; request errors are reported as None like HTTP errors.
        response = requests.get("https://packagist.org/packages/list.json", timeout=30)
    except requests.RequestException:
        return None

    if not response.ok:
        return None

    packages = response.json()
    temp_file = get_temp_file("ComposerPackages", "json")
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(packages, f, indent=4)

    return temp_file


def get_composer_purl(vendor, package):
    """
    Return a list of Package URL (purl) strings, one for each release of the
    Composer package ``vendor/package`` found in its Packagist metadata.

    Return an empty list when the request fails, when the response body is
    not valid JSON, or when the payload does not have the expected shape.

    Response example (shape read by this function):
        {
            "minified": "composer/2.0",
            "packages": {
                "monolog/monolog": [
                    {
                        "name": "monolog/monolog",
                        "version": "3.9.0"
                    }
                ]
            },
            "security-advisories": [
                {
                    "advisoryId": "PKSA-dmw8-jd8k-q3c6",
                    "affectedVersions": ">=1.8.0,<1.12.0"
                }
            ]
        }

    get_composer_purl("monolog", "monolog")
    -> ["pkg:composer/monolog/monolog@3.9.0", "pkg:composer/monolog/monolog@3.8.0", ...]
    """
    purls = []
    url = f"https://repo.packagist.org/p2/{vendor}/{package}.json"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Decoding is part of the fallible fetch: a truncated or non-JSON
        # body must not abort the mining of every other package.
        data = response.json()
    except (requests.RequestException, ValueError):
        return purls

    packages = data.get("packages") if isinstance(data, dict) else None
    if not isinstance(packages, dict):
        return purls

    releases = packages.get(f"{vendor}/{package}")
    if not isinstance(releases, list):
        return purls

    for release in releases:
        version = release.get("version") if isinstance(release, dict) else None
        if version:
            purl = PackageURL(
                type="composer",
                namespace=vendor,
                name=package,
                version=version,
            )
            purls.append(purl.to_string())

    return purls


def load_composer_packages(packages_file):
    """
    Load and return a list of (vendor, package) tuples from a JSON file.

    ``packages_file`` is the path of a JSON file holding a "packageNames"
    list of "vendor/package" strings. Names without a "/" are skipped and
    only the first "/" separates the vendor from the package.

    Return an empty list when ``packages_file`` is None or empty, which is
    what get_composer_packages() returns when the package list could not be
    fetched.
    """
    if not packages_file:
        return []

    with open(packages_file, encoding="utf-8") as f:
        packages_data = json.load(f)

    package_names = packages_data.get("packageNames", [])
    # The "/" check guarantees that each split yields exactly two parts.
    return [tuple(name.split("/", 1)) for name in package_names if "/" in name]
82 changes: 82 additions & 0 deletions minecode_pipelines/pipelines/mine_composer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import os
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode

from matchcode_pipeline import pipes
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO
from minecode_pipelines.pipes.composer import mine_composer_packages
from minecode_pipelines.pipes.composer import mine_and_publish_composer_purls

# Git URL of the data repository cloned by the pipeline to receive the mined
# Composer packageURLs. It can be overridden with the MINECODE_COMPOSER_GIT_URL
# environment variable.
MINECODE_COMPOSER_GIT_URL = os.environ.get(
    "MINECODE_COMPOSER_GIT_URL", "https://github.com/aboutcode-data/minecode-data-composer-test"
)


class MineComposer(Pipeline):
    """
    Mine all packageURLs from a composer index and publish them to a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        return (
            cls.check_federatedcode_eligibility,
            cls.clone_composer_repo,
            cls.mine_and_publish_composer_purls,
            # Remove the repositories cloned by clone_composer_repo once the
            # mined data has been published.
            cls.delete_cloned_repos,
        )

    def check_federatedcode_eligibility(self):
        """
        Check that FederatedCode is configured and available before mining,
        since the mined packageURLs are pushed to FederatedCode.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def clone_composer_repo(self):
        """
        Clone the Composer data repository and the pipelines configuration
        repository, and keep both cloned repositories on the pipeline instance
        for the following steps.
        """
        self.cloned_data_repo = federatedcode.clone_repository(MINECODE_COMPOSER_GIT_URL)
        self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)

    def mine_and_publish_composer_purls(self):
        """
        Mine Composer package names from Composer indexes and generate
        package URLs (pURLs) for all mined Composer packages.
        """

        composer_packages = mine_composer_packages()
        mine_and_publish_composer_purls(
            packages=composer_packages,
            cloned_data_repo=self.cloned_data_repo,
            cloned_config_repo=self.cloned_config_repo,
            logger=self.log,
        )

    def delete_cloned_repos(self):
        """
        Delete the data and configuration repositories cloned by this pipeline.
        """
        pipes.delete_cloned_repos(
            repos=[self.cloned_data_repo, self.cloned_config_repo],
            logger=self.log,
        )
111 changes: 111 additions & 0 deletions minecode_pipelines/pipes/composer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from datetime import datetime
from pathlib import Path
from aboutcode import hashid
from aboutcode.hashid import get_package_base_dir
from minecode_pipelines.miners.composer import get_composer_packages
from minecode_pipelines.miners.composer import load_composer_packages
from minecode_pipelines.miners.composer import get_composer_purl
from minecode_pipelines.pipes import (
write_data_to_yaml_file,
get_checkpoint_from_file,
update_checkpoints_in_github,
)
from scanpipe.pipes.federatedcode import commit_changes
from scanpipe.pipes.federatedcode import push_changes
from minecode_pipelines import VERSION
from minecode_pipelines.utils import cycle_from_index, grouper

# Number of packages grouped in one batch; it is also the amount by which the
# checkpoint "start_index" advances for each processed batch.
PACKAGE_BATCH_SIZE = 100
# Path of the Composer checkpoint file, passed together with the cloned
# configuration repository when reading and updating the checkpoint.
COMPOSER_CHECKPOINT_PATH = "composer/checkpoints.json"


def mine_composer_packages():
    """
    Download the Packagist package list to a file and return its content as a
    list of (vendor, package) tuples.
    """
    return load_composer_packages(get_composer_packages())


def mine_and_publish_composer_purls(packages, cloned_data_repo, cloned_config_repo, logger):
    """
    Mine the packageURLs of the Composer ``packages`` and publish them to the
    FederatedCode data repository ``cloned_data_repo``.

    ``packages`` is a list of (vendor, package) tuples. Processing starts at
    the "start_index" read from the checkpoint file of ``cloned_config_repo``
    and proceeds in batches of PACKAGE_BATCH_SIZE packages. After each batch,
    the written files are committed and pushed (when there are any) and the
    checkpoint is updated. ``logger`` is a callable receiving log messages.
    """
    checkpoint = get_checkpoint_from_file(
        cloned_repo=cloned_config_repo, path=COMPOSER_CHECKPOINT_PATH
    )
    start_index = checkpoint.get("start_index", 0)

    batches = grouper(
        n=PACKAGE_BATCH_SIZE,
        iterable=cycle_from_index(packages, start_index),
    )

    for batch_number, package_batch in enumerate(batches, start=1):
        written_files = []
        committed_purls = []

        for entry in package_batch:
            # Falsy entries are skipped.
            if not entry:
                continue

            vendor, package = entry
            outcome = _write_package_purls(
                vendor=vendor,
                package=package,
                cloned_data_repo=cloned_data_repo,
                logger=logger,
            )
            if outcome is None:
                continue

            yaml_path, purl = outcome
            written_files.append(yaml_path)
            committed_purls.append(purl)

        if written_files:
            commit_changes(
                repo=cloned_data_repo,
                files_to_commit=written_files,
                purls=committed_purls,
                mine_type="packageURL",
                tool_name="pkg:composer/minecode-pipelines",
                tool_version=VERSION,
            )
            push_changes(repo=cloned_data_repo)

        # Record how far the mining went so far.
        update_checkpoints_in_github(
            checkpoint={
                "date": str(datetime.now()),
                "start_index": start_index + batch_number * PACKAGE_BATCH_SIZE,
            },
            cloned_repo=cloned_config_repo,
            path=COMPOSER_CHECKPOINT_PATH,
        )


def _write_package_purls(vendor, package, cloned_data_repo, logger):
    """
    Fetch the packageURLs of ``vendor/package`` and write them to their YAML
    file under the working directory of ``cloned_data_repo``.

    Return a (written file path, first packageURL as a string) tuple, or None
    when no packageURL was found for the package.
    """
    logger(f"getting packageURLs for package: {vendor}/{package}")

    package_purls = get_composer_purl(vendor=vendor, package=package)
    if not package_purls:
        return None

    first_purl = package_purls[0]
    base_dir = get_package_base_dir(purl=first_purl)

    logger(f"writing packageURLs for package: {first_purl} at: {base_dir}")
    joined_purls = " ".join(package_purls)
    logger(f"packageURLs: {joined_purls}")

    yaml_path = Path(cloned_data_repo.working_dir).joinpath(
        hashid.get_package_purls_yml_file_path(first_purl)
    )
    write_data_to_yaml_file(path=yaml_path, data=package_purls)

    return yaml_path, str(first_purl)
58 changes: 58 additions & 0 deletions minecode_pipelines/tests/pipes/test_composer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
from pathlib import Path
from unittest.mock import patch, MagicMock
from django.test import SimpleTestCase

from minecode_pipelines.miners.composer import get_composer_packages
from minecode_pipelines.miners.composer import load_composer_packages
from minecode_pipelines.miners.composer import get_composer_purl

# Directory holding the JSON fixture files read by the tests below.
DATA_DIR = Path(__file__).parent.parent / "test_data" / "composer"


class ComposerPipelineTests(SimpleTestCase):
    @patch("requests.get")
    def test_generate_purls_from_composer(self, mock_get):
        """
        Mine Composer packages and generate their PURLs while the Packagist
        requests are mocked with the JSON files stored in test_data/composer.
        """

        def read_fixture(filename):
            # Load one JSON fixture from the Composer test data directory.
            with open(DATA_DIR / filename, encoding="utf-8") as fixture:
                return json.load(fixture)

        def fake_response(payload):
            # Build a successful fake HTTP response whose json() is ``payload``.
            response = MagicMock()
            response.ok = True
            response.json.return_value = payload
            return response

        fake_packages_list = read_fixture("packages_list.json")
        fake_package_details = read_fixture("package_details.json")
        expected_output = read_fixture("expected_output.json")

        # Two responses are queued: the package list first, then the details
        # of a package.
        mock_get.side_effect = [
            fake_response(fake_packages_list),
            fake_response(fake_package_details),
        ]

        packages = load_composer_packages(get_composer_packages())

        all_purls = [
            purl
            for vendor, package in packages
            for purl in get_composer_purl(vendor, package)
        ]

        assert len(all_purls) == 85
        assert all_purls == expected_output
Loading
Loading