diff --git a/labelit/labelit/settings.py b/labelit/labelit/settings.py index 3eb83e5..9e9b4f4 100644 --- a/labelit/labelit/settings.py +++ b/labelit/labelit/settings.py @@ -58,7 +58,16 @@ LSPROXY_CONNECTION_PER_POOL = 20 # Remote storage configs -LABELIT_REMOTE_STORAGE_CONFIG = { +LABELIT_REMOTE_STORAGE_DOWNLOAD_CONFIG = { + 's3': { + 'region': None, + }, + 'gs': { + 'project': None, + }, +} + +LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG = { 's3': { 'region': None, }, diff --git a/labelit/mainapp/forms.py b/labelit/mainapp/forms.py index bd1d5b6..91cfe62 100644 --- a/labelit/mainapp/forms.py +++ b/labelit/mainapp/forms.py @@ -19,17 +19,17 @@ class ProjectCreateForm(forms.ModelForm): class Meta: model = Project fields = ( - 'name', 'dataset_format', 'dataset_path', 'config', 'export_format' + 'name', 'dataset_format', 'dataset_path', 'config', 'export_format', "remote_export" ) class ProjectEditForm(forms.ModelForm): - editable_fields = ('status', 'export_format',) + editable_fields = ('status', 'export_format') status_choice_values = [Project.Status.ACTIVE.value, Project.Status.DISABLED.value] class Meta: model = Project fields = ( - 'name', 'dataset_format', 'dataset_path', 'config', 'status', 'export_format' + 'name', 'dataset_format', 'dataset_path', 'config', 'status', 'export_format', 'remote_export' ) def __init__(self, *args, **kwargs): diff --git a/labelit/mainapp/jobs.py b/labelit/mainapp/jobs.py index c573b75..33d2ba5 100644 --- a/labelit/mainapp/jobs.py +++ b/labelit/mainapp/jobs.py @@ -5,7 +5,7 @@ from apscheduler.schedulers.background import BackgroundScheduler from django.core.cache import cache -from labelit.settings import LABELIT_DIRS, LABELIT_REMOTE_STORAGE_CONFIG +from labelit.settings import LABELIT_DIRS, LABELIT_REMOTE_STORAGE_DOWNLOAD_CONFIG, LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG from .models import Project, ProjectAnnotators from .utils import get_random_port, save_config_file, get_label_studio_cmd, start_tool_server from .storage.utils import get_storage_type @@ -67,12 +67,12 @@ def manage_project_servers(projects=None): try: if project_storage_path_type == 'gs': from .storage.gs import GoogleStorageHandler - gs_project = LABELIT_REMOTE_STORAGE_CONFIG['gs']['project'] + gs_project = LABELIT_REMOTE_STORAGE_DOWNLOAD_CONFIG['gs']['project'] storage_obj = GoogleStorageHandler(project=gs_project) storage_obj.download(project.dataset_path, project_local_storage) elif project_storage_path_type == 's3': from .storage.s3 import S3StorageHandler - s3_region = LABELIT_REMOTE_STORAGE_CONFIG['s3']['region'] + s3_region = LABELIT_REMOTE_STORAGE_DOWNLOAD_CONFIG['s3']['region'] storage_obj = S3StorageHandler(region=s3_region) storage_obj.download(project.dataset_path, project_local_storage) @@ -111,6 +111,19 @@ def export_projects(): # Get all projects projects = Project.objects.all() for project in projects: + storage_type = None + if not project.remote_export=="None": + # Declare a storage object which will be used to upload files later + storage_type = get_storage_type(project.remote_export) + if storage_type == "gs": + from .storage.gs import GoogleStorageHandler + gs_project = LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG['gs']['project'] + storage_obj = GoogleStorageHandler(project = gs_project) + elif storage_type == "s3": + from .storage.s3 import S3StorageHandler + s3_region = LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG['s3']['region'] + storage_obj = S3StorageHandler(region=s3_region) + if project.status == Project.Status.ACTIVE and project.export_format != Project.ExportFormat.NONE: logger.info(f"Exporting project {project.name}") output_paths = [] @@ -136,6 +149,14 @@ def export_projects(): logger.debug(f"Export format {project.export_format} not supported for project {project.name}") continue output_paths.append(output_path) + # Both GS and S3 util have the same upload function which take the same set of parameters to + # upload a file to the bucket. + # The above declared `storage_obj` is used to upload. + if storage_type: + file_to_upload = list(x for x in output_path.iterdir() if x.is_file())[0] + storage_path = f"{project.name}/{annotator.username}/{str(file_to_upload).split('/')[-1]}" + storage_obj.upload(project.remote_export, storage_path, file_to_upload) + def stop_project_servers(projects=None): """Stops running Label studio servers""" diff --git a/labelit/mainapp/migrations/0003_project_remote_export.py b/labelit/mainapp/migrations/0003_project_remote_export.py new file mode 100644 index 0000000..3bf1a1f --- /dev/null +++ b/labelit/mainapp/migrations/0003_project_remote_export.py @@ -0,0 +1,19 @@ +# Generated by Django 3.0.5 on 2020-06-27 10:17 + +from django.db import migrations, models +import mainapp.validators + + +class Migration(migrations.Migration): + + dependencies = [ + ('mainapp', '0002_auto_20200613_0820'), + ] + + operations = [ + migrations.AddField( + model_name='project', + name='remote_export', + field=models.CharField(default='None', help_text='Remote export path (GS or S3)', max_length=500, validators=[mainapp.validators.validate_remote_path], verbose_name='Remote Storage'), + ), + ] diff --git a/labelit/mainapp/models.py b/labelit/mainapp/models.py index 6deb3be..c152dc6 100644 --- a/labelit/mainapp/models.py +++ b/labelit/mainapp/models.py @@ -2,7 +2,7 @@ from django.contrib.auth.models import AbstractUser from django.contrib.auth import get_user_model from django.utils.translation import gettext_lazy as _ -from mainapp.validators import validate_dataset_path, validate_label_config +from mainapp.validators import validate_dataset_path, validate_label_config, validate_remote_path class User(AbstractUser): class StaffRole(models.IntegerChoices): @@ -65,6 +65,8 @@ class Status(models.IntegerChoices): status = models.IntegerField(choices=Status.choices, default=Status.INITIALIZED, help_text="Status of project") # Export format export_format = models.IntegerField(choices=ExportFormat.choices, default=ExportFormat.NONE, help_text="Export format for labelled data", verbose_name="Export Format") + # Remote Export Path + remote_export = models.CharField(validators=[validate_remote_path], default="None", max_length=500, help_text="Remote export path (GS or S3)", verbose_name="Remote Storage") class ProjectAnnotators(models.Model): diff --git a/labelit/mainapp/storage/gs.py b/labelit/mainapp/storage/gs.py index e6da54a..205b7f9 100644 --- a/labelit/mainapp/storage/gs.py +++ b/labelit/mainapp/storage/gs.py @@ -1,6 +1,8 @@ import os from .base import StorageHandler from .utils import split_bucket_path +from labelit.settings import LABELIT_DIRS +from pathlib import Path try: from google.cloud import storage @@ -26,3 +28,13 @@ def download(self, storage_path, download_path): cleaned_blob_name = blob.name.replace('/', '_') destination_file_name = os.path.join(download_path, cleaned_blob_name) blob.download_to_filename(destination_file_name) + + def upload(self, bucket_address, bucket_path_of_file, file_to_upload): + bucket_name, source_blob_path = split_bucket_path(bucket_address) + if len(source_blob_path) > 0: + if not source_blob_path.endswith("/"): + source_blob_path += "/" + bucket_path_of_file = source_blob_path + bucket_path_of_file + bucket = self.storage_client.bucket(bucket_name) + blob = bucket.blob(bucket_path_of_file) + blob.upload_from_filename(file_to_upload) diff --git a/labelit/mainapp/storage/s3.py b/labelit/mainapp/storage/s3.py index 67e0600..2356301 100644 --- a/labelit/mainapp/storage/s3.py +++ b/labelit/mainapp/storage/s3.py @@ -30,3 +30,11 @@ def download(self, storage_path, download_path): cleaned_object_name = object['Key'].replace('/', '_') destination_file_name = os.path.join(download_path, cleaned_object_name) self.storage_client.download_file(bucket_name, object['Key'], destination_file_name) + + def upload(self, bucket_address, bucket_path_of_file, file_to_upload): + bucket_name, source_blob_path = split_bucket_path(bucket_address) + if len(source_blob_path) > 0: + if not source_blob_path.endswith("/"): + source_blob_path += "/" + bucket_path_of_file = source_blob_path + bucket_path_of_file + self.storage_client.upload_file(file_to_upload, bucket_name, bucket_path_of_file) diff --git a/labelit/mainapp/storage/utils.py b/labelit/mainapp/storage/utils.py index 0e474bd..d056ea4 100644 --- a/labelit/mainapp/storage/utils.py +++ b/labelit/mainapp/storage/utils.py @@ -1,5 +1,5 @@ from .config import storage_prefex_config -from .exceptions import StorageNotSupported +from .exceptions import StorageNotSupported, InvalidStoragePath def get_storage_type(storage_path): """Get storage type for a storage path (local and supported remote storages)""" diff --git a/labelit/mainapp/validators.py b/labelit/mainapp/validators.py index 167ea83..d02d7ef 100644 --- a/labelit/mainapp/validators.py +++ b/labelit/mainapp/validators.py @@ -28,3 +28,14 @@ def validate_label_config(config): _('Invalid Label Studio config'), code='invalid' ) + +def validate_remote_path(value): + try: + storage_type = get_storage_type(value) + if storage_type == 'local': + raise Exception("Please enter valid GCP or AWS Format") + except: + raise ValidationError( + _('Enter a valid storage path!'), + code='invalid' + )