From 5d83e274c415ea940a90c235f1e139855a8ac48a Mon Sep 17 00:00:00 2001 From: ashmeet13 Date: Sun, 28 Jun 2020 18:21:19 +0530 Subject: [PATCH 1/3] Init Commit --- labelit/labelit/settings.py | 11 +++++++- labelit/mainapp/forms.py | 6 ++--- labelit/mainapp/jobs.py | 26 ++++++++++++++++--- .../migrations/0003_project_remote_export.py | 19 ++++++++++++++ labelit/mainapp/models.py | 4 ++- labelit/mainapp/storage/gs.py | 12 +++++++++ labelit/mainapp/storage/utils.py | 2 +- labelit/mainapp/validators.py | 11 ++++++++ 8 files changed, 81 insertions(+), 10 deletions(-) create mode 100644 labelit/mainapp/migrations/0003_project_remote_export.py diff --git a/labelit/labelit/settings.py b/labelit/labelit/settings.py index 3eb83e5..c824775 100644 --- a/labelit/labelit/settings.py +++ b/labelit/labelit/settings.py @@ -58,7 +58,7 @@ LSPROXY_CONNECTION_PER_POOL = 20 # Remote storage configs -LABELIT_REMOTE_STORAGE_CONFIG = { +LABELIT_REMOTE_STORAGE_DOWNLOAD_CONFIG = { 's3': { 'region': None, }, @@ -67,6 +67,15 @@ }, } +LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG = { + 's3': { + 'region': None, + }, + 'gs': { + 'project': "mach-learn", + }, +} + # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/3.0/howto/deployment/checklist/ diff --git a/labelit/mainapp/forms.py b/labelit/mainapp/forms.py index bd1d5b6..91cfe62 100644 --- a/labelit/mainapp/forms.py +++ b/labelit/mainapp/forms.py @@ -19,17 +19,17 @@ class ProjectCreateForm(forms.ModelForm): class Meta: model = Project fields = ( - 'name', 'dataset_format', 'dataset_path', 'config', 'export_format' + 'name', 'dataset_format', 'dataset_path', 'config', 'export_format', "remote_export" ) class ProjectEditForm(forms.ModelForm): - editable_fields = ('status', 'export_format',) + editable_fields = ('status', 'export_format') status_choice_values = [Project.Status.ACTIVE.value, Project.Status.DISABLED.value] class Meta: model = Project fields = ( - 'name', 'dataset_format', 'dataset_path', 'config', 'status', 'export_format' + 'name', 'dataset_format', 'dataset_path', 'config', 'status', 'export_format', 'remote_export' ) def __init__(self, *args, **kwargs): diff --git a/labelit/mainapp/jobs.py b/labelit/mainapp/jobs.py index c573b75..2c07be9 100644 --- a/labelit/mainapp/jobs.py +++ b/labelit/mainapp/jobs.py @@ -5,7 +5,7 @@ from apscheduler.schedulers.background import BackgroundScheduler from django.core.cache import cache -from labelit.settings import LABELIT_DIRS, LABELIT_REMOTE_STORAGE_CONFIG +from labelit.settings import LABELIT_DIRS, LABELIT_REMOTE_STORAGE_DOWNLOAD_CONFIG, LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG from .models import Project, ProjectAnnotators from .utils import get_random_port, save_config_file, get_label_studio_cmd, start_tool_server from .storage.utils import get_storage_type @@ -67,12 +67,12 @@ def manage_project_servers(projects=None): try: if project_storage_path_type == 'gs': from .storage.gs import GoogleStorageHandler - gs_project = LABELIT_REMOTE_STORAGE_CONFIG['gs']['project'] + gs_project = LABELIT_REMOTE_STORAGE_DOWNLOAD_CONFIG['gs']['project'] storage_obj = GoogleStorageHandler(project=gs_project) storage_obj.download(project.dataset_path, project_local_storage) elif project_storage_path_type == 's3': from .storage.s3 import S3StorageHandler - s3_region = LABELIT_REMOTE_STORAGE_CONFIG['s3']['region'] + s3_region = LABELIT_REMOTE_STORAGE_DOWNLOAD_CONFIG['s3']['region'] storage_obj = S3StorageHandler(region=s3_region) storage_obj.download(project.dataset_path, project_local_storage) @@ -111,6 +111,19 @@ def export_projects(): # Get all projects projects = Project.objects.all() for project in projects: + storage_type = None + if not project.remote_export=="None": + storage_type = get_storage_type(project.remote_export) + if storage_type == "gs": + from .storage.gs import GoogleStorageHandler + gs_project = LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG['gs']['project'] + storage_obj = GoogleStorageHandler(project = gs_project) + # elif storage_type == "s3": + # from .storage.s3 import S3StorageHandler + # s3_region = LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG['s3']['region'] + # storage_obj = S3StorageHandler(region=s3_region) + # storage_obj.download(project.dataset_path, project_local_storage) + if project.status == Project.Status.ACTIVE and project.export_format != Project.ExportFormat.NONE: logger.info(f"Exporting project {project.name}") output_paths = [] @@ -135,7 +148,12 @@ def export_projects(): else: logger.debug(f"Export format {project.export_format} not supported for project {project.name}") continue - output_paths.append(output_path) + output_paths.append(output_path) + if storage_type == 'gs': + file_to_upload = list(x for x in output_path.iterdir() if x.is_file())[0] + storage_path = f"{project.name}/{annotator.username}/{str(file_to_upload).split('/')[-1]}" + storage_obj.upload(project.remote_export, storage_path, file_to_upload) + def stop_project_servers(projects=None): """Stops running Label studio servers""" diff --git a/labelit/mainapp/migrations/0003_project_remote_export.py b/labelit/mainapp/migrations/0003_project_remote_export.py new file mode 100644 index 0000000..3bf1a1f --- /dev/null +++ b/labelit/mainapp/migrations/0003_project_remote_export.py @@ -0,0 +1,19 @@ +# Generated by Django 3.0.5 on 2020-06-27 10:17 + +from django.db import migrations, models +import mainapp.validators + + +class Migration(migrations.Migration): + + dependencies = [ + ('mainapp', '0002_auto_20200613_0820'), + ] + + operations = [ + migrations.AddField( + model_name='project', + name='remote_export', + field=models.CharField(default='None', help_text='Remote export path (GS or S3)', max_length=500, validators=[mainapp.validators.validate_remote_path], verbose_name='Remote Storage'), + ), + ] diff --git a/labelit/mainapp/models.py b/labelit/mainapp/models.py index 6deb3be..c152dc6 100644 --- a/labelit/mainapp/models.py +++ b/labelit/mainapp/models.py @@ -2,7 +2,7 @@ from django.contrib.auth.models import AbstractUser from django.contrib.auth import get_user_model from django.utils.translation import gettext_lazy as _ -from mainapp.validators import validate_dataset_path, validate_label_config +from mainapp.validators import validate_dataset_path, validate_label_config, validate_remote_path class User(AbstractUser): class StaffRole(models.IntegerChoices): @@ -65,6 +65,8 @@ class Status(models.IntegerChoices): status = models.IntegerField(choices=Status.choices, default=Status.INITIALIZED, help_text="Status of project") # Export format export_format = models.IntegerField(choices=ExportFormat.choices, default=ExportFormat.NONE, help_text="Export format for labelled data", verbose_name="Export Format") + # Remote Export Path + remote_export = models.CharField(validators=[validate_remote_path], default="None", max_length=500, help_text="Remote export path (GS or S3)", verbose_name="Remote Storage") class ProjectAnnotators(models.Model): diff --git a/labelit/mainapp/storage/gs.py b/labelit/mainapp/storage/gs.py index e6da54a..205b7f9 100644 --- a/labelit/mainapp/storage/gs.py +++ b/labelit/mainapp/storage/gs.py @@ -1,6 +1,8 @@ import os from .base import StorageHandler from .utils import split_bucket_path +from labelit.settings import LABELIT_DIRS +from pathlib import Path try: from google.cloud import storage @@ -26,3 +28,13 @@ def download(self, storage_path, download_path): cleaned_blob_name = blob.name.replace('/', '_') destination_file_name = os.path.join(download_path, cleaned_blob_name) blob.download_to_filename(destination_file_name) + + def upload(self, bucket_address, bucket_path_of_file, file_to_upload): + bucket_name, source_blob_path = split_bucket_path(bucket_address) + if len(source_blob_path) > 0: + if not source_blob_path.endswith("/"): + source_blob_path += "/" + bucket_path_of_file = source_blob_path + bucket_path_of_file + bucket = self.storage_client.bucket(bucket_name) + blob = bucket.blob(bucket_path_of_file) + blob.upload_from_filename(file_to_upload) diff --git a/labelit/mainapp/storage/utils.py b/labelit/mainapp/storage/utils.py index 0e474bd..d056ea4 100644 --- a/labelit/mainapp/storage/utils.py +++ b/labelit/mainapp/storage/utils.py @@ -1,5 +1,5 @@ from .config import storage_prefex_config -from .exceptions import StorageNotSupported +from .exceptions import StorageNotSupported, InvalidStoragePath def get_storage_type(storage_path): """Get storage type for a storage path (local and supported remote storages)""" diff --git a/labelit/mainapp/validators.py b/labelit/mainapp/validators.py index 167ea83..d02d7ef 100644 --- a/labelit/mainapp/validators.py +++ b/labelit/mainapp/validators.py @@ -28,3 +28,14 @@ def validate_label_config(config): _('Invalid Label Studio config'), code='invalid' ) + +def validate_remote_path(value): + try: + storage_type = get_storage_type(value) + if storage_type == 'local': + raise Exception("Please enter valid GCP or AWS Format") + except: + raise ValidationError( + _('Enter a valid storage path!'), + code='invalid' + ) From d90cc0e3bfba055ed144bf3f53c5466826c88073 Mon Sep 17 00:00:00 2001 From: ashmeet13 Date: Sun, 28 Jun 2020 19:05:01 +0530 Subject: [PATCH 2/3] Add S3 Support --- labelit/mainapp/jobs.py | 17 ++++++++++------- labelit/mainapp/storage/s3.py | 8 ++++++++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/labelit/mainapp/jobs.py b/labelit/mainapp/jobs.py index 2c07be9..33d2ba5 100644 --- a/labelit/mainapp/jobs.py +++ b/labelit/mainapp/jobs.py @@ -113,16 +113,16 @@ def export_projects(): for project in projects: storage_type = None if not project.remote_export=="None": + # Declare a storage object which will be used to upload files later storage_type = get_storage_type(project.remote_export) if storage_type == "gs": from .storage.gs import GoogleStorageHandler gs_project = LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG['gs']['project'] storage_obj = GoogleStorageHandler(project = gs_project) - # elif storage_type == "s3": - # from .storage.s3 import S3StorageHandler - # s3_region = LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG['s3']['region'] - # storage_obj = S3StorageHandler(region=s3_region) - # storage_obj.download(project.dataset_path, project_local_storage) + elif storage_type == "s3": + from .storage.s3 import S3StorageHandler + s3_region = LABELIT_REMOTE_STORAGE_UPLOAD_CONFIG['s3']['region'] + storage_obj = S3StorageHandler(region=s3_region) if project.status == Project.Status.ACTIVE and project.export_format != Project.ExportFormat.NONE: logger.info(f"Exporting project {project.name}") @@ -148,8 +148,11 @@ def export_projects(): else: logger.debug(f"Export format {project.export_format} not supported for project {project.name}") continue - output_paths.append(output_path) - if storage_type == 'gs': + output_paths.append(output_path) + # Both GS and S3 util have the same upload function which take the same set of parameters to + # upload a file to the bucket. + # The above declared `storage_obj` is used to upload. + if storage_type: file_to_upload = list(x for x in output_path.iterdir() if x.is_file())[0] storage_path = f"{project.name}/{annotator.username}/{str(file_to_upload).split('/')[-1]}" storage_obj.upload(project.remote_export, storage_path, file_to_upload) diff --git a/labelit/mainapp/storage/s3.py b/labelit/mainapp/storage/s3.py index 67e0600..2356301 100644 --- a/labelit/mainapp/storage/s3.py +++ b/labelit/mainapp/storage/s3.py @@ -30,3 +30,11 @@ def download(self, storage_path, download_path): cleaned_object_name = object['Key'].replace('/', '_') destination_file_name = os.path.join(download_path, cleaned_object_name) self.storage_client.download_file(bucket_name, object['Key'], destination_file_name) + + def upload(self, bucket_address, bucket_path_of_file, file_to_upload): + bucket_name, source_blob_path = split_bucket_path(bucket_address) + if len(source_blob_path) > 0: + if not source_blob_path.endswith("/"): + source_blob_path += "/" + bucket_path_of_file = source_blob_path + bucket_path_of_file + self.storage_client.upload_file(file_to_upload, bucket_name, bucket_path_of_file) From ec17e5eef25eec22a09acdcd226d74ec526b43ff Mon Sep 17 00:00:00 2001 From: ashmeet13 Date: Sun, 28 Jun 2020 20:30:04 +0530 Subject: [PATCH 3/3] Fix: --- labelit/labelit/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/labelit/labelit/settings.py b/labelit/labelit/settings.py index c824775..9e9b4f4 100644 --- a/labelit/labelit/settings.py +++ b/labelit/labelit/settings.py @@ -72,7 +72,7 @@ 'region': None, }, 'gs': { - 'project': "mach-learn", + 'project': None, }, }