diff --git a/README.md b/README.md index 9b1dc4719d..94cbf53c64 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,60 @@ Example usage (refer to docs for more): print(file_content) ``` +## Experimental support for data-proxy + +Original implementation from Bjorn Kindler & Jan Fousek. + +Example Usage: + +### Access collab bucket + +```python + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + + # access existing bucket + bucket = client.buckets.get_bucket("existing_collab_name") + + # or create a new collab + bucket + bucket = client.create_new("new_collab_name") + + # upload new file + bucket.upload("/home/jovyan/test.txt", "test/foobar.txt") + + # it seems newly uplaoded file will **NOT** be available immediately. Sleep for x seconds? + from time import sleep + sleep(1) + + # list the contents + files = [f for f in bucket.ls(prefix="test")] + + # get the uploaded file + file_handle = bucket.get_file("foobar.txt") + file_content = file_handle.get_content() + + # delete a bucket (n.b. this will **NOT** delete the collab!) + client.delete_bucket("new_bucket_name") +``` + +### Access datasets (e.g. HDG datasets) + +```python + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + + # access dataset bucket + # setting requeste_access = True will start the relevant access-request-flow when accessing HDG datasets + bucket = client.buckets.get_dataset("existing_dataset_id", request_access=True) + + # list the contents + files = [f for f in bucket.ls(prefix="path/to/somewhere/foo")] + + # get a file content + file_handle = bucket.get_file("path/to/somewhere/foobar.txt") + file_content = file_handle.get_content() + +```
EU Logo
diff --git a/doc.md b/doc.md index 0e1749760a..6d6f323022 100644 --- a/doc.md +++ b/doc.md @@ -1,6 +1,8 @@ -# Python Seafile +# Ebrains Drive

+ +
  • DataProxy
  • + +

    -# Python Seafile +# Drive (Seafile) ## Get Client ## @@ -459,3 +484,243 @@ None **Return Type** A Response Instance + + + +# Bucket + +## Get Client +**Request Parameters** + +* token + +**Sample Case** + +```python + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") +``` + + +**Return Type** + +A Client Object + +## Bucket ## +### Get Bucket ### +**Request Parameters** + +* existing_collab_name + +**Sample Case** + +```python + + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + bucket = client.buckets.get_bucket("existing_collab_name") +``` + +**Return Type** + +A Bucket Object + +**Exceptions** + +* Bucket does not exist or not authorized to use the specified bucket + +### Create Bucket ### +**Request Parameters** + +* new_collab_name + +**Sample Case** + +```python + + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + bucket = client.create_new("new_collab_name") +``` + +**Return Type** + +A Bucket Object + +**Exceptions** + +* Unauthorized to create new collab or bucket + +### List Bucket Entries ### +**Request Parameters** + +* prefix (optional) + +**Sample Case** + +```python + + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + bucket = client.buckets.get_bucket("existing_collab_name") + + # shows all files + all_files = [f for f in bucket.ls()] + + # shows all files that begins with path/to/my/files + my_files = [f for f in bucket.ls(prefix="path/to/my/files")] +``` + +**Return Type** + +An Iterator of File Objects + +**Exceptions** + +* Unauthorized + +## Dataset ## +### Get Dataset ### + +Note, if _request_access_ is set to `True`, this method may require user interaction. + +**Request Parameters** + +* dataset_id +* request_access (optional, default `False`) + +**Sample Case** + +```python + + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + bucket = client.buckets.get_dataset("dataset_id") + +``` + +**Return Type** +A Bucket Object + +**Exceptions** + +* Unauthorized (if _request_access_ is not set) + +## File ## + +Files in buckets are not typically organised in directories. Users may use the `/` in filename to construct a directory-like structure. + + +### Get File ### +**Request Parameters** + +* filename + +**Sample Case** + +```python + + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + + bucket = client.buckets.get_bucket("existing_collab_name") + # OR + bucket = client.buckets.get_dataset("dataset_id") + + file_handle = bucket.get_file("filename") + +``` + +**Return Type** + +A File Object + +**Exceptions** + +* Unauthorized +* DoesNotExist + +### Get File Content ### +**Request Parameters** + +* filename + +**Sample Case** + +```python + + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + + bucket = client.buckets.get_bucket("existing_collab_name") + # OR + bucket = client.buckets.get_dataset("dataset_id") + + file_handle = bucket.get_file("filename") + file_content = file_handle.get_content() + +``` + +**Return Type** + +bytes + +**Exceptions** + +* Unauthorized +* DoesNotExist + + +### Upload File ### +**Request Parameters** + +* path_to_file +* dest_filename + +**Sample Case** + +```python + + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + bucket = client.buckets.get_bucket("existing_collab_name") + + bucket.upload("path_to_file", "dest_filename") + +``` + +**Return Type** + +None + +**Exceptions** + +* Unauthorized + +### Delete File ### +**Request Parameters** + +* filename + +**Sample Case** + +```python + + from ebrains_drive import BucketApiClient + client = BucketApiClient(token="ey...") + bucket = client.buckets.get_bucket("existing_collab_name") + + file_handle = bucket.get_file("filename") + file_handle.delete() + +``` + +**Return Type** + +None + +**Exceptions** + +* Unauthorized +* DoesNotExist +* AssertionError diff --git a/ebrains_drive/__init__.py b/ebrains_drive/__init__.py index 20ca2e167b..9325a9dd74 100644 --- a/ebrains_drive/__init__.py +++ b/ebrains_drive/__init__.py @@ -7,7 +7,7 @@ """ -from ebrains_drive.client import DriveApiClient +from ebrains_drive.client import DriveApiClient, BucketApiClient def connect(username=None, password=None, token=None, env=""): client = DriveApiClient(username, password, token, env) diff --git a/ebrains_drive/bucket.py b/ebrains_drive/bucket.py new file mode 100644 index 0000000000..2282e0a5a0 --- /dev/null +++ b/ebrains_drive/bucket.py @@ -0,0 +1,85 @@ +from typing import Iterable +import requests +from ebrains_drive.exceptions import DoesNotExist, InvalidParameter +from ebrains_drive.files import DataproxyFile +from ebrains_drive.utils import on_401_raise_unauthorized + +class Bucket(object): + + LIMIT = 100 + + """ + A dataproxy bucket + n.b. for a dataset bucket, role & is_public may be None + """ + def __init__(self, client, name: str, objects_count: int, bytes: int, last_modified: str, is_public: bool = None, role: str = None, *, public: bool= False, target: str='buckets', dataset_id: str=None) -> None: + if target != 'buckets' and target != 'datasets': + raise InvalidParameter(f'Init Buckets exception: target can be left unset, but if set, must either be buckets or datasets') + if public: + raise NotImplementedError(f"Access to public datasets/buckets NYI.") + self.public = public + self.target = target + + self.client = client + + self.name = name + self.objects_count = objects_count + self.bytes = bytes + self.last_modified = last_modified + self.is_public = is_public + self.role = role + + # n.b. for dataset bucket, dataset_id needs to be used for dataproxy_entity_name, but for collab bucket, name is used + self.dataproxy_entity_name = dataset_id or name + + @classmethod + def from_json(cls, client, bucket_json, *, public:bool = False, target: str='buckets', dataset_id=None) -> 'Bucket': + return cls(client, **bucket_json, public=public, target=target, dataset_id=dataset_id) + + def __str__(self): + return "(name='{}')".format(self.name) + + def __repr__(self): + return "ebrains_drive.bucket.Bucket(name='{}')".format(self.name) + + @on_401_raise_unauthorized("Unauthorized.") + def ls(self, prefix: str=None) -> Iterable[DataproxyFile]: + marker = None + visited_name = set() + while True: + resp = self.client.get(f"/v1/{self.target}/{self.dataproxy_entity_name}", params={ + 'limit': self.LIMIT, + 'marker': marker, + 'prefix': prefix + }) + objects = resp.json().get("objects", []) + if len(objects) == 0: + break + + for obj in objects: + + yield DataproxyFile.from_json(self.client, self, obj) + marker = obj.get("name") + + if marker in visited_name: + raise RuntimeError(f"Bucket.ls error: hash {marker} has already been visited.") + visited_name.add(marker) + return + + @on_401_raise_unauthorized("Unauthorized") + def get_file(self, name: str) -> DataproxyFile: + name = name.lstrip("/") + for file in self.ls(prefix=name): + if file.name == name: + return file + raise DoesNotExist(f"Cannot find {name}.") + + @on_401_raise_unauthorized("Unauthorized") + def upload(self, fileobj: str, filename: str): + filename = filename.lstrip("/") + resp = self.client.put(f"/v1/{self.target}/{self.dataproxy_entity_name}/{filename}") + upload_url = resp.json().get("url") + if upload_url is None: + raise RuntimeError(f"Bucket.upload did not get upload url.") + resp = requests.request("PUT", upload_url, data=open(fileobj, 'rb')) + resp.raise_for_status() diff --git a/ebrains_drive/buckets.py b/ebrains_drive/buckets.py new file mode 100644 index 0000000000..f5986e0c72 --- /dev/null +++ b/ebrains_drive/buckets.py @@ -0,0 +1,37 @@ +from ebrains_drive.exceptions import ClientHttpError, Unauthorized +from ebrains_drive.utils import on_401_raise_unauthorized +from ebrains_drive.bucket import Bucket +from time import sleep + +class Buckets(object): + + def __init__(self, client): + self.client = client + + @on_401_raise_unauthorized('401 response. Check you/your token have access right and/or the bucket name has been spelt correctly.') + def get_bucket(self, bucket_name: str, *, public: bool=False) -> Bucket: + """Get the specified bucket according name. If forced flag is set to True, will attempt to create the collab, if necessary. + """ + resp = self.client.get(f"/v1/buckets/{bucket_name}/stat") + return Bucket.from_json(self.client, resp.json(), public=public, target='buckets') + + def get_dataset(self, dataset_id: str, *, public: bool=False, request_access: bool=False): + request_sent = False + attempt_no = 0 + while True: + try: + resp = self.client.get(f"/v1/datasets/{dataset_id}/stat") + return Bucket.from_json(self.client, resp.json(), public=public, target="datasets", dataset_id=dataset_id) + except ClientHttpError as e: + if e.code != 401: + raise e + + if not request_access: + raise Unauthorized(f"You do not have access to this dataset. If this is a private dataset, try to set request_access flag to true. We can start the procedure of requesting access for you.") + if not request_sent: + self.client.post(f"/v1/datasets/{dataset_id}", expected=(200, 201)) + request_sent = True + print("Request sent. Please check the mail box associated with the token.") + sleep(5) + attempt_no = attempt_no + 1 + print(f"Checking permission, attempt {attempt_no}") diff --git a/ebrains_drive/client.py b/ebrains_drive/client.py index efa825abe7..55659fe2a9 100644 --- a/ebrains_drive/client.py +++ b/ebrains_drive/client.py @@ -1,28 +1,22 @@ -import re from getpass import getpass import requests -from ebrains_drive.utils import urljoin -from ebrains_drive.exceptions import ClientHttpError +from abc import ABC +import base64 +import json +import time +from ebrains_drive.utils import urljoin, on_401_raise_unauthorized +from ebrains_drive.exceptions import ClientHttpError, TokenExpired from ebrains_drive.repos import Repos +from ebrains_drive.buckets import Buckets from ebrains_drive.file import File - -class DriveApiClient(object): - """Wraps seafile web api""" - def __init__(self, username=None, password=None, token=None, env=""): - """Wraps various basic operations to interact with seahub http api. - """ - self._set_env(env) - - self.server = self.drive_url +class ClientBase(ABC): + def __init__(self, username=None, password=None, token=None, env="") -> None: self.username = username self.password = password self._token = token - - self.repos = Repos(self) - self.groups = Groups(self) - self.file = File(self) + self.server = None if token is None: if self.username is None: @@ -36,6 +30,7 @@ def __init__(self, username=None, password=None, token=None, env=""): print("Error: Invalid user credentials!") raise + def _set_env(self, env=''): self.suffix = "" @@ -45,19 +40,9 @@ def _set_env(self, env=''): self.suffix = "-int" # else we keep empty suffix for production - self.drive_url = "https://drive" + self.suffix + ".ebrains.eu" self.iam_host = "iam" + self.suffix + ".ebrains.eu" self.iam_url = "https://" + self.iam_host - - def get_drive_url(self): - return self.drive_url - - def get_iam_host(self): - return self.iam_host - - def get_iam_url(self): - return self.iam_url - + def _get_token(self): response = requests.post( self.iam_url+'/auth/realms/hbp/protocol/openid-connect/token', @@ -67,29 +52,26 @@ def _get_token(self): 'username':self.username, 'password':self.password }) - self._token = response.json()['access_token'] - - def __str__(self): - return 'DriveApiClient[server=%s, user=%s]' % (self.server, self.username) - - __repr__ = __str__ - + def get(self, *args, **kwargs): - return self._send_request('GET', *args, **kwargs) + return self.send_request('GET', *args, **kwargs) def post(self, *args, **kwargs): - return self._send_request('POST', *args, **kwargs) + return self.send_request('POST', *args, **kwargs) def put(self, *args, **kwargs): - return self._send_request('PUT', *args, **kwargs) + return self.send_request('PUT', *args, **kwargs) def delete(self, *args, **kwargs): - return self._send_request('delete', *args, **kwargs) + return self.send_request('DELETE', *args, **kwargs) - def _send_request(self, method, url, *args, **kwargs): + def send_request(self, method: str, url: str, *args, **kwargs): if not url.startswith('http'): - url = urljoin(self.server, url) + # sanity checks. + # - accounts for if server was provided with trailing slashes + # - accounts for if url was provided with leading slashes + url = self.server.rstrip('/') + '/' + url.lstrip('/') headers = kwargs.get('headers', {}) headers.setdefault('Authorization', 'Bearer ' + self._token) @@ -106,6 +88,90 @@ def _send_request(self, method, url, *args, **kwargs): return resp +class DriveApiClient(ClientBase): + """Wraps seafile web api""" + def __init__(self, username=None, password=None, token=None, env=""): + """Wraps various basic operations to interact with seahub http api. + """ + self._set_env(env) + super().__init__(username, password, token, env) + + self.server = self.drive_url + + self.repos = Repos(self) + self.groups = Groups(self) + self.file = File(self) + + def _set_env(self, env=''): + super()._set_env(env) + self.drive_url = "https://drive" + self.suffix + ".ebrains.eu" + + def get_drive_url(self): + return self.drive_url + + def get_iam_host(self): + return self.iam_host + + def get_iam_url(self): + return self.iam_url + + def __str__(self): + return 'DriveApiClient[server=%s, user=%s]' % (self.server, self.username) + + __repr__ = __str__ + + def send_request(self, method: str, url: str, *args, **kwargs): + if not url.startswith('http'): + url = urljoin(self.server, url) + return super().send_request(method, url, *args, **kwargs) + +class BucketApiClient(ClientBase): + + def __init__(self, username=None, password=None, token=None, env="") -> None: + if env != "": + raise NotImplementedError("non prod environment for dataproxy access has not yet been implemented.") + self._set_env(env) + + super().__init__(username, password, token, env) + + self.server = "https://data-proxy.ebrains.eu/api" + + self.buckets = Buckets(self) + + @on_401_raise_unauthorized("Failed. Note: BucketApiClient.create_new needs to have clb.drive:write as a part of scope.") + def create_new(self, bucket_name: str, title=None, description="Created by ebrains_drive"): + # attempt to create new collab + self.send_request("POST", "https://wiki.ebrains.eu/rest/v1/collabs", json={ + "name": bucket_name, + "title": title or bucket_name, + "description": description, + "drive": True, + "chat": True, + "public": False + }, expected=201) + + # activate the bucket for the said collab + self.send_request("POST", "/v1/buckets", json={ + "bucket_name": bucket_name + }, expected=201) + + @on_401_raise_unauthorized("Failed. Note: BucketApiClient.create_new needs to have clb.drive:write as a part of scope.") + def delete_bucket(self, bucket_name: str): + self.send_request("DELETE", f"/v1/buckets/{bucket_name}") + + def send_request(self, method: str, url: str, *args, **kwargs): + hdr, info, sig = self._token.split('.') + info_json = base64.b64decode(info + '==').decode('utf-8') + + # https://www.rfc-editor.org/rfc/rfc7519#section-2 + exp_utc_seconds = json.loads(info_json).get('exp') + now_tc_seconds = time.time() + + if now_tc_seconds > exp_utc_seconds: + raise TokenExpired + + return super().send_request(method, url, *args, **kwargs) + class Groups(object): def __init__(self, client): diff --git a/ebrains_drive/exceptions.py b/ebrains_drive/exceptions.py index b11498d776..a75331b4dc 100644 --- a/ebrains_drive/exceptions.py +++ b/ebrains_drive/exceptions.py @@ -23,3 +23,16 @@ def __init__(self, msg): def __str__(self): return 'DoesNotExist: %s' % self.msg + +class Unauthorized(Exception): + + def __init__(self, msg): + super().__init__() + self.msg = msg + + def __str__(self): + return 'Unauthorized. This could be a result of either incorrect path or insufficient privilege. %s' % self.msg + +class InvalidParameter(Exception): pass + +class TokenExpired(Exception): pass diff --git a/ebrains_drive/files.py b/ebrains_drive/files.py index 34c6d2399a..c6fef27f38 100644 --- a/ebrains_drive/files.py +++ b/ebrains_drive/files.py @@ -3,7 +3,9 @@ import posixpath import re import time -from ebrains_drive.utils import querystr +from typing import Any, Dict +import requests +from ebrains_drive.utils import querystr, on_401_raise_unauthorized # Note: only files and dirs with contents is assigned an ID; else their ID is set to all zeros ZERO_OBJ_ID = '0000000000000000000000000000000000000000' @@ -338,3 +340,45 @@ def get_content(self): """Get the content of the file""" url = self._get_download_link() return self.client.get(url).content + +class DataproxyFile: + def __init__(self, client, bucket, hash: str, last_modified: str, bytes: int, name: str, content_type: str) -> None: + self.client = client + self.bucket = bucket + + self.hash = hash + self.last_modified = last_modified + self.bytes = bytes + self.name = name + self.content_type = content_type + + def __str__(self): + return 'DataproxyFile[bucket=%s, path=%s, size=%s]' % \ + (self.bucket.name, self.name, self.bytes) + + __repr__ = __str__ + + def get_download_link(self): + """n.b. this download link expires in the order of seconds + """ + resp = self.client.get(f"/v1/{self.bucket.target}/{self.bucket.dataproxy_entity_name}/{self.name}", params={ + "redirect": False + }) + return resp.json().get("url") + + def get_content(self): + url = self.get_download_link() + # Auth header must **NOT** be attached to the download link obtained, or we will get 401 + return requests.get(url).content + + @classmethod + def from_json(cls, client, bucket, file_json: Dict[str, Any]): + return cls(client, bucket, **file_json) + + + @on_401_raise_unauthorized("Unauthorized") + def delete(self): + resp = self.client.delete(f"/v1/{self.bucket.target}/{self.bucket.dataproxy_entity_name}/{self.name}") + json_resp = resp.json() + assert "failures" in json_resp + assert len(json_resp.get("failures")) == 0 diff --git a/ebrains_drive/utils.py b/ebrains_drive/utils.py index 9e0d684739..88d76dd11f 100644 --- a/ebrains_drive/utils.py +++ b/ebrains_drive/utils.py @@ -1,8 +1,10 @@ import string import random +import inspect from functools import wraps +from typing import Type from urllib.parse import urlencode -from ebrains_drive.exceptions import ClientHttpError, DoesNotExist +from ebrains_drive.exceptions import ClientHttpError, DoesNotExist, Unauthorized def randstring(length=0): if length == 0: @@ -20,6 +22,40 @@ def urljoin(base, *args): url = url[:-1] return url +def _raise_on(http_code: int, Ex: Type[Exception]): + """Decorator factory funciton to turn a function that get a http http_code response + to a `Ex` exception.""" + def raise_on(msg: str): + def decorator(func): + + if inspect.isgeneratorfunction(func): + @wraps(func) + def wrapped(*args, **kwargs): + try: + yield from func(*args, **kwargs) + except ClientHttpError as e: + if e.code == http_code: + raise Ex(msg) + else: + raise e + return wrapped + + else: + @wraps(func) + def wrapped(*args, **kwargs): + try: + return func(*args, **kwargs) + except ClientHttpError as e: + if e.code == http_code: + raise Ex(msg) + else: + raise e + return wrapped + return decorator + return raise_on + +on_401_raise_unauthorized = _raise_on(401, Unauthorized) + def raise_does_not_exist(msg): """Decorator to turn a function that get a http 404 response to a :exc:`DoesNotExist` exception.""" @@ -44,6 +80,7 @@ def to_utf8(obj): def querystr(**kwargs): return '?' + urlencode(kwargs) +# not used? def utf8lize(obj): if isinstance(obj, dict): return {k: to_utf8(v) for k, v in obj.items()} diff --git a/tests/test_bucket.py b/tests/test_bucket.py new file mode 100644 index 0000000000..27ad0096a8 --- /dev/null +++ b/tests/test_bucket.py @@ -0,0 +1,73 @@ +import pytest +from unittest.mock import MagicMock +from ebrains_drive.bucket import Bucket +from ebrains_drive.exceptions import ClientHttpError, Unauthorized + +class MockClient: + def get(self, *args, **kwargs): + raise NotImplementedError + def put(self, *args, **kwargs): + raise NotImplementedError + +class MockHttpResp: + def __init__(self, resp): + self.resp = resp + def json(self): + return self.resp + +bucket_json={ + 'name': 'foo', + 'objects_count': 12, + 'bytes': 112233, + 'last_modified': 'foo-bar', + 'is_public': False, + 'role': 'admin', +} + +file_json1={ + 'name': 'foo', + 'hash': 'hash-foo', + 'last_modified': 'last-modified', + 'bytes': 123, + 'content_type': 'json' +} + + + +def test_from_json(): + client = MockClient() + bucket = Bucket.from_json(client, bucket_json) + assert isinstance(bucket, Bucket) + +def test_ls_when_raise_client_error(): + + client = MockClient() + client.get = MagicMock() + client.get.side_effect = [ + ClientHttpError(401, "foo-bar") + ] + + bucket = Bucket.from_json(client, bucket_json) + + try: + fs = [f for f in bucket.ls()] + raise Exception("did not raise") + except Exception as e: + assert isinstance(e, Unauthorized), f"Expect raise Unauthorized: {e}" + +def test_ls_when_repeats(): + + client = MockClient() + client.get = MagicMock() + client.get.side_effect = [ + MockHttpResp({ + 'objects': [file_json1, file_json1] + }) + ] + bucket = Bucket.from_json(client, bucket_json) + + try: + fs = [f for f in bucket.ls()] + raise Exception("did not raise") + except Exception as e: + assert isinstance(e, RuntimeError), f"Expect raise RuntimeError: {e}" diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000000..960867e1f6 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,16 @@ +import inspect +import pytest +from ebrains_drive.utils import on_401_raise_unauthorized + +def generator_fn(): + yield 1 + +test_401_parameters = [ + (generator_fn, True), + (lambda: 1, False) +] + +@pytest.mark.parametrize('func,is_generator', test_401_parameters) +def test_on_401_wrap(func,is_generator): + wrapped_fn = on_401_raise_unauthorized('oh noes')(func) + assert inspect.isgeneratorfunction(wrapped_fn) == is_generator