Skip to content

Commit c06cbd7

Browse files
sipstore: adds sipstore implementation
Signed-off-by: Ioannis Tsanaktsidis <ioannis.tsanaktsidis@cern.ch>
1 parent b916a06 commit c06cbd7

File tree

11 files changed

+333
-34
lines changed

11 files changed

+333
-34
lines changed

cap/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -642,3 +642,6 @@ def _(x):
642642
# ================
643643
REANA_ACCESS_TOKEN = os.environ.get(
644644
'APP_REANA_ACCESS_TOKEN', None)
645+
646+
SIPSTORE_DEFAULT_AGENT_JSONSCHEMA = 'sipstore/agent-v0.0.1.json'
647+
SIPSTORE_DEFAULT_BAGIT_JSONSCHEMA = 'sipstore/bagit-v0.0.1.json'
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"allow_all": true,
3+
"experiment": null,
4+
"fullname": null,
5+
"is_deposit": false,
6+
"jsonschema": {
7+
"type": "object",
8+
"title": "SIPStore Agent schema.",
9+
"description": "User agent information making the SIP.",
10+
"properties": {
11+
"orcid": {
12+
"type": "string"
13+
},
14+
"email": {
15+
"type": "string"
16+
},
17+
"ip_address": {
18+
"type": "string"
19+
}
20+
}
21+
}
22+
}
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
{
2+
"allow_all": true,
3+
"experiment": null,
4+
"fullname": null,
5+
"is_deposit": false,
6+
"jsonschema": {
7+
"definitions": {
8+
"file": {
9+
"type": "object",
10+
"title": "Archived file information.",
11+
"description": "JSON describing a single file.",
12+
"additionalProperties": false,
13+
"properties": {
14+
"filepath": {
15+
"description":
16+
"Filepath to the archived file, relative to the archived directory root.",
17+
"type": "string"
18+
},
19+
"fullpath": {
20+
"description":
21+
"Absolute filepath to the file in the archive file system.",
22+
"type": "string"
23+
},
24+
"size": {
25+
"description": "Size of the file in bytes.",
26+
"type": "number"
27+
},
28+
"checksum": {
29+
"description":
30+
"MD5 checksum of the file. Always starts with 'md5:' prefix.",
31+
"type": "string"
32+
},
33+
"file_uuid": {
34+
"description":
35+
"UUID of the related FileInstance object. Used for Record's data files only.",
36+
"type": "string"
37+
},
38+
"metadata_id": {
39+
"description":
40+
"ID of the type (SIPMetadataType.id) of the related SIPMetadata object. Used for Record's metadata files only.",
41+
"type": "number"
42+
},
43+
"sipfilepath": {
44+
"description":
45+
"Original SIPFile.filepath value. Used for Record's data files only.",
46+
"type": "string"
47+
},
48+
"filename": {
49+
"description":
50+
"Filename of the SIPFile in the archive. Used for Record's data files only.",
51+
"type": "string"
52+
},
53+
"content": {
54+
"description":
55+
"Text-content of the file. Used for BagIt metadata files only.",
56+
"type": "string"
57+
},
58+
"fetched": {
59+
"description":
60+
"Marks whether given file is fetched from another bag (specified in 'fetch.txt'). If the key does not exist or is set to false, it is assumed that the file is written down in the bag, hence NOT fetched. Used for Record's data files only.",
61+
"type": "boolean"
62+
}
63+
},
64+
"required": ["filepath", "fullpath", "size", "checksum"]
65+
}
66+
},
67+
"properties": {
68+
"files": {
69+
"description": "All files stored in this archive package.",
70+
"type": "array",
71+
"items": {
72+
"$ref": "#/definitions/file"
73+
}
74+
}
75+
}
76+
}
77+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"allow_all": true,
3+
"experiment": null,
4+
"fullname": null,
5+
"is_deposit": false,
6+
"jsonschema": {
7+
"properties": {
8+
"filepath": {
9+
"description":
10+
"Filepath to the archived file, relative to the archived directory root.",
11+
"type": "string"
12+
},
13+
"fullpath": {
14+
"description":
15+
"Absolute filepath to the file in the archive file system.",
16+
"type": "string"
17+
},
18+
"size": {
19+
"description": "Size of the file in bytes.",
20+
"type": "number"
21+
},
22+
"checksum": {
23+
"description":
24+
"MD5 checksum of the file. Always starts with 'md5:' prefix.",
25+
"type": "string"
26+
},
27+
"file_uuid": {
28+
"description":
29+
"UUID of the related FileInstance object. Used for Record's data files only.",
30+
"type": "string"
31+
},
32+
"metadata_id": {
33+
"description":
34+
"ID of the type (SIPMetadataType.id) of the related SIPMetadata object. Used for Record's metadata files only.",
35+
"type": "number"
36+
},
37+
"sipfilepath": {
38+
"description":
39+
"Original SIPFile.filepath value. Used for Record's data files only.",
40+
"type": "string"
41+
},
42+
"filename": {
43+
"description":
44+
"Filename of the SIPFile in the archive. Used for Record's data files only.",
45+
"type": "string"
46+
},
47+
"content": {
48+
"description":
49+
"Text-content of the file. Used for BagIt metadata files only.",
50+
"type": "string"
51+
}
52+
},
53+
"required": ["filepath", "fullpath", "size", "checksum"]
54+
}
55+
}

cap/modules/deposit/api.py

Lines changed: 83 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@
2727
from __future__ import absolute_import, print_function
2828

2929
import copy
30-
import shutil
31-
import tempfile
3230
from copy import deepcopy
3331
from functools import wraps
3432

@@ -43,29 +41,36 @@
4341
from invenio_files_rest.errors import MultipartMissingParts
4442
from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
4543
from invenio_jsonschemas.errors import JSONSchemaNotFound
44+
from invenio_pidstore.errors import PIDDoesNotExistError
4645
from invenio_records.models import RecordMetadata
4746
from invenio_records_files.models import RecordsBuckets
4847
from invenio_rest.errors import FieldError
48+
49+
from invenio_sipstore.api import RecordSIP, SIP as SIPApi
50+
from invenio_sipstore.archivers import BagItArchiver
51+
from invenio_sipstore.models import SIP as SIPModel, \
52+
RecordSIP as RecordSIPModel
53+
4954
from jsonschema.validators import Draft4Validator, RefResolutionError
5055
from sqlalchemy.exc import IntegrityError
5156
from sqlalchemy.orm.exc import NoResultFound
5257
from werkzeug.local import LocalProxy
5358

54-
from cap.config import FILES_URL_MAX_SIZE
5559
from cap.modules.records.api import CAPRecord
5660
from cap.modules.repoimporter.repo_importer import RepoImporter
5761
from cap.modules.schemas.models import Schema
5862
from cap.modules.user.errors import DoesNotExistInLDAP
5963
from cap.modules.user.utils import (get_existing_or_register_role,
6064
get_existing_or_register_user)
6165

62-
from .errors import (DepositValidationError, FileUploadError,
66+
from .errors import (ArchivingError, DepositValidationError, FileUploadError,
6367
UpdateDepositPermissionsError)
6468
from .fetchers import cap_deposit_fetcher
6569
from .minters import cap_deposit_minter
6670
from .permissions import (AdminDepositPermission, CloneDepositPermission,
6771
DepositAdminActionNeed, DepositReadActionNeed,
6872
DepositUpdateActionNeed, UpdateDepositPermission)
73+
from .utils import compare_files, task_commit, ensure_content_length
6974

7075
_datastore = LocalProxy(lambda: current_app.extensions['security'].datastore)
7176

@@ -197,7 +202,52 @@ def publish(self, *args, **kwargs):
197202
if file_.data['checksum'] is None:
198203
raise MultipartMissingParts()
199204

200-
return super(CAPDeposit, self).publish(*args, **kwargs)
205+
try:
206+
_, last_record = self.fetch_published()
207+
is_first_publishing = False
208+
fetched_files = last_record.files
209+
create_sip_files = not compare_files(fetched_files, self.files)
210+
except (PIDDoesNotExistError, KeyError):
211+
is_first_publishing = True
212+
create_sip_files = True if self.files else False
213+
214+
deposit = super(CAPDeposit, self).publish(*args, **kwargs)
215+
recid, record = deposit.fetch_published()
216+
sip_patch_of = None
217+
if not is_first_publishing:
218+
sip_recid = recid
219+
220+
sip_patch_of = (
221+
db.session.query(SIPModel)
222+
.join(RecordSIPModel, RecordSIPModel.sip_id == SIPModel.id)
223+
.filter(RecordSIPModel.pid_id == sip_recid.id)
224+
.order_by(SIPModel.created.desc())
225+
.first()
226+
)
227+
228+
recordsip = RecordSIP.create(
229+
recid, record, archivable=True,
230+
create_sip_files=create_sip_files,
231+
sip_metadata_type='json',
232+
user_id=current_user.id,
233+
agent=None)
234+
235+
archiver = BagItArchiver(
236+
recordsip.sip, include_all_previous=(not is_first_publishing),
237+
patch_of=sip_patch_of)
238+
239+
archiver.save_bagit_metadata()
240+
241+
sip = (
242+
RecordSIPModel.query
243+
.filter_by(pid_id=recid.id)
244+
.order_by(RecordSIPModel.created.desc())
245+
.first().sip
246+
)
247+
248+
archive_sip.delay(str(sip.id))
249+
250+
return deposit
201251

202252
@mark_as_action
203253
def upload(self, pid=None, *args, **kwargs):
@@ -601,32 +651,31 @@ def download_repo(pid, url, filename):
601651
task_commit(record, response.raw, filename, total)
602652

603653

604-
def task_commit(record, response, filename, total):
    """Write ``response`` into the record file ``filename`` and commit.

    :param record: Object exposing ``files``; the entry under ``filename``
        receives the contents.
    :param response: Stream passed to ``set_contents`` as the file data.
    :param filename: Key of the target entry in ``record.files``.
    :param total: Value passed to ``set_contents`` as ``size``.
    """
    target_file = record.files[filename].file
    # The contents are stored in the location of the record's own bucket.
    location_uri = record.files.bucket.location.uri
    target_file.set_contents(
        response, default_location=location_uri, size=total)
    db.session.commit()
612-
613-
614-
def ensure_content_length(
        url, method='GET',
        session=None,
        max_size=FILES_URL_MAX_SIZE or 2**20,
        *args, **kwargs):
    """Request ``url`` and make sure the response has a Content-Length.

    The request is always issued with ``stream=True``. If the response has
    no ``Content-Length`` header, the raw body is copied into a spooled
    temporary file, its size is written into the header, and the temporary
    file replaces the response's underlying raw file object.

    :param url: URL to request.
    :param method: HTTP method passed to ``session.request``.
    :param session: Session used for the request; a new
        ``requests.Session`` is created when none is given.
    :param max_size: ``max_size`` argument for the
        ``SpooledTemporaryFile`` that buffers the body.
    :returns: The response returned by ``session.request``, possibly with
        its header and raw file object patched as described above.
    """
    kwargs['stream'] = True
    if not session:
        session = requests.Session()
    resp = session.request(method, url, *args, **kwargs)

    if 'Content-Length' in resp.headers:
        return resp

    # No size advertised: buffer the body so the real size can be measured.
    buffered = tempfile.SpooledTemporaryFile(max_size)
    shutil.copyfileobj(resp.raw, buffered)
    resp.headers['Content-Length'] = str(buffered.tell())
    buffered.seek(0)
    # Swap the original socket-backed file object for the buffered copy.
    resp.raw._fp.close()
    resp.raw._fp = buffered
    return resp
654+
@shared_task(ignore_result=True, max_retries=6,
             default_retry_delay=4 * 60 * 60)
def archive_sip(sip_uuid):
    """Write the files of a SIP to the archive and flag it as archived.

    The task is declared with six retries and a four-hour retry delay, so
    a failing archiving system is retried for roughly 24 hours.

    :param sip_uuid: UUID of the SIP for archiving.
    :type sip_uuid: str
    :raises ArchivingError: If the SIP has no BagIt metadata or is already
        flagged as archived. These failures are re-raised without a retry.
    """
    try:
        sip = SIPApi(SIPModel.query.get(sip_uuid))
        archiver = BagItArchiver(sip)
        if archiver.get_bagit_metadata(sip) is None:
            raise ArchivingError(
                'Bagit metadata does not exist for SIP: {0}.'.format(sip.id))
        if sip.archived:
            raise ArchivingError(
                'SIP was already archived {0}.'.format(sip.id))
        archiver.write_all_files()
        sip.archived = True
        db.session.commit()
    except ArchivingError:
        # Raised deliberately above; a retry would not change the outcome.
        raise
    except Exception as exc:
        # Any other failure schedules another attempt of this task.
        archive_sip.retry(exc=exc)
        raise

cap/modules/deposit/errors.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@
2828
from invenio_rest.errors import RESTException
2929

3030

31+
class ArchivingError(Exception):
    """Error raised when a SIP cannot be archived by the archiving task."""
33+
34+
3135
class DepositDoesNotExist(Exception):
3236
"""Deposit with given key does not exist exception."""
3337

0 commit comments

Comments
 (0)