diff --git a/CHANGES.md b/CHANGES.md index 702432a4b..ea167afdf 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -4,6 +4,7 @@ Changelog 3.0.0b134 (unreleased) ---------------------- +- change all references of md5 to sha256 - Fix mass copy/paste functionality diff --git a/castle/cms/_scripts/md5_to_sha256.py b/castle/cms/_scripts/md5_to_sha256.py new file mode 100644 index 000000000..127da29a5 --- /dev/null +++ b/castle/cms/_scripts/md5_to_sha256.py @@ -0,0 +1,54 @@ +import argparse +import hashlib +import logging + +from castle.cms.files import aws +from castle.cms.services.google.youtube import get_youtube_service +import plone.api as api +from zope.component.hooks import setSite + + +logger = logging.getLogger(__name__) +RESOURCES_KEY_PREFIX = 'archiveresources/' + +parser = argparse.ArgumentParser( + description='...') +parser.add_argument('--site-id', dest='site_id', default='Castle') +args, _ = parser.parse_known_args() + + +def update_s3_resources(): + bucket_name = api.portal.get_registry_record( + 'castle.aws_s3_bucket_name') + s3_conn, bucket = aws.get_bucket(s3_bucket=bucket_name) + objects_summary_list = bucket.objects.all() + for object_summary in objects_summary_list: + md5_key = object_summary.key + obj = s3_conn.get_object(key=md5_key) + fidata = obj.get('Body') + copy_source = { + 'Bucket': bucket.name, + 'Key': md5_key + } + sha256 = hashlib.sha256(fidata).hexdigest() + sha256_content_path = '{0}{1}/{2}/{3}/{4}'.format( + RESOURCES_KEY_PREFIX, sha256[0], sha256[1], sha256[2], sha256 + ) + try: + # check if the key already exists + bucket.Object(sha256_content_path).load() + logger.info(f'key {md5_key} already exists, skipping') + except botocore.exceptions.ClientError: + # this is what we want, the sha256 hashed version shouldn't exist yet + new_obj = bucket.Object(sha256_content_path) + resp_copy = new_obj.copy(copy_source) + if resp_copy.status == 200: + resp_delete = bucket.Object(md5_key).delete() + if resp_delete.status != 200: + logger.error(f'Unable to delete md5 hashed key {md5_key}. This key should not be used for security purposes') + raise Exception(f'Unable to delete key {md5_key}.') + +if __name__ == '__main__': + site = app[args.site_id] # noqa + setSite(site) + update_s3_resources(site) diff --git a/castle/cms/_scripts/templates/watch-run.py b/castle/cms/_scripts/templates/watch-run.py index 457c9f6f3..8dc7363a1 100644 --- a/castle/cms/_scripts/templates/watch-run.py +++ b/castle/cms/_scripts/templates/watch-run.py @@ -12,12 +12,12 @@ times = {} -def md5(fname): - hash_md5 = hashlib.md5() +def sha256(fname): + hash_sha256 = hashlib.sha256() with open(fname, "rb") as f: for chunk in iter(lambda: f.read(4096), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() + hash_sha256.update(chunk) + return hash_sha256.hexdigest() def check(directory): @@ -29,7 +29,7 @@ def check(directory): if check(filepath): changed = True else: - hashed = md5(filepath) + hashed = sha256(filepath) current_hash = times.get(filepath, None) times[filepath] = hashed diff --git a/castle/cms/archival.py b/castle/cms/archival.py index 4c6dbbcfd..88ff7235f 100644 --- a/castle/cms/archival.py +++ b/castle/cms/archival.py @@ -323,7 +323,7 @@ def __init__(self, site, UrlOpener=SubrequestUrlOpener): except AttributeError: self.archives = self.site._archives = OOBTree() self.path_to_uid = self.site._archives_path_to_uid = OOBTree() - self.resources = {} # path -> md5 dict + self.resources = {} # path -> sha256 dict self.errors = [] # so we can ignore try: @@ -424,10 +424,10 @@ def move_resource(self, url, keep_ext=False, use_vhm=True): fidata = fidata.replace(sub_url, new_url) # upload to amazon and get url! - md5 = hashlib.md5(fidata).hexdigest() + sha256 = hashlib.sha256(fidata).hexdigest() content_path = '{0}{1}/{2}/{3}/{4}'.format( - RESOURCES_KEY_PREFIX, md5[0], md5[1], md5[2], md5 + RESOURCES_KEY_PREFIX, sha256[0], sha256[1], sha256[2], sha256 ) if keep_ext and '.' in url: ext = url.split('.')[-1] diff --git a/castle/cms/browser/content/__init__.py b/castle/cms/browser/content/__init__.py index 479f8b4ef..ca4fc3895 100644 --- a/castle/cms/browser/content/__init__.py +++ b/castle/cms/browser/content/__init__.py @@ -250,12 +250,12 @@ def _clean_tmp(self, info): def detect_duplicate(self, info): dup_detector = duplicates.DuplicateDetector() - md5_hash = commands.md5(info['tmp_file']) - obj = dup_detector.get_object(md5_hash) + sha256_hash = commands.sha256(info['tmp_file']) + obj = dup_detector.get_object(sha256_hash) if obj is not None: # found, use existing file... raise duplicates.DuplicateException(obj) - return dup_detector, md5_hash + return dup_detector, sha256_hash def handle_auto_folder_creation(self, folder, type_): # we only auto publish built-in repositories, otherwise, leave it be @@ -366,18 +366,18 @@ def create_file_content(self, info): original_location == location folder = utils.recursive_create_path(self.context, location) - md5_hash = dup_detector = None + sha256_hash = dup_detector = None if original_location == location: # first off, check first that it wasn't already uploaded... # we are only checking images for now... - if type_ == 'Image' and commands.md5: - dup_detector, md5_hash = self.detect_duplicate(info) + if type_ == 'Image' and commands.sha256: + dup_detector, sha256_hash = self.detect_duplicate(info) self.handle_auto_folder_creation(folder, type_) obj = self.create_object(folder, type_, info) - if type_ == 'Image' and md5_hash and dup_detector: - dup_detector.register(obj, md5_hash) + if type_ == 'Image' and sha256_hash and dup_detector: + dup_detector.register(obj, sha256_hash) return obj def create_object(self, folder, type_, info): diff --git a/castle/cms/browser/security/chat.py b/castle/cms/browser/security/chat.py index 44844d24a..c268cb12e 100644 --- a/castle/cms/browser/security/chat.py +++ b/castle/cms/browser/security/chat.py @@ -33,6 +33,8 @@ def __call__(self): for key in keyring: if key is None: continue + # this code matches the hashing code in rocket chat and cannot + # be updated to sha256 value = hmac.new(key, user + salt, sha).hexdigest() if _is_equal(value, token): return json.dumps({ diff --git a/castle/cms/commands.py b/castle/cms/commands.py index 61483544c..d4a8ac789 100644 --- a/castle/cms/commands.py +++ b/castle/cms/commands.py @@ -195,6 +195,41 @@ def __call__(self, filepath): logger.warn('gs not installed. Some metadata might remain in PDF files.') +class SHA256SubProcess(BaseSubProcess): + """ + To get sha256 hash of files on the filesystem so + large files do not need to be loaded into + memory to be checked + """ + if os.name == 'nt': + bin_name = 'sha256sum.exe' + else: + bin_name = 'sha256sum' + + def __call__(self, filepath): + cmd = [self.binary, filepath] + hashval = self._run_command(cmd) + try: + val = hashval.split('=')[1].strip() + return val + except: + try: + val = hashval.split(' ')[0].strip() + return val + except IOError: + logger.exception("No sha256 installed. castle.cms " + "will not be able to detect sha256 of files.") + return None + + +try: + sha256 = SHA256SubProcess() +except IOError: + sha256 = None + + +# keep the md5 classes for any residual objects that use it for hashed values +# if object uses md5 hash, it should be marked with the attribut useforsecurity = False class MD5SubProcess(BaseSubProcess): """ To get md5 hash of files on the filesystem so diff --git a/castle/cms/files/duplicates.py b/castle/cms/files/duplicates.py index 63af8ddeb..45f75dcf0 100644 --- a/castle/cms/files/duplicates.py +++ b/castle/cms/files/duplicates.py @@ -15,18 +15,18 @@ def __init__(self): self.annotations = IAnnotations(self.site) self.data = self.annotations.get(DATA_KEY) - def register(self, obj, hash): + def register(self, obj, hashed): if self.data is None: self.annotations[DATA_KEY] = OOBTree() self.data = self.annotations[DATA_KEY] - self.data[hash] = IUUID(obj) + self.data[hashed] = IUUID(obj) - def get_object(self, hash): + def get_object(self, hashed): if self.data is None: return None - if hash not in self.data: + if hashed not in self.data: return None - uuid = self.annotations[DATA_KEY][hash] + uuid = self.annotations[DATA_KEY][hashed] return uuidToObject(uuid) diff --git a/castle/cms/media/video.py b/castle/cms/media/video.py index d9ac905ad..a393cee79 100644 --- a/castle/cms/media/video.py +++ b/castle/cms/media/video.py @@ -3,7 +3,7 @@ from shutil import copyfile, rmtree from tempfile import mkdtemp -from castle.cms.commands import avconv, md5 +from castle.cms.commands import avconv, sha256, md5 from castle.cms.files import aws from castle.cms.services.google import youtube from collective.celery.utils import getCelery @@ -34,14 +34,22 @@ def process(context): return # by default, assume all non-mp4 videos need to be converted - # but in reality, all videos need converting, even mp4. - # md5 is only what makes this possible convert_it = video.contentType.split('/')[-1] != 'mp4' - if md5 is not None: - old_hash = getattr(context, '_file_hash', None) - current_hash = md5(bfilepath) - if old_hash is None or old_hash != current_hash: - convert_it = True + old_hash = getattr(context, '_file_hash', None) + if old_hash is None: + # video has not been converted + context.__setattr__('useforsecurity', True) + if context.useforsecurity is None: + # video has been converted but hashed with md5 + context.__setattr__('useforsecurity', False) + if context.useforsecurity is False: + if md5 is not None: + current_hash = md5(bfilepath) + else: + if sha256 is not None: + current_hash = sha256(bfilepath) + if old_hash is None or old_hash != current_hash: + convert_it = True if context.image and not convert_it: # already an mp4 and already has a screen grab @@ -51,7 +59,8 @@ def process(context): try: youtube.upload(context, bfilepath, filename=video.filename) # saving hash tells us we do not need to convert anymore... - context._file_hash = md5(bfilepath) + context.__setattr__('useforsecurity', True) + context._file_hash = sha256(bfilepath) convert_it = False except Exception: logger.error('Error uploading youtube video', exc_info=True) @@ -68,11 +77,12 @@ def process(context): logger.info('Could not convert video', exc_info=True) if (os.path.exists(output_filepath) and os.path.getsize(output_filepath) > 0): - if md5 is not None: + if sha256 is not None: try: - context._file_hash = md5(output_filepath) + context.__setattr__('useforsecurity', True) + context._file_hash = sha256(output_filepath) except Exception: - logger.info('Could not get md5', exc_info=True) + logger.info('Could not get sha256', exc_info=True) if not getCelery().conf.task_always_eager: context._p_jar.sync() fi = open(output_filepath) diff --git a/castle/cms/tasks/files.py b/castle/cms/tasks/files.py index 0b42729d8..af2b15174 100644 --- a/castle/cms/tasks/files.py +++ b/castle/cms/tasks/files.py @@ -1,7 +1,7 @@ import logging import transaction -from castle.cms.commands import md5 +from castle.cms.commands import md5, sha256 from castle.cms.files import aws from castle.cms.media import video from castle.cms.services.google import youtube @@ -87,13 +87,17 @@ def youtube_video_edited(obj): # we can only edit if file has NOT changed # otherwise, it will be reuploaded and original deleted - if md5 is not None: - old_hash = getattr(obj, '_file_hash', None) - if old_hash is not None: - current_hash = md5(bfilepath) - if old_hash != current_hash: - # dive out, we don't want to edit - return + old_hash = getattr(obj, '_file_hash', None) + if old_hash is not None: + if obj.useforsecurity is True: + if sha256 is not None: + current_hash = sha256(bfilepath) + if obj.useforsecurity is False: + if md5 is not None: + current_hash = md5(bfilepath) + if old_hash != current_hash: + # dive out, we don't want to edit + return youtube.edit(obj) diff --git a/castle/cms/tests/test_content.py b/castle/cms/tests/test_content.py index 810099830..bffbac5bd 100644 --- a/castle/cms/tests/test_content.py +++ b/castle/cms/tests/test_content.py @@ -124,6 +124,39 @@ def test_upload(self): self.assertEquals(fileOb.file.data, 'X' * 1024 * 5) return fileOb + def test_upload_image(self): + self.request.form.update({ + 'action': 'chunk-upload', + 'chunk': '1', + 'chunkSize': 1024, + 'totalSize': 1024 * 5, + 'image': BytesIO('X' * 1024), + 'name': 'foobar.jpg' + }) + cc = content.Creator(self.portal, self.request) + data = json.loads(cc()) + self.assertTrue('id' in data) + self.assertTrue(data['success']) + self.request.form.update({ + 'id': data['id'] + }) + + for idx in range(4): + self.request.form.update({ + 'action': 'chunk-upload', + 'chunk': str(idx + 2), + 'image': BytesIO('X' * 1024) + }) + cc = content.Creator(self.portal, self.request) + data = json.loads(cc()) + self.assertTrue(data['success']) + + self.assertTrue(data['valid']) + self.assertTrue('url' in data) + imgOb = api.content.get(path='/image-repository/foobar.jpg') + self.assertEquals(imgOb.image.data, 'X' * 1024 * 5) + return imgOb + def test_update_upload(self): fileOb = self.test_upload() api.content.transition(obj=fileOb, to_state='published')