def get_image_type(imgname, imgdata=None):
    '''Return the image format name for imgname (or imgdata), or None.

    When imgdata is given it is the raw content that gets inspected and
    imgname is not read; otherwise the file at pathof(imgname) is inspected.

    The returned names follow what the previous imghdr-based implementation
    produced ('jpeg', 'png', 'gif', 'tiff', ...) rather than filetype's file
    extensions ('jpg', 'tif'), so comparisons written against the old names
    keep matching.  None is returned when the content is not recognised as
    an image.
    '''
    source = imgdata if imgdata is not None else pathof(imgname)
    kind = filetype.guess(source)

    # filetype also recognises archives, documents, fonts and so on, while
    # imghdr only ever reported image formats.  Keep that contract: anything
    # that is not an image is "unknown" to this function.
    if kind is None or not kind.mime.startswith('image/'):
        return None

    extension = unicode_str(kind.extension)

    # filetype names formats by file extension; translate the ones that differ
    # from the names imghdr used to return.
    # NOTE(review): callers outside this block presumably test for 'jpeg' --
    # confirm against get_image_size and the unpacker.
    legacy_names = {'jpg': 'jpeg', 'tif': 'tiff'}
    return legacy_names.get(extension, extension)
import os
import tempfile
import shutil
from baca.tools.KindleUnpack.mobi_cover import get_image_type, get_image_size


def test_mobi_file_cover_extraction():
    """Unpack the bundled mobi fixture and check every image it yields.

    Every extracted file with an image extension must be identified by
    get_image_type as one of the expected formats.  Whenever get_image_size
    reports dimensions for such a file, both of them must be positive.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    mobi_file = os.path.join(here, "fixtures", "room-with-a-view-gutenberg.mobi")

    # Nothing to unpack without the fixture, so skip rather than fail.
    if not os.path.exists(mobi_file):
        import pytest
        pytest.skip(f"Test mobi file not found: {mobi_file}")

    temp_dir = tempfile.mkdtemp()

    try:
        # Imported here, as in the original, so the unpacker is only loaded
        # once the fixture is known to exist.
        from baca.tools.KindleUnpack.kindleunpack import unpackBook

        unpackBook(mobi_file, temp_dir)

        # Gather every file in the unpacked tree that carries an image suffix.
        image_suffixes = ('.jpg', '.jpeg', '.png', '.gif')
        image_files = []
        for root, _dirs, files in os.walk(temp_dir):
            image_files.extend(
                os.path.join(root, filename)
                for filename in files
                if filename.lower().endswith(image_suffixes)
            )

        assert image_files, "No images found in extracted mobi file"

        for img_path in image_files:
            img_type = get_image_type(img_path)
            assert img_type is not None, f"Could not determine type for {img_path}"
            assert img_type in ['jpg', 'jpeg', 'png', 'gif'], f"Unexpected image type: {img_type}"

            # Dimensions are only checked when get_image_size reports them.
            size = get_image_size(img_path)
            if size is None:
                continue
            width, height = size
            assert width > 0, f"Invalid width for {img_path}"
            assert height > 0, f"Invalid height for {img_path}"

    finally:
        # Always remove the scratch directory, even when an assertion failed.
        shutil.rmtree(temp_dir, ignore_errors=True)