From 3ac2c20465efce9aa77a03f67ea50e2ac5464553 Mon Sep 17 00:00:00 2001 From: Farzad Daei Date: Mon, 16 Feb 2026 01:25:39 -0800 Subject: [PATCH 1/3] feat(backup): add option to skip media download for some chats --- .env.example | 6 +++ .pre-commit-config.yaml | 4 +- src/config.py | 25 +++++++++++ src/listener.py | 2 +- src/telegram_backup.py | 7 +++- tests/test_config.py | 93 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 133 insertions(+), 4 deletions(-) diff --git a/.env.example b/.env.example index b3a0799..4a48094 100644 --- a/.env.example +++ b/.env.example @@ -45,6 +45,12 @@ BATCH_SIZE=100 # Comma-separated chat IDs to process FIRST in all operations # PRIORITY_CHAT_IDS= +# Skip media downloads for specific chats (comma-separated IDs) +# Messages are still backed up with text, but media files are not downloaded +# Useful for high-volume media chats where you only need text content +# Example: SKIP_MEDIA_CHAT_IDS=-1001234567890,-1009876543210 +# SKIP_MEDIA_CHAT_IDS= + # Hour (0-23) to recalculate backup statistics daily # STATS_CALCULATION_HOUR=3 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2ecd9e1..9c1744c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-yaml - id: check-toml @@ -12,7 +12,7 @@ repos: args: [--maxkb=500] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.9.6 + rev: v0.15.1 hooks: - id: ruff args: [--fix] diff --git a/src/config.py b/src/config.py index 504c1c1..66f217c 100644 --- a/src/config.py +++ b/src/config.py @@ -95,6 +95,9 @@ def __init__(self): # Useful for ensuring important chats are always backed up first self.priority_chat_ids = self._parse_id_list(os.getenv("PRIORITY_CHAT_IDS", "")) + # Skip media downloads for specific chats (but still backup message text) + self.skip_media_chat_ids = self._parse_id_list(os.getenv("SKIP_MEDIA_CHAT_IDS", "")) 
+ # Session configuration self.session_name = os.getenv("SESSION_NAME", "telegram_backup") @@ -266,6 +269,8 @@ def __init__(self): ) if self.display_chat_ids: logger.info(f"Display mode: Viewer restricted to chat IDs {self.display_chat_ids}") + if self.skip_media_chat_ids: + logger.info(f"Media downloads skipped for chat IDs: {self.skip_media_chat_ids}") def _parse_id_list(self, id_str: str) -> set: """Parse comma-separated ID string into a set of integers.""" @@ -412,6 +417,26 @@ def get_max_media_size_bytes(self) -> int: """Get maximum media file size in bytes.""" return self.max_media_size_mb * 1024 * 1024 + def should_download_media_for_chat(self, chat_id: int) -> bool: + """ + Determine if media should be downloaded for a specific chat. + + Args: + chat_id: Telegram chat ID (marked format) + + Returns: + True if media should be downloaded, False if skipped + """ + # If global media download is disabled, return False + if not self.download_media: + return False + + # Check if chat is in skip list + if chat_id in self.skip_media_chat_ids: + return False + + return True + def validate_credentials(self): """Ensure Telegram credentials are present.""" if not all([self.api_id, self.api_hash, self.phone]): diff --git a/src/listener.py b/src/listener.py index f08972a..19799de 100644 --- a/src/listener.py +++ b/src/listener.py @@ -842,7 +842,7 @@ async def on_new_message(event: events.NewMessage.Event) -> None: # v6.0.0: Handle media - create Media record AFTER message exists if media_type: # Download media immediately if enabled - if self.config.listen_new_messages_media and self.config.download_media: + if self.config.listen_new_messages_media and self.config.should_download_media_for_chat(chat_id): try: media_path = await self._download_media(message, chat_id) if media_path: diff --git a/src/telegram_backup.py b/src/telegram_backup.py index 847fd58..8291cec 100644 --- a/src/telegram_backup.py +++ b/src/telegram_backup.py @@ -503,6 +503,11 @@ async def 
_verify_and_redownload_media(self) -> None: failed = 0 for chat_id, records in by_chat.items(): + # Skip media verification for chats in skip list + if chat_id in self.config.skip_media_chat_ids: + logger.debug(f"Skipping media verification for chat {chat_id} (in SKIP_MEDIA_CHAT_IDS)") + continue + try: # Get message IDs to fetch message_ids = [r["message_id"] for r in records if r.get("message_id")] @@ -951,7 +956,7 @@ async def _process_message(self, message: Message, chat_id: int) -> dict: "results": results_data, } - elif self.config.download_media: + elif self.config.should_download_media_for_chat(chat_id): # v6.0.0: Download media and store data for later insertion # (media is inserted AFTER message to satisfy FK constraint) media_result = await self._process_media(message, chat_id) diff --git a/tests/test_config.py b/tests/test_config.py index c100fd8..49ddcb8 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -180,5 +180,98 @@ def test_database_dir_custom(self): self.assertTrue(config.database_path.startswith("/data/ssd")) +class TestSkipMediaChatIds(unittest.TestCase): + """Test SKIP_MEDIA_CHAT_IDS configuration for media filtering.""" + + def setUp(self): + self.temp_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_skip_media_chat_ids_empty(self): + """Skip media chat IDs defaults to empty set when not configured.""" + env_vars = {"CHAT_TYPES": "private", "BACKUP_PATH": self.temp_dir} + with patch.dict(os.environ, env_vars, clear=True): + config = Config() + self.assertEqual(config.skip_media_chat_ids, set()) + + def test_skip_media_chat_ids_single(self): + """Can configure single chat ID to skip media.""" + env_vars = { + "CHAT_TYPES": "private", + "SKIP_MEDIA_CHAT_IDS": "-1001234567890", + "BACKUP_PATH": self.temp_dir, + } + with patch.dict(os.environ, env_vars, clear=True): + config = Config() + self.assertEqual(config.skip_media_chat_ids, {-1001234567890}) + + def 
test_skip_media_chat_ids_multiple(self): + """Can configure multiple chat IDs to skip media.""" + env_vars = { + "CHAT_TYPES": "private", + "SKIP_MEDIA_CHAT_IDS": "-1001234567890,-1009876543210,123456", + "BACKUP_PATH": self.temp_dir, + } + with patch.dict(os.environ, env_vars, clear=True): + config = Config() + self.assertEqual(config.skip_media_chat_ids, {-1001234567890, -1009876543210, 123456}) + + def test_should_download_media_for_chat_normal(self): + """Should download media for chats not in skip list.""" + env_vars = { + "CHAT_TYPES": "private", + "DOWNLOAD_MEDIA": "true", + "SKIP_MEDIA_CHAT_IDS": "-1001234567890", + "BACKUP_PATH": self.temp_dir, + } + with patch.dict(os.environ, env_vars, clear=True): + config = Config() + # Should download for chats not in skip list + self.assertTrue(config.should_download_media_for_chat(123456)) + self.assertTrue(config.should_download_media_for_chat(-1009999999)) + + def test_should_download_media_for_chat_skipped(self): + """Should NOT download media for chats in skip list.""" + env_vars = { + "CHAT_TYPES": "private", + "DOWNLOAD_MEDIA": "true", + "SKIP_MEDIA_CHAT_IDS": "-1001234567890,-1009876543210", + "BACKUP_PATH": self.temp_dir, + } + with patch.dict(os.environ, env_vars, clear=True): + config = Config() + # Should NOT download for chats in skip list + self.assertFalse(config.should_download_media_for_chat(-1001234567890)) + self.assertFalse(config.should_download_media_for_chat(-1009876543210)) + + def test_should_download_media_respects_global_flag(self): + """Should respect DOWNLOAD_MEDIA=false even if not in skip list.""" + env_vars = { + "CHAT_TYPES": "private", + "DOWNLOAD_MEDIA": "false", + "SKIP_MEDIA_CHAT_IDS": "-1001234567890", + "BACKUP_PATH": self.temp_dir, + } + with patch.dict(os.environ, env_vars, clear=True): + config = Config() + # Should NOT download for ANY chat when global flag is false + self.assertFalse(config.should_download_media_for_chat(123456)) + 
self.assertFalse(config.should_download_media_for_chat(-1009999999)) + self.assertFalse(config.should_download_media_for_chat(-1001234567890)) + + def test_skip_media_chat_ids_whitespace_handling(self): + """Should handle whitespace in chat ID list correctly.""" + env_vars = { + "CHAT_TYPES": "private", + "SKIP_MEDIA_CHAT_IDS": " -1001234567890 , -1009876543210 , 123456 ", + "BACKUP_PATH": self.temp_dir, + } + with patch.dict(os.environ, env_vars, clear=True): + config = Config() + self.assertEqual(config.skip_media_chat_ids, {-1001234567890, -1009876543210, 123456}) + + if __name__ == "__main__": unittest.main() From a08fbc8ae1c1a86965cf28cb78104c344e017b04 Mon Sep 17 00:00:00 2001 From: Farzad Daei Date: Mon, 16 Feb 2026 01:50:00 -0800 Subject: [PATCH 2/3] doc: add new config to README --- README.md | 1 + docker-compose.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 469299a..f09cc8e 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,7 @@ The **Scope** column shows whether each variable applies to the backup scheduler | `VERIFY_MEDIA` | `false` | B | Re-download missing or corrupted media files | | `STATS_CALCULATION_HOUR` | `3` | B | Hour (0-23) to recalculate backup statistics daily | | `PRIORITY_CHAT_IDS` | - | B | Comma-separated chat IDs to process first in all operations | +| `SKIP_MEDIA_CHAT_IDS` | - | B | Skip media downloads for specific chats | | `LOG_LEVEL` | `INFO` | B/V | Logging verbosity: `DEBUG`, `INFO`, `WARNING`/`WARN`, `ERROR` | | **Chat Filtering** | | | See [Chat Filtering](#chat-filtering) below | | `CHAT_IDS` | - | B | **Whitelist mode**: backup ONLY these chats (ignores all other filters) | diff --git a/docker-compose.yml b/docker-compose.yml index 7404f65..e32d200 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -22,6 +22,7 @@ services: BATCH_SIZE: ${BATCH_SIZE:-100} # DEDUPLICATE_MEDIA: ${DEDUPLICATE_MEDIA:-true} # PRIORITY_CHAT_IDS: ${PRIORITY_CHAT_IDS:-} + # SKIP_MEDIA_CHAT_IDS: 
${SKIP_MEDIA_CHAT_IDS:-} # STATS_CALCULATION_HOUR: ${STATS_CALCULATION_HOUR:-3} # ======================================================================= From 04f9bcecfa06db6d333a64894de1d7390c7a56e4 Mon Sep 17 00:00:00 2001 From: Farzad Daei Date: Mon, 16 Feb 2026 10:54:54 -0800 Subject: [PATCH 3/3] feat(backup): add option to reclaim space from chats with skipped media --- .env.example | 5 ++++ README.md | 3 ++- docker-compose.yml | 1 + src/config.py | 5 +++- src/db/adapter.py | 45 ++++++++++++++++++++++++++++++++++++ src/telegram_backup.py | 52 ++++++++++++++++++++++++++++++++++++++++++ tests/test_config.py | 29 +++++++++++++++++++++++ 7 files changed, 138 insertions(+), 2 deletions(-) diff --git a/.env.example b/.env.example index 4a48094..7e690ac 100644 --- a/.env.example +++ b/.env.example @@ -51,6 +51,11 @@ BATCH_SIZE=100 # Example: SKIP_MEDIA_CHAT_IDS=-1001234567890,-1009876543210 # SKIP_MEDIA_CHAT_IDS= +# Delete existing media files and DB records for chats in SKIP_MEDIA_CHAT_IDS +# When enabled (default), reclaims storage by removing already-downloaded media +# Set to false to keep existing media but skip future downloads +SKIP_MEDIA_DELETE_EXISTING=true + # Hour (0-23) to recalculate backup statistics daily # STATS_CALCULATION_HOUR=3 diff --git a/README.md b/README.md index f09cc8e..b81ad1e 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,8 @@ The **Scope** column shows whether each variable applies to the backup scheduler | `VERIFY_MEDIA` | `false` | B | Re-download missing or corrupted media files | | `STATS_CALCULATION_HOUR` | `3` | B | Hour (0-23) to recalculate backup statistics daily | | `PRIORITY_CHAT_IDS` | - | B | Comma-separated chat IDs to process first in all operations | -| `SKIP_MEDIA_CHAT_IDS` | - | B | Skip media downloads for specific chats | +| `SKIP_MEDIA_CHAT_IDS` | - | B | Skip media downloads for specific chats (messages still backed up with text) | +| `SKIP_MEDIA_DELETE_EXISTING` | `true` | B | Delete existing media files and 
DB records for chats in skip list to reclaim storage | | `LOG_LEVEL` | `INFO` | B/V | Logging verbosity: `DEBUG`, `INFO`, `WARNING`/`WARN`, `ERROR` | | **Chat Filtering** | | | See [Chat Filtering](#chat-filtering) below | | `CHAT_IDS` | - | B | **Whitelist mode**: backup ONLY these chats (ignores all other filters) | diff --git a/docker-compose.yml b/docker-compose.yml index e32d200..7a181e5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -23,6 +23,7 @@ services: # DEDUPLICATE_MEDIA: ${DEDUPLICATE_MEDIA:-true} # PRIORITY_CHAT_IDS: ${PRIORITY_CHAT_IDS:-} # SKIP_MEDIA_CHAT_IDS: ${SKIP_MEDIA_CHAT_IDS:-} + # SKIP_MEDIA_DELETE_EXISTING: ${SKIP_MEDIA_DELETE_EXISTING:-true} # STATS_CALCULATION_HOUR: ${STATS_CALCULATION_HOUR:-3} # ======================================================================= diff --git a/src/config.py b/src/config.py index 66f217c..4ac7afc 100644 --- a/src/config.py +++ b/src/config.py @@ -97,6 +97,8 @@ def __init__(self): # Skip media downloads for specific chats (but still backup message text) self.skip_media_chat_ids = self._parse_id_list(os.getenv("SKIP_MEDIA_CHAT_IDS", "")) + # Delete existing media files and records for chats in skip list (reclaim storage) + self.skip_media_delete_existing = os.getenv("SKIP_MEDIA_DELETE_EXISTING", "true").lower() == "true" # Session configuration self.session_name = os.getenv("SESSION_NAME", "telegram_backup") @@ -270,7 +272,8 @@ def __init__(self): if self.display_chat_ids: logger.info(f"Display mode: Viewer restricted to chat IDs {self.display_chat_ids}") if self.skip_media_chat_ids: - logger.info(f"Media downloads skipped for chat IDs: {self.skip_media_chat_ids}") + cleanup_status = "will delete existing media" if self.skip_media_delete_existing else "keeps existing media" + logger.info(f"Media downloads skipped for chat IDs: {self.skip_media_chat_ids} ({cleanup_status})") def _parse_id_list(self, id_str: str) -> set: """Parse comma-separated ID string into a set of integers.""" diff --git 
a/src/db/adapter.py b/src/db/adapter.py
index b68244a..9529db8 100644
--- a/src/db/adapter.py
+++ b/src/db/adapter.py
@@ -675,6 +675,40 @@ async def insert_media(self, media_data: dict[str, Any]) -> None:
             await session.execute(stmt)
             await session.commit()
 
+    async def get_media_for_chat(self, chat_id: int) -> list[dict[str, Any]]:
+        """
+        Fetch every media row that belongs to one chat.
+
+        Args:
+            chat_id: Chat identifier
+
+        Returns:
+            One dict per media row, carrying the id, message_id, chat_id,
+            type, file_path, file_size and downloaded columns
+        """
+        exported = ("id", "message_id", "chat_id", "type", "file_path", "file_size", "downloaded")
+        query = select(Media).where(Media.chat_id == chat_id)
+        async with self.db_manager.async_session_factory() as session:
+            rows = (await session.execute(query)).scalars().all()
+            return [{column: getattr(row, column) for column in exported} for row in rows]
+
+    async def delete_media_for_chat(self, chat_id: int) -> int:
+        """
+        Remove the media rows of one chat and report how many were removed.
+        The statement targets the Media table only.
+
+        Args:
+            chat_id: Chat identifier
+
+        Returns:
+            Row count reported by the DELETE statement
+        """
+        removal = delete(Media).where(Media.chat_id == chat_id)
+        async with self.db_manager.async_session_factory() as session:
+            outcome = await session.execute(removal)
+            await session.commit()
+            return outcome.rowcount
+
     async def get_media_for_verification(self) -> list[dict[str, Any]]:
         """
         Get all media records that should have files on disk.
diff --git a/src/telegram_backup.py b/src/telegram_backup.py
index 8291cec..d41ff5b 100644
--- a/src/telegram_backup.py
+++ b/src/telegram_backup.py
@@ -592,6 +592,10 @@ async def _backup_dialog(self, dialog, is_archived: bool = False) -> int:
         chat_data = self._extract_chat_data(entity, is_archived=is_archived)
         await self.db.upsert_chat(chat_data)
 
+        # Clean up existing media if this chat is in the skip list
+        if chat_id in self.config.skip_media_chat_ids and self.config.skip_media_delete_existing:
+            await self._cleanup_existing_media(chat_id)
+
         # Ensure profile photos for users and groups/channels are backed up.
         # This runs on every dialog backup but only downloads new files when
         # Telegram reports a different profile photo.
@@ -1044,6 +1048,69 @@ async def _ensure_profile_photo(self, entity, marked_id: int = None) -> None:
             except Exception as e:
                 logger.warning(f"Failed to download avatar for {file_id}: {e}")
 
+    async def _cleanup_existing_media(self, chat_id: int) -> None:
+        """
+        Delete existing media files and database records for a chat.
+        Used when a chat is added to SKIP_MEDIA_CHAT_IDS to reclaim storage.
+
+        Cleanup is best-effort: failures are logged and never raised to the
+        caller. Media database records for the chat are removed even when some
+        files could not be deleted.
+
+        Args:
+            chat_id: Chat identifier
+        """
+        try:
+            # Get all media records for this chat
+            media_records = await self.db.get_media_for_chat(chat_id)
+            if not media_records:
+                logger.debug(f"No existing media found for chat {chat_id}")
+                return
+
+            deleted_files = 0
+            failed_files = 0
+            freed_bytes = 0
+
+            # NOTE(review): if deduplication stores one real file and symlinks to
+            # it from other chats, removing the real file here would leave those
+            # links dangling - confirm against the dedup layout.
+            for record in media_records:
+                file_path = record.get("file_path")
+                if not file_path:
+                    continue
+                try:
+                    # lstat() does not follow symlinks, so dangling links are still
+                    # removed and a link counts its own size, not its target's.
+                    file_size = os.lstat(file_path).st_size
+                    os.remove(file_path)
+                except FileNotFoundError:
+                    # Already gone: nothing to delete or count
+                    continue
+                except OSError as e:
+                    failed_files += 1
+                    logger.warning(f"Failed to delete media file {file_path}: {e}")
+                else:
+                    deleted_files += 1
+                    freed_bytes += file_size
+
+            # Delete all media records from database for this chat
+            deleted_records = await self.db.delete_media_for_chat(chat_id)
+
+            if deleted_files > 0 or deleted_records > 0:
+                freed_mb = freed_bytes / (1024 * 1024)
+                logger.info(
+                    f"Cleaned up existing media for chat {chat_id}: "
+                    f"{deleted_files} files ({freed_mb:.1f} MB), {deleted_records} DB records"
+                )
+            if failed_files:
+                logger.warning(
+                    f"{failed_files} media files for chat {chat_id} could not be deleted "
+                    "and are no longer tracked in the database"
+                )
+
+        except Exception as e:
+            logger.error(f"Error cleaning up existing media for chat {chat_id}: {e}", exc_info=True)
+
     async def _process_media(self, message: Message, chat_id: int) -> dict | None:
         """
         Process and download media from a message.
diff --git a/tests/test_config.py b/tests/test_config.py
index 49ddcb8..0755208 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -272,6 +272,23 @@ def test_skip_media_chat_ids_whitespace_handling(self):
             config = Config()
             self.assertEqual(config.skip_media_chat_ids, {-1001234567890, -1009876543210, 123456})
 
+    def test_skip_media_delete_existing_defaults_true(self):
+        """With SKIP_MEDIA_DELETE_EXISTING unset the flag is on."""
+        with patch.dict(os.environ, {"CHAT_TYPES": "private", "BACKUP_PATH": self.temp_dir}, clear=True):
+            self.assertTrue(Config().skip_media_delete_existing)
+
+    def test_skip_media_delete_existing_can_be_disabled(self):
+        """SKIP_MEDIA_DELETE_EXISTING=false turns the flag off."""
+        environment = {"CHAT_TYPES": "private", "SKIP_MEDIA_DELETE_EXISTING": "false", "BACKUP_PATH": self.temp_dir}
+        with patch.dict(os.environ, environment, clear=True):
+            self.assertFalse(Config().skip_media_delete_existing)
+
+    def test_skip_media_delete_existing_explicit_true(self):
+        """SKIP_MEDIA_DELETE_EXISTING=true turns the flag on."""
+        environment = {"CHAT_TYPES": "private", "SKIP_MEDIA_DELETE_EXISTING": "true", "BACKUP_PATH": self.temp_dir}
+        with patch.dict(os.environ, environment, clear=True):
+            self.assertTrue(Config().skip_media_delete_existing)
+
 
 if __name__ == "__main__":
     unittest.main()