From 6ff0c492fc24bd5afb8eb420ad768bd52868c694 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Mon, 5 May 2025 23:32:54 +0800 Subject: [PATCH 01/17] feat: get rolling comments --- nndownload/nndownload.py | 184 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 174 insertions(+), 10 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 9afaada..3d08919 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -19,6 +19,7 @@ import time import xml.dom.minidom from typing import AnyStr, List, Match +from datetime import datetime, timezone import aiohttp import requests @@ -29,7 +30,7 @@ from requests.utils import add_dict_to_cookiejar from rich.progress import Progress from urllib3.util import Retry -from urllib.parse import urlparse +from urllib.parse import urlencode, urlparse from .ffmpeg_dl import FfmpegDL, FfmpegDLException, FfmpegExistsException from .hls_dl import download_hls @@ -86,8 +87,6 @@ USER_SERIES_API = "https://nvapi.nicovideo.jp/v1/users/{0}/series" USER_FOLLOWING_API = "https://nvapi.nicovideo.jp/v1/users/{0}/following/users?pageSize=800" # 800 following limit for premium users SEIGA_MANGA_TAGS_API = "https://seiga.nicovideo.jp/ajax/manga/tag/list?id={0}" -COMMENTS_API = "https://public.nvcomment.nicovideo.jp/v1/threads" -COMMENTS_API_POST_DATA = "{{\'params\':{0},\'threadKey\':\'{1}\',\'additionals\':{{}}}}" USER_HISTORY_API = "https://nvapi.nicovideo.jp/v1/users/me/watch/history?page={0}&pageSize={1}" USER_LIKES_API = "nvapi.nicovideo.jp/v1/users/me/watch/likes?page={0}&pageSize={1}" USER_WATCHLATER_API = "https://nvapi.nicovideo.jp/v1/users/me/watch-later?sortKey=addedAt&sortOrder=desc&pageSize={0}&page={1}" @@ -1817,7 +1816,9 @@ def collect_video_parameters(session: requests.Session, template_params: dict, p or params["video"]["thumbnail"]["middleUrl"] or params["video"]["thumbnail"]["url"]) + template_params["threads"] = params["comment"]["threads"] template_params["thread_id"] = int(params["comment"]["threads"][0]["id"]) + template_params["comment_server"] = params["comment"]["nvComment"]["server"] template_params["thread_key"] = params["comment"]["nvComment"]["threadKey"] template_params["thread_params"] = params["comment"]["nvComment"]["params"] template_params["published"] = params["video"]["registeredAt"] @@ -1899,23 +1900,186 @@ def download_thumbnail(session: requests.Session, filename: AnyStr, template_par output("Finished downloading thumbnail for {0}.\n".format(template_params["id"]), logging.INFO) - -def download_comments(session: requests.Session, filename: AnyStr, template_params: dict): +def get_niconico_timestamp(input_time=None): + if input_time is None: + return int(datetime.now(timezone.utc).timestamp()) + + if isinstance(input_time, (int, float)): + return int(input_time) + + try: + # Try ISO format first + return int(datetime.strptime(input_time, "%Y-%m-%dT%H:%M:%S%z").timestamp()) + except ValueError: + try: + # Try date-only format + return int(datetime.strptime(input_time, "%Y-%m-%d").replace(tzinfo=timezone.utc).timestamp()) + except ValueError: + return int(datetime.now(timezone.utc).timestamp()) # Fallback + +def download_comments( + session: requests.Session, + filename: str, + template_params: dict, +): """Download the video comments.""" + comment_limit: int = 40 + output("Downloading comments for {0}...\n".format(template_params["id"]), logging.INFO) filename = replace_extension(filename, "comments.json") - comments_post = COMMENTS_API_POST_DATA.format(template_params["thread_params"], template_params["thread_key"]).replace("\'", "\"").replace(": ", ":").replace(", ", ",") - session.options(COMMENTS_API, headers=API_HEADERS) # OPTIONS - get_comments_request = session.post(COMMENTS_API, data=comments_post, headers=API_HEADERS) - get_comments_request.raise_for_status() + comments = { + "globalComments": [ + { + "count": 0, + "id": "" + } + ], + "threads": [], + } + + # Convert date limit to timestamp + last_time = get_niconico_timestamp() + + # Process each comment thread + for thread in template_params["thread_params"]["targets"]: + thread_comments = fetch_comments_modern( + session, + template_params["id"], + template_params["comment_server"], + template_params["thread_key"], + thread, + template_params["thread_params"]["language"], + last_time, + comment_limit, + ) + + if thread_comments.get("globalCommentsData"): + if thread_comments["globalCommentsData"]["count"] > comments["globalComments"][0]["count"]: + comments["globalComments"][0]["count"] = thread_comments["globalCommentsData"]["count"] + comments["globalComments"][0]["id"] = thread_comments["globalCommentsData"]["id"] + comments["threads"].append(thread_comments["threadData"]) + + # Save comments to file with open(filename, "w", encoding="utf-8") as file: - json.dump(get_comments_request.json(), file, indent=4, ensure_ascii=False, sort_keys=True) + json.dump(comments, file, indent=4, ensure_ascii=False, sort_keys=True) output("Finished downloading comments for {0}.\n".format(template_params["id"]), logging.INFO) +def fetch_comments_modern( + session: requests.Session, + video_id: str, + api_server: str, + thread_key: str, + thread: dict, + language: str, + last_time: int, + limit: int, +) -> dict: + """Fetch comments using modern Niconico API (requires login).""" + global_comments_data = { + "count": 0, + "id": thread["id"], + } + thread_data = { + "commentCount": 0, + "comments": [], + "fork": thread["fork"], + "id": thread["id"] + } + base_data = { + "threadKey": thread_key, + "params": { + "language": language, + "targets": [thread] + } + } + fetched_count = 0 + + while fetched_count < limit: + try: + payload = { + **base_data, + "additionals": { + "res_from": -1000, + "when": last_time + } + } + + headers = { + "content-type": "text/plain;charset=UTF-8", + "x-client-os-type": "others", + "x-frontend-id": "6", + "x-frontend-version": "0" + } + + response = session.post( + f"{api_server}/v1/threads", + json=payload, + headers=headers + ) + data = response.json() + + # Handle errors + if "meta" in data and "errorCode" in data["meta"]: + handle_api_error(data["meta"]["errorCode"], session, video_id) + continue + + global_comment_count = data["data"]["globalComments"][0]["count"] + if global_comment_count and global_comment_count > global_comments_data["count"]: + # Get the highest global comment count as our estimated global threads total comment count + global_comments_data["count"] = global_comment_count + + comment_count = data["data"]["threads"][0]["commentCount"] + if comment_count and fetched_count == 0: + # Use first fetch count to know the thread total comment count + thread_data["commentCount"] = comment_count + + # Extract comments + thread_comments = data["data"]["threads"][0]["comments"] + # Not sure why this `no < 5` check was in the original comment-zouryou source code. + # I uncommented it out because some easy threads comments would not be added to the list. + if not thread_comments: # or thread_comments[0]["no"] < 5: + break # Reached beginning + + thread_data["comments"].extend(thread_comments) + last_time = get_niconico_timestamp(thread_comments[0]["postedAt"]) + fetched_count += 1 + + # Rate limiting (not sure if need this) + time.sleep(1) + + except Exception as e: + print(f"Error fetching comments: {e}") + break + + return {"globalCommentsData": global_comments_data, "threadData": thread_data} + +def handle_api_error(error_code: str, session: requests.Session, video_id: str) -> None: + """Handle API errors with appropriate actions.""" + if error_code == "TOO_MANY_REQUESTS": + output("Rate limited - waiting 60 seconds...", logging.INFO) + time.sleep(60) + elif error_code == "EXPIRED_TOKEN": + output("Refreshing thread key...", logging.INFO) + refresh_thread_key(session, video_id) + elif error_code == "INVALID_TOKEN": + raise Exception("Authentication required - please login first") + else: + raise Exception(f"API error: {error_code}") + +def refresh_thread_key(session: requests.Session, video_id: str) -> str: + """Refresh the thread key for modern API.""" + url = f"https://nvapi.nicovideo.jp/v1/comment/keys/thread?videoId={video_id}" + headers = { + "X-Frontend-Id": "6", + "X-Frontend-Version": "0", + "Content-Type": "application/json" + } + response = session.get(url, headers=headers) + return response.json()["data"]["threadKey"] def add_metadata_to_container(filename: AnyStr, template_params: dict): """Add metadata to any MP4 container.""" From 877d0847620ba8329f3bea09b7be539dfb5c421a Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Mon, 5 May 2025 23:39:03 +0800 Subject: [PATCH 02/17] fix: remove unused urlencode --- nndownload/nndownload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 3d08919..dcadabc 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -30,7 +30,7 @@ from requests.utils import add_dict_to_cookiejar from rich.progress import Progress from urllib3.util import Retry -from urllib.parse import urlencode, urlparse +from urllib.parse import urlparse from .ffmpeg_dl import FfmpegDL, FfmpegDLException, FfmpegExistsException from .hls_dl import download_hls From f1e3b75e40921e57c371de73a648b4b741633ae5 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Tue, 6 May 2025 02:06:18 +0800 Subject: [PATCH 03/17] perf: remove 1 sec rate limiting for comments --- nndownload/nndownload.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index dcadabc..38011e4 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -2048,9 +2048,6 @@ def fetch_comments_modern( last_time = get_niconico_timestamp(thread_comments[0]["postedAt"]) fetched_count += 1 - # Rate limiting (not sure if need this) - time.sleep(1) - except Exception as e: print(f"Error fetching comments: {e}") break From d2acbffdd56e200cdf3586ebbf502fc3ae31ed15 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Tue, 6 May 2025 15:10:45 +0800 Subject: [PATCH 04/17] add attribution --- LICENSE | 3 +++ 1 file changed, 3 insertions(+) diff --git a/LICENSE b/LICENSE index feaa87e..8817838 100644 --- a/LICENSE +++ b/LICENSE @@ -19,3 +19,6 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +Methods and logic borrowed from the following programs. See their respective licenses for more details: +- comment-zouryou (https://github.com/tanbatu/comment-zouryou): Copyright (c) 2022 tanbatu, MIT License From 611fe88a24da8dfb995a239e2838ac120f28dd97 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Tue, 6 May 2025 15:16:54 +0800 Subject: [PATCH 05/17] use API_HEADERS and specify thread refresh url as global --- nndownload/nndownload.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 38011e4..e571859 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -78,6 +78,7 @@ SEIGA_MANGA_ID_RE = re.compile(r"/comic/(\d+)") THUMB_INFO_API = "http://ext.nicovideo.jp/api/getthumbinfo/{0}" +THREAD_REFRESH_API = "https://nvapi.nicovideo.jp/v1/comment/keys/thread?videoId={0}" MYLIST_API = "https://nvapi.nicovideo.jp/v2/mylists/{0}?pageSize=500" # 500 video limit for premium mylists MYLIST_ME_API = "https://nvapi.nicovideo.jp/v1/users/me/mylists/{0}?pageSize=500" # Still on /v1 SERIES_API = "https://nvapi.nicovideo.jp/v2/series/{0}?&pageSize=500" # Same as mylists @@ -2068,11 +2069,11 @@ def handle_api_error(error_code: str, session: requests.Session, video_id: str) raise Exception(f"API error: {error_code}") def refresh_thread_key(session: requests.Session, video_id: str) -> str: - """Refresh the thread key for modern API.""" - url = f"https://nvapi.nicovideo.jp/v1/comment/keys/thread?videoId={video_id}" + """Refresh the thread key for comment API""" + url = THREAD_REFRESH_API.format(video_id) + headers = { - "X-Frontend-Id": "6", - "X-Frontend-Version": "0", + **API_HEADERS, "Content-Type": "application/json" } response = session.get(url, headers=headers) From d0835eb60328ce2ab253ad8c9c6d04977b0194c1 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Tue, 6 May 2025 15:26:37 +0800 Subject: [PATCH 06/17] easy wins --- nndownload/nndownload.py | 70 ++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index e571859..128371d 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -97,6 +97,7 @@ } USER_VIDEOS_API_N = 100 +COMMENT_THREAD_COOLDOWN_S = 60 NAMA_HEARTBEAT_INTERVAL_S = 30 NAMA_PLAYLIST_INTERVAL_S = 5 DMC_HEARTBEAT_INTERVAL_S = 15 @@ -440,6 +441,37 @@ def rewrite_file(filename: AnyStr, old_str: AnyStr, new_str: AnyStr): file.truncate() +@contextlib.contextmanager +def get_temp_dir(): + """Get a temporary working directory.""" + + tmpdir = tempfile.mkdtemp() + try: + yield tmpdir + finally: + shutil.rmtree(tmpdir) + +def get_unix_timestamp(input_time=None): + """Convert a date or timestamp to a Unix timestamp.""" + + if input_time is None: + return int(datetime.now(timezone.utc).timestamp()) + + if isinstance(input_time, (int, float)): + return int(input_time) + + try: + # Try ISO format first + return int(datetime.strptime(input_time, "%Y-%m-%dT%H:%M:%S%z").timestamp()) + except ValueError: + try: + # Try date-only format + return int(datetime.strptime(input_time, "%Y-%m-%d").replace(tzinfo=timezone.utc).timestamp()) + except ValueError: + return int(datetime.now(timezone.utc).timestamp()) # Fallback + + + ## Nama methods def generate_stream(session: requests.Session, master_url: AnyStr) -> AnyStr: @@ -1068,7 +1100,7 @@ def request_video(session: requests.Session, video_id: AnyStr): if _CMDL_OPTS.download_thumbnail: download_thumbnail(session, filename, template_params) if _CMDL_OPTS.download_comments: - download_comments(session, filename, template_params) + download_video_comments(session, filename, template_params) def request_user(session: requests.Session, user_id: AnyStr): @@ -1901,24 +1933,7 @@ def download_thumbnail(session: requests.Session, filename: AnyStr, template_par output("Finished downloading thumbnail for {0}.\n".format(template_params["id"]), logging.INFO) -def get_niconico_timestamp(input_time=None): - if input_time is None: - return int(datetime.now(timezone.utc).timestamp()) - - if isinstance(input_time, (int, float)): - return int(input_time) - - try: - # Try ISO format first - return int(datetime.strptime(input_time, "%Y-%m-%dT%H:%M:%S%z").timestamp()) - except ValueError: - try: - # Try date-only format - return int(datetime.strptime(input_time, "%Y-%m-%d").replace(tzinfo=timezone.utc).timestamp()) - except ValueError: - return int(datetime.now(timezone.utc).timestamp()) # Fallback - -def download_comments( +def download_video_comments( session: requests.Session, filename: str, template_params: dict, @@ -1942,7 +1957,7 @@ def download_comments( } # Convert date limit to timestamp - last_time = get_niconico_timestamp() + last_time = get_unix_timestamp() # Process each comment thread for thread in template_params["thread_params"]["targets"]: @@ -2010,10 +2025,9 @@ def fetch_comments_modern( } headers = { + **API_HEADERS, "content-type": "text/plain;charset=UTF-8", "x-client-os-type": "others", - "x-frontend-id": "6", - "x-frontend-version": "0" } response = session.post( @@ -2046,11 +2060,11 @@ def fetch_comments_modern( break # Reached beginning thread_data["comments"].extend(thread_comments) - last_time = get_niconico_timestamp(thread_comments[0]["postedAt"]) + last_time = get_unix_timestamp(thread_comments[0]["postedAt"]) fetched_count += 1 except Exception as e: - print(f"Error fetching comments: {e}") + output(f"Error fetching comments {e}", logging.ERROR) break return {"globalCommentsData": global_comments_data, "threadData": thread_data} @@ -2059,17 +2073,17 @@ def handle_api_error(error_code: str, session: requests.Session, video_id: str) """Handle API errors with appropriate actions.""" if error_code == "TOO_MANY_REQUESTS": output("Rate limited - waiting 60 seconds...", logging.INFO) - time.sleep(60) + time.sleep(COMMENT_THREAD_COOLDOWN_S) elif error_code == "EXPIRED_TOKEN": output("Refreshing thread key...", logging.INFO) refresh_thread_key(session, video_id) elif error_code == "INVALID_TOKEN": - raise Exception("Authentication required - please login first") + raise AuthenticationException else: - raise Exception(f"API error: {error_code}") + raise ParameterExtractionException(error_code) def refresh_thread_key(session: requests.Session, video_id: str) -> str: - """Refresh the thread key for comment API""" + """Refresh the thread key for the comments API""" url = THREAD_REFRESH_API.format(video_id) headers = { From 3c77972276a5d6cd0c18a8bbdcd679d1d3021c26 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Tue, 6 May 2025 18:31:36 +0800 Subject: [PATCH 07/17] resolve most convos --- nndownload/nndownload.py | 102 +++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 128371d..f064785 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -97,10 +97,12 @@ } USER_VIDEOS_API_N = 100 -COMMENT_THREAD_COOLDOWN_S = 60 NAMA_HEARTBEAT_INTERVAL_S = 30 NAMA_PLAYLIST_INTERVAL_S = 5 DMC_HEARTBEAT_INTERVAL_S = 15 +COMMENTS_THREAD_COOLDOWN_S = 60 +COMMENTS_THREAD_INTERVAL_S = 1 +COMMENTS_LIMIT_DEFAULT_N = sys.maxsize # effectively infinite (no videos on niconico have reached 100 mil comments) KILOBYTE = 1024 KILOBIT = 1000 BLOCK_SIZE = 1024 @@ -109,6 +111,16 @@ BACKOFF_FACTOR = 2 # retry_timeout_s = BACKOFF_FACTOR * (2 ** ({RETRY_ATTEMPTS} - 1)) TEMP_PATH_LEN = 16 +COMMENTS_DATA_JSON = { + "globalComments": [ + { + "count": 0, + "id": None + } + ], + "threads": [], +} + MIMETYPES = { "image/gif": "gif", "image/jpeg": "jpg", @@ -451,7 +463,7 @@ def get_temp_dir(): finally: shutil.rmtree(tmpdir) -def get_unix_timestamp(input_time=None): +def get_unix_timestamp(input_time=None) -> int: """Convert a date or timestamp to a Unix timestamp.""" if input_time is None: @@ -1940,28 +1952,20 @@ def download_video_comments( ): """Download the video comments.""" - comment_limit: int = 40 - output("Downloading comments for {0}...\n".format(template_params["id"]), logging.INFO) filename = replace_extension(filename, "comments.json") - comments = { - "globalComments": [ - { - "count": 0, - "id": "" - } - ], - "threads": [], - } + # TODO: Make sure to limit date range to this: + # OLD_DATE.min = "2007-03-03"; + # OLD_DATE.max = new Date().getFullYear() + "-12-31"; # Convert date limit to timestamp last_time = get_unix_timestamp() # Process each comment thread for thread in template_params["thread_params"]["targets"]: - thread_comments = fetch_comments_modern( + thread_comments = fetch_thread_comments( session, template_params["id"], template_params["comment_server"], @@ -1969,22 +1973,22 @@ def download_video_comments( thread, template_params["thread_params"]["language"], last_time, - comment_limit, ) - if thread_comments.get("globalCommentsData"): - if thread_comments["globalCommentsData"]["count"] > comments["globalComments"][0]["count"]: - comments["globalComments"][0]["count"] = thread_comments["globalCommentsData"]["count"] - comments["globalComments"][0]["id"] = thread_comments["globalCommentsData"]["id"] - comments["threads"].append(thread_comments["threadData"]) + # Update global comment count if it's higher than the previous highest + if thread_comments["globalCommentsData"]["count"] > COMMENTS_DATA_JSON["globalComments"][0]["count"]: + COMMENTS_DATA_JSON["globalComments"][0]["count"] = thread_comments["globalCommentsData"]["count"] + # Set the thread ID too + COMMENTS_DATA_JSON["globalComments"][0]["id"] = thread_comments["globalCommentsData"]["id"] + COMMENTS_DATA_JSON["threads"].append(thread_comments["threadData"]) # Save comments to file with open(filename, "w", encoding="utf-8") as file: - json.dump(comments, file, indent=4, ensure_ascii=False, sort_keys=True) + json.dump(COMMENTS_DATA_JSON, file, indent=4, ensure_ascii=False, sort_keys=True) output("Finished downloading comments for {0}.\n".format(template_params["id"]), logging.INFO) -def fetch_comments_modern( +def fetch_thread_comments( session: requests.Session, video_id: str, api_server: str, @@ -1992,9 +1996,8 @@ def fetch_comments_modern( thread: dict, language: str, last_time: int, - limit: int, ) -> dict: - """Fetch comments using modern Niconico API (requires login).""" + """Fetch comments for thread (requires login).""" global_comments_data = { "count": 0, "id": thread["id"], @@ -2013,8 +2016,11 @@ def fetch_comments_modern( } } fetched_count = 0 + + if _CMDL_OPTS.no_login: + raise AuthenticationException("Downloading comments is not possible when -g/--no-login is specified. Please login or provide a session cookie") - while fetched_count < limit: + while fetched_count < COMMENTS_LIMIT_DEFAULT_N: try: payload = { **base_data, @@ -2037,10 +2043,21 @@ def fetch_comments_modern( ) data = response.json() - # Handle errors - if "meta" in data and "errorCode" in data["meta"]: - handle_api_error(data["meta"]["errorCode"], session, video_id) - continue + # Handle handle API errors, if any, with appropriate actions + error_code = data["meta"]["errorCode"] if "meta" in data and "errorCode" in data["meta"] else None + if error_code is not None: + if error_code == "TOO_MANY_REQUESTS": + output("Rate limited - waiting 60 seconds...", logging.INFO) + time.sleep(COMMENTS_THREAD_COOLDOWN_S) + continue + elif error_code == "EXPIRED_TOKEN": + output("Refreshing thread key...", logging.INFO) + refresh_thread_key(session, video_id) + continue + elif error_code == "INVALID_TOKEN": + raise AuthenticationException + else: + raise ParameterExtractionException(error_code) global_comment_count = data["data"]["globalComments"][0]["count"] if global_comment_count and global_comment_count > global_comments_data["count"]: @@ -2049,19 +2066,21 @@ def fetch_comments_modern( comment_count = data["data"]["threads"][0]["commentCount"] if comment_count and fetched_count == 0: - # Use first fetch count to know the thread total comment count + # Use first fetch count to know the thread's total comment count thread_data["commentCount"] = comment_count - # Extract comments thread_comments = data["data"]["threads"][0]["comments"] - # Not sure why this `no < 5` check was in the original comment-zouryou source code. - # I uncommented it out because some easy threads comments would not be added to the list. - if not thread_comments: # or thread_comments[0]["no"] < 5: - break # Reached beginning + + # TODO: Add check for date range later + if not thread_comments: + # There are no comments before lastTime to fetch + break thread_data["comments"].extend(thread_comments) last_time = get_unix_timestamp(thread_comments[0]["postedAt"]) fetched_count += 1 + + time.sleep(COMMENTS_THREAD_COOLDOWN_S) except Exception as e: output(f"Error fetching comments {e}", logging.ERROR) @@ -2069,19 +2088,6 @@ def fetch_comments_modern( return {"globalCommentsData": global_comments_data, "threadData": thread_data} -def handle_api_error(error_code: str, session: requests.Session, video_id: str) -> None: - """Handle API errors with appropriate actions.""" - if error_code == "TOO_MANY_REQUESTS": - output("Rate limited - waiting 60 seconds...", logging.INFO) - time.sleep(COMMENT_THREAD_COOLDOWN_S) - elif error_code == "EXPIRED_TOKEN": - output("Refreshing thread key...", logging.INFO) - refresh_thread_key(session, video_id) - elif error_code == "INVALID_TOKEN": - raise AuthenticationException - else: - raise ParameterExtractionException(error_code) - def refresh_thread_key(session: requests.Session, video_id: str) -> str: """Refresh the thread key for the comments API""" url = THREAD_REFRESH_API.format(video_id) From 06e2624ebe60ff4805f501b6b74f060396a753db Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Tue, 6 May 2025 18:52:07 +0800 Subject: [PATCH 08/17] remove extra space --- nndownload/nndownload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index f064785..615bf5c 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -483,7 +483,6 @@ def get_unix_timestamp(input_time=None) -> int: return int(datetime.now(timezone.utc).timestamp()) # Fallback - ## Nama methods def generate_stream(session: requests.Session, master_url: AnyStr) -> AnyStr: @@ -2097,6 +2096,7 @@ def refresh_thread_key(session: requests.Session, video_id: str) -> str: "Content-Type": "application/json" } response = session.get(url, headers=headers) + response.raise_for_status() return response.json()["data"]["threadKey"] def add_metadata_to_container(filename: AnyStr, template_params: dict): From 0916d79ab69a02ebcaebeec389a8f348f160a690 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Wed, 7 May 2025 23:26:05 +0800 Subject: [PATCH 09/17] feat: add comment qualifier cli flags with impl --- nndownload/nndownload.py | 187 ++++++++++++++++++++++++++------------- 1 file changed, 125 insertions(+), 62 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 615bf5c..4c05509 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -62,6 +62,7 @@ SEIGA_CHAPTER_URL = "https://seiga.nicovideo.jp/watch/{0}" SEIGA_SOURCE_URL = "https://seiga.nicovideo.jp/image/source/{0}" SEIGA_CDN_URL = "https://lohas.nicoseiga.jp/" +COMMENTS_THREAD_URL = "{0}/v1/threads" TIMESHIFT_USE_URL = "https://live.nicovideo.jp/api/timeshift.ticket.use" TIMESHIFT_RESERVE_URL = "https://live.nicovideo.jp/api/timeshift.reservations" @@ -102,7 +103,7 @@ DMC_HEARTBEAT_INTERVAL_S = 15 COMMENTS_THREAD_COOLDOWN_S = 60 COMMENTS_THREAD_INTERVAL_S = 1 -COMMENTS_LIMIT_DEFAULT_N = sys.maxsize # effectively infinite (no videos on niconico have reached 100 mil comments) +COMMENTS_LIMIT_DEFAULT_N = 1000 KILOBYTE = 1024 KILOBIT = 1000 BLOCK_SIZE = 1024 @@ -186,11 +187,63 @@ PONG_FRAME = json.loads("""{"type":"pong"}""") +MIN_DATE = datetime(2007, 3, 3).replace(tzinfo=timezone.utc) # constant taken from `comment-zouryou` +MAX_DATE = datetime.now(timezone.utc) + logger = logging.getLogger(__name__) +# Needs to be defined first +def parse_datetime_to_timestamp(value) -> int: + """ + Parse either ISO 8601 datetime or Unix timestamp, returning Unix timestamp. + Validates date is between 2007-03-03 and today. + For ISO dates without time, defaults to 23:59:59 of that day. + """ + try: + # First try to parse as Unix timestamp + timestamp = float(value) + dt = datetime.fromtimestamp(timestamp, timezone.utc) + except ValueError: + try: + # Try parsing as ISO 8601 + if 'T' in value or ' ' in value: + dt = datetime.fromisoformat(value.replace('Z', '+00:00')) + else: + # Date-only format - set to end of day + dt = datetime.fromisoformat(value).replace( + hour=23, minute=59, second=59 + ) + + # Ensure timezone awareness + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + else: + dt = dt.astimezone(timezone.utc) + + except ValueError as e: + raise argparse.ArgumentTypeError( + f"Invalid datetime: '{value}'. Must be either:\n" + "- Unix timestamp (e.g., 1686787200)\n" + "- ISO 8601 date (e.g., '2023-06-15' → sets to 23:59:59)\n" + "- ISO 8601 datetime (e.g., '2023-06-15T14:30:00Z')" + ) from e + + # Validate date range + if dt < MIN_DATE: + raise argparse.ArgumentTypeError( + f"Date cannot be before {MIN_DATE.date()} (got {dt.date()})" + ) + if dt > MAX_DATE: + raise argparse.ArgumentTypeError( + f"Date cannot be in the future (got {dt.date()}, today is {MAX_DATE.date()})" + ) + + return int(dt.timestamp()) + + CMDL_USAGE = "%(prog)s [options] input" CMDL_VERSION = __version__ -cmdl_parser = argparse.ArgumentParser(usage=CMDL_USAGE, conflict_handler="resolve") +cmdl_parser = argparse.ArgumentParser(usage=CMDL_USAGE, conflict_handler="resolve", formatter_class=argparse.RawTextHelpFormatter) cmdl_parser.add_argument("-u", "--username", dest="username", metavar="EMAIL/TEL", help="account email address or telephone number") @@ -219,6 +272,17 @@ help="download video thumbnail") dl_group.add_argument("-c", "--download-comments", action="store_true", dest="download_comments", help="download video comments") +dl_group.add_argument("--comments-limit", dest="comments_limit", metavar="N", type=int, default=COMMENTS_LIMIT_DEFAULT_N, + help=f"number of comments to download (default: {COMMENTS_LIMIT_DEFAULT_N})") +dl_group.add_argument( + "--comments-from", dest="comments_from", type=parse_datetime_to_timestamp, metavar="DATETIME_OR_TIMESTAMP", default=int(datetime.now(timezone.utc).timestamp()), + help="only download comments posted after specified time:\n" + "- Unix timestamp (e.g., 1686787200)\n" + "- ISO 8601 date (e.g., '2023-06-15' → sets to 23:59:59)\n" + "- ISO 8601 datetime (e.g., '2023-06-15T14:30:00' or '2023-06-15 14:30:00')\n" +) +dl_group.add_argument("--all-comments", action="store_true", dest="request_all_comments", + help="request all comments ignoring comments-limit") dl_group.add_argument("-e", "--english", action="store_true", dest="download_english", help="request video on english site") dl_group.add_argument("--chinese", action="store_true", dest="download_chinese", @@ -483,6 +547,7 @@ def get_unix_timestamp(input_time=None) -> int: return int(datetime.now(timezone.utc).timestamp()) # Fallback + ## Nama methods def generate_stream(session: requests.Session, master_url: AnyStr) -> AnyStr: @@ -1951,41 +2016,48 @@ def download_video_comments( ): """Download the video comments.""" + if _CMDL_OPTS.no_login: + raise AuthenticationException("Downloading comments is not possible when -g/--no-login is specified. Please login or provide a session cookie") + output("Downloading comments for {0}...\n".format(template_params["id"]), logging.INFO) filename = replace_extension(filename, "comments.json") - # TODO: Make sure to limit date range to this: - # OLD_DATE.min = "2007-03-03"; - # OLD_DATE.max = new Date().getFullYear() + "-12-31"; - - # Convert date limit to timestamp - last_time = get_unix_timestamp() - - # Process each comment thread - for thread in template_params["thread_params"]["targets"]: - thread_comments = fetch_thread_comments( - session, - template_params["id"], - template_params["comment_server"], - template_params["thread_key"], - thread, - template_params["thread_params"]["language"], - last_time, - ) - - # Update global comment count if it's higher than the previous highest - if thread_comments["globalCommentsData"]["count"] > COMMENTS_DATA_JSON["globalComments"][0]["count"]: - COMMENTS_DATA_JSON["globalComments"][0]["count"] = thread_comments["globalCommentsData"]["count"] - # Set the thread ID too - COMMENTS_DATA_JSON["globalComments"][0]["id"] = thread_comments["globalCommentsData"]["id"] - COMMENTS_DATA_JSON["threads"].append(thread_comments["threadData"]) + last_time: int = _CMDL_OPTS.comments_from + # sys.maxsize is practically big enough (no videos on niconico have reached 100 mil comments) + comments_limit: int = sys.maxsize if _CMDL_OPTS.request_all_comments else _CMDL_OPTS.comments_limit - # Save comments to file - with open(filename, "w", encoding="utf-8") as file: - json.dump(COMMENTS_DATA_JSON, file, indent=4, ensure_ascii=False, sort_keys=True) + try: + for thread in template_params["thread_params"]["targets"]: + thread_data = { + "commentCount": 0, + "comments": [], + "fork": thread["fork"], + "id": thread["id"] + } + COMMENTS_DATA_JSON["threads"].append(thread_data) + + fetch_thread_comments( + session, + template_params["id"], + template_params["comment_server"], + template_params["thread_key"], + thread, + template_params["thread_params"]["language"], + last_time, + comments_limit, + thread_data + ) + + output("Finished downloading comments for {0}.\n".format(template_params["id"]), logging.INFO) - output("Finished downloading comments for {0}.\n".format(template_params["id"]), logging.INFO) + except KeyboardInterrupt: + output("\nCTRL+C detected. Saving downloaded comments for {0}\n".format(template_params["id"]), logging.INFO) + raise + finally: + # Save in case of success and on interrupt) + with open(filename, "w", encoding="utf-8") as file: + json.dump(COMMENTS_DATA_JSON, file, indent=4, ensure_ascii=False, sort_keys=True) def fetch_thread_comments( session: requests.Session, @@ -1995,18 +2067,10 @@ def fetch_thread_comments( thread: dict, language: str, last_time: int, -) -> dict: + comments_limit: int, + thread_data: dict +) -> None: """Fetch comments for thread (requires login).""" - global_comments_data = { - "count": 0, - "id": thread["id"], - } - thread_data = { - "commentCount": 0, - "comments": [], - "fork": thread["fork"], - "id": thread["id"] - } base_data = { "threadKey": thread_key, "params": { @@ -2014,12 +2078,9 @@ def fetch_thread_comments( "targets": [thread] } } - fetched_count = 0 - - if _CMDL_OPTS.no_login: - raise AuthenticationException("Downloading comments is not possible when -g/--no-login is specified. Please login or provide a session cookie") + fetched_comments_count = 0 - while fetched_count < COMMENTS_LIMIT_DEFAULT_N: + while fetched_comments_count < comments_limit: try: payload = { **base_data, @@ -2036,7 +2097,7 @@ def fetch_thread_comments( } response = session.post( - f"{api_server}/v1/threads", + COMMENTS_THREAD_URL.format(api_server), json=payload, headers=headers ) @@ -2058,34 +2119,32 @@ def fetch_thread_comments( else: raise ParameterExtractionException(error_code) + # Update global comments count if higher global_comment_count = data["data"]["globalComments"][0]["count"] - if global_comment_count and global_comment_count > global_comments_data["count"]: - # Get the highest global comment count as our estimated global threads total comment count - global_comments_data["count"] = global_comment_count - - comment_count = data["data"]["threads"][0]["commentCount"] - if comment_count and fetched_count == 0: - # Use first fetch count to know the thread's total comment count - thread_data["commentCount"] = comment_count + if global_comment_count and global_comment_count > COMMENTS_DATA_JSON["globalComments"][0]["count"]: + COMMENTS_DATA_JSON["globalComments"][0]["count"] = global_comment_count + COMMENTS_DATA_JSON["globalComments"][0]["id"] = thread["id"] + + # Update thread comment count if first fetch + if fetched_comments_count == 0: + thread_data["commentCount"] = data["data"]["threads"][0]["commentCount"] thread_comments = data["data"]["threads"][0]["comments"] - # TODO: Add check for date range later if not thread_comments: # There are no comments before lastTime to fetch break + # Append new comments and update last_time thread_data["comments"].extend(thread_comments) - last_time = get_unix_timestamp(thread_comments[0]["postedAt"]) - fetched_count += 1 + last_time = int(datetime.fromisoformat(thread_comments[0]["postedAt"]).timestamp()) + fetched_comments_count += len(thread_comments) - time.sleep(COMMENTS_THREAD_COOLDOWN_S) + time.sleep(COMMENTS_THREAD_INTERVAL_S) except Exception as e: output(f"Error fetching comments {e}", logging.ERROR) break - - return {"globalCommentsData": global_comments_data, "threadData": thread_data} def refresh_thread_key(session: requests.Session, video_id: str) -> str: """Refresh the thread key for the comments API""" @@ -2334,6 +2393,10 @@ def main(): "available in a lower quality. For access to all content, please provide a login with " "--username/--password, --session-cookie, or --netrc.\n", logging.WARNING) + if (_CMDL_OPTS.comments_limit or _CMDL_OPTS.request_all_comments or _CMDL_OPTS.comments_from) and not _CMDL_OPTS.download_comments: + output("Comment downloading qualifiers --comments-limit, --request-all-comments, or --comments-from were specified, but --download-comments was not. " + "Did you forget to set --download-comments?\n", logging.WARNING) + session = login(account_username, account_password, session_cookie) for arg_item in _CMDL_OPTS.input: From 3d16af08035f987c253882b9c106ee4bc78a49e8 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Wed, 14 May 2025 04:15:42 +0800 Subject: [PATCH 10/17] feat: comments fetching progress bar --- nndownload/nndownload.py | 170 ++++++++++++++++++++++++--------------- 1 file changed, 105 insertions(+), 65 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 4c05509..acb970b 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -28,7 +28,7 @@ from mutagen.mp4 import MP4, MP4StreamInfoError from requests.adapters import HTTPAdapter from requests.utils import add_dict_to_cookiejar -from rich.progress import Progress +from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn, TimeRemainingColumn from urllib3.util import Retry from urllib.parse import urlparse @@ -2016,10 +2016,14 @@ def download_video_comments( ): """Download the video comments.""" + output("Downloading comments for {0}...\n".format(template_params["id"]), logging.INFO) + if _CMDL_OPTS.no_login: raise AuthenticationException("Downloading comments is not possible when -g/--no-login is specified. Please login or provide a session cookie") - output("Downloading comments for {0}...\n".format(template_params["id"]), logging.INFO) + if os.path.exists(filename): + output(f"{replace_extension(filename, 'comments.json')} already exists. Skipping...\n", logging.INFO) + return False filename = replace_extension(filename, "comments.json") @@ -2079,72 +2083,108 @@ def fetch_thread_comments( } } fetched_comments_count = 0 - - while fetched_comments_count < comments_limit: - try: - payload = { - **base_data, - "additionals": { - "res_from": -1000, - "when": last_time + previous_comments = None + total_comments = None + + with Progress( + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + TextColumn("({task.completed}/{task.total})"), + TimeRemainingColumn(), + transient=True + ) as progress: + task = progress.add_task(f"Downloading {thread['fork']} comments", total=None) + + while fetched_comments_count < comments_limit: + try: + payload = { + **base_data, + "additionals": { + "res_from": -1000, + "when": last_time + } } - } - - headers = { - **API_HEADERS, - "content-type": "text/plain;charset=UTF-8", - "x-client-os-type": "others", - } - - response = session.post( - COMMENTS_THREAD_URL.format(api_server), - json=payload, - headers=headers - ) - data = response.json() - - # Handle handle API errors, if any, with appropriate actions - error_code = data["meta"]["errorCode"] if "meta" in data and "errorCode" in data["meta"] else None - if error_code is not None: - if error_code == "TOO_MANY_REQUESTS": - output("Rate limited - waiting 60 seconds...", logging.INFO) - time.sleep(COMMENTS_THREAD_COOLDOWN_S) - continue - elif error_code == "EXPIRED_TOKEN": - output("Refreshing thread key...", logging.INFO) - refresh_thread_key(session, video_id) - continue - elif error_code == "INVALID_TOKEN": - raise AuthenticationException - else: - raise ParameterExtractionException(error_code) - - # Update global comments count if higher - global_comment_count = data["data"]["globalComments"][0]["count"] - if global_comment_count and global_comment_count > COMMENTS_DATA_JSON["globalComments"][0]["count"]: - COMMENTS_DATA_JSON["globalComments"][0]["count"] = global_comment_count - COMMENTS_DATA_JSON["globalComments"][0]["id"] = thread["id"] - - # Update thread comment count if first fetch - if fetched_comments_count == 0: - thread_data["commentCount"] = data["data"]["threads"][0]["commentCount"] - thread_comments = data["data"]["threads"][0]["comments"] - - if not thread_comments: - # There are no comments before lastTime to fetch - break + headers = { + **API_HEADERS, + "content-type": "text/plain;charset=UTF-8", + "x-client-os-type": "others", + } - # Append new comments and update last_time - thread_data["comments"].extend(thread_comments) - last_time = int(datetime.fromisoformat(thread_comments[0]["postedAt"]).timestamp()) - fetched_comments_count += len(thread_comments) - - time.sleep(COMMENTS_THREAD_INTERVAL_S) - - except Exception as e: - output(f"Error fetching comments {e}", logging.ERROR) - break + response = session.post( + COMMENTS_THREAD_URL.format(api_server), + json=payload, + headers=headers + ) + data = response.json() + + # Handle API errors, if any, with appropriate actions + error_code = data["meta"]["errorCode"] if "meta" in data and "errorCode" in data["meta"] else None + if error_code is not None: + if error_code == "TOO_MANY_REQUESTS": + output("Rate limited - waiting 60 seconds...\n", logging.INFO) + time.sleep(COMMENTS_THREAD_COOLDOWN_S) + continue + elif error_code == "EXPIRED_TOKEN": + output("Refreshing thread key...\n", logging.INFO) + refresh_thread_key(session, video_id) + continue + elif error_code == "INVALID_TOKEN": + raise AuthenticationException + else: + raise ParameterExtractionException(error_code) + + # Update global comments count if higher + global_comment_count = data["data"]["globalComments"][0]["count"] + if global_comment_count and global_comment_count > COMMENTS_DATA_JSON["globalComments"][0]["count"]: + COMMENTS_DATA_JSON["globalComments"][0]["count"] = global_comment_count + COMMENTS_DATA_JSON["globalComments"][0]["id"] = thread["id"] + + # Update thread comment count if first fetch + if fetched_comments_count == 0: + thread_data["commentCount"] = data["data"]["threads"][0]["commentCount"] + # If requesting all comments, we use the video's total comment count + if _CMDL_OPTS.request_all_comments: + total_comments = thread_data["commentCount"] + else: + # Otherwise, we use the limit provided or the video's total comment count, whichever is lower + total_comments = min(thread_data["commentCount"], comments_limit) + progress.update(task, total=total_comments) + progress.update(task, description=f"Downloading {thread['fork']} comments") + + thread_comments = data["data"]["threads"][0]["comments"] + + if not thread_comments: + # There are no comments before lastTime to fetch + break + + # If we got the same comments as last time, we should stop + if previous_comments and len(thread_comments) == len(previous_comments): + same_comments = True + for i in range(len(thread_comments)): + if thread_comments[i]["id"] != previous_comments[i]["id"]: + same_comments = False + break + if same_comments: + # We've reached the end of comments + break + + # Store current comments for next comparison + previous_comments = thread_comments.copy() + + # Append new comments and update last_time + thread_data["comments"].extend(thread_comments) + last_time = int(datetime.fromisoformat(thread_comments[0]["postedAt"]).timestamp()) + + fetched_comments_count += len(thread_comments) + progress.update(task, completed=fetched_comments_count) + + time.sleep(COMMENTS_THREAD_INTERVAL_S) + + except Exception as e: + output(f"Error fetching comments {e}\n", logging.ERROR) + break def refresh_thread_key(session: requests.Session, video_id: str) -> str: """Refresh the thread key for the comments API""" From a5095f007afafedf02a05bbed0fe663e2827d7f4 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Wed, 14 May 2025 04:22:49 +0800 Subject: [PATCH 11/17] fix: file exists check on comments.json --- nndownload/nndownload.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index acb970b..58a4918 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -2021,12 +2021,12 @@ def download_video_comments( if _CMDL_OPTS.no_login: raise AuthenticationException("Downloading comments is not possible when -g/--no-login is specified. Please login or provide a session cookie") + filename = replace_extension(filename, "comments.json") + if os.path.exists(filename): - output(f"{replace_extension(filename, 'comments.json')} already exists. Skipping...\n", logging.INFO) + output(f"File {filename} exists. Skipping...\n", logging.INFO) return False - filename = replace_extension(filename, "comments.json") - last_time: int = _CMDL_OPTS.comments_from # sys.maxsize is practically big enough (no videos on niconico have reached 100 mil comments) comments_limit: int = sys.maxsize if _CMDL_OPTS.request_all_comments else _CMDL_OPTS.comments_limit From 30be699d92d5d2ff57c3c267b42a2d623aa20625 Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Wed, 14 May 2025 05:17:45 +0800 Subject: [PATCH 12/17] perf: no indents when dumping comments json --- nndownload/nndownload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 58a4918..e143255 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -2061,7 +2061,7 @@ def download_video_comments( finally: # Save in case of success and on interrupt) with open(filename, "w", encoding="utf-8") as file: - json.dump(COMMENTS_DATA_JSON, file, indent=4, ensure_ascii=False, sort_keys=True) + json.dump(COMMENTS_DATA_JSON, file, indent=None, ensure_ascii=False, sort_keys=True) def fetch_thread_comments( session: requests.Session, From 236977c1a4249f1c753003fb4cce1c04179f05a6 Mon Sep 17 00:00:00 2001 From: AlexAplin Date: Thu, 22 May 2025 01:23:52 -0400 Subject: [PATCH 13/17] Make changes and improvements --- README.md | 7 + nndownload/nndownload.py | 283 +++++++++++++++++---------------------- 2 files changed, 131 insertions(+), 159 deletions(-) diff --git a/README.md b/README.md index 7d9b74d..cf8ee65 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,13 @@ download options: download video thumbnail -c, --download-comments download video comments + --comments-limit N number of comments to download (default: 1000) + --comments-from DATETIME_OR_TIMESTAMP + only download comments posted before a specified time: + - Unix timestamp (e.g., 1686787200) + - ISO 8601 date (e.g., '2023-06-15' → sets to 23:59:59) + - ISO 8601 datetime (e.g., '2023-06-15T14:30:00' or '2023-06-15 14:30:00') + --all-comments request all comments (ignores --comments-limit) -e, --english request video on english site --chinese request video on traditional chinese (taiwan) site -aq AUDIO_QUALITY, --audio-quality AUDIO_QUALITY diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index e143255..6bbae3f 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -113,12 +113,10 @@ TEMP_PATH_LEN = 16 COMMENTS_DATA_JSON = { - "globalComments": [ - { - "count": 0, - "id": None - } - ], + "globalComments": { + "retrievedCount": None, + "commentCount": None + }, "threads": [], } @@ -192,13 +190,14 @@ logger = logging.getLogger(__name__) -# Needs to be defined first + def parse_datetime_to_timestamp(value) -> int: """ Parse either ISO 8601 datetime or Unix timestamp, returning Unix timestamp. Validates date is between 2007-03-03 and today. For ISO dates without time, defaults to 23:59:59 of that day. """ + try: # First try to parse as Unix timestamp timestamp = float(value) @@ -273,16 +272,16 @@ def parse_datetime_to_timestamp(value) -> int: dl_group.add_argument("-c", "--download-comments", action="store_true", dest="download_comments", help="download video comments") dl_group.add_argument("--comments-limit", dest="comments_limit", metavar="N", type=int, default=COMMENTS_LIMIT_DEFAULT_N, - help=f"number of comments to download (default: {COMMENTS_LIMIT_DEFAULT_N})") + help=f"number of comments to download per thread (default: {COMMENTS_LIMIT_DEFAULT_N})") dl_group.add_argument( - "--comments-from", dest="comments_from", type=parse_datetime_to_timestamp, metavar="DATETIME_OR_TIMESTAMP", default=int(datetime.now(timezone.utc).timestamp()), - help="only download comments posted after specified time:\n" + "--comments-from", dest="comments_from", type=parse_datetime_to_timestamp, metavar="DATETIME_OR_TIMESTAMP", + help="only download comments posted before a specified time:\n" "- Unix timestamp (e.g., 1686787200)\n" "- ISO 8601 date (e.g., '2023-06-15' → sets to 23:59:59)\n" "- ISO 8601 datetime (e.g., '2023-06-15T14:30:00' or '2023-06-15 14:30:00')\n" ) -dl_group.add_argument("--all-comments", action="store_true", dest="request_all_comments", - help="request all comments ignoring comments-limit") +dl_group.add_argument("--all-comments", action="store_true", dest="all_comments", + help="request all comments (ignores --comments-limit)") dl_group.add_argument("-e", "--english", action="store_true", dest="download_english", help="request video on english site") dl_group.add_argument("--chinese", action="store_true", dest="download_chinese", @@ -2009,6 +2008,7 @@ def download_thumbnail(session: requests.Session, filename: AnyStr, template_par output("Finished downloading thumbnail for {0}.\n".format(template_params["id"]), logging.INFO) + def download_video_comments( session: requests.Session, filename: str, @@ -2024,170 +2024,134 @@ def download_video_comments( filename = replace_extension(filename, "comments.json") if os.path.exists(filename): - output(f"File {filename} exists. Skipping...\n", logging.INFO) + output(f"Comments file \"{filename}\" exists. Skipping...\n", logging.INFO) return False - last_time: int = _CMDL_OPTS.comments_from - # sys.maxsize is practically big enough (no videos on niconico have reached 100 mil comments) - comments_limit: int = sys.maxsize if _CMDL_OPTS.request_all_comments else _CMDL_OPTS.comments_limit + comments_from: int = _CMDL_OPTS.comments_from + comments_limit: int = _CMDL_OPTS.comments_limit - try: + if comments_from: + output(f"Requesting comments looking back from {comments_from} on each thread.\n", logging.INFO) + else: + # Default to the current time + comments_from = int(datetime.now(timezone.utc).timestamp()) + + if _CMDL_OPTS.all_comments: + output(f"Requesting all comments on each thread.\n", logging.INFO) + comments_limit = None + else: + output(f"Requesting up to {comments_limit} comments on each thread.\n", logging.INFO) # Defaults to 1000 + + with Progress(TextColumn("{task.description}"), BarColumn(), TaskProgressColumn(), TextColumn("{task.completed}/{task.total}"), transient=True) as progress: + tasks = [] for thread in template_params["thread_params"]["targets"]: - thread_data = { - "commentCount": 0, - "comments": [], - "fork": thread["fork"], - "id": thread["id"] - } - COMMENTS_DATA_JSON["threads"].append(thread_data) - - fetch_thread_comments( - session, - template_params["id"], - template_params["comment_server"], - template_params["thread_key"], - thread, - template_params["thread_params"]["language"], - last_time, - comments_limit, - thread_data - ) - - output("Finished downloading comments for {0}.\n".format(template_params["id"]), logging.INFO) + comments_data = COMMENTS_DATA_JSON + thread = threading.Thread(target=fetch_comments_thread, args=(session, template_params["id"], template_params["comment_server"], template_params["thread_key"], thread, template_params["thread_params"]["language"], progress, comments_data, comments_from, comments_limit)) + thread.start() + tasks.append({ + "thread": thread, + }) + + for task in tasks: + task["thread"].join() + + comments_data["globalComments"]["retrievedCount"] = sum(thread["retrievedCount"] for thread in comments_data["threads"]) + comments_data["globalComments"]["commentCount"] = template_params["comment_count"] - except KeyboardInterrupt: - output("\nCTRL+C detected. Saving downloaded comments for {0}\n".format(template_params["id"]), logging.INFO) - raise - finally: - # Save in case of success and on interrupt) with open(filename, "w", encoding="utf-8") as file: - json.dump(COMMENTS_DATA_JSON, file, indent=None, ensure_ascii=False, sort_keys=True) + json.dump(comments_data, file, indent=None, ensure_ascii=False, sort_keys=True) + + output("Saved comments for {0} to {1}.\n".format(template_params["id"], filename), logging.INFO) + -def fetch_thread_comments( +def fetch_comments_thread( session: requests.Session, video_id: str, api_server: str, thread_key: str, thread: dict, language: str, - last_time: int, - comments_limit: int, - thread_data: dict + progress: Progress, + comments_data: dict, + comments_from: int, + comments_limit: int = None, ) -> None: - """Fetch comments for thread (requires login).""" - base_data = { - "threadKey": thread_key, - "params": { - "language": language, - "targets": [thread] - } + """Fetch comments for a specific comments thread.""" + + thread_data = { + "comments": [], + "fork": thread["fork"], + "id": thread["id"] } - fetched_comments_count = 0 - previous_comments = None - total_comments = None - - with Progress( - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TaskProgressColumn(), - TextColumn("({task.completed}/{task.total})"), - TimeRemainingColumn(), - transient=True - ) as progress: - task = progress.add_task(f"Downloading {thread['fork']} comments", total=None) - - while fetched_comments_count < comments_limit: - try: - payload = { - **base_data, - "additionals": { - "res_from": -1000, - "when": last_time - } - } - - headers = { - **API_HEADERS, - "content-type": "text/plain;charset=UTF-8", - "x-client-os-type": "others", + + task_id = progress.add_task(thread["fork"], total=None, visible=False) + while not progress.tasks[task_id].finished: + response = session.post( + COMMENTS_THREAD_URL.format(api_server), + json={ + "threadKey": thread_key, + "params": { + "language": language, + "targets": [thread] + }, + "additionals": { + "res_from": -1000, + "when": comments_from } - - response = session.post( - COMMENTS_THREAD_URL.format(api_server), - json=payload, - headers=headers - ) - data = response.json() - - # Handle API errors, if any, with appropriate actions - error_code = data["meta"]["errorCode"] if "meta" in data and "errorCode" in data["meta"] else None - if error_code is not None: - if error_code == "TOO_MANY_REQUESTS": - output("Rate limited - waiting 60 seconds...\n", logging.INFO) - time.sleep(COMMENTS_THREAD_COOLDOWN_S) - continue - elif error_code == "EXPIRED_TOKEN": - output("Refreshing thread key...\n", logging.INFO) - refresh_thread_key(session, video_id) - continue - elif error_code == "INVALID_TOKEN": - raise AuthenticationException - else: - raise ParameterExtractionException(error_code) - - # Update global comments count if higher - global_comment_count = data["data"]["globalComments"][0]["count"] - if global_comment_count and global_comment_count > COMMENTS_DATA_JSON["globalComments"][0]["count"]: - COMMENTS_DATA_JSON["globalComments"][0]["count"] = global_comment_count - COMMENTS_DATA_JSON["globalComments"][0]["id"] = thread["id"] - - # Update thread comment count if first fetch - if fetched_comments_count == 0: - thread_data["commentCount"] = data["data"]["threads"][0]["commentCount"] - # If requesting all comments, we use the video's total comment count - if _CMDL_OPTS.request_all_comments: - total_comments = thread_data["commentCount"] - else: - # Otherwise, we use the limit provided or the video's total comment count, whichever is lower - total_comments = min(thread_data["commentCount"], comments_limit) - progress.update(task, total=total_comments) - progress.update(task, description=f"Downloading {thread['fork']} comments") - - thread_comments = data["data"]["threads"][0]["comments"] - - if not thread_comments: - # There are no comments before lastTime to fetch - break - - # If we got the same comments as last time, we should stop - if previous_comments and len(thread_comments) == len(previous_comments): - same_comments = True - for i in range(len(thread_comments)): - if thread_comments[i]["id"] != previous_comments[i]["id"]: - same_comments = False - break - if same_comments: - # We've reached the end of comments - break - - # Store current comments for next comparison - previous_comments = thread_comments.copy() - - # Append new comments and update last_time - thread_data["comments"].extend(thread_comments) - last_time = int(datetime.fromisoformat(thread_comments[0]["postedAt"]).timestamp()) - - fetched_comments_count += len(thread_comments) - progress.update(task, completed=fetched_comments_count) - - time.sleep(COMMENTS_THREAD_INTERVAL_S) - - except Exception as e: - output(f"Error fetching comments {e}\n", logging.ERROR) - break + }, + headers={ + **API_HEADERS, + "content-type": "text/plain;charset=UTF-8", + "x-client-os-type": "others", + } + ) + response_data = response.json() + + # Handle API error codes + error_code = response_data["meta"]["errorCode"] if "meta" in response_data and "errorCode" in response_data["meta"] else None + if error_code is not None: + if error_code == "TOO_MANY_REQUESTS": + output(f"Rate limit hit. Sleeping for f{COMMENTS_THREAD_COOLDOWN_S} seconds...\n", logging.INFO) + time.sleep(COMMENTS_THREAD_COOLDOWN_S) + continue + elif error_code == "EXPIRED_TOKEN": + output("Thread key expired. Refreshing...\n", logging.INFO) + refresh_thread_key(session, video_id) + continue + elif error_code == "INVALID_TOKEN": + raise AuthenticationException("Comment thread key was invalid") + else: + raise ParameterExtractionException(error_code) + + # Specify our target end total + if not progress.tasks[task_id].total: + thread_data["commentCount"] = response_data["data"]["threads"][0]["commentCount"] + # If requesting all comments, specify the total as the thread's comment count + if not comments_limit: + progress.update(task_id, total=thread_data["commentCount"], visible=True) + # Otherwise, specify the total as the smaller of the thread's comment count or the specified limit + else: + progress.update(task_id, total=min(thread_data["commentCount"], comments_limit), visible=True) + + # If no comments are retrieved, stop the thread immediately + thread_comments = response_data["data"]["threads"][0]["comments"] + if not thread_comments: + break + + # Append new comments and update progress + thread_data["comments"].extend(thread_comments) + comments_from = int(datetime.fromisoformat(thread_comments[0]["postedAt"]).timestamp()) + progress.advance(task_id, advance=len(thread_comments)) + + time.sleep(COMMENTS_THREAD_INTERVAL_S) + + thread_data["retrievedCount"] = len(thread_data["comments"]) + comments_data["threads"].append(thread_data) + def refresh_thread_key(session: requests.Session, video_id: str) -> str: """Refresh the thread key for the comments API""" + url = THREAD_REFRESH_API.format(video_id) headers = { @@ -2198,6 +2162,7 @@ def refresh_thread_key(session: requests.Session, video_id: str) -> str: response.raise_for_status() return response.json()["data"]["threadKey"] + def add_metadata_to_container(filename: AnyStr, template_params: dict): """Add metadata to any MP4 container.""" @@ -2434,8 +2399,8 @@ def main(): "--username/--password, --session-cookie, or --netrc.\n", logging.WARNING) if (_CMDL_OPTS.comments_limit or _CMDL_OPTS.request_all_comments or _CMDL_OPTS.comments_from) and not _CMDL_OPTS.download_comments: - output("Comment downloading qualifiers --comments-limit, --request-all-comments, or --comments-from were specified, but --download-comments was not. " - "Did you forget to set --download-comments?\n", logging.WARNING) + output("Comment downloading qualifiers (--comments-limit, --request-all-comments, or --comments-from) were specified, but --download-comments was not. " + "No comments will be downloaded.\n", logging.WARNING) session = login(account_username, account_password, session_cookie) From a8cc766f6d299461790e275af1abd9c52d9a4b9f Mon Sep 17 00:00:00 2001 From: AlexAplin Date: Thu, 22 May 2025 01:52:56 -0400 Subject: [PATCH 14/17] Standardize outputs --- nndownload/nndownload.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 6bbae3f..2b485a9 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -2016,11 +2016,11 @@ def download_video_comments( ): """Download the video comments.""" - output("Downloading comments for {0}...\n".format(template_params["id"]), logging.INFO) - if _CMDL_OPTS.no_login: raise AuthenticationException("Downloading comments is not possible when -g/--no-login is specified. Please login or provide a session cookie") + output("Downloading comments for {0}...\n".format(template_params["id"]), logging.INFO) + filename = replace_extension(filename, "comments.json") if os.path.exists(filename): @@ -2061,7 +2061,7 @@ def download_video_comments( with open(filename, "w", encoding="utf-8") as file: json.dump(comments_data, file, indent=None, ensure_ascii=False, sort_keys=True) - output("Saved comments for {0} to {1}.\n".format(template_params["id"], filename), logging.INFO) + output("Finished downloading comments for {0}.\n".format(template_params["id"]), logging.INFO) def fetch_comments_thread( From a86d82d69260ce811f9b14518cb38a445119fd18 Mon Sep 17 00:00:00 2001 From: AlexAplin Date: Thu, 22 May 2025 01:54:11 -0400 Subject: [PATCH 15/17] Update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cf8ee65..bdbcf20 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ download options: download video thumbnail -c, --download-comments download video comments - --comments-limit N number of comments to download (default: 1000) + --comments-limit N number of comments to download per thread (default: 1000) --comments-from DATETIME_OR_TIMESTAMP only download comments posted before a specified time: - Unix timestamp (e.g., 1686787200) From 5d8ef2e2a6bdd9f4aa8cb96d9dd0a5960dc7b137 Mon Sep 17 00:00:00 2001 From: AlexAplin Date: Thu, 22 May 2025 01:56:43 -0400 Subject: [PATCH 16/17] Clarify constant --- nndownload/nndownload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 2b485a9..8388d67 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -185,7 +185,7 @@ PONG_FRAME = json.loads("""{"type":"pong"}""") -MIN_DATE = datetime(2007, 3, 3).replace(tzinfo=timezone.utc) # constant taken from `comment-zouryou` +MIN_DATE = datetime(2007, 3, 3).replace(tzinfo=timezone.utc) # Constant taken from comment-zouryou (likely coincides with the γ launch on 2007-03-06): https://github.com/tanbatu/comment-zouryou MAX_DATE = datetime.now(timezone.utc) logger = logging.getLogger(__name__) From 7235f510dddbd3b06894ba9e49dc10b19ebc7f6b Mon Sep 17 00:00:00 2001 From: Ebisuzawa Kurumi Date: Sat, 11 Apr 2026 17:20:52 +0800 Subject: [PATCH 17/17] fix: address issues --- nndownload/nndownload.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/nndownload/nndownload.py b/nndownload/nndownload.py index 8388d67..1e9da20 100644 --- a/nndownload/nndownload.py +++ b/nndownload/nndownload.py @@ -11,6 +11,8 @@ import netrc import os import random +import contextlib +import copy import re import shutil import string @@ -2042,11 +2044,12 @@ def download_video_comments( else: output(f"Requesting up to {comments_limit} comments on each thread.\n", logging.INFO) # Defaults to 1000 + comments_data = copy.deepcopy(COMMENTS_DATA_JSON) + with Progress(TextColumn("{task.description}"), BarColumn(), TaskProgressColumn(), TextColumn("{task.completed}/{task.total}"), transient=True) as progress: tasks = [] - for thread in template_params["thread_params"]["targets"]: - comments_data = COMMENTS_DATA_JSON - thread = threading.Thread(target=fetch_comments_thread, args=(session, template_params["id"], template_params["comment_server"], template_params["thread_key"], thread, template_params["thread_params"]["language"], progress, comments_data, comments_from, comments_limit)) + for thread_target in template_params["thread_params"]["targets"]: + thread = threading.Thread(target=fetch_comments_thread, args=(session, template_params["id"], template_params["comment_server"], template_params["thread_key"], thread_target, template_params["thread_params"]["language"], progress, comments_data, comments_from, comments_limit)) thread.start() tasks.append({ "thread": thread, @@ -2075,7 +2078,7 @@ def fetch_comments_thread( comments_data: dict, comments_from: int, comments_limit: int = None, -) -> None: +) -> str: """Fetch comments for a specific comments thread.""" thread_data = { @@ -2111,12 +2114,12 @@ def fetch_comments_thread( error_code = response_data["meta"]["errorCode"] if "meta" in response_data and "errorCode" in response_data["meta"] else None if error_code is not None: if error_code == "TOO_MANY_REQUESTS": - output(f"Rate limit hit. Sleeping for f{COMMENTS_THREAD_COOLDOWN_S} seconds...\n", logging.INFO) + output(f"Rate limit hit. Sleeping for {COMMENTS_THREAD_COOLDOWN_S} seconds...\n", logging.INFO) time.sleep(COMMENTS_THREAD_COOLDOWN_S) continue elif error_code == "EXPIRED_TOKEN": output("Thread key expired. Refreshing...\n", logging.INFO) - refresh_thread_key(session, video_id) + thread_key = refresh_thread_key(session, video_id) continue elif error_code == "INVALID_TOKEN": raise AuthenticationException("Comment thread key was invalid") @@ -2143,6 +2146,10 @@ def fetch_comments_thread( comments_from = int(datetime.fromisoformat(thread_comments[0]["postedAt"]).timestamp()) progress.advance(task_id, advance=len(thread_comments)) + # Stop if we've reached the requested limit + if comments_limit and len(thread_data["comments"]) >= comments_limit: + break + time.sleep(COMMENTS_THREAD_INTERVAL_S) thread_data["retrievedCount"] = len(thread_data["comments"])