From 383b4fb4bcfdca69db98042950df40f9a82ce4ec Mon Sep 17 00:00:00 2001 From: Qiao Wang Date: Fri, 20 Jun 2025 16:33:40 +0800 Subject: [PATCH 1/4] Fix yahoo collector for us stock --- scripts/data_collector/utils.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index f25b1ec7a2..d6f19b35c3 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -300,30 +300,41 @@ def get_us_stock_symbols(qlib_data_path: [str, Path] = None) -> list: global _US_SYMBOLS # pylint: disable=W0603 @deco_retry - def _get_eastmoney(): - url = "http://4.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&fs=m:105,m:106,m:107&fields=f12" + def _get_eastmoney_by_page(page): + page_size = 100 + url = f"http://4.push2.eastmoney.com/api/qt/clist/get?pn={page}&pz={page_size}&fs=m:105,m:106,m:107&fields=f12" resp = requests.get(url, timeout=None) if resp.status_code != 200: raise ValueError("request error") - try: _symbols = [_v["f12"].replace("_", "-P") for _v in resp.json()["data"]["diff"].values()] + return _symbols except Exception as e: logger.warning(f"request error: {e}") raise + @deco_retry + def _get_eastmoney(): + _symbols = [] + page = 1 + while True: + current_symbols = _get_eastmoney_by_page(page) + if not current_symbols: + break + if len(current_symbols) < 100: + break # last page + _symbols.extend(current_symbols) + page += 1 if len(_symbols) < 8000: raise ValueError("request error") - return _symbols @deco_retry def _get_nasdaq(): _res_symbols = [] for _name in ["otherlisted", "nasdaqtraded"]: - url = f"ftp://ftp.nasdaqtrader.com/SymbolDirectory/{_name}.txt" + url = f"https://www.nasdaqtrader.com/dynamic/SymDir/{_name}.txt" df = pd.read_csv(url, sep="|") - df = df.rename(columns={"ACT Symbol": "Symbol"}) _symbols = df["Symbol"].dropna() _symbols = _symbols.str.replace("$", "-P", regex=False) _symbols = _symbols.str.replace(".W", "-WT", regex=False) From e6c539a6983902dbcc81d64ce6d2f3c053c0a361 Mon Sep 17 00:00:00 2001 From: Qiao Wang Date: Fri, 20 Jun 2025 17:20:14 +0800 Subject: [PATCH 2/4] Fix --- scripts/data_collector/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index d6f19b35c3..ccd6887114 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -335,6 +335,7 @@ def _get_nasdaq(): for _name in ["otherlisted", "nasdaqtraded"]: url = f"https://www.nasdaqtrader.com/dynamic/SymDir/{_name}.txt" df = pd.read_csv(url, sep="|") + df = df.rename(columns={"ACT Symbol": "Symbol"}) _symbols = df["Symbol"].dropna() _symbols = _symbols.str.replace("$", "-P", regex=False) _symbols = _symbols.str.replace(".W", "-WT", regex=False) From b6db0ac9cfaa09615bb2b4c60266f22706c4001e Mon Sep 17 00:00:00 2001 From: Qiao Wang Date: Sat, 21 Jun 2025 11:19:51 +0800 Subject: [PATCH 3/4] Update _get_eastmoney --- scripts/data_collector/utils.py | 69 +++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index ccd6887114..8871ffce29 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -299,34 +299,61 @@ def get_us_stock_symbols(qlib_data_path: [str, Path] = None) -> list: """ global _US_SYMBOLS # pylint: disable=W0603 - @deco_retry - def _get_eastmoney_by_page(page): - page_size = 100 - url = f"http://4.push2.eastmoney.com/api/qt/clist/get?pn={page}&pz={page_size}&fs=m:105,m:106,m:107&fields=f12" - resp = requests.get(url, timeout=None) - if resp.status_code != 200: - raise ValueError("request error") - try: - _symbols = [_v["f12"].replace("_", "-P") for _v in resp.json()["data"]["diff"].values()] - return _symbols - except Exception as e: - logger.warning(f"request error: {e}") - raise - @deco_retry def _get_eastmoney(): + base_url = "http://4.push2.eastmoney.com/api/qt/clist/get" + params = { + "pn": 1, # page number + "pz": 100, # page size, default to 100 + "fs": "m:105,m:106,m:107", + "fields": "f12", + } + _symbols = [] page = 1 while True: - current_symbols = _get_eastmoney_by_page(page) - if not current_symbols: - break - if len(current_symbols) < 100: - break # last page - _symbols.extend(current_symbols) - page += 1 + params["pn"] = page + try: + resp = requests.get(base_url, params=params, timeout=None) + resp.raise_for_status() + data = resp.json() + + # Check if response contains valid data + if not data or "data" not in data or not data["data"] or "diff" not in data["data"]: + logger.warning(f"Invalid response structure on page {page}") + break + + # fetch the current page data + current_symbols = [_v["f12"] for _v in data["data"]["diff"].values()] + + if not current_symbols: # It's the last page if there is no data in current page + logger.info(f"Last page reached: {page - 1}") + break + + _symbols.extend(current_symbols) + + # show progress + logger.info( + f"Page {page}: fetch {len(current_symbols)} stocks:[{current_symbols[0]} ... {current_symbols[-1]}]" + ) + + page += 1 + + # sleep time to avoid overloading the server + time.sleep(0.5) + + except requests.exceptions.HTTPError as e: + raise requests.exceptions.HTTPError( + f"Request to {url} failed with status code {resp.status_code}" + ) from e + except Exception as e: + logger.warning("An error occurred while extracting data from the response.") + raise + + # If the number of symbols is less than the minimum required, raise an error if len(_symbols) < 8000: raise ValueError("request error") + return _symbols @deco_retry From efdf182712db00c13d98ce51194406d4252ecd30 Mon Sep 17 00:00:00 2001 From: Qiao Wang Date: Sat, 21 Jun 2025 15:05:27 +0800 Subject: [PATCH 4/4] Fix --- scripts/data_collector/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 8871ffce29..eeeb207968 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -344,7 +344,7 @@ def _get_eastmoney(): except requests.exceptions.HTTPError as e: raise requests.exceptions.HTTPError( - f"Request to {url} failed with status code {resp.status_code}" + f"Request to {base_url} failed with status code {resp.status_code}" ) from e except Exception as e: logger.warning("An error occurred while extracting data from the response.")