Skip to content

[fix] Improve datetime handling in YahooNormalize #1959

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 23 additions & 5 deletions scripts/data_collector/yahoo/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
from qlib.tests.data import GetData
from qlib.utils import code_to_fname, fname_to_code, exists_qlib_data
from qlib.constant import REG_CN as REGION_CN
from qlib.constant import REG_US as REGION_US


CUR_DIR = Path(__file__).resolve().parent
sys.path.append(str(CUR_DIR.parent.parent))
Expand Down Expand Up @@ -371,7 +373,7 @@ class YahooNormalize(BaseNormalize):
@staticmethod
def calc_change(df: pd.DataFrame, last_close: float) -> pd.Series:
df = df.copy()
_tmp_series = df["close"].fillna(method="ffill")
_tmp_series = df["close"].ffill()
_tmp_shift_series = _tmp_series.shift(1)
if last_close is not None:
_tmp_shift_series.iloc[0] = float(last_close)
Expand All @@ -392,8 +394,24 @@ def normalize_yahoo(
columns = copy.deepcopy(YahooNormalize.COLUMNS)
df = df.copy()
df.set_index(date_field_name, inplace=True)
df.index = pd.to_datetime(df.index)
df.index = df.index.tz_localize(None)
try:
df.index = pd.to_datetime(df.index, format='%Y-%m-%d')
except ValueError:
# If the standard format fails, try with mixed format to handle different datetime formats
# Use utc=True to handle mixed timezones and avoid FutureWarning
try:
df.index = pd.to_datetime(df.index, format='mixed', utc=True)
except Exception:
# Fallback: try without format specification but with utc=True
df.index = pd.to_datetime(df.index, utc=True)

# Handle timezone localization properly
if hasattr(df.index, 'tz') and df.index.tz is not None:
# If the index is timezone-aware, convert it to UTC and drop the timezone info (result is tz-naive)
df.index = df.index.tz_convert(None)
elif hasattr(df.index, 'tz_localize'):
# If index is timezone-naive, ensure it stays that way
df.index = df.index.tz_localize(None)
df = df[~df.index.duplicated(keep="first")]
if calendar_list is not None:
df = df.reindex(
Expand Down Expand Up @@ -459,7 +477,7 @@ def adjusted_price(self, df: pd.DataFrame) -> pd.DataFrame:
df.set_index(self._date_field_name, inplace=True)
if "adjclose" in df:
df["factor"] = df["adjclose"] / df["close"]
df["factor"] = df["factor"].fillna(method="ffill")
df["factor"] = df["factor"].ffill()
else:
df["factor"] = 1
for _col in self.COLUMNS:
Expand Down Expand Up @@ -725,7 +743,7 @@ def symbol_to_yahoo(self, symbol):


class Run(BaseRun):
def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_CN):
def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d", region=REGION_US):
"""

Parameters
Expand Down