From 97f35b721c43d22af1ca7fd238ba97de32555dbf Mon Sep 17 00:00:00 2001 From: ValueRaider Date: Mon, 13 May 2024 20:45:57 +0100 Subject: [PATCH] Price repair: improve 'sudden change' repair for splits & currency Original logic for repairing missing split adjustment only checked latest split. Improved logic checks ALL splits in data, because any can be missing. Then related changes to 'sudden change detection': - use prices median not mean, reduce sensitivity to noise. - handle Kuwait Dinar, which sub-divides into 1000x not 100x. --- yfinance/scrapers/history.py | 54 +++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/yfinance/scrapers/history.py b/yfinance/scrapers/history.py index a4bd380aa..f7c01f147 100644 --- a/yfinance/scrapers/history.py +++ b/yfinance/scrapers/history.py @@ -338,7 +338,7 @@ def history(self, period="1mo", interval="1d", # Do this before auto/back adjust logger.debug(f'{self.ticker}: checking OHLC for repairs ...') df = self._fix_unit_mixups(df, interval, tz_exchange, prepost) - df = self._fix_bad_stock_split(df, interval, tz_exchange) + df = self._fix_bad_stock_splits(df, interval, tz_exchange) # Must repair 100x and split errors before price reconstruction df = self._fix_zeroes(df, interval, tz_exchange, prepost) df = self._fix_missing_div_adjust(df, interval, tz_exchange) @@ -981,7 +981,12 @@ def _fix_unit_switch(self, df, interval, tz_exchange): # This function fixes the second. # Eventually Yahoo fixes but could take them 2 weeks. - return self._fix_prices_sudden_change(df, interval, tz_exchange, 100.0) + if self._history_metadata['currency'] == 'KWF': + # Kuwaiti Dinar divided into 1000 not 100 + n = 1000 + else: + n = 100 + return self._fix_prices_sudden_change(df, interval, tz_exchange, n) @utils.log_indent_decorator def _fix_zeroes(self, df, interval, tz_exchange, prepost): @@ -1171,9 +1176,12 @@ def _fix_missing_div_adjust(self, df, interval, tz_exchange): return df2 @utils.log_indent_decorator - def _fix_bad_stock_split(self, df, interval, tz_exchange): - # Repair idea is to look for BIG daily price changes that closely match the - # most recent stock split ratio. This indicates Yahoo failed to apply a new + def _fix_bad_stock_splits(self, df, interval, tz_exchange): + # Original logic only considered latest split adjustment could be missing, but + # actually **any** split adjustment can be missing. So check all splits in df. + # + # Improved logic looks for BIG daily price changes that closely match the + # **nearest future** stock split ratio. This indicates Yahoo failed to apply a new # stock split to old price data. # # There is a slight complication, because Yahoo does another stupid thing. @@ -1190,22 +1198,28 @@ def _fix_bad_stock_split(self, df, interval, tz_exchange): if not interday: return df - # Find the most recent stock split - df = df.sort_index(ascending=False) + df = df.sort_index() # scan splits oldest -> newest split_f = df['Stock Splits'].to_numpy() != 0 if not split_f.any(): logger.debug('price-repair-split: No splits in data') return df - most_recent_split_day = df.index[split_f].max() - split = df.loc[most_recent_split_day, 'Stock Splits'] - if most_recent_split_day == df.index[0]: - logger.info( - "price-repair-split: Need 1+ day of price data after split to determine true price. Won't repair") - return df - logger.debug(f'price-repair-split: Most recent split = {split:.4f} @ {most_recent_split_day.date()}') + for split_idx in np.where(split_f)[0]: + split_dt = df.index[split_idx] + split = df.loc[split_dt, 'Stock Splits'] + if split_dt == df.index[0]: + continue - return self._fix_prices_sudden_change(df, interval, tz_exchange, split, correct_volume=True) + cutoff_idx = min(df.shape[0], split_idx+1) # add one row after to detect big change + df_pre_split = df.iloc[0:cutoff_idx+1] + + df_pre_split_repaired = self._fix_prices_sudden_change(df_pre_split, interval, tz_exchange, split, correct_volume=True) + # Merge back in: + if cutoff_idx == df.shape[0]-1: + df = df_pre_split_repaired + else: + df = pd.concat([df_pre_split_repaired.sort_index(), df.iloc[cutoff_idx+1:]]) + return df @utils.log_indent_decorator def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_volume=False): @@ -1302,10 +1316,12 @@ def _fix_prices_sudden_change(self, df, interval, tz_exchange, change, correct_v # average change _1d_change_minx = np.average(_1d_change_x, axis=1) else: - # change nearest to 1.0 - diff = np.abs(_1d_change_x - 1.0) - j_indices = np.argmin(diff, axis=1) - _1d_change_minx = _1d_change_x[np.arange(n), j_indices] + # # change nearest to 1.0 + # diff = np.abs(_1d_change_x - 1.0) + # j_indices = np.argmin(diff, axis=1) + # _1d_change_minx = _1d_change_x[np.arange(n), j_indices] + # Still sensitive to extreme-low low. Try median: + _1d_change_minx = np.median(_1d_change_x, axis=1) f_na = np.isnan(_1d_change_minx) if f_na.any(): # Possible if data was too old for reconstruction.