Skip to content

Commit

Permalink
添加cache加快本地文件遍历速度
Browse files (browse the repository at this point in the history)
  • Loading branch information
wukan1986 committed Jan 3, 2025
1 parent c69757e commit 47fbfcc
Show file tree
Hide file tree
Showing 14 changed files with 48 additions and 30 deletions.
2 changes: 1 addition & 1 deletion ddump/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.1"
__version__ = "0.2.2"
2 changes: 2 additions & 0 deletions ddump/api/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pathlib
import time
from functools import lru_cache

import pandas as pd

Expand All @@ -23,6 +24,7 @@ def start_end_2_name(start, end):
return f'{start}{START_SEP_END}{end}'


@lru_cache(maxsize=4)
def files_to_dataframe(path, suffix=FILE_SUFFIX):
"""目录中文件转DataFrame
Expand Down
4 changes: 3 additions & 1 deletion examples/jqresearch/get_fundamentals_season.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,17 @@ def main():
d = Dump__date(jqr, path, 'end_date')
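# First half: query in batches so it runs faster. NOTE(review): the original comment says "by week", but the loop below steps by quarter end (freq='QE') — confirm intent.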
end = pd.to_datetime('2023-01-15')
end = pd.to_datetime(datetime.today().date()) + pd.Timedelta(days=6)
start = pd.to_datetime('2024-06-01')
end = pd.to_datetime(datetime.today().date()) + pd.Timedelta(days=6)
start = pd.to_datetime(datetime.today().date()) - pd.Timedelta(days=123)

for dr in pd.date_range(start=start, end=end, freq='QE'):
q = f'{dr.year}q{dr.month // 3}'
d.set_parameters(func_name,
end_date=dr,
statDate=q)
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 150):
# print(dr, q)
d.download(kw=['statDate'])
d.save()

Expand Down
6 changes: 4 additions & 2 deletions examples/jqresearch/get_fundamentals_valuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@


def main():
end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
# 加载交易日历
trading_day = pd.read_parquet(DATA_ROOT_AKSHARE / 'tool_trade_date_hist_sina' / f'calendar.parquet')
trading_day = trading_day['trade_date']
trading_day.index = pd.to_datetime(trading_day)
# 过滤交易日
trading_day = trading_day['2024-12-01':end]
end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
start = f"{pd.to_datetime('today') - pd.Timedelta(days=32):%Y-%m-%d}"
trading_day = trading_day[start:end]

func_name = f'get_fundamentals_valuation'

Expand All @@ -25,6 +26,7 @@ def main():
for i, date in enumerate(trading_day):
d.set_parameters(func_name, date=f'{date:%Y-%m-%d}')
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
# print(date)
d.download(kw=['date'])
d.save()

Expand Down
9 changes: 6 additions & 3 deletions examples/jqresearch/get_index_weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,16 @@


def main():
# 上月底
end = f"{pd.to_datetime('today') + relativedelta(months=-1, day=31):%Y-%m-%d}"
# 加载交易日历
trading_day = pd.read_parquet(DATA_ROOT_AKSHARE / 'tool_trade_date_hist_sina' / f'calendar.parquet')
trading_day = trading_day['trade_date']
trading_day.index = pd.to_datetime(trading_day)
# 过滤交易日
trading_day = trading_day['2024-10-01':end]
# 上月底
end = f"{pd.to_datetime('today') + relativedelta(months=-1, day=31):%Y-%m-%d}"
start = f"{pd.to_datetime('today') + relativedelta(months=-3, day=31):%Y-%m-%d}"

trading_day = trading_day[start:end]
trading_day = trading_day.resample('ME').last()

func_name = f'get_index_weights'
Expand All @@ -37,6 +39,7 @@ def main():
for i, date in enumerate(trading_day):
d.set_parameters(func_name, index_id=index_id, date=f'{date:%Y-%m-%d}')
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 3):
# print(index_id, date)
d.download(kw=['index_id', 'date'])
d.save()

Expand Down
8 changes: 5 additions & 3 deletions examples/jqresearch/get_price_futures_daily.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def do_get_price(d, start_date, end_date, symbols, fields, fq):
start_date=f'{start_date:%Y-%m-%d}', end_date=f'{end_date:%Y-%m-%d}',
security=symbols.index.tolist(), fq=fq, panel=False, fields=fields)
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
# print(start_date, end_date)
d.download(kw=['start_date', 'end_date', 'security', 'fq', 'panel', 'fields'])
d.save()

Expand All @@ -41,6 +42,7 @@ def do_get_dominant_futures(d, date, end_date, symbols):
symbols=symbols,
date=f'{date:%Y-%m-%d}', end_date=f'{end_date:%Y-%m-%d}')
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
# print(date, end_date)
d.download(kw=['symbols', 'date', 'end_date'],
post_download=post_download_get_dominant_futures,
post_download_kwargs={'end_date': f'{end_date:%Y-%m-%d}'})
Expand All @@ -58,14 +60,14 @@ def main():
path2 = DATA_ROOT / f'get_dominant_futures'
d2 = Dump__start__end(jqr, path2, 'date', 'end_date')

end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
# 加载交易日历
trading_day = pd.read_parquet(DATA_ROOT_AKSHARE / 'tool_trade_date_hist_sina' / f'calendar.parquet')
trading_day = trading_day['trade_date']
trading_day.index = pd.to_datetime(trading_day)
# 过滤交易日
# end = f"2024-11-01"
trading_day = trading_day['2024-12-01':end]
end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
start = f"{pd.to_datetime('today') - pd.Timedelta(days=32):%Y-%m-%d}"
trading_day = trading_day[start:end]

# 只要跨月了就划分成两部分,实现指定月份也能加载不出错
start_list = []
Expand Down
7 changes: 4 additions & 3 deletions examples/jqresearch/get_price_futures_minute.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def do_get_price(d, start_date, end_date, symbols, fields, fq):
start_date=f'{start_date:%Y-%m-%d} 20:00:00', end_date=f'{end_date:%Y-%m-%d} 16:00:00',
security=symbols_list, fq=fq, panel=False, fields=fields, frequency='1m')
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 1):
# print(start_date, end_date)
sym_iter = more_itertools.batched(symbols_list, 300)
for syms in sym_iter:
d.set_parameters('get_price',
Expand All @@ -34,14 +35,14 @@ def main():
path = DATA_ROOT / f'get_price_{types}_minute'
d = Dump__start__end(jq, path, 'start_date', 'end_date')

end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
# 加载交易日历
trading_day = pd.read_parquet(DATA_ROOT_AKSHARE / 'tool_trade_date_hist_sina' / f'calendar.parquet')
trading_day = trading_day['trade_date']
trading_day.index = pd.to_datetime(trading_day)
# 过滤交易日
# end = f"2024-01-01"
trading_day = trading_day['2024-12-01':end]
end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
start = f"{pd.to_datetime('today') - pd.Timedelta(days=32):%Y-%m-%d}"
trading_day = trading_day[start:end]

# 只要跨月了就划分成两部分,实现指定月份也能加载不出错
start_list = []
Expand Down
7 changes: 4 additions & 3 deletions examples/jqresearch/get_price_index_daily.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def do_get_price(d, start_date, end_date, symbols, fields, fq):
start_date=f'{start_date:%Y-%m-%d}', end_date=f'{end_date:%Y-%m-%d}',
security=symbols, fq=fq, panel=False, fields=fields)
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
# print(start_date, end_date)
d.download(kw=['start_date', 'end_date', 'security', 'fq', 'panel', 'fields'])
d.save()

Expand All @@ -48,14 +49,14 @@ def main():
path1 = DATA_ROOT / f'get_price_{types}_daily'
d1 = Dump__start__end(jq, path1, 'start_date', 'end_date')

end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
# 加载交易日历
trading_day = pd.read_parquet(DATA_ROOT_AKSHARE / 'tool_trade_date_hist_sina' / f'calendar.parquet')
trading_day = trading_day['trade_date']
trading_day.index = pd.to_datetime(trading_day)
# 过滤交易日
# end = f"2024-11-01"
trading_day = trading_day['2024-12-01':end]
end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
start = f"{pd.to_datetime('today') - pd.Timedelta(days=32):%Y-%m-%d}"
trading_day = trading_day[start:end]

# 只要跨月了就划分成两部分,实现指定月份也能加载不出错
start_list = []
Expand Down
7 changes: 4 additions & 3 deletions examples/jqresearch/get_price_index_minute.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def do_get_price(d, start_date, end_date, symbols, fields, fq):
start_date=f'{start_date:%Y-%m-%d} 08:00:00', end_date=f'{end_date:%Y-%m-%d} 16:00:00',
security=symbols_list, fq=fq, panel=False, fields=fields, frequency='1m')
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
# print(start_date, end_date)
sym_iter = more_itertools.batched(symbols_list, 500)
for syms in sym_iter:
d.set_parameters('get_price',
Expand Down Expand Up @@ -56,14 +57,14 @@ def main():
path = DATA_ROOT / f'get_price_{types}_minute'
d = Dump__start__end(jq, path, 'start_date', 'end_date')

end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
# 加载交易日历
trading_day = pd.read_parquet(DATA_ROOT_AKSHARE / 'tool_trade_date_hist_sina' / f'calendar.parquet')
trading_day = trading_day['trade_date']
trading_day.index = pd.to_datetime(trading_day)
# 过滤交易日
# end = f"2024-04-30"
trading_day = trading_day['2024-12-01':end]
end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
start = f"{pd.to_datetime('today') - pd.Timedelta(days=32):%Y-%m-%d}"
trading_day = trading_day[start:end]

# 只要跨月了就划分成两部分,实现指定月份也能加载不出错
start_list = []
Expand Down
9 changes: 6 additions & 3 deletions examples/jqresearch/get_price_stock_daily.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def do_get_price(d, start_date, end_date, symbols, fields, fq):
start_date=f'{start_date:%Y-%m-%d}', end_date=f'{end_date:%Y-%m-%d}',
security=symbols.index.tolist(), fq=fq, panel=False, fields=fields)
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
# print(start_date, end_date)
d.download(kw=['start_date', 'end_date', 'security', 'fq', 'panel', 'fields'])
d.save()

Expand All @@ -42,6 +43,7 @@ def do_get_extras(d, start_date, end_date, symbols, info):
start_date=f'{start_date:%Y-%m-%d}', end_date=f'{end_date:%Y-%m-%d}',
security_list=symbols.index.tolist(), df=True)
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
# print(start_date, end_date)
d.download(kw=['info', 'start_date', 'end_date', 'security_list', 'df'])
d.save(pre_save=save_func_get_extras)

Expand Down Expand Up @@ -74,6 +76,7 @@ def do_get_industry(d, start_date, end_date, symbols):
date=f'{end_date:%Y-%m-%d}',
security=symbols.index.tolist())
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 2):
# print(start_date, end_date)
d.download(kw=['security', 'date'])
d.save(pre_save=save_func_get_industry, pre_save_kwargs={'date': end_date})

Expand All @@ -91,14 +94,14 @@ def main():
d3 = Dump__start__end(jq, path3, 'start_date', 'end_date')
d4 = Dump__start__end(jq, path4, 'start_date', 'end_date')

end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
# 加载交易日历
trading_day = pd.read_parquet(DATA_ROOT_AKSHARE / 'tool_trade_date_hist_sina' / f'calendar.parquet')
trading_day = trading_day['trade_date']
trading_day.index = pd.to_datetime(trading_day)
# 过滤交易日
# end = f"2024-11-01"
trading_day = trading_day['2024-12-01':end]
end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
start = f"{pd.to_datetime('today') - pd.Timedelta(days=32):%Y-%m-%d}"
trading_day = trading_day[start:end]

# 只要跨月了就划分成两部分,实现指定月份也能加载不出错
start_list = []
Expand Down
7 changes: 4 additions & 3 deletions examples/jqresearch/get_price_stock_minute.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def do_get_price(d, start_date, end_date, symbols, fields, fq):
start_date=f'{start_date:%Y-%m-%d} 08:00:00', end_date=f'{end_date:%Y-%m-%d} 16:00:00',
security=symbols_list, fq=fq, panel=False, fields=fields, frequency='1m')
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 1):
# print(start_date, end_date)
sym_iter = more_itertools.batched(symbols_list, 400)
for syms in sym_iter:
d.set_parameters('get_price',
Expand All @@ -41,14 +42,14 @@ def main():
path = DATA_ROOT / f'get_price_{types}_minute'
d = Dump__start__end(jq, path, 'start_date', 'end_date')

end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
# 加载交易日历
trading_day = pd.read_parquet(DATA_ROOT_AKSHARE / 'tool_trade_date_hist_sina' / f'calendar.parquet')
trading_day = trading_day['trade_date']
trading_day.index = pd.to_datetime(trading_day)
# 过滤交易日
# end = f"2024-01-01"
trading_day = trading_day['2024-12-01':end]
end = f"{pd.to_datetime('today') - pd.Timedelta(hours=15, minutes=30):%Y-%m-%d}"
start = f"{pd.to_datetime('today') - pd.Timedelta(days=32):%Y-%m-%d}"
trading_day = trading_day[start:end]

# 只要跨月了就划分成两部分,实现指定月份也能加载不出错
start_list = []
Expand Down
4 changes: 2 additions & 2 deletions examples/jqresearch/get_stk_sheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ def main():
d = Dump__date(jqr, path, 'end_date')
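# First half: query in batches so it runs faster. NOTE(review): the original comment says "by week", but the loop below steps by quarter end (freq='QE') — confirm intent.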
end = pd.to_datetime(datetime.today().date()) + pd.Timedelta(days=91)
start = pd.to_datetime('2014-12-20')
start = pd.to_datetime('2023-12-20')
start = pd.to_datetime(datetime.today().date()) - pd.Timedelta(days=183)

for dr in pd.date_range(start=start, end=end, freq='QE'):
q = f'{dr:%Y-%m-%d}'
d.set_parameters(func_name,
end_date=dr,
pub_date=q)
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 90):
# print(dr, q)
d.download(kw=['pub_date'])
d.save()

Expand Down
4 changes: 2 additions & 2 deletions examples/jqresearch/get_stk_xr_xd.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ def main():
d = Dump__date(jqr, path, 'end_date')
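# First half: query in batches so it runs faster. NOTE(review): the original comment says "by week", but the loop below steps by quarter end (freq='QE') — confirm intent.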
end = pd.to_datetime(datetime.today().date()) + pd.Timedelta(days=91)
start = pd.to_datetime('2014-12-20')
start = pd.to_datetime('2023-12-20')
start = pd.to_datetime(datetime.today().date()) - pd.Timedelta(days=183)

for dr in pd.date_range(start=start, end=end, freq='QE'):
q = f'{dr:%Y-%m-%d}'
d.set_parameters(func_name,
end_date=dr,
board_plan_pub_date=q)
if not d.exists(file_timeout=3600 * 6, data_timeout=86400 * 90):
# print(dr, q)
d.download(kw=['board_plan_pub_date'])
d.save()

Expand Down
2 changes: 1 addition & 1 deletion examples/preprocessing/run_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@

prp.merge_jqresearch.main()
prp.remove_tmp.main()
#

prp.step1.main()
prp.step2.main()

0 comments on commit 47fbfcc

Please sign in to comment.