Commit ae827c67 authored by Arnaud Campeas's avatar Arnaud Campeas
Browse files

get_delta: experimental high level api to help backtesting

This builds a view of any ts where the more recent data
are excluded according to the closeness between the
insertion date and the value date.

auc: use at your own peril :)
     it is mostly a sugared version of .get_history with
     a weird twist
     I'm reluctant to include this since I've a hard time
     explaining it in lay terms. Probably because we lack
     decent use cases in the test suite.
parent dedb6cbf6d9a
......@@ -944,3 +944,146 @@ def test_long_name(engine, tsh):
name = 'a' * 64
tsh.insert(engine, serie, name, 'babar')
assert tsh.get(engine, name) is not None
def test_get_delta(engine, tsh):
# Compares tsh.get_delta with tsh.get_history on series that are
# inserted several times under explicit insertion dates.
#
# First scenario: four hourly insertion dates (00:00 to 03:00 UTC on
# 2015-01-01); at each one a series built with freq='H' and repeat=7,
# starting at the insertion date, is stored as 'republication'.
# (genserie is defined elsewhere; the first snapshot of the expected
# history below shows 7 hourly points valued 0.0 to 6.0.)
for idate in pd.DatetimeIndex(start=utcdt(2015, 1, 1),
end=utcdt(2015, 1, 1, 3),
freq='H'):
ts = genserie(start=idate, freq='H', repeat=7)
tsh.insert(engine, ts, 'republication', 'test',
_insertion_date=idate)
# The unfiltered history lists the series once per insertion date.
hist = tsh.get_history(engine, 'republication')
assert_df("""
insertion_date value_date
2015-01-01 00:00:00+00:00 2015-01-01 00:00:00+00:00 0.0
2015-01-01 01:00:00+00:00 1.0
2015-01-01 02:00:00+00:00 2.0
2015-01-01 03:00:00+00:00 3.0
2015-01-01 04:00:00+00:00 4.0
2015-01-01 05:00:00+00:00 5.0
2015-01-01 06:00:00+00:00 6.0
2015-01-01 01:00:00+00:00 2015-01-01 00:00:00+00:00 0.0
2015-01-01 01:00:00+00:00 0.0
2015-01-01 02:00:00+00:00 1.0
2015-01-01 03:00:00+00:00 2.0
2015-01-01 04:00:00+00:00 3.0
2015-01-01 05:00:00+00:00 4.0
2015-01-01 06:00:00+00:00 5.0
2015-01-01 07:00:00+00:00 6.0
2015-01-01 02:00:00+00:00 2015-01-01 00:00:00+00:00 0.0
2015-01-01 01:00:00+00:00 0.0
2015-01-01 02:00:00+00:00 0.0
2015-01-01 03:00:00+00:00 1.0
2015-01-01 04:00:00+00:00 2.0
2015-01-01 05:00:00+00:00 3.0
2015-01-01 06:00:00+00:00 4.0
2015-01-01 07:00:00+00:00 5.0
2015-01-01 08:00:00+00:00 6.0
2015-01-01 03:00:00+00:00 2015-01-01 00:00:00+00:00 0.0
2015-01-01 01:00:00+00:00 0.0
2015-01-01 02:00:00+00:00 0.0
2015-01-01 03:00:00+00:00 0.0
2015-01-01 04:00:00+00:00 1.0
2015-01-01 05:00:00+00:00 2.0
2015-01-01 06:00:00+00:00 3.0
2015-01-01 07:00:00+00:00 4.0
2015-01-01 08:00:00+00:00 5.0
2015-01-01 09:00:00+00:00 6.0
""", hist)
# get_delta with a 3 hour delta: a single value per value date, starting
# at 03:00. Each expected value matches the history entry of the latest
# insertion made at least 3 hours before the value date (compare with
# the full history above).
deltas = tsh.get_delta(engine, 'republication', delta=timedelta(hours=3))
assert_df("""
2015-01-01 03:00:00+00:00 3.0
2015-01-01 04:00:00+00:00 3.0
2015-01-01 05:00:00+00:00 3.0
2015-01-01 06:00:00+00:00 3.0
2015-01-01 07:00:00+00:00 4.0
2015-01-01 08:00:00+00:00 5.0
2015-01-01 09:00:00+00:00 6.0
""", deltas)
# Same check with a 5 hour delta: the result now starts at 05:00.
deltas = tsh.get_delta(engine, 'republication', delta=timedelta(hours=5))
assert_df("""
2015-01-01 05:00:00+00:00 5.0
2015-01-01 06:00:00+00:00 5.0
2015-01-01 07:00:00+00:00 5.0
2015-01-01 08:00:00+00:00 5.0
2015-01-01 09:00:00+00:00 6.0
""", deltas)
# get_history with deltabefore=-3h and deltaafter=3h: the expected frame
# holds one row per insertion date, whose value date is exactly 3 hours
# after the insertion date.
hist = tsh.get_history(engine, 'republication',
deltabefore=-timedelta(hours=3),
deltaafter=timedelta(hours=3))
assert_df("""
insertion_date value_date
2015-01-01 00:00:00+00:00 2015-01-01 03:00:00+00:00 3.0
2015-01-01 01:00:00+00:00 2015-01-01 04:00:00+00:00 3.0
2015-01-01 02:00:00+00:00 2015-01-01 05:00:00+00:00 3.0
2015-01-01 03:00:00+00:00 2015-01-01 06:00:00+00:00 3.0
""", hist)
# Same window check at 5 hours: value date is insertion date + 5 hours.
hist = tsh.get_history(engine, 'republication',
deltabefore=-timedelta(hours=5),
deltaafter=timedelta(hours=5))
assert_df("""
insertion_date value_date
2015-01-01 00:00:00+00:00 2015-01-01 05:00:00+00:00 5.0
2015-01-01 01:00:00+00:00 2015-01-01 06:00:00+00:00 5.0
2015-01-01 02:00:00+00:00 2015-01-01 07:00:00+00:00 5.0
2015-01-01 03:00:00+00:00 2015-01-01 08:00:00+00:00 5.0
""", hist)
# maybe a more interesting example: each day we insert 7 data points
# (one insertion per day from 2015-01-01 to 2015-01-04, stored as 'repu2')
# NOTE(review): idx from enumerate is never used in the loop body.
for idx, idate in enumerate(pd.DatetimeIndex(start=utcdt(2015, 1, 1),
end=utcdt(2015, 1, 4),
freq='D')):
ts = genserie(start=idate, freq='H', repeat=7)
tsh.insert(engine, ts, 'repu2', 'test', _insertion_date=idate)
# With a 3 hour delta the expected result keeps, for each day, only the
# points from 03:00 to 06:00; the points at 00:00, 01:00 and 02:00 are
# absent.
deltas = tsh.get_delta(engine, 'repu2', delta=timedelta(hours=3))
assert_df("""
2015-01-01 03:00:00+00:00 3.0
2015-01-01 04:00:00+00:00 4.0
2015-01-01 05:00:00+00:00 5.0
2015-01-01 06:00:00+00:00 6.0
2015-01-02 03:00:00+00:00 3.0
2015-01-02 04:00:00+00:00 4.0
2015-01-02 05:00:00+00:00 5.0
2015-01-02 06:00:00+00:00 6.0
2015-01-03 03:00:00+00:00 3.0
2015-01-03 04:00:00+00:00 4.0
2015-01-03 05:00:00+00:00 5.0
2015-01-03 06:00:00+00:00 6.0
2015-01-04 03:00:00+00:00 3.0
2015-01-04 04:00:00+00:00 4.0
2015-01-04 05:00:00+00:00 5.0
2015-01-04 06:00:00+00:00 6.0
""", deltas)
# which is basically the same as below: get_history with only
# deltabefore=-3h yields the same value dates and values, grouped by
# insertion date
hist = tsh.get_history(engine, 'repu2',
deltabefore=-timedelta(hours=3))
assert_df("""
insertion_date value_date
2015-01-01 00:00:00+00:00 2015-01-01 03:00:00+00:00 3.0
2015-01-01 04:00:00+00:00 4.0
2015-01-01 05:00:00+00:00 5.0
2015-01-01 06:00:00+00:00 6.0
2015-01-02 00:00:00+00:00 2015-01-02 03:00:00+00:00 3.0
2015-01-02 04:00:00+00:00 4.0
2015-01-02 05:00:00+00:00 5.0
2015-01-02 06:00:00+00:00 6.0
2015-01-03 00:00:00+00:00 2015-01-03 03:00:00+00:00 3.0
2015-01-03 04:00:00+00:00 4.0
2015-01-03 05:00:00+00:00 5.0
2015-01-03 06:00:00+00:00 6.0
2015-01-04 00:00:00+00:00 2015-01-04 03:00:00+00:00 3.0
2015-01-04 04:00:00+00:00 4.0
2015-01-04 05:00:00+00:00 5.0
2015-01-04 06:00:00+00:00 6.0
""", hist)
......@@ -184,6 +184,39 @@ class TimeSerie(SeriesServices):
serie.name = name
return serie
def get_delta(self, cn, name, delta):
    """Return, for each value date, the value from its latest insertion.

    The history is fetched with
    ``self.get_history(cn, name, deltabefore=-delta)`` and then collapsed:
    when a value date shows up under several insertion dates, only the
    entry with the greatest insertion date is kept.

    Args:
        cn: connection/engine handle, forwarded untouched to
            ``get_history``.
        name: series name; also the column read from the history.
        delta: offset (e.g. a ``timedelta``) whose negation is passed as
            ``deltabefore``.

    Returns:
        A pandas series named ``name``, indexed by value date (the index
        itself is left unnamed) and sorted by value date.
    """
    histo = self.get_history(
        cn, name, deltabefore=-delta
    )
    # turn the insertion_date / value_date index levels into plain columns
    df = histo.reset_index()
    # Within each value_date, order the entries by insertion_date and keep
    # only the last one, i.e. the most recently inserted value.  This
    # replaces the former per-row tuple index round trip with vectorized
    # pandas operations; the result is ordered by value_date as before.
    latest = (
        df.sort_values(['value_date', 'insertion_date'])
        .drop_duplicates(subset='value_date', keep='last')
    )
    # only the value_date information is kept in the index
    ts_select = latest.set_index('value_date')[name]
    # callers get an anonymous index, as with the previous implementation
    ts_select.index.name = None
    return ts_select
def exists(self, cn, name):
return self._get_ts_table(cn, name) is not None
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment