Commit 91bb05b4 authored by Aurélien Campéas's avatar Aurélien Campéas
Browse files

tsio: faster path for `get_history` (without deltas coupling between idates and vdates)

Let's look at the test_perf outputs (relevant ones):

BEFORE:

AVG (3)

TSH HIST 2.49
       class                                test      time
4  TimeSerie               manydiffs_history_all  3.43
5  TimeSerie            manydiffs_history_chunks  7.52
6  TimeSerie  manydiffs_history_chunks_valuedate  0.99


AFTER:

AVG (3)

TSH HIST 1.41
       class                                test      time
4  TimeSerie               manydiffs_history_all  3.04
5  TimeSerie            manydiffs_history_chunks  6.40
6  TimeSerie  manydiffs_history_chunks_valuedate  1.32


We get better numbers for the common bulky operations.
The last item would now be better served by a staircase query
(if it has any meaning at all).


Closes #44.
parent 8615e02e317a
......@@ -2,7 +2,7 @@ import pandas as pd
import zlib
from sqlalchemy import Table, Column, Integer, ForeignKey
from sqlalchemy.sql.expression import select, desc
from sqlalchemy.sql.expression import select, asc, desc
from sqlalchemy.dialects.postgresql import BYTEA, TIMESTAMP
from tshistory.util import (
......@@ -246,3 +246,74 @@ class Snapshot(SeriesServices):
chunk = self.chunk(cid, from_value_date, to_value_date)
return csid, chunk
def allchunks(self, heads, from_value_date=None):
    """Fetch in one recursive query every chunk reachable from the
    given snapshot `heads` (chunk ids), following the parent links.

    Returns a {chunk id: (parent id, raw chunk)} mapping.
    When `from_value_date` is given, the recursion is pruned at
    chunks whose `end` value date falls before it, as they cannot
    contribute to the requested span.
    """
    # guard: an empty `in ()` clause is invalid postgresql, and
    # there is nothing to fetch anyway
    if not heads:
        return {}
    where = ''
    if from_value_date:
        where = 'where chunks.end >= %(start)s '
    # `heads` are integer chunk ids generated by the db, hence safe
    # to inline into the query text
    sql = """
        with recursive allchunks as (
            select chunks.id as cid,
                   chunks.parent as parent,
                   chunks.chunk as chunk
            from "{namespace}"."{table}" as chunks
            where chunks.id in ({heads})
          union
            select chunks.id as cid,
                   chunks.parent as parent,
                   chunks.chunk as chunk
            from "{namespace}"."{table}" as chunks
            join allchunks on chunks.id = allchunks.parent
            {where}
        )
        select cid, parent, chunk from allchunks
    """.format(namespace=self.namespace,
               table=self.name,
               heads=','.join(str(head) for head in heads),
               where=where)
    res = self.cn.execute(sql, start=from_value_date)
    return {
        cid: (parent, rawchunk)
        for cid, parent, rawchunk in res.fetchall()
    }
def findall(self, revs, from_value_date, to_value_date):
    """Reconstruct the series for all the given revisions in bulk.

    `revs` is a list of (csid, idate) pairs, csid being None for
    revisions carrying no data.  Returns a list of
    (idate, series-or-None) pairs, one per input revision, each
    series restricted to [from_value_date, to_value_date].
    """
    csets = [rev for rev, _ in revs if rev is not None]
    # guard: min()/max() below would raise on an empty list;
    # without any data-bearing revision the answer is all-None
    if not csets:
        return [(idate, None) for _, idate in revs]
    # map each changeset id to its snapshot head (chunk id)
    cset = self.tsh.schema.changeset
    serie = self.tsh._get_ts_table(self.cn, self.seriename)
    sql = select([serie.c.cset, serie.c.snapshot]
    ).order_by(asc(serie.c.id)
    ).select_from(serie.join(cset)
    ).where(cset.c.id >= min(csets)
    ).where(cset.c.id <= max(csets))
    cset_snap_map = {
        row.cset: row.snapshot
        for row in self.cn.execute(sql).fetchall()
    }
    # grab all the needed chunks in a single recursive query
    rawchunks = self.allchunks(
        sorted(cset_snap_map.values()),
        from_value_date
    )
    series = []
    # `csid`, not `cset`: avoid shadowing the changeset table above
    # (also consistent with the caller-side naming)
    for csid, idate in revs:
        if csid is None:
            series.append((idate, None))
            continue
        # walk the parent links from the snapshot head, collecting
        # raw chunks until the chain ends (or was pruned by
        # allchunks' from_value_date cutoff)
        chunks = []
        head = cset_snap_map[csid]
        while True:
            parent, chunk = rawchunks.get(head, (None, None))
            if chunk is None:
                break
            chunks.append(chunk)
            head = parent
        series.append(
            (idate, subset(self._chunks_to_ts(chunks),
                           from_value_date,
                           to_value_date))
        )
    return series
......@@ -174,20 +174,25 @@ class TimeSerie(SeriesServices):
snapshot = Snapshot(cn, self, seriename)
series = []
for csid, idate in revs:
if (deltabefore, deltaafter) != (None, None):
if (deltabefore, deltaafter) != (None, None):
for csid, idate in revs:
from_value_date = None
to_value_date = None
if deltabefore is not None:
from_value_date = idate - deltabefore
if deltaafter is not None:
to_value_date = idate + deltaafter
series.append((
idate,
snapshot.find(csetfilter=[lambda cset: cset.c.id == csid],
from_value_date=from_value_date,
to_value_date=to_value_date)[1]
))
series.append((
idate,
snapshot.find(csetfilter=[lambda cset: cset.c.id == csid],
from_value_date=from_value_date,
to_value_date=to_value_date)[1]
))
else:
series = snapshot.findall(revs,
from_value_date,
to_value_date
)
if diffmode:
diffs = []
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment