test_tsio.py 14.2 KB
Newer Older
1
2
3
4
5
6
7
# coding: utf-8
from pathlib import Path
from datetime import datetime
from dateutil import parser

import pandas as pd
import numpy as np
8
import pytest
9

10
from tshistory.tsio import TimeSerie
11
12
13
14

DATADIR = Path(__file__).parent / 'data'


15
16
17
18
19
20
21
def assert_group_equals(g1, g2):
    for (n1, s1), (n2, s2) in zip(sorted(g1.items()),
                                  sorted(g2.items())):
        assert n1 == n2
        assert s1.equals(s2)


22
23
24
25
def assert_df(expected, df):
    assert expected.strip() == df.to_string().strip()


26
def test_changeset(engine):
27
28
29
30
    # instantiate one time serie handler object
    tso = TimeSerie()

    index = pd.date_range(start=datetime(2017, 1, 1), freq='D', periods=3)
31
    data = [1., 2., 3.]
32
33
34

    with engine.connect() as cnx:
        with tso.newchangeset(cnx, 'babar'):
35
            tso.insert(cnx, pd.Series(data, index=index), 'ts_values')
36
            tso.insert(cnx, pd.Series(['a', 'b', 'c'], index=index), 'ts_othervalues')
37

38
39
40
41
    g = tso.get_group(engine, 'ts_values')
    g2 = tso.get_group(engine, 'ts_othervalues')
    assert_group_equals(g, g2)

42
43
44
    with pytest.raises(AssertionError):
        tso.insert(engine, pd.Series([2,3,4], index=index), 'ts_values')

45
46
47
48
49
    with engine.connect() as cnx:
        data.append(data.pop(0))
        with tso.newchangeset(cnx, 'celeste'):
            tso.insert(cnx, pd.Series(data, index=index), 'ts_values')
            # below should be a noop
50
            tso.insert(cnx, pd.Series(['a', 'b', 'c'], index=index), 'ts_othervalues')
51

52
53
54
    g = tso.get_group(engine, 'ts_values')
    assert ['ts_values'] == list(g.keys())

55
    assert_df("""
56
57
58
2017-01-01    2.0
2017-01-02    3.0
2017-01-03    1.0
59
""", tso.get(engine, 'ts_values'))
60

61
    assert_df("""
62
63
64
2017-01-01    a
2017-01-02    b
2017-01-03    c
65
""", tso.get(engine, 'ts_othervalues'))
66

67

68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def test_tstamp_roundtrip(engine):
    tso = TimeSerie()
    ts = pd.Series(range(4),
                   index=pd.date_range(datetime(2017, 10, 28, 23),
                                       freq='H', periods=4, tz='UTC')
    )
    ts.index = ts.index.tz_convert('Europe/Paris')

    assert_df("""
2017-10-29 01:00:00+02:00    0
2017-10-29 02:00:00+02:00    1
2017-10-29 02:00:00+01:00    2
2017-10-29 03:00:00+01:00    3
Freq: H
    """, ts)

    tso.insert(engine, ts, 'tztest', 'Babar')
    back = tso.get(engine, 'tztest')

    # though un localized we understand it's been normalized to utc
    assert_df("""
2017-10-28 23:00:00    0.0
2017-10-29 00:00:00    1.0
2017-10-29 01:00:00    2.0
2017-10-29 02:00:00    3.0
""", back)

    back.index = back.index.tz_localize('UTC')
    assert (ts.index == back.index).all()


99
def test_differential(engine):
100
101
    # instantiate one time serie handler object
    tso = TimeSerie()
102
103
104

    ts_begin = pd.Series(range(10))
    ts_begin.index = pd.date_range(start=datetime(2010, 1, 1), freq='D', periods=10)
105
    tso.insert(engine, ts_begin, 'ts_test', 'test')
106

107
    assert_df("""
108
109
110
111
112
113
114
115
116
117
2010-01-01    0.0
2010-01-02    1.0
2010-01-03    2.0
2010-01-04    3.0
2010-01-05    4.0
2010-01-06    5.0
2010-01-07    6.0
2010-01-08    7.0
2010-01-09    8.0
2010-01-10    9.0
118
""", tso.get(engine, 'ts_test'))
119
120

    # we should detect the emission of a message
121
    tso.insert(engine, ts_begin, 'ts_test', 'babar')
122

123
    assert_df("""
124
125
126
127
128
129
130
131
132
133
2010-01-01    0.0
2010-01-02    1.0
2010-01-03    2.0
2010-01-04    3.0
2010-01-05    4.0
2010-01-06    5.0
2010-01-07    6.0
2010-01-08    7.0
2010-01-09    8.0
2010-01-10    9.0
134
""", tso.get(engine, 'ts_test'))
135
136
137
138

    ts_slight_variation = ts_begin.copy()
    ts_slight_variation.iloc[3] = 0
    ts_slight_variation.iloc[6] = 0
139
    tso.insert(engine, ts_slight_variation, 'ts_test', 'celeste')
140

141
    assert_df("""
142
143
144
145
146
147
148
149
150
151
2010-01-01    0.0
2010-01-02    1.0
2010-01-03    2.0
2010-01-04    0.0
2010-01-05    4.0
2010-01-06    5.0
2010-01-07    0.0
2010-01-08    7.0
2010-01-09    8.0
2010-01-10    9.0
152
""", tso.get(engine, 'ts_test'))
153
154
155
156
157
158
159

    ts_longer = pd.Series(range(15))
    ts_longer.index = pd.date_range(start=datetime(2010, 1, 3), freq='D', periods=15)
    ts_longer.iloc[1] = 2.48
    ts_longer.iloc[3] = 3.14
    ts_longer.iloc[5] = ts_begin.iloc[7]

160
    tso.insert(engine, ts_longer, 'ts_test', 'test')
161

162
    assert_df("""
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
2010-01-01     0.00
2010-01-02     1.00
2010-01-03     0.00
2010-01-04     2.48
2010-01-05     2.00
2010-01-06     3.14
2010-01-07     4.00
2010-01-08     7.00
2010-01-09     6.00
2010-01-10     7.00
2010-01-11     8.00
2010-01-12     9.00
2010-01-13    10.00
2010-01-14    11.00
2010-01-15    12.00
2010-01-16    13.00
2010-01-17    14.00
180
""", tso.get(engine, 'ts_test'))
181
182
183
184
185

    # start testing manual overrides
    ts_begin = pd.Series([2] * 5)
    ts_begin.index = pd.date_range(start=datetime(2010, 1, 1), freq='D', periods=5)
    ts_begin.loc['2010-01-04'] = -1
186
    tso.insert(engine, ts_begin, 'ts_mixte', 'test')
187
188

    # -1 represents bogus upstream data
189
    assert_df("""
190
191
192
193
194
2010-01-01    2.0
2010-01-02    2.0
2010-01-03    2.0
2010-01-04   -1.0
2010-01-05    2.0
195
""", tso.get(engine, 'ts_mixte'))
196
197
198
199
200

    # refresh all the period + 1 extra data point
    ts_more = pd.Series([2] * 5)
    ts_more.index = pd.date_range(start=datetime(2010, 1, 2), freq='D', periods=5)
    ts_more.loc['2010-01-04'] = -1
201
    tso.insert(engine, ts_more, 'ts_mixte', 'test')
202

203
    assert_df("""
204
205
206
207
208
209
2010-01-01    2.0
2010-01-02    2.0
2010-01-03    2.0
2010-01-04   -1.0
2010-01-05    2.0
2010-01-06    2.0
210
""", tso.get(engine, 'ts_mixte'))
211
212
213
214

    # just append an extra data point
    ts_one_more = pd.Series([3])  # with no intersection with the previous ts
    ts_one_more.index = pd.date_range(start=datetime(2010, 1, 7), freq='D', periods=1)
215
    tso.insert(engine, ts_one_more, 'ts_mixte', 'test')
216

217
    assert_df("""
218
219
220
221
222
223
224
2010-01-01    2.0
2010-01-02    2.0
2010-01-03    2.0
2010-01-04   -1.0
2010-01-05    2.0
2010-01-06    2.0
2010-01-07    3.0
225
""", tso.get(engine, 'ts_mixte'))
226

227
    hist = pd.read_sql('select id, parent from timeserie.ts_test order by id',
228
                        engine)
229
    assert_df("""
230
231
232
233
   id  parent
0   1     NaN
1   2     1.0
2   3     2.0
234
""", hist)
235

236
    hist = pd.read_sql('select id, parent from timeserie.ts_mixte order by id',
237
                        engine)
238
    assert_df("""
239
240
241
242
   id  parent
0   1     NaN
1   2     1.0
2   3     2.0
243
""", hist)
244

245
    allts = pd.read_sql("select name, table_name from registry "
246
                        "where name in ('ts_test', 'ts_mixte')",
247
248
                        engine)

249
    assert_df("""
250
251
252
       name          table_name
0   ts_test   timeserie.ts_test
1  ts_mixte  timeserie.ts_mixte
253
""", allts)
254

255
    assert_df("""
256
257
258
259
260
261
262
2010-01-01    2.0
2010-01-02    2.0
2010-01-03    2.0
2010-01-04   -1.0
2010-01-05    2.0
2010-01-06    2.0
2010-01-07    3.0
263
264
""", tso.get(engine, 'ts_mixte',
             revision_date=datetime.now()))
265
266
267


def test_bad_import(engine):
268
269
270
    # instantiate one time serie handler object
    tso = TimeSerie()

271
272
273
274
275
    # the data were parsed as date by pd.read_json()
    df_result = pd.read_csv(DATADIR / 'test_data.csv')
    df_result['Gas Day'] = df_result['Gas Day'].apply(parser.parse, dayfirst=True, yearfirst=False)
    df_result.set_index('Gas Day', inplace=True)
    ts = df_result['SC']
276
277
278

    tso.insert(engine, ts, 'SND_SC', 'test')
    result = tso.get(engine, 'SND_SC')
279
    assert result.dtype == 'float64'
280
281
282

    # insertion of empty ts
    ts = pd.Series(name='truc', dtype='object')
283
284
    tso.insert(engine, ts, 'empty_ts', 'test')
    assert tso.get(engine, 'empty_ts') is None
285
286
287
288
289
290

    # nan in ts
    # all na
    ts = pd.Series([np.nan] * 10,
                   index=pd.date_range(start=datetime(2010, 1, 10),
                                       freq='D', periods=10), name='truc')
291
292
    tso.insert(engine, ts, 'test_nan', 'test')
    assert tso.get(engine, 'test_nan') is None
293
294
295
296
297

    # mixe na
    ts = pd.Series([np.nan] * 5 + [3] * 5,
                   index=pd.date_range(start=datetime(2010, 1, 10),
                                       freq='D', periods=10), name='truc')
298
299
    tso.insert(engine, ts, 'test_nan', 'test')
    result = tso.get(engine, 'test_nan')
300

301
302
    tso.insert(engine, ts, 'test_nan', 'test')
    result = tso.get(engine, 'test_nan')
303
    assert_df("""
304
305
306
307
308
2010-01-15    3.0
2010-01-16    3.0
2010-01-17    3.0
2010-01-18    3.0
2010-01-19    3.0
309
""", result)
310
311
312

    # get_ts with name not in database

313
    tso.get(engine, 'inexisting_name', 'test')
314
315
316


def test_revision_date(engine):
317
318
319
    # instantiate one time serie handler object
    tso = TimeSerie()

320
321
    idate1 = datetime(2015, 1, 1, 15, 43, 23)
    with tso.newchangeset(engine, 'test', _insertion_date=idate1):
322
323
324
325

        ts = pd.Series([1] * 4,
                       index=pd.date_range(start=datetime(2010, 1, 4),
                                           freq='D', periods=4), name='truc')
326
327
        tso.insert(engine, ts, 'ts_through_time')
        assert idate1 == tso.latest_insertion_date(engine, 'ts_through_time')
328

329
330
    idate2 = datetime(2015, 1, 2, 15, 43, 23)
    with tso.newchangeset(engine, 'test', _insertion_date=idate2):
331
332
333
334

        ts = pd.Series([2] * 4,
                       index=pd.date_range(start=datetime(2010, 1, 4),
                                           freq='D', periods=4), name='truc')
335
336
        tso.insert(engine, ts, 'ts_through_time')
        assert idate2 == tso.latest_insertion_date(engine, 'ts_through_time')
337

338
339
    idate3 = datetime(2015, 1, 3, 15, 43, 23)
    with tso.newchangeset(engine, 'test', _insertion_date=idate3):
340
341
342
343

        ts = pd.Series([3] * 4,
                       index=pd.date_range(start=datetime(2010, 1, 4),
                                           freq='D', periods=4), name='truc')
344
345
        tso.insert(engine, ts, 'ts_through_time')
        assert idate3 == tso.latest_insertion_date(engine, 'ts_through_time')
346

347
    ts = tso.get(engine, 'ts_through_time')
348

349
    assert_df("""
350
351
352
353
2010-01-04    3.0
2010-01-05    3.0
2010-01-06    3.0
2010-01-07    3.0
354
""", ts)
355

356
357
    ts = tso.get(engine, 'ts_through_time',
                 revision_date=datetime(2015, 1, 2, 18, 43, 23) )
358

359
    assert_df("""
360
361
362
363
2010-01-04    2.0
2010-01-05    2.0
2010-01-06    2.0
2010-01-07    2.0
364
""", ts)
365

366
367
    ts = tso.get(engine, 'ts_through_time',
                 revision_date=datetime(2015, 1, 1, 18, 43, 23))
368

369
    assert_df("""
370
371
372
373
2010-01-04    1.0
2010-01-05    1.0
2010-01-06    1.0
2010-01-07    1.0
374
""", ts)
375

376
377
    ts = tso.get(engine, 'ts_through_time',
                 revision_date=datetime(2014, 1, 1, 18, 43, 23))
378
379
380

    assert ts is None

381
382
383

def test_snapshots(engine):
    tso = TimeSerie()
384
    tso._snapshot_interval = 4
385
386

    with engine.connect() as cnx:
387
        for tscount in range(1, 11):
388
389
390
            ts = pd.Series([1] * tscount,
                           index=pd.date_range(datetime(2015, 1, 1),
                                               freq='D', periods=tscount))
391
392
393
394
395
            diff = tso.insert(cnx, ts, 'growing', 'babar')
            assert diff.index[0] == diff.index[-1] == ts.index[-1]

    diff = tso.insert(engine, ts, 'growing', 'babar')
    assert diff is None
396

397
398
    df = pd.read_sql("select id from timeserie.growing where snapshot is not null",
                     engine)
399
    assert_df("""
400
401
   id
0   1
402
403
404
1   4
2   8
3  10
405
""", df)
406
407

    ts = tso.get(engine, 'growing')
408
    assert_df("""
409
410
411
412
413
414
415
416
417
418
2015-01-01    1.0
2015-01-02    1.0
2015-01-03    1.0
2015-01-04    1.0
2015-01-05    1.0
2015-01-06    1.0
2015-01-07    1.0
2015-01-08    1.0
2015-01-09    1.0
2015-01-10    1.0
419
""", ts)
420

421
    df = pd.read_sql("select id, diff, snapshot from timeserie.growing order by id", engine)
422
423
424
    for attr in ('diff', 'snapshot'):
        df[attr] = df[attr].apply(lambda x: 0 if x is None else len(x))

425
    assert_df("""
426
   id  diff  snapshot
Arnaud Campeas's avatar
Arnaud Campeas committed
427
428
429
430
431
432
433
434
435
436
0   1     0        32
1   2    32         0
2   3    32         0
3   4    32       125
4   5    32         0
5   6    32         0
6   7    32         0
7   8    32       249
8   9    32         0
9  10    32       311
437
""", df)
438
439
440
441
442

    table = tso._get_ts_table(engine, 'growing')
    snapid, snap = tso._find_snapshot(engine, table, ())
    assert snapid == 10
    assert (ts == snap).all()
443
444
445
446
447
448
449
450
451
452
453
454
455
456


def test_deletion(engine):
    tso = TimeSerie()

    ts_begin = pd.Series(range(10))
    ts_begin.index = pd.date_range(start=datetime(2010, 1, 1), freq='D', periods=10)
    tso.insert(engine, ts_begin, 'ts_del', 'test')

    ts_begin.iloc[3] = np.nan
    ts_begin.iloc[0] = np.nan

    tso.insert(engine, ts_begin, 'ts_del', 'test')

457
    assert_df("""
458
459
460
461
462
463
464
2010-01-02    1.0
2010-01-03    2.0
2010-01-05    4.0
2010-01-06    5.0
2010-01-07    6.0
2010-01-08    7.0
2010-01-09    8.0
465
466
2010-01-10    9.0
""", tso.get(engine, 'ts_del'))
467
468
469
470
471
472

    ts_begin.iloc[0] = 42
    ts_begin.iloc[3] = 23

    tso.insert(engine, ts_begin, 'ts_del', 'test')

473
    assert_df("""
474
475
476
477
478
479
480
481
482
2010-01-01    42.0
2010-01-02     1.0
2010-01-03     2.0
2010-01-04    23.0
2010-01-05     4.0
2010-01-06     5.0
2010-01-07     6.0
2010-01-08     7.0
2010-01-09     8.0
483
484
2010-01-10     9.0
""", tso.get(engine, 'ts_del'))
485
486
487
488
489
490
491
492
493
494
495

    # now with string!

    ts_string = pd.Series(['machin'] * 10)
    ts_string.index = pd.date_range(start=datetime(2010, 1, 1), freq='D', periods=10)
    tso.insert(engine, ts_string, 'ts_string_del', 'test')

    ts_string[4] = None
    ts_string[5] = None

    tso.insert(engine, ts_string, 'ts_string_del', 'test')
496
    assert_df("""
497
498
499
500
501
502
503
504
2010-01-01    machin
2010-01-02    machin
2010-01-03    machin
2010-01-04    machin
2010-01-07    machin
2010-01-08    machin
2010-01-09    machin
2010-01-10    machin
505
""", tso.get(engine, 'ts_string_del'))
506
507
508
509
510

    ts_string[4] = 'truc'
    ts_string[6] = 'truc'

    tso.insert(engine, ts_string, 'ts_string_del', 'test')
511
    assert_df("""
512
513
514
515
516
517
518
519
2010-01-01    machin
2010-01-02    machin
2010-01-03    machin
2010-01-04    machin
2010-01-05      truc
2010-01-07      truc
2010-01-08    machin
2010-01-09    machin
520
521
2010-01-10    machin
""", tso.get(engine, 'ts_string_del'))
522
523
524
525
526
527
528
529

    # first insertion with only nan

    ts_begin = pd.Series([np.nan] * 10)
    ts_begin.index = pd.date_range(start=datetime(2010, 1, 1), freq='D', periods=10)
    tso.insert(engine, ts_begin, 'ts_null', 'test')

    assert tso.get(engine, 'ts_null') is None