fg21sim/utils/ds.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

# Copyright (c) 2017 Weitian LI <weitian@aaronly.me>
# MIT license

"""
Data structure/set utilities.
"""

import logging
from collections.abc import Iterable

import pandas as pd


# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


def _flatten_list(l):
    """
    Flatten an arbitrarily nested list.

    Credit
    ------
    * Flatten (an irregular) list of lists
      https://stackoverflow.com/a/2158532
    """
    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from _flatten_list(el)
        else:
            yield el


def dictlist_to_dataframe(dictlist, keys=None):
    """
    Convert data given as a list of dictionaries into a Pandas
    DataFrame by flattening the dictionary keys into columns.

    NOTE
    ----
    If the item ``key`` of the dictionary has a list/vector as its value,
    then it is split into multiple columns named ``key[0], key[1], ...``.
    A nested list/vector is flattened first, so it contributes one column
    per leaf element.

    The column layout is derived from the first dictionary only; every
    other dictionary is expected to have the same structure.

    Parameters
    ----------
    dictlist : list[dict]
        The input data to be converted: a list of dictionaries, with
        every member dictionary having the same format/structure.
        NOTE: The dictionary may have items with lists/vectors as values,
              but more complex items (e.g., nested dictionaries) are not
              supported.
    keys : list[str], optional
        The dictionary items to be selected for conversion.
        Default: convert all dictionary items.

    Returns
    -------
    dataframe : `~pandas.DataFrame`
        The converted Pandas DataFrame, with the dictionary item keys
        as its columns.  If ``dictlist`` is empty, an empty DataFrame
        is returned (with ``keys`` as its columns when given).
    """
    if keys is not None:
        # ``keys`` is walked once for the header and once per row, so a
        # one-shot iterable must be materialized up front.
        keys = list(keys)

    if len(dictlist) == 0:
        # No first record to derive the column layout from.
        logger.warning("DataFrame conversion: empty input data")
        return pd.DataFrame(columns=keys)

    d0 = dictlist[0]
    if keys is None:
        keys = list(d0.keys())
    logger.info("DataFrame conversion selected keys: %s", keys)

    columns = []
    for k in keys:
        v = d0[k]
        if isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
            # Count the leaves after flattening, since the row data below
            # is flattened the same way; ``len(v)`` would undercount
            # nested vectors and break the column/data alignment.
            nleaves = sum(1 for _ in _flatten_list(v))
            columns.extend("%s[%d]" % (k, i) for i in range(nleaves))
        else:
            columns.append(k)
    logger.info("DataFrame number of columns: %d", len(columns))
    logger.debug("DataFrame columns: %s", columns)

    # One flattened row per dictionary, in the order given by ``keys``.
    data = [list(_flatten_list([d[k] for k in keys])) for d in dictlist]
    return pd.DataFrame(data, columns=columns)