aboutsummaryrefslogtreecommitdiffstats
path: root/fg21sim/utils/ds.py
blob: 694cbe3bcf9eb10a4a2223f8cbbc6da28737b057 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Copyright (c) 2017,2019 Weitian LI <wt@liwt.net>
# MIT License

"""
Data structure/set utilities.
"""

import logging

try:
    # The ABCs live in ``collections.abc``; the aliases in ``collections``
    # were removed in Python 3.10.
    from collections.abc import Iterable
except ImportError:  # only reachable on very old interpreters
    from collections import Iterable

import pandas as pd


logger = logging.getLogger(__name__)


def _flatten_list(l):
    """
    Flatten an arbitrarily nested list.

    Credit
    ------
    * Flatten (an irregular) list of lists
      https://stackoverflow.com/a/2158532
    """
    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
            yield from _flatten_list(el)
        else:
            yield el


def dictlist_to_dataframe(dictlist, keys=None):
    """
    Convert data in the format of a list of dictionaries into a Pandas
    DataFrame by flattening the dictionary keys into columns.

    NOTE
    ----
    If the item ``key`` of the dictionary has a list/vector as its value,
    then it is split into multiple columns named ``key[0], key[1], ...``.
    The column layout is derived from the first dictionary only.

    Parameters
    ----------
    dictlist : list[dict]
        The input data to be converted; a list of dictionaries, with
        each member dictionary having the same format/structure.
        NOTE: The dictionary may have items with list/vector values,
              but other more complex items (e.g., nested dictionaries)
              are not allowed or supported.
    keys : list[str], optional
        The list of dictionary items to be selected for conversion.
        Default: convert all dictionary items.

    Returns
    -------
    dataframe : `~pandas.DataFrame`
        The converted Pandas DataFrame whose columns are the (flattened)
        dictionary item keys.  For an empty ``dictlist`` an empty
        DataFrame is returned; its columns are the given ``keys`` as-is
        (no ``key[i]`` expansion, since there is no data to take the
        vector lengths from), or no columns if ``keys`` is omitted.
    """
    if keys is not None:
        # Materialize once: ``keys`` is iterated again for every row
        # below, which would exhaust a one-shot iterable.
        keys = list(keys)

    # ``len()`` instead of truthiness, so that array-like sequences whose
    # truth value is ambiguous can also be passed.
    if len(dictlist) == 0:
        return pd.DataFrame(columns=keys)

    d0 = dictlist[0]
    if keys is None:
        keys = list(d0.keys())
    logger.info("DataFrame conversion selected keys: %s", keys)

    # Derive the column names from the first dictionary.
    columns = []
    for k in keys:
        v = d0[k]
        if isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
            columns.extend("%s[%d]" % (k, i) for i in range(len(v)))
        else:
            columns.append(k)
    logger.info("DataFrame number of columns: %d", len(columns))
    logger.debug("DataFrame columns: %s", columns)

    # One flattened row per dictionary, in the order given by ``keys``.
    data = [list(_flatten_list([d[k] for k in keys])) for d in dictlist]

    return pd.DataFrame(data, columns=columns)


def pad_dict_list(d, keys, length, fill=None):
    """
    Extend the lists stored under ``keys`` so that each of them has at
    least ``length`` elements.

    A list that is shorter than ``length`` is replaced in the dictionary
    by a new list with ``fill`` appended as many times as needed; lists
    that are already long enough are left untouched.

    Parameters
    ----------
    d : dict
        The dictionary to be updated (modified in place).
    keys : list[str]
        The keys of the lists in the dictionary to be padded.
    length : int
        The expected (minimum) length of the lists.
    fill : optional
        The value used for the padded elements.
    """
    for key in keys:
        current = d[key]
        missing = length - len(current)
        if missing > 0:
            # Build a new list rather than extending the existing one.
            d[key] = current + [fill] * missing