diff options
Diffstat (limited to 'fg21sim')
| -rw-r--r-- | fg21sim/utils/ds.py | 82 | 
1 files changed, 82 insertions, 0 deletions
# Copyright (c) 2017 Weitian LI <weitian@aaronly.me>
# MIT license

"""
Data structure/set utilities.
"""

import logging
# NOTE: the ABC aliases in ``collections`` were removed in Python 3.10;
# ``Iterable`` must be imported from ``collections.abc``.
from collections.abc import Iterable

import pandas as pd


logger = logging.getLogger(__name__)


def _is_vector(value):
    """
    Tell whether ``value`` should be expanded element by element.

    Strings and bytes are iterable but are treated as scalar values.
    """
    return isinstance(value, Iterable) and not isinstance(value, (str, bytes))


def _flatten_list(l):
    """
    Flatten an arbitrarily nested list.

    Parameters
    ----------
    l : iterable
        The (possibly nested) iterable to flatten.  Strings and bytes
        are yielded as whole items rather than being split into
        characters.

    Yields
    ------
    el
        The non-iterable leaf items, in depth-first order.

    Credit
    ------
    * Flatten (an irregular) list of lists
      https://stackoverflow.com/a/2158532
    """
    for el in l:
        if _is_vector(el):
            yield from _flatten_list(el)
        else:
            yield el


def dictlist_to_dataframe(dictlist, keys=None):
    """
    Convert data given as a list of dictionaries into a Pandas
    DataFrame by flattening the dictionary keys into columns.

    NOTE
    ----
    If the item ``key`` of the dictionary has a list/vector value,
    it is split into multiple columns named ``key[0], key[1], ...``.
    The column layout is derived from the first dictionary only, so
    every dictionary is expected to have the same structure.

    Parameters
    ----------
    dictlist : list[dict]
        The input data to be converted: a list of dictionaries, each
        with the same format/structure.
        NOTE: The dictionary may have items with list/vector values;
              nested vectors are flattened in depth-first order, but
              more complex items (e.g., nested dictionaries) are not
              supported.
    keys : list[str], optional
        The dictionary items to be selected for conversion.
        Default: convert all dictionary items.

    Returns
    -------
    dataframe : `~pandas.DataFrame`
        The converted Pandas DataFrame whose columns are the (expanded)
        dictionary item keys.  If ``dictlist`` is empty, an empty
        DataFrame is returned whose columns are the given ``keys``
        (or no columns when ``keys`` is None).

    Raises
    ------
    KeyError
        If a selected key is missing from one of the dictionaries.
    """
    # ``len()`` rather than truthiness, so array-like inputs also work.
    if len(dictlist) == 0:
        # Without a first record the vector items cannot be expanded,
        # so the plain keys are the best available column names.
        return pd.DataFrame(columns=[] if keys is None else list(keys))

    d0 = dictlist[0]
    if keys is None:
        keys = list(d0.keys())
    logger.info("DataFrame conversion selected keys: %s", keys)

    columns = []
    for k in keys:
        v = d0[k]
        if _is_vector(v):
            # Count the flattened leaves (not ``len(v)``) so the number
            # of columns always matches the flattened row data below.
            nleaves = sum(1 for _ in _flatten_list(v))
            columns.extend("%s[%d]" % (k, i) for i in range(nleaves))
        else:
            columns.append(k)
    logger.info("DataFrame number of columns: %d", len(columns))
    logger.debug("DataFrame columns: %s", columns)

    data = [list(_flatten_list(d[k] for k in keys)) for d in dictlist]
    return pd.DataFrame(data, columns=columns)
