aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fg21sim/utils/ds.py82
1 files changed, 82 insertions, 0 deletions
diff --git a/fg21sim/utils/ds.py b/fg21sim/utils/ds.py
new file mode 100644
index 0000000..8b69dbc
--- /dev/null
+++ b/fg21sim/utils/ds.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2017 Weitian LI <weitian@aaronly.me>
+# MIT license
+
+"""
+Data structure/set utilities.
+"""
+
+import logging
+from collections import Iterable
+
+import pandas as pd
+
+
+logger = logging.getLogger(__name__)
+
+
+def _flatten_list(l):
+ """
+ Flatten an arbitrarily nested list.
+
+ Credit
+ ------
+ * Flatten (an irregular) list of lists
+ https://stackoverflow.com/a/2158532
+ """
+ for el in l:
+ if isinstance(el, Iterable) and not isinstance(el, (str, bytes)):
+ yield from _flatten_list(el)
+ else:
+ yield el
+
+
+def dictlist_to_dataframe(dictlist, keys=None):
+ """
+ Convert the data in format of list of dictionaries to be a Pandas
+ DataFrame by flattening the dictionary keys into columns.
+
+ NOTE
+ ----
+ If the item ``key`` of the dictionary has value of a list/vector,
+ then it is split into multiple columns named as ``key[0], key[1], ...``.
+
+ Parameters
+ ----------
+ dictlist : list[dict]
+ The input data to be converted, is a list of dictionaries, with
+ each member dictionary has the same format/structure.
+ NOTE: The dictionary may have items with list/vector as the values,
+ but other more complex items (e.g., nested dictionary) is not
+ allowed and supported.
+ keys : list[str], optional
+ The list of dictionary items to be selected for conversion.
+ Default: convert all dictionary items.
+
+ Returns
+ -------
+ dataframe : `~pandas.DataFrame`
+ The converted Pandas DataFrame with columns be the dictionary
+ item keys.
+ """
+ d0 = dictlist[0]
+ if keys is None:
+ keys = list(d0.keys())
+ logger.info("DataFrame conversion selected keys: {0}".format(keys))
+
+ columns = []
+ for k in keys:
+ v = d0[k]
+ if isinstance(v, Iterable) and not isinstance(v, (str, bytes)):
+ columns += ["%s[%d]" % (k, i) for i in range(len(v))]
+ else:
+ columns.append(k)
+ logger.info("DataFrame number of columns: %d" % len(columns))
+ logger.debug("DataFrame columns: {0}".format(columns))
+
+ data = []
+ for d in dictlist:
+ dv = [d[k] for k in keys]
+ dv2 = list(_flatten_list(dv))
+ data.append(dv2)
+ dataframe = pd.DataFrame(data, columns=columns)
+ return dataframe