author    Aaron LI <aaronly.me@outlook.com>   2016-06-25 20:20:59 +0800
committer Aaron LI <aaronly.me@outlook.com>   2016-06-25 20:20:59 +0800
commit    8eabb89d0fb55e70be3d619a08288e051ff00367 (patch)
tree      c8a09d185d082bfd35d28905aae3616483430fcc /read_table_colspec.py
parent    e856955776a2fc072068cf44be7b1b5f999e1eed (diff)
download  cexcess-8eabb89d0fb55e70be3d619a08288e051ff00367.tar.bz2
Add read_table_colspec.py: convert optical data table to CSV format
Diffstat (limited to 'read_table_colspec.py')
-rwxr-xr-x  read_table_colspec.py  133
1 file changed, 133 insertions, 0 deletions
diff --git a/read_table_colspec.py b/read_table_colspec.py
new file mode 100755
index 0000000..f49026e
--- /dev/null
+++ b/read_table_colspec.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+#
+# Weitian LI
+# Created: 2016-06-25
+# Updated: 2016-06-25
+#
+
+"""
+Read a table from a plain text file according to a data column
+specification.
+
+Here, "column" refers to the character position within a line, while
+a "data column" refers to a meaningful field that spans multiple
+columns, e.g.,
+------------------------------------------------------------
+1234567890123456789012345678901234567890 -> column
+ x xxx.xx xxx.x xxxx
+ xx xx.x xx.xx xxxx
+ | | | |
+ DC1 DC2 DC3 DC4 -> data column
+------------------------------------------------------------
+
+This table syntax is commonly used in optical astronomy, e.g., SDSS tables.
+
+
+Sample configuration file:
+------------------------------------------------------------
+## Configuration for `read_table_colspec.py`
+## Date: 2016-06-25
+
+# input table file
+infile = <INPUT.TXT>
+
+# output file in CSV format
+outfile = <OUTPUT.CSV>
+
+# data column specification
+# column numbers are 1-based, the end column is inclusive, and -1 means the end of the line.
+[colspec]
+ # name = col_begin, col_end, type, comment
+ ID = 1, 4, int
+ RA = 5, 15, float, deg
+ Dec = 16, 26, float, deg
+ redshift = 27, 34, float
+ name = 35, -1, str
+------------------------------------------------------------
+"""
+
+import re
+import argparse
+import csv
+from pydoc import locate
+
+from configobj import ConfigObj
+
+
+def parse_colspec(config):
+ """
+ Parse the data column specification from the config file.
+
+ Parsed colspec syntax:
+ [
+ (name, col_begin, col_end, type, comment),
+ ...
+ ]
+
+    NOTE: the above `col_begin` and `col_end` are converted to be 0-based.
+ """
+ colspec = []
+ for name, spec in config.items():
+ col_begin, col_end = int(spec[0]), int(spec[1])
+ # Convert column number to be 0-based
+ col_begin -= 1
+ if col_end != -1:
+ col_end -= 1
+        # Resolve the type object from its name string, e.g., "int" -> int
+ # Credit: https://stackoverflow.com/a/29831586/4856091
+ t = locate(spec[2])
+ if len(spec) == 4:
+ comment = spec[3]
+ else:
+ comment = ""
+ colspec.append((name, col_begin, col_end, t, comment))
+ return colspec
+
+
+def parse_line(line, colspec):
+ """
+ Parse the given line according to the data column specification.
+
+    Parsed value syntax:
+ [ (name, value, comment), ... ]
+ """
+ items = []
+ for name, col_begin, col_end, t, comment in colspec:
+ if col_end == -1:
+ value = line[col_begin:].strip()
+ else:
+ value = line[col_begin:(col_end+1)].strip()
+ items.append((name, t(value), comment))
+ return items
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Read table data by column specification")
+ parser.add_argument("config", nargs=1,
+ help="config of input, output, column specification")
+ args = parser.parse_args()
+
+ config = ConfigObj(args.config[0])
+ colspec = parse_colspec(config["colspec"])
+ # output column header
+ header = [spec[0] if spec[-1] == "" else "%s[%s]" % (spec[0], spec[-1])
+ for spec in colspec]
+    with open(config["outfile"], "w", newline="") as csv_file:
+ csv_writer = csv.writer(csv_file)
+ csv_writer.writerow(header)
+ # data lines
+ for line in open(config["infile"]).read().splitlines():
+ if re.match(r"^\s*$", line):
+ # ignore blank lines
+ continue
+            if re.match(r"^\s*#.*$", line):
+                # ignore comment lines
+                continue
+ items = parse_line(line, colspec)
+ values = [x[1] for x in items]
+ csv_writer.writerow(values)
+
+
+if __name__ == "__main__":
+ main()
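A minimal end-to-end run might look like the sketch below; the file names (sample.conf, optical.txt, optical.csv) and all table values are hypothetical, and `configobj` must be installed for the script to run.
------------------------------------------------------------
## sample.conf
infile = optical.txt
outfile = optical.csv
[colspec]
    # name = col_begin, col_end, type, comment
    ID   = 1, 4, int
    RA   = 5, 16, float, deg
    name = 17, -1, str

## optical.txt (fixed-width input table)
   1  123.456789 NGC 1234
   2   98.765432 NGC 5678

## run the script with the config file as its only argument
$ ./read_table_colspec.py sample.conf

## optical.csv (expected output)
ID,RA[deg],name
1,123.456789,NGC 1234
2,98.765432,NGC 5678
------------------------------------------------------------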
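For a quick, interpreter-level check of parse_colspec() and parse_line(), the short sketch below drives the two helpers directly; the column ranges and the sample line are again made up, and it assumes read_table_colspec.py is importable from the current directory (with `configobj` installed, since the module imports it at the top).
------------------------------------------------------------
from read_table_colspec import parse_colspec, parse_line

# Mimic the parsed `[colspec]` section as ConfigObj would hand it over:
# each value is a list of strings (col_begin, col_end, type[, comment]).
config = {
    "ID":   ["1", "4", "int"],
    "RA":   ["5", "16", "float", "deg"],
    "name": ["17", "-1", "str"],
}

colspec = parse_colspec(config)
# colspec == [("ID", 0, 3, int, ""),
#             ("RA", 4, 15, float, "deg"),
#             ("name", 16, -1, str, "")]

items = parse_line("   1  123.456789 NGC 1234", colspec)
# items == [("ID", 1, ""),
#           ("RA", 123.456789, "deg"),
#           ("name", "NGC 1234", "")]
------------------------------------------------------------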