diff options
-rwxr-xr-x | read_table_colspec.py | 133 |
1 files changed, 133 insertions, 0 deletions
#!/usr/bin/env python3
#
# Weitian LI
# Created: 2016-06-25
# Updated: 2016-06-25
#

"""
Read a table from a plain text file according to a data column
specification.

A "column" here is a single character position within a line, while a
"data column" is a meaningful datum that stretches over several
columns, e.g.,
------------------------------------------------------------
1234567890123456789012345678901234567890    -> column
   x   xxx.xx    xxx.x    xxxx
  xx    xx.x     xx.xx    xxxx
   |      |        |        |
  DC1    DC2      DC3      DC4              -> data column
------------------------------------------------------------

This table syntax is usually used in optical astronomy, e.g., SDSS tables.


Sample configuration file:
------------------------------------------------------------
## Configuration for `read_table_colspec.py`
## Date: 2016-06-25

# input table file
infile = <INPUT.TXT>

# output file in CSV format
outfile = <OUTPUT.CSV>

# data column specification
# column number is 1-based, and end column is inclusive.
[colspec]
    # name = col_begin, col_end, type, comment
    ID = 1, 4, int
    RA = 5, 15, float, deg
    Dec = 16, 26, float, deg
    redshift = 27, 34, float
    name = 35, -1, str
------------------------------------------------------------
"""

import re
import argparse
import csv
from pydoc import locate


# Lines that carry no data: blank lines and "#" comment lines.
_SKIP_LINE = re.compile(r"\s*(?:#.*)?$")


def parse_colspec(config):
    """
    Parse the data column specification from the config file.

    `config` is a mapping of data column name to a sequence of strings
    `(col_begin, col_end, type[, comment])`, with 1-based column numbers
    and an inclusive `col_end` (`-1` means "until the end of the line").

    Parsed colspec syntax:
        [
            (name, col_begin, col_end, type, comment),
            ...
        ]

    NOTE: the above `col_begin` and `col_end` are converted to be
          0-based (a `col_end` of `-1` is kept as is).

    Raises ValueError if a specification is incomplete, its begin column
    is not a positive number, or its type name cannot be resolved.
    """
    colspec = []
    for name, spec in config.items():
        # A value without any comma is a plain string rather than a list.
        if isinstance(spec, str) or len(spec) < 3:
            raise ValueError(
                "colspec '%s': expected 'col_begin, col_end, type"
                "[, comment]', got %r" % (name, spec))
        col_begin, col_end = int(spec[0]), int(spec[1])
        if col_begin < 1:
            # 0 would become index -1 and silently slice from the line end.
            raise ValueError(
                "colspec '%s': col_begin is 1-based, got %d"
                % (name, col_begin))
        # Convert column number to be 0-based
        col_begin -= 1
        if col_end != -1:
            col_end -= 1
        # Cast from string to type
        # Credit: https://stackoverflow.com/a/29831586/4856091
        t = locate(spec[2].strip())
        if not callable(t):
            # Fail here rather than with an obscure "'NoneType' object is
            # not callable" once the first data line gets parsed.
            raise ValueError(
                "colspec '%s': unknown type '%s'" % (name, spec[2]))
        # A comment that itself contains commas arrives split into several
        # list items; join them back instead of dropping the comment.
        comment = ", ".join(spec[3:])
        colspec.append((name, col_begin, col_end, t, comment))
    return colspec


def parse_line(line, colspec):
    """
    Parse the given line according to the data column specification.

    `colspec` is the list returned by `parse_colspec()`.

    Parsed value syntax:
        [ (name, value, comment), ... ]

    Each value is the whitespace-stripped text of its data column
    converted by the column's type; conversion errors (e.g. ValueError
    for an empty numeric field) propagate to the caller.
    """
    items = []
    for name, col_begin, col_end, t, comment in colspec:
        # -1 marks "until the end of the line"; otherwise the inclusive
        # 0-based end column becomes an exclusive slice bound.
        stop = None if col_end == -1 else col_end + 1
        value = line[col_begin:stop].strip()
        items.append((name, t(value), comment))
    return items


def convert_table(infile, outfile, colspec):
    """
    Convert the fixed-column table `infile` into the CSV file `outfile`.

    The first CSV row is the header: each data column name, followed by
    its comment in square brackets if it has one.  Blank lines and lines
    starting with "#" in the input are skipped.
    """
    header = [name if comment == "" else "%s[%s]" % (name, comment)
              for name, __, __, __, comment in colspec]
    # Open the input first, so that an unreadable input does not leave a
    # truncated output file behind.  The csv module requires newline=""
    # to control the row terminators itself.
    with open(infile) as table_file, \
            open(outfile, "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)
        # data lines
        for line in table_file:
            line = line.rstrip("\n")
            if _SKIP_LINE.match(line):
                # ignore blank lines and comment lines
                continue
            csv_writer.writerow([item[1] for item in
                                 parse_line(line, colspec)])


def main():
    """
    Command line entry point: read the config file given as the only
    argument and convert its `infile` into the CSV file `outfile`.
    """
    # Imported here so that the parsing helpers above stay importable
    # without the third-party `configobj` package.
    from configobj import ConfigObj

    parser = argparse.ArgumentParser(
        description="Read table data by column specification")
    parser.add_argument("config", nargs=1,
                        help="config of input, output, column specification")
    args = parser.parse_args()

    # file_error=True: report a missing config file as such, instead of
    # silently getting an empty config and a KeyError below.
    config = ConfigObj(args.config[0], file_error=True)
    colspec = parse_colspec(config["colspec"])
    convert_table(config["infile"], config["outfile"], colspec)


if __name__ == "__main__":
    main()