#!/usr/bin/env python3
#
# Weitian LI
# Created: 2016-06-25
# Updated: 2016-06-26
#
# Change logs:
# 2016-06-26:
#   * Allow missing data columns
#

"""
Read a table from a plain text file according to a data column
specification.

The "column" here means the position of a single character within a line,
while a "data column" means one meaningful datum that stretches across
multiple columns, e.g.,
------------------------------------------------------------
1234567890123456789012345678901234567890   -> column
   x    xxx.xx     xxx.x     xxxx
  xx      xx.x     xx.xx     xxxx
   |         |         |        |
 DC1       DC2       DC3      DC4          -> data column
------------------------------------------------------------

This table syntax is commonly used in optical astronomy, e.g., SDSS tables.


Sample configuration file:
------------------------------------------------------------
## Configuration for `read_table_colspec.py`
## Date: 2016-06-25

# input table file
infile =

# output file in CSV format
outfile =

# data column specification
# column number is 1-based, and end column is inclusive.
[colspec]
# name = col_begin, col_end, type, comment
ID = 1, 4, int
RA = 5, 15, float, deg
Dec = 16, 26, float, deg
redshift = 27, 34, float
name = 35, -1, str
------------------------------------------------------------
"""

import re
import argparse
import csv
from pydoc import locate


# Matches lines that carry no data: blank lines and lines whose first
# non-whitespace character is "#".  Compiled once instead of per line.
_SKIP_LINE = re.compile(r"\s*(?:#.*)?$")


def parse_colspec(config):
    """
    Parse the data column specification from the config file.

    Parameters:
        config: mapping of data column name to a sequence
                `(col_begin, col_end, type[, comment...])`, i.e., the
                `[colspec]` section of the configuration file.
                Column numbers are 1-based and `col_end` is inclusive;
                a `col_end` of -1 means "up to the end of the line".

    Returns:
        Parsed colspec in the syntax:
        [ (name, col_begin, col_end, type, comment), ... ]
        where `type` is the callable used to convert the text and
        `comment` is "" when not given.

    Raises:
        ValueError: if a specification has fewer than 3 fields, has
                    non-integer or out-of-range column numbers, or names
                    a type that cannot be resolved to a callable.

    NOTE: the returned `col_begin` and `col_end` are converted to be
          0-based (`col_end` stays inclusive, and -1 is kept as is).
    """
    colspec = []
    for name, spec in config.items():
        # A scalar value (no commas) or a too-short list cannot hold the
        # three mandatory fields.
        if not isinstance(spec, (list, tuple)) or len(spec) < 3:
            raise ValueError(
                "colspec '%s': expected 'col_begin, col_end, type"
                "[, comment]', got %r" % (name, spec))
        col_begin, col_end = int(spec[0]), int(spec[1])
        if col_begin < 1:
            raise ValueError(
                "colspec '%s': col_begin must be >= 1, got %d"
                % (name, col_begin))
        if col_end != -1 and col_end < col_begin:
            raise ValueError(
                "colspec '%s': col_end (%d) is before col_begin (%d)"
                % (name, col_end, col_begin))
        # Convert column number to be 0-based
        col_begin -= 1
        if col_end != -1:
            col_end -= 1
        # Cast from string to type
        # Credit: https://stackoverflow.com/a/29831586/4856091
        type_name = spec[2].strip()
        t = locate(type_name)
        # `locate()` returns None (or a non-callable object) for an unknown
        # name; fail here rather than on the first data line.
        if not callable(t):
            raise ValueError(
                "colspec '%s': unknown type '%s'" % (name, type_name))
        # A comment containing commas is split into several fields by the
        # config parser; join them back instead of dropping the comment.
        comment = ", ".join(spec[3:])
        colspec.append((name, col_begin, col_end, t, comment))
    return colspec


def parse_line(line, colspec):
    """
    Parse the given line according to the data column specification.

    Parameters:
        line: one line of the table (without the line terminator)
        colspec: parsed specification as returned by `parse_colspec()`

    Returns:
        Parsed values in the syntax:
        [ (name, value, comment), ... ]
        where `value` is None if the text of that data column cannot be
        converted to its type (e.g., the data is missing).
    """
    items = []
    for name, col_begin, col_end, t, comment in colspec:
        # `col_end` is inclusive; -1 means "through the end of the line"
        stop = None if col_end == -1 else col_end + 1
        text = line[col_begin:stop].strip()
        try:
            value = t(text)
        except ValueError:
            # Missing or malformed data column
            value = None
        items.append((name, value, comment))
    return items


def main():
    """
    Command line entry point: read the config file, parse the input table
    and write the data columns to the output CSV file.
    """
    # Imported here since only `main()` needs it; this keeps the parsing
    # functions importable without the third-party `configobj` package.
    from configobj import ConfigObj

    parser = argparse.ArgumentParser(
        description="Read table data by column specification")
    parser.add_argument("config", nargs=1,
                        help="config of input, output, column specification")
    args = parser.parse_args()

    config = ConfigObj(args.config[0])
    colspec = parse_colspec(config["colspec"])
    # output column header
    header = [spec[0] if spec[-1] == "" else "%s[%s]" % (spec[0], spec[-1])
              for spec in colspec]

    # Read the input before opening the output, so that an unreadable
    # input does not truncate an existing output file.
    with open(config["infile"]) as table_file:
        lines = table_file.read().splitlines()

    # `newline=""` is required by the csv module to avoid extra blank
    # lines on platforms that translate "\n".
    with open(config["outfile"], "w", newline="") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)
        # data lines
        for line in lines:
            if _SKIP_LINE.match(line):
                # ignore blank lines and comment lines
                continue
            items = parse_line(line, colspec)
            csv_writer.writerow([value for __, value, __ in items])


if __name__ == "__main__":
    main()