Source code for astropy.io.ascii.basic

# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""An extensible ASCII table reader and writer.

basic.py:
  Basic table read / write functionality for simple character
  delimited files with various options for column header definition.

:Copyright: Smithsonian Astrophysical Observatory (2011)
:Author: Tom Aldcroft (aldcroft@head.cfa.harvard.edu)
"""

import re

from . import core



[docs]
class BasicHeader(core.BaseHeader):
    """
    Header reader for basic character-delimited tables.

    Supplies defaults that suit common ASCII table layouts: the header is
    taken from line 0, and a line counts as a comment when ``#`` follows
    optional leading white space.
    """

    # Regular expression recognizing a comment line when reading.
    comment = r"\s*#"
    # Prefix placed in front of comment lines when writing.
    write_comment = "# "
    # Line index of the header line.
    start_line = 0




[docs]
class BasicData(core.BaseData):
    """
    Data reader for basic character-delimited tables.

    Supplies defaults that suit common ASCII table layouts: data begin at
    line 1, and a line counts as a comment when ``#`` follows optional
    leading white space.
    """

    # Regular expression recognizing a comment line when reading.
    comment = r"\s*#"
    # Prefix placed in front of comment lines when writing.
    write_comment = "# "
    # Line index of the first data line.
    start_line = 1




[docs]
class Basic(core.BaseReader):
    r"""Character-delimited table whose first line is a single header line.

    A line is treated as a comment when its first non-whitespace character
    is the comment character (default='#').

    Example table::

      # Column definition is the first uncommented line
      # Default delimiter is the space character.
      apples oranges pears

      # Data starts after the header column definition, blank lines ignored
      1 2 3
      4 5 6
    """

    # Header / data handlers used by this reader.
    header_class = BasicHeader
    data_class = BasicData

    # Format identification and I/O registry information.
    _format_name = "basic"
    _io_registry_format_aliases = ["ascii"]
    _description = "Basic table with custom delimiters"



class NoHeaderHeader(BasicHeader):
    """
    Header reader for tables that have no header line.

    The header start line is `None`; this is the signal to the basic reader
    that the table carries no header line.
    """

    # `None` means "there is no header line".
    start_line = None


class NoHeaderData(BasicData):
    """
    Data reader for tables that have no header line.

    With no header line present, the data begin at the first uncommented
    line.
    """

    # Data begin at line 0 because no header line precedes them.
    start_line = 0



[docs]
class NoHeader(Basic):
    """Character-delimited table without a header line.

    On reading, column names are generated automatically from
    header.auto_format, which defaults to "col%d".  In every other respect
    this reader is the same as the :class:`Basic` class it derives from.
    Example::

      # Table data
      1 2 "hello there"
      3 4 world

    """

    # Header / data handlers used by this reader.
    data_class = NoHeaderData
    header_class = NoHeaderHeader

    # Format identification.
    _description = "Basic table with no headers"
    _format_name = "no_header"



class CommentedHeaderHeader(BasicHeader):
    """
    Header reader for tables whose column definition line begins with the
    comment character.  An example is given in the :class:`CommentedHeader`
    class.
    """

    def process_lines(self, lines):
        """
        Yield the commented lines of ``lines`` with the comment prefix removed.

        Lines that do not match the ``comment`` regular expression at their
        start are skipped; for the others, only the text following the match
        is produced.
        """
        comment_re = re.compile(self.comment)
        for raw_line in lines:
            found = comment_re.match(raw_line)
            if found is None:
                # Not a comment line, so it cannot hold the column names.
                continue
            yield raw_line[found.end() :]

    def write(self, lines):
        """
        Append the column-name line, prefixed with ``write_comment``, to
        ``lines``.
        """
        prefix = self.write_comment
        joined_names = self.splitter.join(self.colnames)
        lines.append(prefix + joined_names)



[docs]
class CommentedHeader(Basic):
    """Character-delimited table with column names in a comment line.

    When reading, ``header_start`` can be used to specify the
    line index of column names, and it can be a negative index (for example -1
    for the last commented line).  The default delimiter is the <space>
    character.

    This matches the format produced by ``np.savetxt()``, with ``delimiter=','``,
    and ``header='<comma-delimited-column-names-list>'``.

    Example::

      # col1 col2 col3
      # Comment line
      1 2 3
      4 5 6

    """

    _format_name = "commented_header"
    _description = "Column names in a commented line"

    header_class = CommentedHeaderHeader
    data_class = NoHeaderData

    def read(self, table):
        """
        Read input data (file-like object, filename, list of strings, or
        single string) into a Table and return the result.

        The comment line that served as the header line is removed from
        ``meta["comments"]`` of the result.  If the header has no start line
        (``start_line`` is `None`, the marker for a table without a header
        line), no comment is removed.
        """
        out = super().read(table)

        # Strip off the comment line set as the header line for
        # commented_header format (first by default).
        header_idx = self.header.start_line
        if "comments" in out.meta and header_idx is not None:
            comments = out.meta["comments"]
            if header_idx < 0:
                # Convert an index counted from the end into a forward index.
                header_idx += len(comments)
            remaining = comments[:header_idx] + comments[header_idx + 1 :]
            if remaining:
                out.meta["comments"] = remaining
            else:
                # Do not leave an empty comment list behind in the metadata.
                del out.meta["comments"]

        return out

    def write_header(self, lines, meta):
        """
        Write comment lines after, rather than before, the header.
        """
        self.header.write(lines)
        self.header.write_comments(lines, meta)




class TabHeaderSplitter(core.DefaultSplitter):
    """Splitter using the tab character that leaves whitespace in place."""

    delimiter = "\t"

    def process_line(self, line):
        """Return ``line`` with a newline character appended."""
        terminated = line + "\n"
        return terminated


class TabDataSplitter(TabHeaderSplitter):
    """
    Tab splitter for data values.

    Whitespace is significant in TSV tables, so data values are not stripped.
    """

    skipinitialspace = False
    # No per-value processing, so surrounding whitespace is kept.
    process_val = None


class TabHeader(BasicHeader):
    """
    Header reader for tables whose header line is tab separated.
    """

    # Split the header line with the tab-based header splitter.
    splitter_class = TabHeaderSplitter


class TabData(BasicData):
    """
    Data reader for tables whose data lines are tab separated.
    """

    # Split data lines with the tab-based data splitter.
    splitter_class = TabDataSplitter



[docs]
class Tab(Basic):
    """Table with tab-separated values.

    In contrast to the :class:`Basic` reader, whitespace at the beginning and
    end of lines and of individual column values is kept rather than stripped.

    Example::

      col1 <tab> col2 <tab> col3
      # Comment line
      1 <tab> 2 <tab> 5

    """

    # Header / data handlers used by this reader.
    data_class = TabData
    header_class = TabHeader

    # Format identification.
    _description = "Basic table with tab-separated values"
    _format_name = "tab"



class CsvSplitter(core.DefaultSplitter):
    """
    Splitter for CSV (comma-separated-value) tables; fields are delimited
    by a comma.
    """

    delimiter = ","


class CsvHeader(BasicHeader):
    """
    Header reader built on the :class:`astropy.io.ascii.basic.CsvSplitter`.
    """

    # Comment handling is switched off for both reading and writing.
    write_comment = None
    comment = None
    splitter_class = CsvSplitter


class CsvData(BasicData):
    """
    Data reader built on the :class:`astropy.io.ascii.basic.CsvSplitter`.
    """

    # Comment handling is switched off for both reading and writing.
    write_comment = None
    comment = None
    # Pair the masked marker with the empty string ''.
    fill_values = [(core.masked, "")]
    splitter_class = CsvSplitter



[docs]
class Csv(Basic):
    """CSV (comma-separated-values) table.

    This file format may contain rows with fewer entries than the number of
    columns, a situation that occurs in output from some spreadsheet editors.
    The missing entries are marked as masked in the output table.

    Masked values (indicated by an empty '' field value when reading) are
    written out in the same way with an empty ('') field.  This is different
    from the typical default for `astropy.io.ascii` in which missing values are
    indicated by ``--``.

    By default leading or trailing whitespace in column names is stripped. If
    you pass ``strip_column_names=False`` then this is disabled.

    Since the `CSV format <https://tools.ietf.org/html/rfc4180>`_ does not
    formally support comments, any comments defined for the table via
    ``tbl.meta['comments']`` are ignored by default. If you would still like to
    write those comments then include a keyword ``comment='#'`` to the
    ``write()`` call.

    Example::

      num,ra,dec,radius,mag
      1,32.23222,10.1211
      2,38.12321,-88.1321,2.2,17.0

    """

    _format_name = "csv"
    _io_registry_format_aliases = ["csv"]
    _io_registry_can_write = True
    _io_registry_suffix = ".csv"
    _description = "Comma-separated-values"

    header_class = CsvHeader
    data_class = CsvData

    def __init__(self, *, strip_column_names=True):
        """
        Create the reader.

        Parameters
        ----------
        strip_column_names : bool, optional
            If `False`, the per-value processing of the header splitter is
            switched off so that column names keep their leading and
            trailing whitespace.  Default is `True`.
        """
        super().__init__()
        if not strip_column_names:
            # Without a value processor the column names are left as read.
            self.header.splitter.process_val = None

    def inconsistent_handler(self, str_vals, ncols):
        """
        Adjust row if it is too short.

        If a data row is shorter than the header, add empty values to make it the
        right length.
        Note that this will *not* be called if the row already matches the header.

        Parameters
        ----------
        str_vals : list
            A list of value strings from the current row of the table.
        ncols : int
            The expected number of entries from the table header.

        Returns
        -------
        str_vals : list
            List of strings to be parsed into data entries in the output table.
        """
        n_missing = ncols - len(str_vals)
        if n_missing > 0:
            # Pad in place; rows that are too long are returned unchanged.
            str_vals.extend([""] * n_missing)

        return str_vals




class RdbHeader(TabHeader):
    """
    Header reader / writer for RDB tables.
    """

    # Maps the RDB type letter to the corresponding column type class.
    col_type_map = {"n": core.NumType, "s": core.StrType}

    def get_type_map_key(self, col):
        """Return the last character of the raw type of ``col``."""
        raw_type = col.raw_type
        return raw_type[-1]

    def get_cols(self, lines):
        """
        Set up the header Column objects from the table ``lines``.

        RDB-specific version of get_cols, for the layout:
        Line 0: RDB col names
        Line 1: RDB col definitions
        Line 2+: RDB data rows

        Parameters
        ----------
        lines : list
            List of table lines

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If fewer than two header lines are available.
        core.InconsistentTableError
            If the number of names differs from the number of types, or a
            type definition does not have the form ``[num](N|S)``.

        """
        # process_lines gives a generator; take at most two split rows from it.
        split_rows = self.splitter(self.process_lines(lines))
        header_rows = []
        for row in split_rows:
            header_rows.append(row)
            if len(header_rows) == 2:
                break

        if len(header_rows) != 2:
            raise ValueError("RDB header requires 2 lines")
        self.names, raw_types = header_rows

        if len(self.names) != len(raw_types):
            raise core.InconsistentTableError(
                "RDB header mismatch between number of column names and column types."
            )

        # Optional digits followed by N or S, in either letter case.
        type_pattern = re.compile(r"\d*(N|S)$", re.IGNORECASE)
        if not all(type_pattern.match(raw_type) for raw_type in raw_types):
            raise core.InconsistentTableError(
                f"RDB types definitions do not all match [num](N|S): {raw_types}"
            )

        self._set_cols_from_names()
        for column, column_raw_type in zip(self.cols, raw_types):
            column.raw_type = column_raw_type
            column.type = self.get_col_type(column)

    def write(self, lines):
        """
        Append the column-name line and the type-definition line to ``lines``.
        """
        lines.append(self.splitter.join(self.colnames))
        # A dtype kind of "S" or "U" (string or unicode) is written as "S";
        # every other kind is written as "N".
        # See help(np.core.numerictypes)
        rdb_types = [
            "S" if column.info.dtype.kind in ("S", "U") else "N"
            for column in self.cols
        ]

        lines.append(self.splitter.join(rdb_types))


class RdbData(TabData):
    """
    Data reader for RDB tables; the data are read starting at line 2.
    """

    # Line 0 holds the column names and line 1 the type definitions.
    start_line = 2



[docs]
class Rdb(Tab):
    """Tab-delimited table with a row of column names and a row of type definitions.

    ``rdb`` is a legacy format, originally created in 1991 as the basis for a
    suite of Unix command-line relational database utilities.

    The ``rdb`` format is defined as follows:

    - The table text starts with zero or more comment lines that begin with ``#``.
    - Comments are allowed only at the beginning of the table.
    - The first row after the (optional) comments gives the column names.
    - The second row after the comments gives the data types:

      - A data type is either ``S`` for string or ``N`` for numeric (case-insensitive).
      - The data type specifier may be preceded by an integer that indicates the
        width when printing the table, but the ``astropy`` reader ignores it.

    - The rows that follow contain the data values.
    - All row entries in the header and data are separated by a tab character.

    Example (where the added spaces are for visual clarity)::

        # Comment line
        # -----------------
        name <tab> age <tab> eye-color
        6S <tab> 5N <tab> S
        Bob  <tab> 45 <tab> blue
        Mary <tab> 32 <tab> brown
        Jill <tab> 80 <tab> hazel
    """

    # Header / data handlers used by this reader.
    data_class = RdbData
    header_class = RdbHeader

    # Format identification and I/O registry information.
    _format_name = "rdb"
    _description = "Tab-separated with a type definition header line"
    _io_registry_suffix = ".rdb"
    _io_registry_format_aliases = ["rdb"]