Source code for astropy.io.ascii.fixedwidth

# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""An extensible ASCII table reader and writer.

fixedwidth.py:
  Read or write a table with fixed width columns.

:Copyright: Smithsonian Astrophysical Observatory (2011)
:Author: Tom Aldcroft (aldcroft@head.cfa.harvard.edu)
"""

from . import basic, core
from .core import DefaultSplitter, InconsistentTableError



[docs]
class FixedWidthSplitter(core.BaseSplitter):
    """
    Split line based on fixed start and end positions for each ``col`` in
    ``self.cols``.

    This class requires that the Header class will have defined ``col.start``
    and ``col.end`` for each column.  The reference to the ``header.cols`` gets
    put in the splitter object by the base Reader.read() function just in time
    for splitting data lines by a ``data`` object.

    Note that the ``start`` and ``end`` positions are defined in the pythonic
    style so line[start:end] is the desired substring for a column.  This splitter
    class does not have a hook for ``process_lines`` since that is generally not
    useful for fixed-width input.

    """

    delimiter_pad = ""
    bookend = False
    delimiter = "|"


[docs]
    def __call__(self, lines):
        for line in lines:
            vals = [line[x.start : x.end] for x in self.cols]
            if self.process_val:
                yield [self.process_val(x) for x in vals]
            else:
                yield vals



[docs]
    def join(self, vals, widths):
        pad = self.delimiter_pad or ""
        delimiter = self.delimiter or ""
        padded_delim = pad + delimiter + pad
        if self.bookend:
            bookend_left = delimiter + pad
            bookend_right = pad + delimiter
        else:
            bookend_left = ""
            bookend_right = ""
        vals = [" " * (width - len(val)) + val for val, width in zip(vals, widths)]
        return bookend_left + padded_delim.join(vals) + bookend_right




class FixedWidthHeaderSplitter(DefaultSplitter):
    """Splitter class that splits on ``|``."""

    delimiter = "|"



[docs]
class FixedWidthHeader(basic.BasicHeader):
    """
    Fixed width table header reader.
    """

    splitter_class = FixedWidthHeaderSplitter
    """ Splitter class for splitting data lines into columns """
    position_line = None  # secondary header line position
    """ row index of line that specifies position (default = 1) """
    set_of_position_line_characters = set(r"""`~!#$%^&*-_+=\|":'""")


[docs]
    def get_line(self, lines, index):
        for i, line in enumerate(self.process_lines(lines)):
            if i == index:
                break
        else:  # No header line matching
            raise InconsistentTableError("No header line found in table")
        return line



[docs]
    def get_cols(self, lines):
        """
        Initialize the header Column objects from the table ``lines``.

        Based on the previously set Header attributes find or create the column names.
        Sets ``self.cols`` with the list of Columns.

        Parameters
        ----------
        lines : list
            List of table lines

        """
        header_rows = getattr(self, "header_rows", ["name"])

        # See "else" clause below for explanation of start_line and position_line
        start_line = core._get_line_index(self.start_line, self.process_lines(lines))
        position_line = core._get_line_index(
            self.position_line, self.process_lines(lines)
        )

        # If start_line is none then there is no header line.  Column positions are
        # determined from first data line and column names are either supplied by user
        # or auto-generated.
        if start_line is None:
            if position_line is not None:
                raise ValueError(
                    "Cannot set position_line without also setting header_start"
                )

            # data.data_lines attribute already set via self.data.get_data_lines(lines)
            # in BaseReader.read().  This includes slicing for data_start / data_end.
            data_lines = self.data.data_lines

            if not data_lines:
                raise InconsistentTableError(
                    "No data lines found so cannot autogenerate column names"
                )
            vals, starts, ends = self.get_fixedwidth_params(data_lines[0])

            self.names = [self.auto_format.format(i) for i in range(1, len(vals) + 1)]

        else:
            # This bit of code handles two cases:
            # start_line = <index> and position_line = None
            #    Single header line where that line is used to determine both the
            #    column positions and names.
            # start_line = <index> and position_line = <index2>
            #    Two header lines where the first line defines the column names and
            #    the second line defines the column positions

            if position_line is not None:
                # Define self.col_starts and self.col_ends so that the call to
                # get_fixedwidth_params below will use those to find the header
                # column names.  Note that get_fixedwidth_params returns Python
                # slice col_ends but expects inclusive col_ends on input (for
                # more intuitive user interface).
                line = self.get_line(lines, position_line)
                if len(set(line) - {self.splitter.delimiter, " "}) != 1:
                    raise InconsistentTableError(
                        "Position line should only contain delimiters and "
                        'one other character, e.g. "--- ------- ---".'
                    )
                    # The line above lies. It accepts white space as well.
                    # We don't want to encourage using three different
                    # characters, because that can cause ambiguities, but white
                    # spaces are so common everywhere that practicality beats
                    # purity here.
                charset = self.set_of_position_line_characters.union(
                    {self.splitter.delimiter, " "}
                )
                if not set(line).issubset(charset):
                    raise InconsistentTableError(
                        f"Characters in position line must be part of {charset}"
                    )
                vals, self.col_starts, col_ends = self.get_fixedwidth_params(line)
                self.col_ends = [x - 1 if x is not None else None for x in col_ends]

            # Get the column names from the header line
            line = self.get_line(lines, start_line + header_rows.index("name"))
            self.names, starts, ends = self.get_fixedwidth_params(line)

        self._set_cols_from_names()

        for ii, attr in enumerate(header_rows):
            if attr != "name":
                line = self.get_line(lines, start_line + ii)
                vals = self.get_fixedwidth_params(line)[0]
                for col, val in zip(self.cols, vals):
                    if val:
                        setattr(col, attr, val)

        # Set column start and end positions.
        for i, col in enumerate(self.cols):
            col.start = starts[i]
            col.end = ends[i]



[docs]
    def get_fixedwidth_params(self, line):
        """
        Split ``line`` on the delimiter and determine column values and
        column start and end positions.  This might include null columns with
        zero length (e.g. for ``header row = "| col1 || col2 | col3 |"`` or
        ``header2_row = "----- ------- -----"``).  The null columns are
        stripped out.  Returns the values between delimiters and the
        corresponding start and end positions.

        Parameters
        ----------
        line : str
            Input line

        Returns
        -------
        vals : list
            List of values.
        starts : list
            List of starting indices.
        ends : list
            List of ending indices.

        """
        # If column positions are already specified then just use those.
        # If neither column starts or ends are given, figure out positions
        # between delimiters. Otherwise, either the starts or the ends have
        # been given, so figure out whichever wasn't given.
        if self.col_starts is not None and self.col_ends is not None:
            starts = list(self.col_starts)  # could be any iterable, e.g. np.array
            # user supplies inclusive endpoint
            ends = [x + 1 if x is not None else None for x in self.col_ends]
            if len(starts) != len(ends):
                raise ValueError(
                    "Fixed width col_starts and col_ends must have the same length"
                )
            vals = [line[start:end].strip() for start, end in zip(starts, ends)]
        elif self.col_starts is None and self.col_ends is None:
            # There might be a cleaner way to do this but it works...
            vals = line.split(self.splitter.delimiter)
            starts = [0]
            ends = []
            for val in vals:
                if val:
                    ends.append(starts[-1] + len(val))
                    starts.append(ends[-1] + 1)
                else:
                    starts[-1] += 1
            starts = starts[:-1]
            vals = [x.strip() for x in vals if x]
            if len(vals) != len(starts) or len(vals) != len(ends):
                raise InconsistentTableError("Error parsing fixed width header")
        else:
            # exactly one of col_starts or col_ends is given...
            if self.col_starts is not None:
                starts = list(self.col_starts)
                ends = starts[1:] + [None]  # Assume each col ends where the next starts
            else:  # self.col_ends is not None
                ends = [x + 1 for x in self.col_ends]
                starts = [0] + ends[:-1]  # Assume each col starts where the last ended
            vals = [line[start:end].strip() for start, end in zip(starts, ends)]

        return vals, starts, ends



[docs]
    def write(self, lines):
        # Header line not written until data are formatted.  Until then it is
        # not known how wide each column will be for fixed width.
        pass





[docs]
class FixedWidthData(basic.BasicData):
    """
    Base table data reader.
    """

    splitter_class = FixedWidthSplitter
    """ Splitter class for splitting data lines into columns """
    start_line = None


[docs]
    def write(self, lines):
        default_header_rows = [] if self.header.start_line is None else ["name"]
        header_rows = getattr(self, "header_rows", default_header_rows)
        # First part is getting the widths of each column.
        # List (rows) of list (column values) for data lines
        vals_list = list(zip(*self.str_vals()))

        # List (rows) of list (columns values) for header lines.
        hdrs_list = []
        for col_attr in header_rows:
            vals = [
                "" if (val := getattr(col.info, col_attr)) is None else str(val)
                for col in self.cols
            ]
            hdrs_list.append(vals)

        # Widths for data columns
        widths = [
            max(len(vals[i_col]) for vals in vals_list)
            for i_col in range(len(self.cols))
        ]
        # Incorporate widths for header columns (if there are any)
        if hdrs_list:
            for i_col in range(len(self.cols)):
                widths[i_col] = max(
                    widths[i_col], *(len(vals[i_col]) for vals in hdrs_list)
                )

        # Now collect formatted header and data lines into the output lines
        for vals in hdrs_list:
            lines.append(self.splitter.join(vals, widths))

        if self.header.position_line is not None:
            vals = [self.header.position_char * width for width in widths]
            lines.append(self.splitter.join(vals, widths))

        for vals in vals_list:
            lines.append(self.splitter.join(vals, widths))

        return lines





[docs]
class FixedWidth(basic.Basic):
    """Fixed width table with single header line defining column names and positions.

    Examples::

      # Bar delimiter in header and data

      |  Col1 |   Col2      |  Col3 |
      |  1.2  | hello there |     3 |
      |  2.4  | many words  |     7 |

      # Bar delimiter in header only

      Col1 |   Col2      | Col3
      1.2    hello there    3
      2.4    many words     7

      # No delimiter with column positions specified as input

      Col1       Col2Col3
       1.2hello there   3
       2.4many words    7

    See the :ref:`astropy:fixed_width_gallery` for specific usage examples.

    """

    _format_name = "fixed_width"
    _description = "Fixed width"

    header_class = FixedWidthHeader
    data_class = FixedWidthData

    def __init__(
        self,
        col_starts=None,
        col_ends=None,
        delimiter_pad=" ",
        bookend=True,
        header_rows=None,
    ):
        if header_rows is None:
            header_rows = ["name"]
        super().__init__()
        self.data.splitter.delimiter_pad = delimiter_pad
        self.data.splitter.bookend = bookend
        self.header.col_starts = col_starts
        self.header.col_ends = col_ends
        self.header.header_rows = header_rows
        self.data.header_rows = header_rows
        if self.data.start_line is None:
            self.data.start_line = len(header_rows)



class FixedWidthNoHeaderHeader(FixedWidthHeader):
    """Header reader for fixed with tables with no header line."""

    start_line = None


class FixedWidthNoHeaderData(FixedWidthData):
    """Data reader for fixed width tables with no header line."""

    start_line = 0



[docs]
class FixedWidthNoHeader(FixedWidth):
    """Fixed width table which has no header line.

    When reading, column names are either input (``names`` keyword) or
    auto-generated.  Column positions are determined either by input
    (``col_starts`` and ``col_stops`` keywords) or by splitting the first data
    line.  In the latter case a ``delimiter`` is required to split the data
    line.

    Examples::

      # Bar delimiter in header and data

      |  1.2  | hello there |     3 |
      |  2.4  | many words  |     7 |

      # Compact table having no delimiter and column positions specified as input

      1.2hello there3
      2.4many words 7

    This class is just a convenience wrapper around the ``FixedWidth`` reader
    but with ``header_start=None`` and ``data_start=0``.

    See the :ref:`astropy:fixed_width_gallery` for specific usage examples.

    """

    _format_name = "fixed_width_no_header"
    _description = "Fixed width with no header"
    header_class = FixedWidthNoHeaderHeader
    data_class = FixedWidthNoHeaderData

    def __init__(self, col_starts=None, col_ends=None, delimiter_pad=" ", bookend=True):
        super().__init__(
            col_starts,
            col_ends,
            delimiter_pad=delimiter_pad,
            bookend=bookend,
            header_rows=[],
        )



class FixedWidthTwoLineHeader(FixedWidthHeader):
    """Header reader for fixed width tables splitting on whitespace.

    For fixed width tables with several header lines, there is typically
    a white-space delimited format line, so splitting on white space is
    needed.
    """

    splitter_class = DefaultSplitter


class FixedWidthTwoLineDataSplitter(FixedWidthSplitter):
    """Splitter for fixed width tables splitting on ``' '``."""

    delimiter = " "


class FixedWidthTwoLineData(FixedWidthData):
    """Data reader for fixed with tables with two header lines."""

    splitter_class = FixedWidthTwoLineDataSplitter



[docs]
class FixedWidthTwoLine(FixedWidth):
    """Fixed width table which has two header lines.

    The first header line defines the column names and the second implicitly
    defines the column positions.

    Examples::

      # Typical case with column extent defined by ---- under column names.

       col1    col2         <== header_start = 0
      -----  ------------   <== position_line = 1, position_char = "-"
        1     bee flies     <== data_start = 2
        2     fish swims

      # Pretty-printed table

      +------+------------+
      | Col1 |   Col2     |
      +------+------------+
      |  1.2 | "hello"    |
      |  2.4 | there world|
      +------+------------+

    See the :ref:`astropy:fixed_width_gallery` for specific usage examples.

    """

    _format_name = "fixed_width_two_line"
    _description = "Fixed width with second header line"
    data_class = FixedWidthTwoLineData
    header_class = FixedWidthTwoLineHeader

    def __init__(
        self,
        position_line=None,
        position_char="-",
        delimiter_pad=None,
        bookend=False,
        header_rows=None,
    ):
        if len(position_char) != 1:
            raise ValueError(
                f'Position_char="{position_char}" must be a single character'
            )
        super().__init__(
            delimiter_pad=delimiter_pad, bookend=bookend, header_rows=header_rows
        )
        if position_line is None:
            position_line = len(self.header.header_rows)
        self.header.position_line = position_line
        self.header.position_char = position_char
        self.data.start_line = position_line + 1