Source code for astropy.table.sorted_array

# Licensed under a 3-clause BSD style license - see LICENSE.rst
from collections.abc import Hashable, Mapping, Sequence
from numbers import Integral
from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
    from .table import Column, Table


def _searchsorted(array, val, side="left"):
    """
    Call np.searchsorted or use a custom binary
    search if necessary.
    """
    if hasattr(array, "searchsorted"):
        return array.searchsorted(val, side=side)
    # Python binary search
    begin = 0
    end = len(array)
    while begin < end:
        mid = (begin + end) // 2
        if val > array[mid]:
            begin = mid + 1
        elif val < array[mid]:
            end = mid
        elif side == "right":
            begin = mid + 1
        else:
            end = mid
    return begin



[docs]
class SortedArray:
    """
    Implements a sorted array container using
    a list of numpy arrays.

    Parameters
    ----------
    data : Table
        Sorted columns of the original table
    row_index : Column object
        Row numbers corresponding to data columns
    unique : bool
        Whether the values of the index must be unique.
        Defaults to False.
    """

    def __init__(self, data: "Table", row_index: "Column", unique: bool = False):
        self.data = data
        self.row_index = row_index
        self.num_cols = len(getattr(data, "colnames", []))
        self.unique = unique

    @property
    def cols(self) -> list["Column"]:
        return list(self.data.columns.values())


[docs]
    def add(self, key: tuple, row: int) -> None:
        """
        Add a new entry to the sorted array.

        Parameters
        ----------
        key : tuple
            Column values at the given row
        row : int
            Row number
        """
        pos = self.find_pos(key, row)  # first >= key

        if (
            self.unique
            and 0 <= pos < len(self.row_index)
            and all(self.data[pos][i] == key[i] for i in range(len(key)))
        ):
            # already exists
            raise ValueError(f'Cannot add duplicate value "{key}" in a unique index')
        self.data.insert_row(pos, key)
        self.row_index = self.row_index.insert(pos, row)


    def _get_key_slice(self, i, begin, end):
        """
        Retrieve the ith slice of the sorted array
        from begin to end.
        """
        if i < self.num_cols:
            return self.cols[i][begin:end]
        else:
            return self.row_index[begin:end]


[docs]
    def find_pos(self, key, data, exact=False):
        """
        Return the index of the first key in data greater than or
        equal to the given key, data pair.

        Parameters
        ----------
        key : tuple
            Column key
        data : int
            Row number
        exact : bool
            If True, return the index of the given key in data
            or -1 if the key is not present.
        """
        begin = 0
        end = len(self.row_index)
        num_cols = self.num_cols
        if not self.unique:
            # consider the row value as well
            key = key + (data,)
            num_cols += 1

        # search through keys in lexicographic order
        for i in range(num_cols):
            key_slice = self._get_key_slice(i, begin, end)
            t = _searchsorted(key_slice, key[i])
            # t is the smallest index >= key[i]
            if exact and (t == len(key_slice) or key_slice[t] != key[i]):
                # no match
                return -1
            elif t == len(key_slice) or (
                t == 0 and len(key_slice) > 0 and key[i] < key_slice[0]
            ):
                # too small or too large
                return begin + t
            end = begin + _searchsorted(key_slice, key[i], side="right")
            begin += t
            if begin >= len(self.row_index):  # greater than all keys
                return begin

        return begin



[docs]
    def find(self, key: tuple) -> Sequence[Integral]:
        """
        Find all rows matching the given key.

        Parameters
        ----------
        key : tuple
            Column values

        Returns
        -------
        matching_rows : list
            List of rows matching the input key
        """
        begin = 0
        end = len(self.row_index)

        # search through keys in lexicographic order
        for i in range(self.num_cols):
            key_slice = self._get_key_slice(i, begin, end)
            t = _searchsorted(key_slice, key[i])
            # t is the smallest index >= key[i]
            if t == len(key_slice) or key_slice[t] != key[i]:
                # no match
                return []
            elif t == 0 and len(key_slice) > 0 and key[i] < key_slice[0]:
                # too small or too large
                return []
            end = begin + _searchsorted(key_slice, key[i], side="right")
            begin += t
            if begin >= len(self.row_index):  # greater than all keys
                return []

        return self.row_index[begin:end]



[docs]
    def range(
        self,
        lower: tuple[Hashable, ...] | None,
        upper: tuple[Hashable, ...] | None,
        bounds: tuple[bool, bool],
    ) -> list[int]:
        """
        Find values in the given range.

        Parameters
        ----------
        lower : tuple, None
            Lower search bound (no lower bound if None)
        upper : tuple, None
            Upper search bound (no upper bound if None)
        bounds : (2,) tuple of bool
            Indicates whether the search should be inclusive or
            exclusive with respect to the endpoints. The first
            argument corresponds to an inclusive lower bound,
            and the second argument to an inclusive upper bound.
        """
        n = len(self.row_index)

        def bisect_left(key):
            # Position of the first entry whose key is >= ``key``.
            return self.find_pos(key, 0)

        def bisect_right(key):
            # Position of the first entry whose key is > ``key`` (i.e. just
            # past the last entry equal to ``key``).
            if self.unique:
                pos = self.find_pos(key, 0)
                if pos < n and tuple(col[pos] for col in self.cols) == key:
                    pos += 1
                return pos
            # For a non-unique index the stored keys include the row number, so
            # searching with a row number larger than any real row lands just
            # past the last entry equal to ``key`` (and thus past all duplicates).
            return self.find_pos(key, n)

        # Compute a half-open range [lower_pos, upper_pos). Just like a slice
        # object, a None value for `lower` or `upper` means no bound in that
        # direction. Using bisect_left/bisect_right ensures all entries that
        # share a value with an inclusive bound are included, even duplicates.
        if lower is None:
            lower_pos = 0
        else:
            lower_pos = bisect_left(lower) if bounds[0] else bisect_right(lower)

        if upper is None:
            upper_pos = n
        else:
            upper_pos = bisect_right(upper) if bounds[1] else bisect_left(upper)

        return self.row_index[lower_pos:upper_pos]



[docs]
    def remove(self, key: tuple, data: int) -> bool:
        """
        Remove the given entry from the sorted array.

        Parameters
        ----------
        key : tuple
            Column values
        data : int
            Row number

        Returns
        -------
        successful : bool
            Whether the entry was successfully removed
        """
        pos = self.find_pos(key, data, exact=True)
        if pos == -1:  # key not found
            return False

        self.data.remove_row(pos)
        keep_mask = np.ones(len(self.row_index), dtype=bool)
        keep_mask[pos] = False
        self.row_index = self.row_index[keep_mask]
        return True



[docs]
    def shift_left(self, row: int) -> None:
        """
        Decrement all row numbers greater than the input row.

        Parameters
        ----------
        row : int
            Input row number
        """
        self.row_index[self.row_index > row] -= 1



[docs]
    def shift_right(self, row: int) -> None:
        """
        Increment all row numbers greater than or equal to the input row.

        Parameters
        ----------
        row : int
            Input row number
        """
        self.row_index[self.row_index >= row] += 1



[docs]
    def replace_rows(self, row_map: "Mapping[int, int]") -> None:
        """
        Replace all rows with the values they map to in the
        given dictionary. Any rows not present as keys in
        the dictionary will have their entries deleted.

        Parameters
        ----------
        row_map : dict
            Mapping of row numbers to new row numbers
        """
        num_rows = len(row_map)
        keep_rows = np.zeros(len(self.row_index), dtype=bool)
        tagged = 0
        for i, row in enumerate(self.row_index):
            if row in row_map:
                keep_rows[i] = True
                tagged += 1
                if tagged == num_rows:
                    break

        self.data = self.data[keep_rows]
        self.row_index = np.array([row_map[x] for x in self.row_index[keep_rows]])



[docs]
    def items(self) -> list[tuple[Hashable, list[Integral]]]:
        """
        Retrieve all array items as a list of pairs of the form
        [(key, [row 1, row 2, ...]), ...].
        """
        array = []
        last_key = None
        for i, key in enumerate(zip(*self.data.columns.values())):
            row = self.row_index[i]
            if key == last_key:
                array[-1][1].append(row)
            else:
                last_key = key
                array.append((key, [row]))
        return array



[docs]
    def sort(self) -> None:
        """
        Make row order align with key order.
        """
        self.row_index = np.arange(len(self.row_index))



[docs]
    def sorted_data(self) -> None:
        """
        Return rows in sorted order.
        """
        return self.row_index


    def __getitem__(self, item):
        """
        Return a sliced reference to this sorted array.

        Parameters
        ----------
        item : slice
            Slice to use for referencing
        """
        return SortedArray(self.data[item], self.row_index[item])

    def __repr__(self):
        t = self.data.copy()
        t["rows"] = self.row_index
        return f"<{self.__class__.__name__} length={len(t)}>\n{t}"

    def __len__(self) -> int:
        return len(self.row_index)