# Source code for molr.selection.parser

"""
Selection language parser using pyparsing.

This module provides a parser for atom selection strings, converting them
into SelectionExpression objects that can be evaluated against structures.
The syntax is inspired by MDAnalysis and VMD selection languages.
"""

from typing import Any, List, Optional, Union

import pyparsing as pp
from pyparsing import (
    CaselessKeyword,
    CaselessLiteral,
    Forward,
    Group,
    OneOrMore,
)
from pyparsing import Optional as PPOptional
from pyparsing import (
    ParseException,
    Regex,
    Suppress,
    Word,
    ZeroOrMore,
    alphanums,
    alphas,
    nums,
    oneOf,
    pyparsing_common,
)

from .expressions import (
    AllExpression,
    AndExpression,
    AromaticExpression,
    AroundExpression,
    AtomNameExpression,
    BackboneExpression,
    ByResidueExpression,
    CenterOfGeometryExpression,
    ChainExpression,
    DNAExpression,
    ElementExpression,
    IndexExpression,
    LigandExpression,
    NoneExpression,
    NotExpression,
    NucleicExpression,
    OrExpression,
    ProteinExpression,
    ResidueIdExpression,
    ResidueNameExpression,
    RNAExpression,
    SelectionExpression,
    SidechainExpression,
    SphericalExpression,
    WaterExpression,
    WithinExpression,
)



# (Sphinx "[docs]" source-link marker from the rendered page; not part of the module)
class SelectionParser:
    """
    Parser for atom selection language.

    Supports syntax like:
        - "protein and backbone"
        - "resname ALA GLY"
        - "chain A and resid 1:100"
        - "element C N O"
        - "not water"
        - "(protein and chain A) or ligand"
        - "byres (ligand and within 5 of protein)"

    Notes:
        - ``and``, ``or`` and ``not`` are reserved words: they always end a
          value list (``name``, ``resname``, ``chain``, ``element``) and are
          never interpreted as a value themselves.
        - The operand of ``within ... of``, ``around``, ``cog`` and ``byres``
          is a single selection term or a parenthesized expression.
        - ``resid`` ranges are inclusive (``resid 1:3`` -> 1, 2, 3) while
          ``index`` ranges are half-open (``index 0:3`` -> 0, 1, 2).
    """

    def __init__(self) -> None:
        """Initialize the parser with grammar rules."""
        self._build_grammar()

    @staticmethod
    def _expand_residue_ids(items: Any) -> List[int]:
        """
        Flatten parsed ``resid`` values into a list of residue numbers.

        Args:
            items: Sequence whose entries are either an ``int`` or a
                two-element ``(start, end)`` group produced by ``start:end``.

        Returns:
            Residue numbers in input order; ranges include both endpoints.
        """
        values: List[int] = []
        for item in items:
            if isinstance(item, int):
                values.append(item)
            else:
                start, end = item
                values.extend(range(start, end + 1))
        return values

    @staticmethod
    def _split_chain_ids(items: Any) -> List[str]:
        """
        Split parsed ``chain`` words into single-character chain IDs.

        Args:
            items: Sequence of strings, e.g. ``["A", "BC"]``.

        Returns:
            One entry per character, e.g. ``["A", "B", "C"]``.
        """
        return [chain_id for word in items for chain_id in word]

    @staticmethod
    def _expand_indices(items: Any) -> Union[slice, List[int]]:
        """
        Convert parsed ``index`` values into a slice or a list of indices.

        Args:
            items: Sequence whose entries are either an ``int`` or a group of
                two (``start:stop``) or three (``start:stop:step``) ints.

        Returns:
            A ``slice`` when the selection is exactly one ``start:stop:step``
            entry; otherwise a flat list of ints in which every range has
            been expanded with ``range`` semantics (stop is exclusive).
        """
        entries = list(items)
        if (
            len(entries) == 1
            and not isinstance(entries[0], int)
            and len(entries[0]) == 3
        ):
            start, stop, step = entries[0]
            return slice(start, stop, step)

        values: List[int] = []
        for entry in entries:
            if isinstance(entry, int):
                values.append(entry)
            else:
                values.extend(range(*entry))
        return values

    def _build_grammar(self) -> None:
        """Build the pyparsing grammar and store it in ``self.grammar``."""
        # Packrat memoization is switched on at the pyparsing class level.
        pp.ParserElement.enablePackrat()

        # Basic tokens
        integer = pyparsing_common.integer
        # fnumber accepts "5" as well as "5.0" and always yields a float,
        # so "within 5 of protein" works as documented.
        distance = pyparsing_common.fnumber

        # Boolean keywords (case-insensitive)
        AND = CaselessKeyword("and")
        OR = CaselessKeyword("or")
        NOT = CaselessKeyword("not")
        # Negative lookahead placed in front of every free-form value so a
        # value list stops at a boolean operator instead of swallowing it
        # (otherwise "resname ALA or resname GLY" reads "or" as a residue).
        not_an_operator = ~(AND | OR | NOT)

        # Property keywords
        ELEMENT = CaselessKeyword("element") | CaselessKeyword("elem")
        NAME = CaselessKeyword("name") | CaselessKeyword("atomname")
        RESNAME = CaselessKeyword("resname") | CaselessKeyword("resn")
        RESID = CaselessKeyword("resid") | CaselessKeyword("resi")
        CHAIN = CaselessKeyword("chain") | CaselessKeyword("segid")
        INDEX = CaselessKeyword("index") | CaselessKeyword("idx")
        BYRES = CaselessKeyword("byres") | CaselessKeyword("byresidue")

        # Spatial keywords
        WITHIN = CaselessKeyword("within")
        AROUND = CaselessKeyword("around")
        OF = CaselessKeyword("of")
        COG = CaselessKeyword("cog") | CaselessKeyword("centerofgeometry")

        # Punctuation (suppressed: never appears in the parsed tokens)
        LPAREN = Suppress("(")
        RPAREN = Suppress(")")
        COLON = Suppress(":")

        # Forward declarations for the recursive parts of the grammar
        selection_expr = Forward()
        atomic_selection = Forward()

        def keyword_selection(word: str, factory: Any) -> Any:
            """Return a keyword that parses to a fresh ``factory()`` object."""

            def build() -> Any:
                return factory()

            return CaselessKeyword(word).setParseAction(build)

        # Simple selections: a bare keyword maps to one expression object
        simple_selection = pp.MatchFirst(
            [
                keyword_selection(word, factory)
                for word, factory in (
                    ("all", AllExpression),
                    ("none", NoneExpression),
                    ("protein", ProteinExpression),
                    ("nucleic", NucleicExpression),
                    ("dna", DNAExpression),
                    ("rna", RNAExpression),
                    ("backbone", BackboneExpression),
                    ("sidechain", SidechainExpression),
                    ("water", WaterExpression),
                    ("ligand", LigandExpression),
                    ("aromatic", AromaticExpression),
                )
            ]
        )

        # Element selection: "element C N O" or "elem C"
        element_selection = Group(
            ELEMENT + OneOrMore(not_an_operator + Word(alphas, max=2))
        ).setParseAction(lambda t: ElementExpression(list(t[0][1:])))

        # Atom name selection: "name CA CB" or "atomname CA"
        name_selection = Group(
            NAME + OneOrMore(not_an_operator + Word(alphanums + "_*"))
        ).setParseAction(lambda t: AtomNameExpression(list(t[0][1:])))

        # Residue name selection: "resname ALA GLY" or "resn ALA"
        resname_selection = Group(
            RESNAME + OneOrMore(not_an_operator + Word(alphanums))
        ).setParseAction(lambda t: ResidueNameExpression(list(t[0][1:])))

        # Residue ID selection: "resid 1 2 3" or "resid 1:100" or "resi 50"
        resid_range = Group(integer + COLON + integer)
        resid_selection = Group(
            RESID + ZeroOrMore(resid_range | integer)
        ).setParseAction(
            lambda t: ResidueIdExpression(self._expand_residue_ids(t[0][1:]))
        )

        # Chain selection: "chain A B" or "chain AB"
        chain_selection = Group(
            CHAIN + ZeroOrMore(not_an_operator + Word(alphanums))
        ).setParseAction(
            lambda t: ChainExpression(self._split_chain_ids(t[0][1:]))
        )

        # Index selection: "index 0 1 2", "index 0:100" or "index 0:100:2".
        # The range is grouped so it stays distinguishable from plain
        # integers once the colons have been suppressed.
        index_range = Group(
            integer + COLON + integer + PPOptional(COLON + integer)
        )
        index_selection = Group(
            INDEX + ZeroOrMore(index_range | integer)
        ).setParseAction(
            lambda t: IndexExpression(self._expand_indices(t[0][1:]))
        )

        # Spatial selections. Parentheses are suppressed, so the token
        # positions below count only keywords, numbers and expressions.
        # "within 5.0 of protein" -> [within, distance, of, selection]
        within_selection = (
            WITHIN + distance + OF + atomic_selection
        ).setParseAction(lambda t: WithinExpression(t[1], t[3]))

        # "around protein 5.0" -> [around, selection, distance]
        around_selection = (
            AROUND + atomic_selection + distance
        ).setParseAction(lambda t: AroundExpression(t[1], t[2]))

        # "cog protein 8.0" -> [cog, selection, distance]
        cog_selection = (COG + atomic_selection + distance).setParseAction(
            lambda t: CenterOfGeometryExpression(t[1], t[2])
        )

        # By residue selection: "byres (protein and chain A)"
        byres_selection = (BYRES + atomic_selection).setParseAction(
            lambda t: ByResidueExpression(t[1])
        )

        # Atomic selection (all non-boolean selections)
        atomic_selection <<= (
            simple_selection
            | element_selection
            | name_selection
            | resname_selection
            | resid_selection
            | chain_selection
            | index_selection
            | within_selection
            | around_selection
            | cog_selection
            | byres_selection
            | (LPAREN + selection_expr + RPAREN)
        )

        # NOT expression
        not_expr = (NOT + atomic_selection).setParseAction(
            lambda t: NotExpression(t[1])
        )

        def fold_left(combine: Any) -> Any:
            """Return a parse action that left-folds its tokens with *combine*."""

            def action(tokens: Any) -> Any:
                result = tokens[0]
                for operand in tokens[1:]:
                    result = combine(result, operand)
                return result

            return action

        # AND expression (implicit when no operator). The operator keyword is
        # suppressed so the parse action only ever sees expression objects.
        term = atomic_selection | not_expr
        and_expr = term + ZeroOrMore(PPOptional(Suppress(AND)) + term)
        and_expr.setParseAction(fold_left(AndExpression))

        # OR expression (binds more loosely than AND)
        or_expr = and_expr + ZeroOrMore(Suppress(OR) + and_expr)
        or_expr.setParseAction(fold_left(OrExpression))

        # Complete expression
        selection_expr <<= or_expr

        # Set the complete grammar
        self.grammar = selection_expr + pp.StringEnd()

    def parse(self, selection_string: str) -> "SelectionExpression":
        """
        Parse a selection string into a SelectionExpression.

        Args:
            selection_string: The selection string to parse

        Returns:
            SelectionExpression object

        Raises:
            ParseException: If the string cannot be parsed. The message
                includes the column and a caret marker under the offending
                position; the original pyparsing error is chained as the
                cause.
        """
        try:
            result = self.grammar.parseString(selection_string, parseAll=True)
        except ParseException as e:
            # Enhance error message
            col = e.column
            line = e.line
            error_msg = f"Invalid selection syntax at position {col}: {e.msg}"
            if line:
                marker = " " * (col - 1) + "^"
                error_msg = f"{error_msg}\n{line}\n{marker}"
            raise ParseException(error_msg) from e
        return result[0]  # type: ignore[no-any-return]

    @classmethod
    def parse_selection(cls, selection_string: str) -> "SelectionExpression":
        """
        Convenience class method to parse a selection string.

        Builds a new parser instance for every call.

        Args:
            selection_string: The selection string to parse

        Returns:
            SelectionExpression object
        """
        parser = cls()
        return parser.parse(selection_string)