Source code for molr.selection.parser

"""
Selection language parser using pyparsing.

This module provides a parser for atom selection strings, converting them
into SelectionExpression objects that can be evaluated against structures.
The syntax is inspired by MDAnalysis and VMD selection languages.
"""

from typing import Any, List, Optional, Union

import pyparsing as pp
from pyparsing import (
    CaselessKeyword,
    CaselessLiteral,
    Forward,
    Group,
    OneOrMore,
)
from pyparsing import Optional as PPOptional
from pyparsing import (
    ParseException,
    Regex,
    Suppress,
    Word,
    ZeroOrMore,
    alphanums,
    alphas,
    nums,
    oneOf,
    pyparsing_common,
)

from .expressions import (
    AllExpression,
    AndExpression,
    AromaticExpression,
    AroundExpression,
    AtomNameExpression,
    BackboneExpression,
    ByResidueExpression,
    CenterOfGeometryExpression,
    ChainExpression,
    DNAExpression,
    ElementExpression,
    IndexExpression,
    LigandExpression,
    NoneExpression,
    NotExpression,
    NucleicExpression,
    OrExpression,
    ProteinExpression,
    ResidueIdExpression,
    ResidueNameExpression,
    RNAExpression,
    SelectionExpression,
    SidechainExpression,
    SphericalExpression,
    WaterExpression,
    WithinExpression,
)


class SelectionParser:
    """
    Parser for the atom selection language.

    Supports syntax like:

    - "protein and backbone"
    - "resname ALA GLY"
    - "chain A and resid 1:100"
    - "element C N O"
    - "not water"
    - "(protein and chain A) or ligand"
    - "byres (ligand and within 5 of (protein))"

    Notes:
        * The target of ``within``, ``around``, ``cog`` and ``byres`` must be
          enclosed in parentheses.
        * ``and``, ``or`` and ``not`` are reserved words (case-insensitive)
          and can never be used as a value in ``name``, ``resname``,
          ``element`` or ``chain`` lists.
        * ``resid A:B`` is inclusive of ``B``; ``index A:B`` and
          ``index A:B:S`` exclude ``B`` (Python ``range`` semantics).
    """

    def __init__(self) -> None:
        """Initialize the parser with grammar rules."""
        self._build_grammar()

    @staticmethod
    def _expand_resid_values(items: Any) -> List[int]:
        """
        Flatten parsed ``resid`` values into a list of residue ids.

        Args:
            items: Iterable whose entries are either an ``int`` or a
                two-element sequence ``(start, end)`` describing an
                inclusive range.

        Returns:
            List of residue ids in the order they were written.
        """
        resids: List[int] = []
        for item in items:
            if isinstance(item, int):
                resids.append(item)
            else:
                start, end = item
                # Residue ranges include their upper bound.
                resids.extend(range(start, end + 1))
        return resids

    @staticmethod
    def _split_chain_ids(items: Any) -> List[str]:
        """
        Split parsed ``chain`` values into single-character chain ids.

        Args:
            items: Iterable of strings; a multi-character string such as
                ``"AB"`` is read as several one-character chain ids.

        Returns:
            List of one-character chain ids.
        """
        return [chain_id for item in items for chain_id in item]

    @staticmethod
    def _expand_index_values(items: Any) -> Union[slice, List[int]]:
        """
        Convert parsed ``index`` values into a slice or a list of indices.

        Args:
            items: Iterable whose entries are either an ``int``, a
                two-element sequence ``(start, stop)`` or a three-element
                sequence ``(start, stop, step)``. ``stop`` is exclusive.

        Returns:
            A ``slice`` when the only entry is a stepped range, otherwise a
            list with every range expanded to explicit indices.
        """
        entries = [item if isinstance(item, int) else tuple(item) for item in items]

        if (
            len(entries) == 1
            and not isinstance(entries[0], int)
            and len(entries[0]) == 3
        ):
            return slice(*entries[0])

        indices: List[int] = []
        for entry in entries:
            if isinstance(entry, int):
                indices.append(entry)
            else:
                indices.extend(range(*entry))
        return indices

    def _build_grammar(self) -> None:
        """
        Build the pyparsing grammar for the selection language.

        The finished grammar is stored on ``self.grammar``.
        """
        # Enable packrat parsing for better performance
        pp.ParserElement.enablePackrat()

        # Basic tokens
        integer = pyparsing_common.integer
        # Distances may be written as "5" or "5.0"; fnumber accepts both and
        # always yields a float.
        distance = pyparsing_common.fnumber

        # Boolean operators (case-insensitive)
        AND = CaselessKeyword("and")
        OR = CaselessKeyword("or")
        NOT = CaselessKeyword("not")
        # Used as a negative lookahead so that greedy value lists such as
        # "resname ALA GLY" stop in front of an operator instead of
        # swallowing it as another value.
        boolean_operator = AND | OR | NOT

        # Selection keywords
        ALL = CaselessKeyword("all")
        NONE = CaselessKeyword("none")
        PROTEIN = CaselessKeyword("protein")
        NUCLEIC = CaselessKeyword("nucleic")
        DNA = CaselessKeyword("dna")
        RNA = CaselessKeyword("rna")
        BACKBONE = CaselessKeyword("backbone")
        SIDECHAIN = CaselessKeyword("sidechain")
        WATER = CaselessKeyword("water")
        LIGAND = CaselessKeyword("ligand")
        AROMATIC = CaselessKeyword("aromatic")

        # Property keywords
        ELEMENT = CaselessKeyword("element") | CaselessKeyword("elem")
        NAME = CaselessKeyword("name") | CaselessKeyword("atomname")
        RESNAME = CaselessKeyword("resname") | CaselessKeyword("resn")
        RESID = CaselessKeyword("resid") | CaselessKeyword("resi")
        CHAIN = CaselessKeyword("chain") | CaselessKeyword("segid")
        INDEX = CaselessKeyword("index") | CaselessKeyword("idx")
        BYRES = CaselessKeyword("byres") | CaselessKeyword("byresidue")

        # Spatial keywords
        # NOTE(review): SphericalExpression is imported by this module but no
        # "sphere" rule is defined here.
        WITHIN = CaselessKeyword("within")
        AROUND = CaselessKeyword("around")
        OF = CaselessKeyword("of")
        COG = CaselessKeyword("cog") | CaselessKeyword("centerofgeometry")

        # Punctuation is suppressed, i.e. it never shows up in the token
        # lists seen by the parse actions below.
        LPAREN = Suppress("(")
        RPAREN = Suppress(")")
        COLON = Suppress(":")

        # Forward declaration for recursive grammar
        selection_expr = Forward()

        # Simple selections
        all_selection = ALL.setParseAction(lambda: AllExpression())
        none_selection = NONE.setParseAction(lambda: NoneExpression())
        protein_selection = PROTEIN.setParseAction(lambda: ProteinExpression())
        nucleic_selection = NUCLEIC.setParseAction(lambda: NucleicExpression())
        dna_selection = DNA.setParseAction(lambda: DNAExpression())
        rna_selection = RNA.setParseAction(lambda: RNAExpression())
        backbone_selection = BACKBONE.setParseAction(lambda: BackboneExpression())
        sidechain_selection = SIDECHAIN.setParseAction(lambda: SidechainExpression())
        water_selection = WATER.setParseAction(lambda: WaterExpression())
        ligand_selection = LIGAND.setParseAction(lambda: LigandExpression())
        aromatic_selection = AROMATIC.setParseAction(lambda: AromaticExpression())

        # Element selection: "element C N O" or "elem C"
        element_value = ~boolean_operator + Word(alphas, max=2)
        element_list = Group(ELEMENT + OneOrMore(element_value))
        element_selection = element_list.setParseAction(
            lambda t: ElementExpression(list(t[0][1:]))
        )

        # Atom name selection: "name CA CB" or "atomname CA"
        name_value = ~boolean_operator + Word(alphanums + "_*")
        name_list = Group(NAME + OneOrMore(name_value))
        name_selection = name_list.setParseAction(
            lambda t: AtomNameExpression(list(t[0][1:]))
        )

        # Residue name selection: "resname ALA GLY" or "resn ALA"
        resname_value = ~boolean_operator + Word(alphanums)
        resname_list = Group(RESNAME + OneOrMore(resname_value))
        resname_selection = resname_list.setParseAction(
            lambda t: ResidueNameExpression(list(t[0][1:]))
        )

        # Residue ID selection: "resid 1 2 3" or "resid 1:100" or "resi 50"
        resid_range = Group(integer + COLON + integer)
        resid_value = resid_range | integer
        resid_list = Group(RESID + ZeroOrMore(resid_value))
        resid_selection = resid_list.setParseAction(
            lambda t: ResidueIdExpression(self._expand_resid_values(t[0][1:]))
        )

        # Chain selection: "chain A B" or "chain AB"
        chain_value = ~boolean_operator + Word(alphanums)
        chain_list = Group(CHAIN + ZeroOrMore(chain_value))
        chain_selection = chain_list.setParseAction(
            lambda t: ChainExpression(self._split_chain_ids(t[0][1:]))
        )

        # Index selection: "index 0 1 2", "index 0:100" or "index 0:100:2".
        # The range is grouped so that it reaches the parse action as one
        # nested token; the colons themselves are suppressed.
        index_range = Group(integer + COLON + integer + PPOptional(COLON + integer))
        index_value = index_range | integer
        index_list = Group(INDEX + ZeroOrMore(index_value))
        index_selection = index_list.setParseAction(
            lambda t: IndexExpression(self._expand_index_values(t[0][1:]))
        )

        # Spatial selections. Token positions below account for the
        # suppressed parentheses.

        # "within 5.0 of (protein)" -> tokens: [within, distance, of, target]
        within_selection = (
            WITHIN + distance + OF + LPAREN + selection_expr + RPAREN
        ).setParseAction(lambda t: WithinExpression(t[1], t[3]))

        # "around (protein) 5.0" -> tokens: [around, target, distance]
        around_selection = (
            AROUND + LPAREN + selection_expr + RPAREN + distance
        ).setParseAction(lambda t: AroundExpression(t[1], t[2]))

        # "cog (protein) 8.0" -> tokens: [cog, target, distance]
        cog_selection = (
            COG + LPAREN + selection_expr + RPAREN + distance
        ).setParseAction(lambda t: CenterOfGeometryExpression(t[1], t[2]))

        # By residue selection: "byres (protein and chain A)"
        byres_selection = (BYRES + LPAREN + selection_expr + RPAREN).setParseAction(
            lambda t: ByResidueExpression(t[1])
        )

        # Atomic selection (all non-boolean selections)
        atomic_selection = (
            all_selection
            | none_selection
            | protein_selection
            | nucleic_selection
            | dna_selection
            | rna_selection
            | backbone_selection
            | sidechain_selection
            | water_selection
            | ligand_selection
            | aromatic_selection
            | element_selection
            | name_selection
            | resname_selection
            | resid_selection
            | chain_selection
            | index_selection
            | within_selection
            | around_selection
            | cog_selection
            | byres_selection
            | (LPAREN + selection_expr + RPAREN)
        )

        # NOT expression
        not_expr = (NOT + atomic_selection).setParseAction(
            lambda t: NotExpression(t[1])
        )

        # AND expression (implicit when no operator)
        and_expr = (atomic_selection | not_expr) + ZeroOrMore(
            PPOptional(AND) + (atomic_selection | not_expr)
        )

        def parse_and(tokens: Any) -> Any:
            """Fold the operands of an AND chain into nested AndExpressions."""
            combined = tokens[0]
            for token in tokens[1:]:
                # Explicit "and" keywords arrive as plain strings; skip them.
                if isinstance(token, SelectionExpression):
                    combined = AndExpression(combined, token)
            return combined

        and_expr.setParseAction(parse_and)

        # OR expression
        or_expr = and_expr + ZeroOrMore(OR + and_expr)

        def parse_or(tokens: Any) -> Any:
            """Fold the operands of an OR chain into nested OrExpressions."""
            combined = tokens[0]
            # Tokens alternate operand, "or", operand, ... so the operands
            # sit at the even positions.
            for operand in tokens[2::2]:
                combined = OrExpression(combined, operand)
            return combined

        or_expr.setParseAction(parse_or)

        # Complete expression
        selection_expr <<= or_expr

        # Set the complete grammar
        self.grammar = selection_expr + pp.StringEnd()

    def parse(self, selection_string: str) -> "SelectionExpression":
        """
        Parse a selection string into a SelectionExpression.

        Args:
            selection_string: The selection string to parse

        Returns:
            SelectionExpression object

        Raises:
            ParseException: If the string cannot be parsed
        """
        try:
            result = self.grammar.parseString(selection_string, parseAll=True)
        except ParseException as e:
            # Enhance error message with the offending line and a caret
            # pointing at the failing column.
            col = e.column
            line = e.line
            error_msg = f"Invalid selection syntax at position {col}: {e.msg}"
            if line:
                marker = " " * (col - 1) + "^"
                error_msg = f"{error_msg}\n{line}\n{marker}"
            raise ParseException(error_msg) from e
        return result[0]  # type: ignore[no-any-return]

    @classmethod
    def parse_selection(cls, selection_string: str) -> "SelectionExpression":
        """
        Convenience class method to parse a selection string.

        A new parser (and grammar) is built on every call.

        Args:
            selection_string: The selection string to parse

        Returns:
            SelectionExpression object
        """
        parser = cls()
        return parser.parse(selection_string)