# AutoLibrary/src/autoscript/ASTokenizer.py

# -*- coding: utf-8 -*-
"""
Copyright (c) 2026 KenanZhu.
All rights reserved.

This software is provided "as is", without any warranty of any kind.
You may use, modify, and distribute this file under the terms of the MIT License.
See the LICENSE file for details.
"""
import re


# Names exported by a star-import of this module: the tokenizer, the flat
# statement record, the AST node types, the visitor base class and the
# line-matching strategy wrapper.  The K_* / OP_* constants and the compiled
# patterns are not listed here, so a star-import does not bring them in.
__all__ = [
    "ASTokenizer",
    "Stmt",
    "Script",
    "IfNode",
    "ElifNode",
    "SetNode",
    "OpNode",
    "PassNode",
    "UnrecogNode",
    "NodeVisitor",
    "LineStrategy"
]


# Token kind constants: the tag a classified line carries (stored in
# Stmt.kind and used by the parser to decide which node to build).
K_IF      = "IF"
K_ELSE_IF = "ELSE IF"
K_ELSE    = "ELSE"
K_ENDIF   = "ENDIF"
K_SET     = "SET"
K_ADD     = "ADD"
K_SUB     = "SUB"
K_PASS    = "PASS"

# Op-type constants: the value placed in Stmt.op_type for SET / .ADD. / .SUB.
# lines (OpNode.op_type is copied from it for .ADD. / .SUB.).
OP_SET = "set"
OP_ADD = "add"
OP_SUB = "sub"

# Compiled line patterns.  Every pattern is case-insensitive and anchored at
# both ends; the tokenizer strips surrounding whitespace and any trailing
# "//" comment from a line before matching it.
# IF(<condition>) with an optional trailing THEN.  The condition group is
# greedy, so parentheses inside the condition stay part of group 1.
_RE_IF      = re.compile(r"^IF\((.+)\)(?:\s+THEN\s*)?$", re.IGNORECASE)
# ELSE IF(<condition>) with an optional trailing THEN; any run of whitespace
# is accepted between ELSE and IF.
_RE_ELSE_IF = re.compile(r"^ELSE\s+IF\((.+)\)(?:\s+THEN\s*)?$", re.IGNORECASE)
# A bare ELSE.
_RE_ELSE    = re.compile(r"^ELSE\s*$", re.IGNORECASE)
# ENDIF, or END IF written with exactly one space.
_RE_ENDIF   = re.compile(r"^(ENDIF|END IF)$", re.IGNORECASE)
# SET <name> = <value>; the value is everything after the "=".
_RE_SET     = re.compile(r"^SET\s+(\w+)\s*=\s*(.+)$", re.IGNORECASE)
# <name> .ADD. <operand> / <name> .SUB. <operand>; the operand is either an
# optionally negative decimal number or a single word.
_RE_ADD     = re.compile(r"^(\w+)\s+\.ADD\.\s+(-?\d+(?:\.\d+)?|\w+)$", re.IGNORECASE)
_RE_SUB     = re.compile(r"^(\w+)\s+\.SUB\.\s+(-?\d+(?:\.\d+)?|\w+)$", re.IGNORECASE)
# A bare PASS (surrounding whitespace tolerated).
_RE_PASS    = re.compile(r"^\s*PASS\s*$", re.IGNORECASE)


class Script:
    """
        Root AST node for an entire AutoScript.
        Contains an ordered list of top-level statement nodes.
    """

    def __init__(
        self,
        body: list = None
    ):

        self.body = body or []

    def accept(
        self,
        visitor
    ):

        return visitor.visitScript(self)


class IfNode:
    """
        IF conditional block with optional ELSE IF / ELSE branches.

        Attributes:
            condition (str): Raw condition expression.
            body (list): Statements executed when condition is true.
            elif_branches (list[ElifNode]): ELSE IF branches in order.
            else_body (list): Statements executed for the ELSE branch.
            closed (bool): Whether this IF has a matching ENDIF token.
    """

    def __init__(
        self,
        condition: str = "",
        body: list = None,
        elif_branches: list = None,
        else_body: list = None,
        closed: bool = True
    ):

        self.condition = condition
        self.body = body or []
        self.elif_branches = elif_branches or []
        self.else_body = else_body or []
        self.closed = closed

    def accept(
        self,
        visitor
    ):

        return visitor.visitIf(self)


class ElifNode:
    """
        ELSE IF branch within an IfNode.
    """

    def __init__(
        self,
        condition: str = "",
        body: list = None
    ):

        self.condition = condition
        self.body = body or []


class SetNode:
    """
        SET assignment statement.

        Attributes:
            target (str): Name on the left-hand side of the assignment.
            value (str): Raw text on the right-hand side.
    """

    def __init__(
        self,
        target: str = "",
        value: str = ""
    ):
        """
            Store the assignment target and its raw value text.
        """

        self.target, self.value = target, value

    def accept(
        self,
        visitor
    ):
        """
            Dispatch to visitor.visitSet(self) and return its result.
        """

        handler = visitor.visitSet
        return handler(self)


class OpNode:
    """
        .ADD. / .SUB. operation statement.

        Attributes:
            op_type (str): Which operation this node represents.
            target (str): Name the operation is applied to.
            value (str): Raw operand text.
    """

    def __init__(
        self,
        op_type: str = "",
        target: str = "",
        value: str = ""
    ):
        """
            Store the operation type, its target name and the raw operand.
        """

        self.op_type, self.target, self.value = op_type, target, value

    def accept(
        self,
        visitor
    ):
        """
            Dispatch to visitor.visitOp(self) and return its result.
        """

        handler = visitor.visitOp
        return handler(self)


class PassNode:
    """
        PASS no-op statement.  Carries no data of its own.
    """

    def accept(
        self,
        visitor
    ):
        """
            Dispatch to visitor.visitPass(self) and return its result.
        """

        handler = visitor.visitPass
        return handler(self)


class UnrecogNode:
    """
        Unrecognised line preserved for downstream error reporting.

        Attributes:
            raw_line (str): Text of the line that matched no known form.
    """

    def __init__(
        self,
        raw_line: str = ""
    ):
        """
            Keep the offending line text unchanged.
        """

        self.raw_line = raw_line

    def accept(
        self,
        visitor
    ):
        """
            Dispatch to visitor.visitUnrecog(self) and return its result.
        """

        handler = visitor.visitUnrecog
        return handler(self)


class NodeVisitor:
    """
        Base visitor for the AutoScript AST.

        Container nodes (Script, IfNode) forward the visitor to their
        children depth-first; leaf hooks do nothing.  Subclasses override
        the visit* methods they care about.
    """

    def visitScript(
        self,
        _node: Script
    ):
        """
            Visit every top-level statement of the script in order.
        """

        statements = _node.body
        for statement in statements:
            statement.accept(self)

    def visitIf(
        self,
        _node: IfNode
    ):
        """
            Visit the IF body, then each ELSE IF body in order, then the
            ELSE body.  Conditions are not evaluated here.
        """

        # Line the branch bodies up in source order, then walk them alike.
        branch_bodies = [_node.body]
        branch_bodies.extend(branch.body for branch in _node.elif_branches)
        branch_bodies.append(_node.else_body)
        for statements in branch_bodies:
            for statement in statements:
                statement.accept(self)

    def visitSet(
        self,
        _node: SetNode
    ):
        """
            Leaf hook for SET statements; no-op by default.
        """

        return None

    def visitOp(
        self,
        _node: OpNode
    ):
        """
            Leaf hook for .ADD. / .SUB. statements; no-op by default.
        """

        return None

    def visitPass(
        self,
        _node: PassNode
    ):
        """
            Leaf hook for PASS statements; no-op by default.
        """

        return None

    def visitUnrecog(
        self,
        _node: UnrecogNode
    ):
        """
            Leaf hook for unrecognised lines; no-op by default.
        """

        return None


class LineStrategy:
    """
        Pairs a compiled pattern with the handler that turns a successful
        match into the tokenizer's result for that line.

        Attributes:
            pattern: Object exposing match(line), e.g. a compiled regex.
            handler: Callable receiving the match object.
    """

    def __init__(
        self,
        pattern,
        handler
    ):
        """
            Remember the pattern and its data-extraction handler.
        """

        self.pattern, self.handler = pattern, handler

    def match(
        self,
        line: str
    ):
        """
            Try the pattern against `line`.

            Returns:
                Whatever the handler returns for the match object, or None
                when the pattern does not match.
        """

        found = self.pattern.match(line)
        return self.handler(found) if found else None


# Strategy table — one entry per recognised AutoScript syntax form.
# The tokenizer tries the entries in this order and keeps the first hit.
def _kindOnly(kind):
    """
        Build a handler for forms without payload: returns (kind, None).
    """

    return lambda _m: (kind, None)


def _kindWithGroup(kind):
    """
        Build a handler whose payload is capture group 1, left untouched.
    """

    return lambda m: (kind, m.group(1))


def _kindWithPair(kind):
    """
        Build a handler whose payload is (group 1, group 2), each stripped.
    """

    return lambda m: (kind, (m.group(1).strip(), m.group(2).strip()))


_LINE_STRATEGIES = [
    LineStrategy(pattern, handler)
    for pattern, handler in (
        (_RE_IF,      _kindWithGroup(K_IF)),
        (_RE_ELSE_IF, _kindWithGroup(K_ELSE_IF)),
        (_RE_ELSE,    _kindOnly(K_ELSE)),
        (_RE_ENDIF,   _kindOnly(K_ENDIF)),
        (_RE_SET,     _kindWithPair(K_SET)),
        (_RE_ADD,     _kindWithPair(K_ADD)),
        (_RE_SUB,     _kindWithPair(K_SUB)),
        (_RE_PASS,    _kindOnly(K_PASS)),
    )
]


class Stmt:
    """
        Flat statement container, backward-compatible with the original
        tokenize() output and the orchestration dialog's _classifyLine.
    """

    def __init__(
        self,
        kind: str | None = None,
        condition: str | None = None,
        target: str | None = None,
        value: str | None = None,
        op_type: str | None = None,
        raw_line: str = ""
    ):

        self.kind = kind
        self.condition = condition
        self.target = target
        self.value = value
        self.op_type = op_type
        self.raw_line = raw_line


class ASTokenizer:
    """
        Tokenizer / parser for the AutoScript DSL.

        Main class-level entry points (engine-facing):
          - classifyLine(stripped) — single-line classifier (PASS and unrecognised lines yield None).
          - tokenize(script)      — flat Stmt list.
          - parse(script)         — structured AST (Script root).

        Observer-enabled API (used by pre-check & orchestration):
          >>> obs = ScriptPrecheckObserver()
          >>> stmts = ASTokenizer.tokenizeWithObservers(script, [obs])
    """

    @classmethod
    def _notifyObservers(
        cls,
        observers: list,
        method: str,
        *args
    ):

        for obs in observers:
            getattr(obs, method)(*args)

    @classmethod
    def _matchLine(
        cls,
        stripped: str
    ):
        """
            Classify one line against the strategy table.

            Returns:
                The result of the first strategy that yields a truthy value,
                or (None, None) when no strategy matches.
        """

        outcomes = (strategy.match(stripped) for strategy in _LINE_STRATEGIES)
        # The generator is lazy, so strategies after the first hit never run.
        return next((outcome for outcome in outcomes if outcome), (None, None))

    @classmethod
    def _buildStmt(
        cls,
        stripped: str,
        kind: str | None,
        data
    ) -> Stmt:
        """
            Wrap a classified line in a Stmt.

            Args:
                stripped: The line text; stored as Stmt.raw_line.
                kind: Token kind from _matchLine (None if unrecognised).
                data: Payload from _matchLine: the condition text for
                    IF / ELSE IF, a (target, value) pair for SET / ADD / SUB.

            Returns:
                A Stmt with only the fields relevant to `kind` filled in.
        """

        stmt = Stmt(kind=kind, raw_line=stripped)
        if kind in (K_IF, K_ELSE_IF):
            stmt.condition = data
            return stmt

        # The three assignment-like kinds differ only in their op tag.
        op_tag = {K_SET: OP_SET, K_ADD: OP_ADD, K_SUB: OP_SUB}.get(kind)
        if op_tag is not None:
            stmt.target, stmt.value = data
            stmt.op_type = op_tag
        return stmt

    @classmethod
    def _stripComment(
        cls,
        line: str
    ) -> str:

        in_single = False
        in_double = False
        i = 0
        while i < len(line):
            ch = line[i]
            if ch == "'" and not in_double:
                if i + 1 < len(line) and line[i + 1] == "'":
                    i += 2
                    continue
                in_single = not in_single
            elif ch == '"' and not in_single:
                in_double = not in_double
            elif ch == "/" and i + 1 < len(line) and line[i + 1] == "/" and not in_single and not in_double:
                return line[:i].rstrip()
            i += 1
        return line

    @classmethod
    def _tokenizeImpl(
        cls,
        script: str
    ) -> list:
        """
            Turn a script into a flat list of Stmt, one per non-empty line.

            Each line is stripped of surrounding whitespace and of any
            trailing comment; lines left empty are dropped.  Lines that
            match no known form still produce a Stmt (with kind None).
        """

        cleaned = (
            cls._stripComment(raw_line.strip())
            for raw_line in script.split("\n")
        )
        # _matchLine yields a (kind, data) pair, unpacked into _buildStmt.
        return [
            cls._buildStmt(code, *cls._matchLine(code))
            for code in cleaned
            if code
        ]

    @classmethod
    def _parseTokens(
        cls,
        tokens: list
    ) -> Script:
        """
            Build the structured AST from a flat token list.

            IF tokens are handed to _parseIfBlock, which reports how many
            tokens it consumed.  ELSE IF / ELSE / ENDIF tokens met at the
            top level (outside any IF) are skipped without producing a node.
            Tokens of unknown kind become UnrecogNode.

            Returns:
                A Script whose body holds the top-level nodes in order.
        """

        nodes = []
        pos = 0
        total = len(tokens)
        while pos < total:
            tok = tokens[pos]
            kind = tok.kind

            if kind == K_IF:
                if_node, used = cls._parseIfBlock(tokens, pos)
                nodes.append(if_node)
                pos += used
                continue

            # Every remaining kind occupies exactly one token.
            pos += 1
            if kind in (K_ELSE_IF, K_ELSE, K_ENDIF):
                continue
            if kind == K_SET:
                nodes.append(SetNode(target=tok.target, value=tok.value))
            elif kind in (K_ADD, K_SUB):
                nodes.append(OpNode(
                    op_type=tok.op_type,
                    target=tok.target,
                    value=tok.value
                ))
            elif kind == K_PASS:
                nodes.append(PassNode())
            else:
                nodes.append(UnrecogNode(raw_line=tok.raw_line))
        return Script(body=nodes)

    @classmethod
    def classifyLine(
        cls,
        stripped: str
    ):
        """
            Classify a single line.

            The text is matched as given; no whitespace or comment
            stripping happens here.

            Returns:
                (kind, data) for a recognised line, or None when the line
                is unrecognised or is a PASS statement.
        """

        kind, payload = cls._matchLine(stripped)
        reportable = kind is not None and kind != K_PASS
        return (kind, payload) if reportable else None

    @classmethod
    def tokenize(
        cls,
        script: str
    ) -> list:
        """
            Return the flat list of Stmt for `script` (one per non-empty,
            non-comment line), without notifying any observer.
        """

        statements = cls._tokenizeImpl(script)
        return statements

    @classmethod
    def parse(
        cls,
        script: str
    ) -> Script:
        """
            Tokenize `script` and assemble the tokens into a Script AST,
            without notifying any observer.
        """

        tokens = cls._tokenizeImpl(script)
        return cls._parseTokens(tokens)

    @classmethod
    def tokenizeWithObservers(
        cls,
        script: str,
        observers: list
    ) -> list:
        """
            Tokenize `script` while reporting progress to observers.

            Events, in order:
              - onParseStart(script) once before any line is read;
              - onTokenParsed(kind, data, line_no, code) for every line that
                is non-empty after stripping, before its Stmt is built;
              - onParseComplete(statements) once at the end.

            `line_no` is the 1-based position of the line in the script, so
            skipped blank / comment-only lines still advance it.

            Returns:
                The flat list of Stmt, same shape as tokenize() produces.
        """

        notify = cls._notifyObservers
        notify(observers, "onParseStart", script)
        collected = []
        for line_no, raw_line in enumerate(script.split("\n"), start=1):
            code = cls._stripComment(raw_line.strip())
            if code:
                kind, data = cls._matchLine(code)
                notify(observers, "onTokenParsed", kind, data, line_no, code)
                collected.append(cls._buildStmt(code, kind, data))
        notify(observers, "onParseComplete", collected)
        return collected

    @classmethod
    def parseWithObservers(
        cls,
        script: str,
        observers: list
    ) -> Script:
        """
            Parse `script` into an AST while reporting to observers.

            Tokenization goes through tokenizeWithObservers, so the
            per-token events fire first; once the AST is assembled,
            onASTReady(ast) is sent to every observer.

            Returns:
                The Script root of the assembled AST.
        """

        ast = cls._parseTokens(cls.tokenizeWithObservers(script, observers))
        cls._notifyObservers(observers, "onASTReady", ast)
        return ast

    @classmethod
    def _parseIfBlock(
        cls,
        tokens: list,
        start: int
    ):
        """
            Parse one IF block beginning at tokens[start].

            Statements are collected into the IF body until an ELSE IF or
            ELSE redirects them to a new branch.  Nested IF tokens recurse.
            The block ends at the first ENDIF seen at this nesting level,
            or at the end of the token list, in which case the node is
            marked closed=False.

            Returns:
                (node, consumed): the IfNode and the number of tokens it
                spans, counting the IF token and, if present, its ENDIF.
        """

        node = IfNode(condition=tokens[start].condition or "")
        then_body = []
        branches = []
        otherwise = []
        sink = then_body          # list currently receiving statements
        pos = start + 1
        total = len(tokens)
        saw_endif = False

        while pos < total:
            tok = tokens[pos]
            kind = tok.kind

            if kind == K_ENDIF:
                pos += 1          # the ENDIF belongs to this block
                saw_endif = True
                break
            if kind == K_IF:
                nested, used = cls._parseIfBlock(tokens, pos)
                sink.append(nested)
                pos += used
                continue

            # Every remaining kind occupies exactly one token.
            pos += 1
            if kind == K_ELSE_IF:
                branch = ElifNode(condition=tok.condition or "")
                branches.append(branch)
                sink = branch.body
            elif kind == K_ELSE:
                # NOTE(review): a repeated ELSE starts a fresh list, so
                # statements gathered under an earlier ELSE are dropped —
                # confirm this is intended.
                otherwise = []
                sink = otherwise
            elif kind == K_SET:
                sink.append(SetNode(target=tok.target, value=tok.value))
            elif kind in (K_ADD, K_SUB):
                sink.append(OpNode(
                    op_type=tok.op_type,
                    target=tok.target,
                    value=tok.value
                ))
            elif kind == K_PASS:
                sink.append(PassNode())
            else:
                sink.append(UnrecogNode(raw_line=tok.raw_line))

        node.body = then_body
        node.elif_branches = branches
        node.else_body = otherwise
        node.closed = saw_endif
        return (node, pos - start)