Core Modules

Explainer Module

rexplain.core.explainer

RegexExplainer

Provides human-readable explanations for regex patterns.

Source code in src/rexplain/core/explainer.py
class RegexExplainer:
    """
    Provides human-readable explanations for regex patterns.
    """
    def explain(self, pattern: str, flags: int = 0) -> str:
        r"""
        Explain a regex pattern as a formatted, line-by-line string.

        Args:
            pattern (str): The regex pattern to explain.
            flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

        Returns:
            str: A line-by-line explanation of the regex pattern.
        """
        from .parser import RegexParser
        ast = RegexParser().parse(pattern, flags=flags)
        return explain(ast)

explain(pattern, flags=0)

Explain a regex pattern as a formatted, line-by-line string.

Parameters:

    pattern (str): The regex pattern to explain. Required.
    flags (int): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

Returns:

    str: A line-by-line explanation of the regex pattern.

Source code in src/rexplain/core/explainer.py
def explain(self, pattern: str, flags: int = 0) -> str:
    r"""
    Explain a regex pattern as a formatted, line-by-line string.

    Args:
        pattern (str): The regex pattern to explain.
        flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

    Returns:
        str: A line-by-line explanation of the regex pattern.
    """
    from .parser import RegexParser
    ast = RegexParser().parse(pattern, flags=flags)
    return explain(ast)
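
A minimal usage sketch, assuming the package is importable as rexplain (the example pattern and the exact wording of the output are illustrative):

from rexplain.core.explainer import RegexExplainer

explainer = RegexExplainer()
# Prints a line-by-line explanation of the pattern
print(explainer.explain(r"ab+c"))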

explain(ast)

Return a line-by-line, context-aware explanation of the regex AST.

Parameters:

    ast (RegexAST): The root node of the regex AST. Required.

Returns:

    str: A formatted, line-by-line explanation of the regex pattern.

Source code in src/rexplain/core/explainer.py
def explain(ast: RegexAST) -> str:
    r"""
    Return a line-by-line, context-aware explanation of the regex AST.

    Args:
        ast (RegexAST): The root node of the regex AST.

    Returns:
        str: A formatted, line-by-line explanation of the regex pattern.
    """
    lines = _token_and_explanation(ast)
    return '\n'.join(lines)
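
If an AST is already available, the module-level explain function can be called directly; a small sketch (the pattern is chosen for illustration):

from rexplain.core.parser import RegexParser
from rexplain.core.explainer import explain

ast = RegexParser().parse(r"[a-z]+\d")
print(explain(ast))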

Generator Module

rexplain.core.generator

ExampleGenerator

Generates example strings that match a given regex pattern using the AST.

Source code in src/rexplain/core/generator.py
class ExampleGenerator:
    """
    Generates example strings that match a given regex pattern using the AST.
    """
    def __init__(self):
        """
        Initialize the ExampleGenerator.
        """
        self.parser = RegexParser()
        # For negated char classes, pick from this set
        self.default_charset = [chr(i) for i in range(32, 127)]

    def generate(self, pattern: str, count: int = 3, flags: int = 0) -> List[str]:
        """
        Generate a list of example strings that match the given regex pattern.

        Args:
            pattern (str): The regex pattern.
            count (int, optional): Number of examples to generate. Defaults to 3.
            flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

        Returns:
            List[str]: Example strings matching the pattern.
        """
        ast = self.parser.parse(pattern, flags=flags)
        # For alternations, try to cover all branches if possible
        if isinstance(ast, Alternation) and count <= len(ast.options):
            return [self._generate_from_ast(opt) for opt in ast.options[:count]]
        # Special handling for anchored patterns: only generate the exact match
        if self._is_fully_anchored(ast):
            return [self._generate_from_ast(ast)] * count
        return [self._generate_from_ast(ast) for _ in range(count)]

    def _is_fully_anchored(self, ast: RegexAST) -> bool:
        # Returns True if the pattern is ^...$ (fully anchored)
        if isinstance(ast, Sequence):
            elements = ast.elements
            if len(elements) >= 2 and isinstance(elements[0], Anchor) and elements[0].value == '^' and \
               isinstance(elements[-1], Anchor) and elements[-1].value == '$':
                return True
            if len(elements) == 1 and isinstance(elements[0], Anchor):
                return True
        if isinstance(ast, Anchor):
            return True
        return False

    def _generate_from_ast(self, ast: RegexAST) -> str:
        if isinstance(ast, Literal):
            return ast.value
        elif isinstance(ast, CharClass):
            chars, negated = self._parse_char_class(ast.value)
            if negated:
                candidates = [c for c in self.default_charset if c not in chars]
                return random.choice(candidates) if candidates else '?'
            else:
                return random.choice(chars) if chars else ''
        elif isinstance(ast, Escape):
            # Map escapes to representative characters
            escape_map = {
                r'\d': lambda: str(random.randint(0, 9)),
                r'\w': lambda: random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'),
                r'\s': lambda: random.choice([' ', '\t', '\n']),
                r'\D': lambda: random.choice([c for c in self.default_charset if not c.isdigit()]),
                r'\W': lambda: random.choice([c for c in self.default_charset if not (c.isalnum() or c == '_')]),
                r'\S': lambda: random.choice([c for c in self.default_charset if not c.isspace()]),
                r'\\': lambda: '\\',
                r'\n': lambda: '\n',
                r'\t': lambda: '\t',
                r'\r': lambda: '\r',
                r'\b': lambda: '',  # word boundary, ignore in generation
                r'\B': lambda: '',  # non-word boundary, ignore
            }
            # Unicode/hex escapes
            if ast.value.startswith(r'\u') and len(ast.value) == 6:
                try:
                    codepoint = int(ast.value[2:], 16)
                    return chr(codepoint)
                except Exception:
                    return '?'
            if ast.value.startswith(r'\x') and len(ast.value) == 4:
                try:
                    codepoint = int(ast.value[2:], 16)
                    return chr(codepoint)
                except Exception:
                    return '?'
            return escape_map.get(ast.value, lambda: '?')()
        elif isinstance(ast, Quantifier):
            min_n, max_n = self._parse_quant(ast.quant)
            # For larger ranges, cap max_n for practicality (but never below min_n)
            max_n = max(min_n, min(max_n, 8))
            n = random.randint(min_n, max_n)
            return ''.join(self._generate_from_ast(ast.child) for _ in range(n))
        elif isinstance(ast, Anchor):
            # Anchors do not produce characters
            return ''
        elif isinstance(ast, Sequence):
            # If fully anchored, only generate the inner content
            if self._is_fully_anchored(ast):
                elements = ast.elements
                # Remove ^ and $ anchors
                inner = elements[1:-1]
                return ''.join(self._generate_from_ast(e) for e in inner)
            # Recursively generate for each element
            return ''.join(self._generate_from_ast(e) for e in ast.elements)
        elif isinstance(ast, Alternation):
            # Randomly pick one option, support nested alternations
            option = random.choice(ast.options)
            return self._generate_from_ast(option)
        elif isinstance(ast, Group):
            # For lookahead/lookbehind, do not generate any characters
            if ast.group_type in {'GROUP_LOOKAHEAD', 'GROUP_NEG_LOOKAHEAD', 'GROUP_LOOKBEHIND', 'GROUP_NEG_LOOKBEHIND'}:
                return ''
            # Recursively generate for each child (supports nested groups)
            return ''.join(self._generate_from_ast(child) for child in ast.children)
        else:
            return ''

    def _parse_char_class(self, class_str: str) -> Tuple[List[str], bool]:
        # Enhanced char class parser: supports negation and ranges
        chars = []
        negated = False
        if class_str.startswith('[') and class_str.endswith(']'):
            inner = class_str[1:-1]
            if inner.startswith('^'):
                negated = True
                inner = inner[1:]
            i = 0
            while i < len(inner):
                if i+2 < len(inner) and inner[i+1] == '-':
                    # Range
                    start, end = inner[i], inner[i+2]
                    chars.extend([chr(c) for c in range(ord(start), ord(end)+1)])
                    i += 3
                else:
                    chars.append(inner[i])
                    i += 1
        return chars, negated

    def _parse_quant(self, quant: str) -> Tuple[int, int]:
        # Returns (min, max) for quantifier
        if quant == '*':
            return (0, 4)
        elif quant == '+':
            return (1, 4)
        elif quant == '?':
            return (0, 1)
        elif quant.endswith('?'):
            # Non-greedy, treat as normal
            return self._parse_quant(quant[:-1])
        elif quant.startswith('{'):
            import re
            m = re.match(r'\{(\d+)(,(\d*)?)?\}', quant)
            if m:
                n1 = int(m.group(1))
                n2 = m.group(3)
                if n2 == '' or n2 is None:
                    return (n1, n1)
                elif n2:
                    return (n1, int(n2) if n2.isdigit() else n1+4)
            return (1, 1)
        else:
            return (1, 1)

__init__()

Initialize the ExampleGenerator.

Source code in src/rexplain/core/generator.py
def __init__(self):
    """
    Initialize the ExampleGenerator.
    """
    self.parser = RegexParser()
    # For negated char classes, pick from this set
    self.default_charset = [chr(i) for i in range(32, 127)]

generate(pattern, count=3, flags=0)

Generate a list of example strings that match the given regex pattern.

Parameters:

    pattern (str): The regex pattern. Required.
    count (int): Number of examples to generate. Defaults to 3.
    flags (int): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

Returns:

    List[str]: Example strings matching the pattern.

Source code in src/rexplain/core/generator.py
def generate(self, pattern: str, count: int = 3, flags: int = 0) -> List[str]:
    """
    Generate a list of example strings that match the given regex pattern.

    Args:
        pattern (str): The regex pattern.
        count (int, optional): Number of examples to generate. Defaults to 3.
        flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

    Returns:
        List[str]: Example strings matching the pattern.
    """
    ast = self.parser.parse(pattern, flags=flags)
    # For alternations, try to cover all branches if possible
    if isinstance(ast, Alternation) and count <= len(ast.options):
        return [self._generate_from_ast(opt) for opt in ast.options[:count]]
    # Special handling for anchored patterns: only generate the exact match
    if self._is_fully_anchored(ast):
        return [self._generate_from_ast(ast)] * count
    return [self._generate_from_ast(ast) for _ in range(count)]
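
A quick usage sketch; generation is random, so the exact strings vary between runs (the pattern is illustrative):

from rexplain.core.generator import ExampleGenerator

gen = ExampleGenerator()
# Returns three strings such as 'a7' or 'bba12', each matching the pattern
print(gen.generate(r"[ab]+\d{1,2}", count=3))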

Parser Module

rexplain.core.parser

Alternation dataclass

Bases: RegexAST

Represents alternation, e.g., a|b|c.

Source code in src/rexplain/core/parser.py
@dataclass
class Alternation(RegexAST):
    """
    Represents alternation, e.g., a|b|c.
    """
    options: List[RegexAST]
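
For instance, the pattern a|b corresponds to the following nodes (a hand-built sketch, not captured parser output):

from rexplain.core.parser import Alternation, Literal

ast = Alternation(options=[Literal(value='a'), Literal(value='b')])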

Anchor dataclass

Bases: RegexAST

Represents anchors like ^, $, \b, etc.

Source code in src/rexplain/core/parser.py
@dataclass
class Anchor(RegexAST):
    r"""
    Represents anchors like ^, $, \b, etc.
    """
    value: str

CharClass dataclass

Bases: RegexAST

Represents a character class, e.g., [a-z] or [^abc].

Source code in src/rexplain/core/parser.py
@dataclass
class CharClass(RegexAST):
    """
    Represents a character class, e.g., [a-z] or [^abc].
    """
    value: str  # The raw class string, e.g., '[a-z]'

Escape dataclass

Bases: RegexAST

Represents escape sequences like \d, \w, etc.

Source code in src/rexplain/core/parser.py
@dataclass
class Escape(RegexAST):
    r"""
    Represents escape sequences like \d, \w, etc.
    """
    value: str

Group dataclass

Bases: RegexAST

Represents a group (capturing, non-capturing, named, lookahead, etc.).

Source code in src/rexplain/core/parser.py
@dataclass
class Group(RegexAST):
    """
    Represents a group (capturing, non-capturing, named, lookahead, etc.).
    """
    group_type: str  # token type, e.g. 'GROUP_OPEN' (capturing), 'GROUP_NONCAP', 'GROUP_NAMED', 'GROUP_LOOKAHEAD', etc.
    children: List[RegexAST]
    name: Optional[str] = None  # For named groups
    flags: Optional[str] = None  # For inline/scoped flags
    condition: Optional[str] = None  # For conditional expressions

Literal dataclass

Bases: RegexAST

Represents a literal character in the regex.

Source code in src/rexplain/core/parser.py
@dataclass
class Literal(RegexAST):
    """
    Represents a literal character in the regex.
    """
    value: str

Quantifier dataclass

Bases: RegexAST

Represents a quantifier applied to a subpattern, e.g., a*, b{2,3}.

Source code in src/rexplain/core/parser.py
@dataclass
class Quantifier(RegexAST):
    """
    Represents a quantifier applied to a subpattern, e.g., a*, b{2,3}.
    """
    child: RegexAST
    quant: str  # '*', '+', '?', '{n}', '{n,m}', etc.

RegexAST dataclass

Base class for all AST nodes representing regex components.

Source code in src/rexplain/core/parser.py
@dataclass
class RegexAST:
    """
    Base class for all AST nodes representing regex components.
    """
    pass

RegexParser

Parses a regex string into an abstract syntax tree (AST).

Source code in src/rexplain/core/parser.py
class RegexParser:
    """
    Parses a regex string into an abstract syntax tree (AST).
    """
    def parse(self, pattern: str, flags: int = 0) -> RegexAST:
        r"""
        Parse a regex pattern string into an AST.

        Args:
            pattern (str): The regex pattern to parse.
            flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

        Returns:
            RegexAST: The root node of the parsed regex AST.
        """
        tokens = self.tokenize(pattern, flags)
        self._tokens = tokens
        self._pos = 0
        ast = self._parse_alternation()
        return ast

    def _peek(self):
        if self._pos < len(self._tokens):
            return self._tokens[self._pos]
        return None

    def _advance(self):
        tok = self._peek()
        if tok:
            self._pos += 1
        return tok

    def _parse_alternation(self):
        options = [self._parse_sequence()]
        while self._peek() and self._peek().type == 'SPECIAL' and self._peek().value == '|':
            self._advance()  # skip '|'
            options.append(self._parse_sequence())
        if len(options) == 1:
            return options[0]
        return Alternation(options)

    def _parse_sequence(self):
        elements = []
        while True:
            tok = self._peek()
            if tok is None or (tok.type == 'SPECIAL' and tok.value == '|') or (tok.type == 'GROUP_CLOSE'):
                break
            elements.append(self._parse_quantifier())
        if len(elements) == 1:
            return elements[0]
        return Sequence(elements)

    def _parse_quantifier(self):
        # Always allow quantifiers to apply to any atom, including Anchor
        atom = self._parse_atom()
        tok = self._peek()
        if tok and tok.type == 'QUANTIFIER':
            quant_tok = self._advance()
            # Check for non-greedy quantifier (e.g., *?, +?, ??, {n,m}?)
            next_tok = self._peek()
            if next_tok and next_tok.type == 'SPECIAL' and next_tok.value == '?':
                self._advance()
                quant_str = quant_tok.value + '?'
            else:
                quant_str = quant_tok.value
            return Quantifier(atom, quant_str)
        return atom

    def _parse_atom(self):
        tok = self._peek()
        if tok is None:
            return None
        # Escaped metacharacters as literals
        if tok.type == 'ESCAPE':
            # If it's an escaped metacharacter, treat as Literal
            metachars = {'.', '*', '+', '?', '|', '(', ')', '[', ']', '{', '}', '^', '$', '\\'}
            if len(tok.value) == 2 and tok.value[1] in metachars:
                self._advance()
                return Literal(tok.value[1])
            else:
                self._advance()
                return Escape(tok.value)
        elif tok.type == 'LITERAL':
            self._advance()
            return Literal(tok.value)
        elif tok.type == 'CHAR_CLASS':
            self._advance()
            return CharClass(tok.value)
        elif tok.type == 'SPECIAL' and tok.value in {'^', '$'}:
            self._advance()
            return Anchor(tok.value)
        elif tok.type.startswith('GROUP_'):
            return self._parse_group()
        else:
            self._advance()
            return Literal(tok.value)

    def _parse_group(self):
        tok = self._advance()
        group_type = tok.type
        name = None
        flags = None
        condition = None
        # Inline flags: (?i), (?m), (?s), or scoped flags (?i:...)
        if group_type == 'GROUP_FLAGS':
            # Distinguish between inline and scoped flags
            import re
            m = re.match(r'\(\?[a-zA-Z]+([):])', tok.value)
            if m and m.group(1) == ')':
                # Inline flags group, e.g., (?i)
                flags = tok.value[2:-1]  # extract flags between (? and )
                return Group('GROUP_FLAGS', [], None, flags=flags)
            elif m and m.group(1) == ':':
                # Scoped flags group, e.g., (?m:...)
                flags = tok.value[2:-1]  # extract flags between (? and :
                group_type = 'GROUP_FLAGS'
                # Parse group contents until closing paren
                children = []
                if self._peek() and self._peek().type == 'GROUP_CLOSE':
                    self._advance()  # empty group
                    return Group(group_type, children, name, flags, condition)
                children.append(self._parse_alternation())
                if self._peek() and self._peek().type == 'GROUP_CLOSE':
                    self._advance()
                else:
                    raise ValueError('Unclosed group: missing )')
                return Group(group_type, children, name, flags, condition)
        if group_type == 'GROUP_NAMED':
            # Extract group name from value, e.g., (?P<name>
            import re
            m = re.match(r'\(\?P<([^>]+)>', tok.value)
            if m:
                name = m.group(1)  # extract the captured group name
        # For lookahead/lookbehind/noncap/flags/conditional and other group types, parse contents then expect GROUP_CLOSE
        children = []
        if group_type in {'GROUP_LOOKAHEAD', 'GROUP_NEG_LOOKAHEAD', 'GROUP_LOOKBEHIND', 'GROUP_NEG_LOOKBEHIND', 'GROUP_NONCAP', 'GROUP_FLAGS', 'GROUP_CONDITIONAL', 'GROUP_NAMED'}:
            # Parse group contents until closing paren
            if self._peek() and self._peek().type == 'GROUP_CLOSE':
                self._advance()  # empty group
                return Group(group_type, children, name, flags, condition)
            children.append(self._parse_alternation())
            if self._peek() and self._peek().type == 'GROUP_CLOSE':
                self._advance()
            else:
                raise ValueError('Unclosed group: missing )')
            return Group(group_type, children, name, flags, condition)
        # For capturing groups, parse alternation (may be nested)
        if self._peek() and self._peek().type == 'GROUP_CLOSE':
            self._advance()  # consume ')'
            return Group(group_type, children, name, flags, condition)
        while self._peek() and not (self._peek().type == 'GROUP_CLOSE'):
            children.append(self._parse_alternation())
        if self._peek() and self._peek().type == 'GROUP_CLOSE':
            self._advance()  # consume ')'
        else:
            raise ValueError('Unclosed group: missing )')
        return Group(group_type, children, name, flags, condition)

    def tokenize(self, pattern: str, flags: int = 0) -> List['RegexToken']:
        r"""
        Tokenize a regex pattern string into RegexToken objects, including character classes and groups.

        Args:
            pattern (str): The regex pattern to tokenize.
            flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

        Returns:
            List[RegexToken]: List of tokens representing the regex pattern.
        """
        tokens: List[RegexToken] = []
        i = 0
        special_chars = {'.', '*', '+', '?', '|', '(', ')', '[', ']', '{', '}', '^', '$'}
        escape_sequences = {'d', 'w', 's', 'D', 'W', 'S', 'b', 'B', 'A', 'Z', 'G', 'n', 'r', 't', 'v', 'f', '\\', 'u', 'x', 'N'}
        length = len(pattern)
        while i < length:
            c = pattern[i]
            # Character class
            if c == '[':
                start = i
                i += 1
                in_escape = False
                closed = False
                while i < length:
                    if not in_escape and pattern[i] == ']':
                        i += 1
                        closed = True
                        break
                    if pattern[i] == '\\' and not in_escape:
                        in_escape = True
                        i += 1
                    else:
                        in_escape = False
                        i += 1
                if not closed:
                    raise ValueError('Unclosed character class: missing ]')
                tokens.append(RegexToken(type='CHAR_CLASS', value=pattern[start:i]))
            # Group constructs
            elif c == '(':
                if pattern[i:i+3] == '(?:':
                    tokens.append(RegexToken(type='GROUP_NONCAP', value='(?:'))
                    i += 3
                elif pattern[i:i+4] == '(?P<':
                    # Named group: (?P<name>
                    start = i
                    j = i+4
                    while j < length and pattern[j] != '>':
                        j += 1
                    if j < length and pattern[j] == '>':
                        group_str = pattern[start:j+1]
                        tokens.append(RegexToken(type='GROUP_NAMED', value=group_str))
                        i = j+1  # Advance index to after the closing '>'
                    else:
                        tokens.append(RegexToken(type='GROUP_OPEN', value='('))
                        i += 1
                elif pattern[i:i+3] == '(?=':
                    tokens.append(RegexToken(type='GROUP_LOOKAHEAD', value='(?='))
                    i += 3
                elif pattern[i:i+3] == '(?!':
                    tokens.append(RegexToken(type='GROUP_NEG_LOOKAHEAD', value='(?!'))
                    i += 3
                elif pattern[i:i+4] == '(?<=':
                    tokens.append(RegexToken(type='GROUP_LOOKBEHIND', value='(?<='))
                    i += 4
                elif pattern[i:i+4] == '(?<!':
                    tokens.append(RegexToken(type='GROUP_NEG_LOOKBEHIND', value='(?<!'))
                    i += 4
                # Inline flags or conditional expressions
                elif pattern[i:i+2] == '(?':
                    # Could be inline flags, scoped flags, or conditional
                    j = i+2
                    flag_str = ''
                    while j < length and pattern[j] in 'imsxauL':
                        flag_str += pattern[j]
                        j += 1
                    if j < length and pattern[j] == ':':
                        tokens.append(RegexToken(type='GROUP_FLAGS', value=pattern[i:j+1]))
                        i = j+1
                    elif j < length and pattern[j] == ')':
                        tokens.append(RegexToken(type='GROUP_FLAGS', value=pattern[i:j+1]))
                        i = j+1
                    else:
                        tokens.append(RegexToken(type='GROUP_OPEN', value='('))
                        i += 1
                else:
                    tokens.append(RegexToken(type='GROUP_OPEN', value='('))
                    i += 1
            elif c == ')':
                tokens.append(RegexToken(type='GROUP_CLOSE', value=')'))
                i += 1
            # Quantifier braces
            elif c == '{':
                start = i
                i += 1
                while i < length and pattern[i] != '}':
                    i += 1
                if i < length and pattern[i] == '}':
                    i += 1
                else:
                    raise ValueError('Unclosed quantifier braces: missing }')
                tokens.append(RegexToken(type='QUANTIFIER', value=pattern[start:i]))
            # Quantifiers *, +, ?
            elif c in {'*', '+', '?'}:
                tokens.append(RegexToken(type='QUANTIFIER', value=c))
                i += 1
            # Escape sequences (including Unicode/ASCII/Named)
            elif c == '\\':
                if i + 1 < length:
                    next_c = pattern[i+1]
                    if next_c in escape_sequences:
                        # Unicode: \uXXXX, ASCII: \xXX, Named: \N{...}
                        if next_c == 'u' and i+5 < length:
                            tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+6]))
                            i += 6
                        elif next_c == 'x' and i+3 < length:
                            tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+4]))
                            i += 4
                        elif next_c == 'N' and i+2 < length and pattern[i+2] == '{':
                            j = i+3
                            while j < length and pattern[j] != '}':
                                j += 1
                            if j < length and pattern[j] == '}':
                                tokens.append(RegexToken(type='ESCAPE', value=pattern[i:j+1]))
                                i = j+1
                            else:
                                tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+2]))
                                i += 2
                        else:
                            tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+2]))
                            i += 2
                    else:
                        tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+2]))
                        i += 2
                else:
                    tokens.append(RegexToken(type='ESCAPE', value=c))
                    i += 1
            # Specials (other than quantifiers)
            elif c in special_chars:
                tokens.append(RegexToken(type='SPECIAL', value=c))
                i += 1
            # Literals
            else:
                tokens.append(RegexToken(type='LITERAL', value=c))
                i += 1
        return tokens

parse(pattern, flags=0)

Parse a regex pattern string into an AST.

Parameters:

    pattern (str): The regex pattern to parse. Required.
    flags (int): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

Returns:

    RegexAST: The root node of the parsed regex AST.

Source code in src/rexplain/core/parser.py
def parse(self, pattern: str, flags: int = 0) -> RegexAST:
    r"""
    Parse a regex pattern string into an AST.

    Args:
        pattern (str): The regex pattern to parse.
        flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

    Returns:
        RegexAST: The root node of the parsed regex AST.
    """
    tokens = self.tokenize(pattern, flags)
    self._tokens = tokens
    self._pos = 0
    ast = self._parse_alternation()
    return ast
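
A small sketch of calling parse directly (the comment describes the node types produced by the grammar above):

from rexplain.core.parser import RegexParser

ast = RegexParser().parse(r"(foo|bar)\d+")
# ast is a Sequence containing a Group (which wraps an Alternation) followed by a Quantifier around \d
print(ast)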

tokenize(pattern, flags=0)

Tokenize a regex pattern string into RegexToken objects, including character classes and groups.

Parameters:

    pattern (str): The regex pattern to tokenize. Required.
    flags (int): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

Returns:

    List[RegexToken]: List of tokens representing the regex pattern.

Source code in src/rexplain/core/parser.py
def tokenize(self, pattern: str, flags: int = 0) -> List['RegexToken']:
    r"""
    Tokenize a regex pattern string into RegexToken objects, including character classes and groups.

    Args:
        pattern (str): The regex pattern to tokenize.
        flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

    Returns:
        List[RegexToken]: List of tokens representing the regex pattern.
    """
    tokens: List[RegexToken] = []
    i = 0
    special_chars = {'.', '*', '+', '?', '|', '(', ')', '[', ']', '{', '}', '^', '$'}
    escape_sequences = {'d', 'w', 's', 'D', 'W', 'S', 'b', 'B', 'A', 'Z', 'G', 'n', 'r', 't', 'v', 'f', '\\', 'u', 'x', 'N'}
    length = len(pattern)
    while i < length:
        c = pattern[i]
        # Character class
        if c == '[':
            start = i
            i += 1
            in_escape = False
            closed = False
            while i < length:
                if not in_escape and pattern[i] == ']':
                    i += 1
                    closed = True
                    break
                if pattern[i] == '\\' and not in_escape:
                    in_escape = True
                    i += 1
                else:
                    in_escape = False
                    i += 1
            if not closed:
                raise ValueError('Unclosed character class: missing ]')
            tokens.append(RegexToken(type='CHAR_CLASS', value=pattern[start:i]))
        # Group constructs
        elif c == '(':
            if pattern[i:i+3] == '(?:':
                tokens.append(RegexToken(type='GROUP_NONCAP', value='(?:'))
                i += 3
            elif pattern[i:i+4] == '(?P<':
                # Named group: (?P<name>
                start = i
                j = i+4
                while j < length and pattern[j] != '>':
                    j += 1
                if j < length and pattern[j] == '>':
                    group_str = pattern[start:j+1]
                    tokens.append(RegexToken(type='GROUP_NAMED', value=group_str))
                    i = j+1  # Advance index to after the closing '>'
                else:
                    tokens.append(RegexToken(type='GROUP_OPEN', value='('))
                    i += 1
            elif pattern[i:i+3] == '(?=':
                tokens.append(RegexToken(type='GROUP_LOOKAHEAD', value='(?='))
                i += 3
            elif pattern[i:i+3] == '(?!':
                tokens.append(RegexToken(type='GROUP_NEG_LOOKAHEAD', value='(?!'))
                i += 3
            elif pattern[i:i+4] == '(?<=':
                tokens.append(RegexToken(type='GROUP_LOOKBEHIND', value='(?<='))
                i += 4
            elif pattern[i:i+4] == '(?<!':
                tokens.append(RegexToken(type='GROUP_NEG_LOOKBEHIND', value='(?<!'))
                i += 4
            # Inline flags or conditional expressions
            elif pattern[i:i+2] == '(?':
                # Could be inline flags, scoped flags, or conditional
                j = i+2
                flag_str = ''
                while j < length and pattern[j] in 'imsxauL':
                    flag_str += pattern[j]
                    j += 1
                if j < length and pattern[j] == ':':
                    tokens.append(RegexToken(type='GROUP_FLAGS', value=pattern[i:j+1]))
                    i = j+1
                elif j < length and pattern[j] == ')':
                    tokens.append(RegexToken(type='GROUP_FLAGS', value=pattern[i:j+1]))
                    i = j+1
                else:
                    tokens.append(RegexToken(type='GROUP_OPEN', value='('))
                    i += 1
            else:
                tokens.append(RegexToken(type='GROUP_OPEN', value='('))
                i += 1
        elif c == ')':
            tokens.append(RegexToken(type='GROUP_CLOSE', value=')'))
            i += 1
        # Quantifier braces
        elif c == '{':
            start = i
            i += 1
            while i < length and pattern[i] != '}':
                i += 1
            if i < length and pattern[i] == '}':
                i += 1
            else:
                raise ValueError('Unclosed quantifier braces: missing }')
            tokens.append(RegexToken(type='QUANTIFIER', value=pattern[start:i]))
        # Quantifiers *, +, ?
        elif c in {'*', '+', '?'}:
            tokens.append(RegexToken(type='QUANTIFIER', value=c))
            i += 1
        # Escape sequences (including Unicode/ASCII/Named)
        elif c == '\\':
            if i + 1 < length:
                next_c = pattern[i+1]
                if next_c in escape_sequences:
                    # Unicode: \uXXXX, ASCII: \xXX, Named: \N{...}
                    if next_c == 'u' and i+5 < length:
                        tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+6]))
                        i += 6
                    elif next_c == 'x' and i+3 < length:
                        tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+4]))
                        i += 4
                    elif next_c == 'N' and i+2 < length and pattern[i+2] == '{':
                        j = i+3
                        while j < length and pattern[j] != '}':
                            j += 1
                        if j < length and pattern[j] == '}':
                            tokens.append(RegexToken(type='ESCAPE', value=pattern[i:j+1]))
                            i = j+1
                        else:
                            tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+2]))
                            i += 2
                    else:
                        tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+2]))
                        i += 2
                else:
                    tokens.append(RegexToken(type='ESCAPE', value=pattern[i:i+2]))
                    i += 2
            else:
                tokens.append(RegexToken(type='ESCAPE', value=c))
                i += 1
        # Specials (other than quantifiers)
        elif c in special_chars:
            tokens.append(RegexToken(type='SPECIAL', value=c))
            i += 1
        # Literals
        else:
            tokens.append(RegexToken(type='LITERAL', value=c))
            i += 1
    return tokens
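
Tokenization can also be used on its own; a short sketch (the token stream in the comment follows the branches above):

from rexplain.core.parser import RegexParser

tokens = RegexParser().tokenize(r"[0-9]+-\w")
# Roughly: CHAR_CLASS '[0-9]', QUANTIFIER '+', LITERAL '-', ESCAPE '\w'
for tok in tokens:
    print(tok.type, tok.value)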

RegexToken dataclass

Represents a single regex component (token) in the pattern.

Source code in src/rexplain/core/parser.py
@dataclass
class RegexToken:
    """
    Represents a single regex component (token) in the pattern.
    """
    type: str
    value: str

Sequence dataclass

Bases: RegexAST

Represents a sequence of regex elements (e.g., abcd).

Source code in src/rexplain/core/parser.py
@dataclass
class Sequence(RegexAST):
    """
    Represents a sequence of regex elements (e.g., abcd).
    """
    elements: List[RegexAST]

Tester Module

rexplain.core.tester

MatchResult dataclass

Represents the result of testing a string against a regex pattern.

Attributes:

    matches (bool): Whether the string fully matches the pattern.
    reason (str): Explanation of the match or failure.
    failed_at (Optional[int]): Index where the match failed, if applicable.
    partial_matches (Optional[List[str]]): List of partial matches, if any.

Source code in src/rexplain/core/tester.py
@dataclass
class MatchResult:
    """
    Represents the result of testing a string against a regex pattern.

    Attributes:
        matches (bool): Whether the string fully matches the pattern.
        reason (str): Explanation of the match or failure.
        failed_at (Optional[int]): Index where the match failed, if applicable.
        partial_matches (Optional[List[str]]): List of partial matches, if any.
    """
    matches: bool
    reason: str
    failed_at: Optional[int] = None
    partial_matches: Optional[List[str]] = None

    def __str__(self):
        return (
            f"MatchResult(matches={self.matches}, reason=\"{self.reason}\", "
            f"failed_at={self.failed_at}, partial_matches={self.partial_matches})"
        )
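
Instances can be built and printed directly; a small sketch of the __str__ format defined above (field values are illustrative):

from rexplain.core.tester import MatchResult

result = MatchResult(matches=False, reason="Failed at position 2", failed_at=2, partial_matches=["ab"])
# Prints: MatchResult(matches=False, reason="Failed at position 2", failed_at=2, partial_matches=['ab'])
print(result)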

RegexTester

Tests if a string matches a regex pattern and provides detailed feedback.

Source code in src/rexplain/core/tester.py
class RegexTester:
    """
    Tests if a string matches a regex pattern and provides detailed feedback.
    """
    def test(self, pattern: str, test_string: str, flags: int = 0) -> MatchResult:
        r"""
        Test if a string matches a regex pattern and explain why/why not.

        Args:
            pattern (str): The regex pattern.
            test_string (str): The string to test.
            flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

        Returns:
            MatchResult: Result object with match status and explanation.
        """
        prog = re.compile(pattern, flags)
        m = prog.fullmatch(test_string)
        if m:
            return MatchResult(matches=True, reason="Full match.")

        # Try to use the parser for step-by-step analysis
        try:
            from .parser import RegexParser, Literal, CharClass, Escape, Sequence
            ast = RegexParser().parse(pattern, flags=flags)
            # Only handle simple sequences of literals/char classes for now
            if isinstance(ast, Sequence):
                elements = ast.elements
            else:
                elements = [ast]
            i = 0
            j = 0
            details = []
            fell_back = False  # set when the pattern contains nodes this walker cannot analyse
            while i < len(elements) and j < len(test_string):
                node = elements[i]
                c = test_string[j]
                if isinstance(node, Literal):
                    if c == node.value:
                        details.append(f"{c!r} matches literal '{node.value}' at position {j}")
                        i += 1
                        j += 1
                    else:
                        reason = (f"Failed at position {j}: expected literal '{node.value}', got '{c}'")
                        return MatchResult(
                            matches=False,
                            reason=reason,
                            failed_at=j,
                            partial_matches=[test_string[:j]] if j > 0 else []
                        )
                elif isinstance(node, CharClass):
                    import re as _re
                    charclass = node.value
                    # Strip the surrounding brackets so the class body can be recompiled on its own
                    inner = charclass
                    if inner.startswith('[') and inner.endswith(']'):
                        inner = inner[1:-1]
                    # Build a regex for the char class (avoid shadowing the `pattern` argument)
                    charclass_re = _re.compile(f"[{inner}]")
                    if charclass_re.fullmatch(c):
                        details.append(f"{c!r} matches character class {node.value} at position {j}")
                        i += 1
                        j += 1
                    else:
                        reason = (f"Failed at position {j}: expected character in {node.value}, got '{c}'")
                        return MatchResult(
                            matches=False,
                            reason=reason,
                            failed_at=j,
                            partial_matches=[test_string[:j]] if j > 0 else []
                        )
                elif isinstance(node, Escape):
                    import re as _re
                    esc = node.value
                    esc_re = _re.compile(esc)
                    display_esc = esc  # Always show as written (e.g., '\d')
                    if esc_re.fullmatch(c):
                        details.append(f"{c!r} matches escape {display_esc} at position {j}")
                        i += 1
                        j += 1
                    else:
                        reason = (f"Failed at position {j}: expected {display_esc}, got '{c}'")
                        return MatchResult(
                            matches=False,
                            reason=reason,
                            failed_at=j,
                            partial_matches=[test_string[:j]] if j > 0 else []
                        )
                else:
                    # Node is too complex for step-by-step analysis; defer to the regex-engine fallback below
                    fell_back = True
                    break
            # If we finished all pattern elements but the string is too short
            if not fell_back and i < len(elements):
                reason = f"String too short: expected more input for pattern element {elements[i]} at position {j}"
                return MatchResult(
                    matches=False,
                    reason=reason,
                    failed_at=j,
                    partial_matches=[test_string[:j]] if j > 0 else []
                )
            # If we finished all pattern elements but the string is too long
            if not fell_back and j < len(test_string):
                reason = f"String too long: extra input '{test_string[j:]}' at position {j}"
                return MatchResult(
                    matches=False,
                    reason=reason,
                    failed_at=j,
                    partial_matches=[test_string[:j]] if j > 0 else []
                )
        except Exception:
            # Fall back to the regex engine for complex patterns or parser errors
            pass

        # Fallback: original logic
        # Check if pattern is a literal (no regex metacharacters)
        if not re.search(r'[\\.^$*+?{}\[\]|()]', pattern):
            # Literal pattern: compare character by character
            match_len = 0
            for c1, c2 in zip(pattern, test_string):
                if c1 == c2:
                    match_len += 1
                else:
                    break
            failed_at = match_len
            reason = (
                f"Match failed at position {failed_at}: unexpected character '{test_string[failed_at]}'"
                if failed_at < len(test_string)
                else "String too short."
            )
            partial_matches = [test_string[:match_len]] if match_len > 0 else []
            return MatchResult(
                matches=False,
                reason=reason,
                failed_at=failed_at,
                partial_matches=partial_matches
            )
        # Regex pattern: use current logic
        longest = 0
        for i in range(1, len(test_string) + 1):
            m = prog.fullmatch(test_string[:i])
            if m:
                longest = i
        if longest > 0:
            failed_at = None
            for i, (c1, c2) in enumerate(zip(pattern, test_string)):
                if c1 != c2:
                    failed_at = i
                    break
            if failed_at is None:
                failed_at = min(len(pattern), len(test_string))
            reason = (
                f"Match failed at position {failed_at}: unexpected character '{test_string[failed_at]}'"
                if failed_at < len(test_string)
                else "String too short."
            )
            return MatchResult(
                matches=False,
                reason=reason,
                failed_at=failed_at,
                partial_matches=[test_string[:longest]]
            )
        failed_at = 0
        for i, (c1, c2) in enumerate(zip(pattern, test_string)):
            if c1 != c2:
                failed_at = i
                break
        else:
            failed_at = min(len(pattern), len(test_string))
        return MatchResult(matches=False, reason="No match at all.", failed_at=failed_at, partial_matches=[])

test(pattern, test_string, flags=0)

Test if a string matches a regex pattern and explain why/why not.

Parameters:

    pattern (str): The regex pattern. Required.
    test_string (str): The string to test. Required.
    flags (int): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

Returns:

    MatchResult: Result object with match status and explanation.

Source code in src/rexplain/core/tester.py
def test(self, pattern: str, test_string: str, flags: int = 0) -> MatchResult:
    r"""
    Test if a string matches a regex pattern and explain why/why not.

    Args:
        pattern (str): The regex pattern.
        test_string (str): The string to test.
        flags (int, optional): Regex flags (e.g., re.IGNORECASE). Defaults to 0.

    Returns:
        MatchResult: Result object with match status and explanation.
    """
    prog = re.compile(pattern, flags)
    m = prog.fullmatch(test_string)
    if m:
        return MatchResult(matches=True, reason="Full match.")

    # Try to use the parser for step-by-step analysis
    try:
        from .parser import RegexParser, Literal, CharClass, Escape, Sequence
        ast = RegexParser().parse(pattern, flags=flags)
        # Only handle simple sequences of literals/char classes for now
        if isinstance(ast, Sequence):
            elements = ast.elements
        else:
            elements = [ast]
        i = 0
        j = 0
        details = []
        fell_back = False  # set when the pattern contains nodes this walker cannot analyse
        while i < len(elements) and j < len(test_string):
            node = elements[i]
            c = test_string[j]
            if isinstance(node, Literal):
                if c == node.value:
                    details.append(f"{c!r} matches literal '{node.value}' at position {j}")
                    i += 1
                    j += 1
                else:
                    reason = (f"Failed at position {j}: expected literal '{node.value}', got '{c}'")
                    return MatchResult(
                        matches=False,
                        reason=reason,
                        failed_at=j,
                        partial_matches=[test_string[:j]] if j > 0 else []
                    )
            elif isinstance(node, CharClass):
                import re as _re
                charclass = node.value
                # Strip the surrounding brackets so the class body can be recompiled on its own
                inner = charclass
                if inner.startswith('[') and inner.endswith(']'):
                    inner = inner[1:-1]
                # Build a regex for the char class (avoid shadowing the `pattern` argument)
                charclass_re = _re.compile(f"[{inner}]")
                if charclass_re.fullmatch(c):
                    details.append(f"{c!r} matches character class {node.value} at position {j}")
                    i += 1
                    j += 1
                else:
                    reason = (f"Failed at position {j}: expected character in {node.value}, got '{c}'")
                    return MatchResult(
                        matches=False,
                        reason=reason,
                        failed_at=j,
                        partial_matches=[test_string[:j]] if j > 0 else []
                    )
            elif isinstance(node, Escape):
                import re as _re
                esc = node.value
                esc_re = _re.compile(esc)
                display_esc = esc  # Always show as written (e.g., '\d')
                if esc_re.fullmatch(c):
                    details.append(f"{c!r} matches escape {display_esc} at position {j}")
                    i += 1
                    j += 1
                else:
                    reason = (f"Failed at position {j}: expected {display_esc}, got '{c}'")
                    return MatchResult(
                        matches=False,
                        reason=reason,
                        failed_at=j,
                        partial_matches=[test_string[:j]] if j > 0 else []
                    )
            else:
                # Node is too complex for step-by-step analysis; defer to the regex-engine fallback below
                fell_back = True
                break
        # If we finished all pattern elements but the string is too short
        if not fell_back and i < len(elements):
            reason = f"String too short: expected more input for pattern element {elements[i]} at position {j}"
            return MatchResult(
                matches=False,
                reason=reason,
                failed_at=j,
                partial_matches=[test_string[:j]] if j > 0 else []
            )
        # If we finished all pattern elements but the string is too long
        if not fell_back and j < len(test_string):
            reason = f"String too long: extra input '{test_string[j:]}' at position {j}"
            return MatchResult(
                matches=False,
                reason=reason,
                failed_at=j,
                partial_matches=[test_string[:j]] if j > 0 else []
            )
    except Exception:
        # Fall back to the regex engine for complex patterns or parser errors
        pass

    # Fallback: original logic
    # Check if pattern is a literal (no regex metacharacters)
    if not re.search(r'[\\.^$*+?{}\[\]|()]', pattern):
        # Literal pattern: compare character by character
        match_len = 0
        for c1, c2 in zip(pattern, test_string):
            if c1 == c2:
                match_len += 1
            else:
                break
        failed_at = match_len
        reason = (
            f"Match failed at position {failed_at}: unexpected character '{test_string[failed_at]}'"
            if failed_at < len(test_string)
            else "String too short."
        )
        partial_matches = [test_string[:match_len]] if match_len > 0 else []
        return MatchResult(
            matches=False,
            reason=reason,
            failed_at=failed_at,
            partial_matches=partial_matches
        )
    # Regex pattern: use current logic
    longest = 0
    for i in range(1, len(test_string) + 1):
        m = prog.fullmatch(test_string[:i])
        if m:
            longest = i
    if longest > 0:
        failed_at = None
        for i, (c1, c2) in enumerate(zip(pattern, test_string)):
            if c1 != c2:
                failed_at = i
                break
        if failed_at is None:
            failed_at = min(len(pattern), len(test_string))
        reason = (
            f"Match failed at position {failed_at}: unexpected character '{test_string[failed_at]}'"
            if failed_at < len(test_string)
            else "String too short."
        )
        return MatchResult(
            matches=False,
            reason=reason,
            failed_at=failed_at,
            partial_matches=[test_string[:longest]]
        )
    failed_at = 0
    for i, (c1, c2) in enumerate(zip(pattern, test_string)):
        if c1 != c2:
            failed_at = i
            break
    else:
        failed_at = min(len(pattern), len(test_string))
    return MatchResult(matches=False, reason="No match at all.", failed_at=failed_at, partial_matches=[])
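
A closing usage sketch for the tester (the exact reason strings depend on which code path above handles the pattern):

from rexplain.core.tester import RegexTester

tester = RegexTester()
print(tester.test(r"\d\d-\d\d", "12-34"))  # full match
print(tester.test(r"\d\d-\d\d", "12x34"))  # explains where the match fails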