Конверсии DFA NFA [закрыто]Python

Программы на Python
Ответить
Anonymous
 Конверсии DFA NFA [закрыто]

Сообщение Anonymous »

Еще одна частая точка сбоя возникает на этапе преобразования DFA в регулярное выражение, часто реализуемого с использованием исключения состояния. Устранение состояний требует многократного обновления меток перехода путем объединения и вложения регулярных выражений. Когда присутствует много состояний, эти выражения могут расти экспоненциально, превращаясь в огромные строки, с которыми трудно эффективно работать. Алгоритм может создавать настолько большие выражения, что ваша среда выполнения языка не сможет их сохранить, или операции конкатенации станут непомерно дорогими.
Ошибки также могут возникать на более ранних этапах конвейера. Если исходное представление NFA содержит непредусмотренные циклы, повторяющиеся звездочки (*) или неверные переходы, алгоритмы преобразования могут генерировать недопустимые или недостижимые состояния, что в конечном итоге приводит к сбоям при извлечении регулярных выражений. В частности, использование слишком большого количества звезд Клини в исходном шаблоне приводит к появлению вложенных циклов в NFA, что усугубляет сложность как при построении подмножества, так и при исключении состояний.
Чтобы диагностировать проблему, проверьте размеры промежуточных автоматов, ограничьте количество расширений или используйте такие оптимизации, как сокращение недостижимых состояний, раннее объединение эквивалентных состояний или применение правил упрощения регулярных выражений. Отслеживая рост на каждом этапе, вы можете выявить узкие места и обеспечить вычислительную осуществимость ваших преобразований.

DFA
from collections.abc import Callable
from collections import deque
from dataclasses import dataclass
from typing import TypeVar, Generic
from itertools import product

# Type variable for state type
STATE = TypeVar('STATE')
# Type variable for remapped state type
OTHER_STATE = TypeVar('OTHER_STATE')

# Deterministic Finite Automaton
@dataclass
class DFA(Generic[STATE]):
# Alphabet (set of symbols)
S: set[str]
# Set of states
K: set[STATE]
# Initial state
q0: STATE
# Transition function: (state, symbol) -> state
d: dict[tuple[STATE, str], STATE]
# Set of accepting states
F: set[STATE]

# Check if word is accepted
def accept(self, word: str) -> bool:
# Start from initial state
curr = self.q0

# Process each character
for ch in word:
# Create transition key
key = (curr, ch)
# If no transition, reject
if key not in self.d:
return False
# Follow transition
curr = self.d[key]

# Accept if in final state
return curr in self.F

# Get all reachable states from initial
def _get_reachable(self) -> set[STATE]:
# Start with initial state
visited = {self.q0}
# Queue for BFS
work = deque([self.q0])

# While states to process
while work:
# Get next state
curr = work.popleft()

# Check all symbols
for sym in self.S:
# Create transition key
key = (curr, sym)
# If transition exists
if key in self.d:
# Get next state
nxt = self.d[key]
# If not visited
if nxt not in visited:
# Mark visited
visited.add(nxt)
# Add to queue
work.append(nxt)

# Return all reachable states
return visited

# Check if two states are distinguishable
def _check_distinguishable(self, s1: STATE, s2: STATE,
state_list: list[STATE],
dist_table: list[list[bool]]) -> bool:
# Check all symbols
for sym in self.S:
# Create transition keys
k1, k2 = (s1, sym), (s2, sym)

# If both transitions exist
if k1 in self.d and k2 in self.d:
# Get next states
n1, n2 = self.d[k1], self.d[k2]

# If different next states
if n1 != n2:
# Get indices in state list
i1, i2 = state_list.index(n1), state_list.index(n2)
# Ensure low < high for table access
low, high = (i1, i2) if i1 < i2 else (i2, i1)

# If next states are distinguishable
if dist_table[low][high]:
# States are distinguishable
return True

# States not distinguishable
return False

# Minimize DFA using table-filling algorithm
def minimize(self) -> 'DFA[STATE]':
# Get reachable states only
reach = self._get_reachable()
# Convert to list for indexing
state_list = list(reach)
# Get number of states
n = len(state_list)

# Initialize distinction table (n x n)
dist_table = [[False] * n for _ in range(n)]

# Mark obviously distinguishable pairs
for i, j in product(range(n), repeat=2):
# Only upper triangle
if i < j:
# Get states
s1, s2 = state_list, state_list[j]
# Distinguishable if one final, other not
dist_table[j] = (s1 in self.F) != (s2 in self.F)

# Iterate until no changes
modified = True
while modified:
# Reset flag
modified = False

# Check all pairs
for i, j in ((i, j) for i in range(n) for j in range(i + 1, n)):
# If not yet marked distinguishable
if not dist_table[j]:
# Get states
s1, s2 = state_list, state_list[j]

# Check if distinguishable
if self._check_distinguishable(s1, s2, state_list, dist_table):
# Mark as distinguishable
dist_table[j] = True
# Signal change
modified = True

# Map states to equivalence classes
state_map = {}

# Process each state
for i, curr in enumerate(state_list):
# Skip if already mapped
if curr in state_map:
continue

# Find all equivalent states
equiv = [curr] + [state_list[j] for j in range(i + 1, n) if not dist_table[j]]
# Create equivalence class
cls = frozenset(equiv)
# Map all states in class to class
state_map.update({s: cls for s in cls})

# Get new state set (equivalence classes)
new_states = set(state_map.values())
# Map initial state
new_init = state_map[self.q0]
# Map final states
new_finals = {state_map for s in self.F if s in state_map}

# Build new transitions
new_trans = {}
# For each equivalence class
for cls in new_states:
# Get representative state
rep = next(iter(cls))

# Check all symbols
for sym in self.S:
# Create transition key
key = (rep, sym)
# If transition exists
if key in self.d:
# Get next state
nxt = self.d[key]
# If next state is mapped
if nxt in state_map:
# Get target equivalence class
tgt = state_map[nxt]
# Add transition
new_trans[(cls, sym)] = tgt

# Return minimized DFA
return DFA(self.S, new_states, new_init, new_trans, new_finals)

# Public wrapper for reachable states
def _get_reachable_states(self) -> set[STATE]:
return self._get_reachable()

# Remap states using function
def remap_states(self, f: Callable[[STATE], OTHER_STATE]) -> 'DFA[OTHER_STATE]':
return DFA(
# Keep alphabet
self.S,
# Remap all states
{f(s) for s in self.K},
# Remap initial state
f(self.q0),
# Remap transitions
{(f(s), ch): f(nxt) for (s, ch), nxt in self.d.items()},
# Remap final states
{f(s) for s in self.F}
)

НФА:
from .DFA import DFA
from dataclasses import dataclass
from collections.abc import Callable
from collections import deque
from typing import TypeVar, Generic

# Type variable for state type
STATE = TypeVar('STATE')
# Type variable for remapped state type
OTHER_STATE = TypeVar('OTHER_STATE')

# Empty string constant for epsilon transitions
EPSILON = ''

# Nondeterministic Finite Automaton
@dataclass
class NFA(Generic[STATE]):
# Alphabet (set of symbols)
S: set[str]
# Set of states
K: set[STATE]
# Initial state
q0: STATE
# Transition function: (state, symbol) -> set of states
d: dict[tuple[STATE, str], set[STATE]]
# Set of accepting states
F: set[STATE]

# Get epsilon closure of a state
def epsilon_closure(self, state: STATE) -> set[STATE]:
# Start with state itself
visited = {state}
# Stack for DFS
to_visit = deque([state])

# While states to process
while to_visit:
# Get next state
curr = to_visit.pop()
# Add epsilon-reachable states
visited.update(
nxt for nxt in self.d.get((curr, EPSILON), set())
if nxt not in visited and not to_visit.append(nxt)
)

# Return closure
return visited

# Get epsilon closure of a set of states
def _eps_closure_set(self, states: set[STATE]) -> set[STATE]:
# Union of closures for each state
return set().union(*(self.epsilon_closure(s) for s in states))

# Get transition on symbol from set of states
def _transition(self, states: set[STATE], symbol: str) -> set[STATE]:
# Union of transitions from each state
return set().union(*(self.d.get((s, symbol), set()) for s in states))

# Convert NFA to DFA using subset construction
def subset_construction(self) -> DFA[frozenset[STATE]]:
# Initial DFA state is epsilon closure of NFA initial
init = frozenset(self.epsilon_closure(self.q0))

# Set of DFA states
states_set = {init}
# Transition map
trans_map = {}
# Set of final states
finals_set = set()

# Queue for processing
work_list = deque([init])
# Already processed
done_set = set()

# While states to process
while work_list:
# Get next state
curr_state = work_list.popleft()

# Skip if already processed
if curr_state in done_set:
continue
# Mark as processed
done_set.add(curr_state)

# If contains NFA final state, mark as final
any(s in self.F for s in curr_state) and finals_set.add(curr_state)

# For each symbol
for sym in self.S:
# Get next states on symbol
nxt_states = self._transition(curr_state, sym)
# Apply epsilon closure
nxt_state = frozenset(self._eps_closure_set(nxt_states)) if nxt_states else frozenset()

# Add transition
trans_map[(curr_state, sym)] = nxt_state

# If new state found
if nxt_state not in states_set:
# Add to set
states_set.add(nxt_state)
# Add to queue
work_list.append(nxt_state)

# Empty state (dead state)
empty = frozenset()
# If empty state exists
if empty in states_set:
# Add self-loops for all symbols
trans_map.update({(empty, sym): empty for sym in self.S if (empty, sym) not in trans_map})

# Return DFA
return DFA(self.S, states_set, init, trans_map, finals_set)

# Remap states using function
def remap_states(self, f: Callable[[STATE], OTHER_STATE]) -> 'NFA[OTHER_STATE]':
# Remap all states
new_states = set()
for s in self.K:
new_states.add(f(s))

# Remap initial state
new_initial = f(self.q0)

# Remap transitions and target sets
new_transitions = {}
for key, value in self.d.items():
old_state = key[0]
symbol = key[1]
new_state = f(old_state)
new_key = (new_state, symbol)
new_targets = set()
for t in value:
new_targets.add(f(t))
new_transitions[new_key] = new_targets

# Remap final states
new_finals = set()
for s in self.F:
new_finals.add(f(s))

# Return new NFA
return NFA(
self.S,
new_states,
new_initial,
new_transitions,
new_finals
)

REGEX:
from typing import List
from .NFA import NFA

EPSILON = ''

# Generator for unique state pairs
class StateGenerator:
# Initialize counter to 0
def __init__(self):
self._cnt = 0

# Get next pair of consecutive states
def get_next_pair(self):
# Assign current and next state
s1, s2 = self._cnt, self._cnt + 1
# Increment counter by 2
self._cnt += 2
# Return state pair
return s1, s2

# Reset counter to 0
def reset(self):
self._cnt = 0

# Global state generator instance
_state_gen = StateGenerator()

# Base class for regex nodes
class Regex:
# Abstract method for Thompson construction
def thompson(self) -> NFA[int]:
raise NotImplementedError("Must use concrete Regex node (Character, Concat, Union, etc.)")

# Single character regex
class Character(Regex):
# Store character
def __init__(self, char: str):
self.char = char

# Build NFA for single character
def thompson(self) -> NFA[int]:
# Get unique state pair
s1, s2 = _state_gen.get_next_pair()

# If epsilon character
if self.char == 'eps':
# Return NFA with epsilon transition
return NFA(set(), {s1, s2}, s1, {(s1, EPSILON): {s2}}, {s2})
else:
# Return NFA with character transition
return NFA({self.char}, {s1, s2}, s1, {(s1, self.char): {s2}}, {s2})

# Concatenation of two regexes
class Concat(Regex):
# Store left and right regexes
def __init__(self, left: Regex, right: Regex):
self.left = left
self.right = right

# Build NFA for concatenation
def thompson(self) -> NFA[int]:
# Build NFAs for left and right
nfa_l, nfa_r = self.left.thompson(), self.right.thompson()

# Copy left transitions
merged = dict(nfa_l.d)

# Connect left finals to right initial via epsilon
for f in nfa_l.F:
merged.setdefault((f, EPSILON), set()).add(nfa_r.q0)

# Add all right transitions
for trans, tgts in nfa_r.d.items():
merged.setdefault(trans, set()).update(tgts)

# Return combined NFA
return NFA(nfa_l.S | nfa_r.S, nfa_l.K | nfa_r.K, nfa_l.q0, merged, nfa_r.F)

# Union of two regexes
class Union(Regex):
# Store left and right regexes
def __init__(self, left: Regex, right: Regex):
self.left = left
self.right = right

# Build NFA for union
def thompson(self) -> NFA[int]:
# Build NFAs for left and right
nfa_l, nfa_r = self.left.thompson(), self.right.thompson()
# Get new start and end states
s_new, e_new = _state_gen.get_next_pair()

# Copy left transitions
combined = dict(nfa_l.d)
# Add right transitions
for trans, tgts in nfa_r.d.items():
combined.setdefault(trans, set()).update(tgts)

# New start branches to both initial states
combined[(s_new, EPSILON)] = {nfa_l.q0, nfa_r.q0}

# All finals connect to new end
for f in nfa_l.F | nfa_r.F:
combined.setdefault((f, EPSILON), set()).add(e_new)

# Return union NFA
return NFA(nfa_l.S | nfa_r.S, nfa_l.K | nfa_r.K | {s_new, e_new}, s_new, combined, {e_new})

# Kleene star (0 or more repetitions)
class Star(Regex):
# Store expression
def __init__(self, expr: Regex):
self.expr = expr

# Build NFA for star
def thompson(self) -> NFA[int]:
# Build inner NFA
inner = self.expr.thompson()
# Get new start and end states
s_new, e_new = _state_gen.get_next_pair()

# Copy inner transitions
updated = dict(inner.d)
# New start can skip or enter
updated[(s_new, EPSILON)] = {inner.q0, e_new}

# After inner, can loop back or exit
for f in inner.F:
updated.setdefault((f, EPSILON), set()).update({inner.q0, e_new})

# Return star NFA
return NFA(inner.S,
inner.K |
{s_new, e_new},
s_new, updated,
{e_new})

# Plus (1 or more repetitions)
class Plus(Regex):
# Store expression
def __init__(self, expr: Regex):
self.expr = expr

# Build NFA for plus
def thompson(self) -> NFA[int]:
# Plus = expr followed by star(expr)
return Concat(self.expr, Star(self.expr)).thompson()

# Optional (0 or 1 repetition)
class Optional(Regex):
# Store expression
def __init__(self, expr: Regex):
self.expr = expr

# Build NFA for optional
def thompson(self) -> NFA[int]:
# Optional = expr or epsilon
return Union(self.expr, Character('eps')).thompson()

# Map special chars to Unicode placeholders
ESCAPE_MAP = {
'*': '\uE000', '+': '\uE001', '?': '\uE002', '(': '\uE003',
')': '\uE004', '|': '\uE005', '[': '\uE006', ']': '\uE007',
'-': '\uE008', '.': '\uE009', '/': '\uE00A',
}

# Reverse map for unescaping
UNESCAPE_MAP = {v: k for k, v in ESCAPE_MAP.items()}

# Preprocess regex string
def preprocess_regex(regex: str) -> str:
# Initialize output list and index
output, idx = [], 0
# Map escape sequences
escape_chars = {' ': ' ', 'n': '\n', 't': '\t', '\\': '\\'}

# Process each character
while idx < len(regex):
# If escape sequence found
if regex[idx] == '\\' and idx + 1 < len(regex):
# Get next character
nxt = regex[idx + 1]
# Add escaped char or mapped char
output.append(escape_chars.get(nxt, ESCAPE_MAP.get(nxt, nxt)))
# Skip both chars
idx += 2
# If not whitespace
elif regex[idx] not in (' ', '\t'):
# Add character
output.append(regex[idx])
# Advance index
idx += 1
else:
# Skip whitespace
idx += 1

# Join output
return ''.join(output)

# Parse regex string to Regex tree
def parse_regex(regex: str) -> Regex:
# Reset state generator
_state_gen.reset()
# Preprocess input
processed = preprocess_regex(regex)

# If empty, return epsilon
if not processed:
return Character('eps')

# Initialize position
pos = 0

# Peek at current character
peek_char = lambda: processed[pos] if pos < len(processed) else None

# Advance and return current character
def next_char():
nonlocal pos
# Get current char
ch = peek_char()
# Advance position
pos += 1
# Return char
return ch

# Parse expression (union has lowest precedence)
def parse_expr():
# Parse first concatenation
lhs = parse_concat()

# While union operator found
while peek_char() == '|':
# Skip operator
next_char()
# Create union with next concat
lhs = Union(lhs, parse_concat())

# Return expression
return lhs

# Parse concatenation
def parse_concat():
# Collect elements
elems = []

# While not union or closing paren
while peek_char() and peek_char() not in '|)':
# Parse postfix expression
elems.append(parse_postfix())

# If no elements, return epsilon
if not elems:
return Character('eps')

# Start with first element
res = elems[0]
# Concatenate remaining elements
for elem in elems[1:]:
res = Concat(res, elem)
# Return result
return res

# Parse postfix operators (*, +, ?)
def parse_postfix():
# Parse atom
atom = parse_atom()

# Map operators to classes
ops = {'*': Star, '+': Plus, '?': Optional}
# While postfix operator found
while peek_char() in ops:
# Apply operator
atom = ops[next_char()](atom)

# Return atom
return atom

# Parse atomic expression
def parse_atom():
# Peek at character
ch = peek_char()

# If end of input
if ch is None:
return Character('eps')

# If escaped character
if ch in UNESCAPE_MAP:
# Skip character
next_char()
# Return unescaped character
return Character(UNESCAPE_MAP[ch])

# If opening parenthesis
if ch == '(':
# Skip opening
next_char()
# Parse expression
e = parse_expr()
# Skip closing if present
peek_char() == ')' and next_char()
# Return expression
return e

# If character class
if ch == '[':
# Skip bracket
next_char()
# Parse character class
return parse_char_class()

# If epsilon literal
if processed[pos:pos+3] == 'eps':
# Skip 3 characters
for _ in range(3):
next_char()
# Return epsilon
return Character('eps')

# Return regular character
return Character(next_char())

# Parse character class [a-z] or [a]
def parse_char_class():
# Get start character
start = next_char()

# If range found
if peek_char() == '-':
# Skip dash
next_char()
# Get end character
end = next_char()
# Skip closing bracket if present
peek_char() == ']' and next_char()

# Generate character range
chars = [chr(c) for c in range(ord(start), ord(end) + 1)]
# Start with first character
res = Character(chars[0])
# Union remaining characters
for ch in chars[1:]:
res = Union(res, Character(ch))
# Return union
return res
else:
# Skip closing bracket if present
peek_char() == ']' and next_char()
# Return single character
return Character(start)

# Start parsing from expression level
return parse_expr()


Подробнее здесь: https://stackoverflow.com/questions/798 ... onversions
Ответить

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

Вернуться в «Python»