Source code for cnt.rulebase.workflow.exact_match_labeler
from typing import Any, List
import ahocorasick
from cnt.rulebase.workflow.type_annotations import IntervalGeneratorType
from cnt.rulebase.workflow.interval_labeler import IntervalLabeler
def _ac_automation_match(text: str, ac_automation: Any) -> IntervalGeneratorType:
prev_start, prev_end = -1, -1
# ``iter``` will return ``end`` in accending order, see
# https://github.com/WojciechMula/pyahocorasick/blob/484b1f13549fc9bdeb9868d8a1711d1861f804c3/py/pyahocorasick.py#L229-L252
# Also note the ``[start, end]`` generated by ``iter`` are closed interval.
for end, (_, key) in ac_automation.iter(text):
start = end + 1 - len(key)
if prev_start < 0:
# init.
prev_start, prev_end = start, end
elif start <= prev_end + 1:
# check the interleaved case.
prev_end = end
else:
# should return the previous interval. Note we yield half-opened interval here.
yield (prev_start, prev_end + 1)
prev_start, prev_end = start, end
# yield the last interval.
if prev_start >= 0:
yield (prev_start, prev_end + 1)
[docs]class ExactMatchLabeler(IntervalLabeler):
"""
Helper to label exact match strings.
"""
AC_AUTOMATION: Any = None
[docs] @classmethod
def build_ac_automation_from_strings(cls, keys: List[str]) -> Any:
atm = ahocorasick.Automaton() # pylint: disable=c-extension-no-member
for idx, key in enumerate(keys):
atm.add_word(key, (idx, key))
atm.make_automaton()
return atm
[docs] @classmethod
def build_and_bind_ac_automation_from_strings(cls, keys: List[str]) -> None:
cls.AC_AUTOMATION = cls.build_ac_automation_from_strings(keys)
[docs] def intervals_generator(self) -> IntervalGeneratorType:
if self.AC_AUTOMATION is None:
raise RuntimeError('AC_AUTOMATION is not initialized.')
return _ac_automation_match(self.input_sequence, self.AC_AUTOMATION)