Source code for cnt.rulebase.rules.sentence_segmentation.sentence_segmenter

"""
Chinese sentence segmentation.
"""
from typing import Union, Optional, List, cast

from cnt.rulebase import workflow, const
from cnt.rulebase.rules.sentence_segmentation import const as sentseg_const


class SentenceSegementationConfig(workflow.BasicConfig):
    """
    Options controlling how a text is split into sentences.

    :param enable_strict_sentence_charset: When true, a character that is
        neither a valid sentence character nor whitespace terminates the
        sentence currently being collected.
    :param enable_comma_ending: When true, commas count as sentence endings.
    :param extend_ending_with_delimiters: When true, delimiters that directly
        follow a sentence ending are absorbed into the ending span.
    :param dynamic_endings: Extra ending strings to match at runtime; an
        empty list disables dynamic endings.
    """

    def __init__(
            self,
            enable_strict_sentence_charset: bool,
            enable_comma_ending: bool,
            extend_ending_with_delimiters: bool,
            dynamic_endings: List[str],
    ):
        self.enable_strict_sentence_charset = enable_strict_sentence_charset
        self.enable_comma_ending = enable_comma_ending
        self.extend_ending_with_delimiters = extend_ending_with_delimiters
        self.dynamic_endings = dynamic_endings


class SentenceEndingLabeler(workflow.ExactMatchLabeler):
    """
    Mark sentence endings based on
    :py:const:`cnt.rulebase.rules.sentence_segmentation.const.EM_SENTENCE_ENDINGS`
    """


# Build the exact-match automaton from the static ending strings and attach it
# to the class at import time.
SentenceEndingLabeler.build_and_bind_ac_automation_from_strings(sentseg_const.EM_SENTENCE_ENDINGS)


class DynamicSentenceEndingLabeler(workflow.ExactMatchLabeler):
    """
    Support dynamic sentence endings that will be built in runtime.

    Unlike the static labelers in this module, the automaton is built per
    instance from ``config.dynamic_endings`` rather than bound to the class.
    """

    def __init__(self, input_sequence: str, config: Optional[SentenceSegementationConfig]):
        """
        :param input_sequence: The text to label.
        :param config: Segmentation options; when it carries a non-empty
            ``dynamic_endings`` list, an automaton is built from it.
        """
        # Inject ``AC_AUTOMATION`` before __init__().
        if config and config.dynamic_endings:
            # pylint: disable=C0103
            self.AC_AUTOMATION = self.build_ac_automation_from_strings(config.dynamic_endings)

        super().__init__(input_sequence, config)

    def intervals_generator(self) -> workflow.IntervalGeneratorType:
        """
        Return the matched intervals, or an empty generator when a config is
        present but defines no dynamic endings.
        """

        def empty_generator() -> workflow.IntervalGeneratorType:
            # A generator that yields nothing, so the return type is the same
            # on both paths.
            yield from cast(workflow.IntervalListType, ())

        if self.config:
            config = cast(SentenceSegementationConfig, self.config)
            if not config.dynamic_endings:
                # No automaton was injected in __init__(), so skip matching.
                return empty_generator()

        return super().intervals_generator()


class CommaLabeler(workflow.BasicSequentialLabeler):
    """
    Mark comma.
    """

    # U+FF0C FULLWIDTH COMMA, U+201A SINGLE LOW-9 QUOTATION MARK (visually a
    # comma), and the ASCII comma.
    COMMAS = (chr(0xFF0C), chr(0x201A), ',')

    def label(self, index: int) -> bool:
        """
        Return whether the character at ``index`` is one of ``COMMAS``.
        """
        return self.input_sequence[index] in self.COMMAS


class WhitespaceLabeler(workflow.IntervalLabeler):
    """
    Mark unicode whitespace.
    """


# Configure the labeler at import time with a pattern matching whitespace runs.
WhitespaceLabeler.initialize_by_regular_expression(r'\s+')


class SentenceValidCharacterLabeler(workflow.IntervalLabeler):
    """
    Mark the valid characters of a Chinese sentence.
    """


# Configure the labeler at import time with the valid-character intervals.
SentenceValidCharacterLabeler.initialize_by_intervals(sentseg_const.ITV_SENTENCE_VALID_CHARS)


class DelimitersLabeler(workflow.IntervalLabeler):
    """
    Mark delimiters for sentence ending extension.
    """


# Configure the labeler at import time with the shared delimiter intervals.
DelimitersLabeler.initialize_by_intervals(const.ITV_DELIMITERS)


class SentenceSegementationLabelProcessor(workflow.BasicLabelProcessor):
    """
    Turn the per-character labels into ``(start, end)`` sentence intervals.
    """

    def _labels_indicate_sentence_ending(self, labels: workflow.LabelsType) -> bool:
        """
        Return whether ``labels`` mark a sentence ending: a static ending, a
        dynamic ending (if any are configured), or a comma (if comma endings
        are enabled).
        """
        config = cast(SentenceSegementationConfig, self.config)
        return bool(labels[SentenceEndingLabeler] or
                    (config.dynamic_endings and labels[DynamicSentenceEndingLabeler]) or
                    (config.enable_comma_ending and labels[CommaLabeler]))

    def result(self) -> workflow.IntervalGeneratorType:
        """
        Generate intervals indicating the valid sentences.

        Each yielded pair is ``(start, end)``: ``start`` is the index of the
        first valid sentence character, and ``end`` is the index of the first
        character after the sentence (or the input length when the input is
        exhausted).
        """
        config = cast(SentenceSegementationConfig, self.config)

        # ``labels`` doubles as a one-item lookahead: when it is not None, it
        # holds the labels of a character already pulled from the generator
        # that step (1) still has to examine.
        index = -1
        labels = None

        while True:

            # 1. Find the start of the sentence.
            start = -1
            while True:
                # Check the ``labels`` generated from step (2).
                if labels is None:
                    # A StopIteration leaking out of a generator becomes a
                    # RuntimeError, so end the generator explicitly.
                    # https://www.python.org/dev/peps/pep-0479/
                    try:
                        index, labels = next(self.index_labels_generator)
                    except StopIteration:
                        return
                # Check if we found a valid sentence char.
                if labels[SentenceValidCharacterLabeler]:
                    start = index
                    break
                # Trigger next(...) action.
                labels = None
                index = -1

            # 2. Find the ending.
            end = -1
            try:
                while True:
                    index, labels = next(self.index_labels_generator)

                    # Detected invalid char.
                    if config.enable_strict_sentence_charset and \
                            not labels[SentenceValidCharacterLabeler] and \
                            not labels[WhitespaceLabeler]:
                        # The invalid char is excluded from the sentence and
                        # left in ``labels`` for step (1) to skip over.
                        end = index
                        break

                    # Detected sentence ending.
                    if self._labels_indicate_sentence_ending(labels):
                        # Consume the ending span.
                        while True:
                            index, labels = next(self.index_labels_generator)
                            is_ending = (self._labels_indicate_sentence_ending(labels) or
                                         (config.extend_ending_with_delimiters and
                                          labels[DelimitersLabeler]))

                            if not is_ending:
                                # First char past the ending span; it stays in
                                # ``labels`` as the lookahead for step (1).
                                end = index
                                break
                        # yeah we found the ending.
                        break
            except StopIteration:
                # Input exhausted mid-sentence: the sentence runs to the end
                # of the text.
                end = len(self.input_sequence)
                # Trigger next(...) action.
                labels = None
                index = -1

            yield start, end


#pylint: disable=W0223
class _SentenceSegementationOutputGeneratorLazy(workflow.BasicOutputGenerator):
    """
    Shared base of the output generators: pairs each sentence interval with
    the text it covers.
    """

    def _result(self) -> workflow.SegmentGeneratorType:
        """
        Return a generator of ``(sentence_text, (start, end))`` items, one per
        interval produced by the label processor.
        """
        intervals = self.label_processor_result
        return ((self.input_sequence[begin:stop], (begin, stop)) for begin, stop in intervals)


class SentenceSegementationOutputGeneratorLazy(_SentenceSegementationOutputGeneratorLazy):
    """
    Output generator that hands back the segments as a lazy generator.
    """

    def result(self) -> workflow.SegmentGeneratorType:
        """
        Return the generator of segments without materializing it.
        """
        return self._result()


class SentenceSegementationOutputGenerator(_SentenceSegementationOutputGeneratorLazy):
    """
    Output generator that hands back the segments as a list.
    """

    def result(self) -> workflow.SegmentListType:
        """
        Return all segments, materialized into a list.
        """
        return list(self._result())


def _generate_sentseg_workflow(lazy: bool) -> workflow.BasicWorkflow:
    """
    Assemble the sentence segmentation workflow.

    :param lazy: Selects the output generator: the lazy (generator-returning)
        variant when true, the list-returning variant otherwise.
    """
    if lazy:
        output_generator_class = SentenceSegementationOutputGeneratorLazy
    else:
        output_generator_class = SentenceSegementationOutputGenerator

    labeler_classes = [
            SentenceEndingLabeler,
            DynamicSentenceEndingLabeler,
            DelimitersLabeler,
            CommaLabeler,
            WhitespaceLabeler,
            SentenceValidCharacterLabeler,
    ]

    return workflow.BasicWorkflow(
            sequential_labeler_classes=labeler_classes,
            label_processor_class=SentenceSegementationLabelProcessor,
            output_generator_class=output_generator_class,
    )


# Shared workflow instances, built once at import time. The lazy one is wired
# to the generator-returning output generator, the other to the list-returning
# one.
SENTSEG_WORKFLOW_LAZY = _generate_sentseg_workflow(lazy=True)
SENTSEG_WORKFLOW = _generate_sentseg_workflow(lazy=False)


def _sentseg(
        sentseg_workflow: workflow.BasicWorkflow,
        text: str,
        enable_strict_sentence_charset: bool,
        enable_comma_ending: bool,
        extend_ending_with_delimiters: bool,
        dynamic_endings: List[str],
) -> Union[workflow.SegmentGeneratorType, workflow.SegmentListType]:
    """
    Pack the options into a config and run ``sentseg_workflow`` on ``text``.

    The result is whatever the given workflow produces: a generator of
    segments or a list of them.
    """
    options = SentenceSegementationConfig(
            dynamic_endings=dynamic_endings,
            extend_ending_with_delimiters=extend_ending_with_delimiters,
            enable_comma_ending=enable_comma_ending,
            enable_strict_sentence_charset=enable_strict_sentence_charset,
    )
    segments = sentseg_workflow.result(text, options)
    result_type = Union[workflow.SegmentGeneratorType, workflow.SegmentListType]
    return cast(result_type, segments)


def sentseg(
        text: str,
        enable_strict_sentence_charset: bool = False,
        enable_comma_ending: bool = False,
        extend_ending_with_delimiters: bool = False,
        dynamic_endings: Optional[List[str]] = None,
) -> workflow.SegmentListType:
    """
    Split ``text`` into sentences and return them as a list.

    :param text: The text to segment.
    :param enable_strict_sentence_charset: End a sentence at any character
        that is neither a valid sentence character nor whitespace.
    :param enable_comma_ending: Treat commas as sentence endings.
    :param extend_ending_with_delimiters: Absorb delimiters that directly
        follow a sentence ending into the ending span.
    :param dynamic_endings: Extra ending strings; ``None`` means none.
    :return: A list of ``(sentence_text, (start, end))`` items.
    """
    return cast(
            workflow.SegmentListType,
            _sentseg(
                    SENTSEG_WORKFLOW,
                    text,
                    enable_strict_sentence_charset,
                    enable_comma_ending,
                    extend_ending_with_delimiters,
                    # Normalize ``None`` so the config always carries a list.
                    dynamic_endings or [],
            ))


def sentseg_lazy(
        text: str,
        enable_strict_sentence_charset: bool = False,
        enable_comma_ending: bool = False,
        extend_ending_with_delimiters: bool = False,
        dynamic_endings: Optional[List[str]] = None,
) -> workflow.SegmentGeneratorType:
    """
    Split ``text`` into sentences and return them as a lazy generator.

    :param text: The text to segment.
    :param enable_strict_sentence_charset: End a sentence at any character
        that is neither a valid sentence character nor whitespace.
    :param enable_comma_ending: Treat commas as sentence endings.
    :param extend_ending_with_delimiters: Absorb delimiters that directly
        follow a sentence ending into the ending span.
    :param dynamic_endings: Extra ending strings; ``None`` means none.
    :return: A generator of ``(sentence_text, (start, end))`` items.
    """
    return cast(
            workflow.SegmentGeneratorType,
            _sentseg(
                    SENTSEG_WORKFLOW_LAZY,
                    text,
                    enable_strict_sentence_charset,
                    enable_comma_ending,
                    extend_ending_with_delimiters,
                    # Normalize ``None`` so the config always carries a list.
                    dynamic_endings or [],
            ))
Source code for cnt.rulebase.rules.sentence_segmentation.sentence_segmenter

cnt.rulebase

Navigation

Related Topics