Rasa特征抽取之RegexFeaturizer
rasa\nlu\featurizers\featurizer.py
class SparseFeaturizer(Featurizer):
    """Marker base class for featurizers that produce sparse feature matrices.

    Carries no behavior of its own; concrete subclasses (e.g. RegexFeaturizer)
    implement the actual featurization and attach `scipy.sparse` matrices to
    messages.
    """

    pass
rasa\nlu\featurizers\sparse_featurizer\regex_featurizer.py
train
函数是训练的入口,_text_features_with_regex
是正则提取函数,组件提取的是sequence_features
和 sentence_features
特征,这些特征添加在message的features属性数组中,其中特征是稀疏矩阵,通过coo_matrix转化为COO方式存储(scipy.sparse.coo_matrix(sequence_features))。
pattern_utils.extract_patterns
函数是汇总look_up词表和regexes正则的全部规则。
class RegexFeaturizer(SparseFeaturizer):
    """Adds sparse sequence/sentence features based on regexes and lookup tables.

    For every known pattern, each token (and the sentence as a whole) gets a
    {0, 1} feature indicating whether the pattern matched. The resulting
    matrices are stored as `scipy.sparse.coo_matrix` in the message's
    `features`.
    """

    @classmethod
    def required_components(cls) -> List[Type[Component]]:
        """Components that must run in the pipeline before this one."""
        return [Tokenizer]

    defaults = {
        # Text is matched case-sensitively by default.
        "case_sensitive": True,
        # Use lookup tables from the training data to generate features.
        "use_lookup_tables": True,
        # Use regexes from the training data to generate features.
        "use_regexes": True,
        # Number of extra pattern slots reserved for incremental training.
        "number_additional_patterns": None,
        # Wrap lookup-table entries in word boundaries when matching.
        "use_word_boundaries": True,
    }

    def __init__(
        self,
        component_config: Optional[Dict[Text, Any]] = None,
        known_patterns: Optional[List[Dict[Text, Text]]] = None,
        pattern_vocabulary_stats: Optional[Dict[Text, int]] = None,
        finetune_mode: bool = False,
    ) -> None:
        """Constructs new features for regexes and lookup tables.

        Args:
            component_config: Configuration for the component.
            known_patterns: Patterns learned in a previous training run.
            pattern_vocabulary_stats: Vocabulary statistics; required when
                `finetune_mode` is enabled.
            finetune_mode: Whether the component is loaded for incremental
                (fine-tune) training.

        Raises:
            InvalidParameterException: If `finetune_mode` is `True` but
                `pattern_vocabulary_stats` is `None`.
        """
        super().__init__(component_config)

        self.known_patterns = known_patterns if known_patterns else []
        self.case_sensitive = self.component_config["case_sensitive"]
        self.number_additional_patterns = self.component_config[
            "number_additional_patterns"
        ]
        self.finetune_mode = finetune_mode
        self.pattern_vocabulary_stats = pattern_vocabulary_stats

        if self.finetune_mode and not self.pattern_vocabulary_stats:
            # If the featurizer is instantiated in finetune mode,
            # the vocabulary stats for it should be known.
            raise rasa.shared.exceptions.InvalidParameterException(
                f"{self.__class__.__name__} was instantiated with"
                f" `finetune_mode=True` but `pattern_vocabulary_stats`"
                f" was left to `None`. This is invalid since the featurizer"
                f" needs vocabulary statistics to featurize in finetune mode."
            )

    @lazy_property
    def vocabulary_stats(self) -> Dict[Text, int]:
        """Computes the total vocabulary size and how much of it is in use.

        Returns:
            A dict with `pattern_slots_filled` (patterns currently known) and
            `max_number_patterns` (total slots, including reserved extras).
        """
        if not self.finetune_mode:
            # Fresh training: capacity = known patterns + reserved extra slots.
            max_number_patterns = (
                len(self.known_patterns) + self._get_num_additional_slots()
            )
            return {
                "pattern_slots_filled": len(self.known_patterns),
                "max_number_patterns": max_number_patterns,
            }
        else:
            # Finetune mode: capacity is fixed by the stats loaded from disk;
            # only the filled count is refreshed.
            self.pattern_vocabulary_stats["pattern_slots_filled"] = len(
                self.known_patterns
            )
            return self.pattern_vocabulary_stats

    def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]) -> None:
        """Updates already known patterns with new patterns extracted from data.

        Patterns whose name is already known only get their regex replaced;
        genuinely new patterns are appended until `max_number_patterns` slots
        are filled, after which they are dropped with a warning.

        Args:
            new_patterns: Patterns extracted from the current training data.
        """
        max_number_patterns = self.pattern_vocabulary_stats["max_number_patterns"]
        pattern_name_index_map = {
            pattern["name"]: index for index, pattern in enumerate(self.known_patterns)
        }
        patterns_dropped = False

        for extra_pattern in new_patterns:
            new_pattern_name = extra_pattern["name"]

            # Some patterns may have just new examples added to them.
            # These do not count as additional patterns.
            if new_pattern_name in pattern_name_index_map:
                self.known_patterns[pattern_name_index_map[new_pattern_name]][
                    "pattern"
                ] = extra_pattern["pattern"]
            else:
                if len(self.known_patterns) == max_number_patterns:
                    patterns_dropped = True
                    continue
                self.known_patterns.append(extra_pattern)

        if patterns_dropped:
            rasa.shared.utils.io.raise_warning(
                f"The originally trained model was configured to "
                f"handle a maximum number of {max_number_patterns} patterns. "
                f"The current training data exceeds this number as "
                f"there are {len(new_patterns)} patterns in total. "
                f"Some patterns will be dropped and not used for "
                f"featurization. It is advisable to re-train the "
                f"model from scratch."
            )

    def _get_num_additional_slots(self) -> int:
        """Computes the number of extra vocabulary slots beyond known patterns."""
        if self.number_additional_patterns is None:
            # We take twice the number of currently defined
            # regex patterns as the number of additional
            # vocabulary slots to support if this parameter
            # is not configured by the user. Also, to avoid having
            # to retrain from scratch very often, the default number
            # of additional slots is kept to MIN_ADDITIONAL_SLOTS.
            # This is an empirically tuned number.
            self.number_additional_patterns = max(
                MIN_ADDITIONAL_REGEX_PATTERNS, len(self.known_patterns) * 2
            )
        return self.number_additional_patterns

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        """Trains the component with all patterns extracted from training data.

        Args:
            training_data: Training data the component is trained on.
            config: NLU model configuration (unused here).
            **kwargs: Additional arguments passed by the pipeline.
        """
        patterns_from_data = pattern_utils.extract_patterns(
            training_data,
            use_lookup_tables=self.component_config["use_lookup_tables"],
            use_regexes=self.component_config["use_regexes"],
            use_word_boundaries=self.component_config["use_word_boundaries"],
        )
        if self.finetune_mode:
            # Merge patterns extracted from data with known patterns.
            self._merge_new_patterns(patterns_from_data)
        else:
            self.known_patterns = patterns_from_data

        # Featurize every training example for each relevant attribute.
        for example in training_data.training_examples:
            for attribute in [TEXT, RESPONSE, ACTION_TEXT]:
                self._text_features_with_regex(example, attribute)

    def process(self, message: Message, **kwargs: Any) -> None:
        """Featurizes an incoming message's text at inference time."""
        self._text_features_with_regex(message, TEXT)

    def _text_features_with_regex(self, message: Message, attribute: Text) -> None:
        """Helper method to extract features and set them appropriately in the message object.

        Args:
            message: The message to featurize.
            attribute: The message attribute (e.g. TEXT) to featurize.
        """
        if self.known_patterns:
            sequence_features, sentence_features = self._features_for_patterns(
                message, attribute
            )

            if sequence_features is not None:
                final_sequence_features = Features(
                    sequence_features,
                    FEATURE_TYPE_SEQUENCE,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
                message.add_features(final_sequence_features)

            if sentence_features is not None:
                final_sentence_features = Features(
                    sentence_features,
                    FEATURE_TYPE_SENTENCE,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
                message.add_features(final_sentence_features)

    def _features_for_patterns(
        self, message: Message, attribute: Text
    ) -> Tuple[Optional[scipy.sparse.coo_matrix], Optional[scipy.sparse.coo_matrix]]:
        """Checks which known patterns match the message.

        Given a sentence, returns a vector of {1,0} values indicating which
        regexes did match. Furthermore, if the
        message is tokenized, the function will mark all tokens with a dict
        relating the name of the regex to whether it was matched.

        Args:
            message: The message to inspect.
            attribute: The message attribute to match against.

        Returns:
            A `(sequence_features, sentence_features)` tuple of sparse COO
            matrices, or `(None, None)` when the attribute is absent or the
            message has no tokens.
        """
        # Attribute not set (e.g. response not present).
        if not message.get(attribute):
            return None, None

        tokens = message.get(TOKENS_NAMES[attribute], [])
        if not tokens:
            # Nothing to featurize.
            return None, None

        flags = 0  # default flag
        if not self.case_sensitive:
            flags = re.IGNORECASE

        sequence_length = len(tokens)
        # Matrices are sized to the full vocabulary (including reserved slots)
        # so their shape stays stable across incremental trainings.
        max_number_patterns = self.vocabulary_stats["max_number_patterns"]
        sequence_features = np.zeros([sequence_length, max_number_patterns])
        sentence_features = np.zeros([1, max_number_patterns])

        for pattern_index, pattern in enumerate(self.known_patterns):
            matches = re.finditer(
                pattern["pattern"], message.get(attribute), flags=flags
            )
            matches = list(matches)

            for token_index, t in enumerate(tokens):
                patterns = t.get("pattern", default={})
                patterns[pattern["name"]] = False

                for match in matches:
                    # A token is marked as matching when it overlaps the
                    # match span in any way.
                    if t.start < match.end() and t.end > match.start():
                        patterns[pattern["name"]] = True
                        sequence_features[token_index][pattern_index] = 1.0
                        if attribute in [RESPONSE, TEXT, ACTION_TEXT]:
                            # sentence vector should contain all patterns
                            sentence_features[0][pattern_index] = 1.0

                t.set("pattern", patterns)

        return (
            scipy.sparse.coo_matrix(sequence_features),
            scipy.sparse.coo_matrix(sentence_features),
        )

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional[Metadata] = None,
        cached_component: Optional["RegexFeaturizer"] = None,
        should_finetune: bool = False,
        **kwargs: Any,
    ) -> "RegexFeaturizer":
        """Loads a previously trained component.

        Args:
            meta: Component metadata; must contain the `file` key.
            model_dir: Directory the model was persisted to.
            model_metadata: Metadata of the whole model (unused here).
            cached_component: Previously cached component instance (unused).
            should_finetune: Whether to load the component for fine-tuning.
            **kwargs: Additional arguments passed by the pipeline.

        Returns:
            A `RegexFeaturizer` restored from the persisted patterns and
            vocabulary statistics.
        """
        file_name = meta.get("file")

        patterns_file_name = Path(model_dir) / (file_name + ".patterns.pkl")
        vocabulary_stats_file_name = Path(model_dir) / (
            file_name + ".vocabulary_stats.pkl"
        )

        # Missing files are tolerated: the featurizer then starts empty.
        known_patterns = None
        vocabulary_stats = None
        if patterns_file_name.exists():
            known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name)
        if vocabulary_stats_file_name.exists():
            vocabulary_stats = rasa.shared.utils.io.read_json_file(
                vocabulary_stats_file_name
            )

        return RegexFeaturizer(
            meta,
            known_patterns=known_patterns,
            pattern_vocabulary_stats=vocabulary_stats,
            finetune_mode=should_finetune,
        )

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persists this model into the passed directory.

        Args:
            file_name: Base name for the persisted files.
            model_dir: Directory to write into.

        Returns:
            Metadata dict referencing the persisted file name.
        """
        patterns_file_name = file_name + ".patterns.pkl"
        regex_file = Path(model_dir) / patterns_file_name
        utils.write_json_to_file(regex_file, self.known_patterns, indent=4)

        vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl"
        vocabulary_file = Path(model_dir) / vocabulary_stats_file_name
        utils.write_json_to_file(vocabulary_file, self.vocabulary_stats, indent=4)

        return {"file": file_name}