Rasa特征抽取之RegexFeaturizer
rasa\nlu\featurizers\featurizer.py
class SparseFeaturizer(Featurizer):
    """Marker base class for featurizers that produce sparse feature matrices.

    Carries no behavior of its own; concrete subclasses (e.g. RegexFeaturizer)
    implement the actual featurization and attach `scipy.sparse` matrices to
    messages.
    """

    pass
rasa\nlu\featurizers\sparse_featurizer\regex_featurizer.py
train
函数是训练的入口,_text_features_with_regex
是正则提取函数,组件提取的是sequence_features
和 sentence_features
特征,这些特征添加在message的features属性数组中,其中特征是稀疏矩阵,通过coo_matrix转化为COO方式存储(scipy.sparse.coo_matrix(sequence_features))。
pattern_utils.extract_patterns
函数是汇总look_up词表和regexes正则的全部规则。
class RegexFeaturizer(SparseFeaturizer):
    """Adds sparse sequence/sentence features based on regexes and lookup tables.

    For every known pattern, each token (and the sentence as a whole) gets a
    {0, 1} feature indicating whether the pattern matched. The resulting
    matrices are stored as `scipy.sparse.coo_matrix` in the message's
    `features`.
    """

    @classmethod
    def required_components(cls) -> List[Type[Component]]:
        """Components that must run in the pipeline before this one."""
        return [Tokenizer]

    defaults = {
        # Text is matched case-sensitively by default.
        "case_sensitive": True,
        # Use lookup tables from the training data to generate features.
        "use_lookup_tables": True,
        # Use regexes from the training data to generate features.
        "use_regexes": True,
        # Number of extra pattern slots reserved for incremental training.
        "number_additional_patterns": None,
        # Wrap lookup-table entries in word boundaries when matching.
        "use_word_boundaries": True,
    }

    def __init__(
        self,
        component_config: Optional[Dict[Text, Any]] = None,
        known_patterns: Optional[List[Dict[Text, Text]]] = None,
        pattern_vocabulary_stats: Optional[Dict[Text, int]] = None,
        finetune_mode: bool = False,
    ) -> None:
        """Constructs new features for regexes and lookup tables.

        Args:
            component_config: Configuration for the component.
            known_patterns: Patterns learned in a previous training run.
            pattern_vocabulary_stats: Vocabulary statistics; required when
                `finetune_mode` is enabled.
            finetune_mode: Whether the component is loaded for incremental
                (fine-tune) training.

        Raises:
            InvalidParameterException: If `finetune_mode` is `True` but
                `pattern_vocabulary_stats` is `None`.
        """
        super().__init__(component_config)

        self.known_patterns = known_patterns if known_patterns else []
        self.case_sensitive = self.component_config["case_sensitive"]
        self.number_additional_patterns = self.component_config[
            "number_additional_patterns"
        ]
        self.finetune_mode = finetune_mode
        self.pattern_vocabulary_stats = pattern_vocabulary_stats

        if self.finetune_mode and not self.pattern_vocabulary_stats:
            # If the featurizer is instantiated in finetune mode,
            # the vocabulary stats for it should be known.
            raise rasa.shared.exceptions.InvalidParameterException(
                f"{self.__class__.__name__} was instantiated with"
                f" `finetune_mode=True` but `pattern_vocabulary_stats`"
                f" was left to `None`. This is invalid since the featurizer"
                f" needs vocabulary statistics to featurize in finetune mode."
            )

    @lazy_property
    def vocabulary_stats(self) -> Dict[Text, int]:
        """Computes the total vocabulary size and how much of it is in use.

        Returns:
            A dict with `pattern_slots_filled` (patterns currently known) and
            `max_number_patterns` (total slots, including reserved extras).
        """
        if not self.finetune_mode:
            # Fresh training: capacity = known patterns + reserved extra slots.
            max_number_patterns = (
                len(self.known_patterns) + self._get_num_additional_slots()
            )
            return {
                "pattern_slots_filled": len(self.known_patterns),
                "max_number_patterns": max_number_patterns,
            }
        else:
            # Finetune mode: capacity is fixed by the stats loaded from disk;
            # only the filled count is refreshed.
            self.pattern_vocabulary_stats["pattern_slots_filled"] = len(
                self.known_patterns
            )
            return self.pattern_vocabulary_stats

    def _merge_new_patterns(self, new_patterns: List[Dict[Text, Text]]) -> None:
        """Updates already known patterns with new patterns extracted from data.

        Patterns whose name is already known only get their regex replaced;
        genuinely new patterns are appended until `max_number_patterns` slots
        are filled, after which they are dropped with a warning.

        Args:
            new_patterns: Patterns extracted from the current training data.
        """
        max_number_patterns = self.pattern_vocabulary_stats["max_number_patterns"]
        pattern_name_index_map = {
            pattern["name"]: index for index, pattern in enumerate(self.known_patterns)
        }
        patterns_dropped = False

        for extra_pattern in new_patterns:
            new_pattern_name = extra_pattern["name"]

            # Some patterns may have just new examples added to them.
            # These do not count as additional patterns.
            if new_pattern_name in pattern_name_index_map:
                self.known_patterns[pattern_name_index_map[new_pattern_name]][
                    "pattern"
                ] = extra_pattern["pattern"]
            else:
                if len(self.known_patterns) == max_number_patterns:
                    patterns_dropped = True
                    continue
                self.known_patterns.append(extra_pattern)

        if patterns_dropped:
            rasa.shared.utils.io.raise_warning(
                f"The originally trained model was configured to "
                f"handle a maximum number of {max_number_patterns} patterns. "
                f"The current training data exceeds this number as "
                f"there are {len(new_patterns)} patterns in total. "
                f"Some patterns will be dropped and not used for "
                f"featurization. It is advisable to re-train the "
                f"model from scratch."
            )

    def _get_num_additional_slots(self) -> int:
        """Computes the number of extra vocabulary slots beyond known patterns."""
        if self.number_additional_patterns is None:
            # We take twice the number of currently defined
            # regex patterns as the number of additional
            # vocabulary slots to support if this parameter
            # is not configured by the user. Also, to avoid having
            # to retrain from scratch very often, the default number
            # of additional slots is kept to MIN_ADDITIONAL_SLOTS.
            # This is an empirically tuned number.
            self.number_additional_patterns = max(
                MIN_ADDITIONAL_REGEX_PATTERNS, len(self.known_patterns) * 2
            )
        return self.number_additional_patterns

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        """Trains the component with all patterns extracted from training data.

        Args:
            training_data: Training data the component is trained on.
            config: NLU model configuration (unused here).
            **kwargs: Additional arguments passed by the pipeline.
        """
        patterns_from_data = pattern_utils.extract_patterns(
            training_data,
            use_lookup_tables=self.component_config["use_lookup_tables"],
            use_regexes=self.component_config["use_regexes"],
            use_word_boundaries=self.component_config["use_word_boundaries"],
        )
        if self.finetune_mode:
            # Merge patterns extracted from data with known patterns.
            self._merge_new_patterns(patterns_from_data)
        else:
            self.known_patterns = patterns_from_data

        # Featurize every training example for each relevant attribute.
        for example in training_data.training_examples:
            for attribute in [TEXT, RESPONSE, ACTION_TEXT]:
                self._text_features_with_regex(example, attribute)

    def process(self, message: Message, **kwargs: Any) -> None:
        """Featurizes an incoming message's text at inference time."""
        self._text_features_with_regex(message, TEXT)

    def _text_features_with_regex(self, message: Message, attribute: Text) -> None:
        """Helper method to extract features and set them appropriately in the message object.

        Args:
            message: The message to featurize.
            attribute: The message attribute (e.g. TEXT) to featurize.
        """
        if self.known_patterns:
            sequence_features, sentence_features = self._features_for_patterns(
                message, attribute
            )

            if sequence_features is not None:
                final_sequence_features = Features(
                    sequence_features,
                    FEATURE_TYPE_SEQUENCE,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
                message.add_features(final_sequence_features)

            if sentence_features is not None:
                final_sentence_features = Features(
                    sentence_features,
                    FEATURE_TYPE_SENTENCE,
                    attribute,
                    self.component_config[FEATURIZER_CLASS_ALIAS],
                )
                message.add_features(final_sentence_features)

    def _features_for_patterns(
        self, message: Message, attribute: Text
    ) -> Tuple[Optional[scipy.sparse.coo_matrix], Optional[scipy.sparse.coo_matrix]]:
        """Checks which known patterns match the message.

        Given a sentence, returns a vector of {1,0} values indicating which
        regexes did match. Furthermore, if the
        message is tokenized, the function will mark all tokens with a dict
        relating the name of the regex to whether it was matched.

        Args:
            message: The message to inspect.
            attribute: The message attribute to match against.

        Returns:
            A `(sequence_features, sentence_features)` tuple of sparse COO
            matrices, or `(None, None)` when the attribute is absent or the
            message has no tokens.
        """
        # Attribute not set (e.g. response not present).
        if not message.get(attribute):
            return None, None

        tokens = message.get(TOKENS_NAMES[attribute], [])
        if not tokens:
            # Nothing to featurize.
            return None, None

        flags = 0  # default flag
        if not self.case_sensitive:
            flags = re.IGNORECASE

        sequence_length = len(tokens)
        # Matrices are sized to the full vocabulary (including reserved slots)
        # so their shape stays stable across incremental trainings.
        max_number_patterns = self.vocabulary_stats["max_number_patterns"]
        sequence_features = np.zeros([sequence_length, max_number_patterns])
        sentence_features = np.zeros([1, max_number_patterns])

        for pattern_index, pattern in enumerate(self.known_patterns):
            matches = re.finditer(
                pattern["pattern"], message.get(attribute), flags=flags
            )
            matches = list(matches)

            for token_index, t in enumerate(tokens):
                patterns = t.get("pattern", default={})
                patterns[pattern["name"]] = False

                for match in matches:
                    # A token is marked as matching when it overlaps the
                    # match span in any way.
                    if t.start < match.end() and t.end > match.start():
                        patterns[pattern["name"]] = True
                        sequence_features[token_index][pattern_index] = 1.0
                        if attribute in [RESPONSE, TEXT, ACTION_TEXT]:
                            # sentence vector should contain all patterns
                            sentence_features[0][pattern_index] = 1.0

                t.set("pattern", patterns)

        return (
            scipy.sparse.coo_matrix(sequence_features),
            scipy.sparse.coo_matrix(sentence_features),
        )

    @classmethod
    def load(
        cls,
        meta: Dict[Text, Any],
        model_dir: Optional[Text] = None,
        model_metadata: Optional[Metadata] = None,
        cached_component: Optional["RegexFeaturizer"] = None,
        should_finetune: bool = False,
        **kwargs: Any,
    ) -> "RegexFeaturizer":
        """Loads a previously trained component.

        Args:
            meta: Component metadata; must contain the `file` key.
            model_dir: Directory the model was persisted to.
            model_metadata: Metadata of the whole model (unused here).
            cached_component: Previously cached component instance (unused).
            should_finetune: Whether to load the component for fine-tuning.
            **kwargs: Additional arguments passed by the pipeline.

        Returns:
            A `RegexFeaturizer` restored from the persisted patterns and
            vocabulary statistics.
        """
        file_name = meta.get("file")

        patterns_file_name = Path(model_dir) / (file_name + ".patterns.pkl")
        vocabulary_stats_file_name = Path(model_dir) / (
            file_name + ".vocabulary_stats.pkl"
        )

        # Missing files are tolerated: the featurizer then starts empty.
        known_patterns = None
        vocabulary_stats = None
        if patterns_file_name.exists():
            known_patterns = rasa.shared.utils.io.read_json_file(patterns_file_name)
        if vocabulary_stats_file_name.exists():
            vocabulary_stats = rasa.shared.utils.io.read_json_file(
                vocabulary_stats_file_name
            )

        return RegexFeaturizer(
            meta,
            known_patterns=known_patterns,
            pattern_vocabulary_stats=vocabulary_stats,
            finetune_mode=should_finetune,
        )

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persists this model into the passed directory.

        Args:
            file_name: Base name for the persisted files.
            model_dir: Directory to write into.

        Returns:
            Metadata dict referencing the persisted file name.
        """
        patterns_file_name = file_name + ".patterns.pkl"
        regex_file = Path(model_dir) / patterns_file_name
        utils.write_json_to_file(regex_file, self.known_patterns, indent=4)

        vocabulary_stats_file_name = file_name + ".vocabulary_stats.pkl"
        vocabulary_file = Path(model_dir) / vocabulary_stats_file_name
        utils.write_json_to_file(vocabulary_file, self.vocabulary_stats, indent=4)

        return {"file": file_name}