**为什么你的语音合成需要“文本炼金术”?**
- 💀 血泪现场:AI直接读“USD5000”变成“美元SD5000”?日期读成“2023减8减15”?
- 🔥 技术神话:通过文本预处理将合成准确率从60%提升到98%
- 🚀 本文目标:从零构建一个支持中文、英文、数字、日期的智能文本预处理器
**C#文本预处理的“炼金术士工具箱”**
1. 分句处理:给文本戴上“呼吸节奏器”
/// <summary>
/// Splits raw text into trimmed, non-empty sentences.
/// </summary>
public class SentenceSplitter {
    // Split points:
    //   1. right after a sentence terminator (CJK full stop, ASCII or full-width ! and ?);
    //   2. right after an ASCII period that is NOT followed by a digit, so decimals
    //      such as "5.5" are kept in one piece;
    //   3. on any run of newlines.
    // Whitespace following a terminator is consumed by the separator itself.
    private static readonly Regex _sentenceSplitter = new Regex(
        @"(?<=[。!?!?])\s*|(?<=\.)(?!\d)\s*|\n+",
        RegexOptions.Compiled);

    /// <summary>
    /// Breaks <paramref name="text"/> into sentences, keeping the terminating punctuation
    /// attached to each sentence.
    /// </summary>
    /// <param name="text">Text to split; may be null or blank.</param>
    /// <returns>
    /// The trimmed, non-empty sentences in their original order; an empty sequence when
    /// <paramref name="text"/> is null, empty or whitespace only.
    /// </returns>
    public IEnumerable<string> SplitSentences(string text) {
        if (string.IsNullOrWhiteSpace(text)) {
            return Enumerable.Empty<string>();
        }

        return _sentenceSplitter.Split(text)
            .Select(piece => piece.Trim())
            .Where(piece => piece.Length > 0);
    }
}
// Usage example: split a three-sentence paragraph on its sentence-ending punctuation.
var splitter = new SentenceSplitter();
var sentences = splitter.SplitSentences("你好!今天天气不错。下午有会议吗?");
2. 缩略语展开:让AI读懂“人类黑话”
/// <summary>
/// Replaces known abbreviations with their spoken Chinese expansion.
/// </summary>
public class AbbreviationExpander {
    private readonly Dictionary<string, string> _abbreviations = new() {
        {"AI", "人工智能"},
        {"USD", "美元"},
        {"CEO", "首席执行官"},
        {"etc.", "等等"}
    };

    // Single alternation over all keys, built once per instance.
    private readonly System.Text.RegularExpressions.Regex _abbreviationPattern;

    /// <summary>
    /// Builds the matcher from the abbreviation table.
    /// </summary>
    public AbbreviationExpander() {
        // Longest keys first so a longer abbreviation wins over a shorter one that
        // happens to be its prefix.
        var keys = new List<string>(_abbreviations.Keys);
        keys.Sort((left, right) => right.Length.CompareTo(left.Length));
        var escaped = keys.ConvertAll(System.Text.RegularExpressions.Regex.Escape);

        // The ASCII-letter lookarounds stop "AI" from matching inside words such as
        // "MAIL", while still allowing digits or CJK text to be adjacent ("USD5000").
        _abbreviationPattern = new System.Text.RegularExpressions.Regex(
            "(?<![A-Za-z])(?:" + string.Join("|", escaped) + ")(?![A-Za-z])",
            System.Text.RegularExpressions.RegexOptions.Compiled);
    }

    /// <summary>
    /// Expands every stand-alone occurrence of a known abbreviation in
    /// <paramref name="text"/>. Matching is case-sensitive.
    /// </summary>
    /// <param name="text">Text to process; may be null or empty.</param>
    /// <returns>
    /// The text with abbreviations expanded; null or empty input is returned unchanged.
    /// </returns>
    public string ExpandAbbreviations(string text) {
        if (string.IsNullOrEmpty(text)) {
            return text;
        }

        // One pass over the input: replacement text is never re-scanned.
        return _abbreviationPattern.Replace(text, match => _abbreviations[match.Value]);
    }
}
// Usage example: expand the abbreviations "AI" and "USD" inside a Chinese sentence.
var expander = new AbbreviationExpander();
var expanded = expander.ExpandAbbreviations("AI和USD5000是常见缩写。");
3. 数字规范化:让AI“数数不迷糊”
/// <summary>
/// Rewrites Arabic-digit numbers in text as Chinese number words.
/// </summary>
public class NumberNormalizer {
    // "\b" cannot be used here: in .NET, CJK characters are word characters, so there
    // is no word boundary between "是" and "2". The lookarounds reproduce a word
    // boundary for ASCII letters/digits/underscore only, so "abc123" is still skipped
    // while "价格是2345元" is matched.
    private static readonly Regex _numberPattern = new Regex(
        @"(?<![A-Za-z0-9_])[0-9]+(?:\.[0-9]+)?(?![A-Za-z0-9_])",
        RegexOptions.Compiled);

    /// <summary>
    /// Replaces every stand-alone integer or decimal in <paramref name="text"/> with
    /// its Chinese reading.
    /// </summary>
    /// <param name="text">Text to process; may be null or empty.</param>
    /// <returns>
    /// The text with numbers spelled out. Numbers that do not fit in a
    /// <see cref="decimal"/> are left as they are; null or empty input is returned unchanged.
    /// </returns>
    public string NormalizeNumbers(string text) {
        if (string.IsNullOrEmpty(text)) {
            return text;
        }

        return _numberPattern.Replace(text, match => {
            // Invariant culture: "." must always be the decimal separator, regardless
            // of the machine's regional settings.
            bool parsed = decimal.TryParse(
                match.Value,
                System.Globalization.NumberStyles.AllowDecimalPoint,
                System.Globalization.CultureInfo.InvariantCulture,
                out decimal value);
            return parsed ? value.ConvertToWords() : match.Value;
        });
    }
}

/// <summary>
/// Number-to-words helpers.
/// </summary>
public static class NumberExtensions {
    private static readonly string[] s_digits = { "零", "一", "二", "三", "四", "五", "六", "七", "八", "九" };
    private static readonly string[] s_placeUnits = { "", "十", "百", "千" };
    private static readonly int[] s_placeValues = { 1, 10, 100, 1000 };

    // Unit appended to each group of four digits, lowest group first.
    private static readonly string[] s_sectionUnits = { "", "万", "亿", "万亿" };

    /// <summary>
    /// Converts a number to its Chinese reading, e.g. 2345 -> "二千三百四十五",
    /// 10005 -> "一万零五", 5.5 -> "五点五".
    /// </summary>
    /// <param name="number">The value to convert.</param>
    /// <returns>
    /// The Chinese reading. Negative values are prefixed with "负"; fractional digits
    /// are read one by one after "点"; integer parts longer than 16 digits are read
    /// digit by digit.
    /// </returns>
    public static string ConvertToWords(this decimal number) {
        // Work on the invariant string form so the integer and fractional digits
        // (including trailing zeros) are available without further arithmetic.
        string raw = number.ToString(System.Globalization.CultureInfo.InvariantCulture);
        bool negative = raw.StartsWith("-", StringComparison.Ordinal);
        if (negative) {
            raw = raw.Substring(1);
        }

        int dot = raw.IndexOf('.');
        string integerDigits = dot < 0 ? raw : raw.Substring(0, dot);
        string fractionDigits = dot < 0 ? string.Empty : raw.Substring(dot + 1);

        var words = new System.Text.StringBuilder();
        if (negative) {
            words.Append("负");
        }
        words.Append(IntegerToWords(integerDigits));
        if (fractionDigits.Length > 0) {
            words.Append("点");
            AppendDigitByDigit(words, fractionDigits);
        }
        return words.ToString();
    }

    // Reads a string of ASCII digits as a Chinese integer.
    private static string IntegerToWords(string digits) {
        digits = digits.TrimStart('0');
        if (digits.Length == 0) {
            return "零";
        }

        var words = new System.Text.StringBuilder();
        if (digits.Length > 4 * s_sectionUnits.Length) {
            // Beyond the largest supported unit: fall back to reading the digits.
            AppendDigitByDigit(words, digits);
            return words.ToString();
        }

        bool zeroPending = false;
        int sectionCount = (digits.Length + 3) / 4;
        for (int index = sectionCount - 1; index >= 0; index--) {
            int end = digits.Length - index * 4;
            int start = Math.Max(0, end - 4);
            int value = int.Parse(
                digits.Substring(start, end - start),
                System.Globalization.CultureInfo.InvariantCulture);

            if (value == 0) {
                // An all-zero group is silent, but forces a "零" before the next group.
                zeroPending = true;
                continue;
            }

            // A lower group with leading zeros (value < 1000) also needs a "零" bridge.
            if (words.Length > 0 && (zeroPending || value < 1000)) {
                words.Append("零");
            }
            zeroPending = false;

            AppendSection(words, value);
            words.Append(s_sectionUnits[index]);
        }

        // 10-19 (and 10万, 10亿 ...) are read "十…" rather than "一十…" at the very start.
        string result = words.ToString();
        return result.StartsWith("一十", StringComparison.Ordinal) ? result.Substring(1) : result;
    }

    // Appends the reading of one four-digit group (1..9999).
    private static void AppendSection(System.Text.StringBuilder words, int value) {
        bool started = false;
        bool zeroPending = false;
        for (int place = 3; place >= 0; place--) {
            int digit = value / s_placeValues[place] % 10;
            if (digit == 0) {
                // Only zeros between two non-zero digits are spoken.
                zeroPending = started;
                continue;
            }
            if (zeroPending) {
                words.Append("零");
                zeroPending = false;
            }
            words.Append(s_digits[digit]).Append(s_placeUnits[place]);
            started = true;
        }
    }

    // Appends each ASCII digit as its own Chinese numeral.
    private static void AppendDigitByDigit(System.Text.StringBuilder words, string digits) {
        foreach (char digit in digits) {
            words.Append(s_digits[digit - '0']);
        }
    }
}
// Usage example: the input contains one integer (2345) and one decimal (5.5).
var normalizer = new NumberNormalizer();
var normalized = normalizer.NormalizeNumbers("价格是2345元,折扣5.5折。");
4. 日期时间转换:让AI“读时间像算命”
/// <summary>
/// Rewrites numeric <c>yyyy-M-d</c> dates in text into the Chinese
/// <c>yyyy年MM月dd日</c> form.
/// </summary>
public class DateTimeConverter {
    // "\b" cannot be used here: in .NET, CJK characters are word characters, so there
    // is no word boundary between "在" and "2023". The lookarounds reproduce a word
    // boundary for ASCII letters/digits/underscore only.
    private static readonly Regex _datePattern = new Regex(
        @"(?<![A-Za-z0-9_])[0-9]{4}-[0-9]{1,2}-[0-9]{1,2}(?![A-Za-z0-9_])",
        RegexOptions.Compiled);

    /// <summary>
    /// Converts every valid <c>yyyy-M-d</c> date found in <paramref name="text"/>.
    /// Only the date is rewritten; a time of day following it is left untouched.
    /// </summary>
    /// <param name="text">Text to process; may be null or empty.</param>
    /// <returns>
    /// The text with dates converted. Candidates that are not real calendar dates
    /// (e.g. month 13) are left as they are; null or empty input is returned unchanged.
    /// </returns>
    public string ConvertDateTime(string text) {
        if (string.IsNullOrEmpty(text)) {
            return text;
        }

        return _datePattern.Replace(text, match => {
            // Exact format + invariant culture: the result must not depend on the
            // machine's regional settings or default calendar.
            bool valid = DateTime.TryParseExact(
                match.Value,
                "yyyy-M-d",
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None,
                out var date);
            return valid
                ? date.ToString("yyyy年MM月dd日", System.Globalization.CultureInfo.InvariantCulture)
                : match.Value;
        });
    }
}
// Usage example: the input contains a yyyy-MM-dd date followed by a time of day.
var converter = new DateTimeConverter();
var converted = converter.ConvertDateTime("会议在2023-08-15 14:30举行。");
5. 标点优化:给AI戴上“语调调节器”
/// <summary>
/// Pads sentence-ending punctuation with a trailing space.
/// </summary>
public class PunctuationOptimizer {
    /// <summary>
    /// Applies the punctuation substitutions to <paramref name="text"/>, one after another.
    /// </summary>
    /// <param name="text">Text to process.</param>
    /// <returns>The text with a space inserted after every "。", "!" and "?".</returns>
    public string OptimizePunctuation(string text) {
        // Substitution table, applied in order. The comma entry maps the comma to
        // itself, so it leaves the text as it is.
        var substitutions = new (string From, string To)[] {
            ("。", "。 "),
            ("!", "! "),
            ("?", "? "),
            (",", ","),
        };

        string result = text;
        foreach (var (from, to) in substitutions) {
            result = result.Replace(from, to);
        }
        return result;
    }
}
// Usage example: the input contains one "!" and one "?".
var optimizer = new PunctuationOptimizer();
var optimized = optimizer.OptimizePunctuation("今天天气好!明天会更好?");
6. 敏感词过滤:给AI套上“合规紧箍咒”
/// <summary>
/// Masks a fixed list of sensitive words.
/// </summary>
public class SensitiveWordFilter {
    private readonly List<string> _sensitiveWords = new() { "违法", "色情", "赌博" };

    /// <summary>
    /// Replaces every occurrence of a sensitive word in <paramref name="text"/>
    /// with "****".
    /// </summary>
    /// <param name="text">Text to filter; may be null or empty.</param>
    /// <returns>The masked text; null or empty input is returned unchanged.</returns>
    public string FilterSensitiveWords(string text) {
        if (string.IsNullOrEmpty(text)) {
            return text;
        }

        foreach (var word in _sensitiveWords) {
            // Plain substring replacement on purpose: Chinese text has no spaces
            // between words, and a regex "\b" never matches between two CJK
            // characters, so a word-boundary pattern would mask nothing.
            text = text.Replace(word, "****");
        }
        return text;
    }
}
// Usage example: the input contains the listed word "违法" embedded in a sentence.
var filter = new SensitiveWordFilter();
var filtered = filter.FilterSensitiveWords("该网站涉及违法内容。");
7. 情感标注:让AI“察言观色”
/// <summary>
/// Wraps text in an emotion tag chosen by keyword lookup.
/// </summary>
public class SentimentAnnotator {
    /// <summary>
    /// Tags <paramref name="text"/> as "joy" when it contains "高兴", otherwise as
    /// "anger" when it contains "生气".
    /// </summary>
    /// <param name="text">Text to annotate.</param>
    /// <returns>
    /// The text wrapped in <c>&lt;emotion:NAME&gt;…&lt;/emotion&gt;</c>, or the text
    /// itself when neither keyword is present.
    /// </returns>
    public string AnnotateSentiment(string text) {
        // "高兴" is checked first, so it wins when both keywords appear.
        string emotion = text.Contains("高兴") ? "joy"
            : text.Contains("生气") ? "anger"
            : "";

        if (emotion.Length == 0) {
            return text;
        }
        return $"<emotion:{emotion}>{text}</emotion>";
    }
}
// Usage example: the input contains the keyword "高兴".
var annotator = new SentimentAnnotator();
var annotated = annotator.AnnotateSentiment("听到这个消息,我高兴得跳了起来!");
8. 多语言支持:让AI“说全球话”
/// <summary>
/// Dispatches text processing on the culture name.
/// </summary>
public class MultiLanguageProcessor {
    /// <summary>
    /// Selects a branch by <paramref name="culture"/> name. Every branch currently
    /// returns <paramref name="text"/> as it is.
    /// </summary>
    /// <param name="text">Text to process.</param>
    /// <param name="culture">Culture whose <c>Name</c> selects the branch.</param>
    /// <returns>The input text, unmodified.</returns>
    public string ProcessText(string text, CultureInfo culture) {
        switch (culture.Name) {
            case "zh-CN":
                // Chinese branch: no transformation applied.
                return text;
            case "en-US":
                // English branch: no transformation applied.
                return text;
            default:
                return text;
        }
    }
}
// Usage example: process an English string under the en-US culture.
var processor = new MultiLanguageProcessor();
var processed = processor.ProcessText("Hello World!", CultureInfo.GetCultureInfo("en-US"));
9. 性能优化:让文本处理“飞”起来
/// <summary>
/// Splits text into sentences and normalizes the numbers of each sentence in parallel.
/// </summary>
public class HighPerformancePreprocessor {
    private readonly SentenceSplitter _splitter = new();
    private readonly NumberNormalizer _numberNormalizer = new();

    /// <summary>
    /// Splits <paramref name="text"/> into sentences, normalizes the numbers in each
    /// one, and joins the results with a single space.
    /// </summary>
    /// <param name="text">Text to process.</param>
    /// <returns>The processed sentences, in their original order, joined by " ".</returns>
    public string ProcessText(string text) {
        // Materialize into an array so each worker can write its result back into
        // its own slot.
        string[] sentences = _splitter.SplitSentences(text).ToArray();

        // Assigning to the lambda parameter (as a Parallel.ForEach body would) only
        // changes a local copy and loses the result; writing by index keeps it and
        // preserves sentence order.
        Parallel.For(0, sentences.Length, index => {
            sentences[index] = _numberNormalizer.NormalizeNumbers(sentences[index]);
        });

        return string.Join(" ", sentences);
    }
}