/// <summary>
/// Splits a paragraph into sentences.
/// </summary>
/// <remarks>
/// Works in three stages:
/// 1. Text outside paired ASCII double quotes gets the marker <c>#N#</c> inserted after
///    every '.', '!' or '?' that is not preceded by a listed abbreviation or by a
///    non-word character followed by a single capital letter. Quoted spans are copied verbatim.
/// 2. Each entry of <paramref name="segrules"/> of the form <c>@"pattern", @"replacement");</c>
///    is applied to the marked text as a regular-expression replacement, in list order.
/// 3. The text is split on <c>#N#</c>; each piece is trimmed and passed through
///    <c>Preprocessing</c>.
/// </remarks>
/// <param name="para">Paragraph to split. Null or empty yields an empty list.</param>
/// <param name="segrules">
/// Rule lines. Lines that do not have the <c>@"pattern", @"replacement");</c> shape are
/// ignored. Null is treated as "no rules".
/// </param>
/// <returns>The sentences in their original order; empty pieces are dropped.</returns>
/// <exception cref="System.ArgumentException">A rule's pattern is not a valid regular expression.</exception>
public List<string> SpliteToSentences(string para, List<string> segrules)
{
    // Sentence terminator not preceded by an abbreviation / initial; used for every unquoted span.
    const string sentenceEndPattern = @"(?<!(?:\W[A-Z]|etc|Prof|Mr|Miss|Mt|Rep|Dr|\bPh))(\.|!|\?)";
    const string sentenceEndReplacement = "$1#N#";

    // One rule line: @"pattern", @"replacement");
    // Anchored so that nothing outside the two literals (e.g. a trailing '\r' left over from
    // splitting a CRLF file on '\n') leaks into the pattern or the replacement. Whitespace
    // around the comma and the '@' on the replacement literal are optional, so rules written
    // as `",@"` or `", "` are no longer silently skipped.
    const string ruleLinePattern = @"^\s*@""(?<pattern>.*)""\s*,\s*@?""(?<replacement>.*)""\s*\)\s*;\s*$";

    if (string.IsNullOrEmpty(para))
    {
        return new List<string>();
    }

    // Positions of every ASCII double quote, in order of appearance.
    List<int> quotePos = new List<int>();
    for (int at = para.IndexOf('"'); at != -1; at = para.IndexOf('"', at + 1))
    {
        quotePos.Add(at);
    }

    // Walk the quotes pairwise: mark sentence ends in the text before each opening quote,
    // then copy the quoted span (both quote characters included) untouched.
    StringBuilder marked = new StringBuilder();
    int pos = 0;
    for (int i = 0; i + 1 < quotePos.Count; i += 2)
    {
        int open = quotePos[i];
        int close = quotePos[i + 1];
        marked.Append(Regex.Replace(para.Substring(pos, open - pos), sentenceEndPattern, sentenceEndReplacement));
        marked.Append(para, open, close - open + 1);
        pos = close + 1;
    }

    // Tail after the last closed quote. An unpaired trailing quote lands here and is
    // therefore treated as ordinary text.
    if (pos < para.Length)
    {
        marked.Append(Regex.Replace(para.Substring(pos), sentenceEndPattern, sentenceEndReplacement));
    }

    string input = marked.ToString();

    // Detach possessive 's (straight and curly apostrophe) from the preceding word.
    input = input.Replace("'s", " 's");
    input = input.Replace("’s", " 's");

    if (segrules != null)
    {
        foreach (string rule in segrules)
        {
            if (rule == null)
            {
                continue;
            }

            Match parsed = Regex.Match(rule, ruleLinePattern);
            if (!parsed.Success)
            {
                // Not a rule line (comment, header, blank, malformed): skip it.
                continue;
            }

            input = Regex.Replace(input, parsed.Groups["pattern"].Value, parsed.Groups["replacement"].Value);
        }
    }

    return input
        .Split(new[] { "#N#" }, StringSplitOptions.RemoveEmptyEntries)
        .Select(p => Preprocessing(p.Trim()))
        .ToList();
}
private void ExecEnTextSegmentationToolsUsingEmailNER(HttpContext context, string text, IEnumerable<Tool> tools, JsonMultiExtraction ret)
{
string usingrule = context.Request["custom"];
string usingemailseg = context.Request["segtool"];
SegRulesText = context.Request["segrulestext"];
List<string> segrules = SegRulesText.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList();
List<string> docs = new List<string>();
docs = Regex.Split(text, "={75}", RegexOptions.Multiline).ToList();
List<string> paras=new List<string>();
foreach(var doc in docs)
{
text = doc;
if (usingemailseg == "true")
paras = SpliteToSentences(text, segrules);
else
paras = text.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries).Select(p => Preprocessing(p.Trim())).ToList();
}
初始化页面时做的:
/// <summary>
/// Page load handler: when the request language is "en", reads the segmentation
/// rules file from app_data (UTF-8) into <c>SegRules</c>. Does nothing for other languages.
/// </summary>
protected void Page_Load(object sender, EventArgs e)
{
    string requestLang = LangUtil.GetRequestLang(Request);
    bool isEnglish = requestLang == "en";
    if (!isEnglish)
    {
        return;
    }

    string rulesPath = Server.MapPath("~/app_data/SegRules.txt");
    SegRules = File.ReadAllText(rulesPath, Encoding.UTF8);
}
最后,上我的所有分句规则,为了灵活,都是写在外部文档中的:
//SegRules.txt
@"\n\s*\n", @"#N#");
@"\.#N#(?=\w+([-+.]\w+)*@(\w+([-.]\w+)*))", @".");
@"(?<=\w+([-+.]\w+)*@\w+([-.]\w+)*)\.#N#", @".");
@"\.#N#(?=([a-z]+(\.#N#)*)*([a-z]+))", @".");
@"\bvs\.\s*#N#", @"vs.");
@"-{4}(-)+([a-zA-Z/0-9:\s]+)?-{4}(-)+", @"#N#$&#N#");
@"\nFrom\s*:", @"#N#$&");
@"\n(To|to)\s*:", @"#N#$&");
@"\n(Cc|cc)\s*:", @"#N#$&");
@"\n(Bcc|bcc)\s*:", @"#N#$&");
@"\nSent\s*:", @"#N#$&");
@"\nSubject\s*:", @"#N#$&");
@"\nTitle\s*:", @"#N#$&");
@"\nDate\s*:", @"#N#$&");
@"\nWhen\s*:", @"#N#$&");
@"\nTime\s*:", @"#N#$&");
@"\nWhere\s*:", @"#N#$&");
@"\nHost\s*:", @"#N#$&");
@"\nSpeaker\s*:", @"#N#$&");
@"\nVenue\s*:", @"#N#$&");
@"\nNote\s*:", @"#N#$&");
@"\nAbstract\s*:", @"#N#$&");
@"\nRoom\s*:", @"#N#$&");
@"\nBio\n\s*:", @"#N#$&#N#");
@"(!#N#){3}", @"!!!#N#");
@"(!#N#){2}", @"!!#N#");
@"(\.#N#){3}", @"...#N#");
@"(\.#N#){2}", @"..#N#");
@"(i\.#N#e)", @"i.e");
@"(?<t1>\.)#N#(?<t2>txt|doc|xls)(?=\s)", @"${t1}${t2}#N#");
@"\r\r(\r)*", @"#N#$&#N#");