用正则表达式给email分句

34 篇文章 0 订阅
24 篇文章 0 订阅
/// <summary>
/// Splits a block of text (for example an e-mail body) into sentences.
/// </summary>
/// <remarks>
/// Processing steps:
/// 1. Outside double-quoted spans, the marker "#N#" is inserted after every
///    '.', '!' or '?' that is not preceded by one of the abbreviations listed
///    in the look-behind (etc, Prof, Mr, ...). Quoted spans are copied verbatim.
/// 2. Possessive 's (ASCII or typographic apostrophe) is detached from its word.
/// 3. Every parsable rule in <paramref name="segrules"/> is applied as a
///    regex replacement, in order.
/// 4. The text is split on "#N#"; each non-empty piece is trimmed and passed
///    through Preprocessing.
/// </remarks>
/// <param name="para">Text to split. A null value yields an empty list.</param>
/// <param name="segrules">
/// Rule lines of the form <c>@"pattern", @"replacement");</c>. Lines that do not
/// have this shape are ignored. May be null (no rules).
/// </param>
/// <returns>The sentences in their original order.</returns>
public List<string> SpliteToSentences(string para, List<string> segrules)
        {
            const string Marker = "#N#";
            const string BoundaryPattern = @"(?<!(?:\W[A-Z]|etc|Prof|Mr|Miss|Mt|Rep|Dr|\bPh))(\.|!|\?)";
            const string BoundaryReplacement = "$1" + Marker;
            // Anchored at the start so commented-out lines are skipped; tolerant of a
            // missing space after the comma and a missing '@' on the replacement.
            const string RulePattern = @"^@""(.*)""\s*,\s*@?""(.*)""\);";

            if (para == null)
            {
                return new List<string>();
            }

            // Walk the text quote pair by quote pair: text before an opening quote gets
            // boundary markers, the quoted span itself (quotes included) is copied as is.
            StringBuilder marked = new StringBuilder(para.Length + 16);
            int cursor = 0;
            while (cursor < para.Length)
            {
                int open = para.IndexOf('"', cursor);
                if (open < 0)
                {
                    break;
                }

                int close = para.IndexOf('"', open + 1);
                if (close < 0)
                {
                    // Unpaired quote: the remainder is handled as ordinary text below.
                    break;
                }

                marked.Append(Regex.Replace(para.Substring(cursor, open - cursor), BoundaryPattern, BoundaryReplacement));
                marked.Append(para, open, close - open + 1);
                cursor = close + 1;
            }

            if (cursor < para.Length)
            {
                marked.Append(Regex.Replace(para.Substring(cursor), BoundaryPattern, BoundaryReplacement));
            }

            string text = marked.ToString();
            text = text.Replace("'s", " 's");
            text = text.Replace("’s", " 's");

            if (segrules != null)
            {
                foreach (string rawRule in segrules)
                {
                    if (rawRule == null)
                    {
                        continue;
                    }

                    // Trim first: rule text split on '\n' may still carry a trailing '\r',
                    // which must not end up inside the pattern or the replacement.
                    Match parsed = Regex.Match(rawRule.Trim(), RulePattern);
                    if (!parsed.Success)
                    {
                        continue;
                    }

                    text = Regex.Replace(text, parsed.Groups[1].Value, parsed.Groups[2].Value);
                }
            }

            return text.Split(new[] { Marker }, StringSplitOptions.RemoveEmptyEntries)
                       .Select(piece => Preprocessing(piece.Trim()))
                       .ToList();
        }

private void ExecEnTextSegmentationToolsUsingEmailNER(HttpContext context, string text, IEnumerable<Tool> tools, JsonMultiExtraction ret)
        {
            // Options are taken from the HTTP request.
            // NOTE(review): usingrule is not used anywhere in the visible part of this method.
            string usingrule = context.Request["custom"];
            // The string "true" selects the rule-based splitter (SpliteToSentences) below.
            string usingemailseg = context.Request["segtool"];
            // Segmentation rules arrive with the request as text, one rule per line.
            SegRulesText = context.Request["segrulestext"];
            // Only '\n' is used as separator, so a '\r' from CRLF input stays attached to each rule.
            List<string> segrules = SegRulesText.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries).ToList();
            
            List<string> docs = new List<string>();
            // The input is cut into documents at every run of 75 '=' characters.
            docs = Regex.Split(text, "={75}", RegexOptions.Multiline).ToList();
            List<string> paras=new List<string>();
            
            foreach(var doc in docs)
            {
                text = doc;
                if (usingemailseg == "true")
                    paras = SpliteToSentences(text, segrules);
                else
                    // Fallback: one segment per non-empty line, trimmed and run through Preprocessing.
                    paras = text.Split(new char[] { '\n' }, StringSplitOptions.RemoveEmptyEntries).Select(p => Preprocessing(p.Trim())).ToList();
                // NOTE(review): paras is reassigned on every iteration; the excerpt ends here,
                // so how paras is consumed (and the method's closing brace) is not shown.
}


初始化页面时做的:

        /// <summary>
        /// Page load handler. When the request language is "en", reads the
        /// segmentation rule file from ~/app_data/SegRules.txt (UTF-8) into SegRules.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            string requestLang = LangUtil.GetRequestLang(Request);
            if (requestLang != "en")
            {
                // Rules are only loaded for English requests.
                return;
            }

            string rulesPath = Server.MapPath("~/app_data/SegRules.txt");
            SegRules = File.ReadAllText(rulesPath, Encoding.UTF8);
        }


最后,上我的所有分句规则,为了灵活,都是写在外部文档中的:

//SegRules.txt

@"\n\s*\n", @"#N#");
@"\.#N#(?=\w+([-+.]\w+)*@(\w+([-.]\w+)*))", @".");
@"(?<=\w+([-+.]\w+)*@\w+([-.]\w+)*)\.#N#", @".");
@"\.#N#(?=([a-z]+(\.#N#)*)*([a-z]+))", @".");
@"\bvs\.\s*#N#", @"vs.");
@"-{4}(-)+([a-zA-z/0-9:\s]+)?-{4}(-)+", @"#N#$&#N#");
@"\nFrom\s*:", @"#N#$&");
@"\n(To|to)\s*:", @"#N#$&");
@"\n(Cc|cc)\s*:", @"#N#$&");
@"\n(Bcc|bcc)\s*:", @"#N#$&");
@"\nSent\s*:", @"#N#$&");
@"\nSubject\s*:", @"#N#$&");
@"\nTitle\s*:", @"#N#$&");
@"\nDate\s*:", @"#N#$&");
@"\nWhen\s*:", @"#N#$&");
@"\nTime\s*:", @"#N#$&");
@"\nWhere\s*:", @"#N#$&");
@"\nHost\s*:", @"#N#$&");
@"\nSpeaker\s*:", @"#N#$&");
@"\nVenue\s*:", @"#N#$&");
@"\nNote\s*:", @"#N#$&");
@"\nAbstract\s*:", @"#N#$&");
@"\nRoom\s*:", @"#N#$&");
@"\nBio\n\s*:", @"#N#$&#N#");
@"(!#N#){3}", @"!!!#N#");
@"(!#N#){2}", @"!!#N#");
@"(\.#N#){3}", @"...#N#");
@"(\.#N#){2}", @"..#N#");
@"(i\.#N#e)", @"i.e");
@"(?<t1>\.)#N#(?<t2>txt|doc|xls)(?=\s)", @"${t1}${t2}#N#");
@"\r\r(\r)*", @"#N#$&#N#");




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值