C#版正文抽取所需正则全集

最新推荐文章于 2023-04-03 10:37:08 发布

yongping8204

最新推荐文章于 2023-04-03 10:37:08 发布

阅读量1.4k

点赞数

分类专栏：Nutch搜索引擎　　文章标签：c#、regex、正则表达式、bbs、url、html

Nutch搜索引擎专栏收录该内容

5 篇文章 1 订阅

订阅专栏

在正文抽取（正文提取）里一般会用到的正则，易尔译科技收集了一下，是C#版本的正文抽取正则表达式。欢迎大家补充。

#region 相关正则表达式

/// <summary>
/// Matches every fragment that should be stripped from a page: BBCode-style
/// <c>[tag]…[/tag]</c> blocks, short anchor elements, <c>style</c>/<c>select</c>/<c>script</c>
/// elements, HTML comments, <c>li</c> elements, any remaining tag, named and numeric
/// character entities, six-character <c>#xxxxxx</c> tokens and runs of whitespace.
/// </summary>
/// <remarks>
/// The regex escapes had been flattened from backslashes to forward slashes
/// (<c>/s</c>, <c>/u4E00</c>, <c>/[</c> …), which left the pattern with unbalanced
/// parentheses. The backslashes are restored here; genuine slashes such as the ones in
/// <c>&lt;/a&gt;</c> and <c>/style&gt;</c> are kept.
/// NOTE(review): with <see cref="RegexOptions.ExplicitCapture"/> the unnamed groups of the
/// first alternative do not capture, so <c>\1</c> appears to refer to the first named group
/// (<c>lj</c>) rather than to the BBCode tag name — confirm whether that alternative is
/// expected to match at all.
/// </remarks>
private static readonly Regex FilterAll = new Regex(
@"(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?<lj>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");]))|(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)|(\&\#\d+\;)",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);
// Disabled alternative kept for reference: (?<Link><a[\s\S]*?</a>)|

/// <summary>
/// Matches an opening or closing <c>title</c> tag, allowing whitespace after
/// <c>&lt;</c> and before <c>&gt;</c>; matching is case-insensitive.
/// </summary>
/// <remarks>
/// The whitespace escapes had been flattened to <c>/s</c>, so the pattern required a
/// literal slash followed by <c>s</c> characters and never matched <c>&lt;title&gt;</c>.
/// Only the optional <c>/</c> of a closing tag is a genuine slash.
/// </remarks>
private static readonly Regex FindTitle = new Regex(
@"<\s*/?title\s*>",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches a pair of <c>title</c> tags and captures the text between them, which may
/// span lines, in the named group <c>Content</c>.
/// </summary>
/// <remarks>
/// The <c>\s</c> and <c>\S</c> escapes had been flattened to <c>/s</c> and <c>/S</c>;
/// they are restored so that <c>[\s\S]*?</c> lazily matches any character.
/// </remarks>
private static readonly Regex FindTitleContent = new Regex(
@"<\s*/?title\s*>(?<Content>[\s\S]*?)<\s*/?title\s*>",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches an opening or closing <c>h</c> tag or <c>strong</c> tag, allowing whitespace
/// after <c>&lt;</c> and before <c>&gt;</c>; matching is case-insensitive.
/// </summary>
/// <remarks>
/// The whitespace escapes had been flattened to <c>/s</c> and are restored here.
/// NOTE(review): the pattern matches a bare <c>&lt;h&gt;</c> only, not <c>&lt;h1&gt;</c>
/// to <c>&lt;h6&gt;</c> — presumably headings are normalised before this runs; confirm.
/// </remarks>
private static readonly Regex FindHStrong = new Regex(
@"<\s*/?h\s*>|<\s*/?strong\s*>",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches opening or closing <c>p</c> and <c>tr</c> tags and <c>br</c> tags (with an
/// optional self-closing slash), allowing whitespace inside the angle brackets.
/// </summary>
/// <remarks>
/// The whitespace escapes had been flattened to <c>/s</c> and are restored here.
/// </remarks>
private static readonly Regex FindPB = new Regex(
@"<\s*/?p\s*>|<\s*br\s*/?>|<\s*/?tr\s*>",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches the text <c>&amp;nbsp</c> (case-insensitive). The trailing semicolon of the
/// entity is not part of the match.
/// </summary>
private static readonly Regex FindNbsp = new Regex(
    @"&nbsp",
    RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);

/// <summary>
/// Lazily captures, in the named group <c>Content</c>, the text that precedes a
/// <c>/</c> located at the end of a line.
/// </summary>
/// <remarks>
/// The <c>\s</c> and <c>\S</c> escapes inside the character class had been flattened to
/// <c>/s</c> and <c>/S</c>, so the class only matched the characters <c>/</c>, <c>s</c>
/// and <c>S</c>; they are restored here.
/// NOTE(review): the trailing <c>/$</c> is left untouched because it is ambiguous — it may
/// originally have been <c>\$</c> (a literal dollar-sign marker). Confirm against the
/// data this is applied to.
/// </remarks>
private static readonly Regex FindS = new Regex(
@"(?<Content>[\s\S]*?)/$",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches a single sentence-punctuation character (ASCII or full-width comma, full
/// stop, exclamation mark, semicolon, colon, question mark, ellipsis, book-title marks
/// or quotation marks). Text containing a match is treated as a regular sentence.
/// </summary>
private static readonly Regex IsSen = new Regex(
    @"[,.，。!！;；:：……？?《》“”""]",
    RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);

/// <summary>
/// Matches the literal marker <c>[(h)]</c>; used to spot junk sentences that contain
/// too many heading/strong markers.
/// </summary>
/// <remarks>
/// The escapes had been flattened to forward slashes, turning the pattern into
/// "a slash followed by one of <c>/ ( h )</c>". The backslashes are restored so that the
/// brackets and parentheses are matched literally.
/// </remarks>
private static readonly Regex IsWs = new Regex(
@"\[\(h\)\]",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches a single middle dot (<c>·</c>), full-width hyphen (<c>－</c>) or colon
/// (ASCII or full-width); used to spot junk sentences containing too many of them.
/// </summary>
/// <remarks>
/// The pattern previously began with a stray <c>/</c> in front of the first character
/// class, so a middle dot only matched when preceded by a slash, contradicting the
/// summary. NOTE(review): the stray character may originally have been a backslash
/// (giving a literal <c>[·]</c>); a bare middle dot is assumed to be the intent — confirm.
/// </remarks>
private static readonly Regex IsWsM = new Regex(
@"[·]|[－]|[：:]",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches typical forum (BBS) fingerprints: a floor label of the form
/// <c>第…楼</c> (1–50 characters between the two), or a "Powered by … Dvbbs" /
/// "Powered by … Discuz" footer; matching is case-insensitive.
/// </summary>
/// <remarks>
/// The <c>\s</c>/<c>\S</c> escapes had been flattened to <c>/s</c>/<c>/S</c>, so
/// "Powered by" never matched; they are restored here. The optional <c>/</c> before
/// <c>by</c> is kept as written.
/// </remarks>
private static readonly Regex IsBbsInfo = new Regex(
@"第[^楼]{1,50}楼|Powered\s*/?by[\s\S]*?Dvbbs|Powered\s*/?by[\s\S]*?Discuz",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Extracts the value of a <c>&lt;meta name="keywords" content="…"&gt;</c> tag into the
/// named group <c>KeyWords</c>. Both attribute orders (name first or content first) and
/// single, double or missing quotes are accepted; matching is case-insensitive.
/// </summary>
/// <remarks>
/// Two defects are fixed: the whitespace escapes had been flattened to <c>/s</c>, and
/// the verbatim string ended with a line break, which forced the content-first
/// alternative to be followed by a newline.
/// </remarks>
private static readonly Regex mKeyWord = new Regex(
@"<meta\s*name\s*=\s*['""]?keywords['""]?\s*content\s*=\s*['""]?(?<KeyWords>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<KeyWords>[^'"">]*)['""]?\s*name\s*=\s*['""]?keywords['""]?\s*[^>]*>",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Extracts the value of a <c>&lt;meta name="description" content="…"&gt;</c> tag into
/// the named group <c>description</c>. Both attribute orders (name first or content
/// first) and single, double or missing quotes are accepted; matching is case-insensitive.
/// </summary>
/// <remarks>
/// Two defects are fixed: the whitespace escapes had been flattened to <c>/s</c>, and
/// the verbatim string ended with a line break, which forced the content-first
/// alternative to be followed by a newline.
/// </remarks>
private static readonly Regex mDescription = new Regex(
@"<meta\s*name\s*=\s*['""]?description['""]?\s*content\s*=\s*['""]?(?<description>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<description>[^'"">]*)['""]?\s*name\s*=\s*['""]?description['""]?\s*[^>]*>",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Extracts the value of a <c>&lt;meta name="tagwords" content="…"&gt;</c> tag into the
/// named group <c>tagwords</c>. Both attribute orders (name first or content first) and
/// single, double or missing quotes are accepted; matching is case-insensitive.
/// </summary>
/// <remarks>
/// Two defects are fixed: the whitespace escapes had been flattened to <c>/s</c>, and
/// the verbatim string ended with a line break, which forced the content-first
/// alternative to be followed by a newline.
/// </remarks>
private static readonly Regex mTag = new Regex(
@"<meta\s*name\s*=\s*['""]?tagwords['""]?\s*content\s*=\s*['""]?(?<tagwords>[^'"">]*)['""]?[^>]*>|<meta\s*content\s*=\s*['""]?(?<tagwords>[^'"">]*)['""]?\s*name\s*=\s*['""]?tagwords['""]?\s*[^>]*>",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches a whole line that looks like a short "label: value" fragment: at most eight
/// characters that are neither whitespace nor <c>说</c>, then an ASCII or full-width
/// colon, then at most ten further characters up to the end of the line.
/// </summary>
/// <remarks>
/// The <c>\s</c> escape inside the negated class had been flattened to <c>/s</c>, which
/// excluded the characters <c>/</c> and <c>s</c> instead of whitespace; it is restored here.
/// NOTE(review): the original summary also mentioned the absence of "关于" after the
/// colon, but the pattern contains no such check.
/// </remarks>
private static readonly Regex IsWsMM = new Regex(
@"^[^说\s]{0,8}?[:：].{0,10}$",
RegexOptions.ExplicitCapture
| RegexOptions.Multiline
| RegexOptions.IgnoreCase);

/// <summary>
/// Matches the URL marker line written by the spider (<c>当前URL为:http://…</c>) and
/// captures everything after <c>http://</c> up to the end of the line in the named
/// group <c>URL</c>.
/// </summary>
private static readonly Regex txtUrl = new Regex(
    @"当前URL为:http://(?<URL>.*)",
    RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);

/// <summary>
/// Matches the anchor-description marker line written by the spider
/// (<c>当前链接描述为:…</c>) and captures the rest of the line in the named group
/// <c>Describe</c>.
/// </summary>
private static readonly Regex txtDescription = new Regex(
    @"当前链接描述为:(?<Describe>.*)",
    RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.ExplicitCapture);

// <summary>
// Extract the required <a> tags (disabled; kept for reference only).
// </summary>
//private static readonly Regex cleanFirst = new Regex(
// @"([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])(?<Robbish1><a\s+[^>]*>)[^<]{1,6}(?<Robbish2></a>)([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])", RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);

#endregion

yongping8204

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
C#版正文抽取所需正则全集

在正文抽取（正文提取）里一般会用到的正则，易尔译科技收集了一下，是C#版本的正文抽取正则表达式。欢迎大家补充。#region 相关正则表达式 /// /// 去掉所有html标签 /// private static readonly Regex FilterAll = new Regex( @"(/[([^=]*)(=[^/]]*)?/][/s/S]*?/[//1/])|(?(?=
复制链接

扫一扫