正则表达式获取HTML标记中的内容(C#)
// =====================Begin1========================
// Sample input string.
string strTmp = string.Empty;
// Regular-expression pattern.
string tmpStr = string.Empty;
// Extract the value captured inside a piece of markup.
// RegexOptions.IgnoreCase makes matching case-insensitive; RegexOptions.Multiline
// only changes ^ and $ so that they match at the start/end of every line.
// tmpStr = "<title>([^<]*)</title>";  // would capture the text between <title> and </title>
strTmp = @" <add key='ConnectionString' value='server=localhost;database=数据库名;uid=sa;pwd=;pooling=true'/> " ;
// Capture the text between "database=" and the next ';'.
// The pattern must not be padded with spaces: the input has no space before
// "database=", so a padded pattern never matches.
tmpStr = "database=([^;]*);";
Match TitleMatch = Regex.Match(strTmp, tmpStr, RegexOptions.IgnoreCase | RegexOptions.Multiline);
// The next example is a syntax reference only; real code rarely needs a pattern
// this elaborate. Group 1 captures everything between '<' and '>' of the opening
// tag (tag name, the quoted color attribute and the size attribute with its digit).
strTmp = " ><font color='red' size=6>WebForm3</font>< " ;
// \w, \s and \d need their backslashes; without them they match the literal
// letters 'w', 's' and 'd'.
tmpStr = @"<(\w+\s+\w+[=]+[']+\w+[']+\s+[size=]+\d)>";
// Reuse the variable declared above (declaring it a second time does not compile);
// this replaces the result of the "database=" example.
TitleMatch = Regex.Match(strTmp, tmpStr, RegexOptions.IgnoreCase | RegexOptions.Multiline);
// Read the value of the first capture group.
string tmpStrTitle = TitleMatch.Groups[1].Value;
// Strip every HTML tag from the label text, then append the captured value.
Label1.Text = Regex.Replace(Label1.Text.Trim(), "<.+?>", "") + " ********* " + tmpStrTitle;
// Check whether a pattern matches at all. The test is run against the input
// string; the original tested the pattern variable (tmpStr) instead.
if (Regex.Match(strTmp, "<.+?>").Success)
{
    // Handle the successful match here.
}
// =====================End1========================
// =====================Begin2========================
string webDocContent = " <a href=http://www.xxx.xxx/college/pages/default.htm target=_blank>师资队伍</A> " ;
// Pattern notes: [\s]+ matches one or more whitespace characters after the tag
// name; (?<Link>[^\s>]+) captures the unquoted href value up to the next
// whitespace or '>'; (?<Text>[^<]*) captures the link text up to the closing tag.
// The backslashes are required: "[s]" and "[^s>]" would treat 's' as a literal
// letter and cut the URL at its first 's'.
string strPattern = @"a[\s]+href=(?<Link>[^\s>]+)[^>]*>(?<Text>[^<]*)</a>";
// Collect every link and write its URL and display text to the response.
MatchCollection Matches = Regex.Matches(webDocContent, strPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
foreach (Match NextMatch in Matches)
{
    // Group.Value is already a string, so no ToString() call is needed.
    string URL = NextMatch.Groups["Link"].Value.Trim();
    string URLText = NextMatch.Groups["Text"].Value.Trim();
    Response.Write(URL + "****");
    Response.Write(URLText);
}
// =====================End2========================
// =====================Begin3========================
string strPageContent = string.Empty;
// Read the whole saved page as GB2312 text. The using block closes the reader
// even when ReadToEnd throws.
using (StreamReader srPage = new StreamReader(@"e:\save.txt", System.Text.Encoding.GetEncoding("gb2312")))
{
    strPageContent = srPage.ReadToEnd();
}
// (\s)* matches zero or more whitespace characters (spaces, line breaks, ...);
// (.*?) lazily matches any characters except a newline. Alternating the two lets
// the cell content span several lines. The attribute quotes are doubled because
// the pattern is a verbatim string literal.
MatchCollection TitleMatchs = Regex.Matches(strPageContent, @"<td width=""85%"" class=""common_text"">((\s)*(.*?)(\s)*(.*?)(\s)*(.*?)(\s)*(.*?)(\s)*)</td>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
int tmpNum = 0;
// Append every matched cell's content to the label, prefixed by a running number.
foreach (Match NextMatch in TitleMatchs)
{
    ++tmpNum;
    Label1.Text += tmpNum + "<br>****" + NextMatch.Groups[1].Value;
}
// =====================End3========================
// NOTE(review): this section repeats the Begin1 example that appears earlier in
// the file and declares the same locals, so the two copies cannot share one scope.
// Sample input string.
string strTmp = string.Empty;
// Regular-expression pattern.
string tmpStr = string.Empty;
// Extract the value captured inside a piece of markup.
// RegexOptions.IgnoreCase makes matching case-insensitive; RegexOptions.Multiline
// only changes ^ and $ so that they match at the start/end of every line.
// tmpStr = "<title>([^<]*)</title>";  // would capture the text between <title> and </title>
strTmp = @" <add key='ConnectionString' value='server=localhost;database=数据库名;uid=sa;pwd=;pooling=true'/> " ;
// Capture the text between "database=" and the next ';'.
// The pattern must not be padded with spaces: the input has no space before
// "database=", so a padded pattern never matches.
tmpStr = "database=([^;]*);";
Match TitleMatch = Regex.Match(strTmp, tmpStr, RegexOptions.IgnoreCase | RegexOptions.Multiline);
// The next example is a syntax reference only; real code rarely needs a pattern
// this elaborate. Group 1 captures everything between '<' and '>' of the opening
// tag (tag name, the quoted color attribute and the size attribute with its digit).
strTmp = " ><font color='red' size=6>WebForm3</font>< " ;
// \w, \s and \d need their backslashes; without them they match the literal
// letters 'w', 's' and 'd'.
tmpStr = @"<(\w+\s+\w+[=]+[']+\w+[']+\s+[size=]+\d)>";
// Reuse the variable declared above (declaring it a second time does not compile);
// this replaces the result of the "database=" example.
TitleMatch = Regex.Match(strTmp, tmpStr, RegexOptions.IgnoreCase | RegexOptions.Multiline);
// Read the value of the first capture group.
string tmpStrTitle = TitleMatch.Groups[1].Value;
// Strip every HTML tag from the label text, then append the captured value.
Label1.Text = Regex.Replace(Label1.Text.Trim(), "<.+?>", "") + " ********* " + tmpStrTitle;
// Check whether a pattern matches at all. The test is run against the input
// string; the original tested the pattern variable (tmpStr) instead.
if (Regex.Match(strTmp, "<.+?>").Success)
{
    // Handle the successful match here.
}
// =====================End1========================
// =====================Begin2========================
// NOTE(review): this section repeats the Begin2 example that appears earlier in
// the file and declares the same locals, so the two copies cannot share one scope.
string webDocContent = " <a href=http://www.xxx.xxx/college/pages/default.htm target=_blank>师资队伍</A> " ;
// Pattern notes: [\s]+ matches one or more whitespace characters after the tag
// name; (?<Link>[^\s>]+) captures the unquoted href value up to the next
// whitespace or '>'; (?<Text>[^<]*) captures the link text up to the closing tag.
// The backslashes are required: "[s]" and "[^s>]" would treat 's' as a literal
// letter and cut the URL at its first 's'.
string strPattern = @"a[\s]+href=(?<Link>[^\s>]+)[^>]*>(?<Text>[^<]*)</a>";
// Collect every link and write its URL and display text to the response.
MatchCollection Matches = Regex.Matches(webDocContent, strPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
foreach (Match NextMatch in Matches)
{
    // Group.Value is already a string, so no ToString() call is needed.
    string URL = NextMatch.Groups["Link"].Value.Trim();
    string URLText = NextMatch.Groups["Text"].Value.Trim();
    Response.Write(URL + "****");
    Response.Write(URLText);
}
// =====================End2========================
// =====================Begin3========================
// NOTE(review): this section repeats the Begin3 example that appears earlier in
// the file and declares the same locals, so the two copies cannot share one scope.
string strPageContent = string.Empty;
// Read the whole saved page as GB2312 text. The using block closes the reader
// even when ReadToEnd throws.
using (StreamReader srPage = new StreamReader(@"e:\save.txt", System.Text.Encoding.GetEncoding("gb2312")))
{
    strPageContent = srPage.ReadToEnd();
}
// (\s)* matches zero or more whitespace characters (spaces, line breaks, ...);
// (.*?) lazily matches any characters except a newline. Alternating the two lets
// the cell content span several lines. The attribute quotes are doubled because
// the pattern is a verbatim string literal.
MatchCollection TitleMatchs = Regex.Matches(strPageContent, @"<td width=""85%"" class=""common_text"">((\s)*(.*?)(\s)*(.*?)(\s)*(.*?)(\s)*(.*?)(\s)*)</td>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
int tmpNum = 0;
// Append every matched cell's content to the label, prefixed by a running number.
foreach (Match NextMatch in TitleMatchs)
{
    ++tmpNum;
    Label1.Text += tmpNum + "<br>****" + NextMatch.Groups[1].Value;
}
// =====================End3========================