C#中使用正则表达式提取超链接地址

最新推荐文章于 2024-02-26 08:59:06 发布

hezudao25

最新推荐文章于 2024-02-26 08:59:06 发布

阅读量7.5k

点赞数 1

分类专栏： NET 文章标签：正则表达式 链接地址

本文链接：https://blog.csdn.net/hezudao25/article/details/9250517

版权

NET 专栏收录该内容

66 篇文章 0 订阅

订阅专栏

一般在做爬虫或者CMS的时候经常需要提取 href链接或者是src地址。此时可以使用正则表达式轻松完成。

方法一：

// Find every <a ...>...</a> element in yourStr and capture two named groups:
//   "url"  - the href attribute value (quoted or unquoted)
//   "text" - everything between the opening tag and its closing </a>
// (?is) turns on case-insensitive matching and lets '.' match newlines, so the
// link text may span several lines.
// Group 1 remembers the optional opening quote; \1 then requires the same
// character (or nothing) to close the attribute value.
// (?!</?a\b) stops the "text" group from running past a nested or following <a> tag.
Regex reg = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
MatchCollection mc = reg.Matches(yourStr);
foreach (Match m in mc)
{
    richTextBox2.Text += m.Groups["url"].Value + "\n";  // the href value
    richTextBox2.Text += m.Groups["text"].Value + "\n"; // the content between the tags
}

方法二：

// Walk through inputString one match at a time and report every href value
// together with the character index at which that value starts.
// Group 1 holds the attribute value: either the text between double quotes or,
// when the value is unquoted, the run of non-whitespace characters after '='.
Regex hrefPattern = new Regex(
    "href\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))",
    RegexOptions.IgnoreCase | RegexOptions.Compiled);
Match current = hrefPattern.Match(inputString);
while (current.Success)
{
    Group hrefValue = current.Groups[1];
    Console.WriteLine("Found href " + hrefValue + " at " + hrefValue.Index);
    current = current.NextMatch();
}

方法三：提取 img 标签的 src 地址


  
  // Print the src value of every <img> tag found in yourStr, one value per line.
  // Group 1 remembers the optional opening quote so that \1 can require the same
  // character (or nothing) after the value; the value itself is the "src" group.
  Regex imgPattern = new Regex(@"(?i)<img[^>]*?\ssrc\s*=\s*(['""]?)(?<src>[^'""\s>]+)\1[^>]*>");
  MatchCollection imgMatches = imgPattern.Matches(yourStr);
  for (int k = 0; k < imgMatches.Count; k++)
  {
      Console.Write(imgMatches[k].Groups["src"].Value + "\n");
  }


方法四：提取 img src（封装成方法）
  /// <summary>
  /// Collects the src attribute value of every img tag in an HTML string.
  /// </summary>
  /// <param name="htmlText">HTML text to scan.</param>
  /// <returns>
  /// An array with one src value per matched img tag, in the order the tags
  /// appear; an empty array when <paramref name="htmlText"/> is null or empty.
  /// </returns>
  public static string[] GetHtmlImageUrlList(string htmlText)
  {
      // Nothing to scan: return an empty array instead of passing null to Regex.Matches.
      if (string.IsNullOrEmpty(htmlText))
      {
          return new string[0];
      }

      // The named group "imgUrl" captures the attribute value up to the first
      // whitespace, quote or angle bracket; an optional quote may precede it.
      Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);

      // One Match per img tag found in the input.
      MatchCollection matches = regImg.Matches(htmlText);
      string[] sUrlList = new string[matches.Count];

      // Copy the captured src of each img tag into the result array.
      int i = 0;
      foreach (Match match in matches)
      {
          sUrlList[i++] = match.Groups["imgUrl"].Value;
      }

      return sUrlList;
  }