C#正则表达式抓取网页备忘录

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Spider.VO;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;

namespace CAIJI
{
    public class SiteGet
    {
        //<div class="fixList">.*?(((?'Open'<div[^>]*>).*?)+((?'-Open'</div>).*?)+)*(?(Open)(?!))</div>//正解
        private Regex asynchroRegex = new Regex(@"<a[^>]*>((?!<a)(?!</a>).)+</a>", RegexOptions.IgnoreCase);

        public List<ArticleLink> GetTitle(Page page)
        {
            string url = page.Url;
            string htmlBody;
            try
            {
                HttpWebRequest wr = WebRequest.Create(url) as HttpWebRequest;
                wr.Timeout = 3000;
                WebResponse response = wr.GetResponse();
                StreamReader sr = new StreamReader(response.GetResponseStream(), System.Text.Encoding.GetEncoding(page.Encod));
                //读取整个页面
                htmlBody = sr.ReadToEnd();
                sr.Close();
            }
            catch (Exception e)
            {
                throw e;
            }
            Regex re = new Regex(page.BeginTag+".*?(((?'Open'<div[^>]*>).*?)+((?'-Open'</div>).*?)+)*(?(Open)(?!))"+page.EndTag, RegexOptions.Singleline);
            Match ma = re.Match(htmlBody);
            htmlBody = ma.Value.Trim();
            MatchCollection mc = asynchroRegex.Matches(htmlBody);
            string aLink;
            string href;
            string innerHTML;
            ArticleLink al;
            List<ArticleLink> als = new List<ArticleLink>();
            for (int i = 0; i < mc.Count; i++)
            {
                al = new ArticleLink();
                aLink = mc[i].Value;//获取a标签
                //当A标签内的href地址是Javascript则跳过不采集
                if (Regex.IsMatch(aLink, @"\s*javascript\s*:.[^)]*.[^>]*>.*/a>"))
                {
                    continue;
                }
                href = Regex.Match(aLink, @"(?<=href\s*=)\s*[^\s]*").Value.Trim();//获取A标签的href地址

                innerHTML = Regex.Match(aLink, "(?<=>).*(?=<)").Value.Trim();//获取A标签内的文本

                //href地址校正
                if (href.IndexOf("'") == 0 || href.IndexOf("\"") == 0)
                {
                    href = href.Substring(0, href.Length - 1);
                }
                if (href == null || href.Equals("") || href.Equals("#") || innerHTML.Equals(""))
                {
                    continue;
                }
                al.LinkUrl = href;
                al.LinkTitle = innerHTML;
                als.Add(al);
            }
            return als;
           
        }
    }
}

 

结果:

 

图片

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值