C#对Ajax页面内容采集

/// <summary>
/// Loads a URL in a <see cref="WebBrowser"/> control and reports the page text once the
/// document has completed and its content length has stopped changing (or the timeout elapsed).
/// </summary>
/// <remarks>
/// NOTE(review): WebBrowser is normally required to live on an STA thread that pumps
/// messages; every cross-thread access here goes through Control.Invoke - confirm the
/// calling thread keeps pumping until <see cref="SnatchCompleted"/> has been raised.
/// </remarks>
public class PageSnatch
    {
        #region field
        // Browser control of the navigation in progress; null when no navigation is running.
        private WebBrowser browser;
        /// <summary>
        /// Maximum time in milliseconds to wait for the content to settle (defaults to 5 s).
        /// </summary>
        private int timeout;
        private const int defaultTimeout = 5 * 1000;
        #endregion
        #region ctor
        /// <summary>
        /// Creates an idle instance using the default timeout.
        /// </summary>
        public PageSnatch()
        {
            this.IsBusy = false;  // idle: a new navigation may be started
            this.timeout = defaultTimeout;
        }
        /// <summary>
        /// Creates an idle instance bound to <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Address to load.</param>
        public PageSnatch(string url)
            : this()
        {
            this.Url = url;
        }
        /// <summary>
        /// Creates an idle instance bound to <paramref name="url"/> with an explicit timeout.
        /// </summary>
        /// <param name="url">Address to load.</param>
        /// <param name="timeout">
        /// Settle timeout in milliseconds. Assigned directly, so the minimum enforced by the
        /// <see cref="Timeout"/> setter is not applied here.
        /// </param>
        public PageSnatch(string url, int timeout)
            : this(url)
        {
            this.timeout = timeout;
        }
        #endregion
        #region event
        /// <summary>
        /// Raised from the browser's Navigated event, i.e. when it has navigated to a new
        /// document and started loading it.
        /// </summary>
        public event SnatchingEventHandler Snatching;
        /// <summary>
        /// Raised from the browser's DocumentCompleted event (once per completed document or frame).
        /// </summary>
        public event SnatchedEventHandler Snatched;
        /// <summary>
        /// Raised on the background worker thread when the wait for asynchronously loaded
        /// content ends (settled or timed out). Not raised when <see cref="Cancel"/> is set.
        /// </summary>
        public event SnatchCompletedEventHandler SnatchCompleted;
        #endregion
        #region property
        /// <summary>
        /// Gets or sets whether the running navigation should be abandoned. The flag is never
        /// reset by this class; callers that reuse the instance must clear it themselves.
        /// </summary>
        public bool Cancel { set; get; }
        /// <summary>
        /// Gets whether a navigation is currently in progress.
        /// </summary>
        public bool IsBusy { private set; get; }
        /// <summary>
        /// Checks <see cref="Url"/> against the URL pattern. A null or empty URL is invalid.
        /// </summary>
        /// <remarks>
        /// The pattern is not anchored, so any string containing an http(s)://host.tld
        /// fragment passes. NOTE(review): the trailing optional group looks garbled
        /// (a '/' was probably intended where '\' appears) - confirm before tightening it.
        /// </remarks>
        private bool IsValidate
        {
            get
            {
                return !string.IsNullOrEmpty(Url)
                    && Regex.IsMatch(Url, @"http(s)?://([\w-]+\.)+[\w-]+(\[\w- .\?%&=]*)?");
            }
        }
        /// <summary>
        /// Reads the current page text on the browser's thread. Despite its name this returns
        /// the body's InnerText; it never returns null. Returns an empty string when the
        /// browser is unavailable and a marker string when the document has no body.
        /// </summary>
        private string OuterHtml
        {
            get
            {
                string text = string.Empty;
                this.Execute(delegate()
                {
                    HtmlDocument doc = browser.Document;
                    if (doc == null || doc.Body == null)
                    {
                        text = " document body null";
                    }
                    else
                    {
                        // InnerText may be null for an empty body; callers read .Length.
                        text = doc.Body.InnerText ?? string.Empty;
                    }
                });
                return text;
            }
        }
        /// <summary>
        /// Gets or sets the settle timeout in milliseconds. Values below the 5 s default are
        /// ignored by the setter.
        /// </summary>
        public int Timeout
        {
            get { return this.timeout; }
            set
            {
                // ">=" so the documented minimum itself can be restored after a larger value.
                if (value >= defaultTimeout) this.timeout = value;
            }
        }
        /// <summary>
        /// Gets or sets the URL of the document to load.
        /// </summary>
        public string Url { set; get; }
        #endregion
        #region methods
        /// <summary>
        /// Stops and disposes the browser control on the thread that owns it.
        /// </summary>
        private void Dispose()
        {
            this.Execute(delegate()
            {
                browser.Stop();
                browser.Dispose();
                browser = null;
            });
        }
        /// <summary>
        /// Runs <paramref name="browserEventHanler"/> on the browser's thread via Control.Invoke.
        /// Does nothing when there is no browser or its window handle has not been created.
        /// </summary>
        /// <param name="browserEventHanler">Callback to run on the browser's thread.</param>
        private void Execute(BrowserEventHandler browserEventHanler)
        {
            // Snapshot the field so the null check and the Invoke use the same instance.
            WebBrowser current = this.browser;
            if (current != null && current.IsHandleCreated)
            {
                current.Invoke(browserEventHanler);
            }
        }
        /// <summary>
        /// Loads <see cref="Url"/>; the completion argument is DBNull.Value.
        /// </summary>
        public void Navigate()
        {
            this.Navigate((object)DBNull.Value);
        }
        /// <summary>
        /// Sets <see cref="Url"/> and loads it; the completion argument is DBNull.Value.
        /// </summary>
        /// <param name="url">Address to load.</param>
        public void Navigate(string url)
        {
            this.Url = url;
            this.Navigate((object)DBNull.Value);
        }
        /// <summary>
        /// Loads <see cref="Url"/> and starts a background worker that waits for the document
        /// to complete and for its text length to stay unchanged over several polls, then
        /// raises <see cref="SnatchCompleted"/>.
        /// </summary>
        /// <param name="argument">Caller data handed back through the event arguments.</param>
        /// <exception cref="InvalidOperationException">A navigation is already running.</exception>
        /// <exception cref="ArgumentException"><see cref="Url"/> is null, empty or does not match the URL pattern.</exception>
        public void Navigate(object argument)
        {
            if (this.IsBusy) throw new InvalidOperationException("This document is busy!");
            if (!this.IsValidate) throw new ArgumentException("This url is wrong!");
            const int interval = 500;   // polling period in milliseconds
            const int count = 6;        // consecutive unchanged polls that count as "settled"
            this.IsBusy = true;
            bool completed = false;
            SnatchCompletedEventArgs scea = new SnatchCompletedEventArgs();
            scea.Argument = argument;
            try
            {
                // Closures capture this local rather than the field, which Dispose() nulls.
                WebBrowser current = new WebBrowser();
                this.browser = current;
                current.ScriptErrorsSuppressed = true;
                current.Navigated += delegate(object sender, WebBrowserNavigatedEventArgs e)
                {
                    SnatchingEventHandler snatching = this.Snatching;
                    if (snatching != null)
                    {
                        SnatchingEventArgs sea = new SnatchingEventArgs();
                        sea.Argument = argument;
                        sea.Url = e.Url;
                        snatching(this, sea);
                    }
                };
                current.DocumentCompleted += delegate(object sender, WebBrowserDocumentCompletedEventArgs e)
                {
                    HtmlDocument doc = current.Document;
                    scea.Url = e.Url;
                    scea.Document = doc;
                    if (doc != null)
                    {
                        // A document without a body would otherwise throw here and leave
                        // the worker waiting forever for "completed".
                        if (doc.Body != null)
                        {
                            scea.Text = doc.Body.OuterHtml;
                        }
                        string documentUrl = doc.Url != null ? doc.Url.ToString() : string.Empty;
                        string eventUrl = e.Url != null ? e.Url.ToString() : string.Empty;
                        // The event also fires for frames; the page is complete when the event
                        // URL is the document URL, or when the browser shows one of its own
                        // res://ieframe.dll pages (e.g. navcancl.htm). Latched: a later frame
                        // event must not flip it back to false.
                        if (documentUrl.Equals(eventUrl) || documentUrl.Contains("res://ieframe.dll"))
                        {
                            completed = true;
                        }
                    }
                    SnatchedEventHandler snatched = this.Snatched;
                    if (snatched != null)
                    {
                        SnatchedEventArgs sea = new SnatchedEventArgs();
                        sea.Url = e.Url;
                        snatched(this, sea);
                    }
                };
                current.Navigate(Url);
                BackgroundWorker worker = new BackgroundWorker();
                worker.DoWork += delegate(object obj, DoWorkEventArgs dow)
                {
                    try
                    {
                        // Phase 1: wait for the document itself. This phase has no timeout.
                        while (!completed && !Cancel)
                        {
                            Application.DoEvents();
                            System.Threading.Thread.Sleep(interval);
                        }
                        // Phase 2: wait for script-driven content to settle.
                        int index = 0;   // consecutive polls with an unchanged length
                        int length = 0;  // text length seen at the previous idle poll
                        DateTime startTime = DateTime.UtcNow;  // UTC: immune to clock/DST shifts
                        while (!Cancel)
                        {
                            System.Threading.Thread.Sleep(interval);
                            double t = Math.Ceiling((DateTime.UtcNow - startTime).TotalMilliseconds);
                            if (t >= this.Timeout)
                            {
                                scea.Error = new Exception("Visiting about new exception delay, since the setting is timeout");
                                break;
                            }
                            // Treated as busy when the browser cannot be queried.
                            bool browserBusy = true;
                            this.Execute(delegate() { browserBusy = current.IsBusy; });
                            if (!browserBusy)
                            {
                                int len = this.OuterHtml.Length;
                                if (len == length) { index++; }
                                else { index = 0; length = len; }
                                if (index == count) { break; }
                            }
                        }
                        if (!Cancel)
                        {
                            SnatchCompletedEventHandler snatchCompleted = this.SnatchCompleted;
                            if (snatchCompleted != null)
                            {
                                scea.TextAsync = this.OuterHtml;
                                scea.Timeout = (int)Math.Ceiling((DateTime.UtcNow - startTime).TotalMilliseconds); // time spent settling
                                snatchCompleted(this, scea);
                            }
                        }
                    }
                    finally
                    {
                        // Always release the browser and the busy flag, even on timeout,
                        // cancel or a throwing handler, so the instance can be reused.
                        try { this.Dispose(); }
                        finally { this.IsBusy = false; }
                    }
                };
                worker.RunWorkerAsync();
            }
            catch (Exception)
            {
                // Start-up failed on the calling thread: undo the partial state, then
                // rethrow with "throw;" so the original stack trace is preserved.
                WebBrowser failed = this.browser;
                this.browser = null;
                if (failed != null) failed.Dispose();
                this.IsBusy = false;
                throw;
            }
        }
        /// <summary>
        /// Sets <see cref="Url"/> and loads it.
        /// </summary>
        /// <param name="url">Address to load.</param>
        /// <param name="argument">Caller data handed back through the event arguments.</param>
        public void Navigate(string url, object argument)
        {
            this.Url = url;
            this.Navigate(argument);
        }
        /// <summary>
        /// Sets <see cref="Url"/> and the timeout, then loads the URL.
        /// </summary>
        /// <param name="url">Address to load.</param>
        /// <param name="argument">Caller data handed back through the event arguments.</param>
        /// <param name="timeout">
        /// Settle timeout in milliseconds. Assigned directly, so the minimum enforced by the
        /// <see cref="Timeout"/> setter is not applied here.
        /// </param>
        public void Navigate(string url, object argument, int timeout)
        {
            this.Url = url;
            this.timeout = timeout;
            this.Navigate(argument);
        }
        #endregion
    }
    #region delegate 
    /// <summary>
    /// Parameterless callback used internally; PageSnatch.Execute passes it to
    /// WebBrowser.Invoke so the callback runs on the browser's thread.
    /// </summary>
    delegate void BrowserEventHandler(); 
    /// <summary>
    /// Represents the method that handles the Yyc.Net.PageSnatch.Snatching event
    /// of the Yyc.Net.PageSnatch class.
    /// </summary>
    /// <param name="sender">Event source: Yyc.Net.PageSnatch</param>
    /// <param name="e">Event data: Yyc.Net.SnatchingEventArgs</param>
    public delegate void SnatchingEventHandler(object sender, SnatchingEventArgs e); 
    /// <summary>
    /// Represents the method that handles the Yyc.Net.PageSnatch.Snatched event
    /// of the Yyc.Net.PageSnatch class.
    /// </summary>
    /// <param name="sender">Event source: Yyc.Net.PageSnatch</param>
    /// <param name="e">Event data: Yyc.Net.SnatchedEventArgs</param>
    public delegate void SnatchedEventHandler(object sender, SnatchedEventArgs e); 
    /// <summary>
    /// Represents the method that handles the Yyc.Net.PageSnatch.SnatchCompleted event
    /// of the Yyc.Net.PageSnatch class.
    /// </summary>
    /// <param name="sender">Event source: Yyc.Net.PageSnatch</param>
    /// <param name="e">Event data: Yyc.Net.SnatchCompletedEventArgs</param>
    public delegate void SnatchCompletedEventHandler(object sender, SnatchCompletedEventArgs e); 
    #endregion 
    #region model 
    /// <summary>    
    /// 为 Yyc.Net.PageSnatch.Snatching 事件提供数据    
    /// </summary>    
    public class SnatchingEventArgs 
    { 
        /// <summary>    
        /// 事件处理程序中执行的后台操作使用的参数    
        /// </summary>    
        public object Argument { set; get; } 
        /// <summary>    
        /// 获取当前导航到的文档位置    
        /// </summary>    
        public Uri Url { set; get; } 
    } 
    /// <summary>    
    /// 为 Yyc.Net.PageSnatch.SnatchedEventArgs 事件提供数据    
    /// </summary>    
    public class SnatchedEventArgs 
    { 
        /// <summary>    
        /// 获取当前导航到的文档位置    
        /// </summary>    
        public Uri Url { set; get; } 
    } 
    /// <summary>    
    /// 为 Yyc.Net.PageSnatch.SnatchCompleted 事件提供数据    
    /// </summary>    
    public class SnatchCompletedEventArgs 
    { 
        /// <summary>    
        /// 获取或设置静态源文本    
        /// </summary>    
        public string Text { set; get; } 
        /// <summary>    
        /// 获取或设置异步网页源文本    
        /// </summary>    
        public string TextAsync { set; get; } 
        /// <summary>    
        /// 获取或设置加载文档异常    
        /// </summary>    
        public Exception Error { set; get; } 
        /// <summary>    
        /// 获取或设置加载异步验证延时(ms)    
        /// </summary>    
        public int Timeout { set; get; } 
        /// <summary>    
        /// 事件处理程序中执行的后台操作使用的参数    
        /// </summary>    
        public object Argument { set; get; } 
        /// <summary>    
        /// 获取当前导航到的文档位置    
        /// </summary>    
        public Uri Url { set; get; } 
    
        public HtmlDocument Document { set; get; } 
    } 
    #endregion

此类使用方法

// Create the fetcher and subscribe before starting, so the completion is not missed.
PageSnatch p = new PageSnatch();
p.SnatchCompleted += new SnatchCompletedEventHandler(CompletedEventHandler);
// Configure the address and settle timeout from the enclosing object.
p.Url = this.Url;
p.Timeout = this.Timeout;
// Start loading; CompletedEventHandler is called once the wait has finished.
p.Navigate(this.Url);

为了使用起来方便,在此基础上扩展了一个MyWebBrowser类,代码如下:

 

/// <summary>
/// Convenience wrapper around PageSnatch: starts a fetch and records the result of its
/// SnatchCompleted event so that a caller can poll <see cref="isCompleted"/>.
/// </summary>
public class MyWebBrowser
    {
        // Written by the completion handler and polled from another thread, hence volatile.
        private volatile bool completed;

        /// <summary>
        /// Gets or sets whether the fetch has finished and <see cref="SnatchObj"/> is available.
        /// </summary>
        public bool isCompleted
        {
            get { return this.completed; }
            set { this.completed = value; }
        }
        /// <summary>
        /// Gets or sets the event data of the finished fetch; null until it has completed.
        /// </summary>
        public SnatchCompletedEventArgs SnatchObj { get; set; }
        /// <summary>
        /// Gets or sets the address to load.
        /// </summary>
        public string Url { get; set; }
        /// <summary>
        /// Gets or sets the settle timeout in milliseconds handed to PageSnatch.Timeout.
        /// </summary>
        public int Timeout { get; set; }
        /// <summary>
        /// Creates an idle wrapper. No URL is known at this point, so nothing is loaded;
        /// set <see cref="Url"/> (and optionally <see cref="Timeout"/>) and call
        /// <see cref="Navigate()"/>.
        /// </summary>
        public MyWebBrowser()
        {
            this.SnatchObj = null;
            this.isCompleted = false;
        }
        /// <summary>
        /// Creates the wrapper and immediately starts loading <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Address to load.</param>
        /// <param name="timeout">Settle timeout in milliseconds.</param>
        /// <param name="argument">Caller data handed back through the event arguments.</param>
        public MyWebBrowser(string url, int timeout, object argument)
        {
            this.Url = url;
            this.Timeout = timeout;
            this.Navigate(argument);
        }
        /// <summary>
        /// Starts loading <see cref="Url"/>; the completion argument is DBNull.Value.
        /// </summary>
        public void Navigate()
        {
            this.Navigate(DBNull.Value);
        }
        /// <summary>
        /// Resets the result state and starts loading <see cref="Url"/> with a new PageSnatch.
        /// </summary>
        /// <param name="argument">Caller data handed back through the event arguments.</param>
        public void Navigate(object argument)
        {
            this.SnatchObj = null;
            this.isCompleted = false;
            var p = new PageSnatch();
            p.Timeout = this.Timeout;
            p.SnatchCompleted += new SnatchCompletedEventHandler(CompletedEventHandler);
            p.Navigate(this.Url, argument);
        }
        /// <summary>
        /// Stores the result of the fetch and marks it as completed.
        /// </summary>
        /// <param name="sender">The PageSnatch that raised the event.</param>
        /// <param name="e">Result of the fetch.</param>
        public void CompletedEventHandler(object sender, SnatchCompletedEventArgs e)
        {
            // Publish the result before the flag so a poller that sees the flag sees the data.
            this.SnatchObj = e;
            this.isCompleted = true;
        }
    }

直接调用方式

 

/// <summary>
/// Loads <paramref name="url"/> and blocks, pumping window messages, until the fetch
/// has reported completion.
/// </summary>
/// <param name="url">Address to load.</param>
/// <param name="timeOut">Settle timeout in milliseconds handed to the fetcher.</param>
/// <returns>The event data recorded by the completion handler.</returns>
public static SnatchCompletedEventArgs GetAsnyPageDocument(string url, int timeOut)
{
    MyWebBrowser fetcher = new MyWebBrowser(url, timeOut, null);
    for (; ; )
    {
        if (fetcher.isCompleted)
        {
            return fetcher.SnatchObj;
        }
        // Keep this thread's message queue moving while waiting, then back off briefly.
        Application.DoEvents();
        System.Threading.Thread.Sleep(50);
    }
}

有的页面加载时间太长,可以在这个方法中设置请求超时时间,来解决页面长时间无法响应问题。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值