.net抓取网页信息

.net抓取网页信息

废话

正则表达式就是个坑:学了之后不常用,很快就忘光了;可是编码过程中一旦遇上,又是一个绕不开的大坑。(题外话:偷偷喜欢一个姑娘。)

上代码

主体方法之外的抓取方法和工具方法本可以单独拆分到几个类里,这里为了省事,全部写在了同一个类中。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.Services;
using System.Collections;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
using System.Text;

/// <summary>
/// Web service that scrapes the WTO/TBT and WTO/SPS notification listings
/// published on gdtbt.org.cn and builds <c>Model.wsn_tbt_sps</c> records from them.
/// </summary>
[WebService(Namespace = "http://tempuri.org/")]
[WebServiceBinding(ConformsTo = WsiProfiles.BasicProfile1_1)]
// To allow this web service to be called from script using ASP.NET AJAX, uncomment the following line.
// [System.Web.Script.Services.ScriptService]
public class zzkPaChong : System.Web.Services.WebService {

    // Base address that the relative links found in the list pages are appended to.
    private const string SiteRoot = "http://gdtbt.org.cn/";

    // Value returned by GetHTML/GetHTML2312 when the download fails.
    // It matches none of the patterns below, so callers treat it as "nothing found".
    private const string DownloadFailedSentinel = "1";

    // The patterns are built once and shared; Regex instances are safe to use
    // concurrently for matching, and rebuilding them per row/cell is wasted work.
    private static readonly Regex ListTableRegex = new Regex(@"<table[^>]*(class=""tablelist"")[^>]*>[\s\S]*?</table>");
    private static readonly Regex RowRegex = new Regex(@"<tr[^>]*>(?<ww>[\s\S]*?)</tr>");
    private static readonly Regex CellRegex = new Regex(@"<td[^>]*>(?<ww>[\s\S]*?)</td>");
    private static readonly Regex NumberHrefRegex = new Regex(@"(?is)<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*class=""tbh"">");
    private static readonly Regex NumberTextRegex = new Regex(@"(?is)<a[^>]*class=""tbh"">(?<ww>[\s\S]*?)</a>");
    private static readonly Regex TitleTextRegex = new Regex(@"(?is)<a[^>]*class=""tbbt"">(?<ww>[\s\S]*?)</a>");
    // Balancing groups keep nested <div> elements paired so the match ends at the
    // closing tag that belongs to the opening <div class="tb1...">.
    private static readonly Regex DetailBodyRegex = new Regex(@"(?is)<div class=""tb1[^>]*>(?><div[^>]*>(?<o>)|</div>(?<-o>)|(?:(?!</?div\b).)*)*(?(o)(?!))</div>");

    public zzkPaChong () {

        // Uncomment the following line if using designed components.
        //InitializeComponent(); 
    }

    /// <summary>
    /// Template smoke-test method.
    /// </summary>
    /// <returns>The fixed string "Hello World".</returns>
    [WebMethod]
    public string HelloWorld() {
        return "Hello World";
    }

    /// <summary>
    /// Runs one full crawl and reports completion.
    /// </summary>
    /// <returns>A fixed JSON status string; it is returned whenever the crawl did not throw.</returns>
    [WebMethod]
    public string zzkPaChongTest()
    {
        tbt_sps();
        return "{\"status\":1, \"msg\":\"调用完成!\"}";
    }

    #region Main entry point
    /// <summary>
    /// Crawls the TBT and SPS list pages of gdtbt.org.cn, downloads the detail page
    /// of every listed notification and fills a <c>Model.wsn_tbt_sps</c> for each one.
    /// </summary>
    /// <remarks>
    /// Rows that failed to parse, or that carry no detail link, are skipped instead of
    /// aborting the crawl. A date cell that cannot be parsed leaves <c>TBDate</c> unset.
    /// </remarks>
    public void tbt_sps()
    {
        // Kept so that persistence can be switched back on without further wiring; see the TODO below.
        BLL.wsn_tbt_sps bwts = new BLL.wsn_tbt_sps();

        Dictionary<string, string> listPages = new Dictionary<string, string>();
        listPages.Add("WTO/TBT", "http://gdtbt.org.cn/tbt.aspx");
        listPages.Add("WTO/SPS", "http://gdtbt.org.cn/sps.aspx");

        foreach (KeyValuePair<string, string> page in listPages)
        {
            foreach (Hashtable row in ListHtml(page.Value))
            {
                object status = row["status"];
                if (status == null || status.ToString() != "1")
                {
                    continue;
                }

                // A row of <td> cells without a "tbh" link has no detail page to fetch.
                object relativeUrl = row["ziurl"];
                if (relativeUrl == null)
                {
                    continue;
                }

                // TODO: before inserting, check whether TBNumber already exists locally.
                // Use a parameterized query for that check; never concatenate the scraped
                // number into the WHERE clause.
                Model.wsn_tbt_sps mwts = new Model.wsn_tbt_sps();

                if (row["TBTitle"] != null)
                {
                    mwts.TBTitle = row["TBTitle"].ToString();
                }
                if (row["TBChengYuan"] != null)
                {
                    mwts.TBChengYuan = row["TBChengYuan"].ToString();
                }

                // TryParse instead of Convert.ToDateTime: an empty or malformed date
                // cell must not take down the whole crawl.
                DateTime notifiedOn;
                if (row["TBDate"] != null && DateTime.TryParse(row["TBDate"].ToString(), out notifiedOn))
                {
                    mwts.TBDate = notifiedOn;
                }

                if (row["TBNumber"] != null)
                {
                    mwts.TBNumber = row["TBNumber"].ToString();
                    // The notification number carries the type; anything that is not TBT is filed as SPS.
                    mwts.TBType = mwts.TBNumber.IndexOf("TBT", StringComparison.Ordinal) >= 0 ? "TBT" : "SPS";
                }

                mwts.TBbody = details(SiteRoot + relativeUrl.ToString());
                mwts.zq_Time = DateTime.Now;
                mwts.state = 0;

                // TODO: persistence is disabled in this sample; re-enable with bwts.Add(mwts).
            }
        }
    }
    #endregion

    #region Page scraping
    /// <summary>
    /// Downloads a list page and extracts one entry per table row.
    /// </summary>
    /// <param name="url">Absolute URL of the list page, including the http:// scheme.</param>
    /// <returns>
    /// One <see cref="Hashtable"/> per parsed row with "status" = "1" and, where found,
    /// the keys "ziurl", "TBNumber", "TBTitle", "TBChengYuan" and "TBDate".
    /// If the download or the parsing fails, the rows parsed so far are kept and a final
    /// entry with "status" = "0" and the error text under "msg" is appended.
    /// </returns>
    public List<Hashtable> ListHtml(string url)
    {
        List<Hashtable> rows = new List<Hashtable>();
        try
        {
            // Download without the sentinel fallback so that the real network error
            // ends up in the "msg" of the failure entry.
            string htmlcode = DownloadText(url, Encoding.UTF8);

            Match table = ListTableRegex.Match(htmlcode);
            if (!table.Success)
            {
                rows.Add(FailureRow("No <table class=\"tablelist\"> element was found at " + url));
                return rows;
            }

            foreach (Match rowMatch in RowRegex.Matches(table.Value))
            {
                MatchCollection cells = CellRegex.Matches(rowMatch.Groups["ww"].Value.Trim());
                if (cells.Count > 0)
                {
                    rows.Add(ParseRow(cells));
                }
            }
        }
        catch (Exception ex)
        {
            rows.Add(FailureRow(ex.Message));
        }
        return rows;
    }

    /// <summary>
    /// Downloads a detail page and extracts the notification body.
    /// </summary>
    /// <param name="url">Absolute URL of the detail page, including the http:// scheme.</param>
    /// <returns>
    /// The trimmed HTML of the first <c>&lt;div class="tb1..."&gt;</c> element including
    /// its nested divs, or an empty string when the page could not be downloaded or
    /// contains no such element.
    /// </returns>
    public string details(string url)
    {
        string htmlcode = GetHTML(url);
        Match body = DetailBodyRegex.Match(htmlcode);
        return body.Success ? body.Value.Trim() : "";
    }

    // Builds the entry for one table row from its <td> cells (number/link, title, member, date).
    private static Hashtable ParseRow(MatchCollection cells)
    {
        Hashtable row = new Hashtable();
        row["status"] = "1";

        for (int i = 0; i < cells.Count && i < 4; i++)
        {
            string cell = cells[i].Groups["ww"].Value;
            switch (i)
            {
                case 0:
                    SetFromFirstMatch(row, "ziurl", NumberHrefRegex, "href", cell);
                    SetFromFirstMatch(row, "TBNumber", NumberTextRegex, "ww", cell);
                    break;
                case 1:
                    SetFromFirstMatch(row, "TBTitle", TitleTextRegex, "ww", cell);
                    break;
                case 2:
                    row["TBChengYuan"] = cell.Trim();
                    break;
                case 3:
                    row["TBDate"] = cell.Trim();
                    break;
            }
        }
        return row;
    }

    // Stores the trimmed named group of the first match under key; later matches are
    // ignored because adding the same key twice to a Hashtable throws.
    private static void SetFromFirstMatch(Hashtable row, string key, Regex pattern, string groupName, string input)
    {
        Match match = pattern.Match(input);
        if (match.Success)
        {
            row[key] = match.Groups[groupName].Value.Trim();
        }
    }

    // Builds the "status" = "0" entry that reports a failure to the caller of ListHtml.
    private static Hashtable FailureRow(string message)
    {
        Hashtable row = new Hashtable();
        row["status"] = "0";
        row["msg"] = message;
        return row;
    }
    #endregion

    #region Download helpers
    /// <summary>
    /// Downloads a page and decodes it as UTF-8.
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <returns>The page text, or the string "1" when the download failed.</returns>
    private string GetHTML(string url)
    {
        return DownloadTextOrSentinel(url, "UTF-8");
    }

    /// <summary>
    /// Downloads a page and decodes it as gb2312.
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <returns>The page text, or the string "1" when the download failed.</returns>
    private string GetHTML2312(string url)
    {
        return DownloadTextOrSentinel(url, "gb2312");
    }

    /// <summary>
    /// Alternative downloader for links the WebRequest-based helpers cannot read;
    /// it sends a browser-like user-agent header and decodes the response as UTF-8.
    /// </summary>
    /// <param name="url">Absolute URL to fetch.</param>
    /// <returns>The page text. Unlike the other helpers, failures are thrown to the caller.</returns>
    public string GetHtmlwc(string url)
    {
        using (WebClient client = new WebClient())
        {
            client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
            using (Stream data = client.OpenRead(url))
            using (StreamReader reader = new StreamReader(data, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
    }

    // Best-effort download: any failure (including an unknown encoding name) is
    // traced and reported as the sentinel instead of being thrown.
    private static string DownloadTextOrSentinel(string url, string encodingName)
    {
        try
        {
            return DownloadText(url, Encoding.GetEncoding(encodingName));
        }
        catch (Exception ex)
        {
            System.Diagnostics.Trace.TraceWarning("zzkPaChong: download of {0} failed: {1}", url, ex.Message);
            return DownloadFailedSentinel;
        }
    }

    // Fetches url and decodes the body with the given encoding. The using blocks
    // release the response, stream and reader even when reading throws.
    private static string DownloadText(string url, Encoding encoding)
    {
        WebRequest request = WebRequest.Create(url);
        using (WebResponse response = request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, encoding))
        {
            return reader.ReadToEnd();
        }
    }
    #endregion
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值