asp.net抓取数据(一)

本文简单介绍如何抓取单页的数据

先根据传入的URL获取网页源码(注:这里有一个编码问题,目前还不完善,不能自动判断网页编码;代码默认使用 Encoding.Default,抓取 UTF-8 页面时需改用 Encoding.UTF8)

//获取网页源码
        /// <summary>
        /// Downloads the resource at <paramref name="a_strUrl"/> and returns its body as text.
        /// </summary>
        /// <param name="a_strUrl">URL of the page to fetch.</param>
        /// <param name="timeout">Value assigned to <c>HttpWebRequest.Timeout</c> (milliseconds).</param>
        /// <returns>
        /// The page text with every line terminated by CR LF. If any exception is raised,
        /// the string "错误:" followed by the exception message is returned instead of throwing.
        /// </returns>
        public static string Get_Http(string a_strUrl, int timeout)
        {
            string strResult;
            try
            {
                HttpWebRequest myReq = (HttpWebRequest)HttpWebRequest.Create(a_strUrl);
                myReq.Timeout = timeout;

                // Dispose response, stream and reader deterministically so the
                // underlying connection is released even when reading fails.
                using (HttpWebResponse httpResp = (HttpWebResponse)myReq.GetResponse())
                using (Stream myStream = httpResp.GetResponseStream())
                // The encoding is not auto-detected. Encoding.Default is the platform default
                // (intended here for GB2312 pages); switch to Encoding.UTF8 for UTF-8 pages.
                using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
                {
                    StringBuilder strBuilder = new StringBuilder();
                    string line;
                    // ReadLine() returns null only at the real end of the stream; Peek() may
                    // report -1 on a network stream before all data has arrived.
                    while ((line = sr.ReadLine()) != null)
                    {
                        // Real CR LF terminator (the previous literal "/r/n" was emitted verbatim).
                        strBuilder.Append(line).Append("\r\n");
                    }
                    strResult = strBuilder.ToString();
                }
            }
            catch (Exception exp)
            {
                // Best-effort contract: failures are reported through the return value.
                strResult = "错误:" + exp.Message;
            }
            return strResult;
        }
从源码中挑出标题、时间、内容,并转换成实体类(这里用正则匹配:传入的起始/结束标记会直接拼进正则表达式,含有正则特殊字符时需要先转义)

//转换成实体
        /// <summary>
        /// Extracts the title, publication date and body from <paramref name="strhtml"/>
        /// and returns them as a <c>MsgInfo</c>.
        /// </summary>
        /// <remarks>
        /// Each field is the shortest non-empty text found between its start and end marker.
        /// The markers are inserted into the regular expression as-is, so they are
        /// interpreted as regex fragments. The date text is parsed with
        /// <c>Convert.ToDateTime</c>, which throws if the text is not a valid date.
        /// </remarks>
        /// <param name="strhtml">Page source to search.</param>
        /// <param name="strbtstart">Marker preceding the title.</param>
        /// <param name="strbtend">Marker following the title.</param>
        /// <param name="strsjstart">Marker preceding the date.</param>
        /// <param name="strsjend">Marker following the date.</param>
        /// <param name="strnrstart">Marker preceding the body.</param>
        /// <param name="strnrend">Marker following the body.</param>
        /// <returns>A new <c>MsgInfo</c> with <c>title</c>, <c>pubdate</c> and <c>content</c> set.</returns>
        public static MsgInfo getinfomation(string strhtml, string strbtstart, string strbtend,string strsjstart,string strsjend,string strnrstart,string strnrend)
        {
            // Lazy capture "g" of everything (including line breaks) between two markers.
            const string betweenTemplate = "{0}(?<g>(.|[\r\n])+?){1}";

            Match titleMatch = Regex.Match(strhtml, string.Format(betweenTemplate, strbtstart, strbtend));
            Match dateMatch = Regex.Match(strhtml, string.Format(betweenTemplate, strsjstart, strsjend));
            Match contentMatch = Regex.Match(strhtml, string.Format(betweenTemplate, strnrstart, strnrend));

            MsgInfo result = new MsgInfo();
            result.title = titleMatch.Groups["g"].Value;
            result.pubdate = Convert.ToDateTime(dateMatch.Groups["g"].Value);
            result.content = contentMatch.Groups["g"].Value;
            return result;
        }



评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值