使用HttpWebRequest与HttpWebResponse抓取网页数据

最近一个功能需要从其他网站上抓取数据存到本地数据库中,供后续展示与分析使用。这里通过使用HttpWebRequest模拟浏览器发送GET和POST请求到目标网站,然后通过HttpWebResponse获取目标服务器返回流作为字符串,然后通过html parser 获取需要保存的数据存储到数据库中。下面贴出两个获取页面的方法。
1,通过GET 获取页面

/// <summary>
/// Downloads a page with an HTTP GET request that imitates a desktop browser
/// and returns the response body as text.
/// </summary>
/// <param name="url">Absolute http/https URL of the page to fetch.</param>
/// <returns>The response body decoded with <c>System.Text.Encoding.Default</c>.</returns>
/// <exception cref="System.ArgumentException">
/// <paramref name="url"/> does not produce an <c>HttpWebRequest</c> (non-HTTP scheme).
/// </exception>
private static string getGETHtml(string url){
    HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
    if (req == null)
    {
        // Previously this surfaced as a NullReferenceException on the next line.
        throw new System.ArgumentException("The URL must use the http or https scheme.", "url");
    }

    req.Method = "GET";
    req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    req.ContentType = "application/x-www-form-urlencoded";
    // Pretend to be a desktop browser.
    req.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36";
    // Advertise gzip/deflate support; the body is decoded below so that a
    // compressed response does not come back as garbled text.
    req.Headers["Accept-Encoding"] = "gzip, deflate";
    req.AutomaticDecompression = DecompressionMethods.GZip;

    using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
    {
        string contentEncoding = response.ContentEncoding ?? string.Empty;
        Stream stream = response.GetResponseStream();

        // Ordinal, case-insensitive matching: ToLower() is culture-sensitive and
        // fails to match "GZIP" under e.g. the Turkish culture.
        if (contentEncoding.IndexOf("gzip", System.StringComparison.OrdinalIgnoreCase) >= 0)
        {
            stream = new GZipStream(stream, CompressionMode.Decompress);
        }
        else if (contentEncoding.IndexOf("deflate", System.StringComparison.OrdinalIgnoreCase) >= 0)
        {
            stream = new DeflateStream(stream, CompressionMode.Decompress);
        }

        // Both the stream and the reader are released even if decoding throws.
        using (stream)
        using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}

2,通过POST 获取页面

     /// <summary>
     /// Submits form data with an HTTP POST request that imitates a desktop browser
     /// and returns the response body as text.
     /// </summary>
     /// <param name="url">Absolute http/https URL to post to.</param>
     /// <param name="postData">
     /// URL-encoded form body; when null or empty no request body is written.
     /// </param>
     /// <returns>The response body decoded with <c>Encoding.Default</c>.</returns>
     /// <exception cref="System.ArgumentException">
     /// <paramref name="url"/> does not produce an <c>HttpWebRequest</c> (non-HTTP scheme).
     /// </exception>
     private static string getPOSTHtml(string url,string postData) {
         HttpWebRequest req = WebRequest.Create(url) as HttpWebRequest;
         if (req == null)
         {
             // Previously this surfaced as a NullReferenceException on the next line.
             throw new System.ArgumentException("The URL must use the http or https scheme.", "url");
         }

         req.Method = "POST";
         req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
         req.ContentType = "application/x-www-form-urlencoded";
         // Pretend to be a desktop browser.
         req.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36";
         // Advertise gzip/deflate support; the body is decoded below.
         req.Headers["Accept-Encoding"] = "gzip, deflate";
         req.AutomaticDecompression = DecompressionMethods.GZip;
         req.Timeout = 10000;
         req.KeepAlive = true;

         if (!string.IsNullOrEmpty(postData))
         {
             byte[] btBodys = Encoding.Default.GetBytes(postData);
             req.ContentLength = btBodys.Length;
             // The request stream must be closed before the response is read;
             // leaving it open leaks the underlying connection.
             using (Stream requestStream = req.GetRequestStream())
             {
                 requestStream.Write(btBodys, 0, btBodys.Length);
             }
         }

         using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
         {
             string contentEncoding = response.ContentEncoding ?? string.Empty;
             Stream stream = response.GetResponseStream();

             // Ordinal, case-insensitive matching: ToLower() is culture-sensitive and
             // fails to match "GZIP" under e.g. the Turkish culture.
             if (contentEncoding.IndexOf("gzip", System.StringComparison.OrdinalIgnoreCase) >= 0)
             {
                 stream = new GZipStream(stream, CompressionMode.Decompress);
             }
             else if (contentEncoding.IndexOf("deflate", System.StringComparison.OrdinalIgnoreCase) >= 0)
             {
                 stream = new DeflateStream(stream, CompressionMode.Decompress);
             }

             // Both the stream and the reader are released even if decoding throws.
             using (stream)
             using (StreamReader reader = new StreamReader(stream, System.Text.Encoding.Default))
             {
                 return reader.ReadToEnd();
             }
         }
     }

获取页面之后就可以进行html解析,然后存储所需数据了。
html解析工具可以使用HtmlAgilityPack或者NSoup等。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值