asp.net 网页抓取内容

网页抓取代码

复制代码
using System;
using System.Collections.Generic;
using System.Linq; using System.Web; // using System.Net; using System.IO; using System.Text.RegularExpressions; using System.Text; namespace WSYL.Web.Common { public static class GetSteamShipInfo { public static string GetWebSite(string steamshipname,int itype) { if (steamshipname == null || steamshipname.Trim() == "") return null; //step1: get html from url string urlToCrawl = @"网址"; //generate http request HttpWebRequest req = (HttpWebRequest)WebRequest.Create(urlToCrawl); //use GET method to get url's html req.Method = "GET"; //use request to get response HttpWebResponse resp = (HttpWebResponse)req.GetResponse(); // 二〇一五年八月十二日 18:14:45 需要增加判断网页解析超时问题 防止网页假死 // string htmlCharset = "UTF-8"; string htmlCharset = "utf-8"; //use songtaste's html's charset GB2312 to decode html //otherwise will return messy code Encoding htmlEncoding = Encoding.GetEncoding(htmlCharset); StreamReader sr = new StreamReader(resp.GetResponseStream(), htmlEncoding); //read out the returned html string respHtml = sr.ReadToEnd(); //第三种获取内容 //Match TitleMatch = Regex.Match(rtbExtractedHtml.Text.ToString(), "<td width=\"30%\">([^<]*)</td>", RegexOptions.IgnoreCase | RegexOptions.Multiline);
//需要获取的代码开始和结尾内容
Match TitleMatch2 = Regex.Match(respHtml.ToString(), "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>", RegexOptions.IgnoreCase | RegexOptions.Multiline); // txbExtractedInfo.Text = TitleMatch2.Groups[1].Value+"/"+ TitleMatch2.Groups[2].Value; if (TitleMatch2.Groups[1].Value.Length == 0 || TitleMatch2.Groups[1].Value=="") return respHtml = ""; if(itype==0) { respHtml = TitleMatch2.Groups[1].Value.ToString(); } if(itype==1) { respHtml = StripHtml(TitleMatch2.NextMatch().Value.ToString()); } if (itype == 2) { respHtml = TitleMatch2.Groups[1].Value + "/" + StripHtml(TitleMatch2.NextMatch().Value.ToString()); } return respHtml; } /// <summary> /// 去除html标签和空格有些例外会使得去除不干净,所以建议连续两次转化。这样将Html标签转化为了空格。太多连续的空格会影响之后对字符串的操作 /// </summary> /// <param name="strHtml">标签内容</param> /// <returns></returns> private static string StripHtml(string strHtml) { Regex objRegExp = new Regex("<(.|\n)+?>"); string strOutput = objRegExp.Replace(strHtml, ""); strOutput = strOutput.Replace("<", "&lt;"); strOutput = strOutput.Replace(">", "&gt;"); //把所有空格变为一个空格 Regex r = new Regex(@"\s+"); strOutput = r.Replace(strOutput, " "); return strOutput.Trim(); } } }
复制代码

转载于:https://www.cnblogs.com/hs8888/p/5520564.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值