C# 抓取 HTML 内容:CSharp 抓取 HTML 网页内容

using mshtml;

using HtmlAgilityPack;

/// <summary>
/// Downloads a shop-listing page and writes one comma-separated row per list item,
/// parsing the HTML with either the mshtml COM parser or HtmlAgilityPack.
/// </summary>
class HTMLCrawler
{
    /// <summary>
    /// Downloads <paramref name="uri"/>, optionally saves the raw HTML next to the
    /// executable, and parses the page with the selected engine.
    /// </summary>
    /// <param name="index">Page index; used only to name the saved HTML file.</param>
    /// <param name="uri">Address of the page to download.</param>
    /// <param name="szResultPath">File that extracted rows are appended to.</param>
    /// <param name="szErrorPath">File that error messages are appended to.</param>
    /// <param name="htmlEngin">Which HTML parser to use.</param>
    /// <returns>
    /// Counters for the parsed page. <c>bSuccess</c> is true when the page was downloaded
    /// and the listing was located; it is false on a <see cref="WebException"/>, on an
    /// unknown engine, or when the listing could not be found.
    /// </returns>
    private PhaseResultBean PhaseHtml(int index, Uri uri, String szResultPath, String szErrorPath, HTMLEnginType htmlEngin)
    {
        PhaseResultBean result = new PhaseResultBean();

        try
        {
            string pageHtml;

            // WebClient is IDisposable: release it as soon as the download is finished.
            using (WebClient client = new WebClient())
            {
                client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                Byte[] pageData = client.DownloadData(uri);
                // NOTE(review): the page is always decoded as UTF-8 regardless of the response charset.
                pageHtml = Encoding.UTF8.GetString(pageData);
            }

            if (checkSavePages.Checked)
            {
                // NOTE(review): the snapshot is opened in append mode, so re-running with the
                // same index appends a second copy of the page to the existing file.
                String szHtmlPath = XWin32.getExeParentPath() + index.ToString() + ".html";
                AppendLine(szHtmlPath, pageHtml);
            }

            // The download succeeded; the parsers below clear the flag again if the
            // listing cannot be located.
            result.bSuccess = true;

            switch (htmlEngin)
            {
                case HTMLEnginType.HTMLEngin_mshtml:
                    PhaseHtml_mshtml(pageHtml, szResultPath, szErrorPath, result);
                    break;

                case HTMLEnginType.HTMLEngin_HtmlAgilityPack:
                    PhaseHtml_HtmlAgilityPack(pageHtml, szResultPath, szErrorPath, result);
                    break;

                default:
                    // Nothing was parsed, so do not report success for an unknown engine.
                    AppendLine(szErrorPath, "Unsupported HTML engine: " + htmlEngin);
                    result.bSuccess = false;
                    break;
            }
        }
        catch (WebException webEx)
        {
            AppendLine(szErrorPath, webEx.Message);
            result.bSuccess = false;
        }

        return result;
    }

    /// <summary>
    /// Parses <paramref name="pageHtml"/> with mshtml and appends one row per child of the
    /// first child of the element with id "shop-all-list".
    /// </summary>
    /// <param name="pageHtml">HTML text of the page.</param>
    /// <param name="szResultPath">File that extracted rows are appended to.</param>
    /// <param name="szErrorPath">File that error messages are appended to.</param>
    /// <param name="result">
    /// Counters updated in place: <c>total</c> grows by the number of items, and
    /// <c>successed</c> / <c>failed</c> grow by one per item.
    /// </param>
    private void PhaseHtml_mshtml(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
    {
        mshtml.HTMLDocument docObject = new mshtml.HTMLDocument();
        mshtml.IHTMLDocument2 doc2 = docObject as mshtml.IHTMLDocument2;
        doc2.write(pageHtml);
        doc2.close();

        mshtml.IHTMLDocument3 doc3 = docObject as mshtml.IHTMLDocument3;

        // getElementById yields null when the id is absent (e.g. an error or captcha page);
        // record that instead of failing with a NullReferenceException.
        IHTMLElement listRoot = doc3.getElementById("shop-all-list");
        if (listRoot == null)
        {
            AppendLine(szErrorPath, "Element not found: shop-all-list");
            result.bSuccess = false;
            return;
        }

        // Resolve the item collection once and reuse it for both the count and the loop.
        var items = listRoot.children[0].children;
        int len = items.length;
        result.total += len;

        foreach (IHTMLElement li in items)
        {
            try
            {
                // All extracted fields live under the second child of the list item.
                var info = li.children[1];

                IHTMLElement title = info.children[0];
                String szTitle = title.innerText;
                if (szTitle != null)
                {
                    szTitle = szTitle.Replace("\r\n", "-");
                }

                IHTMLElement star = info.children[1].children[0];
                String szStar = star.getAttribute("title");

                IHTMLElement reviewNum = info.children[1].children[1];
                String szReviewNum = reviewNum.innerText;

                IHTMLElement meanPrice = info.children[1].children[3];
                String szMeanPrice = meanPrice.innerText;

                IHTMLElement category = info.children[2].children[0];
                String szCategory = category.innerText;

                IHTMLElement address = info.children[2].children[3];
                String szAddress = address.innerText;
                if (szAddress != null)
                {
                    // String.Replace returns a new string; the result must be assigned,
                    // otherwise commas in the address break the comma-separated row.
                    szAddress = szAddress.Replace(",", "-");
                }

                IHTMLElement taste = info.children[3].children[0];
                String szTaste = taste.innerText;

                IHTMLElement evn = info.children[3].children[1];
                String szEvn = evn.innerText;

                IHTMLElement service = info.children[3].children[2];
                String szService = service.innerText;

                // Append the extracted fields to the result file.
                WriteRecord(szResultPath, szTitle, szStar, szReviewNum, szMeanPrice, szCategory, szAddress, szTaste, szEvn, szService);
                result.successed += 1;
            }
            catch (Exception Ex)
            {
                // One malformed item must not abort the whole page: log it and continue.
                AppendLine(szErrorPath, Ex.Message);
                result.failed += 1;
            }
        }
    }

    /// <summary>
    /// Parses <paramref name="pageHtml"/> with HtmlAgilityPack and appends one row per
    /// <c>li</c> node matched by the fixed listing XPath.
    /// </summary>
    /// <param name="pageHtml">HTML text of the page.</param>
    /// <param name="szResultPath">File that extracted rows are appended to.</param>
    /// <param name="szErrorPath">File that error messages are appended to.</param>
    /// <param name="result">
    /// Counters updated in place: <c>total</c> grows by the number of items, and
    /// <c>successed</c> / <c>failed</c> grow by one per item.
    /// </param>
    private void PhaseHtml_HtmlAgilityPack(String pageHtml, String szResultPath, String szErrorPath, PhaseResultBean result)
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.LoadHtml(pageHtml);

        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlAgilityPack.HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("/html[1]/body[1]/div[4]/div[3]/div[1]/div[1]/div[2]/ul[1]/li");
        if (nodes == null)
        {
            AppendLine(szErrorPath, "No list items matched the listing XPath");
            result.bSuccess = false;
            return;
        }

        result.total += nodes.Count;

        foreach (HtmlAgilityPack.HtmlNode li in nodes)
        {
            try
            {
                HtmlAgilityPack.HtmlNode titleA = RequireNode(li, "div[2]/div[1]/a[1]");
                // The second title link is optional.
                HtmlAgilityPack.HtmlNode titleB = li.SelectSingleNode("div[2]/div[1]/a[2]");
                String szTitle = RemoveNewlinesAndSpaces(titleA.InnerText + "-" + (titleB == null ? "" : titleB.InnerText));

                HtmlAgilityPack.HtmlNode star = RequireNode(li, "div[2]/div[2]/span[1]");
                String szStar = star.Attributes["title"].Value;

                String szReviewNum = RemoveNewlinesAndSpaces(RequireNode(li, "div[2]/div[2]/a[1]").InnerText);
                String szMeanPrice = RemoveNewlinesAndSpaces(RequireNode(li, "div[2]/div[2]/a[2]").InnerText);

                String szCategory = RequireNode(li, "div[2]/div[3]/a[1]").InnerText;

                HtmlAgilityPack.HtmlNode addressA = RequireNode(li, "div[2]/div[3]/a[2]");
                HtmlAgilityPack.HtmlNode addressB = RequireNode(li, "div[2]/div[3]/span[1]");
                // String.Replace returns a new string; the result must be kept, otherwise
                // commas in the address break the comma-separated row.
                String szAddress = (addressA.InnerText + "-" + addressB.InnerText).Replace(",", "-");

                String szTaste = RequireNode(li, "div[2]/span[1]/span[1]").InnerText;
                String szEvn = RequireNode(li, "div[2]/span[1]/span[2]").InnerText;
                String szService = RequireNode(li, "div[2]/span[1]/span[3]").InnerText;

                // Append the extracted fields to the result file.
                WriteRecord(szResultPath, szTitle, szStar, szReviewNum, szMeanPrice, szCategory, szAddress, szTaste, szEvn, szService);
                result.successed += 1;
            }
            catch (Exception Ex)
            {
                // One malformed item must not abort the whole page: log it and continue.
                AppendLine(szErrorPath, Ex.Message);
                result.failed += 1;
            }
        }
    }

    /// <summary>Returns the first node matching <paramref name="xpath"/> under <paramref name="parent"/>, or throws a descriptive exception when none matches.</summary>
    private static HtmlAgilityPack.HtmlNode RequireNode(HtmlAgilityPack.HtmlNode parent, String xpath)
    {
        HtmlAgilityPack.HtmlNode node = parent.SelectSingleNode(xpath);
        if (node == null)
        {
            throw new InvalidOperationException("Node not found: " + xpath);
        }
        return node;
    }

    /// <summary>Removes every "\n" and every space character from <paramref name="text"/>; null is returned unchanged.</summary>
    private static String RemoveNewlinesAndSpaces(String text)
    {
        if (text == null)
        {
            return null;
        }
        return text.Replace("\n", "").Replace(" ", "");
    }

    /// <summary>Appends the nine fields to <paramref name="path"/> as one comma-separated line; null fields are written as empty.</summary>
    private static void WriteRecord(String path, String title, String star, String reviewNum, String meanPrice, String category, String address, String taste, String evn, String service)
    {
        AppendLine(path, String.Join(",", title, star, reviewNum, meanPrice, category, address, taste, evn, service));
    }

    /// <summary>Appends <paramref name="line"/> plus a line terminator to the file at <paramref name="path"/>, creating the file if needed.</summary>
    private static void AppendLine(String path, String line)
    {
        using (StreamWriter sw = new StreamWriter(path, true))
        {
            sw.WriteLine(line);
        }
    }
}

/// <summary>
/// Mutable result counters filled in while one listing page is downloaded and parsed.
/// </summary>
class PhaseResultBean
{
    /// <summary>Page-level outcome flag; the crawler clears it when the download fails with a WebException.</summary>
    public bool bSuccess;

    /// <summary>Number of list items found; each parser adds the item count of the page it handled.</summary>
    public int total;

    /// <summary>Intended count of items extracted successfully (identifier spelling kept for compatibility).</summary>
    public int successed;

    /// <summary>Number of items whose extraction threw an exception; incremented once per failing item.</summary>
    public int failed;
}

/// <summary>
/// Selects which HTML parser the crawler uses for a downloaded page.
/// </summary>
public enum HTMLEnginType
{
    /// <summary>Parse with the mshtml COM document object.</summary>
    HTMLEngin_mshtml = 0,

    /// <summary>Parse with the HtmlAgilityPack library.</summary>
    HTMLEngin_HtmlAgilityPack = 1
}

Share the post "CSharp抓取HTML网页内容"

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值