网页数据抓取(爬虫)这件事我之前一直没有做过。网上虽然有很多现成的工具,不过还是想自己尝试写代码来完成。
之前上传的代码片段有这个方法
后台方法一:
/// <summary>
/// Downloads the resource at <paramref name="url"/> with an HTTP GET request and
/// returns the response body decoded as text.
/// </summary>
/// <param name="url">Absolute HTTP or HTTPS address of the page to fetch.</param>
/// <returns>
/// The response body as a string, or <see cref="String.Empty"/> when the URL is blank,
/// is not an HTTP(S) address, or the request or decoding fails for any reason.
/// </returns>
public static String GetHtml(string url)
{
    if (String.IsNullOrWhiteSpace(url))
    {
        return String.Empty;
    }

    try
    {
        HttpWebRequest req = HttpWebRequest.Create(url) as HttpWebRequest;
        if (req == null)
        {
            // Create() returned some other WebRequest type (e.g. for file:// or ftp://).
            return String.Empty;
        }

        // NOTE(review): a GET has no body, so this header is presumably ignored by the
        // server; it is kept unchanged so the outgoing request stays identical.
        req.ContentType = " text/html; charset=utf-8;";
        req.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0";
        req.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        req.Timeout = 30 * 1000; // milliseconds

        // Dispose the response and its stream so the underlying connection is released
        // even when reading fails part-way through.
        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(stream, ResolveResponseEncoding(response.CharacterSet)))
        {
            return streamReader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: callers treat an empty string as "page not available".
        return String.Empty;
    }
}

/// <summary>
/// Maps the charset name reported by the server to an <see cref="Encoding"/>,
/// falling back to UTF-8 when the name is missing or not recognised.
/// </summary>
private static Encoding ResolveResponseEncoding(string charset)
{
    if (String.IsNullOrWhiteSpace(charset))
    {
        return Encoding.UTF8;
    }

    try
    {
        // Some servers wrap the charset in quotes, e.g. charset="utf-8".
        return Encoding.GetEncoding(charset.Trim().Trim('"', '\''));
    }
    catch (ArgumentException)
    {
        return Encoding.UTF8;
    }
}
这个方法请求传入的URL,并把服务器返回的页面内容以html字符串的形式返回;请求失败时返回空字符串。
后台方法二:
/// <summary>
/// Searches the HTML text for every tag named <paramref name="tagName"/> whose attribute
/// <paramref name="attrName"/> has the value <paramref name="attrValue"/>.
/// Example: <c>FindTagByAttr(html, "div", "class", "demo")</c> returns all div tags
/// whose class is demo.
/// </summary>
/// <param name="html">HTML text to search.</param>
/// <param name="tagName">Tag name to look for, e.g. <c>div</c>.</param>
/// <param name="attrName">Attribute name to test, e.g. <c>class</c>.</param>
/// <param name="attrValue">Exact attribute value (single- or double-quoted in the HTML).</param>
/// <returns>The tags produced by <c>FindTag</c> for the generated opening-tag pattern.</returns>
/// <remarks>
/// The three names/values are matched literally: regex metacharacters in them
/// (such as <c>.</c>, <c>?</c>, <c>+</c>, <c>(</c>) are escaped before being placed in the pattern.
/// </remarks>
public static List<HtmlTag> FindTagByAttr(String html, String tagName, String attrName, String attrValue)
{
    // \x27 is a single quote and \x22 a double quote; the group layout is kept exactly
    // as before in case FindTag depends on it.
    String format = String.Format(
        @"<{0}\s[^<>]*{1}\s*=\s*(\x27|\x22){2}(\x27|\x22)[^<>]*>",
        System.Text.RegularExpressions.Regex.Escape(tagName ?? String.Empty),
        System.Text.RegularExpressions.Regex.Escape(attrName ?? String.Empty),
        System.Text.RegularExpressions.Regex.Escape(attrValue ?? String.Empty));
    return FindTag(html, tagName, format);
}
好,现在写前台。
比如现在要抓http://www.j1.com/p-1-1474_20-11000-0-1.html上的商品名称、价格、备案号。
仔细分析网站:
1.有分页。必须将总页数抓出来,循环总页数,套入页数索引到网址中,得到新的URL,再抓数据
2.商品的名称、价格可以直接抓,但是备案号(文号)要进入‘查看详情’才能抓到。所以必须先得到‘查看详情’按钮的URL。
分析完毕,开工..
第一步:得到总页数
打开网址,从页面上看,一共27页,就需要抓到这个27.
// Step 1: read the total number of pages from the pager on the first listing page.
// NOTE(review): pageCount, name, infoNbr, brand and price are not declared in this
// snippet; they are assumed to be declared in the enclosing scope - confirm.
String html = HtmlTag.GetHtml("http://www.j1.com/p-1-1474_20-11000-0-1.html");
string goodsCount = null;
var pagers = HtmlTag.FindTagByAttr(html, "div", "class", "pagenav");
if (pagers.Count > 0)
{
    // Content of the first <cite> inside the div whose class is "pagenav".
    var cites = pagers[0].FindTag("cite");
    if (cites.Count > 0)
    {
        goodsCount = cites[0].InnerHTML;
    }
}
if (!string.IsNullOrWhiteSpace(goodsCount))
{
    // The text looks like "1/27"; the part after the slash is the total page count (27).
    int parsedPageCount;
    if (Int32.TryParse(goodsCount.Substring(goodsCount.IndexOf('/') + 1), out parsedPageCount))
    {
        pageCount = parsedPageCount;
    }
}

// Step 2: walk every listing page and follow each product's "view details" link.
for (int index = 1; index <= pageCount; index++)
{
    html = HtmlTag.GetHtml("http://www.j1.com/p-1-1474_20-11000-0-1.html?orderBy=0&pageNum=" + index);

    // Each div with class "listcheck" wraps one "view details" button.
    List<HtmlTag> tags = HtmlTag.FindTagByAttr(html, "div", "class", "listcheck");
    for (int i = 0; i < tags.Count; i++)
    {
        var links = tags[i].FindTag("a");
        if (links.Count == 0)
        {
            continue;
        }

        var href = links[0].GetAttribute("href"); // URL of the detail page
        if (string.IsNullOrWhiteSpace(href))
        {
            continue;
        }

        var html_tag = HtmlTag.GetHtml(href);

        // ------- product name: first <dd> inside dl.detailinfo.
        // Reset first so a page without the block does not reuse the previous product's name.
        name = "";
        var name_tag = HtmlTag.FindTagByAttr(html_tag, "dl", "class", "detailinfo");
        if (name_tag.Count > 0)
        {
            var nameCells = name_tag[0].FindTag("dd");
            if (nameCells.Count > 0)
            {
                name = nameCells[0].InnerHTML;
            }
        }

        // ------- approval number: second <td> of the second-to-last row of table.detailgctable.
        infoNbr = "";
        var infoNbr_tag = HtmlTag.FindTagByAttr(html_tag, "table", "class", "detailgctable");
        if (infoNbr_tag.Count > 0)
        {
            var infoTr_tag = infoNbr_tag[0].FindTag("tr");
            // At least two rows are needed for a second-to-last row to exist.
            if (infoTr_tag.Count >= 2)
            {
                var infoCells = infoTr_tag[infoTr_tag.Count - 2].FindTag("td");
                if (infoCells.Count >= 2)
                {
                    infoNbr = infoCells[1].InnerHTML;
                }
            }
        }

        // NOTE(review): brand and price are never assigned in this snippet; presumably
        // they are populated elsewhere - confirm.
        Console.WriteLine("品牌:{0},名称:{1},文号:{2},价格:{3}", brand, name, infoNbr, price);
    }
}
---------------收工