555 asp.net mvc 抓取京东商城分类
URL:http://www.jd.com/allSort.aspx
效果:
//后台代码
/// <summary>
/// Downloads the JD.com "all categories" page and walks its three-level
/// category tree by slicing the HTML and applying regular expressions.
/// </summary>
/// <remarks>
/// The page is cut into one section per &lt;h2&gt; heading that follows the
/// "全部商品分类" banner. Inside a section, the &lt;h2&gt; text is the first-level
/// name, each &lt;dl&gt; block's &lt;dt&gt; text is a second-level name, and each
/// &lt;em&gt;&lt;a&gt; text is a third-level name.
/// The extracted names are only held in locals; nothing is stored or handed
/// to the view, so persistence has to be added where the names are produced.
/// If the banner text is not found, no section is processed.
/// </remarks>
/// <returns>The default view for this action.</returns>
public ActionResult GetCategoryFromJd()
{
    const string sectionTag = "<h2>";
    const string footerMarker = "id=\"service-2013\"";
    const string chineseOnly = "[^\u4e00-\u9fa5、]";

    var url = "http://www.jd.com/allSort.aspx";
    var htmlCode = HttpGet(url, "", new System.Net.CookieContainer());

    // Start after the banner so its own (non-category) <h2> is skipped.
    // A missing banner yields -1, which must not be passed on to IndexOf.
    var anchor = htmlCode.IndexOf("全部商品分类<b></b>", System.StringComparison.Ordinal);
    var start = anchor < 0
        ? -1
        : htmlCode.IndexOf(sectionTag, anchor, System.StringComparison.Ordinal);

    while (start >= 0)
    {
        // A section runs up to the next <h2>; the last one runs up to the
        // footer marker, or to the end of the page if that is missing too.
        var next = htmlCode.IndexOf(sectionTag, start + 1, System.StringComparison.Ordinal);
        var end = next;
        if (end < 0)
        {
            end = htmlCode.IndexOf(footerMarker, start, System.StringComparison.Ordinal);
        }
        if (end < 0)
        {
            end = htmlCode.Length;
        }

        var codeCut = htmlCode.Substring(start, end - start);

        // First-level category.
        var firstCatName = GetChinese(codeCut, "<h2>(.*)</h2>", chineseOnly);

        // Second- and third-level categories; line breaks are stripped so
        // that '.' can span a whole <dl> block.
        var matches = Regex.Matches(codeCut.Replace("\r\n", ""), "<dl(.*?)</dl>");
        foreach (Match mc in matches)
        {
            var dlBody = mc.Groups[1].Value;

            // Second-level category.
            var secondCatName = GetChinese(dlBody, "<dt>(.*?)</dt>", chineseOnly);

            // Third-level categories.
            var mts = Regex.Matches(dlBody, "<em>(.*?)</em>");
            foreach (Match m in mts)
            {
                var thirdCatName = GetChinese(m.Groups[1].Value, "<a.*>(.*)</a>");
            }
        }

        // -1 here means the section just handled was the last one.
        start = next;
    }
    return View();
}
//辅助方法
/// <summary>
/// Extracts the first capture group of <paramref name="pattern"/> from
/// <paramref name="content"/> and removes every character matched by
/// <paramref name="reg"/> from it.
/// </summary>
/// <param name="content">Text to search.</param>
/// <param name="pattern">Regular expression whose group 1 holds the wanted text.</param>
/// <param name="reg">
/// Regular expression for characters to delete. The default keeps CJK
/// ideographs (U+4E00..U+9FA5), "、", digits, ASCII letters and - . / &amp; ·.
/// </param>
/// <returns>The filtered group-1 text, or null when <paramref name="pattern"/> does not match.</returns>
public static string GetChinese(string content,string pattern,string reg="[^\u4e00-\u9fa5、0-9-./A-Za-z&·]")
{
    var found = Regex.Match(content, pattern);
    if (!found.Success)
    {
        return null;
    }

    // Drop everything the filter expression matches from the captured text.
    var captured = found.Groups[1].Value;
    return Regex.Replace(captured, reg, "");
}
/// <summary>
/// Sends an HTTP GET request and returns the response body decoded as GBK text.
/// </summary>
/// <param name="Url">Address to request; "?" and <paramref name="postDataStr"/> are appended to it when a query is supplied.</param>
/// <param name="postDataStr">Query string without the leading "?"; null or empty means no query string.</param>
/// <param name="cookie">Cookie container attached to the request.</param>
/// <returns>The full response body.</returns>
public static string HttpGet(string Url, string postDataStr, CookieContainer cookie)
{
    // Only add the "?" separator when there is something to put after it.
    var query = string.IsNullOrEmpty(postDataStr) ? "" : "?" + postDataStr;

    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + query);
    request.Method = "GET";
    request.ContentType = "text/html;charset=gbk";
    request.CookieContainer = cookie;

    // using blocks release the connection, stream and reader even when
    // reading the body throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("gbk")))
    {
        return reader.ReadToEnd();
    }
}