使用C#正则表达式抓取其他网站的信息,本示例以抓取京东商城商品详情为例。
1、创建JdRobber.cs程序类
public class JdRobber
{
    /// <summary>
    /// Pattern for a JD product detail URL. The named group "Object" captures the
    /// numeric product ID. Dots are escaped so that they only match a literal '.'.
    /// </summary>
    private const string ItemUrlPattern = @"^http://item\.jd\.com/(?<Object>\d+)\.html$";

    /// <summary>
    /// Checks whether a URL is a JD product detail link.
    /// </summary>
    /// <param name="url">URL to test; may be null or empty.</param>
    /// <returns>true when <paramref name="url"/> matches the product detail URL pattern; otherwise false.</returns>
    public bool ValidationUrl(string url)
    {
        return !String.IsNullOrEmpty(url) && Regex.IsMatch(url, ItemUrlPattern);
    }

    /// <summary>
    /// Downloads a JD product page and writes its title, picture URL and price to the console.
    /// Nothing is written when the URL is not a product link or the page download yields no content.
    /// </summary>
    /// <param name="url">Product detail URL.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (String.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        // Product ID: the digits captured from the already validated URL.
        string sourceWebID = WebHandler.GetRegexText(url, ItemUrlPattern);

        // Title: the first <h1> that follows the <div id="name"> opening tag.
        // [^>]* keeps the id match inside the div tag itself and the lazy
        // quantifier stops at the nearest <h1> instead of the last one on the page.
        string titlePattern = @"<div[^>]*id=\""name\""[^>]*>[\s\S]*?<h1>(?<Object>.*?)</h1>";
        string title = WebHandler.GetRegexText(htmlStr, titlePattern);

        string picName = ExtractPictureUrl(htmlStr);

        // Price is fetched from a separate endpoint keyed by the product ID;
        // it stays 0 when no ID was extracted.
        decimal price = 0;
        if (sourceWebID != "")
        {
            string priceUrl = @"http://p.3.cn/prices/get?skuid=J_" + sourceWebID + "&type=1";
            string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
            string pricePattern = @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""";
            price = WebHandler.GetValidPrice(WebHandler.GetRegexText(priceJson, pricePattern));
        }

        Console.WriteLine("商品名称:{0}", title);
        Console.WriteLine("图片:{0}", picName);
        Console.WriteLine("价格:{0}", price);
    }

    /// <summary>
    /// Returns the src of the img tag found between the spec-n1 div marker and the next closing div.
    /// </summary>
    /// <param name="htmlStr">Page HTML.</param>
    /// <returns>The captured src value, or an empty string when the marker, the closing tag or the img tag is missing.</returns>
    private static string ExtractPictureUrl(string htmlStr)
    {
        // Ordinal search: markup is not linguistic text, so culture rules must not apply.
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
        if (begin < 0)
        {
            return "";
        }

        int end = htmlStr.IndexOf("</div>", begin, StringComparison.Ordinal);
        if (end <= begin)
        {
            return "";
        }

        string subPicHtml = htmlStr.Substring(begin, end - begin);
        string pattern = @"<img.*src=\""(?<Object>.*?)\"".*/>";
        return WebHandler.GetRegexText(subPicHtml, pattern);
    }
}
2、创建WebHandler.cs公共方法类
/// <summary>
/// Shared helpers for downloading pages and extracting values with regular expressions.
/// </summary>
public class WebHandler
{
    /// <summary>
    /// Accepts an unsigned number with an optional fraction of one or two digits.
    /// </summary>
    private static readonly Regex PriceRegex = new Regex(@"^\d+(\.\d{1,2})?$");

    /// <summary>
    /// Downloads the response body of a URL and decodes it as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <param name="encoding">"UTF8" selects UTF-8; any other value selects <see cref="Encoding.Default"/>.</param>
    /// <returns>
    /// The decoded response body, or an empty string when <paramref name="url"/> is null or empty
    /// or when the request or read fails.
    /// </returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        try
        {
            WebRequest request = WebRequest.Create(url);

            // using blocks release the response, stream and reader even when reading throws.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best-effort contract: callers treat an empty string as "no content".
            // The failure is traced instead of being silently discarded.
            System.Diagnostics.Trace.TraceWarning("GetHtmlStr failed for {0}: {1}", url, ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Returns the text captured by the named group "Object" in the first match of a pattern.
    /// Matching is case-insensitive.
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <param name="pattern">Regular expression containing a group named "Object".</param>
    /// <returns>The captured text, or an empty string when either argument is null or empty or nothing matches.</returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (String.IsNullOrEmpty(input) || String.IsNullOrEmpty(pattern))
        {
            return "";
        }

        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts a price string to a decimal.
    /// </summary>
    /// <param name="strPrice">Price text such as "12" or "12.50".</param>
    /// <returns>The parsed price, or 0 when the text is null, empty or not a number with at most two decimals.</returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (String.IsNullOrEmpty(strPrice) || !PriceRegex.IsMatch(strPrice))
        {
            return 0;
        }

        // The validated text always uses '.' as the decimal separator, so it is parsed with the
        // invariant culture; the current culture could treat '.' as a group separator.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.Number,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);
        return parsed ? price : 0;
    }
}