C#正则表达式抓取网站信息

使用C#正则表达式抓取其他网站的信息,本示例以抓取京东商城商品详情为例。

1、创建JdRobber.cs程序类

public class JdRobber
{
    /// <summary>
    /// Checks whether a URL is a JD.com product-detail link of the form
    /// <c>http://item.jd.com/{digits}.html</c>.
    /// </summary>
    /// <param name="url">The URL to validate; null or empty is treated as invalid.</param>
    /// <returns><c>true</c> when the URL matches the product-detail form; otherwise <c>false</c>.</returns>
    public bool ValidationUrl(string url)
    {
        if (String.IsNullOrEmpty(url))
        {
            return false;
        }

        // The dots are escaped so they match a literal '.' only; unescaped, they
        // would accept any character (e.g. "http://itemXjdYcom/1Zhtml").
        return Regex.IsMatch(url, @"^http://item\.jd\.com/\d+\.html$");
    }

    /// <summary>
    /// Downloads a JD.com product page and writes the product title, main image
    /// URL and price to the console. Does nothing when the URL is not a valid
    /// product link or the page could not be downloaded.
    /// </summary>
    /// <param name="url">Product-detail URL accepted by <see cref="ValidationUrl"/>.</param>
    public void GetInfo(string url)
    {
        if (!ValidationUrl(url))
        {
            return;
        }

        string htmlStr = WebHandler.GetHtmlStr(url, "Default");
        if (String.IsNullOrEmpty(htmlStr))
        {
            return;
        }

        // Product ID: the numeric part of the URL.
        string sourceWebID = WebHandler.GetRegexText(url, @"http://item\.jd\.com/(?<Object>\d+)\.html");

        // Title: the first <h1> following the div with id="name". The lazy
        // quantifier stops at the nearest <h1> instead of running to the last
        // one in the document.
        string title = WebHandler.GetRegexText(htmlStr, @"<div.*id=\""name\"".*>[\s\S]*?<h1>(?<Object>.*?)</h1>");

        // Image: the src of the <img> inside the div with id="spec-n1".
        string picName = "";
        int begin = htmlStr.IndexOf("<div id=\"spec-n1\"", StringComparison.Ordinal);
        if (begin >= 0)
        {
            int end = htmlStr.IndexOf("</div>", begin + 1, StringComparison.Ordinal);
            if (end > begin)
            {
                string subPicHtml = htmlStr.Substring(begin, end - begin);
                picName = WebHandler.GetRegexText(subPicHtml, @"<img.*src=\""(?<Object>.*?)\"".*/>");
            }
        }

        // Price: fetched from a separate endpoint keyed by the product ID.
        decimal price = 0;
        if (!String.IsNullOrEmpty(sourceWebID))
        {
            string priceUrl = @"http://p.3.cn/prices/get?skuid=J_" + sourceWebID + "&type=1";
            string priceJson = WebHandler.GetHtmlStr(priceUrl, "Default");
            string priceText = WebHandler.GetRegexText(priceJson, @"\""p\"":\""(?<Object>\d+(\.\d{1,2})?)\""");
            price = WebHandler.GetValidPrice(priceText);
        }

        Console.WriteLine("商品名称:{0}", title);
        Console.WriteLine("图片:{0}", picName);
        Console.WriteLine("价格:{0}", price);
    }
}

2、创建WebHandler.cs公共方法类

/// <summary>
/// Shared helpers for downloading pages and extracting values with regular expressions.
/// </summary>
public class WebHandler
{
    // Matches a non-negative amount with at most two decimal places, e.g. "99" or "99.50".
    // Built once instead of on every GetValidPrice call.
    private static readonly Regex PricePattern = new Regex(@"^\d+(\.\d{1,2})?$");

    /// <summary>
    /// Downloads the content at <paramref name="url"/> and returns it as text.
    /// </summary>
    /// <param name="url">Address to download; null or empty yields an empty string.</param>
    /// <param name="encoding">
    /// "UTF8" decodes the response as UTF-8; "Default" or any other value uses
    /// <see cref="Encoding.Default"/>.
    /// </param>
    /// <returns>The response body, or an empty string when the download fails.</returns>
    public static string GetHtmlStr(string url, string encoding)
    {
        if (String.IsNullOrEmpty(url))
        {
            return "";
        }

        Encoding ec = encoding == "UTF8" ? Encoding.UTF8 : Encoding.Default;

        try
        {
            WebRequest request = WebRequest.Create(url);

            // using blocks release the response, stream and reader even when
            // reading throws part-way through.
            using (WebResponse response = request.GetResponse())
            using (Stream datastream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(datastream, ec))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best-effort download: callers treat "" as "no content", so the
            // failure is recorded rather than propagated.
            System.Diagnostics.Trace.TraceWarning("GetHtmlStr failed for {0}: {1}", url, ex.Message);
            return "";
        }
    }

    /// <summary>
    /// Returns the text captured by the named group <c>Object</c> in the first
    /// case-insensitive match of <paramref name="pattern"/> within <paramref name="input"/>.
    /// </summary>
    /// <param name="input">Text to search.</param>
    /// <param name="pattern">Regular expression containing a <c>(?&lt;Object&gt;...)</c> group.</param>
    /// <returns>The captured text, or an empty string when either argument is null/empty or nothing matches.</returns>
    public static string GetRegexText(string input, string pattern)
    {
        if (String.IsNullOrEmpty(input) || String.IsNullOrEmpty(pattern))
        {
            return "";
        }

        Match match = Regex.Match(input, pattern, RegexOptions.IgnoreCase);
        return match.Success ? match.Groups["Object"].Value : "";
    }

    /// <summary>
    /// Converts a price string such as "99" or "99.50" to a decimal.
    /// </summary>
    /// <param name="strPrice">Price text using '.' as the decimal separator, with at most two decimals.</param>
    /// <returns>The parsed price, or 0 when the text is null, empty or not a valid price.</returns>
    public static decimal GetValidPrice(string strPrice)
    {
        if (String.IsNullOrEmpty(strPrice) || !PricePattern.IsMatch(strPrice))
        {
            return 0;
        }

        // The invariant culture fixes '.' as the decimal separator. Parsing with
        // the current culture misreads "12.50" as 1250 where '.' is the group
        // separator.
        decimal price;
        bool parsed = decimal.TryParse(
            strPrice,
            System.Globalization.NumberStyles.Number,
            System.Globalization.CultureInfo.InvariantCulture,
            out price);
        return parsed ? price : 0;
    }
}

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

pan_junbiao

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值