读取淘宝页面字节流，提取宝贝图片地址、宝贝标题和宝贝价格

最新推荐文章于 2019-12-13 16:09:38 发布

lbx541575387

最新推荐文章于 2019-12-13 16:09:38 发布

阅读量2.3k

点赞数

分类专栏： asp.net c# 文章标签： string regex url null byte class

本文链接：https://blog.csdn.net/lbx541575387/article/details/6755902

版权

asp.net c# 专栏收录该内容

22 篇文章 0 订阅

订阅专栏

/// <summary>
/// Helpers that download a Taobao item page and pull the item's image URL,
/// title and price out of the raw HTML with regular expressions.
/// </summary>
public static class taobao_message
{
    // The patterns never change, so they are built once instead of on every call.

    /// <summary>Captures the <c>src</c> value of the tag carrying the J_ImgBooth marker in the named group "imgUrl".</summary>
    private static readonly Regex img_regex = new Regex(
        @"J_ImgBooth\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>");

    /// <summary>Captures the text of an h3 element (optionally wrapped in an anchor) in group 2.</summary>
    private static readonly Regex title_regex = new Regex("<h3>(<a[^>]*>)?([^<]*)(</a>)?</h3>");

    /// <summary>Captures the text following the tag carrying the J_StrPrice marker in group 1.</summary>
    private static readonly Regex price_regex = new Regex("J_StrPrice[^>]*>([^<>]*)(</)");

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and decodes the bytes as GB2312 text.
    /// </summary>
    /// <param name="url">
    /// Page address. A leading "http://" or "https://" is honoured as given;
    /// when no scheme is present, "http://" is prepended.
    /// </param>
    /// <returns>The page content decoded as a string.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
    public static string webclinet_content(string url)
    {
        if (url == null)
        {
            throw new System.ArgumentNullException("url");
        }

        // WebClient is IDisposable; release it as soon as the download finishes.
        using (WebClient client = new WebClient())
        {
            byte[] page = client.DownloadData(normalize_url(url));
            // The Taobao pages this helper targets are served as GB2312.
            return System.Text.Encoding.GetEncoding("GB2312").GetString(page);
        }
    }

    /// <summary>
    /// Reads the image URL, title and price of a Taobao item page.
    /// </summary>
    /// <param name="url">Item page address.</param>
    /// <returns>An array of exactly three strings: { image URL, title, price }.</returns>
    public static string[] baobei_mess(string url)
    {
        string content = webclinet_content(url);
        return new string[]
        {
            get_taobao(content, 1),
            get_taobao(content, 2),
            get_taobao(content, 3)
        };
    }

    /// <summary>
    /// Extracts one kind of item information from page HTML.
    /// </summary>
    /// <param name="content">The HTML text to search.</param>
    /// <param name="type">What to extract: 0 = nothing; 1 = item image URL; 2 = item title; 3 = item price.</param>
    /// <returns>
    /// The captured text of every match, concatenated in document order.
    /// An empty string when nothing matches, when <paramref name="type"/> is 0
    /// or unrecognised, or when <paramref name="content"/> is null or empty.
    /// </returns>
    public static string get_taobao(string content, int type)
    {
        Regex pattern;
        string group_name;
        switch (type)
        {
            case 1: pattern = img_regex; group_name = "imgUrl"; break;
            case 2: pattern = title_regex; group_name = "2"; break;
            case 3: pattern = price_regex; group_name = "1"; break;
            // Type 0 and any unknown type extract nothing.
            default: return "";
        }

        if (string.IsNullOrEmpty(content))
        {
            return "";
        }

        // Every match contributes, not only the first one; callers rely on
        // the concatenated form.
        System.Text.StringBuilder result = new System.Text.StringBuilder();
        foreach (Match match in pattern.Matches(content))
        {
            result.Append(match.Groups[group_name].Value);
        }
        return result.ToString();
    }

    /// <summary>
    /// Returns <paramref name="url"/> unchanged when it already starts with an
    /// http/https scheme; otherwise prepends "http://".
    /// </summary>
    private static string normalize_url(string url)
    {
        // Only a leading scheme is inspected, so "http://" appearing later in
        // the address (for example inside a query string) is left intact.
        if (url.StartsWith("http://", System.StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https://", System.StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }
        return "http://" + url;
    }
}