《程序员的第一年》---------- 【抓取网页数据】定时查寻淘宝搜索结果并用excel记录下来(HttpWebRequest与正则等的使用)


最近在看一些关于HttpWebRequest与正则来抓取网页数据的知识,就做了一个搜索淘宝电商网站关键词(例如,1手机,2手机配件,3手机外壳....),

    /// 把这些关键词的结果数量,在Excel中输出

功能很简单高手路过别喷

不过查寻的速度有点慢,有哪位高手能指点一下嘛?

 


 

界面如下:






代码如下:


using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Text.RegularExpressions;

namespace 自动搜索记录
{
    public partial class Main : Form
    {
        public Main()
        {
            InitializeComponent();
        }

        private void btnAdd_Click(object sender, EventArgs e)
        {
            if (!string.IsNullOrEmpty(txtKeyWord.Text))
            {
                MyItem item = new MyItem();
                item.id = (lboxKeyWord.Items.Count+1).ToString();
                item.keyWord=txtKeyWord.Text.ToString();
                lboxKeyWord.Items.Add(item);
            //    MessageBox.Show("添加成功!");
            }
            else
            {
                MessageBox.Show("关键字不能为空!");
            }
        }

        private void button1_Click(object sender, EventArgs e)
        {
            lboxResultShow.Items.Clear();
            foreach (MyItem item in lboxKeyWord.Items)
            {
                lboxResultShow.Items.Add(GetFindResult(item.keyWord));
            }
        }

        /// <summary>
        /// 取得查寻结果
        /// </summary>
        /// <param name="keyWord">关键字</param>
        private static string GetFindResult(string keyWord)
        {
            keyWord = System.Web.HttpUtility.UrlEncode(keyWord, System.Text.UnicodeEncoding.GetEncoding("GB2312")).ToUpper();
            // System.Diagnostics.Process.Start("http://s.taobao.com/search?q=" + keyWord);//弹出网页 
            comm comm = new comm();
            string url = "http://s.taobao.com/search?q=" + keyWord;//根据要求更改网址
            string html = comm.NoLoginGetHtml(url);//取得当前页HTML
            if (string.IsNullOrEmpty(html))
            {
                Console.WriteLine("查找有误");
                return "";
            }
            string resultInfo = comm.GetElementsByClass(html, "result-info")[0].ToString();
             <li class="result-info">3180078件宝贝</li>  
            string regex = "<li class=\"result-info\">(?<total>[^<]*)件宝贝</li>";
            Regex rege = new Regex(regex);
            int total = Convert.ToInt32(rege.Match(resultInfo).Groups["total"].ToString());
            keyWord = System.Web.HttpUtility.UrlDecode(keyWord, System.Text.UnicodeEncoding.GetEncoding("GB2312"));
            return keyWord + "---总共有:" + total + "个查寻结果";
            //  Console.WriteLine(keyWord + "---" + total + "个结果");
        }

        private bool isCloseTime;
        private void btnFindByTime_Click(object sender, EventArgs e)
        {
            System.Timers.Timer t = new System.Timers.Timer(1000 * Convert.ToInt32(txtTime.Text));//实例化Timer类,设置间隔时间为5000毫秒;
            if (btnFindByTime.Text == "关闭定时")
            {
                isCloseTime = true;
                btnFindByTime.Text = "定时查寻";
            }
            else
            {
                isCloseTime = false;
                this.btnAdd.Enabled = false;
                t.Elapsed += new System.Timers.ElapsedEventHandler(theout);//到达时间的时候执行事件;
                t.AutoReset = true;//设置是执行一次(false)还是一直执行(true);
                t.Enabled = true;//是否执行System.Timers.Timer.Elapsed事件;
                Control.CheckForIllegalCrossThreadCalls = false;
                btnFindByTime.Text = "关闭定时";
            }
        }
         
        /// <summary>
        /// 定时查寻事件
        /// </summary> 
        public void theout(object source, System.Timers.ElapsedEventArgs e)
        {
            if ( isCloseTime)
            {
                System.Timers.Timer timer = (System.Timers.Timer)source;
                timer.Enabled = false;
                this.btnAdd.Enabled = true;
            }
            else
            {
                lboxResultShow.Items.Clear();
                foreach (MyItem item in lboxKeyWord.Items)
                {
                    lboxResultShow.Items.Add(GetFindResult(item.keyWord));
                }
            } 
        }
        private void btnSaveExcel_Click(object sender, EventArgs e)
        {
            //建立excel对象
            Microsoft.Office.Interop.Excel.Application excel = new Microsoft.Office.Interop.Excel.Application();
            excel.Application.Workbooks.Add(true); 
            excel.Cells[1, 1] = "关键字";
            excel.Cells[1, 2] = "查寻结果";
            //填充数据     
            for (int i = 0; i < lboxResultShow.Items.Count; i++)
            { 
                excel.Cells[i + 2, 1] = "" +this.lboxKeyWord.Items[i].ToString();
                excel.Cells[i + 2, 2] = lboxResultShow.Items[i].ToString();
            }
            excel.Visible = true;
        }
    }

    public class MyItem : object
    {
        public string keyWord;
        public string id;
        public override string ToString()
        {
            // TODO:  添加 MyItem.ToString 实现
            return keyWord;
        }
    }

}


using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Drawing;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;



namespace 自动搜索记录
{ 
    public  class comm
    {

        /// <summary>
        /// 根据url取得图片
        /// </summary>
        /// <param name="url"></param>
        /// <returns></returns>
        private   Image GetImageByUrl(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(new Uri(url)); 
            HttpWebResponse res = (HttpWebResponse)req.GetResponse();//获取服务器返回的资源
            Stream stream = res.GetResponseStream();
            Bitmap bitmap = new Bitmap(stream);
            stream.Close();
            return bitmap;

        }

        /// <summary>
        /// 取得HTML中所有图片的 URL。
        /// </summary>
        /// <param name="sHtmlText">HTML代码</param>
        /// <returns>图片的URL列表</returns> 
        public   string[] GetHtmlImageUrlList(string sHtmlText)
        {
            // 定义正则表达式用来匹配 img 标签
            Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s]*=[\s]*[""']?[\s]*(?<imgUrl>[^\s""'<>]*)[^<>]*?/?[\s]*>", RegexOptions.IgnoreCase);

            // 搜索匹配的字符串
            MatchCollection matches = regImg.Matches(sHtmlText);
            int i = 0;
            string[] sUrlList = new string[matches.Count];

            // 取得匹配项列表
            foreach (Match match in matches)
            {
                string imgUrl = match.Groups["imgUrl"].Value;
                sUrlList[i++] = imgUrl;
            }
            return sUrlList;
        }


        /// <summary>
        /// 保存图片
        /// </summary>
        /// <param name="url"></param>
        public   void SaveImage(string url, string path)
        {
            Image img = GetImageByUrl(url);

            if (!Directory.Exists(Path.GetDirectoryName(path)))
            {
                Directory.CreateDirectory(Path.GetDirectoryName(path));
            }
            //img.Save(GetNameByTime()+".jpg");

            img.Save(path);
        }

        /// <summary>
        /// 打开资源管理器并指定保存的文件
        /// </summary>
        /// <param name="Path"></param>
        public void OpenExplorerFile(string Path)
        {
            System.Diagnostics.ProcessStartInfo psi = new System.Diagnostics.ProcessStartInfo("Explorer.exe");
            psi.Arguments = "/e,/select," + Path;
            System.Diagnostics.Process.Start(psi);
        }

        /// <summary>
        /// 获取指定ID的标签内容
        /// </summary>
        /// <param name="html">HTML源码</param>
        /// <param name="id">标签ID</param>
        /// <returns></returns>
        public string GetElementById(string html, string id)
        {
            string pattern = @"<([a-z]+)(?:(?!id)[^<>])*id=([""']?){0}\2[^>]*>(?>(?<o><\1[^>]*>)|(?<-o></\1>)|(?:(?!</?\1).))*(?(o)(?!))</\1>";
            pattern = string.Format(pattern, Regex.Escape(id));
            Match match = Regex.Match(html, pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
            return match.Success ? match.Value : "";
        }

        /// <summary>
        /// 通过class属性获取对应标签集合
        /// </summary>
        /// <param name="html">HTML源码</param>
        /// <param name="className">class值</param>
        /// <returns></returns>
        public   string[] GetElementsByClass(string html, string className)
        {
            return GetElements(html, "", className);
        }
        /// <summary>
        /// 根据正则获取内容
        /// </summary>
        /// <param name="text">内容</param>
        /// <param name="pat">正则</param>
        public   string[] GetListByHtml(string text, string pat)
        {
            List<string> list = new List<string>();
            Regex r = new Regex(pat, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
            Match m = r.Match(text);
            //int matchCount = 0;
            while (m.Success)
            {
                list.Add(m.Value);
                m = m.NextMatch();
            }
            return list.ToArray();
        }

        /// <summary>
        /// 通过标签名获取标签集合
        /// </summary>
        /// <param name="html">HTML源码</param>
        /// <param name="tagName">标签名(如div)</param>
        /// <returns></returns>
        public   string[] GetElementsByTagName(string html, string tagName)
        {
            return GetElements(html, tagName, "");
        }
        /// <summary>
        /// 通过同时指定标签名+class值获取标签集合(内部方法)
        /// </summary>
        /// <param name="html"></param>
        /// <param name="tagName"></param>
        /// <param name="className"></param>
        /// <returns></returns>
        private   string[] GetElements(string html, string tagName, string className)
        {
            string pattern = "";
            if (tagName != "" && className != "")
            {
                pattern = @"<({0})(?:(?!class)[^<>])*class=([""']?){1}\2[^>]*>(?>(?<o><\1[^>]*>)|(?<-o></\1>)|(?:(?!</?\1).))*(?(o)(?!))</\1>";
                pattern = string.Format(pattern, Regex.Escape(tagName), Regex.Escape(className));
            }
            else if (tagName != "")
            {
                pattern = @"<({0})(?:[^<>])*>(?>(?<o><\1[^>]*>)|(?<-o></\1>)|(?:(?!</?\1).))*(?(o)(?!))</\1>";
                pattern = string.Format(pattern, Regex.Escape(tagName));
            }
            else if (className != "")
            {
                pattern = @"<([a-z]+)(?:(?!class)[^<>])*class=([""']?){0}\2[^>]*>(?>(?<o><\1[^>]*>)|(?<-o></\1>)|(?:(?!</?\1).))*(?(o)(?!))</\1>";
                pattern = string.Format(pattern, Regex.Escape(className));
            }
            if (pattern == "")
            {
                return new string[] { };
            }
            List<string> list = new List<string>();
            Regex reg = new Regex(pattern, RegexOptions.Singleline | RegexOptions.IgnoreCase);
            Match match = reg.Match(html);
            while (match.Success)
            {
                list.Add(match.Value);
                match = reg.Match(html, match.Index + match.Length);
            }
            return list.ToArray();
        }

        /// <summary>
        /// 无需登录 直接查看网页源码
        /// </summary>
        /// <param name="Url">被查看的地址</param>
        /// <returns></returns>
        public string  NoLoginGetHtml(string Url)
        {
            try
            { 
                string URI = "http://";
                if (Url.IndexOf("http://") >= 0 || Url.IndexOf("https://") >= 0)
                {
                    URI = Url;
                }
                else
                {
                    URI += Url;
                } 
                  
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URI); 
                request.Method = "GET";
                request.UserAgent = ".NET Framework Test Client";
                request.KeepAlive = false;
                //声明一个HttpWebRequest请求 
                request.Timeout = 300000;
                //设置连接超时时间  
                request.Headers.Set("Pragma", "no-cache");
                // 接收返回的页面
                HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                //System.Threading.Thread.Sleep(5000);
                Stream responseStream = response.GetResponseStream();
                StreamReader reader = new System.IO.StreamReader(responseStream, Encoding.GetEncoding("GB2312"));
                return  reader.ReadToEnd();
                 
            }
            catch  
            {
                return "";
            }

        }

    }
}




转载请注明来处(Taomeng)









  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 5
    评论
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值