用C#做一个网页数据采集工具

//提取产品列表页中各产品最终页的网址
        private void button1_Click(object sender, EventArgs e)
        {
            if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "")
            {
                MessageBox.Show("网址和域名不能为空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
                return;
            }
            try
            {
                string Html = inc.GetHtml("http://www.shaoqun.com/");
                //ArrayList al = inc.GetMatchesStr(Html, "<a[^>]*?>.*?</a>");
                ArrayList al = inc.GetMatchesStr(Html, @"href/s*=/s*(?:[/'/""/s](?<1>[^/""/']*)[/'/""])");//提取链接

                StringBuilder sb = new StringBuilder();
                foreach (object var in al)
                {
                    string a = var.ToString().Replace("/"", "").Replace("'", "");
                    a = Regex.Replace(a, "href=", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                    if (a.StartsWith("/"))
                        a = textBox2.Text.Trim() + a;
                    if (!a.StartsWith("http://"))
                        a = "http://" + a;
                    sb.Append(a + "/r/n");
                }
                textBox5.Text = sb.ToString();//把提取到网址输出到一个textBox,每个链接占一行

 

                MessageBox.Show("共提取" + al.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);

            }
            catch (Exception err)
            {
                MessageBox.Show("提取出错!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
            }

        }

 


 //把采集的产品页面html代码进行字符串处理,提取需要的代码,最后保存到本地一个access数据库中,同时提取产品图片地址并自动下载图片到本地images文件夹下
        private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
        {
            //填充产品表
            Database.ExecuteNonQuery("delete from Tb_Product");
            DataTable dt2 = new DataTable();
            OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings);
            OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn);
            OleDbCommandBuilder cb = new OleDbCommandBuilder(da);
            da.Fill(dt2);
            dt2.Rows.Clear();

            BackgroundWorker worker = (BackgroundWorker)sender;//这个是做一个进度条

            string[] Urls = textBox5.Text.Trim().ToLower().Replace("/r/n", ",").Split(',');
            DataTable dt = new DataTable();
            StringBuilder ErrorStr = new StringBuilder();
            string html = "", ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images//";

            //循环每次采集网址
            for (int i = 0; i < Urls.Length; i++)
            {
                try
                {
                    if (!worker.CancellationPending)
                    {
                        if (Urls[i] == "")
                            return;
                        html = inc.GetHtml(Urls[i]);//获取该url的html代码
                     DataRow NewRow = dt2.NewRow();

                        //产品名
                        string ProductName = html.Substring(html.IndexOf("<title>") + 7);
                        NewRow["ProductName"] = ProductName.Remove(ProductName.IndexOf("</title>")).Trim();

                        //产品编号
                        NewRow["ModelId"] = NewRow["ProductName"].ToString().Substring(NewRow["ProductName"].ToString().IndexOf("Model:") + 6).Trim();

                        //产品介绍,这些都是根据不同网站的html做相应的修改
                        string Introduce = html.Substring(html.IndexOf("Product Details") + 26);
                        Introduce = Introduce.Remove(Introduce.IndexOf("</table>") + 8).Trim()

                        NewRow["Introduce"] = Introduce;

 

 

                            //下载图片
                            string ProductImage = html.Substring(html.IndexOf("align=center><img") + 17);
                            ProductImage = textBox2.Text.Trim() + ProductImage.Substring(ProductImage.IndexOf("src=/"") + 5);
                            ProductImage = ProductImage.Remove(ProductImage.IndexOf("/""));
                            try
                            {
                                inc.DownFile(ProductImage, ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1));
                            }
                            catch (Exception)
                            {
                                ErrorStr.Append("下载图片失败,图片地址:" + ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1) + "/r/n");
                            }


                        dt2.Rows.Add(NewRow);

                        //Thread.Sleep(100);
                        worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
                        toolStripStatusLabel1.Text = "处理进度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();//进度条
                    }

                }
                catch (Exception err)
                {
                    ErrorStr.Append("采集错误:" + err.Message + ";网址:" + Urls[i] + "/r/n");
                }
            }
            da.Update(dt2);
            DataBind(dt2);
            ShowError(ErrorStr.ToString());
        }

         /// <summary>
        /// ASPX页面生成静态Html页面

        /// </summary>
        public static string GetHtml(string url)
        {
            StreamReader sr = null;
            string str = null;
            //读取远程路径
            WebRequest request = WebRequest.Create(url);
            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
            sr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(response.CharacterSet));
            str = sr.ReadToEnd();
            sr.Close();
            return str;
        }

     // 提取HTML代码中的网址 
        public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
        {
            ArrayList al = new ArrayList();

            Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
            MatchCollection m = r.Matches(htmlCode);

            for (int i = 0; i < m.Count; i++)
            {
                bool rep = false;
                string strNew = m[i].ToString();

                // 过滤重复的URL 
                foreach (string str in al)
                {
                    if (strNew == str)
                    {
                        rep = true;
                        break;
                    }
                }

                if (!rep) al.Add(strNew);
            }

            al.Sort();

            return al;
        }

        public static void DownFile(string Url, string Path)
        {

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
            Stream stream = response.GetResponseStream();
            long size = response.ContentLength;
            //创建文件流对象
            using (FileStream fs = new FileStream(Path, FileMode.OpenOrCreate, FileAccess.Write))
            {
                byte[] b = new byte[1025];
                int n = 0;
                while ((n = stream.Read(b, 0, 1024)) > 0)
                {
                    fs.Write(b, 0, n);
                }
            }
        }

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
预览图片见:http://www.cnblogs.com/xxpyeippx/archive/2008/03/31/1131211.html运行环境windows nt/xp/2003 or above.net Framework 1.1SqlServer 2000 开发环境 VS 2003目的学习了网络编程,总要点什么东西才好。于是想到要一个网页内容采集器。作者主页: http://www.fltek.com.cn使用方式测试数据采用自cnBlog。见下图用户首先填写“起始网页”,即从哪一页开始采集。然后填写数据库连接字符串,这里是定义了采集到的数据插入到哪个数据库,后面选择表名,不必说了。网页编码,不出意外的话,中国大陆都可以采用UTF-8爬取文件名的正则:呵呵 这个工具明显是给编程人员用的。正则都要直接填写啦。比如说cnblogs的都是数字的,所以写了\d建表帮助:用户指定要建立几个varchar型的,几个text型的,主要是放短数据和长数据啊。如果你的表里本来就有列,那就免啦。程序里面没有验证哦。网页设置里面:采集内容前后标记:比如说都有 xxx,如果我要采集xxx就写“到”,意思,当然就是到之间的内容啦。后面的几个文本框是显示内容的。点击“获取URL”可以查看它捕获的Url对不对的。点击“采集”,可以把采集内容放到数据库,然后就用 Insert xx () (select xx) 可以直接插入目标数据了。程序代码量非常小(也非常简陋),需要的改动一下啦。不足 应用到了正则表达式、网络编程由于是最简单的东西,所以没有用多线程,没有用其他的优化方法,不支持分页。测试了一下,获取38条数据,用了700M内存啊。。。。如果有用的人 ,可以改一下使用啦。方便程序员用,免写很多代码。Surance Yin@ Surance Center 转载请注明出处
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值