用 C# 做一个网页数据采集工具

最新推荐文章于 2023-09-15 15:03:33 发布

zhaili1978

最新推荐文章于 2023-09-15 15:03:33 发布

阅读量912

点赞数

分类专栏：asp.net中的有些封装（方法） 文章标签：工具 c textbox string exception html

本文链接：https://blog.csdn.net/zhaili1978/article/details/6331401

版权

asp.net中的有些封装（方法）专栏收录该内容

102 篇文章 1 订阅

订阅专栏

/// <summary>
/// Downloads the page whose address is in <c>textBox1</c>, extracts every
/// <c>href</c> link from it and writes the links to <c>textBox5</c>, one per line.
/// Root-relative links ("/path") are prefixed with the domain from <c>textBox2</c>;
/// links without a scheme get "http://" prepended.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void button1_Click(object sender, EventArgs e)
{
    string listUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();
    if (listUrl == "" || domain == "")
    {
        MessageBox.Show("网址和域名不能为空！", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        // Fetch the page the user entered. The address was previously hard-coded,
        // which made the textBox1 validation above pointless.
        string Html = inc.GetHtml(listUrl);

        // href = "..." / href='...' ; the backslashes of \s had been lost in the original text.
        ArrayList al = inc.GetMatchesStr(Html, @"href\s*=\s*(?:['""\s](?<1>[^""']*)['""])");

        StringBuilder sb = new StringBuilder();
        foreach (object var in al)
        {
            // Each item is the whole match (href="..."): drop the quotes, then the attribute name.
            string a = var.ToString().Replace("\"", "").Replace("'", "");
            a = Regex.Replace(a, @"^href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();

            if (a.StartsWith("/", StringComparison.Ordinal))
            {
                a = domain + a;
            }

            // Do not turn an https link into "http://https://...".
            if (!a.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                && !a.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                a = "http://" + a;
            }

            sb.Append(a).Append("\r\n");
        }

        // One link per line.
        textBox5.Text = sb.ToString();

        MessageBox.Show("共提取" + al.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("提取出错！原因：" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}

/// <summary>
/// Background job: downloads every product page listed in <c>textBox5</c>
/// (one URL per line), cuts the product name, model id and introduction out of
/// the HTML, downloads the product image into the local "Images" folder and
/// saves the rows to the <c>Tb_Product</c> table. Existing rows in the table
/// are deleted first.
/// </summary>
/// <param name="sender">The <see cref="BackgroundWorker"/> running this job.</param>
/// <param name="e">Event data (not used).</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    BackgroundWorker worker = (BackgroundWorker)sender;
    StringBuilder ErrorStr = new StringBuilder();

    // NOTE(review): DoWork runs on a worker thread, yet this method reads textBox2/textBox5,
    // writes toolStripStatusLabel1 and calls DataBind/ShowError. Consider capturing the inputs
    // before RunWorkerAsync and moving UI updates to ProgressChanged / RunWorkerCompleted.

    // One URL per line. URLs are no longer lower-cased (paths can be case-sensitive) and
    // no longer split on ',' (a comma is a legal URL character).
    string[] Urls = textBox5.Text.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);
    string Domain = textBox2.Text.Trim();

    // Image downloads fail if the target directory is missing, so create it up front.
    string ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";
    System.IO.Directory.CreateDirectory(ImageDir);

    // Start from an empty product table.
    Database.ExecuteNonQuery("delete from Tb_Product");
    DataTable dt2 = new DataTable();

    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn))
    // The command builder looks unused but is required: it supplies the INSERT command for da.Update.
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        // Fill is only used to obtain the column layout; the rows themselves are discarded.
        da.Fill(dt2);
        dt2.Rows.Clear();

        for (int i = 0; i < Urls.Length; i++)
        {
            // Stop promptly on cancellation; rows gathered so far are still saved below.
            if (worker.CancellationPending)
            {
                break;
            }

            // Skip blank entries instead of returning, which used to drop all collected rows.
            string Url = Urls[i].Trim();
            if (Url == "")
            {
                continue;
            }

            try
            {
                string html = inc.GetHtml(Url);
                DataRow NewRow = dt2.NewRow();

                // Product name: the text between <title> and </title>.
                string ProductName = html.Substring(RequireIndex(html, "<title>") + 7);
                ProductName = ProductName.Substring(0, RequireIndex(ProductName, "</title>")).Trim();
                NewRow["ProductName"] = ProductName;

                // Model id: whatever follows "Model:" in the title; empty when the title has no such marker.
                int ModelAt = ProductName.IndexOf("Model:", StringComparison.Ordinal);
                NewRow["ModelId"] = ModelAt >= 0 ? ProductName.Substring(ModelAt + 6).Trim() : "";

                // Introduction: from 26 characters past the start of "Product Details" up to and
                // including the next </table>. The offset is specific to the target site's markup
                // and must be adjusted per site.
                string Introduce = html.Substring(RequireIndex(html, "Product Details") + 26);
                Introduce = Introduce.Substring(0, RequireIndex(Introduce, "</table>") + 8).Trim();
                NewRow["Introduce"] = Introduce;

                // Image: the src="..." value of the first <img> that follows "align=center>".
                string ProductImage = html.Substring(RequireIndex(html, "align=center><img") + 17);
                ProductImage = ProductImage.Substring(RequireIndex(ProductImage, "src=\"") + 5);
                ProductImage = Domain + ProductImage.Substring(0, RequireIndex(ProductImage, "\""));

                string ImageFile = ImageDir + ProductImage.Substring(ProductImage.LastIndexOf('/') + 1);
                try
                {
                    inc.DownFile(ProductImage, ImageFile);
                }
                catch (Exception imageErr)
                {
                    // A failed image download is reported but does not discard the product row.
                    ErrorStr.Append("下载图片失败，图片地址：" + ProductImage + "；原因：" + imageErr.Message + "\r\n");
                }

                dt2.Rows.Add(NewRow);

                worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
                toolStripStatusLabel1.Text = "处理进度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();
            }
            catch (Exception err)
            {
                // One bad page must not abort the whole run; record it and move on.
                ErrorStr.Append("采集错误：" + err.Message + ";网址：" + Url + "\r\n");
            }
        }

        da.Update(dt2);
    }

    DataBind(dt2);
    ShowError(ErrorStr.ToString());
}

/// <summary>
/// Returns the position of <paramref name="marker"/> in <paramref name="text"/>, or throws
/// so that a page with unexpected markup is reported instead of yielding garbage data.
/// </summary>
private static int RequireIndex(string text, string marker)
{
    int index = text.IndexOf(marker, StringComparison.Ordinal);
    if (index < 0)
    {
        throw new InvalidOperationException("未找到标记：" + marker);
    }
    return index;
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// The text is decoded with the character set announced by an HTTP response;
/// when none is announced, or the name is not recognised, UTF-8 is used.
/// </summary>
/// <param name="url">Absolute address of the page to fetch.</param>
/// <returns>The complete response body.</returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);

    // Dispose response, stream and reader even when reading fails.
    using (WebResponse response = request.GetResponse())
    {
        // Only HTTP responses carry a character set; other schemes fall back to UTF-8.
        HttpWebResponse httpResponse = response as HttpWebResponse;
        string charset = httpResponse != null ? httpResponse.CharacterSet : null;

        Encoding encoding = Encoding.UTF8;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                // Some servers wrap the charset name in quotes.
                encoding = Encoding.GetEncoding(charset.Trim().Trim('"', '\''));
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 fallback.
            }
            catch (NotSupportedException)
            {
                // Charset not available on this platform: keep the UTF-8 fallback.
            }
        }

        using (Stream body = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(body, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}

     // 提取HTML代码中的网址
        public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
        {
            ArrayList al = new ArrayList();

Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
MatchCollection m = r.Matches(htmlCode);

for (int i = 0; i < m.Count; i++)
 {
 bool rep = false;
 string strNew = m[i].ToString();

                // 过滤重复的URL
                foreach (string str in al)
                {
                    if (strNew == str)
                    {
                        rep = true;
                        break;
                    }
                }

if (!rep) al.Add(strNew);
}

al.Sort();

return al;
}

/// <summary>
/// Downloads the resource at <paramref name="Url"/> and saves it to the file
/// <paramref name="Path"/>, replacing any existing file.
/// </summary>
/// <param name="Url">Absolute address of the resource to download.</param>
/// <param name="Path">Destination file; its directory must already exist.</param>
public static void DownFile(string Url, string Path)
{
    WebRequest request = WebRequest.Create(Url);

    // Dispose response and both streams even when the transfer fails part-way.
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // FileMode.Create truncates an existing file; OpenOrCreate left stale bytes behind
    // whenever the new content was shorter than the old file.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[8192];
        int n;
        while ((n = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            fs.Write(buffer, 0, n);
        }
    }
}

zhaili1978

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
用C＃做一个网页数据采集工具

 //提取产品列表页中产品最终页的网页 private void button1_Click(object sender, EventArgs e) { if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "") { MessageBox.Show("网址和域名不
复制链接

扫一扫

专栏目录