C#网页采集

/// <summary>
/// Fetches a product-list page, extracts every link found in its HTML and
/// writes the resulting URLs to textBox5, one per line.
/// </summary>
/// <remarks>
/// textBox2 supplies the site domain that is prepended to root-relative
/// links (those starting with "/"). textBox1 is treated as the list-page
/// URL: the validation message calls it the "网址" and the handler refuses
/// to run when it is empty.
/// NOTE(review): the original code validated textBox1 but then fetched a
/// hard-coded URL; confirm that textBox1 is the intended source.
/// </remarks>
/// <param name="sender">The control that raised the click event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "")
    {
        MessageBox.Show("网址和域名不能为空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        string Html = inc.GetHtml(textBox1.Text.Trim());

        // Each match is the whole attribute, e.g. href="http://example.com/a".
        ArrayList al = inc.GetMatchesStr(Html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])");

        StringBuilder sb = new StringBuilder();
        foreach (object link in al)
        {
            string a = link.ToString().Replace("\"", "").Replace("'", "");

            // Strip only the leading attribute name (tolerating spaces around '='),
            // so an "href=" inside a query string is left intact.
            a = Regex.Replace(a, @"^\s*href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();

            if (a.StartsWith("/", StringComparison.Ordinal))
            {
                a = textBox2.Text.Trim() + a;
            }

            // Only add a scheme when none is present; https links must not be
            // turned into "http://https://...".
            bool hasScheme = a.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                          || a.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
            {
                a = "http://" + a;
            }

            sb.Append(a + "\r\n");
        }

        // Output the extracted URLs, one per line.
        textBox5.Text = sb.ToString();

        MessageBox.Show("共提取" + al.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("提取出错!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}

 

/// <summary>
/// Scrapes each product page listed in textBox5 (one URL per line), pulls the
/// product name, model id, description and image URL out of the raw HTML by
/// string searching, stores the rows in the Tb_Product table and downloads
/// each product image into the local "Images" folder.
/// </summary>
/// <remarks>
/// The marker strings and offsets used for extraction are specific to the
/// target site's HTML and must be adapted for other sites.
/// NOTE(review): this handler reads textBox5/textBox2 and writes
/// toolStripStatusLabel1 and calls DataBind/ShowError; if it runs on a
/// BackgroundWorker thread those are cross-thread UI accesses — consider
/// passing the inputs via e.Argument and updating the UI from the
/// ProgressChanged/RunWorkerCompleted handlers.
/// </remarks>
/// <param name="sender">The BackgroundWorker running this handler.</param>
/// <param name="e">Work event data (unused).</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    string ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";

    // Make sure the download target exists before any data is touched;
    // otherwise every image download fails with a missing-directory error.
    System.IO.Directory.CreateDirectory(ImageDir);

    // Start from an empty product table.
    Database.ExecuteNonQuery("delete from Tb_Product");

    // Used for cancellation checks and progress reporting.
    BackgroundWorker worker = (BackgroundWorker)sender;

    // One URL per line. URLs are kept exactly as entered: paths can be
    // case-sensitive and may legitimately contain commas.
    string[] Urls = textBox5.Text.Trim().Split(new string[] { "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
    StringBuilder ErrorStr = new StringBuilder();
    DataTable dt2 = new DataTable();

    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn))
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        // Fill only to obtain the table schema; the builder attached above
        // supplies the insert command used by da.Update below.
        da.Fill(dt2);
        dt2.Rows.Clear();

        // Scrape each URL in turn; a failure on one page is recorded and the
        // loop moves on to the next.
        for (int i = 0; i < Urls.Length; i++)
        {
            if (worker.CancellationPending)
            {
                break;
            }

            try
            {
                string html = inc.GetHtml(Urls[i]);
                DataRow NewRow = dt2.NewRow();

                // Product name: the text between <title> and </title>.
                string ProductName = html.Substring(html.IndexOf("<title>") + 7);
                NewRow["ProductName"] = ProductName.Remove(ProductName.IndexOf("</title>")).Trim();

                // Model id: whatever follows "Model:" in the product name.
                string storedName = NewRow["ProductName"].ToString();
                NewRow["ModelId"] = storedName.Substring(storedName.IndexOf("Model:") + 6).Trim();

                // Description: from a fixed offset after "Product Details"
                // up to and including the next closing </table>.
                string Introduce = html.Substring(html.IndexOf("Product Details") + 26);
                Introduce = Introduce.Remove(Introduce.IndexOf("</table>") + 8).Trim();
                NewRow["Introduce"] = Introduce;

                // Image: the src of the first <img> after "align=center>",
                // prefixed with the domain from textBox2.
                string ProductImage = html.Substring(html.IndexOf("align=center><img") + 17);
                ProductImage = textBox2.Text.Trim() + ProductImage.Substring(ProductImage.IndexOf("src=\"") + 5);
                ProductImage = ProductImage.Remove(ProductImage.IndexOf("\""));

                string localImagePath = ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1);
                try
                {
                    inc.DownFile(ProductImage, localImagePath);
                }
                catch (Exception)
                {
                    // A failed image download is logged but does not discard the row.
                    ErrorStr.Append("下载图片失败,图片地址:" + localImagePath + "\r\n");
                }

                dt2.Rows.Add(NewRow);

                worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
                toolStripStatusLabel1.Text = "处理进度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();
            }
            catch (Exception err)
            {
                ErrorStr.Append("采集错误:" + err.Message + ";网址:" + Urls[i] + "\r\n");
            }
        }

        da.Update(dt2);
    }

    DataBind(dt2);
    ShowError(ErrorStr.ToString());
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// The source credits this routine (described there as generating static
/// HTML from an ASPX page) to Zheng Shaoqun.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The response body decoded with the charset the server declared,
/// or UTF-8 when no usable charset is declared.</returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);

    // Dispose the response and its stream even when reading fails, so the
    // underlying connection is released.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    {
        // The declared charset may be missing or unknown to this runtime;
        // fall back to UTF-8 instead of throwing before anything is read.
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (!string.IsNullOrEmpty(charset))
        {
            try
            {
                encoding = Encoding.GetEncoding(charset.Trim('"', '\'', ' '));
            }
            catch (ArgumentException)
            {
                encoding = Encoding.UTF8;
            }
        }

        using (StreamReader sr = new StreamReader(body, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}

/// <summary>
/// Returns every distinct substring of <paramref name="htmlCode"/> that
/// matches <paramref name="strRegex"/>, sorted with the default comparer.
/// Callers in this file use it to extract links from HTML.
/// </summary>
/// <param name="htmlCode">Text to search.</param>
/// <param name="strRegex">Regular expression; applied with IgnoreCase and Multiline.</param>
/// <returns>An ArrayList of the unique full-match strings in sorted order.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList al = new ArrayList();

    // Tracks values already added so duplicates are rejected in O(1) rather
    // than by rescanning the list for every match. Ordinal comparison keeps
    // the same notion of "duplicate" as string ==.
    System.Collections.Generic.HashSet<string> seen =
        new System.Collections.Generic.HashSet<string>(StringComparer.Ordinal);

    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    foreach (Match m in r.Matches(htmlCode))
    {
        string strNew = m.Value;
        if (seen.Add(strNew))
        {
            al.Add(strNew);
        }
    }

    al.Sort();

    return al;
}

/// <summary>
/// Downloads the resource at <paramref name="Url"/> over HTTP and writes it
/// to the file at <paramref name="Path"/>.
/// </summary>
/// <param name="Url">Absolute HTTP(S) URL of the file to download.</param>
/// <param name="Path">Destination file path; an existing file is replaced.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

    // Dispose the response and its stream so the connection is released even
    // when the copy fails part-way.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // FileMode.Create truncates an existing file; OpenOrCreate would leave
    // stale trailing bytes when the new content is shorter than the old.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[4096];
        int n;
        while ((n = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            fs.Write(buffer, 0, n);
        }
    }
}

转载于:https://www.cnblogs.com/hesijian/p/3230167.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值