using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Tool;
using System.Net;
using System.Text.RegularExpressions;
using System.Threading;
namespace Search
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form. Control setup is delegated to InitializeComponent, which is
/// defined outside this view (presumably the designer-generated half of this
/// partial class).
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
/// <summary>
/// First-in, first-out queue holding the URLs that are waiting to be visited.
/// </summary>
public class Queue
{
    // Backing store: a linked list, appended at the tail and consumed from the head.
    private LinkedList<string> queue = new LinkedList<string>();

    /// <summary>Appends <paramref name="t"/> to the tail of the queue.</summary>
    /// <param name="t">The value to enqueue.</param>
    public void enQueue(string t)
    {
        queue.AddLast(t);
    }

    /// <summary>
    /// Removes and returns the element at the head of the queue, i.e. the one
    /// that was enqueued earliest.
    /// </summary>
    /// <returns>The oldest element, or <c>null</c> when the queue is empty.</returns>
    public string deQueue()
    {
        // An empty list has no First node; report "nothing to dequeue" as null
        // instead of failing with a NullReferenceException.
        if (queue.Count == 0)
        {
            return null;
        }

        // Take from the head (not the tail) so the container really is FIFO.
        string head = queue.First.Value;
        queue.RemoveFirst();
        return head;
    }

    /// <summary>Reports whether the queue currently holds no elements.</summary>
    /// <returns><c>true</c> when the queue is empty; otherwise <c>false</c>.</returns>
    public bool isQueueEmpty()
    {
        return queue.Count == 0;
    }

    /// <summary>Reports whether <paramref name="t"/> is currently in the queue.</summary>
    /// <param name="t">The value to look for.</param>
    /// <returns><c>true</c> when the value is present; otherwise <c>false</c>.</returns>
    public bool contians(string t)
    {
        return queue.Contains(t);
    }

    /// <summary>Returns the number of elements currently in the queue.</summary>
    public int getcount()
    {
        return queue.Count;
    }
}
/// <summary>
/// Static bookkeeping for a crawl: the set of URLs already visited and the
/// queue of URLs still waiting to be visited.
/// </summary>
public class LinkQueue
{
    // URLs that have already been visited.
    private static ISet<string> seen = new HashSet<string>();

    // URLs that are waiting to be visited.
    private static Queue pending = new Queue();

    /// <summary>Returns the queue of URLs that are waiting to be visited.</summary>
    public static Queue getUnVisitedUrl()
    {
        return pending;
    }

    /// <summary>Records <paramref name="url"/> as visited.</summary>
    public static void addVisitedUrl(String url)
    {
        seen.Add(url);
    }

    /// <summary>Removes <paramref name="url"/> from the visited set.</summary>
    public static void removeVisitedUrl(String url)
    {
        seen.Remove(url);
    }

    /// <summary>Dequeues the next URL from the waiting queue.</summary>
    /// <returns>Whatever the waiting queue's deQueue returns, typed as Object.</returns>
    public static Object unVisitedUrlDeQueue()
    {
        string next = pending.deQueue();
        return next;
    }

    /// <summary>
    /// Enqueues <paramref name="url"/> for a later visit, unless it is null,
    /// blank after trimming, already visited, or already waiting. This keeps
    /// each URL from being queued more than once.
    /// </summary>
    public static void addUnvisitedUrl(String url)
    {
        if (url == null || url.Trim().Length == 0)
        {
            return;
        }

        if (seen.Contains(url) || pending.contians(url))
        {
            return;
        }

        pending.enQueue(url);
    }

    /// <summary>Returns how many URLs have been recorded as visited.</summary>
    public static int getVisitedUrlNum()
    {
        int visitedCount = seen.Count;
        return visitedCount;
    }

    /// <summary>Reports whether the waiting queue is empty.</summary>
    public static bool unVisitedUrlsEmpty()
    {
        bool nothingPending = pending.isQueueEmpty();
        return nothingPending;
    }
}
// Array with room for 100 URL strings. Nothing in the visible part of this
// partial class reads or writes it.
private string[] urlarr = new string[100];
/// <summary>
/// Downloads the page whose address is in textBox1 (falling back to
/// http://image.baidu.com/ when the box is empty), lists the links and the
/// image sources found in it in richTextBox1, hands every image to
/// zzHttp.downfile with d:\pic\ as the target folder, and shows the number of
/// images handed over in label1.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    zzHttp http = new zzHttp();
    CookieContainer cookie = new CookieContainer();
    string url = textBox1.Text != "" ? textBox1.Text : "http://image.baidu.com/";
    string content = http.SendDataByGET(url, "", ref cookie);
    string baseUri = Utility.GetBaseUri(url);
    string[] links = Parser.ExtractLinks(baseUri, content);

    // Build the link listing once and append it in a single call; repeatedly
    // assigning Text re-copies the whole box content for every link.
    StringBuilder linkLog = new StringBuilder();
    foreach (string link in links)
    {
        linkLog.Append(link);
        linkLog.Append("\n");
    }
    richTextBox1.AppendText(linkLog.ToString());

    // Captures the src attribute value of every <img> tag into the group "imgUrl".
    Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
    MatchCollection matches = regImg.Matches(content);
    Queue que = new Queue();
    foreach (Match match in matches)
    {
        que.enQueue(match.Groups["imgUrl"].Value);
    }

    // Drain the queue until it is empty. Comparing a rising index against the
    // shrinking queue count (as a counted for-loop would) stops half way.
    int k = 0;
    while (!que.isQueueEmpty())
    {
        string picurl = que.deQueue();
        if (string.IsNullOrEmpty(picurl))
        {
            // The pattern can match an <img> whose src is empty; nothing to fetch.
            continue;
        }

        richTextBox1.AppendText(picurl + "\n");

        // The file name is the text after the last '/' of the image URL.
        string[] s = picurl.Split('/');
        string picname = s[s.Length - 1];
        if (picname.Length == 0)
        {
            // URL ends with '/', so there is no file name to save under.
            continue;
        }

        zzHttp.downfile(picurl, picname, @"d:\pic\");
        k++;
    }
    label1.Text = k + "张";
}
// File extensions of resources the crawler does not follow.
private static readonly string[] SkippedExtensions = { ".css", ".js", ".gif", ".jpg", ".png", ".jpeg" };

// Returns true when the path part of link (query string and fragment removed)
// ends with one of SkippedExtensions, compared case-insensitively.
private static bool IsStaticResource(string link)
{
    int cut = link.IndexOfAny(new char[] { '?', '#' });
    string path = cut >= 0 ? link.Substring(0, cut) : link;
    foreach (string extension in SkippedExtensions)
    {
        if (path.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }
    return false;
}

/// <summary>
/// Crawl loop. Seeds LinkQueue with a fixed start URL, then repeatedly takes a
/// waiting URL, records it as visited, downloads it, and enqueues the links
/// found in the page. The loop continues while URLs are waiting and the
/// visited count is no greater than 1000. Progress is reported through
/// Add2Message (counters) and AddMessage (each accepted link).
/// </summary>
void search()
{
    LinkQueue.addUnvisitedUrl("http://blog.csdn.net/zhujunxxxxx/");
    while (!LinkQueue.unVisitedUrlsEmpty()
        && LinkQueue.getVisitedUrlNum() <= 1000)
    {
        // Take the next waiting URL.
        String visitUrl = (String)LinkQueue.unVisitedUrlDeQueue();
        if (visitUrl == null)
        {
            continue;
        }

        // Record the URL as visited before fetching so that a page which fails
        // to download is not queued again by a later link to it.
        LinkQueue.addVisitedUrl(visitUrl);
        Add2Message("已访问数目:" + LinkQueue.getVisitedUrlNum() + ",count=" + LinkQueue.getUnVisitedUrl().getcount());

        // Download the page.
        string content;
        try
        {
            zzHttp downLoader = new zzHttp();
            CookieContainer cookie = new CookieContainer();
            content = downLoader.SendDataByGET(visitUrl, "", ref cookie);
        }
        catch (WebException ex)
        {
            // NOTE(review): assumes zzHttp surfaces network failures as
            // WebException - confirm. One unreachable page should not end the
            // whole crawl (an unhandled exception here terminates the thread).
            AddMessage(visitUrl + " : " + ex.Message);
            continue;
        }

        if (string.IsNullOrEmpty(content))
        {
            // Nothing was downloaded, so there are no links to extract.
            continue;
        }

        // Extract the links contained in the downloaded page.
        string baseUri = Utility.GetBaseUri(visitUrl);
        string[] links = Parser.ExtractLinks(baseUri, content);

        // Enqueue the links that are worth following.
        foreach (string link in links)
        {
            if (string.IsNullOrEmpty(link) || IsStaticResource(link))
            {
                continue;
            }
            LinkQueue.addUnvisitedUrl(link);
            AddMessage(link);
        }
    }
}
// Thread currently running search(), if any. Kept so that a second click
// cannot start another crawl over the same static LinkQueue state.
private Thread searchThread;

/// <summary>
/// Starts the crawl (search) on a worker thread so the UI stays responsive.
/// Does nothing when a crawl started by an earlier click is still running.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button2_Click(object sender, EventArgs e)
{
    if (searchThread != null && searchThread.IsAlive)
    {
        return;
    }

    searchThread = new Thread(search);
    // A background thread does not keep the process alive once the form closes.
    searchThread.IsBackground = true;
    searchThread.Start();
}
// Delegate type used to re-dispatch AddMessage through richTextBox1.Invoke.
private delegate void InfoDelegate(string message);

/// <summary>
/// Appends <paramref name="message"/> plus a line break to richTextBox1 and
/// scrolls to the caret. When richTextBox1 reports InvokeRequired, the call is
/// re-dispatched through richTextBox1.Invoke instead of touching the control
/// directly.
/// </summary>
/// <param name="message">The text to append.</param>
public void AddMessage(string message)
{
    if (!richTextBox1.InvokeRequired)
    {
        richTextBox1.AppendText(message + Environment.NewLine);
        richTextBox1.ScrollToCaret();
        return;
    }

    InfoDelegate redispatch = AddMessage;
    richTextBox1.Invoke(redispatch, message);
}
// Delegate type used to re-dispatch Add2Message through label2.Invoke.
private delegate void Info2Delegate(string message);

/// <summary>
/// Replaces the text of label2 with <paramref name="message"/>. When label2
/// reports InvokeRequired, the call is re-dispatched through label2.Invoke
/// instead of touching the control directly.
/// </summary>
/// <param name="message">The text to show.</param>
public void Add2Message(string message)
{
    if (!label2.InvokeRequired)
    {
        label2.Text = message;
        return;
    }

    Info2Delegate redispatch = Add2Message;
    label2.Invoke(redispatch, message);
}
}
}
// C# breadth-first web crawler
// (Source page note: latest recommended article published 2024-06-12 00:09:12)