C#使用多线程实现网络爬虫，并通过网络将抓取的数据传输到另一台服务器的数据库中存储

最新推荐文章于 2022-10-07 22:56:09 发布

吕川

最新推荐文章于 2022-10-07 22:56:09 发布

阅读量1.5k

点赞数

分类专栏：软件工程

本文链接：https://blog.csdn.net/qq_20949153/article/details/50801780

版权

该博客介绍了如何使用C#编写一个网络爬虫，利用多线程抓取指定网站的数据。爬虫从一个起始URL开始，通过正则表达式提取页面中的链接，并将抓取到的数据通过HTTP发送到另一台服务器，存入其数据库。博客涵盖了线程同步、HTTP请求、正则匹配以及数据传输等关键点。

摘要由CSDN通过智能技术生成

using System;
using System.Net;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Text;
using System.Threading;
namespace spider
{
class MainClass
{

// Crawler state shared by the methods of this class.

// Not assigned or read in the visible part of the class.
private string basicurl;
// Regex pattern text for absolute http:// URLs.
// NOTE(review): the pattern ends with a literal space, so a match needs a space after the URL — confirm this is intended.
private string mtch=@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)? ";
// Presumably the remote endpoint that receives the crawled data; not used in the visible part — verify against the rest of the class.
private string myurl=@"http://kysysgl.nwsuaf.edu.cn/index.php?m=Ss&a=add";
// URL -> int map, seeded with the start URL by the constructor; its Count bounds the crawl loop.
// Presumably used to avoid queueing the same URL twice — TODO confirm where entries are added.
private Dictionary <String,int> dict=new Dictionary <string,int>();
// Ordered list of URLs to fetch; the constructor reads it by index via `now`.
List <string> l = new List<string> ();
// Index into `l` of the next URL to fetch; advanced by the constructor's crawl loop.
private int now = 0;
// Not used in the visible part of the class.
private int filen=0;
// Not used in the visible part of the class.
int x=0;
/// <summary>
/// Seeds the crawl with the start page and then fetches queued URLs one
/// after another on the calling thread. The loop ends when the queue is
/// exhausted or when the page limit has been reached.
/// </summary>
public MainClass()
{
    const string startUrl = @"http://202.117.179.110/ListTeacher.jsp";
    const int maxPages = 100000;

    dict.Add(startUrl, 0);
    l.Add(startUrl);

    // Several worker threads (e.g. four) could be started here instead, but that
    // would need a mutex so that no entry of the list is processed more than once.
    while (now < maxPages)
    {
        // The crawl is single-threaded: once the cursor has caught up with the
        // known URLs nothing else can enqueue more work, so stop instead of
        // spinning forever. The list bound is checked as well because the list
        // is what actually gets indexed below.
        if (now >= dict.Count || now >= l.Count)
        {
            break;
        }

        string next = l[now++];
        try
        {
            geturl(next);
        }
        catch (Exception e)
        {
            // Best effort: one bad page must not abort the crawl, but the
            // failure should still be visible to whoever runs the program.
            Console.Error.WriteLine("crawl failed for " + next + ": " + e.Message);
        }
    }
}

/// <summary>
/// Placeholder with an empty body; calling it has no effect.
/// </summary>
public void sayhello() { }

/// <summary>
/// Downloads the page at the given address, decodes the body as GBK and
/// hands the reader to <c>addurl</c> together with the page URL.
/// Failures are reported on standard error and otherwise ignored, so a
/// single unreachable or malformed page does not stop the caller.
/// </summary>
/// <param name="u">
/// The page address; it is converted with <c>ToString()</c>. The
/// <c>object</c> parameter type keeps the method signature compatible with
/// <c>ParameterizedThreadStart</c>.
/// </param>
public void geturl(object u)
{
    string url = u.ToString();

    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // Dispose the response, its stream and the reader as soon as the page
        // has been processed; an undisposed response keeps its HTTP connection
        // checked out, which eventually stalls further requests to that host.
        using (WebResponse response = request.GetResponse())
        using (Stream resStream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(resStream, System.Text.Encoding.GetEncoding("gbk")))
        {
            addurl(sr, url);
        }
    }
    catch (Exception e)
    {
        // Best effort: skip this page, but leave a trace of what went wrong.
        Console.Error.WriteLine("failed to fetch " + url + ": " + e.Message);
    }
}