第一次写博客,但是这个爬虫并非是我自己写的,非常感谢苍的付出,有兴趣的同学可以点击查看源码。
这个爬虫基于c#语言,现在的爬虫很多是基于python,node.js,java。c#的话就比较少了,所以发出来给大家一点小小的帮助,下面是我修改过的代码,
有兴趣的可以看一下,改得不好的地方,欢迎指正。
/// <summary>
/// Crawls all 621 news index pages and stores every article link in SQL Server.
/// </summary>
static void Main(string[] args)
{
    // One instance and one compiled regex pair for the whole run —
    // the original re-created both on every loop iteration.
    Program cra = new Program();
    string baseUrl = "http://news2.sysu.edu.cn/news01/";
    // Container holding the article list on each index page.
    Regex reg_mulu = new Regex(@"<div class=""lan5"">(.|\n)*?</div>", RegexOptions.Compiled);
    // Anchor tags: group 1 = href value, group 2 = link text (article title).
    Regex tmpreg = new Regex("<a[^>]+?href=\"([^\"]+)\"[^>]*>([^<]+)</a>", RegexOptions.Compiled);

    for (int i = 1; i <= 621; i++)
    {
        string htm = baseUrl + "index" + i + ".htm";
        string html = cra.HttpGet(htm, "");
        // Narrow the search to the listing block before extracting links.
        string mulu = reg_mulu.Match(html).Groups[0].ToString();
        MatchCollection sMC = tmpreg.Matches(mulu);
        for (int j = 0; j < sMC.Count; j++)
        {
            // Strip line breaks the site embeds inside long titles.
            string title = sMC[j].Groups[2].Value.Replace("\r\n", "");
            string ur = sMC[j].Groups[1].Value;
            // Absolute links are stored as-is (https accepted too, which the
            // old Split(':')[0] == "http" check missed); relative links are
            // resolved against the listing directory first.
            if (ur.StartsWith("http:") || ur.StartsWith("https:"))
            {
                Insert(title, ur, i);
            }
            else
            {
                Insert(title, formUrl(baseUrl, ur), i);
            }
        }
    }
}
/// <summary>
/// Inserts one crawled news item into the News table in SQL Server.
/// </summary>
/// <param name="title">Article title (raw text scraped from the page).</param>
/// <param name="url">Absolute article URL.</param>
/// <param name="i">Index page number the link was found on.</param>
public static void Insert(string title, string url, int i)
{
    string sqlstr = "Data Source=.;Initial Catalog=Group4;Integrated Security=True";
    // Parameterized command: scraped titles can contain quotes, and the old
    // string.Format concatenation was an SQL-injection / syntax-error hazard.
    string sql = "insert into News(Title,Url,i) values(@title,@url,@i)";
    using (SqlConnection conn = new SqlConnection(sqlstr))
    using (SqlCommand cmd = new SqlCommand(sql, conn))
    {
        cmd.Parameters.AddWithValue("@title", title);
        cmd.Parameters.AddWithValue("@url", url);
        cmd.Parameters.AddWithValue("@i", i);
        conn.Open();
        // No catch block: the old one only did "throw ex;", which discards
        // the original stack trace. Letting the exception propagate (or using
        // a bare "throw;") preserves it.
        cmd.ExecuteNonQuery();
    }
}
/// <summary>
/// Joins a relative path onto the site root to form an absolute URL.
/// </summary>
/// <param name="rootUrl">Base directory URL of the listing page.</param>
/// <param name="sonUrl">Relative path extracted from an anchor tag.</param>
/// <returns>The simple concatenation of the two parts.</returns>
public static string formUrl(string rootUrl, string sonUrl)
{
    string combined = rootUrl + sonUrl;
    return combined;
}
/// <summary>
/// Performs an HTTP GET and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="Url">Target URL (without query string).</param>
/// <param name="postDataStr">Optional query string appended after '?'; pass "" for none.</param>
/// <returns>The full response body as a string.</returns>
public string HttpGet(string Url, string postDataStr)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url + (postDataStr == "" ? "" : "?") + postDataStr);
    request.Method = "GET";
    request.ContentType = "text/html;charset=UTF-8";
    // using blocks guarantee the response and streams are released even when
    // reading fails. The old catch called GetResponse() a second time and then
    // rethrew with "throw ex;", which leaked the retry response and truncated
    // the stack trace — a failed request now simply propagates its exception.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream myResponseStream = response.GetResponseStream())
    using (StreamReader myStreamReader = new StreamReader(myResponseStream, System.Text.Encoding.UTF8))
    {
        return myStreamReader.ReadToEnd();
    }
}
}
整个程序是比较简单的,难点是分析网站后如何写出正确的正则表达式,这个我也没有深入的研究,还在学习当中。
在抓取的时候还遇到一个问题:循环到某一页时会报错。临时的解决办法是把循环变量的起始值改为报错的页码加一,跳过出错的那一页后继续抓取。