用C#爬虫爬取校园最新通知

前言

前几个月用Python写了一个爬虫,用于爬取校园最新通知。最近的C#课程设计中想实现同样的功能,于是按照之前Python代码的思路重构了一下。
用Python实现校园通知更新提醒

思路

使用C#中的HttpWebRequest类去获取目标url(各个网站的“更多通知”页)的源代码,然后使用System.Text.RegularExpressions命名空间中的Regex类进行正则匹配。
正则表达式应包含三个命名分组:日期(date)、标题(title)、链接(link),代码中通过分组名来取值。

准备工作

  • 命名空间导入
    1
    2
    3
    
    using System.Net;
    using System.Collections;
    using System.Text.RegularExpressions;
    

源代码

Spider基类:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
/// <summary>
/// Minimal HTTP fetcher: builds an <see cref="HttpWebRequest"/> for a URL and
/// downloads the response body as text using a configurable encoding.
/// </summary>
public class Spider
{
    private string url = string.Empty; // URL of the page to crawl
    public HttpWebRequest request;
    public HttpWebResponse response;
    public string encode = "UTF-8"; // Character encoding of the target page
    public string Method = string.Empty;

    /// <summary>URL of the page to crawl.</summary>
    public string Url
    {
        get { return url; }
        set { url = value; }
    }

    /// <summary>
    /// Name of the encoding used to decode the response body.
    /// Only "UTF-8" and "GBK" are accepted; any other value is ignored and the
    /// previous encoding is kept.
    /// </summary>
    public string Encode
    {
        get { return encode; }
        set
        {
            if (value == "UTF-8" || value == "GBK")
            {
                encode = value;
            }
            // TODO: report unsupported encoding names to the caller instead of
            // silently ignoring them.
        }
    }

    /// <summary>Creates a spider for <paramref name="url"/> and immediately builds its request.</summary>
    /// <param name="url">URL of the page to crawl.</param>
    /// <param name="method">HTTP method assigned to the request (e.g. "GET").</param>
    public Spider(string url, string method)
    {
        this.url = url;
        this.Method = method;
        Create();
    }

    /// <summary>Creates an unconfigured spider; set <see cref="Url"/> and <see cref="Method"/> before fetching.</summary>
    public Spider() { }

    /// <summary>
    /// Builds a fresh <see cref="HttpWebRequest"/> for the current URL with the
    /// configured method, default credentials and a desktop Chrome User-Agent.
    /// </summary>
    public void Create()
    {
        request = (HttpWebRequest)WebRequest.Create(url);
        request.Method = Method;
        request.Credentials = CredentialCache.DefaultCredentials;
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36";
    }

    /// <summary>
    /// Downloads the HTML of the target URL.
    /// </summary>
    /// <returns>
    /// The decoded response body, or an empty string when the request fails.
    /// Failures are logged and shown to the user in a message box; they are not rethrown.
    /// </returns>
    public string getHtml()
    {
        string html = string.Empty;
        try
        {
            // A request object cannot be reused after GetResponse(), so rebuild it on every call.
            Create();
            response = (HttpWebResponse)request.GetResponse();
            if (response.StatusCode != HttpStatusCode.OK)
            {
                throw new WebException("Unexpected HTTP status: " + (int)response.StatusCode + " " + response.StatusDescription);
            }

            // The using blocks release the stream and reader even when decoding or reading throws.
            using (Stream body = response.GetResponseStream())
            using (StreamReader sr = new StreamReader(body, Encoding.GetEncoding(encode)))
            {
                html = sr.ReadToEnd();
            }
        }
        catch (Exception e)
        {
            Log.Write(e.Message, "Exception");
            MessageBox.Show(e.Message, "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
        }
        finally
        {
            // Always close the response so the underlying connection is released,
            // including on the non-OK and exception paths.
            if (response != null)
            {
                response.Close();
            }
        }
        return html;
    }

}

SpiderNotice类:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/// <summary>
/// Spider that extracts notices from a listing page. The page is matched against
/// a regular expression that must define the named groups <c>date</c>,
/// <c>title</c> and <c>link</c>; every match becomes one <see cref="Notice"/>.
/// </summary>
public class SpiderNotice : Spider
{
    private string url_main;   // Common prefix prepended to every matched notice link
    private string department; // Department the notices belong to
    private string type;       // Notice category
    private string pattern;    // Regular expression used to match notices
    private ArrayList all_notice = new ArrayList(); // Notices collected by the most recent Get()
    public string parse = string.Empty; // Custom date format passed to DateTime.ParseExact

    /// <summary>Creates a notice spider that fetches <paramref name="url"/> with GET.</summary>
    /// <param name="url">URL of the notice listing page.</param>
    /// <param name="pattern">Regular expression with named groups date, title and link.</param>
    /// <param name="url_main">Prefix prepended to each matched link.</param>
    /// <param name="department">Department stored on every produced notice.</param>
    /// <param name="type">Notice type stored on every produced notice.</param>
    public SpiderNotice(string url, string pattern, string url_main, string department, string type) : base(url, "GET")
    {
        this.department = department;
        this.pattern = pattern;
        this.url_main = url_main;
        this.type = type;
        this.parse = "yyyy-MM-dd";
    }

    /// <summary>
    /// Downloads the page and rebuilds the notice list from the regex matches.
    /// Errors raised while parsing are logged and shown in a message box; notices
    /// added before the failure are kept.
    /// </summary>
    public void Get()
    {
        all_notice.Clear();
        Regex r = new Regex(pattern, RegexOptions.ExplicitCapture);
        try
        {
            string s = getHtml();
            // getHtml() reports its own failures and returns an empty string,
            // so there is nothing to match and nothing more to report here.
            if (string.IsNullOrEmpty(s))
            {
                return;
            }

            foreach (Match m in r.Matches(s))
            {
                GroupCollection group = m.Groups;
                // The scraped date is machine-readable text in a fixed format, so parse it
                // with the invariant culture: the result must not depend on the calendar
                // or date separators configured on the machine running the crawler.
                DateTime date = DateTime.ParseExact(group["date"].Value, parse, System.Globalization.CultureInfo.InvariantCulture);
                string link = url_main + group["link"].Value;
                Notice n = new Notice();
                n.Title = group["title"].Value;
                n.Link = link;
                n.Date = date;
                n.Department = department;
                n.Type = type;
                all_notice.Add(n);
            }
        }
        catch (Exception e)
        {
            BLL.Log.Write(e.Message, "Exception");
            MessageBox.Show(e.Message, "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
        }
    }

    /// <summary>Returns the notice at the given position.</summary>
    /// <param name="index">Zero-based position in the collected notices.</param>
    /// <exception cref="IndexOutOfRangeException">
    /// <paramref name="index"/> is negative or not less than <see cref="Count"/>.
    /// </exception>
    public Notice index(int index)
    {
        // Reject negative values as well, so every invalid index raises the same exception type.
        if (index < 0 || index >= all_notice.Count)
        {
            throw new IndexOutOfRangeException();
        }
        return (Notice)all_notice[index];
    }

    /// <summary>Returns a new array containing all collected notices, in collection order.</summary>
    public Notice[] GetAll()
    {
        return (Notice[])all_notice.ToArray(typeof(Notice));
    }

    /// <summary>Number of notices collected by the most recent <see cref="Get"/>.</summary>
    public int Count()
    {
        return all_notice.Count;
    }
}

数据模型Notice类:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/// <summary>
/// Data model for a single notice: title, link, publication date, type and department.
/// String members start as empty strings and the date starts as the creation time.
/// </summary>
public class Notice
{
    #region Data model
    /// <summary>Notice title.</summary>
    public string Title { get; set; }

    /// <summary>Notice date.</summary>
    public DateTime Date { get; set; }

    /// <summary>Link to the notice.</summary>
    public string Link { get; set; }

    /// <summary>Notice type.</summary>
    public string Type { get; set; }

    /// <summary>Department the notice belongs to.</summary>
    public string Department { get; set; }
    #endregion

    /// <summary>Creates a notice with empty text members and the current local time as its date.</summary>
    public Notice()
    {
        Title = string.Empty;
        Link = string.Empty;
        Date = DateTime.Now;
        Type = string.Empty;
        Department = string.Empty;
    }

    /// <summary>Formats the title, the date (as yyyy-MM-dd) and the link, one per line.</summary>
    public override string ToString()
    {
        string day = Date.ToString("yyyy-MM-dd");
        return string.Format("标题:{0}\n时间:{1}\n链接:{2}\n", Title, day, Link);
    }
}

问题总结

  • 从不同网站上爬取到的日期格式可能不同,为统一格式,可以使用`DateTime.ParseExact(datastr, parse, System.Globalization.CultureInfo.InstalledUICulture);`去解析用于表示日期的字符串,然后再转为统一日期格式的字符串。其中,datastr为表示日期的字符串,parse为解析规则(自定义日期格式字符串,区分大小写,例如M表示月、d表示日)。对于这种固定格式的数据,也可以传入`CultureInfo.InvariantCulture`,避免解析结果受系统区域设置影响。例:

    1
    2
    3
    
    // Day-of-month is the lowercase "d" specifier; an uppercase "D" is not a day
    // specifier in a custom format string, so parsing "2016-9-5" with it would throw FormatException.
    string parse="yyyy-M-d";
    DateTime t=DateTime.ParseExact("2016-9-5", parse, System.Globalization.CultureInfo.InstalledUICulture);
    string str=t.ToString("yyyy-MM-dd");//2016-09-05
    
  • 不同网页的网页编码格式可能不同,有的为GBK,有的为UTF-8;

  • 用字符串去表达正则表达式的规则时,有两种方法:
    1. string s=@"\d+""";字符串前加@表示逐字字符串(verbatim string),其中的反斜杠不作转义(注意:双引号用两个双引号来表示)
    2. string s="\\d+\"";使用转义符(反斜杠和双引号都要用反斜杠转义)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值