一段时间没接触C#了,最近三天写了一个采集网页的工具,来填充一下网页的内容。
先介绍一下,程序主要有三个实体:采集任务、文章列表、文章。
任务:为了便于传输和共享,采集任务将保存在XML文件中。
/// <summary>
/// Plain data holder describing one collection task.
/// Every property is a simple read/write wrapper around a private string field;
/// no validation is performed.
/// </summary>
class Task
{
    // Backing storage, one field per public property below.
    private string _name;
    private string _url;
    private string _ecode;
    private string _category;
    private string _listAreaStart;
    private string _listAreaEnd;
    private string _titleRegex;
    private string _contentStart;
    private string _contentEnd;
    private string _articleStart;
    private string _articleEnd;
    private string _authorStart;
    private string _authorEnd;
    private string _sourceStart;
    private string _sourceEnd;
    private string _dateStart;
    private string _dateEnd;
    private string _state;

    /// <summary>
    /// Name of the collection task.
    /// </summary>
    public string Name
    {
        get { return _name; }
        set { _name = value; }
    }

    /// <summary>
    /// Address of the list page for the collection task.
    /// </summary>
    public string Url
    {
        get { return _url; }
        set { _url = value; }
    }

    /// <summary>
    /// Encoding of the site targeted by the collection task.
    /// </summary>
    public string Ecode
    {
        get { return _ecode; }
        set { _ecode = value; }
    }

    /// <summary>
    /// Column (category) name of the list for the collection task.
    /// </summary>
    public string Category
    {
        get { return _category; }
        set { _category = value; }
    }

    /// <summary>
    /// Start marker of the list area.
    /// </summary>
    public string ListAreaf
    {
        get { return _listAreaStart; }
        set { _listAreaStart = value; }
    }

    /// <summary>
    /// End marker of the list area.
    /// </summary>
    public string ListAreae
    {
        get { return _listAreaEnd; }
        set { _listAreaEnd = value; }
    }

    /// <summary>
    /// Regular expression for article titles and links.
    /// </summary>
    public string RexTitle
    {
        get { return _titleRegex; }
        set { _titleRegex = value; }
    }

    /// <summary>
    /// Start marker of the article content area.
    /// </summary>
    public string Contentf
    {
        get { return _contentStart; }
        set { _contentStart = value; }
    }

    /// <summary>
    /// End marker of the article content area.
    /// </summary>
    public string Contente
    {
        get { return _contentEnd; }
        set { _contentEnd = value; }
    }

    /// <summary>
    /// Start marker of the article content.
    /// </summary>
    public string Articlef
    {
        get { return _articleStart; }
        set { _articleStart = value; }
    }

    /// <summary>
    /// End marker of the article content.
    /// </summary>
    public string Articlee
    {
        get { return _articleEnd; }
        set { _articleEnd = value; }
    }

    /// <summary>
    /// Start marker of the article author.
    /// </summary>
    public string Authorf
    {
        get { return _authorStart; }
        set { _authorStart = value; }
    }

    /// <summary>
    /// End marker of the article author.
    /// </summary>
    public string Authore
    {
        get { return _authorEnd; }
        set { _authorEnd = value; }
    }

    /// <summary>
    /// Start marker of the article source.
    /// </summary>
    public string Sourcef
    {
        get { return _sourceStart; }
        set { _sourceStart = value; }
    }

    /// <summary>
    /// End marker of the article source.
    /// </summary>
    public string Sourcee
    {
        get { return _sourceEnd; }
        set { _sourceEnd = value; }
    }

    /// <summary>
    /// Start marker of the article date.
    /// </summary>
    public string Datef
    {
        get { return _dateStart; }
        set { _dateStart = value; }
    }

    /// <summary>
    /// End marker of the article date.
    /// </summary>
    public string Datee
    {
        get { return _dateEnd; }
        set { _dateEnd = value; }
    }

    /// <summary>
    /// State of the collection task.
    /// </summary>
    public string State
    {
        get { return _state; }
        set { _state = value; }
    }
}
文章列表:采集到以后会保存在本地Access数据库中。
/// <summary>
/// Plain data holder for one entry of an article list.
/// Every property is a simple read/write wrapper around a private field.
/// </summary>
class Urls
{
    // Backing storage, one field per public property below.
    private string _title;
    private string _url;
    private int _oid;
    private int _isc;
    private string _name;

    /// <summary>
    /// Title.
    /// </summary>
    public string Title
    {
        get { return _title; }
        set { _title = value; }
    }

    /// <summary>
    /// Link.
    /// </summary>
    public string Url
    {
        get { return _url; }
        set { _url = value; }
    }

    /// <summary>
    /// Id.
    /// </summary>
    public int Oid
    {
        get { return _oid; }
        set { _oid = value; }
    }

    /// <summary>
    /// Whether the entry has already been collected.
    /// </summary>
    public int Isc
    {
        get { return _isc; }
        set { _isc = value; }
    }

    /// <summary>
    /// Task name; many-to-one relationship with the collection task.
    /// </summary>
    public string Name
    {
        get { return _name; }
        set { _name = value; }
    }
}
文章:同样会保存在本地Access数据库中。
/// <summary>
/// Plain data holder for one collected article.
/// All properties are simple read/write wrappers around private fields,
/// except <see cref="Title"/>, which rejects null or empty values.
/// </summary>
class Article
{
    private string category;
    /// <summary>
    /// Article category.
    /// </summary>
    public string Category
    {
        get { return category; }
        set { category = value; }
    }

    private string url;
    /// <summary>
    /// Link address of the article.
    /// </summary>
    public string Url
    {
        get { return url; }
        set { url = value; }
    }

    private string title;
    /// <summary>
    /// Article title. Must not be null or empty.
    /// </summary>
    /// <exception cref="ApplicationException">
    /// Thrown when the assigned value is null or an empty string; the stored
    /// title is left unchanged in that case.
    /// </exception>
    public string Title
    {
        get { return title; }
        set
        {
            // IsNullOrEmpty also covers null, which previously escaped as a
            // NullReferenceException from value.Length instead of the intended
            // ApplicationException.
            if (string.IsNullOrEmpty(value))
            {
                throw new ApplicationException("文章的标题不能为空!");
            }
            title = value;
        }
    }

    private int views;
    /// <summary>
    /// Number of times the article has been viewed.
    /// </summary>
    public int Views
    {
        get { return views; }
        set { views = value; }
    }

    private int replys;
    /// <summary>
    /// Number of comments on the article.
    /// </summary>
    public int Replys
    {
        get { return replys; }
        set { replys = value; }
    }

    private string postdate;
    /// <summary>
    /// Publication date of the article.
    /// </summary>
    public string Postdate
    {
        get { return postdate; }
        set { postdate = value; }
    }

    private string author;
    /// <summary>
    /// Article author.
    /// </summary>
    public string Author
    {
        get { return author; }
        set { author = value; }
    }

    private string source;
    /// <summary>
    /// Article source.
    /// </summary>
    public string Source
    {
        get { return source; }
        set { source = value; }
    }

    private string content;
    /// <summary>
    /// Article content.
    /// </summary>
    public string Content
    {
        get { return content; }
        set { content = value; }
    }

    private int oid;
    /// <summary>
    /// Integer identifier. NOTE(review): undocumented in the original;
    /// presumably the record id of the article - confirm against the data-access code.
    /// </summary>
    public int Oid
    {
        get { return oid; }
        set { oid = value; }
    }
}