C#新闻采集_新闻采集源码可自写规则

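这篇博客展示了如何用 C#(ASP.NET WebForms)抓取并处理网页新闻数据。GetRequest 读取请求参数(缺失时返回空字符串);AjaxAction 处理 isAjax 请求,并按 action(newsbody / newsdetail)分发;GetModelData 用自定义的正则规则从列表页中提取新闻链接;GetNews 逐个抓取这些链接,解析标题和正文后保存;GetHttpData 负责按指定编码下载页面。此外,页面还实现了站点采集规则的新增与编辑功能。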
摘要由CSDN通过智能技术生成

using System;

using System.Data;

using System.Configuration;

using System.Collections;

using System.Web;

using System.Web.Security;

using System.Web.UI;

using System.Web.UI.WebControls;

using System.Web.UI.WebControls.WebParts;

using System.Web.UI.HtmlControls;

using System.Text.RegularExpressions;

using System.Collections;

using System.IO;

using System.Net;

using System.Text;

namespace NewsCollection

{

public partial class SiteEdit : System.Web.UI.Page

{

string urlData = "";

/// <summary>
/// Page entry point. Wires the client-side "loading" notice onto the collect
/// button on every request; on the first (non-postback) request it also
/// serves a pending Ajax call, if any, and then fills the form.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // Registered on every request so the handler is present after postbacks too.
    this.BtCollection.Attributes.Add("onclick", "Status.showInfo('加载中');");

    // Postbacks are handled by the button click handlers; nothing more to do here.
    if (Page.IsPostBack)
    {
        return;
    }

    // AjaxAction ends the response itself when the request is an Ajax call,
    // so BindData only runs for a normal page view.
    AjaxAction();
    BindData();
}

/// <summary>
/// Reads a value from the current request by name.
/// </summary>
/// <param name="key">Name of the request value to look up.</param>
/// <returns>The request value, or an empty string when it is absent; never null.</returns>
public string GetRequest(string key)
{
    string requested = Request[key];

    if (requested == null)
    {
        return "";
    }

    return requested;
}

/// <summary>
/// Serves an Ajax call made against this page. Does nothing unless the
/// request carries <c>isAjax=true</c>; otherwise it dispatches on the
/// <c>action</c> request value, writes the result as the whole response
/// body and ends the response.
/// </summary>
/// <remarks>
/// Supported actions (case-insensitive):
/// <c>newsbody</c> returns the concatenated links found by <see cref="GetModelData"/>;
/// <c>newsdetail</c> additionally runs <see cref="GetNews"/> over those links;
/// anything else returns the literal "test Ajax".
/// </remarks>
public void AjaxAction()
{
    // Ordinal, case-insensitive comparison: the tokens are protocol keywords,
    // and ToLower() is culture-sensitive (e.g. "NEWSDETAIL" does not lower-case
    // to "newsdetail" under a Turkish culture).
    if (!string.Equals(GetRequest("isAjax"), "true", StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    string action = GetRequest("action");
    string state;

    bool isNewsBody = string.Equals(action, "newsbody", StringComparison.OrdinalIgnoreCase);
    bool isNewsDetail = string.Equals(action, "newsdetail", StringComparison.OrdinalIgnoreCase);

    if (isNewsBody || isNewsDetail)
    {
        // Both actions start from the same list-page extraction.
        ArrayList links = GetModelData(
            GetRequest("modelstart"),
            GetRequest("modelend"),
            GetRequest("modelbody"),
            GetRequest("siteUrl"));

        if (isNewsBody)
        {
            StringBuilder sb = new StringBuilder();
            foreach (string link in links)
            {
                sb.Append(link);
            }
            state = sb.ToString();
        }
        else
        {
            state = GetNews(
                links,
                GetRequest("newsTitleStart"),
                GetRequest("newsTitleEnd"),
                GetRequest("newsContentStart"),
                GetRequest("newsContentEnd"));
        }
    }
    else
    {
        state = "test Ajax";
    }

    // Discard anything buffered so far so the caller receives only the result.
    Response.Clear();
    Response.Write(state);
    Response.End();
}

/// <summary>
/// Fills the edit form from the stored site whose id is given by the
/// <c>Gid</c> request value. Leaves the form untouched when no id is
/// supplied or no record is returned for it.
/// </summary>
public void BindData()
{
    string Gid = GetRequest("Gid");
    if (Gid.Length == 0)
    {
        // No id: the page is being used to add a new site.
        return;
    }

    Beans.Sites query = new Beans.Sites();
    query.Gid = Gid;

    Beans.Sites sites = query.SelectById();
    if (sites == null)
    {
        // NOTE(review): SelectById's contract is not visible here; guard in case
        // it returns null for an unknown id, so the page shows an empty form
        // instead of failing with a NullReferenceException.
        return;
    }

    TbSiteName.Text = sites.SiteName;
    TbSiteUrl.Text = sites.SiteUrl;
    TbSiteModelStart.Text = sites.SiteModelStart;
    TbSiteModelEnd.Text = sites.SiteModelEnd;
    TbSiteModelBody.Text = sites.SiteModelBody;
    TbNewsTitleStart.Text = sites.NewsTitleStart;
    TbNewsTitleEnd.Text = sites.NewsTitleEnd;
    TbNewsContentStart.Text = sites.NewsContentStart;
    TbNewsContentEnd.Text = sites.NewsContentEnd;
}

/// <summary>
/// Saves the form as a site record: updates the existing record when a
/// <c>Gid</c> request value is present, otherwise adds a new one, then
/// reports the outcome to the browser.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void BtEdit_Click(object sender, EventArgs e)
{
    // Default outcome; replaced below only when the save call reports success.
    string message = "系统错误请重试";
    string script = "history.go(-1)";

    string Gid = GetRequest("Gid");

    Beans.Sites sites = new Beans.Sites();
    sites.SiteName = TbSiteName.Text.Trim();
    sites.SiteUrl = TbSiteUrl.Text.Trim();
    sites.SiteModelStart = TbSiteModelStart.Text.Trim();
    sites.SiteModelEnd = TbSiteModelEnd.Text.Trim();
    sites.SiteModelBody = TbSiteModelBody.Text.Trim();
    sites.NewsTitleStart = TbNewsTitleStart.Text.Trim();
    sites.NewsTitleEnd = TbNewsTitleEnd.Text.Trim();
    sites.NewsContentStart = TbNewsContentStart.Text.Trim();
    sites.NewsContentEnd = TbNewsContentEnd.Text.Trim();

    if (Gid.Length > 0)
    {
        sites.Gid = Gid;
        if (sites.Update())
        {
            message = "修改成功!";
        }
    }
    else if (sites.Add())
    {
        message = "添加成功!";
    }

    // The published listing wrote an empty string here, leaving message and
    // script unused, so the user never saw the outcome. The markup below is a
    // reconstruction (NOTE(review): confirm against the original project).
    // Both values are fixed literals from this method, so no escaping is needed.
    Response.Write("<script type=\"text/javascript\">alert('" + message + "');" + script + ";</script>");
}

/// <summary>
/// Runs a collection pass using the rules currently typed into the form:
/// extracts the article links from the list page, then fetches and stores
/// each article.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void BtCollection_Click(object sender, EventArgs e)
{
    // The list start/end markers are used inside a regular expression, so the
    // characters "(", ")" and "." are escaped to be matched literally.
    string listStart = TbSiteModelStart.Text
        .Replace("(", "\\(")
        .Replace(")", "\\)")
        .Replace(".", "\\.");
    string listEnd = TbSiteModelEnd.Text
        .Replace("(", "\\(")
        .Replace(")", "\\)")
        .Replace(".", "\\.");

    ArrayList links = GetModelData(listStart, listEnd, TbSiteModelBody.Text, TbSiteUrl.Text);

    // The summary string returned by GetNews is not used on this path.
    GetNews(
        links,
        TbNewsTitleStart.Text.Trim(),
        TbNewsTitleEnd.Text.Trim(),
        TbNewsContentStart.Text.Trim(),
        TbNewsContentEnd.Text.Trim());
}

/// <summary>
/// Downloads the list page at <paramref name="SiteUrl"/> and extracts the
/// article links from it.
/// </summary>
/// <param name="modelstart">Regex fragment marking where the link list begins.</param>
/// <param name="modelend">Regex fragment marking where the link list ends.</param>
/// <param name="modelbody">
/// Regex for a single list entry; the placeholder <c>_url_</c> is replaced by
/// a capture group that matches the link itself.
/// </param>
/// <param name="SiteUrl">Address of the list page; also the base for relative links.</param>
/// <returns>
/// The links found, as strings. Links that already start with http:// or
/// https:// are returned as captured; other links are resolved against
/// <paramref name="SiteUrl"/> so callers can fetch them directly.
/// </returns>
/// <remarks>Each link is also echoed to the response, followed by a line break.</remarks>
public ArrayList GetModelData(string modelstart,string modelend,string modelbody,string SiteUrl)
{
    ArrayList al = new ArrayList();

    string content = GetHttpData(SiteUrl, "gb2312") ?? "";

    // The group names were missing from the published listing ("(?[" is not a
    // valid pattern); they are restored from the Groups[...] lookups below.
    Regex reg = new Regex(modelstart + "(?<newsBody>[\\s\\S]*)" + modelend, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    Match mat = reg.Match(content);

    Regex regurl = new Regex(modelbody.Replace("_url_", "(?<url>[^\"'\\s]+)"), RegexOptions.IgnoreCase | RegexOptions.Singleline);

    Uri baseUri;
    bool hasBase = Uri.TryCreate(SiteUrl, UriKind.Absolute, out baseUri);

    // When the list section is not found, Groups["newsBody"].Value is empty and
    // the loop simply does not run.
    for (Match maturl = regurl.Match(mat.Groups["newsBody"].Value); maturl.Success; maturl = maturl.NextMatch())
    {
        string link = maturl.Groups["url"].Value;

        bool isAbsolute =
            link.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
            link.StartsWith("https://", StringComparison.OrdinalIgnoreCase);

        if (!isAbsolute && hasBase)
        {
            // Resolve with Uri rather than string slicing: the old code measured
            // the last "/" on the full URL but cut the URL with "http://" already
            // removed, producing a wrong base or an out-of-range Substring.
            Uri resolved;
            if (Uri.TryCreate(baseUri, link, out resolved))
            {
                link = resolved.AbsoluteUri;
            }
        }

        al.Add(link);

        // The link comes from a remote page, so encode it before echoing it as HTML.
        // The "<br/>" separator is a reconstruction; the listing had a bare line
        // break inside the string literal at this point.
        Response.Write(HttpUtility.HtmlEncode(link) + "<br/>");
    }

    return al;
}

/// <summary>
/// Fetches every article in <paramref name="al"/>, extracts its title and
/// body using the supplied start/end markers, and stores each article that
/// matches.
/// </summary>
/// <param name="al">Article URLs as strings; null is treated as an empty list.</param>
/// <param name="titleStart">Regex fragment that precedes the title.</param>
/// <param name="titleEnd">Regex fragment that follows the title.</param>
/// <param name="contentStart">Regex fragment that precedes the article body.</param>
/// <param name="contetnEnd">Regex fragment that follows the article body.</param>
/// <returns>One "title:…" entry per stored article, concatenated.</returns>
/// <remarks>
/// In each marker only "(" and ")" are escaped; everything else is used as
/// regex syntax, so markers may contain patterns.
/// </remarks>
public string  GetNews(ArrayList al,string titleStart,string titleEnd,string contentStart,string contetnEnd)
{
    StringBuilder sb = new StringBuilder();

    if (al == null)
    {
        return sb.ToString();
    }

    string escapedTitleStart = titleStart.Replace("(", "\\(").Replace(")", "\\)");
    string escapedTitleEnd = titleEnd.Replace("(", "\\(").Replace(")", "\\)");
    string escapedContentStart = contentStart.Replace("(", "\\(").Replace(")", "\\)");
    string escapedContentEnd = contetnEnd.Replace("(", "\\(").Replace(")", "\\)");

    // NOTE(review): the pattern in the published listing was truncated (the
    // literal was split across lines and what remained could never match).
    // This is a reconstruction from the four markers and the "title" and
    // "content" groups read below: the shortest text between each marker pair,
    // with the title appearing before the body. Confirm against real pages.
    // Built once here because it does not depend on the article being fetched.
    Regex reg = new Regex(
        escapedTitleStart + "(?<title>[\\s\\S]+?)" + escapedTitleEnd
        + "[\\s\\S]*?"
        + escapedContentStart + "(?<content>[\\s\\S]+?)" + escapedContentEnd,
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    foreach (string s in al)
    {
        string page = GetHttpData(s, "gb2312") ?? "";
        Match mat = reg.Match(page);

        if (!mat.Success)
        {
            // Do not store an article with an empty title and body when the
            // page does not fit the rules.
            continue;
        }

        string title = mat.Groups["title"].Value;

        Beans.News news = new Beans.News();
        news.Title = title;
        news.Typeid = Convert.ToString(Request["Gid"]);
        news.From = this.TbSiteName.Text;
        news.Content = mat.Groups["content"].Value;
        news.Add();

        sb.AppendFormat("title:{0}", title);
    }

    return sb.ToString();
}

/// <summary>
/// Downloads the resource at <paramref name="sUrl"/> and returns its body as text.
/// </summary>
/// <param name="sUrl">Address to request.</param>
/// <param name="encoding">Name of the text encoding used to decode the body, e.g. "gb2312".</param>
/// <returns>The decoded response body.</returns>
/// <remarks>
/// Failures are not caught here: exceptions raised while creating the request,
/// resolving the encoding, or reading the response propagate to the caller.
/// The request times out after 50 seconds.
/// </remarks>
public string GetHttpData(string sUrl, string encoding)
{
    // Resolve the encoding before connecting, so an unknown name fails without
    // leaving an open response behind.
    Encoding pageEncoding = Encoding.GetEncoding(encoding);

    WebRequest oWebRqst = WebRequest.Create(sUrl);
    oWebRqst.Timeout = 50000; // milliseconds

    // using guarantees the response, stream and reader are released even when
    // reading throws; previously they were closed only on the success path.
    using (WebResponse oWebRps = oWebRqst.GetResponse())
    using (Stream body = oWebRps.GetResponseStream())
    using (StreamReader oStreamRd = new StreamReader(body, pageEncoding))
    {
        return oStreamRd.ReadToEnd();
    }
}

}

}

posted on 2009-07-18 15:08 sanmao 阅读(194) 评论(0)  编辑  收藏

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值