A little toy project of my own

I'm planning to write a toy:

It should do a few things:

Crawl the day's news from Sina, ifeng, Sohu, and other portals (tentatively three times a day for now).
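For the three-times-a-day schedule, a plain ScheduledExecutorService is probably the simplest option. A minimal sketch, where crawlOnce() is a hypothetical stand-in for whatever the real crawl entry point ends up being:

package Web;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class CrawlScheduler {
	public static void main(String[] args) {
		ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
		// Fire the crawl immediately, then every 8 hours: three runs per day.
		scheduler.scheduleAtFixedRate(new Runnable() {
			public void run() {
				crawlOnce();
			}
		}, 0, 8, TimeUnit.HOURS);
	}

	static void crawlOnce() {
		// Hypothetical entry point: kick off one full crawl here.
		System.out.println("crawling...");
	}
}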

Segment the headlines into words, then compare which topics each site focuses on.
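The segmentation itself will need a proper Chinese segmenter (something like IK Analyzer; I haven't settled on one). But once each title is segmented into space-separated words, comparing what the sites talk about starts with plain word counts. A rough sketch under that assumption:

import java.util.HashMap;
import java.util.Map;

public class WordCount {
	// Count how often each word appears in one site's segmented titles.
	// Assumes the segmenter already separated words with whitespace.
	static Map<String, Integer> countWords(String segmentedTitles) {
		Map<String, Integer> freq = new HashMap<String, Integer>();
		for (String w : segmentedTitles.split("\\s+")) {
			if (w.length() == 0) continue;
			Integer c = freq.get(w);
			freq.put(w, c == null ? 1 : c + 1);
		}
		return freq;
	}
}

Feeding each site's output file through countWords and sorting by count should already surface the top topics per site.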

                       ----- note to self

Crawl a larger set of pages, then classify the segmented words each day and compare them against hao123's hot news list, to see whether anything interesting falls out.
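For that comparison, one cheap first measurement is the Jaccard overlap between the day's extracted hot words and hao123's list. A sketch, where both word sets are hypothetical inputs produced upstream:

import java.util.HashSet;
import java.util.Set;

public class TopicOverlap {
	// Jaccard similarity between two sets of hot words: |A∩B| / |A∪B|.
	static double jaccard(Set<String> a, Set<String> b) {
		Set<String> inter = new HashSet<String>(a);
		inter.retainAll(b);
		Set<String> union = new HashSet<String>(a);
		union.addAll(b);
		return union.isEmpty() ? 0.0 : (double) inter.size() / union.size();
	}
}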


2013-4-7:

The crawler can already pull news titles from quite a few sites.

package Web;

import java.io.*;
import java.net.*;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CrawlWeb {
	public String strNewsTitle;    // extracted headlines, one per line
	public String strNewsTitleSeg; // reserved for the segmented titles
	public String rawTitle;        // raw HTML of the last fetched page
	public String strTimeOfBorn;   // timestamp taken when the crawler starts
	public CrawlWeb()
	{
		getTime();
	}
	//public static String strWriteFile = "./os.txt";
	public static final String[] webURLs = {"http://news.sohu.com/","http://news.ifeng.com/","http://news.qq.com/",
									"http://news.sina.com.cn/","http://www.xinhuanet.com/","http://news.baidu.com/",
									"http://news.163.com/","http://www.people.com.cn/","http://news.cntv.cn/",
									"http://www.chinanews.com/","http://www.zaobao.com/","http://www.huanqiu.com/",
									"http://www.gov.cn/","http://cn.yahoo.com/","http://www.stnn.cc/",
									"http://www.cankaoxiaoxi.com/","http://www.takungpao.com/",
									"http://www.china.com/","http://www.china.com.cn/"};
	void getWebPage(String url)
	{
		StringBuffer res = new StringBuffer();
		try
		{
			URL tric = new URL(url);
			HttpURLConnection con = (HttpURLConnection) tric.openConnection();
			// Pretend to be a desktop browser; some portals block unknown crawlers.
			con.setRequestProperty("User-Agent",
			"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.215 Safari/534.10");
			// These portals serve a mix of GBK and UTF-8, so honor the
			// Content-Type header instead of the platform default charset.
			String charset = "UTF-8";
			String contentType = con.getContentType();
			if (contentType != null) {
				Matcher cm = Pattern.compile("charset=([\\w-]+)").matcher(contentType);
				if (cm.find()) {
					charset = cm.group(1);
				}
			}
			InputStream is = con.getInputStream();
			BufferedReader in = new BufferedReader(new InputStreamReader(is, charset));
			String line;
			while ((line = in.readLine()) != null)
			{
				res.append(line).append("\r\n");
			}
			in.close();
		}
		catch (Exception e) {
			System.out.println(e);
		}
		this.rawTitle = res.toString();
	}
	
	void getNewsTitle()
	{
		StringBuffer res = new StringBuffer();
		
		// Grab the inner text of every <a>...</a> tag; on a portal's front
		// page most anchor texts are headlines (plus some menu noise).
		Pattern p = Pattern.compile(">([^<]+?)</[aA]>");
		Matcher m = p.matcher(this.rawTitle);
		while(m.find())
		{
			res.append(m.group(1)).append("\r\n");
		}
		
		this.strNewsTitle = res.toString();
	}
	
	void show()
	{
		System.out.println(this.strNewsTitle);
	}
	
	void writeToFile(String content, String savefile) {
		savefile = "./data/" + savefile + ".txt";
		System.out.println(savefile);
		File file = new File(savefile);
		file.getParentFile().mkdirs(); // make sure ./data exists
		try {
			// Append, so repeated crawls within the same hour land in one file.
			BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(file, true), "UTF-8"));
			bw.append(content);
			bw.close();
		} catch (IOException e) {
			System.out.println(e);
		}
	}
	
	void getTime()
	{
		// Timestamp used in output file names, e.g. "2013-Apr-07-15"
		// (year-month-day-hour).
		SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MMM-dd-HH", Locale.ENGLISH);
		this.strTimeOfBorn = fmt.format(new Date());
	}
}

package Web;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Main {

	public static void main(String[] args)
	{ 
		String url;
		CrawlWeb cw = new CrawlWeb();
		System.out.println(cw.strTimeOfBorn);
		
		for(int i = 0; i < CrawlWeb.webURLs.length; i++)
		{
			url = CrawlWeb.webURLs[i];
			cw.getWebPage(url);
			cw.getNewsTitle();
			
			/*
			 * Build the output file name as hostname.timestamp,
			 * e.g. news.sina.com.cn.2013-Apr-07-15
			 */
			String savefile = "";
			Pattern p = Pattern.compile("//(.+?)/"); // pull the hostname out of the URL
			Matcher m = p.matcher(url); 
			if(m.find())
			{
				savefile = m.group(1) + "." + cw.strTimeOfBorn;
			}
			
			cw.writeToFile(cw.strNewsTitle, savefile);
			System.out.println("---");
		}
		
	}
}


A sample of the crawled data: [screenshot omitted]

------------------

Sadly, there's still quite a lot of noise mixed in.

(How to remove the noise: after mulling it over for a while, string length looks like a usable signal. Real news titles are rarely short, so for now let's filter with a length threshold of 5.)
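A minimal sketch of that filter, meant to run on strNewsTitle before writeToFile (the threshold of 5 is just the guess above, to be tuned against real data):

public class TitleFilter {
	// Keep only extracted strings of at least 5 characters; real headlines
	// are rarely that short, while menu labels like "首页" or "更多" are.
	static String filterShortTitles(String titles) {
		StringBuffer res = new StringBuffer();
		for (String line : titles.split("\r\n")) {
			if (line.trim().length() >= 5) {
				res.append(line).append("\r\n");
			}
		}
		return res.toString();
	}
}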

