准备写个toy:
实现这么几个功能:
从 新浪,凤凰网,搜狐等抓取当天的新闻(目前暂定一天抓三次)。
对它们进行分词,然后看看它们关注的话题有何不同。
-----mark 一下
抓取的网页多一点,然后每天对分出来的词进行归类,与hao123的热点新闻比较,看能不能得到点结果。
2013-4-7:
已经可以抓取好多网页的新闻标题了。
package Web;
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches news-portal front pages and extracts anchor-tag text as candidate
 * news titles.
 *
 * NOTE(review): all state lives in public static fields, so the class is not
 * thread-safe and only one page can be "in flight" at a time. Kept as-is for
 * compatibility with existing callers that read these fields directly.
 */
public class CrawlWeb {
    public static String strNewsTitle;     // extracted titles, one per line ("\r\n"-separated)
    public static String strNewsTitleSeg;  // reserved for segmented titles (not populated here)
    public static String rawTitle;         // raw HTML of the most recently fetched page
    public static String strTimeOfBorn;    // crawl timestamp, e.g. "2013-Apr-07-15"

    /** News portals to crawl. */
    public static String[] webURLs = {"http://news.sohu.com/","http://news.ifeng.com/","http://news.qq.com/",
            "http://news.sina.com.cn/","http://www.xinhuanet.com/","http://news.baidu.com/",
            "http://news.163.com/","http://www.people.com.cn/","http://news.cntv.cn/",
            "http://www.chinanews.com/","http://www.zaobao.com/","http://www.huanqiu.com/",
            "http://www.gov.cn/","http://cn.yahoo.com/","http://www.stnn.cc/",
            "http://www.cankaoxiaoxi.com/","http://www.takungpao.com/",
            "http://www.china.com/","http://www.china.com.cn/"};

    /** Matches the text content of an <a>...</a> element. Compiled once, not per call. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(">([^<]+?)</[aA]>");

    /** Extracts the charset token from a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset=([\\w-]+)", Pattern.CASE_INSENSITIVE);

    /** Filename timestamp format, e.g. "2013-Apr-07-15" (year-month-day-hour). */
    private static final DateTimeFormatter TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MMM-dd-HH", Locale.ENGLISH);

    /** Records the crawl timestamp at construction time. */
    public CrawlWeb() {
        getTime();
    }

    /**
     * Downloads the page at {@code url} into {@link #rawTitle}.
     *
     * Decodes the response with the charset advertised in the Content-Type
     * header, falling back to UTF-8. (The original used the platform default
     * charset, which mangles GBK/UTF-8 Chinese pages.) On any error the
     * exception is printed and {@link #rawTitle} is set to whatever was read
     * so far (possibly empty), matching the original best-effort behavior.
     *
     * @param url absolute http URL of the page to fetch
     */
    void getWebPage(String url) {
        StringBuilder res = new StringBuilder();
        try {
            HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
            // Some portals serve different (or no) content to unknown clients.
            con.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.215 Safari/534.10");
            Charset charset = charsetOf(con.getContentType());
            // try-with-resources closes the stream even when readLine throws
            // (the original leaked it on any mid-read exception).
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), charset))) {
                String line;
                while ((line = in.readLine()) != null) {
                    res.append(line).append("\r\n");
                }
            }
        } catch (Exception e) {
            System.out.println(e);
        }
        rawTitle = res.toString();
    }

    /** Returns the charset named in a Content-Type header, or UTF-8 when absent/unknown. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PATTERN.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (Exception ignored) {
                    // Server advertised a charset Java doesn't know; fall back.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Scans {@link #rawTitle} for anchor-tag text and stores the results in
     * {@link #strNewsTitle}, one title per "\r\n"-terminated line.
     */
    void getNewsTitle() {
        StringBuilder res = new StringBuilder();
        Matcher m = TITLE_PATTERN.matcher(rawTitle);
        while (m.find()) {
            res.append(m.group(1)).append("\r\n");
        }
        strNewsTitle = res.toString();
    }

    /** Prints the extracted titles to stdout. */
    void Show() {
        System.out.println(strNewsTitle);
    }

    /**
     * Appends {@code content} to ./data/{savefile}.txt in UTF-8, creating the
     * data directory if needed.
     *
     * Fixes two defects in the original: the hard-coded "\\" separator broke
     * on non-Windows systems (the crawler's own User-Agent says Linux), and a
     * missing ./data directory made every write fail with a silently-swallowed
     * FileNotFoundException.
     *
     * @param content  text to append
     * @param savefile base file name without directory or extension
     */
    void writeToFile(String content, String savefile) {
        File file = new File("data", savefile + ".txt");  // platform-independent separator
        System.out.println(file.getPath());
        File dir = file.getParentFile();
        if (dir != null) {
            dir.mkdirs();  // no-op when the directory already exists
        }
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(file, true), StandardCharsets.UTF_8))) {
            bw.append(content);
        } catch (IOException e) {
            System.out.println(e);
        }
    }

    /**
     * Stores the current timestamp as "yyyy-MMM-dd-HH" (e.g. "2013-Apr-07-15")
     * in {@link #strTimeOfBorn}. Uses java.time instead of the original
     * Date.toString() string-splitting, which was locale- and format-fragile.
     */
    void getTime() {
        strTimeOfBorn = LocalDateTime.now().format(TIME_FORMAT);
    }
}
package Web;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import Web.CrawlWeb;
/**
 * Driver: crawls every portal listed in {@code CrawlWeb.webURLs} once,
 * extracts the anchor-tag titles, and appends them to
 * ./data/&lt;websitename&gt;.&lt;timestamp&gt;.txt (e.g.
 * "news.sina.com.cn.2013-Apr-07-15.txt").
 */
public class main {
    public static void main(String[] args) {
        // Matches the host part of a URL, e.g. "news.sina.com.cn" out of
        // "http://news.sina.com.cn/". Compiled once, not per loop iteration
        // as the original did.
        Pattern hostPattern = Pattern.compile("//(.+?)/");
        CrawlWeb cw = new CrawlWeb();
        System.out.println(CrawlWeb.strTimeOfBorn);
        for (String url : CrawlWeb.webURLs) {
            cw.getWebPage(url);
            cw.getNewsTitle();
            // Save-file name format: <websitename>.<time>
            String savefile = "";
            Matcher m = hostPattern.matcher(url);
            if (m.find()) {
                savefile = m.group(1) + "." + CrawlWeb.strTimeOfBorn;
            }
            cw.writeToFile(CrawlWeb.strNewsTitle, savefile);
            System.out.println("---");
        }
    }
}
抓取的数据示例:
------------------
可惜,杂质还是挺多的。
(杂质如何去除:想了半天,发现可以从字符长度入手。一般的新闻标题都不短,暂用长度阈值5过滤吧)