准备写个toy:
实现这么几个功能:
从 新浪,凤凰网,搜狐等抓取当天的新闻(目前暂定一天抓三次)。
对它们进行分词,然后看看它们关注的话题有何不同。
-----mark 一下
抓取的网页多一点,然后每天对分出来的词进行归类,与hao123的热点新闻比较,看能不能得到点结果。
2013-4-7:
已经可以抓取好多网页的新闻标题了。
package Web;
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches news-portal front pages and extracts anchor-tag text as candidate
 * news titles.
 *
 * NOTE(review): all state lives in public static fields, so the class is not
 * thread-safe and only one page can be "in flight" at a time. Kept as-is for
 * compatibility with existing callers that read these fields directly.
 */
public class CrawlWeb {
    public static String strNewsTitle;     // extracted titles, one per line ("\r\n"-separated)
    public static String strNewsTitleSeg;  // reserved for segmented titles (not populated here)
    public static String rawTitle;         // raw HTML of the most recently fetched page
    public static String strTimeOfBorn;    // crawl timestamp, e.g. "2013-Apr-07-15"

    /** News portals to crawl. */
    public static String[] webURLs = {"http://news.sohu.com/","http://news.ifeng.com/","http://news.qq.com/",
            "http://news.sina.com.cn/","http://www.xinhuanet.com/","http://news.baidu.com/",
            "http://news.163.com/","http://www.people.com.cn/","http://news.cntv.cn/",
            "http://www.chinanews.com/","http://www.zaobao.com/","http://www.huanqiu.com/",
            "http://www.gov.cn/","http://cn.yahoo.com/","http://www.stnn.cc/",
            "http://www.cankaoxiaoxi.com/","http://www.takungpao.com/",
            "http://www.china.com/","http://www.china.com.cn/"};

    /** Matches the text content of an <a>...</a> element. Compiled once, not per call. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(">([^<]+?)</[aA]>");

    /** Extracts the charset token from a Content-Type header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset=([\\w-]+)", Pattern.CASE_INSENSITIVE);

    /** Filename timestamp format, e.g. "2013-Apr-07-15" (year-month-day-hour). */
    private static final DateTimeFormatter TIME_FORMAT =
            DateTimeFormatter.ofPattern("yyyy-MMM-dd-HH", Locale.ENGLISH);

    /** Records the crawl timestamp at construction time. */
    public CrawlWeb() {
        getTime();
    }

    /**
     * Downloads the page at {@code url} into {@link #rawTitle}.
     *
     * Decodes the response with the charset advertised in the Content-Type
     * header, falling back to UTF-8. (The original used the platform default
     * charset, which mangles GBK/UTF-8 Chinese pages.) On any error the
     * exception is printed and {@link #rawTitle} is set to whatever was read
     * so far (possibly empty), matching the original best-effort behavior.
     *
     * @param url absolute http URL of the page to fetch
     */
    void getWebPage(String url) {
        StringBuilder res = new StringBuilder();
        try {
            HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
            // Some portals serve different (or no) content to unknown clients.
            con.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.215 Safari/534.10");
            Charset charset = charsetOf(con.getContentType());
            // try-with-resources closes the stream even when readLine throws
            // (the original leaked it on any mid-read exception).
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream(), charset))) {
                String line;
                while ((line = in.readLine()) != null) {
                    res.append(line).append("\r\n");
                }
            }
        } catch (Exception e) {
            System.out.println(e);
        }
        rawTitle = res.toString();
    }

    /** Returns the charset named in a Content-Type header, or UTF-8 when absent/unknown. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PATTERN.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (Exception ignored) {
                    // Server advertised a charset Java doesn't know; fall back.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Scans {@link #rawTitle} for anchor-tag text and stores the results in
     * {@link #strNewsTitle}, one title per "\r\n"-terminated line.
     */
    void getNewsTitle() {
        StringBuilder res = new StringBuilder();
        Matcher m = TITLE_PATTERN.matcher(rawTitle);
        while (m.find()) {
            res.append(m.group(1)).append("\r\n");
        }
        strNewsTitle = res.toString();
    }

    /** Prints the extracted titles to stdout. */
    void Show() {
        System.out.println(strNewsTitle);
    }

    /**
     * Appends {@code content} to ./data/{savefile}.txt in UTF-8, creating the
     * data directory if needed.
     *
     * Fixes two defects in the original: the hard-coded "\\" separator broke
     * on non-Windows systems (the crawler's own User-Agent says Linux), and a
     * missing ./data directory made every write fail with a silently-swallowed
     * FileNotFoundException.
     *
     * @param content  text to append
     * @param savefile base file name without directory or extension
     */
    void writeToFile(String content, String savefile) {
        File file = new File("data", savefile + ".txt");  // platform-independent separator
        System.out.println(file.getPath());
        File dir = file.getParentFile();
        if (dir != null) {
            dir.mkdirs();  // no-op when the directory already exists
        }
        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(file, true), StandardCharsets.UTF_8))) {
            bw.append(content);
        } catch (IOException e) {
            System.out.println(e);
        }
    }

    /**
     * Stores the current timestamp as "yyyy-MMM-dd-HH" (e.g. "2013-Apr-07-15")
     * in {@link #strTimeOfBorn}. Uses java.time instead of the original
     * Date.toString() string-splitting, which was locale- and format-fragile.
     */
    void getTime() {
        strTimeOfBorn = LocalDateTime.now().format(TIME_FORMAT);
    }
}
package Web;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import Web.CrawlWeb;
/**
 * Driver: crawls every portal listed in {@code CrawlWeb.webURLs} once,
 * extracts the anchor-tag titles, and appends them to
 * ./data/&lt;websitename&gt;.&lt;timestamp&gt;.txt (e.g.
 * "news.sina.com.cn.2013-Apr-07-15.txt").
 */
public class main {
    public static void main(String[] args) {
        // Matches the host part of a URL, e.g. "news.sina.com.cn" out of
        // "http://news.sina.com.cn/". Compiled once, not per loop iteration
        // as the original did.
        Pattern hostPattern = Pattern.compile("//(.+?)/");
        CrawlWeb cw = new CrawlWeb();
        System.out.println(CrawlWeb.strTimeOfBorn);
        for (String url : CrawlWeb.webURLs) {
            cw.getWebPage(url);
            cw.getNewsTitle();
            // Save-file name format: <websitename>.<time>
            String savefile = "";
            Matcher m = hostPattern.matcher(url);
            if (m.find()) {
                savefile = m.group(1) + "." + CrawlWeb.strTimeOfBorn;
            }
            cw.writeToFile(CrawlWeb.strNewsTitle, savefile);
            System.out.println("---");
        }
    }
}
抓取的数据示例:
------------------
可惜,杂质还是挺多的。
(杂质如何去除:想了半天,发现可以从字符长度入手。一般的新闻标题都不短,暂用长度阈值5过滤吧)