这个东西虽然简单,但还是挺好玩的:首先把搜索结果页面用流读取出来,再写个正则表达式去除不需要的内容,最后把结果存成 XML 格式文件,或者直接存入数据库,用的时候再调用。
本代码只是显示 HTML 页的源码内容;如果需要抽取内容,请自行改写 public static String regex() 中的正则式。
- package rssTest;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.net.HttpURLConnection;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.net.URLConnection;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
/**
 * Downloads the HTML source of a web page and extracts search-result entries
 * (link, title, snippet) from it with a regular expression.
 *
 * @author Der
 * @date 05-01
 * @E-mail uidin@163.com
 */
public class MyRSS
{
    /** Charset name used to decode the downloaded page. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Browser-like User-Agent sent with every request (see {@link #getHtmlSource(String)}). */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)";

    /** Search URL queried by {@link #GetNews()}. */
    private static final String SEARCH_URL =
            "http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&client=aff-os-maxthon&hs=SUZ&q=%E8%A7%81%E9%BE%99%E5%8D%B8%E7%94%B2&meta=&aq=f";

    /** Compiled once: the expression returned by {@link #regex()} never changes at runtime. */
    private static final Pattern RESULT_PATTERN = Pattern.compile(regex());

    /**
     * Fetches the HTML source of the page at {@code url}.
     *
     * <p>The page is decoded as gb2312 and returned with every line terminated
     * by a single {@code '\n'}. Failures are reported on standard error and do
     * not propagate: the method returns whatever was read before the failure,
     * which is the empty string when the URL is malformed or the connection
     * cannot be opened.
     *
     * @param url address of the page to download
     * @return the page source read so far; never {@code null}
     */
    public static String getHtmlSource(String url)
    {
        // Allocated up front so that a failure before the first read yields ""
        // instead of a NullPointerException on return.
        StringBuilder source = new StringBuilder();
        BufferedReader reader = null;
        try
        {
            URLConnection connection = new URL(url).openConnection();
            // Some servers only answer requests that appear to come from a
            // browser; sending a browser-like User-Agent header gets past
            // that restriction.
            connection.setRequestProperty("User-Agent", USER_AGENT);
            // Read the response stream line by line.
            reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), PAGE_CHARSET));
            String line;
            while ((line = reader.readLine()) != null)
            {
                source.append(line).append("\n");
            }
        }
        catch (MalformedURLException e)
        {
            e.printStackTrace();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            // Close on every path, not only after a fully successful read.
            close(reader);
        }
        return source.toString();
    }

    /** Closes {@code reader} if it was opened, reporting (not throwing) any close failure. */
    private static void close(BufferedReader reader)
    {
        if (reader == null)
        {
            return;
        }
        try
        {
            reader.close();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Returns the regular expression used to pick result entries out of the page source.
     *
     * <p>Of its capture groups, group 2 is used as the link URL, group 4 as
     * the title and group 6 as the content snippet. The expression is compiled
     * without {@code DOTALL}, so {@code .} does not match line terminators and
     * an entry must sit on a single line of the source to be matched.
     *
     * @return the result-entry regular expression
     */
    public static String regex()
    {
        return "<div class=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)</a>(.*?)<div class=std>(.*?)<br>";
    }

    /**
     * Extracts every result entry from {@code htmlSource}.
     *
     * @param htmlSource page source to scan; {@code null} is treated as empty
     * @return a flat list holding, for each matched entry in order, its link
     *         URL, its title and its content snippet (three elements per
     *         entry); empty when nothing matches
     */
    public static List<String> extractNews(String htmlSource)
    {
        List<String> newsList = new ArrayList<String>();
        if (htmlSource == null)
        {
            return newsList;
        }
        Matcher matcher = RESULT_PATTERN.matcher(htmlSource);
        while (matcher.find())
        {
            newsList.add(matcher.group(2));
            newsList.add(stripHighlightMarkup(matcher.group(4)));
            newsList.add(stripHighlightMarkup(matcher.group(6)));
        }
        return newsList;
    }

    /**
     * Removes the keyword-highlight font tags and the literal {@code <b>...</b>}
     * ellipsis marker from {@code text}.
     */
    private static String stripHighlightMarkup(String text)
    {
        // String.replace matches literally. With replaceAll the "..." would be
        // three regex wildcards and would also delete genuine bold text such
        // as "<b>abc</b>".
        return text.replace("<font color=CC0033>", "")
                .replace("</font>", "")
                .replace("<b>...</b>", "");
    }

    /**
     * For testing: runs a fixed keyword search and extracts the result entries.
     *
     * @return the entries found, in the layout described by {@link #extractNews(String)}
     */
    public static List<String> GetNews()
    {
        return extractNews(getHtmlSource(SEARCH_URL));
    }

    /**
     * Prints the HTML source of a sample page to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        System.out.println(
                getHtmlSource("http://main.house.sina.com.cn/news/zckb/index.html"));
    }
}