I wrote a program that scrapes weather news from Sina and stores it locally. For access-speed reasons, the images in each article also need to be saved locally.
The program is as follows:
package vnet.com.weather1;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import vnet.com.update.Getdata;
/**
 * Scrapes news items from the Sina weather news page with regular expressions.
 * Source page: http://weather.news.sina.com.cn/weather/news/index.html
 */
public class Newlist {
private static final Log log = LogFactory.getLog(Newlist.class);
/**
 * Test entry point.
 * @param args
 */
public static void main(String args[]) {
    Newlist n = new Newlist();
    String[] k = n.getNewList();
    for (int i = 0; i < k.length; i++) {
        System.out.println(k[i].replace("href=\"", "href=\"newinfo2.jsp?url="));
    }
    String[] m = n.getNewinfo("news/2008/1119/35261.html");
    for (int l = 0; l < m.length; l++) {
        System.out.println(m[l]);
    }
}
/**
 * Fetches the news body for the given URL and returns it as a String[].
 * Images referenced in the article are downloaded locally, and the image
 * URLs in the text are rewritten to point at the local copies.
 * @param url
 * @return
 */
public String[] getNewinfo(String url) {
    String URL = "http://weather.news.sina.com.cn/" + url;
    // 30 means: keep up to 30 matches of the given pattern; if only 10 are
    // found, the rest of the array stays null
    String[] s = analysis("<p>(.*?)</p>", getContent(URL), 30);
    for (int i = 0; i < s.length; i++) {
        Pattern sp = Pattern.compile("src=\"(.*?)\"");
        Matcher matcher = sp.matcher(s[i]);
        if (matcher.find()) {
            String imageurl = analysis("src=\"(.*?)\"", s[i], 1)[0];
            if (!imageurl.startsWith("http://")) {
                imageurl = "http://weather.news.sina.com.cn/" + imageurl;
            }
            System.out.println("News contains an image: " + imageurl);
            String content = getContent(imageurl);
            String[] images = imageurl.split("/");
            String imagename = images[images.length - 1];
            System.out.println("Image name: " + imagename);
            try {
                // NOTE: this forces the image bytes through a String; as
                // discussed below, binary data gets corrupted this way
                File fwl = new File(imagename);
                PrintWriter outl = new PrintWriter(fwl);
                outl.println(content);
                outl.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.println("s[i]: " + s[i]);
            // rewrite the image URL in the text to the local file name
            s[i] = s[i].replace(analysis("src=\"(.*?)\"", s[i], 1)[0], imagename);
        }
    }
    return s;
}
public String[] getNewList() {
    String url = "http://weather.news.sina.com.cn/weather/news/index.html";
    return getNewList(getContent(url));
}
private String[] getNewList(String content) {
    //String[] s = analysis("align=\"center\" valign=\"top\"><img src=\"../images/a(.*?).gif\" width=\"70\" height=\"65\"></td>", content, 50);
    String[] s = analysis("<li>(.*?)</li>", content, 50);
    return s;
}
private String[] analysis(String pattern, String match, int i) {
    Pattern sp = Pattern.compile(pattern);
    Matcher matcher = sp.matcher(match);
    String[] content = new String[i];
    // stop after i matches so we never run past the end of the array
    for (int i1 = 0; i1 < i && matcher.find(); i1++) {
        content[i1] = matcher.group(1);
    }
    // the block below trims the trailing null entries
    int l = 0;
    for (int k = 0; k < content.length; k++) {
        if (content[k] == null) {
            l = k;
            break;
        }
    }
    String[] content2;
    if (l != 0) {
        content2 = new String[l];
        for (int n = 0; n < l; n++) {
            content2[n] = content[n];
        }
        return content2;
    } else {
        return content;
    }
}
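// An alternative to the fixed-size array above: collecting the matches in a
// List avoids guessing the match count and the null-trimming pass entirely.
// (A sketch of my own, not part of the original program; requires
// import java.util.ArrayList; and import java.util.List;)
private List<String> analysisAll(String pattern, String input) {
    List<String> result = new ArrayList<String>();
    Matcher matcher = Pattern.compile(pattern).matcher(input);
    while (matcher.find()) {
        result.add(matcher.group(1)); // keep only the captured group
    }
    return result;
}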
/**
 * Fetches page content for a URL (old Reader-based version, kept commented out for reference)
 * @param strUrl
 * @return
private String getContent(String strUrl) {
    try {
        URLConnection uc = new URL(strUrl).openConnection();
        // disguise the request as coming from a browser by setting the User-Agent header
        uc.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        System.out.println("-----------------------------------------");
        System.out.println("Content-Length: " + uc.getContentLength());
        System.out.println("Set-Cookie: " + uc.getHeaderField("Set-Cookie"));
        System.out.println("-----------------------------------------");
        // dump the response headers
        System.out.println("Header" + uc.getHeaderFields().toString());
        System.out.println("-----------------------------------------");
        BufferedReader br = new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312"));
        String s = "";
        StringBuffer sb = new StringBuffer();
        while ((s = br.readLine()) != null) {
            sb.append(s + "\r\n");
        }
        System.out.println("Length: " + sb.toString().length());
        return sb.toString();
    } catch (Exception e) {
        return "error open url" + strUrl;
    }
}
*/
public static String getContent(String strUrl) {
    URLConnection uc = null;
    String all_content = null;
    try {
        all_content = new String();
        URL url = new URL(strUrl);
        uc = url.openConnection();
        uc.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        System.out.println("-----------------------------------------");
        System.out.println("Content-Length: " + uc.getContentLength());
        System.out.println("Set-Cookie: " + uc.getHeaderField("Set-Cookie"));
        System.out.println("-----------------------------------------");
        // dump the response headers
        System.out.println("Header" + uc.getHeaderFields().toString());
        System.out.println("-----------------------------------------");
        if (uc == null)
            return null;
        InputStream ins = uc.getInputStream();
        ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
        byte[] str_b = new byte[1024];
        int i = -1;
        while ((i = ins.read(str_b)) > 0) {
            outputstream.write(str_b, 0, i);
        }
        // toString() decodes the raw bytes with the platform default charset --
        // fine for text pages, but it corrupts binary data such as images
        all_content = outputstream.toString();
        // System.out.println(all_content);
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Failed to fetch page content");
    } finally {
        uc = null;
    }
    // return new String(all_content.getBytes("ISO8859-1"));
    System.out.println(all_content.length());
    return all_content;
}
}
The problem now: images do not download intact. With both getContent variants above, the size of the downloaded image never matches the Content-Length reported in the response header (i.e., the image's actual size), and the file cannot be previewed.
Repeated tests download exactly the same (wrong) number of bytes every time, so simply retrying the download is useless.
Measuring shows the String produced by toString() is shorter than the actual image, while the file written from that String is larger than the image data: bytes are being altered between download and storage.
In other words, the image byte stream is changed by the toString() conversion and cannot be recovered afterwards; plain-text news content is unaffected. This is a character-encoding problem: binary image data must never pass through a String. Writing the file directly from the input stream fixes it:
// requires: import java.io.FileOutputStream;
public int saveImage(String strUrl) {
    URLConnection uc = null;
    try {
        URL url = new URL(strUrl);
        uc = url.openConnection();
        uc.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        //uc.setReadTimeout(30000);
        // image length, if needed:
        //System.out.println("Content-Length: " + uc.getContentLength());
        // response headers, if needed:
        //System.out.println("Header" + uc.getHeaderFields().toString());
        if (uc == null)
            return 0;
        InputStream ins = uc.getInputStream();
        byte[] str_b = new byte[1024];
        int byteRead = 0;
        // derive the local file name from the last path segment of the URL
        String[] images = strUrl.split("/");
        String imagename = images[images.length - 1];
        File fwl = new File(imagename);
        FileOutputStream fos = new FileOutputStream(fwl);
        // copy the raw bytes straight to disk -- no String conversion involved
        while ((byteRead = ins.read(str_b)) > 0) {
            fos.write(str_b, 0, byteRead);
        }
        fos.flush();
        fos.close();
    } catch (Exception e) {
        e.printStackTrace();
        log.error("Failed to fetch image");
    } finally {
        uc = null;
    }
    return 1;
}
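For reference, here is a minimal standalone sketch (my own illustration, not from the original program) of why the String round-trip corrupts the image: byte sequences that are invalid in the decoding charset are replaced during decoding, so encoding the String back never reproduces the original bytes. That matches the symptom above, where the String is shorter than the image while the file written from it is larger.
import java.util.Arrays;

public class CharsetRoundTrip {
    public static void main(String[] args) throws Exception {
        // bytes like these are common in image data but invalid as gb2312 text
        byte[] original = { (byte) 0x89, 'P', 'N', 'G', (byte) 0xFF, (byte) 0xD8 };
        String asText = new String(original, "gb2312"); // invalid sequences become U+FFFD
        byte[] roundTrip = asText.getBytes("gb2312");   // replacement chars re-encode differently
        System.out.println(Arrays.equals(original, roundTrip)); // prints false
    }
}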
Method 2:
First read the result page in as a stream, then use a regular expression to strip out the unwanted content, and finally save the result as an XML file or straight into a database, to be loaded whenever it is needed.
The code below only prints the HTML source of the page; to extract actual content, adapt the regular expression in public static String regex() to your needs.
package rssTest;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MyRSS
{
/**
 * Fetches the HTML source of the search-result page
 */
public static String getHtmlSource(String url)
{
    StringBuffer codeBuffer = null;
    BufferedReader in = null;
    try
    {
        URLConnection uc = new URL(url).openConnection();
        /**
         * Some sites try to block non-browser clients from reading pages
         * directly. Setting the User-Agent header disguises this request
         * as one submitted by a browser.
         */
        uc.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        // read the URL's content stream
        in = new BufferedReader(new InputStreamReader(uc
                .getInputStream(), "gb2312"));
        codeBuffer = new StringBuffer();
        String tempCode = "";
        // copy the buffered lines into codeBuffer
        while ((tempCode = in.readLine()) != null)
        {
            codeBuffer.append(tempCode).append("\n");
        }
        in.close();
    }
    catch (MalformedURLException e)
    {
        e.printStackTrace();
    }
    catch (IOException e)
    {
        e.printStackTrace();
    }
    // guard against codeBuffer never being assigned when the connection fails
    return codeBuffer == null ? "" : codeBuffer.toString();
}
/**
 * The regular expression used for extracting results
 */
public static String regex()
{
    String googleRegex = "<div class=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)</a>(.*?)<div class=std>(.*?)<br>";
    return googleRegex;
}
/**
 * For testing:
 * searches Google for a keyword and extracts the desired pieces
 */
public static List<String> GetNews()
{
    List<String> newsList = new ArrayList<String>();
    String allHtmlSource = MyRSS
            .getHtmlSource("http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&client=aff-os-maxthon&hs=SUZ&q=%E8%A7%81%E9%BE%99%E5%8D%B8%E7%94%B2&meta=&aq=f");
    Pattern pattern = Pattern.compile(regex());
    Matcher matcher = pattern.matcher(allHtmlSource);
    while (matcher.find())
    {
        String urlLink = matcher.group(2);
        String title = matcher.group(4);
        title = title.replaceAll("<font color=CC0033>", "");
        title = title.replaceAll("</font>", "");
        // "." is a regex wildcard, so this strips <b> plus any three characters plus </b>
        title = title.replaceAll("<b>...</b>", "");
        String content = matcher.group(6);
        content = content.replaceAll("<font color=CC0033>", "");
        content = content.replaceAll("</font>", "");
        content = content.replaceAll("<b>...</b>", "");
        newsList.add(urlLink);
        newsList.add(title);
        newsList.add(content);
    }
    return newsList;
}
/**
 * main method
 */
public static void main(String[] args)
{
    System.out.println(MyRSS
            .getHtmlSource("http://main.house.sina.com.cn/news/zckb/index.html"));
}
}
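The description above mentions saving the extracted results as an XML file, but MyRSS never gets that far. Here is a minimal sketch of that last step (a hypothetical helper of my own; element names are invented, and it does not escape XML special characters in the values):
// Sketch: dump the url/title/content triples from GetNews() into a simple XML file.
// Requires: import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter;
public static void saveAsXml(List<String> newsList, String fileName) throws IOException {
    PrintWriter out = new PrintWriter(new FileWriter(fileName));
    out.println("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
    out.println("<items>");
    // GetNews() appends items as consecutive url, title, content triples
    for (int i = 0; i + 2 < newsList.size(); i += 3) {
        out.println("  <item>");
        out.println("    <url>" + newsList.get(i) + "</url>");
        out.println("    <title>" + newsList.get(i + 1) + "</title>");
        out.println("    <content>" + newsList.get(i + 2) + "</content>");
        out.println("  </item>");
    }
    out.println("</items>");
    out.close();
}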
Method 3:
Automatically scraping news for a JSP site.
package com.news.spider;
import java.io.File;
import java.io.FileFilter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.db.DBAccess;
public class SpiderNewsServer {
public static void main(String[] args) throws Exception {
    // entry page to scrape
    String endPointUrl = "http://cn.china.cn/zixun/";
    // current date
    Calendar calendar = Calendar.getInstance();
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
    String DateNews = sdf.format(calendar.getTime());
    /********************
     * Scraping the second-level URLs -- start
     * URL pattern: "http://cn.china.cn/article/"
     */
    List<String> listNewsType = new ArrayList<String>();
    // fetch the entry page's html
    WebHtml webHtml = new WebHtml();
    String htmlDocuemtnt1 = webHtml.getWebHtml(endPointUrl);
    if (htmlDocuemtnt1 == null || htmlDocuemtnt1.length() == 0) {
        return;
    }
    String strTemp1 = "http://cn.china.cn/article/";
    String strTemp2 = "</li>";
    int stopIndex = 0;
    int startIndex = 0;
    int dd = 0;
    while (true) {
        dd++;
        startIndex = htmlDocuemtnt1.indexOf(strTemp1, stopIndex);
        System.out.println("==========" + startIndex);
        stopIndex = htmlDocuemtnt1.indexOf(strTemp2, startIndex);
        System.out.println("==========---------" + stopIndex);
        if (startIndex != -1 && stopIndex != -1) {
            String companyType = htmlDocuemtnt1.substring(startIndex, stopIndex);
            System.out.println("@@@@@--------" + companyType);
            System.out.println("@@@@@--------" + companyType.indexOf("\""));
            // keep only the URL itself: cut at the closing quote
            companyType = companyType.substring(0, companyType.indexOf("\""));
            System.out.println("#####--------" + companyType);
            listNewsType.add(companyType);
        }
        if (dd > 10) {
            break;
        }
        if (stopIndex == -1 || startIndex == -1) {
            break;
        }
    }
    System.out.println("listCompanyType=====" + listNewsType.size());
    /**
     * Scraping the second-level URLs -- end
     ********************/
    /********************
     * Scraping page content -- start
     */
    String title = "";
    String hometext = "";
    String bodytext = "";
    String keywords = "";
    String counter = "221";
    String cdate = "";
    int begainIndex = 0; // start index of the searched substring
    int endIndex = 0; // end index of the searched substring
    String begainStr; // marker where extraction starts
    String endStr; // marker where extraction ends
    for (int rows = 1; rows < listNewsType.size(); rows++) {
        String strNewsDetail = listNewsType.get(rows);
        System.out.println("strNewsDetail=====" + strNewsDetail);
        if (strNewsDetail != null && strNewsDetail.length() > 0) {
            WebHtml newsListHtml = new WebHtml();
            String htmlDocuemtntCom = newsListHtml.getWebHtml(strNewsDetail);
            System.out.println("$$$$$------" + htmlDocuemtntCom);
            if (htmlDocuemtntCom == null || htmlDocuemtntCom.length() == 0) {
                return;
            }
            // extract the publication time ("时间:" is the literal marker on the target page)
            int dateBegainIndex = htmlDocuemtntCom.indexOf("<div>时间:");
            System.out.println("%%%%%--" + dateBegainIndex);
            String newTime = htmlDocuemtntCom.substring(dateBegainIndex, dateBegainIndex + 20);
            System.out.println("^^^^^^^^^^^^^^^---" + newTime);
            String newTimeM = newTime.substring(newTime.lastIndexOf("-") + 1, newTime.lastIndexOf("-") + 3);
            String dateM = DateNews.substring(DateNews.lastIndexOf("-") + 1);
            System.out.println("^^^^^^^^^^^^^^^---" + newTimeM);
            System.out.println("^^^^^^^^^^^^^^^---" + dateM);
            // only keep items whose day-of-month matches today
            if (newTimeM.equals(dateM)) {
                // extract the news title
                begainStr = "<div class=\"divCon bg008 \">";
                endStr = "<div>时间:";
                begainIndex = htmlDocuemtntCom.indexOf(begainStr, 0);
                System.out.println("&&&&&&------" + begainIndex);
                endIndex = htmlDocuemtntCom.indexOf(endStr, 0);
                System.out.println("&&&&&&------" + endIndex);
                if (begainIndex != -1 && endIndex != -1) {
                    title = htmlDocuemtntCom.substring(begainIndex, endIndex).trim();
                    title = title.substring(title.indexOf("<h1>") + 4, title.indexOf("</h1>"));
                    title = title.replace("'", "");
                    title = title.replace(";", "");
                    title = title.replace(" ", "");
                }
                // extract the news body
                begainStr = "<div class=\"divCon bg008 \">";
                endStr = "<!-- page begin -->";
                begainIndex = htmlDocuemtntCom.indexOf(begainStr, 0);
                endIndex = htmlDocuemtntCom.indexOf(endStr, 0);
                if (begainIndex != -1 && endIndex != -1) {
                    bodytext = htmlDocuemtntCom.substring(begainIndex, endIndex).trim();
                    if (bodytext.indexOf("<p>") > 0 && bodytext.indexOf("</p>") > bodytext.indexOf("<p>") && bodytext.indexOf("</p>") > 0)
                        bodytext = bodytext.substring(bodytext.indexOf("<p>") + 3, bodytext.indexOf("</p>"));
                    bodytext = bodytext.replace(" ", "");
                    bodytext = bodytext.replace("<br>", "");
                    bodytext = bodytext.replace("\n", "<br>");
                    bodytext = bodytext.replace("'", "");
                    bodytext = bodytext.replace(";", "");
                }
                // summary: first 40 characters of the body
                if (bodytext.length() > 40)
                    hometext = bodytext.substring(0, 40) + "......";
                else {
                    hometext = bodytext + "......";
                }
                // fake a view counter from a random number
                String str = String.valueOf(Math.random());
                counter = str.substring(str.lastIndexOf(".") + 1, 5);
                // unix timestamp (seconds) for the created/published columns
                Calendar cal = Calendar.getInstance();
                cal.setTime(new Date());
                cdate = cal.getTimeInMillis() + "";
                cdate = cdate.substring(0, 10);
            } else {
                continue;
            }
        }
}
System.out.println("-------------------------"+title);
System.out.println("-------------------------"+cdate);
System.out.println("-------------------------"+cdate);
System.out.println("-------------------------"+hometext);
System.out.println("-------------------------"+bodytext);
System.out.println("-------------------------"+keywords);
System.out.println("-------------------------"+counter);
/*String str = "INSERT INTO ecim_stories(uid,title,created,published,hostname,hometext,bodytext,keywords,counter,topicid,ihome,notifypub,story_type,topicdisplay,topicalign,comments,rating,votes,description) ";
str += "VALUE (1,'"+title+"',"+cdate+","+cdate+",'125.122.83.177','"+hometext+"','"+bodytext+"','"+keywords+"',"+counter+",1,0,1,'admin',0,'R',0,0,0,'')";
DBAccess db = new DBAccess();;
if(db.executeUpdate(str)>0) {
System.out.println("-------------------------成功!!!!!!!!!!");
}else {
System.out.println("-------------------------失败!!!!!!!!!!");
}*/
}
/**
* 抓取页面内容 结束
********************/
}
}
package com.news.spider;
import java.net.URL;
import java.net.URLConnection;
import java.io.BufferedReader;
import java.io.InputStreamReader;
public class WebHtml {
/**
 * Fetches the html content of the given url
 * @param url
 */
public String getWebHtml(String url) {
    try {
        URL myURL = new URL(url);
        URLConnection conn = myURL.openConnection();
        BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
        String line = null;
        StringBuffer document = new StringBuffer("");
        while ((line = reader.readLine()) != null) {
            document.append(line + "\n");
        }
        reader.close();
        String resutlDocument = new String(document);
        return resutlDocument;
    } catch (Exception e) {
        // at least report the failure instead of swallowing it silently
        e.printStackTrace();
    }
    return "";
}
}
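One caveat: getWebHtml decodes the stream with the platform default charset. For GB2312/GBK pages like the ones targeted in these examples, it is safer to name the charset explicitly, as getHtmlSource in Method 2 already does. A minimal sketch of such an overload (my own variation, not part of the original):
// Sketch: same as getWebHtml, but decoding with an explicit charset (e.g. "gb2312")
public String getWebHtml(String url, String charset) {
    try {
        URLConnection conn = new URL(url).openConnection();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), charset));
        StringBuffer document = new StringBuffer();
        String line;
        while ((line = reader.readLine()) != null) {
            document.append(line).append("\n");
        }
        reader.close();
        return document.toString();
    } catch (Exception e) {
        e.printStackTrace();
        return "";
    }
}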