A Java Program for Scraping Sina Weather News

I wrote a program that scrapes weather news from Sina and stores it locally. For access-speed reasons, the images inside each news item are saved locally as well.
The program is as follows:
package vnet.com.weather1;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import vnet.com.update.Getdata;
/**
 * Regex-based scraper for the news on Sina's weather-news page:
 * http://weather.news.sina.com.cn/weather/news/index.html
 */
public class Newlist {
    private static final Log log = LogFactory.getLog(Newlist.class);
    /**
     * Test entry point
     * @param args
     */
    public static void main(String args[]){
        Newlist n = new Newlist();
        String[] k = n.getNewList();
        for (int i = 0; i < k.length; i++){
            System.out.println(k[i].replace("href=\"", "href=\"newinfo2.jsp?url="));
        }
        String[] m = n.getNewinfo("news/2008/1119/35261.html");
        for (int l = 0; l < m.length; l++){
            System.out.println(m[l]);
        }
    }
    /**
     * Fetch the news content as a String[] for the given url.
     * Images in the news are downloaded locally, and the image addresses
     * in the text are rewritten to the local file names.
     * @param url
     * @return
     */
    public String[] getNewinfo(String url){
        String URL = "http://weather.news.sina.com.cn/" + url;
        // 30 means: keep at most 30 substrings matching the regex; if only
        // 10 are found, the remaining array entries are null
        String[] s = analysis("<p>(.*?)</p>", getContent(URL), 30);
        for (int i = 0; i < s.length; i++){
            Pattern sp = Pattern.compile("src=\"(.*?)\"");
            Matcher matcher = sp.matcher(s[i]);
            if (matcher.find()){
                String imageurl = analysis("src=\"(.*?)\"", s[i], 1)[0];
                if (!imageurl.startsWith("http://")){
                    imageurl = "http://weather.news.sina.com.cn/" + imageurl;
                }
                System.out.println("News has an image: " + imageurl);
                String content = getContent(imageurl);
                String[] images = imageurl.split("/");
                String imagename = images[images.length - 1];
                System.out.println("Image name: " + imagename);
                // NOTE: writing image bytes that passed through a String
                // corrupts them; see the discussion and saveImage below
                try {
                    File fwl = new File(imagename);
                    PrintWriter outl = new PrintWriter(fwl);
                    outl.println(content);
                    outl.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                System.out.println("s[i]: " + s[i]);
                // Rewrite the image address in the text to the local file name
                s[i] = s[i].replace(analysis("src=\"(.*?)\"", s[i], 1)[0], imagename);
            }
        }
        return s;
    }
    public String[] getNewList(){
        String url = "http://weather.news.sina.com.cn/weather/news/index.html";
        return getNewList(getContent(url));
    }

    private String[] getNewList(String content){
        //String[] s = analysis("align=\"center\" valign=\"top\"><img src=\"../images/a(.*?).gif\" width=\"70\" height=\"65\"></td>", content, 50);
        String[] s = analysis("<li>(.*?)</li>", content, 50);
        return s;
    }
    private String[] analysis(String pattern, String match, int i){
        Pattern sp = Pattern.compile(pattern);
        Matcher matcher = sp.matcher(match);
        String[] content = new String[i];
        // Bound the loop by i so that more than i matches cannot overflow the array
        for (int i1 = 0; i1 < i && matcher.find(); i1++){
            content[i1] = matcher.group(1);
        }
        // The block below trims the trailing null entries
        int l = 0;
        for (int k = 0; k < content.length; k++){
            if (content[k] == null){
                l = k;
                break;
            }
        }
        String[] content2;
        if (l != 0){
            content2 = new String[l];
            for (int n = 0; n < l; n++){
                content2[n] = content[n];
            }
            return content2;
        } else {
            return content;
        }
    }
    /**
     * Fetch page content from a URL (first, character-stream-based version;
     * kept commented out for reference)
     * @param strUrl
     * @return
    private String getContent(String strUrl){
        try{
            URLConnection uc = new URL(strUrl).openConnection();
            // Spoof the User-Agent header so the request looks like it was
            // submitted from a browser
            uc.setRequestProperty("User-Agent",
                                  "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            System.out.println("-----------------------------------------");
            System.out.println("Content-Length:     " + uc.getContentLength());
            System.out.println("Set-Cookie:     " + uc.getHeaderField("Set-Cookie"));
            System.out.println("-----------------------------------------");
            // Dump the response headers
            System.out.println("Header" + uc.getHeaderFields().toString());
            System.out.println("-----------------------------------------");
            BufferedReader br = new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312"));
            String s = "";
            StringBuffer sb = new StringBuffer();
            while((s = br.readLine()) != null){
                sb.append(s + "\r\n");
            }
            System.out.println("length: " + sb.toString().length());
            return sb.toString();
        }catch(Exception e){
            return "error open url" + strUrl;
        }
    }
    */

    public static String getContent(String strUrl){
        URLConnection uc = null;
        String all_content = null;
        try {
            all_content = new String();
            URL url = new URL(strUrl);
            uc = url.openConnection();
            uc.setRequestProperty("User-Agent",
                                  "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            System.out.println("-----------------------------------------");
            System.out.println("Content-Length:     " + uc.getContentLength());
            System.out.println("Set-Cookie:     " + uc.getHeaderField("Set-Cookie"));
            System.out.println("-----------------------------------------");
            // Dump the response headers
            System.out.println("Header" + uc.getHeaderFields().toString());
            System.out.println("-----------------------------------------");
            if (uc == null)
                return null;

            InputStream ins = uc.getInputStream();
            ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
            byte[] str_b = new byte[1024];
            int i = -1;
            while ((i = ins.read(str_b)) > 0) {
                outputstream.write(str_b, 0, i);
            }
            ins.close();
            all_content = outputstream.toString();
        } catch (Exception e) {
            e.printStackTrace();
            log.error("failed to fetch page content");
        } finally {
            uc = null;
        }
        // return new String(all_content.getBytes("ISO8859-1"));
        System.out.println(all_content.length());
        return all_content;
    }
     
}


The problem now: the images do not download intact. With either of the two getContent methods above, the saved file size never matches the Content-Length reported in the response header, i.e. the image's real size, and the image cannot be previewed.
Moreover, repeated tests show each method produces output of the same fixed size every time, so re-downloading does not help.
Testing shows that length() after toString is smaller than the actual image, yet the file written from that String is larger than the image data: the data grows while being saved!
The byte stream is altered by the toString conversion and cannot be recovered. Plain news text is unaffected, so this is a character-encoding problem with binary image data. The fix is to write the image to a file directly from the input stream, as the method below does (a minimal demo of the corruption follows it).
public int saveImage(String strUrl){    // also requires java.io.FileOutputStream
    URLConnection uc = null;
    try {
        URL url = new URL(strUrl);
        uc = url.openConnection();
        uc.setRequestProperty("User-Agent",
                              "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        //uc.setReadTimeout(30000);
        // Image length: System.out.println("Content-Length:     " + uc.getContentLength());
        // Response headers: System.out.println("Header" + uc.getHeaderFields().toString());
        if (uc == null)
            return 0;
        InputStream ins = uc.getInputStream();
        byte[] str_b = new byte[1024];
        int byteRead = 0;
        String[] images = strUrl.split("/");
        String imagename = images[images.length - 1];
        File fwl = new File(imagename);
        FileOutputStream fos = new FileOutputStream(fwl);
        // Copy the raw bytes straight to disk; no String conversion anywhere
        while ((byteRead = ins.read(str_b)) > 0) {
            fos.write(str_b, 0, byteRead);
        }
        fos.flush();
        fos.close();
        ins.close();
    } catch (Exception e) {
        e.printStackTrace();
        log.error("failed to fetch page content");
    } finally {
        uc = null;
    }
    return 1;
}
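To see concretely why pushing image bytes through a String mangles them, here is a minimal, self-contained sketch. The sample bytes are made up for illustration, and gb2312 is assumed as the decoding charset (getContent() actually used the platform default); the point is that bytes invalid in the charset get replaced during decoding and cannot be re-encoded back:

import java.io.ByteArrayOutputStream;
import java.util.Arrays;

public class BinaryRoundTripDemo {
    public static void main(String[] args) throws Exception {
        // 0x89 0x50 0x4E 0x47 is how a PNG file begins; 0x89 and 0xFF are
        // invalid in text encodings such as gb2312 (hypothetical sample bytes)
        byte[] original = {(byte) 0x89, 0x50, 0x4E, 0x47, (byte) 0xFF, 0x00};
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        out.write(original);
        // What getContent() effectively did: decode the raw bytes as text...
        String asText = out.toString("gb2312");
        // ...and what the PrintWriter then wrote back out
        byte[] roundTrip = asText.getBytes("gb2312");
        System.out.println("original length:   " + original.length);
        System.out.println("round-trip length: " + roundTrip.length);
        // prints false: invalid bytes were replaced during decoding, so the
        // image data cannot be reconstructed from the String
        System.out.println("bytes preserved:   " + Arrays.equals(original, roundTrip));
    }
}

In getNewinfo above, the getContent(imageurl)/PrintWriter pair can simply be replaced with a call to saveImage(imageurl). Note also that println appends a line separator, which is one reason the saved file came out larger than the image data.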


Method 2:
First read the page in through a stream, then write a regex to strip out the unwanted content, and finally save the result as an XML file or straight into a database, to be read back when needed. (A sketch of the XML-saving step follows the class below.)

This code only prints the HTML source of the page; if you need to extract content, rewrite the regex in public static String regex() yourself.
 
package rssTest;  
 
import java.io.BufferedReader;  
import java.io.IOException;  
import java.io.InputStreamReader;  
import java.net.HttpURLConnection;  
import java.net.MalformedURLException;  
import java.net.URL;  
import java.net.URLConnection;  
import java.util.ArrayList;  
import java.util.List;  
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
 
public class MyRSS  
{  
    /** 
     * Fetch the HTML source of the search-results page 
     * */ 
    public static String getHtmlSource(String url)  
    {  
        StringBuffer codeBuffer = new StringBuffer();  
        BufferedReader in = null;  
        try 
        {  
            URLConnection uc = new URL(url).openConnection();  
 
            /** 
             * Some sites try to stop non-browser clients from reading their
             * pages directly, so we spoof the User-Agent header of the HTTP
             * request to look like a browser; that is all this line does.
             */ 
            uc.setRequestProperty("User-Agent",  
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");  
 
            // Read the url's content stream  
            in = new BufferedReader(new InputStreamReader(uc  
                    .getInputStream(), "gb2312"));  
            String tempCode = "";  
            // Append each line read from the buffer to the result  
            while ((tempCode = in.readLine()) != null)  
            {  
                codeBuffer.append(tempCode).append("\n");  
            }  
            in.close();  
        }  
        catch (MalformedURLException e)  
        {  
            e.printStackTrace();  
        }  
        catch (IOException e)  
        {  
            e.printStackTrace();  
        }  
          
        return codeBuffer.toString();  
    }  
 
    /** 
     * The regular expression for one Google result entry 
     * */ 
    public static String regex()  
    {  
        String googleRegex = "<div class=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)</a>(.*?)<div class=std>(.*?)<br>";  
        return googleRegex;  
    }  
 
    /** 
     * For testing: search Google for a keyword and extract the
     * fields we want from the results 
     * */ 
    public static List<String> GetNews()  
    {  
        List<String> newsList = new ArrayList<String>();  
        String allHtmlSource = MyRSS  
                .getHtmlSource("http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&client=aff-os-maxthon&hs=SUZ&q=%E8%A7%81%E9%BE%99%E5%8D%B8%E7%94%B2&meta=&aq=f");  
        Pattern pattern = Pattern.compile(regex());  
        Matcher matcher = pattern.matcher(allHtmlSource);  
 
        while (matcher.find())  
        {  
            String urlLink = matcher.group(2);  
            String title = matcher.group(4);  
            title = title.replaceAll("<font color=CC0033>", "");  
            title = title.replaceAll("</font>", "");  
            title = title.replaceAll("<b>...</b>", "");  
 
            String content = matcher.group(6);  
            content = content.replaceAll("<font color=CC0033>", "");  
            content = content.replaceAll("</font>", "");  
            content = content.replaceAll("<b>...</b>", "");  
 
            newsList.add(urlLink);  
            newsList.add(title);  
            newsList.add(content);  
        }  
       return newsList;  
    }  
 
    /** 
     * main method 
     * */ 
    public static void main(String[] args)  
    {  
       System.out  
        .println(MyRSS  
                .getHtmlSource("http://main.house.sina.com.cn/news/zckb/index.html"));  
    }  
}
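The introduction to this method mentions saving the final result as an XML file or into a database, which MyRSS itself never does. Below is a minimal sketch of the XML half, assuming the (url, title, content) triples that GetNews() appends; the class name NewsXmlWriter and the output file name are made up for illustration, and only the JDK's built-in DOM and transform APIs are used:

import java.io.File;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

public class NewsXmlWriter
{
    // newsList holds url, title, content in groups of three,
    // exactly as GetNews() appends them
    public static void save(List<String> newsList, String fileName) throws Exception
    {
        Document doc = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder().newDocument();
        Element root = doc.createElement("news");
        doc.appendChild(root);
        String[] tags = {"url", "title", "content"};
        for (int i = 0; i + 2 < newsList.size(); i += 3)
        {
            Element item = doc.createElement("item");
            for (int j = 0; j < 3; j++)
            {
                Element field = doc.createElement(tags[j]);
                field.setTextContent(newsList.get(i + j));
                item.appendChild(field);
            }
            root.appendChild(item);
        }
        // Serialize the DOM tree to the output file
        TransformerFactory.newInstance().newTransformer()
                .transform(new DOMSource(doc), new StreamResult(new File(fileName)));
    }
}

Usage would be something like NewsXmlWriter.save(MyRSS.GetNews(), "news.xml") after a successful fetch.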

Method 3:
Automatically scraping news (for use from JSP)

package com.news.spider;

import java.io.File;
import java.io.FileFilter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.db.DBAccess;


public class SpiderNewsServer {
public static void main(String[] args) throws Exception{


    // Entry page to start scraping from
    String endPointUrl = "http://cn.china.cn/zixun/";
    // Current date
    Calendar calendar = Calendar.getInstance();
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
    String DateNews = sdf.format(calendar.getTime());

    /********************
     * Collect second-level URLs - start
     * URL pattern to match: "http://cn.china.cn/article/"
     */
    List<String> listNewsType = new ArrayList<String>();
    // Fetch the entry page HTML
    WebHtml webHtml = new WebHtml();
    String htmlDocuemtnt1 = webHtml.getWebHtml(endPointUrl);
    if(htmlDocuemtnt1 == null || htmlDocuemtnt1.length() == 0){
        return;
    }
    String strTemp1 = "http://cn.china.cn/article/";
    String strTemp2 = "</li>";
    int stopIndex = 0;
    int startIndex = 0;
    int dd = 0;
    while(true){
        dd++;
        startIndex = htmlDocuemtnt1.indexOf(strTemp1, stopIndex);
        System.out.println("==========" + startIndex);
        stopIndex = htmlDocuemtnt1.indexOf(strTemp2, startIndex);
        System.out.println("==========---------" + stopIndex);
        if(startIndex != -1 && stopIndex != -1){
            String companyType = htmlDocuemtnt1.substring(startIndex, stopIndex);
            System.out.println("@@@@@--------" + companyType);
            System.out.println("@@@@@--------" + companyType.indexOf("\""));
            // Cut the URL off at the closing quote of the href attribute
            companyType = companyType.substring(0, companyType.indexOf("\""));
            System.out.println("#####--------" + companyType);
            listNewsType.add(companyType);
        }
        if(dd > 10){
            break;
        }
        if(stopIndex == -1 || startIndex == -1){
            break;
        }
    }
    System.out.println("listCompanyType=====" + listNewsType.size());
    /**
     * Collect second-level URLs - end
     ********************/
 
 
    /********************
     * Scrape page content - start
     */
    String title = "";
    String hometext = "";
    String bodytext = "";
    String keywords = "";
    String counter = "221";
    String cdate = "";

    int begainIndex = 0;  // start index of the searched-for string
    int endIndex = 0;     // end index of the searched-for string
    String begainStr;     // string marking the start of the search
    String endStr;        // string marking the end of the search
 
    // note: starts at 1, skipping the first collected URL
    for (int rows = 1; rows < listNewsType.size(); rows++) {
        String strNewsDetail = listNewsType.get(rows);
        System.out.println("strNewsDetail=====" + strNewsDetail);
        if(strNewsDetail != null && strNewsDetail.length() > 0){
            WebHtml newsListHtml = new WebHtml();
            String htmlDocuemtntCom = newsListHtml.getWebHtml(strNewsDetail);
            System.out.println("$$$$$------" + htmlDocuemtntCom);

            if(htmlDocuemtntCom == null || htmlDocuemtntCom.length() == 0){
                return;
            }
            // Extract the publication time ("<div>时间:" is the marker used on the page)
            int dateBegainIndex = htmlDocuemtntCom.indexOf("<div>时间:");
            System.out.println("%%%%%--" + dateBegainIndex);
            if(dateBegainIndex == -1){
                continue;   // marker not found; skip to avoid an index error
            }
            String newTime = htmlDocuemtntCom.substring(dateBegainIndex, dateBegainIndex + 20);
            System.out.println("^^^^^^^^^^^^^^^---" + newTime);
            String newTimeM = newTime.substring(newTime.lastIndexOf("-") + 1, newTime.lastIndexOf("-") + 3);
            String dateM = DateNews.substring(DateNews.lastIndexOf("-") + 1);
            System.out.println("^^^^^^^^^^^^^^^---" + newTimeM);
            System.out.println("^^^^^^^^^^^^^^^---" + dateM);
            // Only keep items published today (equals() is the correct string comparison)
            if(newTimeM.equals(dateM)){
                // Extract the news title
                begainStr = "<div class=\"divCon bg008 \">";
                endStr = "<div>时间:";

                begainIndex = htmlDocuemtntCom.indexOf(begainStr, 0);
                System.out.println("&&&&&&------" + begainIndex);
                endIndex = htmlDocuemtntCom.indexOf(endStr, 0);
                System.out.println("&&&&&&------" + endIndex);
                if(begainIndex != -1 && endIndex != -1){
                    title = htmlDocuemtntCom.substring(begainIndex, endIndex).trim();
                    title = title.substring(title.indexOf("<h1>") + 4, title.indexOf("</h1>"));
                    title = title.replace("'", "");
                    title = title.replace(";", "");
                    title = title.replace(" ", "");
                }

                // Extract the news body
                begainStr = "<div class=\"divCon bg008 \">";
                endStr = "<!-- page begin -->";
                begainIndex = htmlDocuemtntCom.indexOf(begainStr, 0);
                endIndex = htmlDocuemtntCom.indexOf(endStr, 0);
                if(begainIndex != -1 && endIndex != -1){
                    bodytext = htmlDocuemtntCom.substring(begainIndex, endIndex).trim();
                    if(bodytext.indexOf("<p>") > 0 && bodytext.indexOf("</p>") > bodytext.indexOf("<p>") && bodytext.indexOf("</p>") > 0)
                        bodytext = bodytext.substring(bodytext.indexOf("<p>") + 3, bodytext.indexOf("</p>"));
                    bodytext = bodytext.replace("&nbsp;", "");
                    bodytext = bodytext.replace("<br>", "");
                    bodytext = bodytext.replace("\n", "<br>");
                    bodytext = bodytext.replace("'", "");
                    bodytext = bodytext.replace(";", "");
                }
                // Summary: the first 40 characters of the body
                if(bodytext.length() > 40)
                    hometext = bodytext.substring(0, 40) + "......";
                else{
                    hometext = bodytext + "......";
                }
                // Fake a view count from a random number
                String str = String.valueOf(Math.random());
                counter = str.substring(str.lastIndexOf(".") + 1, 5);

                Calendar cal = Calendar.getInstance();
                cal.setTime(new Date());
                cdate = cal.getTimeInMillis() + "";
                cdate = cdate.substring(0, 10);
            }else{
                continue;
            }
        }
    System.out.println("-------------------------"+title);
    System.out.println("-------------------------"+cdate);
    System.out.println("-------------------------"+cdate);
    System.out.println("-------------------------"+hometext);
    System.out.println("-------------------------"+bodytext);
    System.out.println("-------------------------"+keywords);
    System.out.println("-------------------------"+counter);
    /*String str = "INSERT INTO ecim_stories(uid,title,created,published,hostname,hometext,bodytext,keywords,counter,topicid,ihome,notifypub,story_type,topicdisplay,topicalign,comments,rating,votes,description) ";
    str += "VALUE (1,'"+title+"',"+cdate+","+cdate+",'125.122.83.177','"+hometext+"','"+bodytext+"','"+keywords+"',"+counter+",1,0,1,'admin',0,'R',0,0,0,'')";
    DBAccess db = new DBAccess();;
    if(db.executeUpdate(str)>0) {
     System.out.println("-------------------------成功!!!!!!!!!!");
    }else {
     System.out.println("-------------------------失败!!!!!!!!!!");
    }*/
   }
 
    /**
     * Scrape page content - end
     ********************/
}
}
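The commented-out INSERT above splices scraped text straight into the SQL string, which is why the code strips quotes and semicolons from title and bodytext beforehand. A hedged alternative sketch using a JDBC PreparedStatement, which lets the driver escape parameters itself; the JDBC URL and credentials are placeholders, and nothing is assumed about what the post's own DBAccess class exposes:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class StoryDao {
    // jdbcUrl, user and password are placeholders for whatever the
    // DBAccess class is actually configured with (hypothetical)
    public static int insertStory(String jdbcUrl, String user, String password,
            String title, String cdate, String hometext, String bodytext,
            String keywords, String counter) throws Exception {
        String sql = "INSERT INTO ecim_stories"
                + " (uid,title,created,published,hostname,hometext,bodytext,keywords,counter)"
                + " VALUES (1,?,?,?,'125.122.83.177',?,?,?,?)";
        Connection conn = DriverManager.getConnection(jdbcUrl, user, password);
        try {
            PreparedStatement ps = conn.prepareStatement(sql);
            ps.setString(1, title);
            ps.setLong(2, Long.parseLong(cdate));   // epoch seconds, as above
            ps.setLong(3, Long.parseLong(cdate));
            ps.setString(4, hometext);
            ps.setString(5, bodytext);
            ps.setString(6, keywords);
            ps.setInt(7, Integer.parseInt(counter));
            // The driver escapes each parameter, so the manual
            // replace("'", "") calls in the scraper become unnecessary
            return ps.executeUpdate();
        } finally {
            conn.close();
        }
    }
}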

package com.news.spider;

import java.net.URL;
import java.net.URLConnection;
import java.io.BufferedReader;
import java.io.InputStreamReader;


public class WebHtml {

/**
 * Fetch the HTML content for the given url
 * @param url
 */
public String getWebHtml(String url){
    try {
        URL myURL = new URL(url);
        URLConnection conn = myURL.openConnection();
        BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
        String line = null;
        StringBuffer document = new StringBuffer("");
        while ((line = reader.readLine()) != null){
            document.append(line + "\n");
        }
        reader.close();

        String resutlDocument = new String(document);
        return resutlDocument;

    } catch (Exception e) {
        e.printStackTrace();   // do not swallow errors silently
    }
    return "";
}


}
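getWebHtml above decodes the stream with the platform default charset and sends no User-Agent header, unlike the getContent methods in method 1. Here is a small variant sketch with both made explicit; passing "gb2312" for the Chinese pages targeted elsewhere in this post is an assumption, not something verified per page:

// Overload with an explicit charset and the same browser spoof as earlier
public String getWebHtml(String url, String charset){
    try {
        URLConnection conn = new URL(url).openConnection();
        conn.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), charset));
        StringBuffer document = new StringBuffer();
        String line;
        while ((line = reader.readLine()) != null){
            document.append(line).append("\n");
        }
        reader.close();
        return document.toString();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return "";
}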


Source: [Gjava Talent]
URL: http://www.gjrencai.com
Please credit the source and URL when republishing.