简单的java爬虫程序

最新推荐文章于 2021-12-21 15:15:49 发布
小黑oba
最新推荐文章于 2021-12-21 15:15:49 发布
阅读量989
点赞数
分类专栏： java
java 专栏收录该内容
27 篇文章 0 订阅
订阅专栏
http://blog.csdn.net/new2008060110/article/details/4737708

 

一个简单的java网络爬虫,由于时间原因,没有进一步解释.

所需的 htmlparser.jar 包请到 HTMLParser 官方网站下载。

 

近来公司要做爬虫，但我不会用现成的框架，于是参照网上用 Python 写的爬虫原理，用 Java 实现了一个简单的程序，用到了 HTMLParser 包。

package my.url.test;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * A minimal one-level web crawler built on the HTMLParser library.
 *
 * <p>For every start URL it creates a matching directory tree under {@code c:/},
 * opens an HTTP connection, extracts the {@code <a class="TDcenter">} links of the
 * first page it connects to, and for each linked page writes the selected HTML
 * fragments to {@code c:/<host><path>.txt} and downloads the relative images
 * referenced by that text file.
 *
 * <p>NOTE(review): {@code dirName}, {@code depth}, {@code number} and
 * {@code urlConnectin} are static and are overwritten by every new instance and
 * every {@link #connect()} call. Because {@link #analyzeHTML} recurses into
 * {@link #init}, the values seen by the outer {@code init} iteration after
 * {@code connect()} returns are those of the last linked page, not of the start
 * page. The class is not safe for concurrent use.
 */
public class MyUrl {

    /** The page this instance was created for. */
    private URL url;
    /** Target file of the most recent {@link #readAndWrite} call. */
    private File file;
    /** Host plus path of the most recently constructed instance; relative to {@code c:/}. */
    private static String dirName;
    /** 0 until the first link extraction has started, 1 afterwards. */
    private static int depth = 0;
    /** Number of files written by {@link #readAndWrite}. */
    private static int number = 0;
    /** Connection opened by the most recent {@link #connect()} call. */
    private static HttpURLConnection urlConnectin;

    /**
     * Crawls the hard-coded start URL and prints the stack trace of any failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            String[] urlNames = new String[]{
                    "http://www.ebiotrade.com/newsf/",
            };
            init(urlNames);
        } catch (Exception e) {
            // Every failure type was handled identically, so a single handler suffices.
            e.printStackTrace();
        }
    }

    /**
     * Processes each URL in order: creates its directory, connects, and converts
     * the page to text.
     *
     * @param urlNames absolute URLs to process
     * @throws Exception if a URL is malformed, or connecting, parsing or writing fails
     */
    public static void init(String[] urlNames) throws Exception {
        for (int i = 0; i < urlNames.length; i++) {
            MyUrl myUrl = new MyUrl(urlNames[i]);
            myUrl.connect();
            // Uses the static connection and directory name, which may have been
            // replaced by nested init() calls made from connect().
            myUrl.html2txt(urlConnectin, dirName);
        }
    }

    /**
     * Parses the URL, derives the static directory name from its host and path,
     * and creates that directory tree under {@code c:/}.
     *
     * @param urlString absolute URL of the page
     * @throws URISyntaxException if the URL cannot be converted to a URI
     * @throws ParserException declared for callers; not thrown by the visible code
     * @throws IOException if the URL string is malformed
     */
    public MyUrl(String urlString) throws URISyntaxException, ParserException, IOException {
        url = new URL(urlString);
        if (url.getPath().equals("") || url.getPath().equals("/")) {
            dirName = url.toURI().getHost();
        } else {
            dirName = url.toURI().getHost() + "" + url.toURI().getPath();
        }

        CreateDir(dirName);
    }

    /**
     * Creates, one level at a time, the directories named by the segments of
     * {@code dirName} under {@code c:/}.
     *
     * <p>When the name splits into exactly four segments the last one is not
     * created (presumably because it is a file name; confirm against the site
     * layout).
     *
     * @param dirName slash-separated path relative to {@code c:/}
     * @throws URISyntaxException declared for callers; not thrown by the visible code
     */
    private void CreateDir(String dirName) throws URISyntaxException {
        String[] dirNames = dirName.split("/");
        int limit = (dirNames.length == 4) ? dirNames.length - 1 : dirNames.length;
        StringBuilder relative = new StringBuilder();
        for (int i = 0; i < limit; i++) {
            relative.append(dirNames[i]).append("/");
            File dir = new File("c:/" + relative);
            if (!dir.exists()) {
                System.out.println("创建文件夹:" + dirNames[i]);
                // Report a failed mkdir instead of ignoring it; later file writes
                // would otherwise fail with a less obvious error.
                if (!dir.mkdir()) {
                    System.err.println("Failed to create directory: " + dir.getPath());
                }
            }
        }
    }

    /**
     * Opens the HTTP connection for {@link #url}, sets browser-like request
     * headers, and, on the first call only, extracts and processes the page's links.
     *
     * @throws Exception if the connection cannot be opened or link processing fails
     */
    private void connect() throws Exception {
        urlConnectin = (HttpURLConnection) url.openConnection();
        urlConnectin.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");
        urlConnectin.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-powerpoint, application/vnd.ms-excel, application/msword, */*");
        urlConnectin.setRequestProperty("Accept-Language", "zh-cn");
        urlConnectin.setRequestProperty("UA-CPU", "x86");
        //urlConnectin.setRequestProperty("Accept-Encoding", "gzip");// why is there no deflate?
        urlConnectin.setRequestProperty("Content-type", "text/html");
        urlConnectin.setRequestProperty("Connection", "close");
        // Caching is disabled: the program does not request the same link repeatedly.
        urlConnectin.setUseCaches(false);
        // NOTE(review): timeouts are disabled, so a stalled server blocks the crawl indefinitely.
        //urlConnectin.setConnectTimeout(10 * 1000);
        //urlConnectin.setReadTimeout(10 * 1000);
        urlConnectin.setDoOutput(true);
        urlConnectin.setDoInput(true);
        if (depth == 0) {
            analyzeHTML(urlConnectin, "gb2312");
        }
    }

    /**
     * Copies the response body of the connection, line by line, into a file under
     * {@code c:/}, unless that file already exists.
     *
     * <p>The target is {@code c:/<dirName>/index.html} when the URL has an empty
     * path and {@code c:/<dirName>} otherwise. Reader and writer use the platform
     * default charset. Both are now closed even when copying fails.
     *
     * @param urlConnectin connection to read the page from
     * @param fileName unused
     * @throws IOException if reading the response or writing the file fails
     * @throws ParserException declared for callers; not thrown by the visible code
     * @throws URISyntaxException declared for callers; not thrown by the visible code
     */
    private void readAndWrite(HttpURLConnection urlConnectin, String fileName) throws IOException, ParserException, URISyntaxException {
        if (url.getPath().equals("")) {
            file = new File("c:/" + dirName + "/index.html");
        } else {
            file = new File("c:/" + dirName);
        }
        if (!file.exists()) {
            number++;
            System.out.println("下载:" + number);
            System.out.println(file.getPath());
            file.createNewFile();
            BufferedWriter bw = new BufferedWriter(new FileWriter(file));
            try {
                BufferedReader in = new BufferedReader(new InputStreamReader(urlConnectin.getInputStream()));
                try {
                    String inputLine;
                    while ((inputLine = in.readLine()) != null) {
                        bw.write(inputLine);
                        bw.newLine();
                    }
                } finally {
                    in.close();
                }
            } finally {
                // close() flushes whatever has been buffered so far.
                bw.close();
            }
        }
    }

    /**
     * Extracts every {@code <a class="TDcenter">} link from the page and runs
     * {@link #init} on each one. Sets {@code depth} to 1 first, so pages reached
     * through these links do not extract links themselves.
     *
     * @param urlConnectin connection to the page to parse
     * @param encode character encoding used to parse the page
     * @throws Exception if parsing fails or processing a linked page fails
     */
    private void analyzeHTML(HttpURLConnection urlConnectin, String encode) throws Exception {
        depth = 1;
        Parser parser = new Parser(urlConnectin);
        parser.setEncoding(encode);
        NodeFilter filter = new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("class", "TDcenter"));
        NodeList nodeList = parser.parse(filter);
        NodeIterator it = nodeList.elements();
        while (it.hasMoreNodes()) {
            Node node = it.nextNode();
            init(new String[]{((LinkTag) node).getLink()});
        }
    }

    /**
     * Downloads every {@code <img>} whose URL does not start with {@code "http"}.
     *
     * <p>The image URL, with any {@code "file:"} text removed, is appended to the
     * host of {@link #url}; the result is used both as the remote address (after
     * {@code http://}) and as the local path (after {@code c:/}).
     *
     * @param urlConnectin connection to the document to scan for images
     * @param encode character encoding used to parse the document
     * @throws Exception if parsing or a download fails
     */
    private void downloadImageAnalyzeHtml(URLConnection urlConnectin, String encode) throws Exception {
        Parser parser = new Parser(urlConnectin);
        parser.setEncoding(encode);
        NodeFilter filter = new TagNameFilter("img");
        NodeList nodeList = parser.parse(filter);
        NodeIterator it = nodeList.elements();
        while (it.hasMoreNodes()) {
            Node node = it.nextNode();
            String imageUrl = ((ImageTag) node).getImageURL();
            if (!imageUrl.startsWith("http")) {
                String tempDirectory = url.getHost() + imageUrl.replace("file:", "");
                CreateDir(tempDirectory);
                URLConnectionDownloader.download("http://" + tempDirectory, "c:/" + tempDirectory);
            }
        }
    }

    /**
     * Writes the HTML of selected nodes of the page to {@code c:/<dirName>.txt},
     * one node per line, then downloads the images referenced by that file.
     * Does nothing when the text file already exists.
     *
     * <p>Selected nodes are {@code <p>} tags, any tag with
     * {@code class="MsoNormal"}, and {@code <span class="newsf">} tags. The writer
     * is now closed even when writing a node fails.
     *
     * @param urlConnectin connection to the page to convert
     * @param dirName path, relative to {@code c:/}, of the text file without its extension
     * @throws Exception if parsing, writing or an image download fails
     */
    private void html2txt(HttpURLConnection urlConnectin, String dirName) throws Exception {
        File dirTxt = new File("c:/" + dirName + ".txt");
        if (!dirTxt.exists()) {
            Parser parser = new Parser(urlConnectin);
            parser.setEncoding("gb2312");

            NodeFilter filter = new OrFilter(new NodeFilter[]{
                    new TagNameFilter("p"),
                    new HasAttributeFilter("class", "MsoNormal"),
                    new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "newsf"))});
            NodeList nodeList = parser.parse(filter);
            NodeIterator it = nodeList.elements();
            // Opening the writer creates the file, even when no node matches.
            BufferedWriter bw = new BufferedWriter(new FileWriter(dirTxt));
            try {
                if (it.hasMoreNodes()) {
                    System.out.println("创建文件：" + dirTxt);
                }
                while (it.hasMoreNodes()) {
                    Node node = it.nextNode();
                    bw.write(node.toHtml());
                    bw.newLine();
                }
            } finally {
                bw.close();
            }
            downloadImageAnalyzeHtml((dirTxt.toURI().toURL().openConnection()), "gb2312");
        }
    }

}

//以下类来源网络

import java.io.FileOutputStream;  
import java.io.InputStream;  
import java.io.OutputStream;  
import java.net.URL;  
import java.net.URLConnection;  
/**
 * Downloads a file or image through a {@link URLConnection} and saves it locally.
 *
 * @author 老紫竹(laozizhu.com)
 */
public class URLConnectionDownloader {

    /**
     * Downloads a sample image into the working directory.
     *
     * @param args ignored
     * @throws Exception if the download fails
     */
    public static void main(String[] args) throws Exception {
        download("http://www.cnbeta.com/images/cnlogo.gif", "cnlogo.gif");
    }

    /**
     * Downloads the resource at {@code urlString} to a local file.
     *
     * <p>The source is opened before the target, so no local file is created
     * when the source cannot be read. Both streams are closed even when the
     * copy fails part-way.
     *
     * @param urlString address of the resource to download
     * @param filename path of the local file to write; an existing file is overwritten
     * @throws Exception if the URL is malformed, the resource cannot be opened,
     *                   or reading or writing fails
     */
    public static void download(String urlString, String filename) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        InputStream is = con.getInputStream();
        try {
            OutputStream os = new FileOutputStream(filename);
            try {
                byte[] bs = new byte[1024];
                int len;
                while ((len = is.read(bs)) != -1) {
                    os.write(bs, 0, len);
                }
            } finally {
                os.close();
            }
        } finally {
            is.close();
        }
    }
}