Java爬虫爬取网易汽车车型库

本文地址:http://blog.csdn.net/shanglianlm/article/details/70188385

最近由于工作需要,写了一个小的爬虫,主要用于爬取网易汽车车型库(http://product.auto.163.com/)上不同品牌/车标(共175个车标)下不同车系(共1650个系列)的图片(每个车系各八张)。
这里写图片描述

代码下载

代码如下:
共CarBrand.java,CarCrawer.java,CarCrawerDemo.java三个文件。

实体
CarBrand.java

package com.mingo.crawer;

import java.util.ArrayList;

/**
 * Mutable data holder passed between the crawler stages.
 *
 * <p>One instance is used for a brand entry, a series entry or a picture entry,
 * depending on which stage created it; only the fields relevant to that stage
 * are populated and the rest stay {@code null}.
 *
 * <p>Field prefixes as used by the crawler: {@code pp} = brand, {@code cx} =
 * series, {@code cxTp} = one picture page of a series.
 */
public class CarBrand {

    // Brand name and the absolute URL of the brand page.
    private String ppName;
    private String ppUrl;
    // NOTE(review): not referenced by the crawler code shown here; presumably a list of brands.
    private ArrayList<CarBrand> ppList;

    // Series name and the absolute URL of the series photo page.
    private String cxName;
    private String cxUrl;
    // NOTE(review): not referenced by the crawler code shown here; presumably the series of a brand.
    private ArrayList<CarBrand> cxList;

    // Picture label (viewing angle) and the URL of the photo-view page for that picture.
    private String cxTpName;
    private String cxTpUrl;
    // NOTE(review): not referenced by the crawler code shown here; presumably the pictures of a series.
    private ArrayList<CarBrand> cxTpList;

    // NOTE(review): not referenced by the crawler code shown here; presumably the local
    // image file name and the URL of the full-size image.
    private String tpName;
    private String tpNameUrl;

    /** @return the brand name, or {@code null} if not set */
    public String getPpName() {
        return ppName;
    }

    /** @param ppName the brand name */
    public void setPpName(String ppName) {
        this.ppName = ppName;
    }

    /** @return the brand page URL, or {@code null} if not set */
    public String getPpUrl() {
        return ppUrl;
    }

    /** @param ppUrl the brand page URL */
    public void setPpUrl(String ppUrl) {
        this.ppUrl = ppUrl;
    }

    /** @return the brand list, or {@code null} if not set */
    public ArrayList<CarBrand> getPpList() {
        return ppList;
    }

    /** @param ppList the brand list */
    public void setPpList(ArrayList<CarBrand> ppList) {
        this.ppList = ppList;
    }

    /** @return the series name, or {@code null} if not set */
    public String getCxName() {
        return cxName;
    }

    /** @param cxName the series name */
    public void setCxName(String cxName) {
        this.cxName = cxName;
    }

    /** @return the series photo page URL, or {@code null} if not set */
    public String getCxUrl() {
        return cxUrl;
    }

    /** @param cxUrl the series photo page URL */
    public void setCxUrl(String cxUrl) {
        this.cxUrl = cxUrl;
    }

    /** @return the series list, or {@code null} if not set */
    public ArrayList<CarBrand> getCxList() {
        return cxList;
    }

    /** @param cxList the series list */
    public void setCxList(ArrayList<CarBrand> cxList) {
        this.cxList = cxList;
    }

    /** @return the picture label, or {@code null} if not set */
    public String getCxTpName() {
        return cxTpName;
    }

    /** @param cxTpName the picture label */
    public void setCxTpName(String cxTpName) {
        this.cxTpName = cxTpName;
    }

    /** @return the photo-view page URL, or {@code null} if not set */
    public String getCxTpUrl() {
        return cxTpUrl;
    }

    /** @param cxTpUrl the photo-view page URL */
    public void setCxTpUrl(String cxTpUrl) {
        this.cxTpUrl = cxTpUrl;
    }

    /** @return the picture list, or {@code null} if not set */
    public ArrayList<CarBrand> getCxTpList() {
        return cxTpList;
    }

    /** @param cxTpList the picture list */
    public void setCxTpList(ArrayList<CarBrand> cxTpList) {
        this.cxTpList = cxTpList;
    }

    /** @return the value of {@code tpName}, or {@code null} if not set */
    public String getTpName() {
        return tpName;
    }

    /** @param tpName the new value of {@code tpName} */
    public void setTpName(String tpName) {
        this.tpName = tpName;
    }

    /** @return the value of {@code tpNameUrl}, or {@code null} if not set */
    public String getTpNameUrl() {
        return tpNameUrl;
    }

    /** @param tpNameUrl the new value of {@code tpNameUrl} */
    public void setTpNameUrl(String tpNameUrl) {
        this.tpNameUrl = tpNameUrl;
    }

}

具体实现
CarCrawer.java

package com.mingo.crawer;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that scrape the NetEase auto product library: fetching pages,
 * extracting brand / series / picture links with regular expressions, and
 * saving the images and a text log to disk.
 */
public class CarCrawer {

    public static String carUrl = "http://product.auto.163.com";

    /** Connect timeout (ms); same value the original download code used. */
    private static final int CONNECT_TIMEOUT_MS = 5 * 1000;
    /** Read timeout (ms) so that a stalled server cannot block the crawl forever. */
    private static final int READ_TIMEOUT_MS = 30 * 1000;

    /** Labels assigned, in order, to the first eight pictures found for a series. */
    private static final String[] PHOTO_ANGLE_NAMES = {
        "左前", "正前", "正侧", "左后", "正后", "车顶", "前大灯局部", "后大灯局部"
    };

    /** Markers that delimit one brand group inside a brand page. */
    private static final String GROUP_START = "class=\"group\">";
    private static final String GROUP_END = "<div class=\"gbox gbox2\" >";

    // Patterns are compiled once; group(1) is the part the callers actually want.
    private static final Pattern BRAND_NAME_PATTERN =
            Pattern.compile("title=\"进入(.{1,20})品牌频道");
    private static final Pattern BRAND_URL_PATTERN =
            Pattern.compile("<a href='(/brand/[a-z]/.{1,20})' title");
    private static final Pattern GROUP_TITLE_PATTERN =
            Pattern.compile("频道\">进入(.{1,20})品牌频道</a>]</span>");
    private static final Pattern SERIES_NAME_PATTERN =
            Pattern.compile("\"查看(.{1,20})图片\">");
    private static final Pattern SERIES_URL_PATTERN =
            Pattern.compile("(/series/photo/.{10,20})\"");
    private static final Pattern PHOTO_PAGE_PATTERN =
            Pattern.compile("http://product.auto.163.com/picture/photoview.{30,40}.html");
    private static final Pattern BIG_PIC_PATTERN =
            Pattern.compile("<img class=\"main_photo hidden\" data-src=\"(.{60,70}.jpg)");

    /**
     * Fetches a page with an HTTP GET and returns its text decoded as GB2312.
     *
     * <p>Lines are concatenated without line separators, so the patterns used
     * by this class can match across what were line breaks in the page.
     * Failures are reported on standard output and are not rethrown; whatever
     * was read before the failure (possibly the empty string) is returned.
     *
     * @param url address of the page to fetch
     * @return the page text, never {@code null}
     */
    public static String SendGet(String url) {
        // StringBuilder avoids the quadratic cost of repeated String concatenation.
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            connection.connect();
            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "GB2312"));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return result.toString();
    }

    /**
     * Downloads a file to the local disk.
     *
     * @param urlString address of the file to download
     * @param filename  name of the local file
     * @param savePath  directory to store the file in; created if missing
     * @throws Exception if the connection, the read or the write fails
     */
    public static void download(String urlString, String filename, String savePath) throws Exception {
        URLConnection con = new URL(urlString).openConnection();
        con.setConnectTimeout(CONNECT_TIMEOUT_MS);
        con.setReadTimeout(READ_TIMEOUT_MS);

        InputStream is = con.getInputStream();
        try {
            File dir = new File(savePath);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            // File(parent, child) inserts the platform separator instead of a hard-coded "\\".
            OutputStream os = new FileOutputStream(new File(dir, filename));
            try {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = is.read(buffer)) != -1) {
                    os.write(buffer, 0, len);
                }
            } finally {
                os.close();
            }
        } finally {
            // Both streams are released even when the copy fails half-way.
            closeQuietly(is);
        }
    }

    /**
     * Appends text to a file, creating the file and its parent directory if needed.
     *
     * @param content     text to append
     * @param txtfilename path of the text file
     * @throws Exception if the file cannot be opened or written
     */
    public static void writeTxtFile(String content, String txtfilename) throws Exception {
        // The log may be written before any download has created the output directory.
        File parent = new File(txtfilename).getAbsoluteFile().getParentFile();
        if (parent != null && !parent.exists()) {
            parent.mkdirs();
        }
        FileWriter writer = new FileWriter(txtfilename, true);
        try {
            writer.write(content);
        } finally {
            writer.close();
        }
    }

    /**
     * Returns the entries of {@code list} with distinct series names, keeping
     * the first occurrence of each name and the original order. {@code null}
     * entries and entries without a series name are dropped.
     *
     * @param list entries to filter; {@code null} is treated as empty
     * @return a new list without duplicates
     */
    public static ArrayList<CarBrand> removeDuplicate(ArrayList<CarBrand> list) {
        ArrayList<CarBrand> unique = new ArrayList<CarBrand>();
        if (list == null) {
            return unique;
        }
        Set<String> seenNames = new HashSet<String>();
        for (CarBrand car : list) {
            if (car == null) {
                continue;
            }
            String name = car.getCxName();
            // Set.add returns false when the name was already present.
            if (name != null && seenNames.add(name)) {
                unique.add(car);
            }
        }
        return unique;
    }

    /**
     * Extracts brand names and brand page URLs from a brand index page.
     *
     * <p>Names and URLs are matched independently and paired by order of
     * appearance; extraction stops as soon as either runs out.
     *
     * @param url page to scan, e.g. http://product.auto.163.com/brand/a/
     * @return one entry per brand with {@code ppName} and {@code ppUrl} set
     * @throws Exception declared for callers; fetch failures yield an empty list
     */
    public static ArrayList<CarBrand> getPpUrl(String url) throws Exception {
        ArrayList<CarBrand> ppList = new ArrayList<CarBrand>();

        String content = SendGet(url);
        Matcher matcherName = BRAND_NAME_PATTERN.matcher(content);
        Matcher matcherUrl = BRAND_URL_PATTERN.matcher(content);

        while (matcherName.find() && matcherUrl.find()) {
            CarBrand carBrand = new CarBrand();
            carBrand.setPpName(matcherName.group(1));
            carBrand.setPpUrl(carUrl + matcherUrl.group(1));
            ppList.add(carBrand);
        }
        return ppList;
    }

    /**
     * Extracts the series (name and photo page URL) of every brand group found
     * on a brand page.
     *
     * <p>A group is the text from a {@code class="group">} marker up to the next
     * {@code <div class="gbox gbox2" >} marker, or up to the end of the page
     * when the closing marker is missing.
     *
     * @param url page to scan, e.g. http://product.auto.163.com/brand/a/
     * @return one entry per series with {@code ppName}, {@code cxName} and
     *         {@code cxUrl} set; {@code ppName} is {@code null} when the group
     *         has no recognizable title
     * @throws Exception declared for callers; fetch failures yield an empty list
     */
    public static ArrayList<CarBrand> getCxUrl(String url) throws Exception {
        ArrayList<CarBrand> cxPicList = new ArrayList<CarBrand>();
        String content = SendGet(url);

        int from = 0;
        while (true) {
            int start = content.indexOf(GROUP_START, from);
            if (start < 0) {
                break;
            }
            // Look for the end marker after the start marker so the slice is always valid.
            int end = content.indexOf(GROUP_END, start);
            boolean lastGroup = end < 0;
            String subContent = lastGroup
                    ? content.substring(start)
                    : content.substring(start, end);

            collectSeries(subContent, cxPicList);

            if (lastGroup) {
                break;
            }
            from = end + 10;
        }
        return cxPicList;
    }

    /** Appends every series found in one brand group to {@code target}. */
    private static void collectSeries(String groupHtml, ArrayList<CarBrand> target) {
        String brandTitle = null;
        Matcher matcherTitle = GROUP_TITLE_PATTERN.matcher(groupHtml);
        if (matcherTitle.find()) {
            brandTitle = matcherTitle.group(1);
        }

        // Names and URLs are paired by order of appearance inside the group.
        Matcher matcherName = SERIES_NAME_PATTERN.matcher(groupHtml);
        Matcher matcherUrl = SERIES_URL_PATTERN.matcher(groupHtml);
        while (matcherName.find() && matcherUrl.find()) {
            CarBrand carBrand = new CarBrand();
            carBrand.setPpName(brandTitle);
            carBrand.setCxName(matcherName.group(1));
            carBrand.setCxUrl(carUrl + matcherUrl.group(1));
            target.add(carBrand);
        }
    }

    /**
     * Extracts up to eight photo-view page links from a series photo page and
     * labels them, in order of appearance, with fixed viewing-angle names.
     *
     * @param url page to scan, e.g. http://product.auto.163.com/series/photo/2350.html#CX001
     * @return at most eight entries with {@code cxTpName} and {@code cxTpUrl} set
     * @throws Exception declared for callers; fetch failures yield an empty list
     */
    public static ArrayList<CarBrand> getCxPic(String url) throws Exception {
        ArrayList<CarBrand> cxPicList = new ArrayList<CarBrand>();

        String content = SendGet(url);
        Matcher matcher = PHOTO_PAGE_PATTERN.matcher(content);
        int index = 0;
        while (index < PHOTO_ANGLE_NAMES.length && matcher.find()) {
            CarBrand carBrand = new CarBrand();
            carBrand.setCxTpName(PHOTO_ANGLE_NAMES[index]);
            carBrand.setCxTpUrl(matcher.group(0));
            cxPicList.add(carBrand);
            index++;
        }
        return cxPicList;
    }

    /**
     * Finds the full-size image URL on a photo-view page.
     *
     * @param url address of the photo-view page
     * @return the image URL, or {@code null} when the page contains no match
     * @throws Exception declared for callers; fetch failures yield {@code null}
     */
    public static String getBigPic(String url) throws Exception {
        String content = SendGet(url);
        Matcher matcher = BIG_PIC_PATTERN.matcher(content);
        return matcher.find() ? matcher.group(1) : null;
    }

    /** Closes a resource, logging instead of propagating any failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

调用
CarCrawerDemo.java

package com.mingo.crawer;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Entry point: crawls every brand and series of the NetEase auto product
 * library, logs each picture to a text file and downloads the images.
 */
public class CarCrawerDemo {

    public static String carUrl = "http://product.auto.163.com";

    /**
     * Runs the crawl.
     *
     * @param args unused
     * @throws Exception if a page list cannot be built or the log cannot be written;
     *                   failures of individual image downloads are reported and skipped
     */
    public static void main(String[] args) throws Exception {
        // Output directory; files are named <brand>_<series>_<angle>.jpg
        String savePath = "D:\\CarTp\\";
        // Text log listing every picture name and its image URL
        String txtfilename = savePath + "output.txt";

        // The log is written before the first download, so the directory must exist up front.
        new java.io.File(savePath).mkdirs();

        String url = "http://product.auto.163.com/brand/";
        System.out.println(url);
        ArrayList<CarBrand> pplist = CarCrawer.getPpUrl(url);
        System.out.println(pplist.size());

        // Brand URLs are de-duplicated by their ".../brand/<letter>/" prefix, so only
        // the first brand URL of each letter is crawled for series.
        int prefixLength = "http://product.auto.163.com/brand/a/".length();
        ArrayList<CarBrand> cxUrllistNew = new ArrayList<CarBrand>();
        Set<String> ppUrlSet = new HashSet<String>();
        for (CarBrand pp : pplist) {
            String ppUrl = pp.getPpUrl();
            String ppUrlStr = ppUrl.length() > prefixLength
                    ? ppUrl.substring(0, prefixLength)
                    : ppUrl;
            // Set.add returns false when the prefix was already seen.
            if (ppUrlSet.add(ppUrlStr)) {
                cxUrllistNew.addAll(CarCrawer.getCxUrl(ppUrl));
            }
        }

        System.out.println(cxUrllistNew.size());

        // getTime() yields a readable timestamp instead of the Calendar's internal field dump.
        CarCrawer.writeTxtFile("\nCalendar: " + Calendar.getInstance().getTime(), txtfilename);
        for (CarBrand cxUrlNew : cxUrllistNew) {
            ArrayList<CarBrand> cxTplist = CarCrawer.getCxPic(cxUrlNew.getCxUrl());

            for (CarBrand cxTp : cxTplist) {
                String tpName = cxUrlNew.getPpName() + "_" + cxUrlNew.getCxName() + "_"
                        + cxTp.getCxTpName() + ".jpg";
                String tpNameUrl = CarCrawer.getBigPic(cxTp.getCxTpUrl());

                CarCrawer.writeTxtFile("\n" + tpName + " " + tpNameUrl, txtfilename);

                if (tpNameUrl == null) {
                    continue;
                }
                try {
                    CarCrawer.download(tpNameUrl, tpName, savePath);
                } catch (Exception e) {
                    // One broken image must not abort a crawl of thousands of pictures.
                    System.out.println("download failed: " + tpNameUrl + " -> " + e);
                }
            }
        }
        System.out.println("finished!");
    }
}

下载结果:
这里写图片描述

改进点:
1 没有爬取每个车系的年款;
2 库有点小,车辆主要是小型车;
3 代码速度要进一步优化。

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值