java爬虫汽车数据_Java爬虫爬取网易汽车车型库

package com.mingo.crawer;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileOutputStream;

import java.io.FileWriter;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStream;

import java.io.RandomAccessFile;

import java.net.URL;

import java.net.URLConnection;

import java.util.ArrayList;

import java.util.HashSet;

import java.util.List;

import java.util.Set;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

public class CarCrawer {

/** Site root that the crawler prepends to the link fragments it extracts from fetched pages. */
public static String carUrl = "http://product.auto.163.com";

/**
 * Fetches the resource at {@code url} and returns its body as a single string.
 *
 * <p>The response is decoded as GB2312 and read line by line. Line terminators are dropped and
 * the lines are joined with no separator, so the result is one long line.
 *
 * <p>Failures are not propagated: the exception is printed and the method returns whatever had
 * been read up to that point (the empty string if nothing was read).
 *
 * @param url absolute URL to request
 * @return the decoded body with line breaks removed; never {@code null}
 */
public static String SendGet(String url) {
    // StringBuilder avoids re-copying the whole page each time a line is appended.
    StringBuilder result = new StringBuilder();
    BufferedReader in = null;
    try {
        URLConnection connection = new URL(url).openConnection();
        // Without timeouts a stalled server would block the crawler indefinitely.
        connection.setConnectTimeout(5 * 1000);
        connection.setReadTimeout(30 * 1000);
        connection.connect();
        in = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), "GB2312"));
        String line;
        while ((line = in.readLine()) != null) {
            result.append(line);
        }
    } catch (Exception e) {
        System.out.println("发送GET请求出现异常!" + e);
        e.printStackTrace();
    } finally {
        // Always release the connection's stream, even after a failed read.
        if (in != null) {
            try {
                in.close();
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
    }
    return result.toString();
}

/**
 * Downloads a file to a local directory.
 *
 * <p>The target directory is created (including missing parents) if it does not exist yet. An
 * existing file with the same name is overwritten.
 *
 * @param urlString address of the file to download
 * @param filename name of the local file to write inside {@code savePath}
 * @param savePath directory the file is stored in
 * @throws Exception if the URL is malformed, the connection fails or times out, or the local
 *     file cannot be written
 */
public static void download(String urlString, String filename,String savePath) throws Exception {
    URLConnection con = new URL(urlString).openConnection();
    // 5 s to connect; the read timeout bounds each blocking read, not the whole transfer.
    con.setConnectTimeout(5 * 1000);
    con.setReadTimeout(30 * 1000);

    InputStream is = null;
    OutputStream os = null;
    try {
        is = con.getInputStream();

        File dir = new File(savePath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        // File(parent, child) uses the platform separator; a literal backslash only works on Windows.
        os = new FileOutputStream(new File(dir, filename));

        byte[] buffer = new byte[1024];
        int len;
        while ((len = is.read(buffer)) != -1) {
            os.write(buffer, 0, len);
        }
    } finally {
        // Close both streams even if the copy failed half-way.
        try {
            if (os != null) {
                os.close();
            }
        } finally {
            if (is != null) {
                is.close();
            }
        }
    }
}

/**
 * Appends {@code content} to a text file, creating the file if it does not exist.
 *
 * <p>The text is written with {@link FileWriter}'s default character encoding.
 *
 * @param content text to append
 * @param txtfilename path of the file to append to
 * @throws Exception if the file cannot be opened or written
 */
public static void writeTxtFile(String content,String txtfilename)throws Exception{
    FileWriter writer = new FileWriter(txtfilename, true);
    try {
        writer.write(content);
    } finally {
        // Release the file handle even when the write fails.
        writer.close();
    }
}

/**
 * Returns the entries of {@code list} with duplicate model names removed.
 *
 * <p>Entries are compared by {@code getCxName()}. The first entry for each name is kept and the
 * original order is preserved. {@code null} entries and entries whose name is {@code null} are
 * dropped. The input list is not modified.
 *
 * @param list entries to filter; {@code null} is treated as an empty list
 * @return a new list holding one entry per distinct model name
 */
public static ArrayList<CarBrand> removeDuplicate(ArrayList<CarBrand> list) {
    ArrayList<CarBrand> unique = new ArrayList<CarBrand>();
    if (list == null) {
        return unique;
    }
    Set<String> seenNames = new HashSet<String>();
    for (CarBrand car : list) {
        if (car == null) {
            continue;
        }
        String name = car.getCxName();
        // Set.add returns false when the name was already recorded.
        if (name != null && seenNames.add(name)) {
            unique.add(car);
        }
    }
    return unique;
}

/*
 * Collects the brand entries found on a brand index page.
 *
 * The page is fetched with CarCrawer.SendGet and scanned by two regex matchers that are
 * advanced in lock-step: each brand-name match is paired with the next brand-URL match and the
 * pair is stored in a new CarBrand (ppName / ppUrl). Scanning stops as soon as either matcher
 * finds no further match.
 *
 * @param url brand index page, for example http://product.auto.163.com/brand/a/
 * @return the CarBrand objects in the order they were matched (raw ArrayList)
 * @throws Exception declared by the signature
 */

public static ArrayList getPpUrl(String url) throws Exception {

ArrayList ppList = new ArrayList();

String content = CarCrawer.SendGet(url);

// Brand-name pattern. The substring(9, length-4) applied to its matches below drops the first
// nine characters (the title= attribute start plus the two-character prefix) and the last four
// characters (the fixed suffix), leaving only the brand name.
Pattern patternName = Pattern.compile("title=\"进入.{1,20}品牌频道");

// NOTE(review): the regex literal on the next line is cut off in this copy of the file (the
// string is never closed), so the class does not compile as-is. Restore the original brand-link
// pattern before use; the substring(9, length-7) offsets applied to its matches depend on it.
Pattern patternUrl = Pattern.compile("

Matcher matcherName = patternName.matcher(content);

Matcher matcherUrl = patternUrl.matcher(content);

while(matcherName.find()&&matcherUrl.find()){

CarBrand carBrand = new CarBrand();

carBrand.setPpName(matcherName.group(0).substring(9, matcherName.group(0).length()-4));

carBrand.setPpUrl(carUrl+matcherUrl.group(0).substring(9, matcherUrl.group(0).length()-7));

//System.out.println(carBrand.getPpName()+": "+carBrand.getPpUrl());

ppList.add(carBrand);

}

return ppList;

}

/*
 * Collects the car models, grouped by brand, found on a brand index page.
 *
 * The page is fetched with CarCrawer.SendGet and cut into segments. Each segment starts at the
 * next occurrence of the class=group marker at or after index i and ends at the next occurrence
 * of an end marker; afterwards i is moved to ten characters past the segment end. Inside a
 * segment the brand title is extracted once (it stays null when the title pattern does not
 * match), then model-name matches and photo-URL matches are paired in lock-step. Every pair
 * becomes a CarBrand carrying ppName (the segment title), cxName (the name match without its
 * first three and last four characters) and cxUrl (carUrl plus the URL match without its
 * trailing quote).
 *
 * NOTE(review): two literals in this method are cut off in this copy of the file and keep it
 * from compiling: the commented-out pattern right after the SendGet call spills onto an
 * uncommented line, and the end-marker string passed to the second indexOf is never closed.
 * Restore both from the original source.
 * NOTE(review): the loop condition tests indexOf(...) > 0, so a marker located at index 0 would
 * not be processed - confirm that this is intended.
 * NOTE(review): the title pattern is only a few characters longer than the brand name, yet
 * substring(6, length-16) removes 22 characters - presumably markup was lost from that literal
 * as well; verify against the original.
 *
 * @param url brand index page, for example http://product.auto.163.com/brand/a/
 * @return the CarBrand objects in the order they were matched (raw ArrayList)
 * @throws Exception declared by the signature
 */

public static ArrayList getCxUrl(String url) throws Exception {

ArrayList cxPicList = new ArrayList();

String content = CarCrawer.SendGet(url);

//Pattern pattern = Pattern.compile("class=\"group\">.*

");

//Matcher matcher = pattern.matcher(content);

int i=0;

while(content.indexOf("class=\"group\">",i)>0){

int subS = content.indexOf("class=\"group\">",i);

int subE = content.indexOf("

",i);

String subContent = content.substring(subS, subE);

i=subE+10;

//System.out.println("subContent "+subContent);

Pattern patternTitle = Pattern.compile("频道\">进入.{1,20}品牌频道]");

Matcher matcherTitle = patternTitle.matcher(subContent);

String strtitle= null;

if(matcherTitle.find()){

strtitle = matcherTitle.group(0).substring(6, matcherTitle.group(0).length()-16);

}

Pattern patternName = Pattern.compile("\"查看.{1,20}图片\">");

Pattern patternUrl = Pattern.compile("/series/photo/.{10,20}\"");

Matcher matcherName = patternName.matcher(subContent);

Matcher matcherUrl = patternUrl.matcher(subContent);

while(matcherName.find()&&matcherUrl.find()){

CarBrand carBrand = new CarBrand();

carBrand.setPpName(strtitle);

//System.out.println(carBrand.getPpName());

carBrand.setCxName(matcherName.group(0).substring(3, matcherName.group(0).length()-4));

carBrand.setCxUrl(carUrl+matcherUrl.group(0).substring(0, matcherUrl.group(0).length()-1));

//System.out.println(carBrand.getCxName()+": "+carBrand.getCxUrl());

cxPicList.add(carBrand);

}

}

return cxPicList;

}

/**
 * Collects up to eight photo-viewer links from a model's photo page.
 *
 * <p>The page is fetched with {@code CarCrawer.SendGet}. The first eight links that match the
 * photo-viewer pattern are labelled by position, in this fixed order: front-left, front, side,
 * rear-left, rear, roof, headlight detail, tail-light detail. Fewer than eight entries are
 * returned when the page has fewer matching links.
 *
 * @param url model photo page, for example
 *     http://product.auto.163.com/series/photo/2350.html#CX001
 * @return one {@code CarBrand} per matched link with {@code cxTpName} and {@code cxTpUrl} set;
 *     empty when nothing matches, never {@code null}
 * @throws Exception declared by the signature
 */
public static ArrayList<CarBrand> getCxPic(String url) throws Exception {
    // Label for the n-th matched link; replaces the former eight-way if/else ladder, whose
    // final error branch could never run because the loop stops after eight matches.
    final String[] viewNames = {
        "左前", "正前", "正侧", "左后", "正后", "车顶", "前大灯局部", "后大灯局部"
    };

    ArrayList<CarBrand> cxPicList = new ArrayList<CarBrand>();
    String content = CarCrawer.SendGet(url);

    // NOTE(review): the dots in this pattern are unescaped and match any character; left as-is
    // so the set of matched links does not change.
    Pattern pattern = Pattern.compile("http://product.auto.163.com/picture/photoview.{30,40}.html");
    Matcher matcher = pattern.matcher(content);

    for (int slot = 0; matcher.find() && slot < viewNames.length; slot++) {
        CarBrand carBrand = new CarBrand();
        carBrand.setCxTpName(viewNames[slot]);
        carBrand.setCxTpUrl(matcher.group(0));
        cxPicList.add(carBrand);
    }
    return cxPicList;
}

/*
 * Extracts the full-size picture address from a photo-viewer page.
 *
 * The page is fetched with CarCrawer.SendGet and searched with a single regex. On the first
 * match the method returns the matched text starting at index 41, which skips a fixed-length
 * prefix; when nothing matches it returns null.
 *
 * NOTE(review): the regex literal in this method is cut off in this copy of the file (the string
 * is never closed) and looks percent-encoded, so the method does not compile as-is. Restore the
 * original pattern and re-check that the offset 41 still fits it.
 *
 * @param url photo-viewer page address
 * @return the picture address, or null when the page contains no match
 * @throws Exception declared by the signature
 */
public static String getBigPic(String url) throws Exception {

String bigPicUrl = null;

String content = CarCrawer.SendGet(url);

Pattern pattern = Pattern.compile("%5C%22.%7B60,70%7D.jpg%22);

Matcher matcher = pattern.matcher(content);

if(matcher.find()){

//System.out.println(matcher.group(0).substring(41));

bigPicUrl = matcher.group(0).substring(41);

}

return bigPicUrl;

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值