package com.mingo.crawer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CarCrawer {
// Site root that the brand-link and series-link extraction code prepends to the relative paths it matches.
public static String carUrl = "http://product.auto.163.com";
/**
 * Fetches the resource at {@code url} and returns its text, decoded as GB2312.
 *
 * <p>Lines are joined without their line terminators, so the result is one
 * unbroken string. Failures never propagate: the exception is printed to the
 * console and whatever had been read up to that point (possibly the empty
 * string) is returned.
 *
 * @param url address of the resource to read
 * @return the decoded content with line breaks removed; never {@code null}
 */
public static String SendGet(String url) {
    // Accumulate in a builder; repeated String += recopies the whole page for every line read.
    StringBuilder result = new StringBuilder();
    BufferedReader in = null;
    try {
        URL realUrl = new URL(url);
        URLConnection connection = realUrl.openConnection();
        // Open the actual connection before asking for the response stream.
        connection.connect();
        in = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), "GB2312"));
        String line;
        while ((line = in.readLine()) != null) {
            // readLine() strips the terminator and none is added back on purpose:
            // callers receive the page as a single line.
            result.append(line);
        }
    } catch (Exception e) {
        // Best effort: report and fall through so the caller still gets the partial text.
        System.out.println("发送GET请求出现异常!" + e);
        e.printStackTrace();
    } finally {
        try {
            if (in != null) {
                in.close();
            }
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return result.toString();
}
/**
 * Downloads a remote file into a local directory.
 *
 * @param urlString
 *            address of the file to download
 * @param filename
 *            name given to the local copy
 * @param savePath
 *            directory that receives the file; created, including missing
 *            parents, when it does not exist yet
 * @throws Exception
 *             if the URL is malformed, the connection cannot be opened or
 *             read, or the local file cannot be written
 */
public static void download(String urlString, String filename,String savePath) throws Exception {
    URL url = new URL(urlString);
    URLConnection con = url.openConnection();
    // Give up on establishing the connection after 5 seconds.
    con.setConnectTimeout(5*1000);
    InputStream is = con.getInputStream();
    try {
        File sf = new File(savePath);
        if (!sf.exists()) {
            sf.mkdirs();
        }
        // File(parent, child) joins with the platform separator; a literal "\\"
        // only names a file inside savePath on Windows.
        OutputStream os = new FileOutputStream(new File(sf, filename));
        try {
            // Copy through a 1 KB buffer.
            byte[] bs = new byte[1024];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
        } finally {
            // Closed in finally so a failed read or write does not leak the file handle.
            os.close();
        }
    } finally {
        is.close();
    }
}
/**
 * Appends {@code content} to the end of the text file {@code txtfilename}.
 *
 * <p>The file is opened in append mode, so existing content is kept. Text is
 * encoded by {@link FileWriter}, i.e. with the platform default charset.
 *
 * @param content
 *            text to append
 * @param txtfilename
 *            path of the file to append to
 * @throws Exception
 *             if the file cannot be opened or written
 */
public static void writeTxtFile(String content,String txtfilename)throws Exception{
    FileWriter writer = new FileWriter(txtfilename, true);
    try {
        writer.write(content);
    } finally {
        // Closed in finally so a failed write does not leave the file handle open.
        writer.close();
    }
}
/**
 * Returns the entries of {@code list} whose {@code getCxName()} values are
 * distinct, keeping the first entry seen for each name and preserving the
 * original order.
 *
 * <p>{@code null} entries and entries whose {@code getCxName()} is
 * {@code null} are left out of the result. The input list is not modified.
 *
 * @param list
 *            entries to filter; {@code null} is treated as empty
 * @return a new list holding one entry per distinct name; never {@code null}
 */
public static ArrayList<CarBrand> removeDuplicate(ArrayList<CarBrand> list) {
    ArrayList<CarBrand> unique = new ArrayList<CarBrand>();
    if (list == null) {
        return unique;
    }
    Set<String> seenNames = new HashSet<String>();
    for (CarBrand car : list) {
        if (car == null) {
            continue;
        }
        String name = car.getCxName();
        // Set.add reports whether the name was new, so no separate contains() lookup is needed.
        if (name != null && seenNames.add(name)) {
            unique.add(car);
        }
    }
    return unique;
}
/*
 * Reads the page at the given URL through SendGet and pairs up two kinds of
 * matches found in it: the name marker (text between 进入 and 品牌频道 inside a
 * title attribute) and a link. The two matchers advance in lockstep; each pair
 * is written to a new CarBrand with setPpName and setPpUrl, the link being
 * prefixed with carUrl.
 *
 * NOTE(review): this copy of the method is incomplete. The link pattern
 * literal is cut off right after its opening quote, and the statements that
 * would store carBrand in ppList, close the loop and return are not present.
 * Restore them from the original source before relying on this method.
 *
 * @param url
 *            page to scan, for example http://product.auto.163.com/brand/a/
 */
public static ArrayList getPpUrl(String url) throws Exception {
ArrayList ppList = new ArrayList();
String content = CarCrawer.SendGet(url);
Pattern patternName = Pattern.compile("title=\"进入.{1,20}品牌频道");
Pattern patternUrl = Pattern.compile("
Matcher matcherName = patternName.matcher(content);
Matcher matcherUrl = patternUrl.matcher(content);
while(matcherName.find()&&matcherUrl.find()){
CarBrand carBrand = new CarBrand();
carBrand.setPpName(matcherName.group(0).substring(9, matcherName.group(0).length()-4));
carBrand.setPpUrl(carUrl+matcherUrl.group(0).substring(9, matcherUrl.group(0).length()-7));
//System.out.println(carBrand.getPpName()+": "+carBrand.getPpUrl());
/* Example: http://product.auto.163.com/brand/a/ */
/*
 * Reads the page at the given URL through SendGet and walks it one section at
 * a time, a section starting at each occurrence of the class=group marker.
 * Inside a section a title pattern is applied first, then two matchers (the
 * 查看...图片 name marker and the /series/photo/ link) advance in lockstep; each
 * pair is written to a new CarBrand with setCxName and setCxUrl, the link
 * being prefixed with carUrl and stripped of its trailing quote.
 *
 * NOTE(review): this copy of the method is incomplete. The variables i, subE
 * and strtitle are used without a visible declaration, matcherTitle.group(0)
 * is read with no visible find() call, and the statements that would store
 * carBrand, close both loops and return are not present. Restore them from
 * the original source before relying on this method.
 *
 * @param url
 *            page to scan
 */
public static ArrayList getCxUrl(String url) throws Exception {
ArrayList cxPicList = new ArrayList();
String content = CarCrawer.SendGet(url);
//Pattern pattern = Pattern.compile("class=\"group\">.*
//Matcher matcher = pattern.matcher(content);
while(content.indexOf("class=\"group\">",i)>0){
int subS = content.indexOf("class=\"group\">",i);
String subContent = content.substring(subS, subE);
//System.out.println("subContent "+subContent);
Pattern patternTitle = Pattern.compile("频道\">进入.{1,20}品牌频道]");
Matcher matcherTitle = patternTitle.matcher(subContent);
strtitle = matcherTitle.group(0).substring(6, matcherTitle.group(0).length()-16);
Pattern patternName = Pattern.compile("\"查看.{1,20}图片\">");
Pattern patternUrl = Pattern.compile("/series/photo/.{10,20}\"");
Matcher matcherName = patternName.matcher(subContent);
Matcher matcherUrl = patternUrl.matcher(subContent);
while(matcherName.find()&&matcherUrl.find()){
CarBrand carBrand = new CarBrand();
//System.out.println(carBrand.getPpName());
carBrand.setCxName(matcherName.group(0).substring(3, matcherName.group(0).length()-4));
carBrand.setCxUrl(carUrl+matcherUrl.group(0).substring(0, matcherUrl.group(0).length()-1));
//System.out.println(carBrand.getCxName()+": "+carBrand.getCxUrl());
/* Example: http://product.auto.163.com/series/photo/2350.html#CX001 */
/*
 * Reads the page at the given URL through SendGet and looks for photoview
 * page links in it. A CarBrand is given a view label chosen by num through
 * setCxTpName and the matched link through setCxTpUrl. The labels, in order
 * for num 1 to 8, read roughly as: front-left, front, side, rear-left, rear,
 * roof, headlight detail, tail-light detail. Any other num prints an error
 * line and makes the method return null.
 *
 * NOTE(review): this copy of the method is incomplete. num is used without a
 * visible declaration, matcher.group(0) is read with no visible find() call,
 * and the statements that would store carBrand in cxPicList and return are
 * not present. Restore them from the original source before relying on this
 * method.
 *
 * @param url
 *            page to scan
 */
public static ArrayList getCxPic(String url) throws Exception {
ArrayList cxPicList = new ArrayList();
String content = CarCrawer.SendGet(url);
Pattern pattern = Pattern.compile("http://product.auto.163.com/picture/photoview.{30,40}.html");
Matcher matcher = pattern.matcher(content);
CarBrand carBrand = new CarBrand();
if(num==1){ carBrand.setCxTpName("左前");} else if(num==2){ carBrand.setCxTpName("正前");
}else if(num==3){ carBrand.setCxTpName("正侧"); } else if(num==4){ carBrand.setCxTpName("左后");
}else if(num==5){ carBrand.setCxTpName("正后"); } else if(num==6){ carBrand.setCxTpName("车顶");
}else if(num==7){ carBrand.setCxTpName("前大灯局部"); } else if(num==8){ carBrand.setCxTpName("后大灯局部");
}else{ System.out.println("Error: num = "+num); return null;}
carBrand.setCxTpUrl(matcher.group(0));
//System.out.println(carBrand.getCxTpName()+": "+matcher.group(0));
public static String getBigPic(String url) throws Exception {
String content = CarCrawer.SendGet(url);
Pattern pattern = Pattern.compile("
Matcher matcher = pattern.matcher(content);
//System.out.println(matcher.group(0).substring(41));
bigPicUrl = matcher.group(0).substring(41);