现在以https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=1为例
抓取这个站点的汽车信息
1、定义用于保存抓取结果的汽车信息对象类
package com.xiang;
import java.util.List;
/**
 * Value holder for one record, stored as an ordered list of text cells.
 *
 * <p>An earlier draft of this class declared one field per column
 * (manufacturer, series, model, engine code, kilowatt, horsepower, make time).
 * NOTE(review): the list presumably carries those values in that order, but the
 * order is decided entirely by whoever fills the list; confirm against the caller.
 */
public class CarInfo {

    /** Text cells of this record; {@code null} until {@link #setCar(List)} is called. */
    List<String> car;

    /**
     * Returns the cell list exactly as it was last stored (no copy is made).
     *
     * @return the stored list, or {@code null} if none has been set
     */
    public List<String> getCar() {
        return this.car;
    }

    /**
     * Stores the given cell list by reference.
     *
     * @param car the cells of this record
     */
    public void setCar(List<String> car) {
        this.car = car;
    }
}
2、设置目录的类(包括子目录与父目录的关系)
package com.xiang;
import java.util.List;
/**
 * A catalogue category: an id, a display name, and an optional list of
 * child categories of the same type.
 */
public class CategoryAnther {

    /** Identifier of this category. */
    private String id;

    /** Display name of this category. */
    private String name;

    /** Child categories; {@code null} until explicitly set. */
    private List<CategoryAnther> categoryAnther;

    /** @return the identifier, or {@code null} if none has been set */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the display name, or {@code null} if none has been set */
    public String getName() {
        return this.name;
    }

    /** @param name the display name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the child categories as stored (no copy), or {@code null} if none have been set */
    public List<CategoryAnther> getCategoryAnther() {
        return this.categoryAnther;
    }

    /** @param categoryAnther the child categories, stored by reference */
    public void setCategoryAnther(List<CategoryAnther> categoryAnther) {
        this.categoryAnther = categoryAnther;
    }
}
3、主程序抓取
package com.xiang;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Command-line scraper for the MANN-HUMMEL online catalogue.
 *
 * <p>For every catalogue entry page it reads the first-level select box
 * ({@code ktlg_01_mrksl}) and, per first-level entry, the second-level select box
 * ({@code ktlg_01_mdrsl}). It then requests the result page of every
 * first-level/second-level pair, extracts the link texts of the result rows and
 * appends them, column by column, to {@code liufen.txt}.
 */
public class ExportInfo {

    /** Catalogue entry pages; they differ only in the {@code ktlg_01_fzart} parameter. */
    private static final String[] CATALOG_URLS = {
        "https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=1",
        "https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=2"
    };

    /** Charset used both to decode the HTTP response and to configure the HTML parser. */
    private static final String PAGE_CHARSET = "gb2312";

    /** Connect and read timeout for page downloads, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20000;

    /** Number of columns written to the output file. */
    private static final int COLUMN_COUNT = 7;

    /** Index of the column holding the second-level category name (raw option HTML). */
    private static final int MODEL_COLUMN = 1;

    /**
     * Scrapes all catalogue entry pages and appends the result to {@code liufen.txt}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("main start-----------" + new Date());
        List<CarInfo> carInfo = new ArrayList<CarInfo>();
        for (String baseUrl : CATALOG_URLS) {
            // Each category must be queried against the entry page it was read from;
            // mixing ids of one page with the base URL of another yields wrong results.
            for (CategoryAnther categoryAnther : addChildrenToList(baseUrl)) {
                List<CategoryAnther> childrenCategory = categoryAnther.getCategoryAnther();
                if (childrenCategory == null) {
                    continue;
                }
                for (CategoryAnther child : childrenCategory) {
                    String _url = baseUrl + "&ktlg_01_mrksl=" + categoryAnther.getId()
                            + "&ktlg_01_mdrsl=" + child.getId();
                    carInfo.addAll(getDataByUrl(categoryAnther.getName(), child.getName(), _url));
                }
            }
        }
        writeColumns(carInfo, new File("liufen.txt"));
        System.out.println("main end-----------" + new Date());
    }

    /**
     * Appends all rows to the target file, one section per column: a header line
     * followed by that column's value of every row.
     */
    private static void writeColumns(List<CarInfo> rows, File target) {
        FileWriter fw = null;
        try {
            // Append mode; FileWriter creates the file when it does not exist yet.
            fw = new FileWriter(target, true);
            for (int column = 0; column < COLUMN_COUNT; column++) {
                fw.write("开始写入" + (column + 1) + "------\r\n");
                for (CarInfo row : rows) {
                    fw.write(cellText(row, column) + "\r\n");
                }
            }
            fw.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(fw);
        }
    }

    /**
     * Returns the text of one cell, or an empty string when the row has no such cell,
     * so that a short row cannot abort the export.
     */
    private static String cellText(CarInfo row, int column) {
        List<String> cells = row.getCar();
        if (cells == null || column >= cells.size() || cells.get(column) == null) {
            return "";
        }
        String text = cells.get(column);
        if (column == MODEL_COLUMN) {
            // This column holds raw option HTML, so non-breaking spaces can show up
            // either as the entity or as the character itself; normalise both.
            text = text.replace("&nbsp;", " ").replace('\u00a0', ' ');
        }
        return text;
    }

    /** Closes the resource if it is non-null; a failure to close is only reported. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads a page and returns its content as a single string.
     *
     * <p>The lines of the response are concatenated WITHOUT line terminators. The
     * extraction code in this class addresses child nodes by fixed index, so the
     * text handed to the parser must keep exactly this shape.
     *
     * @param url address of the page to download
     * @return the page content decoded as gb2312; whatever was read before a failure
     *         (possibly the empty string) when the download fails, never {@code null}
     */
    public static String getHtmlByUrl(String url) {
        StringBuilder html = new StringBuilder();
        BufferedReader in = null;
        try {
            URLConnection urlConnection = new URL(url).openConnection();
            urlConnection.setConnectTimeout(TIMEOUT_MILLIS);
            urlConnection.setReadTimeout(TIMEOUT_MILLIS);
            in = new BufferedReader(
                    new InputStreamReader(urlConnection.getInputStream(), PAGE_CHARSET));
            String rString;
            while ((rString = in.readLine()) != null) {
                html.append(rString);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return html.toString();
    }

    /**
     * Parses the HTML and returns the children of the first node whose attribute has
     * the given value, or {@code null} when parsing fails or no node matches.
     */
    private static NodeList childrenOfFirstMatch(String html, String attribute, String value) {
        Parser parser = Parser.createParser(html, PAGE_CHARSET);
        NodeFilter nameFilter = new HasAttributeFilter(attribute, value);
        NodeList matches;
        try {
            matches = parser.extractAllNodesThatMatch(nameFilter);
        } catch (ParserException e) {
            e.printStackTrace();
            return null;
        }
        if (matches == null || matches.size() == 0) {
            return null;
        }
        return matches.elementAt(0).getChildren();
    }

    /**
     * Downloads a result page and extracts its rows.
     *
     * <p>Each returned {@link CarInfo} starts with {@code firstName} and
     * {@code secondName}, followed by the link texts found in the row. A page that
     * cannot be downloaded or does not have the expected structure yields an empty
     * list; an unexpected cell yields an empty string in its place.
     *
     * @param firstName  name of the first-level category, stored as cell 0
     * @param secondName name of the second-level category, stored as cell 1
     * @param url        address of the result page
     * @return the extracted rows, never {@code null}
     */
    public static List<CarInfo> getDataByUrl(String firstName, String secondName, String url) {
        System.out.println("getDataByUrl start-----------" + new Date());
        List<CarInfo> carInfoList = new ArrayList<CarInfo>();
        NodeList trlist = resultRows(getHtmlByUrl(url));
        if (trlist != null) {
            // Rows start at child 6 and occupy every second child of the table node.
            for (int i = 6; i < trlist.size(); i = i + 2) {
                // Declared as Object: the simple name Node is bound to
                // org.w3c.dom.Node by the imports of this file.
                Object rowNode = trlist.elementAt(i);
                if (!(rowNode instanceof TableRow)) {
                    continue;
                }
                NodeList tdlist = ((TableRow) rowNode).getChildren();
                if (tdlist == null) {
                    continue;
                }
                List<String> trInfo = new ArrayList<String>();
                trInfo.add(firstName);
                trInfo.add(secondName);
                // Cells of interest are child 2 and then every third child; the link
                // is child 1 of the first such cell and child 2 of all later ones.
                for (int j = 2; j < tdlist.size(); j = j + 3) {
                    trInfo.add(linkTextOf(tdlist.elementAt(j), j == 2 ? 1 : 2));
                }
                CarInfo carInfo = new CarInfo();
                carInfo.setCar(trInfo);
                System.out.println(trInfo.get(0));
                carInfoList.add(carInfo);
            }
        }
        System.out.println("getDataByUrl end-----------" + new Date());
        return carInfoList;
    }

    /**
     * Returns the children of the second child of the node with id "rahmen",
     * or {@code null} when the page does not have that structure.
     */
    private static NodeList resultRows(String html) {
        NodeList tablelist = childrenOfFirstMatch(html, "id", "rahmen");
        if (tablelist == null || tablelist.size() < 2) {
            return null;
        }
        return tablelist.elementAt(1).getChildren();
    }

    /**
     * Returns the text of the link at the given child index of a table cell, or an
     * empty string when the node is not a table cell or has no link at that index.
     */
    private static String linkTextOf(Object cell, int linkIndex) {
        if (!(cell instanceof TableColumn)) {
            return "";
        }
        NodeList alist = ((TableColumn) cell).getChildren();
        if (alist == null || linkIndex >= alist.size()) {
            return "";
        }
        Object link = alist.elementAt(linkIndex);
        if (!(link instanceof LinkTag)) {
            return "";
        }
        return ((LinkTag) link).getLinkText();
    }

    /**
     * Reads the first-level categories of an entry page and attaches to each of them
     * its second-level categories.
     *
     * @param url address of the catalogue entry page
     * @return the first-level categories with their children set, never {@code null}
     */
    public static List<CategoryAnther> addChildrenToList(String url) {
        System.out.println("addChildrenToList start-----------" + new Date());
        List<CategoryAnther> firstCategrory = getFirstPageCategoryIds(url, "ktlg_01_mrksl");
        for (CategoryAnther parent : firstCategrory) {
            String _url = url + "&ktlg_01_mrksl=" + parent.getId();
            // Resolve the second-level select box for this first-level entry.
            parent.setCategoryAnther(getFirstPageCategoryIds(_url, "ktlg_01_mdrsl"));
        }
        System.out.println("addChildrenToList end-----------" + new Date());
        return firstCategrory;
    }

    /**
     * Downloads a page and converts the options of the select box with the given
     * {@code name} attribute into categories (id = option value, name = option
     * content). The first child of the select box is skipped, as before.
     *
     * @param url       address of the page containing the select box
     * @param nameValue value of the {@code name} attribute of the select box
     * @return the categories found; empty when the page cannot be read or has no
     *         such select box, never {@code null}
     */
    public static List<CategoryAnther> getFirstPageCategoryIds(String url, String nameValue) {
        System.out.println("getFirstPageCategoryIds start-----------" + new Date());
        List<CategoryAnther> categorys = new ArrayList<CategoryAnther>();
        NodeList optionList = childrenOfFirstMatch(getHtmlByUrl(url), "name", nameValue);
        if (optionList != null) {
            for (int i = 1; i < optionList.size(); i++) {
                Object node = optionList.elementAt(i);
                if (!(node instanceof OptionTag)) {
                    // Anything that is not an option (e.g. stray text) carries no category.
                    continue;
                }
                OptionTag option = (OptionTag) node;
                CategoryAnther categoryAnther = new CategoryAnther();
                categoryAnther.setId(option.getAttribute("value"));
                categoryAnther.setName(option.getChildrenHTML());
                categorys.add(categoryAnther);
            }
        }
        System.out.println("getFirstPageCategoryIds end-----------" + new Date());
        return categorys;
    }

    /**
     * Copies a text file line by line to the given writer, terminating every line
     * with CRLF. Blank lines and lines containing "-" are copied unchanged; every
     * other line gets "→" appended. The writer is neither flushed nor closed.
     *
     * @param fileName path of the file to read
     * @param fw       writer receiving the transformed lines
     */
    public static void readFileByLines(String fileName, FileWriter fw) {
        File file = new File(fileName);
        BufferedReader reader = null;
        try {
            System.out.println("以行为单位读取文件内容,一次读一整行:");
            reader = new BufferedReader(new FileReader(file));
            String tempString;
            // readLine() returns null at end of file.
            while ((tempString = reader.readLine()) != null) {
                boolean copyUnchanged =
                        tempString.trim().equals("") || tempString.indexOf("-") > -1;
                if (copyUnchanged) {
                    fw.write(tempString + "\r\n");
                } else {
                    fw.write(tempString + "→" + "\r\n");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
    }
}
如有不明白的地方,请致电 13886053422 或联系 QQ 526151410。
下面附有项目文件。由于版权原因,文件设有密码,如有需要请向本人索取。