抓取网站全站信息,并导出数据为EXCEL

现在以https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=1为例

抓取这个站点的汽车信息

1、定义用于保存抓取结果的汽车信息类

package com.xiang;

import java.util.List;

/**
 * Simple holder for one row of vehicle data, kept as an ordered list of
 * cell strings rather than as individual typed fields.
 *
 * <p>NOTE(review): an earlier draft declared the fields manufacturer, series,
 * model, engine code, kilowatt, horsepower and production time; presumably
 * the list elements follow that order -- confirm against the code that
 * fills the list.
 */
public class CarInfo {

	/** Cell values of the row; stays {@code null} until {@link #setCar(List)} is called. */
	List<String> car;

	/**
	 * Replaces the stored row. The list is stored by reference, not copied.
	 *
	 * @param car the cell values to store
	 */
	public void setCar(List<String> car) {
		this.car = car;
	}

	/**
	 * Returns the stored row.
	 *
	 * @return the same list instance that was passed to {@link #setCar(List)},
	 *         or {@code null} if none was set
	 */
	public List<String> getCar() {
		return this.car;
	}

}

 2、设置目录的类(包括子目录与父目录的关系)

package com.xiang;

import java.util.List;

/**
 * One catalogue category: an id, a display name and an optional list of
 * child categories of the same type.
 */
public class CategoryAnther {

	/** Identifier of the category. */
	private String id;

	/** Display name of the category. */
	private String name;

	/** Child categories; {@code null} until assigned. */
	private List<CategoryAnther> categoryAnther;

	/** @return the category id, or {@code null} if not set */
	public String getId() {
		return this.id;
	}

	/** @return the category name, or {@code null} if not set */
	public String getName() {
		return this.name;
	}

	/** @return the child categories (same list instance that was set), or {@code null} */
	public List<CategoryAnther> getCategoryAnther() {
		return this.categoryAnther;
	}

	/** @param id the category id to store */
	public void setId(String id) {
		this.id = id;
	}

	/** @param name the category name to store */
	public void setName(String name) {
		this.name = name;
	}

	/** @param categoryAnther the child categories to store (kept by reference) */
	public void setCategoryAnther(List<CategoryAnther> categoryAnther) {
		this.categoryAnther = categoryAnther;
	}

}

 3、主程序抓取

package com.xiang;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.OptionTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class ExportInfo {
	/**
	 * @param args
	 **/
	public static void main(String[] args) {

		System.out.println("main start-----------"+new Date());
		// TODO Auto-generated method stub
		String url1 = "https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=1";
		String url2 = "https://www.mann-hummel.com/mf_prodkata_china/index.html?ktlg_page=1&ktlg_lang=16&ktlg_01_fzart=2";
		List<CategoryAnther> firstCategory = new ArrayList<CategoryAnther>();
//		List<CategoryAnther> secondCategory = new ArrayList<CategoryAnther>();
		
		firstCategory = addChildrenToList(url1);
		firstCategory.addAll(addChildrenToList(url2));
//		secondCategory = addChildrenToList(url2);
		List<CarInfo> carInfo = new ArrayList<CarInfo>();
		try{
		File f = new File("liufen.txt");
		if(!f.exists())
			f.createNewFile();
		FileWriter fw = new FileWriter(f,true);
//		readFileByLines("xiangqi.txt",fw);
		for(int i =0;i<firstCategory.size();i++){
			CategoryAnther categoryAnther = firstCategory.get(i);
			List<CategoryAnther> childrenCategory = categoryAnther.getCategoryAnther();
			for(int j=0;j<childrenCategory.size();j++){
				String _url = url1+"&ktlg_01_mrksl="+categoryAnther.getId()+"&ktlg_01_mdrsl="+childrenCategory.get(j).getId();
//				System.out.println(_url);
				//start analyze data by url
				carInfo.addAll(getDataByUrl(categoryAnther.getName(),childrenCategory.get(j).getName(),_url));
			}
		}
//		for(int i =0;i<secondCategory.size();i++){
//			CategoryAnther categoryAnther = secondCategory.get(i);
//			List<CategoryAnther> childrenCategory = categoryAnther.getCategoryAnther();
//			for(int j=0;j<childrenCategory.size();j++){
//				String _url = url2+"&ktlg_01_mrksl="+categoryAnther.getId()+"&ktlg_01_mdrsl="+childrenCategory.get(j).getId();
//				//start analyze data by url
//				carInfo.addAll(getDataByUrl(categoryAnther.getName(),childrenCategory.get(j).getName(),_url));
//			}
//		}
		fw.write("开始写入1------\r\n");
		for(int k=0;k<carInfo.size();k++){
			fw.write(carInfo.get(k).getCar().get(0)+"\r\n");
		}
		fw.write("开始写入2------\r\n");
		for(int k=0;k<carInfo.size();k++){
			fw.write(carInfo.get(k).getCar().get(1).replace("&#160;", " ")+"\r\n");
		}
		fw.write("开始写入3------\r\n");
		for(int k=0;k<carInfo.size();k++){
			fw.write(carInfo.get(k).getCar().get(2)+"\r\n");
		}
		fw.write("开始写入4------\r\n");
		for(int k=0;k<carInfo.size();k++){
			fw.write(carInfo.get(k).getCar().get(3)+"\r\n");
		}
		fw.write("开始写入5------\r\n");
		for(int k=0;k<carInfo.size();k++){
			fw.write(carInfo.get(k).getCar().get(4)+"\r\n");
		}
		fw.write("开始写入6------\r\n");
		for(int k=0;k<carInfo.size();k++){
			fw.write(carInfo.get(k).getCar().get(5)+"\r\n");
		}
		fw.write("开始写入7------\r\n");
		for(int k=0;k<carInfo.size();k++){
			fw.write(carInfo.get(k).getCar().get(6)+"\r\n");
		}
		fw.flush();
		fw.close();

		}catch(Exception e){
			e.printStackTrace();
		}
		System.out.println("main end-----------"+new Date());
	}
	
	public static String getHtmlByUrl(String url){
		int layouttime = 20000;
		String html ="";
		try {
			URL b = new URL(url);
			URLConnection urlConnection = b.openConnection();
			urlConnection.setReadTimeout(layouttime);
			InputStream inputStream = urlConnection.getInputStream();
			BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, "gb2312"));
			String rString = null;
			while ((rString = in.readLine()) != null) {
				html+=rString;
			}
		}catch(Exception e){
			e.printStackTrace();
		}
			return html;
	}
	
	public static List<CarInfo> getDataByUrl(String firstName,String secondName,String url){
		System.out.println("getDataByUrl start-----------"+new Date());
		List<CarInfo> carInfoList = new ArrayList<CarInfo>();
		String html ="";
		html = getHtmlByUrl(url);
		    Parser parser = Parser.createParser(html, "gb2312");

	        NodeFilter nameFilter = new HasAttributeFilter("id",
	        		"rahmen");
	        NodeList list = null;
			try {
				list = parser.extractAllNodesThatMatch(nameFilter);
			} catch (ParserException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
//	        System.out.println("得到的行数的大小1:"+list.toHtml());
	        NodeList tablelist= list.elementAt(0).getChildren();
//	        System.out.println("得到的行数的大小2:"+tablelist.toHtml());
	        NodeList trlist= tablelist.elementAt(1).getChildren();
//	        System.out.println("得到的行数的大小3:"+trlist.toHtml());
	        for(int i =6;i<trlist.size();i=i+2){
	        	List<String> trInfo = new ArrayList<String>();
	        	trInfo.add(firstName);
	        	trInfo.add(secondName);
	        	TableRow tableRow = (TableRow) trlist.elementAt(i);
	        	NodeList tdlist = tableRow.getChildren();
	        	for(int j =2;j<tdlist.size();j=j+3){
	        		TableColumn tableColumn = (TableColumn) tdlist.elementAt(j);
	        		NodeList alist = tableColumn.getChildren();
	        		LinkTag linkTag = null;
	        		if(j==2)
	        			linkTag = (LinkTag) alist.elementAt(1);
	        		else
	        			linkTag = (LinkTag) alist.elementAt(2);
	        		trInfo.add(linkTag.getLinkText());
//	        		System.out.print(linkTag.getLinkText()+"--");
	        	}
	        	CarInfo carInfo = new CarInfo();
	        	carInfo.setCar(trInfo);
	        	System.out.println(trInfo.get(0));
	        	carInfoList.add(carInfo);
	        }
		System.out.println("getDataByUrl end-----------"+new Date());
		return carInfoList;
	}
	
	public static List<CategoryAnther> addChildrenToList(String url){
		System.out.println("addChildrenToList start-----------"+new Date());
		List<CategoryAnther> firstCategrory = getFirstPageCategoryIds(url,"ktlg_01_mrksl");
		for (int i = 0; i < firstCategrory.size(); i++) {
			String _url = url + "&ktlg_01_mrksl=" + firstCategrory.get(i).getId();
			//对二级目录进行解析
			firstCategrory.get(i).setCategoryAnther(getFirstPageCategoryIds(_url,"ktlg_01_mdrsl"));
		}
		System.out.println("addChildrenToList end-----------"+new Date());
		return firstCategrory;
	}
	public static List<CategoryAnther> getFirstPageCategoryIds(String url,String nameValue) {
		System.out.println("getFirstPageCategoryIds start-----------"+new Date());
		List<CategoryAnther> categorys = new ArrayList<CategoryAnther>();
		String html ="";
		html = getHtmlByUrl(url);
		    Parser parser = Parser.createParser(html, "gb2312");

	        NodeFilter nameFilter = new HasAttributeFilter("name",
	        		nameValue);
	        NodeList list = null;
			try {
				list = parser.extractAllNodesThatMatch(nameFilter);
			} catch (ParserException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
			}
	        NodeList optionList= list.elementAt(0).getChildren();
	        for(int i =1;i<optionList.size();i++){
	        	OptionTag option =  (OptionTag) optionList.elementAt(i);
	        	CategoryAnther categoryAnther = new CategoryAnther();
//	        	System.out.print(option.getAttribute("value")+"--");
//	        	System.out.println(option.getChildrenHTML());
	        	categoryAnther.setId(option.getAttribute("value"));
	        	categoryAnther.setName(option.getChildrenHTML());
	        	categorys.add(categoryAnther);
	        }
		System.out.println("getFirstPageCategoryIds end-----------"+new Date());
		return categorys;
	}
	 public static void readFileByLines(String fileName,FileWriter fw) {
	        File file = new File(fileName);
	        BufferedReader reader = null;
	        try {
	            System.out.println("以行为单位读取文件内容,一次读一整行:");
	            reader = new BufferedReader(new FileReader(file));
	            String tempString = null;
	            int line = 1;
	            // 一次读入一行,直到读入null为文件结束
	            while ((tempString = reader.readLine()) != null) {
	                // 显示行号
	            	if(tempString.trim().equals(""))
	            		fw.write(tempString+"\r\n");
	            	else if(tempString.indexOf("-")>-1)
	                	fw.write(tempString+"\r\n");
	                else
	                	fw.write(tempString+"→"+"\r\n");
	                	
	            }
	            reader.close();
	        } catch (IOException e) {
	            e.printStackTrace();
	        } finally {
	            if (reader != null) {
	                try {
	                    reader.close();
	                } catch (IOException e1) {
	                }
	            }
	        }
	    }
}

 不懂得call 13886053422      或QQ 526151410

下面附有项目文件。由于版权所有,设有密码。请向本人索要密码

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值