使用 jsoup 爬取省/市/区/县/镇/乡的行政区划信息

package com.test;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the 2016 administrative-division code listing from stats.gov.cn with jsoup and appends
 * the entries found under Guangdong province (levels 1 to 4) to a text file, one
 * {@code code:name:level} record per line.
 *
 * <p>URLs that could not be fetched are appended to a second file so they can be retried by hand.
 * All state is static and the class is not designed for concurrent use.
 */
public class DressMessage {
	/** Maps a division level (1-5) to the CSS class of the table rows that hold that level. */
	public static Map<Integer, String> cssMap = new HashMap<>();

	/** Writer for the crawled data file; null until {@link #initFile()} succeeds. */
	public static BufferedWriter bufferedWriter = null;

	/** Writer for the list of URLs whose download failed; null until {@link #initFile()} succeeds. */
	public static BufferedWriter bufferedWriter2 = null;

	/** Name of the only province whose subtree is crawled. */
	private static final String TARGET_PROVINCE = "广东省";

	/** Division code written for the target province record. */
	private static final String TARGET_PROVINCE_CODE = "440000";

	/** Row label that is skipped when writing records. */
	private static final String SKIPPED_NAME = "市辖区";

	/** Deepest level that is crawled (4 = town); level 5 pages are never requested. */
	private static final int MAX_LEVEL = 4;

	/** Number of leading code characters kept for level 2 and level 3 records. */
	private static final int CITY_COUNTY_CODE_LENGTH = 6;

	/** Number of leading code characters kept for level 4 records. */
	private static final int TOWN_CODE_LENGTH = 9;

	/** Pause between two page requests, in milliseconds. */
	private static final long REQUEST_DELAY_MILLIS = 500;

	/**
	 * Opens both output files in append mode. On failure the stack trace is printed and the
	 * writer fields that were not assigned yet stay null.
	 */
	private static void initFile(){
		try {
			bufferedWriter = new BufferedWriter(new FileWriter(new File("f:\\广东省.txt"), true));
			bufferedWriter2 = new BufferedWriter(new FileWriter(new File("f:\\timeOutUrl.txt"), true));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	static{
		cssMap.put(1, "provincetr");	// province
		cssMap.put(2, "citytr");		// city
		cssMap.put(3, "countytr");		// county
		cssMap.put(4, "towntr");		// town
		cssMap.put(5, "villagetr");		// village
	}

	/**
	 * Entry point: fetches the province index page, writes the target province record and
	 * recursively crawls its lower levels. Prints "ok" once the crawl has finished.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		int level = 1;
		initFile();
		try {
			// Fetch the index page that lists every province.
			Document index = connect("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/");
			if (index == null) {
				// connect() has already reported the failure; there is nothing to crawl.
				return;
			}
			Elements provinceRows = index.select("tr." + cssMap.get(level));
			for (Element provinceRow : provinceRows) {
				for (Element province : provinceRow.select("a")) {
					if (TARGET_PROVINCE.equals(province.select("a").last().text())) {
						printProvince(province, level);
						parseNextLevel(province, level + 1);
					}
				}
			}
		} finally {
			// Release both files even when the crawl aborts with an unchecked exception.
			closeStream();
		}
		System.out.println("ok");
	}

	/**
	 * Downloads the page linked from {@code parentElement}, writes one record per row of the
	 * given level and recurses into each row's last link until {@link #MAX_LEVEL} is reached.
	 *
	 * @param parentElement link element whose {@code href} points at the page to crawl
	 * @param level division level of the rows expected on that page
	 */
	private static void parseNextLevel(Element parentElement, int level) {
		try {
			// Throttle requests so the server is not hit in a tight loop.
			Thread.sleep(REQUEST_DELAY_MILLIS);
		} catch (InterruptedException e) {
			// Keep the interrupt visible to the caller and stop descending.
			Thread.currentThread().interrupt();
			return;
		}
		String url = parentElement.attr("abs:href");
		if (url.isEmpty()) {
			// No resolvable link: connect() would reject it and abort the whole crawl.
			return;
		}
		Document doc = connect(url);
		if (doc == null) {
			return;
		}
		// Each matching element is one table row of the current level.
		for (Element row : doc.select("tr." + cssMap.get(level))) {
			printInfo(row, level);
			// Rows without a link have no child page to follow.
			Elements links = row.select("a");
			if (!links.isEmpty() && level + 1 <= MAX_LEVEL) {
				parseNextLevel(links.last(), level + 1);
			}
		}
	}

	/**
	 * Appends one {@code code:name:level} record to the data file.
	 *
	 * <p>Rows named "市辖区" are skipped. The code is the first 6 characters of the first cell
	 * for levels 2 and 3, and the first 9 characters for level 4. Any other level, a row without
	 * cells or a code that is too short is reported on standard output and nothing is written.
	 *
	 * @param element table row that was crawled
	 * @param level division level of the row
	 */
	private static void printInfo(Element element, int level) {
		try {
			Elements cells = element.select("td");
			if (cells.isEmpty()) {
				System.out.println("错误级别:"+level);
				return;
			}
			String name = cells.last().text();
			if (SKIPPED_NAME.equals(name)) {
				return;
			}
			int codeLength;
			if (level == 2 || level == 3) {
				codeLength = CITY_COUNTY_CODE_LENGTH;
			} else if (level == 4) {
				codeLength = TOWN_CODE_LENGTH;
			} else {
				System.out.println("错误级别:"+level);
				return;
			}
			String code = cells.first().text();
			if (code.length() < codeLength) {
				System.out.println("错误级别:"+level);
				return;
			}
			bufferedWriter.write(code.substring(0, codeLength) +":"+ name + ":" +level);
			bufferedWriter.newLine();
			// Flush per record so partial results survive an aborted crawl.
			bufferedWriter.flush();
		} catch (Exception e) {
			System.out.println("错误级别:"+level);
			e.printStackTrace();
		}
	}

	/**
	 * Appends the province record, using the fixed code 440000, to the data file.
	 *
	 * @param element link element whose text is the province name
	 * @param level division level written into the record
	 */
	private static void printProvince(Element element, int level) {
		try {
			bufferedWriter.write(TARGET_PROVINCE_CODE + ":"+element.select("a").last().text() + ":" +level);
			bufferedWriter.newLine();
			bufferedWriter.flush();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Appends a URL that could not be fetched to the failure log file.
	 *
	 * @param url address whose download failed
	 */
	private static void printException(String url) {
		try {
			bufferedWriter2.write(url);
			bufferedWriter2.newLine();
			bufferedWriter2.flush();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Fetches and parses the page at {@code url} with a 100 second timeout.
	 *
	 * @param url address to fetch; must be neither null nor empty
	 * @return the parsed document, or null when the request failed (the URL is then printed and
	 *         recorded in the failure log)
	 * @throws IllegalArgumentException if {@code url} is null or empty
	 */
	private static Document connect(String url) {
		if (url == null || url.isEmpty()) {
			throw new IllegalArgumentException("The input url('" + url + "') is invalid!");
		}
		try {
			return Jsoup.connect(url).timeout(100*1000).get();
		} catch (Exception e) {
			System.out.println(url);
			printException(url);
			e.printStackTrace();
		}
		return null;
	}

	/** Closes both output writers and resets the fields to null. */
	private static void closeStream() {
		closeQuietly(bufferedWriter);
		bufferedWriter = null;
		closeQuietly(bufferedWriter2);
		bufferedWriter2 = null;
	}

	/** Closes one writer if it is open, printing (not propagating) any failure. */
	private static void closeQuietly(BufferedWriter writer) {
		if (writer == null) {
			return;
		}
		try {
			writer.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}
运行结果如下图
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值