java 抓取2016年统计用区划代码和城乡划分代码

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes the 2016 statistical administrative-division codes published at
 * {@link #SITE_URL} and turns them into {@code insert into sys_region} statements.
 *
 * <p>Three levels are walked: province rows ({@code tr.provincetr}), city rows
 * ({@code tr.citytr}) and county rows ({@code tr.countytr}). Every page is decoded
 * as GBK. The generated SQL text is handed to {@code Region.writeFile}.
 *
 * @author brianye QQ 
 * @date 2017-7-10
 */
public class GetRegion {

	/** Index page listing all provinces; the entry point of the crawl. */
	public static String SITE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html";

	/** Provinces collected by {@link #getProvince()}, each holding its cities and counties. */
	private static List<RegionEntry> regions = new ArrayList<RegionEntry>();

	/** Number of leading characters of a scraped code that are kept as the region code. */
	private static final int CODE_LENGTH = 6;

	/** Pause between two province requests, in milliseconds. */
	private static final long PROVINCE_DELAY_MS = 2000L;

	/**
	 * Runs the crawl, builds one insert statement per province, city and county,
	 * and passes the resulting text to {@code Region.writeFile}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		System.out.println("抓取开始:"+ new Date());
		getProvince();
		// Single-threaded use, so the unsynchronized builder is sufficient.
		StringBuilder content = new StringBuilder();
		for (RegionEntry one : regions) {
			String oneCode = escapeSql(one.getCode());
			content.append("insert into sys_region values('").append(oneCode).append("', '").append(oneCode).append("',null, null,'").append(oneCode).append("', '000000', '").append(escapeSql(one.getName())).append("', '1' );\r\n");
			for (RegionEntry two : one.getSub()) {
				String twoCode = escapeSql(two.getCode());
				content.append("insert into sys_region values('").append(twoCode).append("', '").append(twoCode).append("',null, '").append(twoCode).append("','").append(oneCode).append("', '").append(oneCode).append("', '").append(escapeSql(two.getName())).append("', '0' );\r\n");
				for (RegionEntry three : two.getSub()) {
					String threeCode = escapeSql(three.getCode());
					content.append("insert into sys_region values('").append(threeCode).append("', '").append(threeCode).append("', '").append(threeCode).append("', '").append(twoCode).append("','").append(oneCode).append("', '").append(twoCode).append("', '").append(escapeSql(three.getName())).append("', '0' );\r\n");
				}
			}
		}

		Region.writeFile(content.toString());
		System.out.println("抓取完毕:"+ new Date());
	}

	/**
	 * Reads the province index page and, for every province link, records the
	 * province and descends into its city page.
	 *
	 * <p>The province code is the part of the link's {@code href} before the first
	 * dot, right-padded with {@code '0'} to {@link #CODE_LENGTH} characters.
	 * An I/O failure is logged and ends the crawl with whatever was collected so far.
	 */
	private static void getProvince() {
		try {
			Document doc = fetch(SITE_URL);
			Elements links = doc.select("tr.provincetr").select("a");
			for (Element e : links) {
				RegionEntry region = new RegionEntry();
				String href = e.attr("href");
				String[] arr = href.split("\\.");

				region.setCode(padCode(arr[0]));
				region.setName(e.text());

				String absHref = e.attr("abs:href");
				getCity(absHref, region);
				regions.add(region);
				try {
					// Throttle requests between provinces.
					Thread.sleep(PROVINCE_DELAY_MS);
				} catch (InterruptedException e1) {
					// Restore the flag so callers can observe the interruption, and stop
					// crawling; the provinces gathered so far are still written out.
					Thread.currentThread().interrupt();
					System.err.println("Interrupted while crawling; stopping after " + region.getName());
					break;
				}
			}

		} catch (IOException e) {
			System.err.println("Failed to read province page " + SITE_URL);
			e.printStackTrace();
		}
	}

	/**
	 * Reads a province's city page and appends every city to the province.
	 *
	 * <p>A city named {@code 市辖区} is stored under the province's own name.
	 * Rows that do not carry both a code link and a name link are skipped.
	 *
	 * @param url absolute URL of the city listing page
	 * @param region province entry that receives the cities
	 */
	private static void getCity(String url, RegionEntry region) {
		try {
			Document doc = fetch(url);
			Elements links = doc.select("tr.citytr");
			for (Element e : links) {
				Elements alist = e.select("a");
				if (alist.size() < 2) {
					// Previously this raised an unchecked exception and aborted the whole run.
					System.err.println("Skipping malformed city row on " + url + ": " + e.text());
					continue;
				}
				Element codeE = alist.get(0);
				Element codeN = alist.get(1);
				String name = codeN.text();
				String code = shortCode(codeE.text());

				if ("市辖区".equals(name)) {
					name = region.getName();
				}

				RegionEntry city = new RegionEntry();
				city.setCode(code);
				city.setName(name);

				String absHref = codeE.attr("abs:href");
				getArea(absHref, city);

				region.getSub().add(city);
			}

		} catch (IOException e) {
			System.err.println("Failed to read city page " + url);
			e.printStackTrace();
		}

	}

	/**
	 * Reads a city's county page and appends every county to the city.
	 *
	 * <p>Rows normally hold two links (code, name); rows without links hold the
	 * same data in plain {@code td} cells. In both cases the code is cut to
	 * {@link #CODE_LENGTH} characters so all generated codes have the same form.
	 *
	 * @param url absolute URL of the county listing page
	 * @param region city entry that receives the counties
	 */
	private static void getArea(String url, RegionEntry region) {
		try {
			Document doc = fetch(url);
			Elements links = doc.select("tr.countytr");
			for (Element e : links) {
				Elements cells = e.select("a");
				if (cells.size() < 2) {
					// No (complete) links on this row: fall back to the raw table cells.
					cells = e.select("td");
				}
				if (cells.size() < 2) {
					System.err.println("Skipping malformed county row on " + url + ": " + e.text());
					continue;
				}

				RegionEntry area = new RegionEntry();
				area.setCode(shortCode(cells.get(0).text()));
				area.setName(cells.get(1).text());
				region.getSub().add(area);
			}
		} catch (IOException e) {
			System.err.println("Failed to read county page " + url);
			e.printStackTrace();
		}

	}

	/**
	 * Downloads and parses one page as GBK, always releasing the connection's stream.
	 *
	 * @param url absolute URL to load; also used as the base URI for resolving links
	 * @return the parsed document
	 * @throws IOException if the page cannot be opened or read
	 */
	private static Document fetch(String url) throws IOException {
		// Fully qualified so the file's import list does not need to change.
		java.io.InputStream in = new URL(url).openStream();
		try {
			return Jsoup.parse(in, "GBK", url);
		} finally {
			try {
				in.close();
			} catch (IOException ignored) {
				// Closing is best effort; a failure here must not mask the parse result.
			}
		}
	}

	/**
	 * Right-pads a code with {@code '0'} up to {@link #CODE_LENGTH} characters.
	 * Codes that are already long enough are returned unchanged.
	 */
	private static String padCode(String prefix) {
		StringBuilder code = new StringBuilder(prefix);
		while (code.length() < CODE_LENGTH) {
			code.append('0');
		}
		return code.toString();
	}

	/**
	 * Returns the first {@link #CODE_LENGTH} characters of a scraped code, or the
	 * whole text when it is shorter than that.
	 */
	private static String shortCode(String text) {
		return text.length() > CODE_LENGTH ? text.substring(0, CODE_LENGTH) : text;
	}

	/**
	 * Renders a value for use inside a single-quoted SQL literal by doubling any
	 * single quote. A {@code null} value renders as the text {@code null}, exactly
	 * as appending it to a string builder would.
	 */
	private static String escapeSql(Object value) {
		return String.valueOf(value).replace("'", "''");
	}

}

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值