Java多线程爬虫-2018国家统计局区划和城乡划分代码以及数据库、json文件

 

package com.reptile.area.jsoup;

import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import cn.hutool.core.util.CharsetUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;

/**
 * Parser for the 2018 administrative-division pages published under
 * {@link #COMMON_URL}.
 *
 * <p>Each {@code parseXxx} method downloads one page, reads the table rows of
 * the matching CSS class ({@code citytr}, {@code countytr}, {@code towntr},
 * {@code villagetr}) and, for every level except the village level, descends
 * into the child page of every row. The descent is synchronous, so a single
 * call to {@link #parseCity(String, Node)} fetches the whole subtree below
 * the given page.
 *
 * @author zhang.xiaoming
 */
public class CityParserThread {

	/** Base URL every child page address is built from. */
	public static final String COMMON_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

	/** Address of the index page listing the provinces. */
	public static final String INDEX_URL = COMMON_URL + "index.html";

	/** Path separator used when assembling child page URLs. */
	public static final String LEFT_SLANT = "/";

	/** Charset the pages are decoded with. */
	public static final Charset CHARSET = CharsetUtil.CHARSET_GBK;

	/**
	 * Parses the city rows of the page at {@code url} and attaches them, with
	 * their complete subtrees, to {@code node}.
	 *
	 * @param url  address of the page containing {@code citytr} rows
	 * @param node node that receives the parsed cities as children
	 * @return the same {@code node} instance, with its children replaced
	 */
	public static Node parseCity(String url, Node node) {
		Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
		Elements trs = document.getElementsByClass("citytr");

		List<Node> cities = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			// Same guard as the county/town parsers: a row without exactly two
			// anchors (code + name) cannot be parsed and must not abort the run.
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String cityCode = links.get(0).text().substring(0, 4);
			String cityName = links.get(1).text();
			String childUrl = COMMON_URL + href;

			Node cityNode = Node.builder().name(cityName).code(cityCode).dataFromUrl(url).childNodeUrl(childUrl)
					.nodes(parseCounty(childUrl)).build();

			StaticLog.info("\t市级数据:  {} - {}   ", Thread.currentThread().getName(), cityNode);

			cities.add(cityNode);
		}
		node.setNodes(cities);
		return node;
	}

	/**
	 * Parses the county rows of the page at {@code url}, including the town
	 * and village levels below each county.
	 *
	 * @param url address of the page containing {@code countytr} rows
	 * @return the counties found; rows without exactly two links are skipped
	 */
	public static List<Node> parseCounty(String url) {
		Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
		Elements trs = document.getElementsByClass("countytr");

		List<Node> counties = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String countyCode = links.get(0).text().substring(0, 6);
			String countyName = links.get(1).text();
			// The href is relative to the current page; characters 3-4 of it are
			// used as the first directory segment below COMMON_URL.
			String childUrl = COMMON_URL + href.substring(3, 5) + LEFT_SLANT + href;

			Node countyNode = Node.builder().code(countyCode).name(countyName).dataFromUrl(url).childNodeUrl(childUrl)
					.nodes(parseTowntr(childUrl)).build();
			StaticLog.info("\t\t县级数据:  {} - {}   ", Thread.currentThread().getName(), countyNode);

			counties.add(countyNode);
		}
		return counties;
	}

	/**
	 * Parses the town rows of the page at {@code url}, including the villages
	 * below each town.
	 *
	 * @param url address of the page containing {@code towntr} rows
	 * @return the towns found; rows without exactly two links are skipped
	 */
	public static List<Node> parseTowntr(String url) {
		Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
		Elements trs = document.getElementsByClass("towntr");

		List<Node> towns = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements links = tr.getElementsByTag("a");
			if (links.size() != 2) {
				continue;
			}
			String href = links.get(0).attr("href");
			String towntrCode = links.get(0).text().substring(0, 9);
			String towntrName = links.get(1).text();
			// Characters 3-4 and 5-6 of the relative href supply the two
			// directory segments that precede it below COMMON_URL.
			String childUrl = COMMON_URL + href.substring(3, 5) + LEFT_SLANT + href.substring(5, 7) + LEFT_SLANT
					+ href;

			Node towntrNode = Node.builder().name(towntrName).code(towntrCode).dataFromUrl(url).childNodeUrl(childUrl)
					.nodes(parseVillagetr(childUrl)).build();

			towns.add(towntrNode);
		}
		return towns;
	}

	/**
	 * Parses the village rows of the page at {@code url}. This is the lowest
	 * level: the resulting nodes have neither a child URL nor children.
	 *
	 * @param url address of the page containing {@code villagetr} rows
	 * @return the villages found; rows without exactly three cells are skipped
	 */
	public static List<Node> parseVillagetr(String url) {
		Document document = Jsoup.parse(HttpUtil.get(url, CHARSET));
		Elements trs = document.getElementsByClass("villagetr");

		List<Node> villages = new LinkedList<Node>();
		for (Element tr : trs) {
			Elements tds = tr.getElementsByTag("td");
			if (tds.size() != 3) {
				continue;
			}
			// First cell holds the code, third cell the name; the middle cell is not used.
			String villagetrCode = tds.get(0).text();
			String villagetrName = tds.get(2).text();

			villages.add(Node.builder().code(villagetrCode).name(villagetrName).dataFromUrl(url).build());
		}
		return villages;
	}
}

实体类:

package com.reptile.area.jsoup;

import java.util.List;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

/**
 * One entry of the administrative-division tree (province, city, county,
 * town or village). Accessors, builder, constructors and {@code toString}
 * are generated by Lombok.
 */
@Data
@ToString
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class Node {

    /** Display name of the division as read from the page. */
    private String name;

    /** Division code as read from the page. */
    private String code;
    
    /** URL of the page listing this node's children; left unset by the village parser. */
    private String childNodeUrl;
    
    /** URL of the page this node was parsed from. */
    private String dataFromUrl;

    /** Child divisions one level below this node. */
    private List<Node> nodes;
}

测试:

package com.reptile.area;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.reptile.area.jsoup.CityParserThread;
import com.reptile.area.jsoup.Node;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.date.DateUtil;
import cn.hutool.core.date.TimeInterval;
import cn.hutool.http.HttpUtil;
import cn.hutool.log.StaticLog;

public class CityParserThreadTest {

	public static List<Node> province() {
		String htmlStr = HttpUtil.get(CityParserThread.INDEX_URL, CityParserThread.CHARSET);
		Document document = Jsoup.parse(htmlStr);

		// 获取 class='provincetr' 的元素
		Elements elements = document.getElementsByClass("provincetr");
		List<Node> provinces = new LinkedList<Node>();
		for (Element element : elements) {
			// 获取 elements 下属性是 href 的元素
			Elements links = element.getElementsByAttribute("href");
			for (Element link : links) {
				String provinceName = link.text();
				String href = link.attr("href");
				String provinceCode = href.substring(0, 2);

				Node provinceNode = Node.builder().code(provinceCode).name(provinceName)
						.dataFromUrl(CityParserThread.INDEX_URL).childNodeUrl(CityParserThread.COMMON_URL + href)
						.build();
				provinces.add(provinceNode);
			}
		}
		return provinces;
	}

	public static void main(String[] args) {

		TimeInterval timer = DateUtil.timer();
		// -------这是执行过程--------------

		List<Node> provinces = province();
		if (CollUtil.isNotEmpty(provinces)) {
			List<Node> nodes = new LinkedList<Node>();
			List<Future<Node>> futureList = new ArrayList<Future<Node>>();
			ExecutorService pool = Executors.newFixedThreadPool(provinces.size());//根据实际情况定义大小
			for (Node province : provinces) {
				futureList.add(pool.submit(new TaskCallable(province.getChildNodeUrl(), province.getName(), province)));
			}
			pool.shutdown(); // 不允许再想线程池中增加线程
			// 判断是否所有线程已经执行完毕
			try {
				boolean isFinish = pool.awaitTermination(1, TimeUnit.HOURS);
				StaticLog.info("==========================");
				// 如果没有执行完
				if (!isFinish) {
					// 线程池执行结束 不在等待线程执行完毕,直接执行下面的代码
					pool.shutdownNow();
				}
				// 2.结果归集,用迭代器遍历futureList,高速轮询(模拟实现了并发),任务完成就移除
				while (futureList.size() > 0) {
					Iterator<Future<Node>> iterable = futureList.iterator();
					// 遍历一遍
					while (iterable.hasNext()) {
						Future<Node> future = iterable.next();
						// 如果任务完成取结果,否则判断下一个任务是否完成
						if (future.isDone() && !future.isCancelled()) {
							// 获取结果
							nodes.add(future.get());
						} else {
							Thread.sleep(1);// 避免CPU高速运转,这里休息1毫秒,CPU纳秒级别
						}
					}
				}
			} catch (InterruptedException e) {
				e.printStackTrace();
			} catch (ExecutionException e) {
				e.printStackTrace();
			}
			// 只给线程池中的线程1小时,然后就继续执行
			StaticLog.info("it is ok !!!");
			
			SqlJsonWriter.jsonWriter(nodes, "F://20190314area.json");

			SqlJsonWriter.sqlWriter(nodes, "F://20190314area.sql");
			
			// ---------------------------------
			long interval = timer.interval();// 花费毫秒数
			long intervalMinute = timer.intervalMinute();// 花费分钟数
			StaticLog.info("本次程序执行 花费毫秒数: {} ,   花费分钟数:{} . ", interval, intervalMinute);
		}
	}
}

/**
 * Unit of work submitted to the thread pool: crawls everything below one
 * page and hands back the node it was given, filled with the parsed children.
 */
class TaskCallable implements Callable<Node> {

	/** Node that receives the parsed children. */
	private Node target;

	/** Name of the area, used for logging only. */
	private String label;

	/** Address of the page to start parsing from. */
	private String pageUrl;

	/**
	 * @param url      address of the page to parse
	 * @param areaName name of the area, used in the log line
	 * @param node     node to attach the parsed children to
	 */
	public TaskCallable(String url, String areaName, Node node) {
		this.target = node;
		this.label = areaName;
		this.pageUrl = url;
	}

	/**
	 * Logs which thread handles which area, then delegates to
	 * {@code CityParserThread.parseCity}.
	 */
	@Override
	public Node call() throws Exception {
		String worker = Thread.currentThread().getName();
		StaticLog.info("当前线程: {} ,  地区名称: {} ,  请求地址:{} 。 ", worker, label, pageUrl);
		Node filled = CityParserThread.parseCity(pageUrl, target);
		return filled;
	}
}

sql生成:

package com.reptile.area;

import java.util.ArrayList;
import java.util.List;

import com.github.stuxuhai.jpinyin.PinyinException;
import com.github.stuxuhai.jpinyin.PinyinFormat;
import com.github.stuxuhai.jpinyin.PinyinHelper;
import com.reptile.area.jsoup.Node;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.file.FileReader;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.core.text.StrFormatter;
import cn.hutool.json.JSONUtil;
import cn.hutool.log.StaticLog;

public class SqlJsonWriter {
	
	public static void sqlWriter(List<Node> nodes, String path) {
		if (CollUtil.isNotEmpty(nodes)) {
			FileWriter sqlWriter = new FileWriter(FileUtil.touch(path));
			sqlWriter.writeLines(nodes);
			StaticLog.info("SQL文件保存成功...");
		}
	}
	
	public static void jsonWriter(List<Node> nodes, String path) {
		if (CollUtil.isNotEmpty(nodes)) {
			// json数据写入到文件
			FileWriter jsonWriter = new FileWriter(FileUtil.touch(path));
			jsonWriter.write(JSONUtil.toJsonStr(nodes));
			StaticLog.info("JSON文件保存成功...");
		}
	}
	
	
	/**
	 * *实体转sql数据
	 * 
	 * @param provinces 省市县数据
	 */
	private static List<String> buildSql(List<Node> provinces) {
		List<String> sqls = null;
		if (CollUtil.isNotEmpty(provinces)) {
			sqls = new ArrayList<>();
			for (Node province : provinces) {
				sqls.add(initSql(province.getName(), province.getCode(), province.getDataFromUrl(), province.getChildNodeUrl(), "", 1));
				buildCitySql(sqls, province.getNodes(), province.getCode());
			}
		}
		return sqls;
	}

	private static void buildCitySql(List<String> sqls, List<Node> cities, String parentCode) {
		if (CollUtil.isNotEmpty(cities)) {
			for (Node city : cities) {
				sqls.add(initSql(city.getName(), city.getCode(), city.getDataFromUrl(), city.getChildNodeUrl(), parentCode, 2));
				buildCountySql(sqls, city.getNodes(), city.getCode());
			}
		}
	}

	private static void buildCountySql(List<String> sqls, List<Node> counties, String parentCode) {
		if (CollUtil.isNotEmpty(counties)) {
			for (Node county : counties) {
				sqls.add(initSql(county.getName(), county.getCode(), county.getDataFromUrl(), county.getChildNodeUrl(), parentCode, 3));
				buildTowntrSql(sqls, county.getNodes(), county.getCode());
			}
		}
	}

	private static void buildTowntrSql(List<String> sqls, List<Node> towies, String parentCode) {
		if (CollUtil.isNotEmpty(towies)) {
			for (Node towntr : towies) {
				sqls.add(initSql(towntr.getName(), towntr.getCode(), towntr.getDataFromUrl(), towntr.getChildNodeUrl(), parentCode, 4));
				buildVillagetrSql(sqls, towntr.getNodes(), towntr.getCode());
			}
		}
	}

	private static void buildVillagetrSql(List<String> sqls, List<Node> vilies, String parentCode) {
		if (CollUtil.isNotEmpty(vilies)) {
			for (Node villagetr : vilies) {
				sqls.add(initSql(villagetr.getName(), villagetr.getCode(), villagetr.getDataFromUrl(), villagetr.getChildNodeUrl(), parentCode, 5));
			}
		}
	}

	/**
	 ** 初始化sql语句
	 */
	private static String initSql(String name, String code, String dataFromUrl, String childNodeUrl, String parentCode, Integer depth) {
		final String SQL = "insert into area(`name`, `code`, full_spell, easy_spell, initial, parent_code, depth, data_from_url, child_node_url) values ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');";

		String insertSql = null;
		try {
			insertSql = StrFormatter.format(SQL, name, code,
					PinyinHelper.convertToPinyinString(name, "", PinyinFormat.WITHOUT_TONE),
					PinyinHelper.getShortPinyin(name), PinyinHelper.getShortPinyin(name).substring(0, 1).toString(),
					parentCode, depth, dataFromUrl, childNodeUrl);
			StaticLog.info(insertSql);
		} catch (PinyinException e) {
			StaticLog.error("拼音解析失败:{} .", e.getMessage());
		}
		return insertSql;
	}
}

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值