package com.test;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls the 2016 administrative-division code listing published under
 * {@link #ROOT_URL} and writes the divisions of one province (Guangdong) to a
 * text file, one record per line in the form {@code code:name:level}.
 *
 * <p>Levels follow {@link #cssMap}: 1 = province, 2 = city, 3 = county,
 * 4 = town, 5 = village. The crawl descends no deeper than {@link #MAX_LEVEL}.
 * URLs that fail to load are appended to a second file so they can be retried.
 *
 * <p>All state is static and unsynchronized; the class is written to be run
 * from a single thread via {@link #main(String[])}.
 */
public class DressMessage {

    /** Maps a division level to the CSS class of the table rows that list that level. */
    public static Map<Integer, String> cssMap = new HashMap<>();

    /** Writer for the crawled division records; opened by {@link #initFile()}. */
    public static BufferedWriter bufferedWriter = null;

    /** Writer for URLs that could not be fetched; opened by {@link #initFile()}. */
    public static BufferedWriter bufferedWriter2 = null;

    /** Index page that lists all provinces. */
    private static final String ROOT_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/";

    /** Link text of the only province that is crawled. */
    private static final String TARGET_PROVINCE = "广东省";

    /** Code written for the target province (the index page row carries no code column). */
    private static final int TARGET_PROVINCE_CODE = 440000;

    /** Row name that is skipped when writing records. */
    private static final String SKIPPED_NAME = "市辖区";

    /** Deepest level that is crawled (4 = town). */
    private static final int MAX_LEVEL = 4;

    /** Pause before each page request, in milliseconds. */
    private static final long REQUEST_DELAY_MILLIS = 500;

    /** Connection/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100 * 1000;

    static {
        cssMap.put(1, "provincetr"); // province
        cssMap.put(2, "citytr");     // city
        cssMap.put(3, "countytr");   // county
        cssMap.put(4, "towntr");     // town
        cssMap.put(5, "villagetr");  // village
    }

    /**
     * Opens both output files in append mode. A file that cannot be opened
     * leaves its writer {@code null}; the failure is printed, not thrown.
     */
    private static void initFile() {
        bufferedWriter = openAppendWriter("f:\\广东省.txt");
        bufferedWriter2 = openAppendWriter("f:\\timeOutUrl.txt");
    }

    /**
     * Opens {@code path} for appending.
     *
     * @param path file to open
     * @return a buffered writer, or {@code null} if the file could not be opened
     */
    private static BufferedWriter openAppendWriter(String path) {
        try {
            // NOTE(review): FileWriter encodes with the platform default charset,
            // exactly as the original code did; the output contains Chinese text.
            return new BufferedWriter(new FileWriter(new File(path), true));
        } catch (java.io.IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Entry point: loads the province index, finds the target province, writes
     * its record and recursively crawls its sub-divisions.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        int level = 1;
        initFile();
        try {
            if (bufferedWriter == null) {
                // Without the data file every record write would fail; do not crawl.
                System.out.println("Cannot open the output file, aborting.");
                return;
            }
            // Fetch the index page listing every province.
            Document indexPage = connect(ROOT_URL);
            if (indexPage == null) {
                // connect() has already reported and logged the failing URL.
                System.out.println("Cannot load the province index page, aborting.");
                return;
            }
            Elements provinceRows = indexPage.select("tr." + cssMap.get(level));
            for (Element provinceRow : provinceRows) {
                for (Element province : provinceRow.select("a")) {
                    if (TARGET_PROVINCE.equals(province.select("a").last().text())) {
                        printProvince(province, level);
                        parseNextLevel(province, level + 1);
                    }
                }
            }
        } finally {
            // Release both files even when the crawl ends with an exception.
            closeStream();
        }
        System.out.println("ok");
    }

    /**
     * Fetches the page linked from {@code parentElement}, writes every row of
     * the given level and recurses into each row's link until
     * {@link #MAX_LEVEL} is reached.
     *
     * @param parentElement anchor whose {@code href} points at the page to crawl
     * @param level level of the rows expected on that page
     */
    private static void parseNextLevel(Element parentElement, int level) {
        try {
            // Throttle requests to the remote server.
            Thread.sleep(REQUEST_DELAY_MILLIS);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to callers and stop descending.
            Thread.currentThread().interrupt();
            return;
        }
        String url = parentElement.attr("abs:href");
        if (url.isEmpty()) {
            // connect() rejects empty URLs with an exception that would abort the whole crawl.
            System.out.println("No resolvable link at level " + level + ", skipped.");
            return;
        }
        Document doc = connect(url);
        if (doc == null) {
            return;
        }
        // Each matching element is one table row.
        Elements rows = doc.select("tr." + cssMap.get(level));
        for (Element row : rows) {
            printInfo(row, level);
            // Rows without an anchor have no sub-page to follow.
            Elements links = row.select("a");
            if (!links.isEmpty() && level + 1 <= MAX_LEVEL) {
                parseNextLevel(links.last(), level + 1);
            }
        }
    }

    /**
     * Writes one record line to the data file.
     *
     * <p>The name is the text of the row's last cell; the code is a prefix of
     * the first cell's text (6 characters for levels 2 and 3, 9 for level 4).
     * Rows named {@link #SKIPPED_NAME} are not written. Any failure is printed
     * together with the level and does not stop the crawl.
     *
     * @param element table row that was crawled
     * @param level division level of the row
     */
    private static void printInfo(Element element, int level) {
        try {
            Elements cells = element.select("td");
            if (cells.isEmpty()) {
                throw new IllegalStateException("Row has no <td> cells");
            }
            String name = cells.last().text();
            if (SKIPPED_NAME.equals(name)) {
                return;
            }
            String number;
            if (level == 2 || level == 3) {
                number = cells.first().text().substring(0, 6);
            } else if (level == 4) {
                number = cells.first().text().substring(0, 9);
            } else {
                throw new IllegalArgumentException("Unsupported level: " + level);
            }
            bufferedWriter.write(number + ":" + name + ":" + level);
            bufferedWriter.newLine();
            bufferedWriter.flush();
        } catch (Exception e) {
            // Best effort: report the bad row and carry on with the next one.
            System.out.println("错误级别:" + level);
            e.printStackTrace();
        }
    }

    /**
     * Writes the province record line, using the fixed province code.
     *
     * @param element anchor element of the province
     * @param level division level to record
     */
    private static void printProvince(Element element, int level) {
        try {
            bufferedWriter.write(TARGET_PROVINCE_CODE + ":" + element.select("a").last().text() + ":" + level);
            bufferedWriter.newLine();
            bufferedWriter.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends a URL that failed to load to the failed-URL file.
     *
     * @param url the URL that could not be fetched
     */
    private static void printException(String url) {
        try {
            bufferedWriter2.write(url);
            bufferedWriter2.newLine();
            bufferedWriter2.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches and parses the page at {@code url}.
     *
     * @param url address to fetch; must be non-empty
     * @return the parsed document, or {@code null} if the request failed (the
     *         URL is then printed and appended to the failed-URL file)
     * @throws IllegalArgumentException if {@code url} is {@code null} or empty
     */
    private static Document connect(String url) {
        if (url == null || url.isEmpty()) {
            throw new IllegalArgumentException("The input url('" + url + "') is invalid!");
        }
        try {
            return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
        } catch (Exception e) {
            System.out.println(url);
            printException(url);
            e.printStackTrace();
        }
        return null;
    }

    /** Closes both output writers and resets the fields to {@code null}. */
    private static void closeStream() {
        closeQuietly(bufferedWriter);
        bufferedWriter = null;
        closeQuietly(bufferedWriter2);
        bufferedWriter2 = null;
    }

    /**
     * Closes {@code writer} if it is non-null, printing any close failure.
     *
     * @param writer writer to close; may be {@code null}
     */
    private static void closeQuietly(BufferedWriter writer) {
        if (writer == null) {
            return;
        }
        try {
            writer.close();
        } catch (java.io.IOException e) {
            e.printStackTrace();
        }
    }
}
运行结果如下图
使用 jsoup 爬取省/市/区/县/镇/乡的行政区划信息
最新推荐文章于 2024-05-24 13:32:50 发布