爬取网址 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
因为数据量比较大,如果全部存储为一个 JSON 文件,会导致内存溢出,
所以按照每个省(市)分别进行存储。
同时,因为数据需要通过远程访问链接获取,所以会将已经拿到的网页缓存到本地,以便下次使用。
package com.witwicky.jsoup;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.witwicky.vo.CrawlingVo;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
/**
 * Crawls the 2018 administrative-division code pages published under
 * {@link #BASE_URL} and writes one JSON tree per province.
 *
 * <p>Every downloaded page is cached as a file in {@link #BASE_SAVE_DIR}; a page
 * that is already cached is parsed from disk instead of being fetched again.
 * For each province two files are written to {@link #RESULT_SAVE_DIR}: a compact
 * {@code <name>.json} and an indented {@code <name>_pretty.json}. Each file holds
 * a JSON array with exactly one element, the tree of that province, so that
 * memory use and file size stay bounded by a single province.
 */
public class Crawling {
    private static final String BASE_SAVE_DIR = "E:\\工作\\extract";
    private static final String RESULT_SAVE_DIR = "E:\\工作\\extract_result";

    /** Root under which all crawled pages live; page paths are relative to it. */
    private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /** Row selectors for the levels below a province, from city down to village. */
    private static final String[] LEVEL_SELECTORS = {"tr.citytr", "tr.countytr", "tr.towntr", "tr.villagetr"};

    /** Shared generator for the random delay between remote requests. */
    private static final Random RANDOM = new Random();

    /**
     * Entry point: walks every province linked from {@code index.html} and saves
     * its tree as soon as it has been crawled.
     *
     * @param args ignored
     * @throws Exception if a page cannot be fetched or parsed, or the thread is interrupted
     */
    public static void main(String[] args) throws Exception {
        Gson gsonPretty = new GsonBuilder().setPrettyPrinting().create();
        Gson gsonSimple = new GsonBuilder().create();

        for (Element provinceLink : getElements("index.html", "tr.provincetr > td > a")) {
            String provincePage = provinceLink.attr("href");
            String provinceName = provinceLink.text();
            int dot = provincePage.indexOf('.');
            String provinceCode = dot < 0 ? provincePage : provincePage.substring(0, dot);

            // Only the current province is serialized; nothing is accumulated across
            // iterations, so earlier provinces can be garbage-collected.
            List<CrawlingVo> provinceTree = new ArrayList<CrawlingVo>();
            provinceTree.add(new CrawlingVo(provinceCode, provinceName, crawlChildren(provincePage, 0)));

            save2File(gsonSimple.toJson(provinceTree), provinceName + ".json", RESULT_SAVE_DIR);
            save2File(gsonPretty.toJson(provinceTree), provinceName + "_pretty.json", RESULT_SAVE_DIR);
            System.out.println(provinceName + " is complete!");
        }
    }

    /**
     * Reads the rows of one page and recursively builds the nodes below it.
     *
     * @param pagePath path of the page relative to {@link #BASE_URL}
     * @param level    index into {@link #LEVEL_SELECTORS} of the rows expected on the page
     * @return the nodes found on the page, each with its own descendants attached
     * @throws IOException          if a page cannot be fetched or read from the cache
     * @throws InterruptedException if the thread is interrupted during the request delay
     */
    private static List<CrawlingVo> crawlChildren(String pagePath, int level)
            throws IOException, InterruptedException {
        List<CrawlingVo> nodes = new ArrayList<CrawlingVo>();
        boolean lastLevel = level == LEVEL_SELECTORS.length - 1;

        for (Element row : getElements(pagePath, LEVEL_SELECTORS[level])) {
            if (lastLevel) {
                // Rows of the last level are read as plain cells: code in column 0, name in column 2.
                nodes.add(new CrawlingVo(row.select("td:eq(0)").text(), row.select("td:eq(2)").text(),
                        new ArrayList<CrawlingVo>()));
                continue;
            }

            String href = row.select("td:eq(1) a").attr("href");
            // A row without a link has no sub-page; keep it as a node with no children.
            List<CrawlingVo> children = href.isEmpty()
                    ? new ArrayList<CrawlingVo>()
                    : crawlChildren(resolve(pagePath, href), level + 1);
            nodes.add(new CrawlingVo(cellText(row, 0), cellText(row, 1), children));
        }
        return nodes;
    }

    /**
     * Returns the text of the link in the given cell, or the cell's own text when
     * the cell contains no link (otherwise such rows would get an empty string).
     */
    private static String cellText(Element row, int column) {
        String linked = row.select("td:eq(" + column + ") a").text();
        return linked.isEmpty() ? row.select("td:eq(" + column + ")").text() : linked;
    }

    /**
     * Resolves a link found on a page against the directory of that page,
     * e.g. {@code resolve("11/1101.html", "01/110101.html")} gives {@code "11/01/110101.html"}.
     */
    private static String resolve(String pagePath, String href) {
        int lastSlash = pagePath.lastIndexOf('/');
        return lastSlash < 0 ? href : pagePath.substring(0, lastSlash + 1) + href;
    }

    /**
     * Loads a page, from the local cache when present and otherwise from the
     * remote site, and returns the elements matching the selector.
     *
     * <p>A remote fetch is preceded by a random pause of 1 to 5 seconds and its
     * result is stored in the cache directory.
     *
     * @param u        page path relative to {@link #BASE_URL}
     * @param selector CSS selector applied to the loaded document
     * @return the matching elements (possibly empty)
     * @throws IOException          if the cached file cannot be read or the request fails
     * @throws InterruptedException if the thread is interrupted during the pause
     */
    private static Elements getElements(String u, String selector) throws IOException, InterruptedException {
        String url = BASE_URL + u;
        String cacheName = cleanName(url);
        File localFile = new File(BASE_SAVE_DIR, cacheName);

        Document page;
        if (localFile.exists()) {
            // Cache files are written as UTF-8 by save2File, so read them back the same way.
            page = Jsoup.parse(localFile, "UTF-8");
        } else {
            Thread.sleep((RANDOM.nextInt(5) + 1) * 1000L);
            page = Jsoup.connect(url).get();
            save2File(page.toString(), cacheName, BASE_SAVE_DIR);
        }
        return page.select(selector);
    }

    /**
     * Turns a URL into a flat file name by replacing every backslash, slash,
     * dot and colon with an underscore.
     */
    private static String cleanName(String name) {
        // Single slashes are replaced one by one, so "//" needs no separate rule.
        return name
                .replace("\\", "_")
                .replace("/", "_")
                .replace(".", "_")
                .replace(":", "_");
    }

    /**
     * Writes text to {@code saveDir/fileName} as UTF-8, creating the directory if needed.
     *
     * <p>An existing file is never overwritten. Failures are reported on standard
     * error and do not propagate to the caller.
     *
     * @param content  text to write
     * @param fileName name of the file inside {@code saveDir}
     * @param saveDir  target directory
     */
    private static void save2File(String content, String fileName, String saveDir) {
        File dir = new File(saveDir);
        if (!dir.exists() && !dir.mkdirs()) {
            System.err.println("Cannot create directory " + dir.getAbsolutePath() + ", skipping " + fileName);
            return;
        }
        File file = new File(dir, fileName);
        if (file.exists()) {
            return;
        }
        try {
            OutputStream out = new BufferedOutputStream(new FileOutputStream(file));
            try {
                // Explicit charset: the platform default may differ from the UTF-8 used when reading.
                out.write(content.getBytes("UTF-8"));
                out.flush();
            } finally {
                out.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            // Remove a partially written file so a later run does not treat it as a valid cache entry.
            if (file.exists() && !file.delete()) {
                System.err.println("Cannot remove incomplete file " + file.getAbsolutePath());
            }
        }
    }
}
package com.witwicky.vo;
import java.util.List;
/**
 * One node of a value/label tree.
 *
 * <p>The crawler in this file fills {@code value} with a division code,
 * {@code label} with the corresponding name and {@code children} with the
 * nodes of the next lower level.
 */
public class CrawlingVo {
    /** Identifier of the node. */
    private String value;
    /** Display text of the node. */
    private String label;
    /** Nodes directly below this one. */
    private List<CrawlingVo> children;

    /** Creates a node whose three properties are all {@code null}. */
    public CrawlingVo() {
        this(null, null, null);
    }

    /**
     * Creates a fully populated node.
     *
     * @param value    identifier of the node
     * @param label    display text of the node
     * @param children nodes directly below this one; the list is stored as given, not copied
     */
    public CrawlingVo(String value, String label, List<CrawlingVo> children) {
        this.value = value;
        this.label = label;
        this.children = children;
    }

    /** @return the identifier of the node */
    public String getValue() {
        return this.value;
    }

    /** @return the display text of the node */
    public String getLabel() {
        return this.label;
    }

    /** @return the stored child list itself (no copy is made) */
    public List<CrawlingVo> getChildren() {
        return this.children;
    }

    /** @param value new identifier of the node */
    public void setValue(String value) {
        this.value = value;
    }

    /** @param label new display text of the node */
    public void setLabel(String label) {
        this.label = label;
    }

    /** @param children new child list; stored as given, not copied */
    public void setChildren(List<CrawlingVo> children) {
        this.children = children;
    }
}
\\审判系统
[\\Shěnpàn xìtǒng]
\\ trial system