官网信息:
HttpClient:http://hc.apache.org/httpcomponents-client-5.0.x/index.html (注意:下面的示例代码导入的是 org.apache.http.* 包,即 HttpClient 4.x 的 API;5.x 的包名为 org.apache.hc.client5.*)
Jsoup:https://jsoup.org/
功能:
抓取 最新县及县以上行政区划代码 http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html
代码实例:
package zhua_qu_shu_qu;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.*;
import org.apache.http.impl.client.*;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads one fixed web page, extracts the text of every element carrying
 * the CSS class {@code MsoNormal}, and writes those texts line by line to a
 * local file while echoing them to standard output.
 */
public class GrabUtil {

    /** Address of the page that is fetched. */
    private static final String SOURCE_URL =
            "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html";

    /** File the extracted text is written to; overwritten on every run. */
    private static final String OUTPUT_PATH = "F:/cityDB-2.txt";

    /** Windows-style line break appended after every extracted text. */
    private static final byte[] LINE_BREAK = "\r\n".getBytes(StandardCharsets.UTF_8);

    // The four fields below are not read or written anywhere in this class;
    // they are kept because they are package-visible and other code may use them.

    /** Parent of a county, district or county-level city. */
    String ctiyParent = "";

    /** Parent of a prefecture-level city. */
    String parent = "";

    /** Sort order of provinces and municipalities. */
    int cityNum = 0;

    /**
     * Sort order of prefecture-level cities, counties, districts and
     * county-level cities (meant to be obtained by slicing the city code).
     */
    String cityNum1 = "";

    /**
     * Program entry point: creates a {@code GrabUtil} and runs one grab.
     *
     * @param args command-line arguments, unused
     */
    public static void main(String[] args) {
        GrabUtil gu = new GrabUtil();
        gu.grabData();
    }

    /**
     * Fetches the page at the fixed source URL and hands its HTML to
     * {@link #extractData(String)}.
     *
     * <p>Failures are reported with a stack trace on standard error and are
     * not propagated to the caller.
     */
    public void grabData() {
        String html = ""; // stays empty when the response carries no body

        // try-with-resources closes the response first and then the client,
        // and closes only what was actually opened. The previous explicit
        // finally block called response.close() on a null reference whenever
        // execute() itself failed, which hid the real error and left the
        // client open.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
                CloseableHttpResponse response = httpclient.execute(new HttpGet(SOURCE_URL))) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                html = EntityUtils.toString(entity, "UTF-8");
            }
            // Extraction happens while the response is still open, as before.
            extractData(html);
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the HTML with Jsoup and writes the text of every
     * {@code MsoNormal} element to the output file, one element per line,
     * printing each text to standard output as well.
     *
     * @param html the page markup to parse
     * @throws FileNotFoundException if the output file cannot be opened for writing
     * @throws IOException if writing to or closing the output file fails
     */
    private void extractData(String html) throws FileNotFoundException,
            IOException {
        Document document = Jsoup.parse(html);
        // Select the page data by CSS class.
        Elements elementsByClass = document.getElementsByClass("MsoNormal");

        // The stream is closed (and thereby flushed) even when a write fails;
        // previously it was never closed at all.
        try (OutputStream out = new FileOutputStream(new File(OUTPUT_PATH))) {
            for (Element e : elementsByClass) {
                String text = analyticData(e);
                // Encode explicitly as UTF-8 so the file content does not
                // depend on the platform default charset.
                out.write(text.getBytes(StandardCharsets.UTF_8));
                out.write(LINE_BREAK);
                System.out.println(text);
            }
        }
    }

    /**
     * Turns one element into the string that is written out. Currently this
     * is just the element's text; it is the place for any further processing.
     *
     * @param e the element to convert
     * @return the text contained in the element
     */
    private String analyticData(Element e) {
        return e.text();
    }
}
输出结果: