本来想去看看百度地图API有没有根据地址直接获取邮编的功能,结果发现可能百度觉得有利可图便把ZIP的返回结果给去掉了..
因为我的excel表中的地址写得不是很规范,所以提出了以下思路:
- 先根据excel表中的模糊地址找经纬度
- 根据经纬度找到省市县(区)
- 根据省市县(区)字符串再去百度的邮编查询页面爬取邮编
直接贴代码:
/**
 * Resolves the address stored in {@code data} to its administrative components by calling
 * the Baidu geocoder endpoint twice.
 *
 * <p>The first request sends the raw address and reads {@code result.location} (lng/lat) from
 * the reply. The second request sends those coordinates back as {@code location=lat,lng} and
 * reads {@code result.addressComponent}.
 *
 * @param data record parsed from the Excel sheet; only {@code getAddress()} is read
 * @return a list holding province, city, district and street (in that order), or {@code null}
 *         when the address is missing, a reply is empty or lacks the expected objects, or a
 *         reply's {@code status} is not {@code "0"}
 * @throws IOException if one of the HTTP requests fails
 */
private static List<String> getLocationFromBaidu(data data) throws IOException {
    String ak = "你的ak";
    String url = "http://api.map.baidu.com/geocoder/v2/";

    // Without an address there is nothing to geocode; the request builder would only fail later.
    if (data == null || data.getAddress() == null) {
        return null;
    }

    HTTPParam output = new HTTPParam("output", "json");
    HTTPParam Ak = new HTTPParam("ak", ak);

    // Step 1: address -> coordinates.
    List<HTTPParam> params = new ArrayList<HTTPParam>();
    params.add(output);
    params.add(Ak);
    params.add(new HTTPParam("address", data.getAddress()));
    JSONObject getObj = JSONObject.parseObject(HttpSend.sendGet(url, params));
    // Constant-first comparison: a reply without "status" must not raise a NullPointerException.
    if (getObj == null || !"0".equals(getObj.getString("status"))) {
        return null;
    }
    JSONObject res = getObj.getJSONObject("result");
    JSONObject location = (res == null) ? null : res.getJSONObject("location");
    if (location == null) {
        return null;
    }
    String lng = location.getString("lng");
    String lat = location.getString("lat");

    // Step 2: coordinates -> province / city / district / street.
    params = new ArrayList<HTTPParam>();
    params.add(output);
    params.add(Ak);
    params.add(new HTTPParam("location", lat + "," + lng));
    JSONObject getObj1 = JSONObject.parseObject(HttpSend.sendGet(url, params));
    if (getObj1 == null || !"0".equals(getObj1.getString("status"))) {
        return null;
    }
    JSONObject jsonObject = getObj1.getJSONObject("result");
    JSONObject addressComponent = (jsonObject == null) ? null : jsonObject.getJSONObject("addressComponent");
    if (addressComponent == null) {
        return null;
    }

    // NOTE(review): the values are URL-decoded exactly as before; whether the service ever
    // returns percent-encoded text cannot be confirmed from this file.
    List<String> strings = new ArrayList<String>();
    strings.add(URLDecoder.decode(addressComponent.getString("province"), "UTF-8"));
    strings.add(URLDecoder.decode(addressComponent.getString("city"), "UTF-8"));
    strings.add(URLDecoder.decode(addressComponent.getString("district"), "UTF-8"));
    strings.add(URLDecoder.decode(addressComponent.getString("street"), "UTF-8"));
    return strings;
}
/**
 * Looks up a postal code by POSTing the concatenated address to Baidu's postal-code page and
 * scraping the returned HTML.
 *
 * <p>The form field {@code wd} carries {@code province + city + district + street}; both the
 * request body and the response are handled as GBK.
 *
 * @param url      ignored: the value is overwritten with the fixed endpoint below; the
 *                 parameter is kept only so existing callers keep compiling
 * @param province province part of the query
 * @param city     city part of the query
 * @param district district part of the query
 * @param street   street part of the query
 * @return the own text of the first {@code td} inside the first {@code table-list} element for
 *         which {@code isInteger} is true, or {@code "-1"} when the page has no such element
 *         or no such cell
 * @throws IOException if the request fails or the response cannot be read
 */
public static String sendPost(String url, String province, String city, String district, String street) throws IOException {
    url = "http://opendata.baidu.com/post/s";
    HttpPost httppost = new HttpPost(url);
    // NOTE(review): the client is never closed; doing so needs a closeable client type that is
    // not visibly imported in this snippet.
    HttpClient httpclient = HttpClients.createDefault();

    String html;
    try {
        List<NameValuePair> params = new ArrayList<NameValuePair>();
        params.add(new BasicNameValuePair("wd", province + city + district + street));
        httppost.setEntity(new UrlEncodedFormEntity(params, "GBK"));
        HttpResponse response = httpclient.execute(httppost);
        HttpEntity entity = response.getEntity();
        html = EntityUtils.toString(entity, "GBK");
    } finally {
        // Abort on the failure path as well, not only after a successful read.
        httppost.abort();
    }

    Document document = Jsoup.parse(html);
    Elements divs = document.getElementsByClass("table-list");
    Element table = divs.first();
    if (table == null) {
        // The page carries no result table: report "not found" instead of failing with an NPE.
        return "-1";
    }
    for (Element link : table.getElementsByTag("td")) {
        String text = link.ownText();
        if (isInteger(text)) {
            return text;
        }
    }
    return "-1";
}
辅助类:
/**
 * One row parsed from the Excel sheet: an address with its postal code and, optionally, the
 * longitude/latitude of that address. All values are kept as strings.
 */
public class data {
    private String address;
    private String code;
    private String lng;
    private String lat;

    /** Used by {@link #ofCoordinates(String, String)}; leaves every field {@code null}. */
    private data() {
    }

    /**
     * Creates a row from an address and its postal code; the coordinates stay {@code null}.
     *
     * @param address address text
     * @param code    postal code
     */
    public data(String address, String code) {
        this.address = address;
        this.code = code;
    }

    /**
     * Creates a row that only carries coordinates; address and code stay {@code null}.
     *
     * <p>This replaces a second {@code (String, String)} constructor, which had the same
     * signature as the address/code constructor and therefore could not compile.
     *
     * @param lng longitude
     * @param lat latitude
     * @return a new row holding the given coordinates
     */
    public static data ofCoordinates(String lng, String lat) {
        data row = new data();
        row.lng = lng;
        row.lat = lat;
        return row;
    }

    public String getLng() {
        return lng;
    }

    public void setLng(String lng) {
        this.lng = lng;
    }

    public String getLat() {
        return lat;
    }

    public void setLat(String lat) {
        this.lat = lat;
    }

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }
}
/**
 * A single HTTP request parameter, held as a mutable key/value pair.
 */
public class HTTPParam {

    /** Parameter name. */
    private String key;

    /** Parameter value. */
    private String value;

    /** Creates a parameter whose key and value are both {@code null}. */
    public HTTPParam() {
        this(null, null);
    }

    /**
     * Creates a parameter with the given name and value.
     *
     * @param key   parameter name
     * @param value parameter value
     */
    public HTTPParam(String key, String value) {
        this.key = key;
        this.value = value;
    }

    /** @return the parameter name */
    public String getKey() {
        return this.key;
    }

    /** @return the parameter value */
    public String getValue() {
        return this.value;
    }

    /** @param key the new parameter name */
    public void setKey(String key) {
        this.key = key;
    }

    /** @param value the new parameter value */
    public void setValue(String value) {
        this.value = value;
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.List;
/**
 * HTTP request helper built on {@link URLConnection}.
 *
 * <p>Both methods send the same browser-like {@code accept}, {@code connection} and
 * {@code user-agent} headers and return the response body with line terminators removed
 * (the body is read line by line and the lines are concatenated).
 */
public class HttpSend {

    /**
     * Sends a GET request.
     *
     * <p>When {@code list} is non-empty, its entries are appended to {@code url} as
     * {@code ?k1=v1&k2=v2}; values are URL-encoded as UTF-8, keys are sent as given. The
     * response is decoded as UTF-8.
     *
     * @param url  request address
     * @param list request parameters; {@code null} or empty means "no query string"
     * @return the response body
     * @throws IOException if the connection cannot be opened or read
     */
    public static String sendGet(String url, List<HTTPParam> list) throws IOException {
        if (list != null && list.size() > 0) {
            url = url + "?" + buildQuery(list);
        }
        URLConnection connection = openConnection(url);
        connection.connect();
        return readBody(connection, "UTF-8");
    }

    /**
     * Sends a POST request.
     *
     * <p>The parameters are written to the request body as {@code k1=v1&k2=v2}; values are
     * URL-encoded as UTF-8, keys are sent as given. The response is decoded as GBK.
     *
     * @param url  request address
     * @param list request parameters; {@code null} or empty sends an empty body
     * @return the response body
     * @throws IOException if the connection cannot be opened or read
     */
    public static String sendPost(String url, List<HTTPParam> list) throws IOException {
        URLConnection connection = openConnection(url);
        connection.setDoOutput(true);
        connection.setDoInput(true);

        String body = (list == null) ? "" : buildQuery(list);
        PrintWriter printWriter = new PrintWriter(connection.getOutputStream());
        try {
            printWriter.print(body);
            printWriter.flush();
        } finally {
            // Release the request stream even if writing fails.
            printWriter.close();
        }

        connection.connect();
        return readBody(connection, "GBK");
    }

    /** Opens a connection to {@code url} and applies the request headers shared by GET and POST. */
    private static URLConnection openConnection(String url) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        connection.setRequestProperty("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        connection.setRequestProperty("connection", "keep-alive");
        connection.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0");
        return connection;
    }

    /** Joins the parameters as {@code key=value} pairs separated by {@code &}, URL-encoding each value as UTF-8. */
    private static String buildQuery(List<HTTPParam> list) throws IOException {
        StringBuilder query = new StringBuilder();
        for (int i = 0; i < list.size(); i++) {
            if (i > 0) {
                query.append("&");
            }
            HTTPParam param = list.get(i);
            query.append(param.getKey()).append("=").append(URLEncoder.encode(param.getValue(), "UTF-8"));
        }
        return query.toString();
    }

    /** Reads the whole response line by line using {@code charset} and always closes the reader. */
    private static String readBody(URLConnection connection, String charset) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(connection.getInputStream(), charset));
        try {
            StringBuilder result = new StringBuilder();
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                result.append(line);
            }
            return result.toString();
        } finally {
            // Close on the failure path too, so a read error does not leak the stream.
            bufferedReader.close();
        }
    }
}