最近一个人负责公司的一个app项目开发,需要从高德地图爬取杭州市全部的超市信息,放入MongoDB数据库中,用于做地理位置查询。(MongoDB这部分有时间再补上)
首先去高德地图创建一个开发者账号,获取一个用于开发Web服务的高德key。这个key是必须要有的,也可以先用我这个从百度上搜到的key试一下。
废话不说了直接上代码
package com.pingogo.visit.service;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.pingogo.api.common.HttpUtils;
import com.pingogo.visit.domain.Shop;
import jxl.Cell;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
//import org.apache.poi.ss.usermodel.Workbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by cw on 2017/8/29.
*
*/
public class AddressLngLatExchange {
// API key sent as the "key" request parameter by both lookup methods in this class.
// NOTE(review): the credential is hard-coded in source — consider loading it from configuration.
private static final String KEY = "389880a06e3f893ea46036f030c94700";
// Value sent as the "output" request parameter.
private static final String OUTPUT = "JSON";
// Endpoint called by getLngLatFromOneAddr.
private static final String GET_LNG_LAT_URL = "http://restapi.amap.com/v3/geocode/geo";
// Endpoint called by initialData.
// NOTE(review): "PIO" in the name looks like a typo for "POI"; renaming requires updating its users.
private static final String GET_LNG_PIO_URL = "http://restapi.amap.com/v3/place/polygon";
// Class-wide logger.
private static final Logger LOGGER = LoggerFactory.getLogger(AddressLngLatExchange.class);
/**
 * Looks up the coordinates of a single address through the geocoding endpoint.
 *
 * <p>The response's {@code geocodes} array is scanned in order and the last entry with a
 * usable {@code location} ("longitude,latitude") wins, matching the previous behaviour.
 *
 * @param address free-text address to geocode
 * @return {@code null} when {@code address} is blank; otherwise the {@code location} string
 *         split on commas (index 0 = longitude, index 1 = latitude), or a two-element array
 *         of {@code null}s when the request failed or no usable location was returned
 */
public static String[] getLngLatFromOneAddr(String address){
    if(StringUtils.isBlank(address)) {
        LOGGER.error("地址(" + address + ")为null或者空");
        return null;
    }
    Map<String, String> params = new HashMap<String, String>();
    params.put("address", address);
    params.put("output", OUTPUT);
    params.put("key", KEY);

    String[] lngLatArr = new String[2];
    String result = HttpUtils.URLPost(GET_LNG_LAT_URL,params,"");
    // HttpUtils hands back an empty string on transport errors and non-200 answers,
    // so the body has to be checked before it is dereferenced.
    JSONObject jsonObject = StringUtils.isBlank(result) ? null : JSONObject.parseObject(result);
    if(jsonObject == null) {
        LOGGER.error("地址(" + address + ")" + "响应为空");
        return lngLatArr;
    }

    // The response carries status "1" on success and "0" on failure.
    if("1".equals(jsonObject.getString("status"))) {
        JSONArray jsonArray = jsonObject.getJSONArray("geocodes");
        int size = jsonArray == null ? 0 : jsonArray.size();
        for(int i = 0; i < size; i++) {
            String lngLat = jsonArray.getJSONObject(i).getString("location");
            if(lngLat == null) {
                continue;
            }
            String[] parts = lngLat.split(",");
            // Only accept entries that really hold both coordinates.
            if(parts.length >= 2) {
                lngLatArr = parts;
            }
        }
    } else {
        String errorMsg = jsonObject.getString("info");
        LOGGER.error("地址(" + address + ")" + errorMsg);
    }
    return lngLatArr;
}
/**
 * Runs a keyword search inside the rectangle {@code lonLat} against the polygon search
 * endpoint and appends one {@link Shop} per returned POI to {@code shopListSon}.
 *
 * <p>The first response supplies both the total {@code count} and the first page of
 * results; remaining pages are fetched one by one with a page size of 20. A page that
 * fails is logged and skipped, and a POI without a usable {@code location} is skipped.
 *
 * @param lonLat      rectangle as "lon,lat;lon,lat", e.g. "118.21,29.11;120.30,30.33"
 * @param keyword     search keyword; a blank value is logged but the request is still sent
 * @param shopListSon list that receives the results; it is mutated and returned
 * @return {@code shopListSon}
 */
public static List<Shop> initialData(String lonLat, String keyword, List<Shop> shopListSon){
    if(StringUtils.isBlank(keyword)) {
        LOGGER.error("地址(" + keyword + ")为null或者空");
    }
    try {
        // Fixed pause before every rectangle — presumably to stay under the API rate limit.
        Thread.sleep(5000);
    } catch (InterruptedException e1) {
        // Keep the interruption visible to the caller instead of swallowing it.
        Thread.currentThread().interrupt();
    }

    final int pageSize = 20;
    Map<String, String> params = new HashMap<String, String>();
    params.put("polygon", lonLat);
    params.put("output", OUTPUT);
    params.put("keywords", keyword);
    params.put("offset", String.valueOf(pageSize));
    params.put("page", "1");
    params.put("key", KEY);

    String result = HttpUtils.URLGet(GET_LNG_PIO_URL,params,"UTF-8");
    // URLGet returns an empty string when the call failed; do not dereference a missing body.
    JSONObject pageJson = StringUtils.isBlank(result) ? null : JSONObject.parseObject(result);
    if(pageJson == null) {
        LOGGER.error("地址(" + keyword + ")" + "响应为空");
        return shopListSon;
    }
    if(!"1".equals(pageJson.getString("status"))) {
        LOGGER.error("地址(" + keyword + ")" + pageJson.getString("info"));
        return shopListSon;
    }

    int count;
    try {
        count = Integer.parseInt(pageJson.getString("count"));
    } catch (NumberFormatException e) {
        LOGGER.error("地址(" + keyword + ")" + "返回的count无法解析", e);
        return shopListSon;
    }
    // Ceiling division: a partially filled last page still counts as a page.
    int pageNumber = (count + pageSize - 1) / pageSize;

    for(int i = 1; i <= pageNumber; i++) {
        if(i > 1) {
            // Page 1 is already in hand from the count request; only later pages are fetched.
            params.put("page", String.valueOf(i));
            result = HttpUtils.URLGet(GET_LNG_PIO_URL,params,"UTF-8");
            pageJson = StringUtils.isBlank(result) ? null : JSONObject.parseObject(result);
        }
        // Raw response dump kept for manual verification of the crawl.
        System.out.println("+++++++++"+result);
        if(pageJson == null) {
            LOGGER.error("地址(" + keyword + ")" + "响应为空");
            continue;
        }
        // The response carries status "1" on success and "0" on failure.
        if(!"1".equals(pageJson.getString("status"))) {
            // Report the message of the page that actually failed.
            LOGGER.error("地址(" + keyword + ")" + pageJson.getString("info"));
            continue;
        }
        JSONArray pois = pageJson.getJSONArray("pois");
        int size = pois == null ? 0 : pois.size();
        for(int j = 0; j < size; j++) {
            JSONObject poi = pois.getJSONObject(j);
            String location = poi.getString("location");
            String[] initLonLat = location == null ? new String[0] : location.split(",");
            if(initLonLat.length < 2) {
                LOGGER.error("地址(" + keyword + ")" + "POI缺少经纬度: " + poi.getString("id"));
                continue;
            }
            Shop shop = new Shop();
            shop.setShopName(poi.getString("name"));
            shop.setSpecificAddress(poi.getString("address"));
            shop.setId(poi.getString("id"));
            shop.setLongitude(initLonLat[0]);
            shop.setLatitude(initLonLat[1]);
            shopListSon.add(shop);
        }
    }
    return shopListSon;
}
//从高德地图上取数据
// public static void main(String[] args) {
// List<Shop> listShop =new ArrayList<>();
// //东经118°21′-120°30′,北纬29°11′-30°33′。杭州位置
// for(double i=118.20;i<=120.31;i=i+0.1){
// for(double j=29.10;j<=30.33;j=j+0.1){
// List<Shop> listShopSon =new ArrayList<>();
// double lonHead=i;
// double latHead=j;
// double lonTail=i+0.1;
// double latTail=j+0.1;
// String LonLat=lonHead+","+latHead+";"+lonTail+","+latTail;
// listShopSon =initialData(LonLat,"便利店",listShopSon);
// for(int n=0;n<listShopSon.size();n++){
// System.out.println("店铺地址:"+listShopSon.get(n).getSpecificAddress());
// }
// if(listShopSon.size()>0){
// listShop.addAll(listShopSon);
// }
// System.out.println("ListShop的大小:"+listShop.size());
// double d =Distance(lonHead,latHead,lonTail,latTail);
// System.out.println("两点距离"+d);
//
// }
//
// }
//
// System.out.println("ListShop的大小:"+listShop.size());
// creatExcel(listShop);
// }
/**
 * Reads a previously exported spreadsheet.
 *
 * @param args optional; {@code args[0]} is the .xls file to read. Without arguments the
 *             original sample path is used.
 */
public static void main(String[] args) {
    String path = (args != null && args.length > 0) ? args[0] : "D:\\geode\\222.xls";
    readFile(path);
}
/**
 * Writes the given shops to an .xls workbook at a fixed location on disk.
 *
 * <p>Row 0 is a header row (id, name, address, longitude, latitude); each shop follows
 * on its own row. I/O failures are logged and not propagated.
 * NOTE(review): the original comment warns that this legacy workbook format limits the
 * number of rows/columns per sheet — very large lists may need splitting.
 *
 * @param shopList shops to export; {@code null} is treated as an empty list
 */
public static void creatExcel(List<Shop> shopList){
    HSSFWorkbook workbook = new HSSFWorkbook();
    HSSFSheet sheet = workbook.createSheet("高德地图数据");

    // Header row.
    String[] headers = {"店铺id", "店铺名称", "店铺地址", "经度", "纬度"};
    HSSFRow headerRow = sheet.createRow(0);
    for (int col = 0; col < headers.length; col++) {
        HSSFCell cell = headerRow.createCell(col);
        cell.setCellValue(headers[col]);
    }

    // One data row per shop, starting right below the header.
    int shopCount = shopList == null ? 0 : shopList.size();
    for (int i = 0; i < shopCount; i++) {
        Shop shop = shopList.get(i);
        HSSFRow dataRow = sheet.createRow(i + 1);
        dataRow.createCell(0).setCellValue(shop.getId());
        dataRow.createCell(1).setCellValue(shop.getShopName());
        dataRow.createCell(2).setCellValue(shop.getSpecificAddress());
        dataRow.createCell(3).setCellValue(shop.getLongitude());
        dataRow.createCell(4).setCellValue(shop.getLatitude());
    }

    // try-with-resources closes the stream even when write() fails.
    try (FileOutputStream fos = new FileOutputStream("D:\\geode\\高德便利店地图数据.xls")) {
        workbook.write(fos);
        System.out.println("写入成功");
    } catch (IOException e) {
        LOGGER.error("写入excel失败", e);
    }
}
/**
 * Great-circle distance between two points, computed with the haversine formula on a
 * sphere of radius 6371 km.
 *
 * @param long1 longitude of the first point, in degrees
 * @param lat1  latitude of the first point, in degrees
 * @param long2 longitude of the second point, in degrees
 * @param lat2  latitude of the second point, in degrees
 * @return the distance in metres, rounded half-up to two decimal places
 * @throws NumberFormatException if the result is NaN or infinite (for example when a
 *                               coordinate is NaN), because it cannot be converted to
 *                               a {@link BigDecimal}
 */
public static double Distance(double long1, double lat1, double long2, double lat2) {
    final double earthRadiusKm = 6371;
    double phi1 = lat1 * Math.PI / 180.0;
    double phi2 = lat2 * Math.PI / 180.0;
    double deltaPhi = phi1 - phi2;
    double deltaLambda = (long1 - long2) * Math.PI / 180.0;

    double sinHalfPhi = Math.sin(deltaPhi / 2.0);
    double sinHalfLambda = Math.sin(deltaLambda / 2.0);
    double haversine = sinHalfPhi * sinHalfPhi
            + Math.cos(phi1) * Math.cos(phi2) * sinHalfLambda * sinHalfLambda;

    // Rounding can push the term a hair above 1 for near-antipodal points, which would
    // turn asin() into NaN; clamp it so valid coordinates always produce a number.
    double km = 2 * earthRadiusKm * Math.asin(Math.sqrt(Math.min(1.0, haversine)));

    // Kilometres -> metres, then round to centimetre precision.
    return new BigDecimal(km * 1000)
            .setScale(2, java.math.RoundingMode.HALF_UP)
            .doubleValue();
}
/**
 * Loads shops from the first sheet of an .xls file.
 *
 * <p>Row 0 is treated as the header and skipped. In every following row, columns 0-4
 * are read as id, name, address, longitude and latitude; further columns are ignored.
 * Read errors are logged and whatever was parsed up to that point is returned.
 *
 * @param filename path of the spreadsheet to read
 * @return the shops read from the sheet; empty when the file could not be read
 */
public static List<Shop> readFile(String filename){
    List<Shop> shopList = new ArrayList<>();
    Workbook wb = null;
    // try-with-resources guarantees the file handle is released on every path.
    try (InputStream in = new FileInputStream(new File(filename))) {
        wb = Workbook.getWorkbook(in);
        jxl.Sheet sheet = wb.getSheet(0);
        for (int rowIdx = 1; rowIdx < sheet.getRows(); rowIdx++) {
            Shop shop = new Shop();
            for (int colIdx = 0; colIdx < sheet.getColumns(); colIdx++) {
                Cell cell = sheet.getCell(colIdx, rowIdx);
                String value = cell.getContents();
                switch (colIdx) {
                    case 0:
                        shop.setId(value);
                        break;
                    case 1:
                        shop.setShopName(value);
                        break;
                    case 2:
                        shop.setSpecificAddress(value);
                        break;
                    case 3:
                        shop.setLongitude(value);
                        break;
                    case 4:
                        shop.setLatitude(value);
                        break;
                    default:
                        // Columns beyond the five known ones carry nothing we map.
                        break;
                }
            }
            shopList.add(shop);
        }
    } catch (BiffException | IOException e) {
        // FileNotFoundException is an IOException and lands here as well.
        LOGGER.error("读取excel失败: " + filename, e);
    } finally {
        if (wb != null) {
            // Frees the memory the workbook holds for the parsed file.
            wb.close();
        }
    }
    return shopList;
}
}
由于高德地图对它的数据做了保护,我这边采用的是矩形搜索:先百度到杭州的经纬度范围,把它划分成多个小矩形,然后逐个调用高德地图的API服务。我这边将爬取的数据写入excel表格中,一是为了展示、验证数据是否准确,二是担心直接写入会有内存泄漏问题。我上面的代码里有写入excel和读取excel的代码。不过要注意一下,我用的jar包不同:写入用的是poi,读取用的是jxl。
这边发起http请求用的是一个客户端工具类,代码如下。代码是在网上找到的,首先谢谢分享的人;因为过了一段时间,原文地址忘记了。一开始用的是
URL myURL = null; URLConnection httpsConn = null; try { myURL = new URL(url); } catch (MalformedURLException e) { e.printStackTrace(); } InputStreamReader insr = null; BufferedReader br = null; httpsConn = (URLConnection) myURL.openConnection();// 不使用代理
然后在tomcat项目中调用时报错了,原因现在还没有弄清楚,知道原因的朋友请告诉我一二。
package com.pingogo.api.common;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* HTTP工具类
*
* @author lixiangyang
*
*/
public class HttpUtils {
private static final Log log = LogFactory.getLog(HttpUtils.class);
/**
 * Charset name: UTF-8.
 */
public static final String URL_PARAM_DECODECHARSET_UTF8 = "UTF-8";
/**
 * Charset name: GBK.
 */
public static final String URL_PARAM_DECODECHARSET_GBK = "GBK";
/** Separator placed between query-string pairs. */
private static final String URL_PARAM_CONNECT_FLAG = "&";
private static final String EMPTY = "";
/** Connection pool shared by every request issued through this class. */
private static final MultiThreadedHttpConnectionManager connectionManager;
/** Value passed to {@code setConnectionTimeout} on the pool. */
private static final int connectionTimeOut = 25000;
/** Value passed to {@code setSoTimeout} on the pool. */
private static final int socketTimeOut = 25000;
private static final int maxConnectionPerHost = 20;
private static final int maxTotalConnections = 20;
/** Single client instance backed by {@link #connectionManager}. */
private static final HttpClient client;
static {
    // Configure the pool once, then hand it to the shared client.
    connectionManager = new MultiThreadedHttpConnectionManager();
    connectionManager.getParams().setConnectionTimeout(connectionTimeOut);
    connectionManager.getParams().setSoTimeout(socketTimeOut);
    connectionManager.getParams().setDefaultMaxConnectionsPerHost(maxConnectionPerHost);
    connectionManager.getParams().setMaxTotalConnections(maxTotalConnections);
    client = new HttpClient(connectionManager);
}
/**
 * Submits form fields with an HTTP POST.
 *
 * @param url
 *            URL to request
 * @param params
 *            form fields to send; {@code null} sends none, and a {@code null} value is
 *            sent as an empty string
 * @param enc
 *            charset name announced in the Content-Type header; UTF-8 is used when this
 *            is {@code null} or blank
 * @return the response body when the status is 200; otherwise an empty string. Protocol
 *         and network failures are logged and also yield an empty string.
 */
public static String URLPost(String url, Map<String, String> params, String enc){
    // Honour the caller's charset and only fall back to UTF-8 when none was supplied.
    String charset = (enc == null || enc.trim().isEmpty()) ? URL_PARAM_DECODECHARSET_UTF8 : enc;
    String response = EMPTY;
    PostMethod postMethod = null;
    try {
        postMethod = new PostMethod(url);
        postMethod.setRequestHeader("Content-Type", "application/x-www-form-urlencoded;charset=" + charset);
        // Copy the form fields into the request.
        if (params != null) {
            for (Map.Entry<String, String> field : params.entrySet()) {
                String value = field.getValue();
                // Same null-to-empty rule the GET helper applies to its query string.
                postMethod.addParameter(field.getKey(), value != null ? value : EMPTY);
            }
        }
        int statusCode = client.executeMethod(postMethod);
        if (statusCode == HttpStatus.SC_OK) {
            response = postMethod.getResponseBodyAsString();
        } else {
            log.error("响应状态码 = " + postMethod.getStatusCode());
        }
    } catch (HttpException e) {
        log.error("发生致命的异常,可能是协议不对或者返回的内容有问题", e);
    } catch (IOException e) {
        log.error("发生网络异常", e);
    } finally {
        // Always give the connection back to the pool.
        if (postMethod != null) {
            postMethod.releaseConnection();
        }
    }
    return response;
}
/**
 * Issues an HTTP GET with the given parameters appended as a query string.
 *
 * @param url
 *            URL to request; may already contain a query string
 * @param params
 *            query parameters; {@code null} or empty adds nothing to the URL
 * @param enc
 *            charset name used to URL-encode the values; UTF-8 is used when this is
 *            {@code null} or blank
 * @return the response body when the status is 200; otherwise an empty string. Protocol
 *         and network failures are logged and also yield an empty string.
 */
public static String URLGet(String url, Map<String, String> params, String enc){
    String charset = (enc == null || enc.trim().isEmpty()) ? URL_PARAM_DECODECHARSET_UTF8 : enc;
    String response = EMPTY;
    GetMethod getMethod = null;

    String query = getUrl(params, charset);
    StringBuilder strtTotalURL = new StringBuilder(url);
    if (!query.isEmpty()) {
        // Look at the URL itself: start a query with '?' or extend an existing one with '&'.
        strtTotalURL.append(url.indexOf('?') == -1 ? "?" : URL_PARAM_CONNECT_FLAG).append(query);
    }
    log.debug("GET请求URL = \n" + strtTotalURL.toString());

    try {
        getMethod = new GetMethod(strtTotalURL.toString());
        getMethod.setRequestHeader("Content-Type", "application/x-www-form-urlencoded;charset=" + charset);
        int statusCode = client.executeMethod(getMethod);
        if (statusCode == HttpStatus.SC_OK) {
            response = getMethod.getResponseBodyAsString();
        } else {
            // Logged at error level, like the POST variant, so failed calls are not hidden.
            log.error("响应状态码 = " + getMethod.getStatusCode());
        }
    } catch (HttpException e) {
        log.error("发生致命的异常,可能是协议不对或者返回的内容有问题", e);
    } catch (IOException e) {
        log.error("发生网络异常", e);
    } finally {
        // Always give the connection back to the pool.
        if (getMethod != null) {
            getMethod.releaseConnection();
        }
    }
    return response;
}
/**
 * Renders a map as a URL query string of the form {@code k1=v1&k2=v2}.
 *
 * <p>Values are URL-encoded with {@code valueEnc}; a {@code null} value becomes an empty
 * string. Keys are written as they are. If the charset is not supported the stack trace
 * is printed and the raw value is used.
 *
 * @param map
 *            parameters to render
 * @param valueEnc
 *            charset name used for URL-encoding the values
 * @return the query string, or an empty string when {@code map} is {@code null} or empty
 */
private static String getUrl(Map<String, String> map, String valueEnc) {
    if (map == null || map.isEmpty()) {
        return EMPTY;
    }
    StringBuilder query = new StringBuilder();
    for (Map.Entry<String, String> pair : map.entrySet()) {
        // Separator goes in front of every pair except the first one.
        if (query.length() > 0) {
            query.append(URL_PARAM_CONNECT_FLAG);
        }
        String encoded = pair.getValue();
        if (encoded == null) {
            encoded = EMPTY;
        }
        try {
            encoded = URLEncoder.encode(encoded, valueEnc);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        query.append(pair.getKey()).append("=").append(encoded);
    }
    return query.toString();
}
}
maven里的配置
<dependency> <groupId>commons-httpclient</groupId> <artifactId>commons-httpclient</artifactId> <version>3.1</version> </dependency>
最后给大家看一下我爬取出来的数据
如果有什么不对的地方,希望大家指点。
试一下付款二维码