爬取Google地图的商务、商户信息(原创)
2012年6月11日星期一
12:42
有问题可以联系我
QQ:443144476
mail:nibaogang@gmail.com
永久博客地址
国内http://blog.nibaogang.tk
国外http://nibaogang.blogspot.com
我们都知道,目前国内商户信息最全的莫过于 Google 了,具体有多少数据量就不得而知了。前几天接到一个项目,好像是给移动弄商户数据,而且全部要带电话,收益很丰厚哦!可惜只要一个省的,要是要全国的就发了。
废话不多说先介绍一下google的商户接口
声明一下,本文章涉及的所有接口都是从googleweb服务中截取出来的接口,非google开放的api接口
StringBuilderbuilder = new StringBuilder(
"http://ditu.google.cn/maps?output=js&q=");
builder.append(q);
builder.append("&sll=");
builder.append(lat_point);
builder.append(",");
builder.append(lng_point);
builder.append("&radius=");
builder.append(radius);
builder.append("&start=");
builder.append(start);
Stringurl = builder.toString();
先介绍一个接口的url,前面不用说了地址 主要是参数 output指出我们需要js代码
q是要查询的字符,(推荐按照google的分类搜索里面的参数进行爬取,这样有利于以后整理分类,当然每个详细分类还可以单独爬取 只需要每个商户的cid就好了)
sll指的是你要爬取的中心位置经纬度,google是根据一个中心位置加搜索半径来进行检索的
radius是半径 单位公里
start 是结果的起始偏移量(0、10、20……,而不是页码)。如果返回结果超过 10 个,Google 会分页,也就是取完前 10 个之后需要继续取。但是有个事情要说明一下:Google 只支持取前 190 条数据,一次搜索结果中 190 条以后的是取不到的。
那有人会问了,我们需要海量数据,如何办呢
其实也还算简单,只需要深入爬取就好了。举个简单算法(我就是这么用的):大家应该清楚 Google 地图是按 256×256 像素的块划分的,那我就按照块来爬取;如果一个块的返回结果超过 190,我就把这个块细分成下一级的 4 个块重新请求,以此类推,就可以拿到所有的数据了。
请求出来的结果就不贴了,是一大段 js,里面有一个 json 串包含了数据。不过它比较复杂,处理起来比较麻烦,我也是弄了很久才弄好,都是把 json 转成了 xml 才容易查看的,一会我直接把代码贴上。一般的网速大概一天十万条数据没有问题,都带有经纬度,带电话的大概五分之一吧。
特别说明的是 需要 频繁更换 ip,我的代码里面有 自动更换IP的接口,不过是 操作我本地路由的,如果有人真需要爬取的话,可以自己实现一个更换路由的方法,注意同步处理哦,如果google开始封锁IP多线程会几乎同时报错,所以要处理一下,只重启一次路由
下面是代码:
package com.mimo.proxy.gansu;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URLEncoder;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.commons.httpclient.methods.GetMethod;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mimo.proxy.bdb.KeyDB;
import com.mimo.proxy.bdb.MapEnv;
import com.mimo.proxy.bean.BusinessBean;
import com.mimo.proxy.bean.HttpClientManager;
import com.mimo.proxy.dao.BusinessJdbc;
import com.mimo.proxy.google.GoogleMap;
publicclass Crawler implements Runnable {
privatefinal Logger logger = LoggerFactory.getLogger(getClass());
privatefinal KeyDB keyDB = MapEnv.getMapEnv().getKeyDB();
@Override
publicvoid run() {
try {
Thread.sleep(2000);
handler(8,194, 96);
handler(8,194, 97);
handler(8,195, 95);
handler(8,195, 96);
handler(8,195, 97);
handler(8,195, 98);
handler(8,196, 94);
handler(8,196, 95);
handler(8,196, 96);
handler(8,196, 97);
handler(8,195, 98);
handler(8,197, 95);
handler(8,197, 96);
handler(8,197, 97);
handler(8,198, 96);
handler(8,198, 97);
handler(8,199, 97);
handler(8,199, 98);
handler(8,200, 98);
handler(8,200, 101);
handler(8,200, 102);
handler(8,201, 97);
handler(8,201, 98);
handler(8,201, 99);
handler(8,201, 100);
handler(8,201, 101);
handler(8,201, 102);
handler(8,202, 99);
handler(8,202, 100);
handler(8,202, 101);
handler(8,202, 102);
handler(8,202, 103);
handler(8,203, 101);
handler(8,203, 102);
handler(8,204, 100);
handler(8,204, 101);
}catch (Exception e) {
e.printStackTrace();
}
}
privatestatic long time = 0;
publicsynchronized static void restartRouter() {
if(System.currentTimeMillis() - time < 1000) {
System.out.println("不需要重连");
return;
}
Stringdisconnect = "http://192.168.1.2/userRpm/StatusRpm.htm?Disconnect=%B6%CF%20%CF%DF";
GetMethoddisconnectMethod = new GetMethod(disconnect);
disconnectMethod.setRequestHeader("Authorization",
"BasicYWRtaW46bWltb0FkbWlu");
try {
intstatus = HttpClientManager.getHttpClient().executeMethod(
disconnectMethod);
if(status != 200) {
thrownew IOException("error status:" + status);
}
}catch (MalformedURLException e) {
e.printStackTrace();
}catch (IOException e) {
e.printStackTrace();
}finally {
disconnectMethod.releaseConnection();
}
StringstatusRpm = "http://192.168.1.2/userRpm/StatusRpm.htm";
while(true) {
GetMethodstatusMethod = new GetMethod(statusRpm);
statusMethod.setRequestHeader("Authorization",
"BasicYWRtaW46bWltb0FkbWlu");
try {
intstatus = HttpClientManager.getHttpClient().executeMethod(
statusMethod);
if(status != 200) {
thrownew IOException("error status:" + status);
}
byte[]bs = statusMethod.getResponseBody(102400);
Stringresponse = new String(bs, "GBK");
int a = response.indexOf("<tr><td>DNS服务器:</td>");
int b= response.indexOf("</td></tr>", a);
Stringstr = response.substring(a, b);
booleanconnect = str.indexOf(".") != -1;
if(connect) {
System.out.println("重新连接正常...");
time =System.currentTimeMillis();
break;
}
Thread.sleep(3000);
}catch (MalformedURLException e) {
e.printStackTrace();
}catch (IOException e) {
e.printStackTrace();
}catch (InterruptedException e) {
e.printStackTrace();
}finally {
statusMethod.releaseConnection();
}
}
}
privateString query = null;
publicCrawler(String query) {
this.query= query;
}
privateint max = 0;
privateint index = 0;
publicvoid handler(int zoom, int x, int y) throws Exception {
Stringkey = zoom + "-" + x + "-" + y + "-" + query;
if(keyDB.get(key)) {
logger.info("不需要处理key:{}", key);
return;
}
Stringq = URLEncoder.encode(query, "utf-8");
doublelat = GoogleMap.pixelToLat(y * 256, zoom);
doublelng = GoogleMap.pixelToLng(x * 256, zoom);
doublelat_point = GoogleMap.pixelToLat(y * 256 + 128, zoom);
doublelng_point = GoogleMap.pixelToLng(x * 256 + 128, zoom);
doubleradius = distanceOfTwoPoints(lat, lng, lat_point, lng_point);
logger.info("开始去分析 zoom:" + zoom + " x:" + x + " y:" + y);
intnum = parser2(q, lat_point, lng_point, radius, 0);
logger.info("此块需要解析数目:{}",num);
if(this.max == 0) {
this.max= num;
}
if(num > 190) {
logger.info("需要 分层继续处理");
handler(zoom+ 1, 2 * x, 2 * y);
handler(zoom+ 1, 2 * x, 2 * y + 1);
handler(zoom+ 1, 2 * x + 1, 2 * y);
handler(zoom+ 1, 2 * x + 1, 2 * y + 1);
} elseif (num == 0) {
logger.info("处理完毕");
} else{
logger.info("需要遍历处理num:"+ num);
for(int i = 10; i < num; i += 10) {
parser2(q,lat_point, lng_point, radius, i);
}
}
keyDB.put(key);
keyDB.syn();
logger.info("处理完成key:{}", key);
}
publicint parser2(String q, double lat_point, double lng_point,
doubleradius, int start) throws Exception {
try {
returnparser(q, lat_point, lng_point, radius, start);
}catch (Exception e) {
restartRouter();
returnparser(q, lat_point, lng_point, radius, start);
}
}
publicint parser(String q, double lat_point, double lng_point,
doubleradius, int start) throws Exception {
StringBuilderbuilder = new StringBuilder(
"http://ditu.google.cn/maps?output=js&q=");
builder.append(q);
builder.append("&sll=");
builder.append(lat_point);
builder.append(",");
builder.append(lng_point);
builder.append("&radius=");
builder.append(radius);
builder.append("&start=");
builder.append(start);
Stringurl = builder.toString();
GetMethodgetMethod = new GetMethod(url);
try {
intstatus = HttpClientManager.getHttpClient().executeMethod(
getMethod);
if(200 == status) {
Stringhtml = getMethod.getResponseBodyAsString(102400);
Stringstart_str = "w.loadVPage(";
Stringend_str = ",\"state\");}";
int a= html.indexOf(start_str);
int b= html.lastIndexOf(end_str);
StringjsonStr = html.substring(a + start_str.length(), b);
JSONObjectjson = JSONObject.fromObject(jsonStr);
//String xml = new XMLSerializer().write(json);
//System.out.println(xml);
JSONObjectoverlays = json.getJSONObject("overlays");
if(json.has("panel")) {
Stringpanel = json.getString("panel");
String selectStr = "</b> 个结果";
int c= panel.indexOf(selectStr);
int d= panel.lastIndexOf(selectStr);
if (c== d && c != -1) {
int e= panel.lastIndexOf("<b>", d);
if (e!= -1) {
Stringnum_str = panel.substring(e + 3, d)
.replaceAll(",","");
returnInteger.parseInt(num_str);
}
}
}
if(overlays.has("markers")) {
JSONArraymarkers = overlays.getJSONArray("markers");
intsize = markers.size();
for(int i = 0; i < size; i++) {
JSONObjectmarker = markers.getJSONObject(i);
intb_s = marker.getInt("b_s");
if(b_s == 2) {
BusinessBeanbusinessBean = new BusinessBean();
businessBean.setKeyword(query);
Stringcid = marker.getString("cid");
businessBean.setCid(cid);
Stringname = marker.getString("name");
businessBean.setName(name);
JSONObjectinfoWindow = marker
.getJSONObject("infoWindow");
if(infoWindow.has("addressLines")) {
JSONArrayaddress = infoWindow
.getJSONArray("addressLines");
intaddrSize = address.size();
if(addrSize > 0) {
businessBean.setArea(address.getString(0));
if(!address.getString(0).startsWith("你要的省份")) {
continue;
}
}
if(addrSize > 1) {
businessBean
.setStreet(address.getString(1));
}
}
if(infoWindow.has("phones")) {
JSONArrayphones = infoWindow
.getJSONArray("phones");
intphoneSize = phones.size();
if(phoneSize > 0) {
Stringnumber = phones.getJSONObject(0)
.getString("number");
if(!"04007161717".equals(number)) {
businessBean.setPhone(number);
}
}
}
JSONObjectlatlng = marker.getJSONObject("latlng");
Stringll = latlng.getJSONObject("alt").getString(
"ll");
doublelat = getLat(ll);
doublelng = getLng(ll);
businessBean.setLat(lat);
businessBean.setLng(lng);
booleaninsert = BusinessJdbc
.insertOrUpdate(businessBean);
if(insert) {
index++;
logger.info("havemax:{} index:{}", max, index);
}
}
}
}
return0;
} else{
thrownew IOException("status error" + status);
}
}finally {
getMethod.releaseConnection();
}
}
publicdouble distanceOfTwoPoints(double lat1, double lng1, double lat2,
doublelng2) {
doubleradLng1 = lng1 * Math.PI / 180.0;
doubleradLng2 = lng2 * Math.PI / 180.0;
doublea = radLng1 - radLng2;
doubleb = (lat1 - lat2) * Math.PI / 180.0;
doubles = 2 * Math.asin(Math.sqrt(Math.pow(Math.sin(a / 2), 2)
+Math.cos(radLng1) * Math.cos(radLng2)
*Math.pow(Math.sin(b / 2), 2))) * 6378.137;
s =Math.round(s * 1000) / 1000D;
returns;
}
publicdouble getLat(String ll) {
intlength = ll.length();
if(length == 20) {
long y= Long.parseLong(ll.substring(7, 14)) * 256
+Long.parseLong(ll.substring(17, 20));
returnGoogleMap.pixelToLat(y, 22);
} else{
thrownew RuntimeException("error length:" + length);
}
}
publicdouble getLng(String ll) {
intlength = ll.length();
if(length == 20) {
long x= Long.parseLong(ll.substring(0, 7)) * 256
+Long.parseLong(ll.substring(14, 17));
returnGoogleMap.pixelToLng(x, 22);
} else{
thrownew RuntimeException("error length:" + length);
}
}
}
jar 包应该没啥 就是有一个json处理那个比较复杂,引的多一些,叫json-lib主要是当时用了它的转xml功能,如果自己用的话可以替换成一个简单的
源文档 <http://nibaogang.blogspot.com/2011/02/google_21.html>
已使用 Microsoft OneNote 2010 创建
一个用于存放所有笔记和信息的位置