爬取Google地图的商务、商户信息(原创)

爬取Google地图的商务、商户信息(原创)

2012年6月11日 星期一

12:42

有问题可以联系我

QQ:443144476

mail:nibaogang@gmail.com

永久博客地址 

国内http://blog.nibaogang.tk 

国外http://nibaogang.blogspot.com

 

我们都知道,目前国内商户信息最全的莫过于 Google 了,具体有多少数据量就不得而知了。前几天接到一个项目,好像是给移动弄商户数据,而且全部要带电话,收益很丰厚哦!可惜只要一个省的,要是要全国的就发了。

 

废话不多说先介绍一下google的商户接口

 

声明一下,本文涉及的所有接口都是从 Google Web 服务中截取出来的接口,并非 Google 开放的 API 接口。

 

 

StringBuilderbuilder = new StringBuilder(

"http://ditu.google.cn/maps?output=js&q=");

builder.append(q);

builder.append("&sll=");

builder.append(lat_point);

builder.append(",");

builder.append(lng_point);

builder.append("&radius=");

builder.append(radius);

builder.append("&start=");

builder.append(start);

Stringurl = builder.toString();

 

 

先介绍一下接口的 URL:前面的地址部分不用多说,主要是参数。output 指出我们需要返回 js 代码;

q是要查询的字符,(推荐按照google的分类搜索里面的参数进行爬取,这样有利于以后整理分类,当然每个详细分类还可以单独爬取 只需要每个商户的cid就好了)

sll指的是你要爬取的中心位置经纬度,google是根据一个中心位置加搜索半径来进行检索的

radius是半径   单位公里

start 是分页的起始偏移量(不是页码):如果返回结果超过10个,Google 会分页,也就是取完前10个之后需要用 start=10、20……继续取。但是有一点要说明:Google 只支持取前190条数据,一次搜索结果中190条以后的是取不到的。

 

那有人会问了,我们需要海量数据,如何办呢

 

其实也还算简单,只需要深入爬取就好了。举个简单算法(我就是这么用的):大家应该清楚 Google 地图是按 256×256 的块划分的,那我就按照块来爬取;如果一个块的返回结果超过190条,我就把这个块细分为下一级的4个块重新请求,以此类推,就可以拿到所有的数据了。

 

 

 

 

请求出来的结果就不贴了,是一大段 js,里面有一个 json 串包含了数据,不过比较复杂,处理起来比较麻烦,我也是弄了很久才弄好,都是把 json 转成 xml 之后才容易查看的。一会我直接把代码贴上。一般的网速大概一天十万条数据没有问题,都带有经纬度,带电话的大概有五分之一吧。

 

特别说明的是,需要频繁更换 IP。我的代码里面有自动更换 IP 的接口,不过是操作我本地路由器的;如果有人真需要爬取的话,可以自己实现一个重启路由器的方法。注意同步处理哦:如果 Google 开始封锁 IP,多线程会几乎同时报错,所以要处理一下,只重启一次路由器。

 

下面是代码:

 

 

packagecom.mimo.proxy.gansu;

 

importjava.io.IOException;

importjava.net.MalformedURLException;

importjava.net.URLEncoder;

 

importnet.sf.json.JSONArray;

importnet.sf.json.JSONObject;

 

importorg.apache.commons.httpclient.methods.GetMethod;

importorg.slf4j.Logger;

importorg.slf4j.LoggerFactory;

 

importcom.mimo.proxy.bdb.KeyDB;

importcom.mimo.proxy.bdb.MapEnv;

importcom.mimo.proxy.bean.BusinessBean;

importcom.mimo.proxy.bean.HttpClientManager;

importcom.mimo.proxy.dao.BusinessJdbc;

importcom.mimo.proxy.google.GoogleMap;

 

publicclass Crawler implements Runnable {

privatefinal Logger logger = LoggerFactory.getLogger(getClass());

privatefinal KeyDB keyDB = MapEnv.getMapEnv().getKeyDB();

 

@Override

publicvoid run() {

try {

Thread.sleep(2000);

handler(8,194, 96);

handler(8,194, 97);

 

handler(8,195, 95);

handler(8,195, 96);

handler(8,195, 97);

handler(8,195, 98);

 

handler(8,196, 94);

handler(8,196, 95);

handler(8,196, 96);

handler(8,196, 97);

handler(8,195, 98);

 

handler(8,197, 95);

handler(8,197, 96);

handler(8,197, 97);

 

handler(8,198, 96);

handler(8,198, 97);

 

handler(8,199, 97);

handler(8,199, 98);

 

handler(8,200, 98);

handler(8,200, 101);

handler(8,200, 102);

 

handler(8,201, 97);

handler(8,201, 98);

handler(8,201, 99);

handler(8,201, 100);

handler(8,201, 101);

handler(8,201, 102);

 

handler(8,202, 99);

handler(8,202, 100);

handler(8,202, 101);

handler(8,202, 102);

handler(8,202, 103);

 

handler(8,203, 101);

handler(8,203, 102);

 

handler(8,204, 100);

handler(8,204, 101);

}catch (Exception e) {

e.printStackTrace();

}

}

 

privatestatic long time = 0;

 

publicsynchronized static void restartRouter() {

if(System.currentTimeMillis() - time < 1000) {

System.out.println("不需要重连");

return;

}

Stringdisconnect = "http://192.168.1.2/userRpm/StatusRpm.htm?Disconnect=%B6%CF%20%CF%DF";

GetMethoddisconnectMethod = new GetMethod(disconnect);

disconnectMethod.setRequestHeader("Authorization",

"BasicYWRtaW46bWltb0FkbWlu");

try {

intstatus = HttpClientManager.getHttpClient().executeMethod(

disconnectMethod);

if(status != 200) {

thrownew IOException("error status:" + status);

}

}catch (MalformedURLException e) {

e.printStackTrace();

}catch (IOException e) {

e.printStackTrace();

}finally {

disconnectMethod.releaseConnection();

}

StringstatusRpm = "http://192.168.1.2/userRpm/StatusRpm.htm";

while(true) {

GetMethodstatusMethod = new GetMethod(statusRpm);

statusMethod.setRequestHeader("Authorization",

"BasicYWRtaW46bWltb0FkbWlu");

try {

intstatus = HttpClientManager.getHttpClient().executeMethod(

statusMethod);

if(status != 200) {

thrownew IOException("error status:" + status);

}

byte[]bs = statusMethod.getResponseBody(102400);

Stringresponse = new String(bs, "GBK");

int a = response.indexOf("<tr><td>DNS服务器:</td>");

int b= response.indexOf("</td></tr>", a);

Stringstr = response.substring(a, b);

booleanconnect = str.indexOf(".") != -1;

if(connect) {

System.out.println("重新连接正常...");

time =System.currentTimeMillis();

break;

}

Thread.sleep(3000);

}catch (MalformedURLException e) {

e.printStackTrace();

}catch (IOException e) {

e.printStackTrace();

}catch (InterruptedException e) {

e.printStackTrace();

}finally {

statusMethod.releaseConnection();

}

}

 

}

 

privateString query = null;

 

publicCrawler(String query) {

this.query= query;

}

 

privateint max = 0;

privateint index = 0;

 

publicvoid handler(int zoom, int x, int y) throws Exception {

Stringkey = zoom + "-" + x + "-" + y + "-" + query;

if(keyDB.get(key)) {

logger.info("不需要处理key:{}", key);

return;

}

Stringq = URLEncoder.encode(query, "utf-8");

doublelat = GoogleMap.pixelToLat(y * 256, zoom);

doublelng = GoogleMap.pixelToLng(x * 256, zoom);

 

doublelat_point = GoogleMap.pixelToLat(y * 256 + 128, zoom);

doublelng_point = GoogleMap.pixelToLng(x * 256 + 128, zoom);

 

doubleradius = distanceOfTwoPoints(lat, lng, lat_point, lng_point);

logger.info("开始去分析 zoom:" + zoom + " x:" + x + " y:" + y);

intnum = parser2(q, lat_point, lng_point, radius, 0);

logger.info("此块需要解析数目:{}",num);

if(this.max == 0) {

this.max= num;

}

if(num > 190) {

logger.info("需要  分层继续处理");

handler(zoom+ 1, 2 * x, 2 * y);

handler(zoom+ 1, 2 * x, 2 * y + 1);

handler(zoom+ 1, 2 * x + 1, 2 * y);

handler(zoom+ 1, 2 * x + 1, 2 * y + 1);

} elseif (num == 0) {

logger.info("处理完毕");

} else{

logger.info("需要遍历处理num:"+ num);

for(int i = 10; i < num; i += 10) {

parser2(q,lat_point, lng_point, radius, i);

}

}

keyDB.put(key);

keyDB.syn();

logger.info("处理完成key:{}", key);

}

 

publicint parser2(String q, double lat_point, double lng_point,

doubleradius, int start) throws Exception {

try {

returnparser(q, lat_point, lng_point, radius, start);

}catch (Exception e) {

restartRouter();

returnparser(q, lat_point, lng_point, radius, start);

}

}

 

publicint parser(String q, double lat_point, double lng_point,

doubleradius, int start) throws Exception {

StringBuilderbuilder = new StringBuilder(

"http://ditu.google.cn/maps?output=js&q=");

builder.append(q);

builder.append("&sll=");

builder.append(lat_point);

builder.append(",");

builder.append(lng_point);

builder.append("&radius=");

builder.append(radius);

builder.append("&start=");

builder.append(start);

Stringurl = builder.toString();

GetMethodgetMethod = new GetMethod(url);

 

try {

intstatus = HttpClientManager.getHttpClient().executeMethod(

getMethod);

if(200 == status) {

Stringhtml = getMethod.getResponseBodyAsString(102400);

Stringstart_str = "w.loadVPage(";

Stringend_str = ",\"state\");}";

int a= html.indexOf(start_str);

int b= html.lastIndexOf(end_str);

StringjsonStr = html.substring(a + start_str.length(), b);

JSONObjectjson = JSONObject.fromObject(jsonStr);

//String xml = new XMLSerializer().write(json);

//System.out.println(xml);

JSONObjectoverlays = json.getJSONObject("overlays");

if(json.has("panel")) {

Stringpanel = json.getString("panel");

String selectStr = "</b>&nbsp; 个结果";

int c= panel.indexOf(selectStr);

int d= panel.lastIndexOf(selectStr);

if (c== d && c != -1) {

int e= panel.lastIndexOf("<b>", d);

if (e!= -1) {

Stringnum_str = panel.substring(e + 3, d)

.replaceAll(",","");

returnInteger.parseInt(num_str);

}

}

}

if(overlays.has("markers")) {

JSONArraymarkers = overlays.getJSONArray("markers");

intsize = markers.size();

for(int i = 0; i < size; i++) {

JSONObjectmarker = markers.getJSONObject(i);

intb_s = marker.getInt("b_s");

if(b_s == 2) {

BusinessBeanbusinessBean = new BusinessBean();

businessBean.setKeyword(query);

Stringcid = marker.getString("cid");

businessBean.setCid(cid);

Stringname = marker.getString("name");

businessBean.setName(name);

JSONObjectinfoWindow = marker

.getJSONObject("infoWindow");

if(infoWindow.has("addressLines")) {

JSONArrayaddress = infoWindow

.getJSONArray("addressLines");

intaddrSize = address.size();

if(addrSize > 0) {

businessBean.setArea(address.getString(0));

if(!address.getString(0).startsWith("你要的省份")) {

continue;

}

}

if(addrSize > 1) {

businessBean

.setStreet(address.getString(1));

}

}

if(infoWindow.has("phones")) {

JSONArrayphones = infoWindow

.getJSONArray("phones");

intphoneSize = phones.size();

if(phoneSize > 0) {

Stringnumber = phones.getJSONObject(0)

.getString("number");

if(!"04007161717".equals(number)) {

businessBean.setPhone(number);

}

}

}

JSONObjectlatlng = marker.getJSONObject("latlng");

Stringll = latlng.getJSONObject("alt").getString(

"ll");

doublelat = getLat(ll);

doublelng = getLng(ll);

businessBean.setLat(lat);

businessBean.setLng(lng);

booleaninsert = BusinessJdbc

.insertOrUpdate(businessBean);

if(insert) {

index++;

logger.info("havemax:{} index:{}", max, index);

}

}

}

}

return0;

} else{

thrownew IOException("status error" + status);

}

}finally {

getMethod.releaseConnection();

}

}

 

publicdouble distanceOfTwoPoints(double lat1, double lng1, double lat2,

doublelng2) {

doubleradLng1 = lng1 * Math.PI / 180.0;

doubleradLng2 = lng2 * Math.PI / 180.0;

doublea = radLng1 - radLng2;

doubleb = (lat1 - lat2) * Math.PI / 180.0;

doubles = 2 * Math.asin(Math.sqrt(Math.pow(Math.sin(a / 2), 2)

+Math.cos(radLng1) * Math.cos(radLng2)

*Math.pow(Math.sin(b / 2), 2))) * 6378.137;

s =Math.round(s * 1000) / 1000D;

returns;

}

 

publicdouble getLat(String ll) {

intlength = ll.length();

if(length == 20) {

long y= Long.parseLong(ll.substring(7, 14)) * 256

+Long.parseLong(ll.substring(17, 20));

returnGoogleMap.pixelToLat(y, 22);

} else{

thrownew RuntimeException("error length:" + length);

}

}

 

publicdouble getLng(String ll) {

intlength = ll.length();

if(length == 20) {

long x= Long.parseLong(ll.substring(0, 7)) * 256

+Long.parseLong(ll.substring(14, 17));

returnGoogleMap.pixelToLng(x, 22);

} else{

thrownew RuntimeException("error length:" + length);

}

}

 

}

jar 包应该没啥,就是有一个处理 json 的库比较复杂,引的依赖多一些,叫 json-lib,主要是当时用了它转 xml 的功能;如果自己用的话,可以替换成一个简单的 json 库。

 

源文档 <http://nibaogang.blogspot.com/2011/02/google_21.html>

 

 

已使用 Microsoft OneNote 2010 创建
一个用于存放所有笔记和信息的位置





转载于:https://www.cnblogs.com/buaichiyu/archive/2012/10/07/588b2abc78f5287bc7879245f47a4bd7.html

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值