Java 爬取百度数据

package com.bonc;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

public class DownloadBD2 {
private String inputFile;
private String outFile ;
private String outShapeFile ;
private String outImagePath ;
private Map<String, String> cityCodes = new HashMap<>();

public DownloadBD2(String inputFile,String outPath) throws IOException {
	this.inputFile=inputFile;
	this.outFile = outPath+"/poi_" + System.currentTimeMillis() + ".txt";

// this.outShapeFile = outPath+"/map_bd_search_shape"+index+".sql";
this.outImagePath = outPath+"/image";
//湖南的
cityCodes.put(“娄底市”, “221”);
cityCodes.put(“长沙市”, “158”);
cityCodes.put(“岳阳市”, “220”);
cityCodes.put(“常德市”, “219”);
cityCodes.put(“张家界市”, “312”);
cityCodes.put(“怀化市”, “363”);
cityCodes.put(“株洲市”, “222”);
cityCodes.put(“永州市”, “314”);
cityCodes.put(“湘潭市”, “313”);
cityCodes.put(“湘西土家族苗族自治州”, “274”);
cityCodes.put(“益阳市”, “272”);
cityCodes.put(“衡阳市”, “159”);
cityCodes.put(“邵阳市”, “273”);
cityCodes.put(“郴州市”, “275”);
//全国的
// File bianMa = new File(“D:\hunan\bianMa.txt”);
// BufferedReader br = new BufferedReader(new FileReader(bianMa));
// String s = null;
// while((s = br.readLine())!=null){
// String name =s.split("?[0];
// String ID = s.split("?[1];
// cityCodes.put(name, ID);
// }

	}
	//br.close();
	
   

public void download() throws Exception{
	
	File in = new File(this.inputFile);
	 
	BufferedReader br = new BufferedReader(new FileReader(in));//构造一个BufferedReader类来读取文件
    String s = null;
    File outSqlFile = new File(this.outFile);
 	outSqlFile.createNewFile();
 	//System.out.println("创建输出文件");
 	FileOutputStream fot = new FileOutputStream(outSqlFile);
 	OutputStreamWriter outWriter = new OutputStreamWriter(fot,"UTF-8");  
    int r = 1;
    while((s = br.readLine())!=null){//使用readLine方法,一次读一行
    	System.out.println(s);
    	//indexOf 方法返回一个整数值,指出 String对象内子字符串的开始位置。如果没有找到子字符串,则返回-1
    	if(s.indexOf(",")>0) {
    		//System.out.println("进入判断");
    		String keyWord = s.split(",")[1];
    		String city=s.split(",")[0];
    		//System.out.println(keyWord +"     "+city);
    		if(keyWord.indexOf("厕所")>-1) {
    			continue;
    		}
    		try {
    			System.out.print(r+"-->"+city);
    			r++;
    			downloadKeyWord(city,keyWord,outWriter);
    			
    		}catch (Exception e) {
				e.printStackTrace();

// System.err.println(“获取百度信息失败:”+s);
}
}
}
br.close();
outWriter.close();
fot.close();
}

private void downloadKeyWord(String city,String keyWord,OutputStreamWriter Writer) throws Exception {
	//http://api.map.baidu.com/?qt=s&c=131&wd=餐饮&rn=1
	String cityCode = this.cityCodes.get(city);

// String cityCode = “158”;
System.out.println(cityCode);
if(cityCode==null) {
System.out.println(city);
return;

	}
	int rn = 30;
	 String urlStr = "http://api.map.baidu.com/?qt=s&c="+cityCode+"&wd="+keyWord+"&rn="+rn;
	 System.out.println(urlStr);
	 URL url   = new URL(urlStr);
	 HttpURLConnection  huc = (HttpURLConnection)url.openConnection(); 
	 huc.setConnectTimeout(10000); 
	 huc.setReadTimeout(10000); 
	 huc.setRequestMethod("GET"); 
	 InputStream   in = url.openStream(); 
	 InputStreamReader inr = new InputStreamReader(in);
	 BufferedReader br = new BufferedReader(inr);
	 StringBuffer sb = new StringBuffer();
	 String tmpline = "";
	 while((tmpline =br.readLine())!=null) {
		  
		 sb.append(tmpline);
	 }
	 
	 inr.close();
	 in.close();
	 
	 String jsonStr = sb.toString();
	 //System.out.println(city+":"+keyWord+"#"+jsonStr.toString());
	 int n =0;
	 while(jsonStr.contains("\"ext\"")) {
		 String tmpImage = jsonStr.substring(jsonStr.indexOf("\"image\""));
		 tmpImage =tmpImage.substring(0,tmpImage.indexOf("\",")+2);
 
		 int ext = jsonStr.indexOf("\"ext\"");
		 int ext_type = jsonStr.indexOf("\"ext_type\"",ext)+10;
		 if(ext<ext_type&&n<rn) {
			 n++;
		     jsonStr = jsonStr.substring(0, ext)+tmpImage+"\"tmpmytype\""+jsonStr.substring(ext_type);
		 }else {
			 break;
		 }
		// System.out.println(jsonStr);   
		 
	 }


	JSONObject obj = JSONObject.parseObject(jsonStr);
	JSONArray content = (JSONArray)obj.get("content");
	if(content==null) {
		//百度查询不到
		throw new RuntimeException("百度查询不到"+city+":"+keyWord);
	}
	
	 for(int i=0;i<content.size();i++) {
		 JSONObject o =(JSONObject) content.get(i);
		 String uid = o.getString("uid");
		 String addr =notNull( o.getString("addr"));
		 String area =notNull( o.getString("area"));
		 String area_name =notNull( o.getString("area_name"));
		 String name =notNull( o.getString("name"));
		 String catalogID =notNull( o.getString("catalogID"));
		 String std_tag =notNull( o.getString("std_tag"));
		 String x = notNull(o.getString("x"));
		 String y = notNull(o.getString("y"));
		 //String image =notNull( o.getString("image"));
		 String image = "default.jpg";
		 addr = addr.replace("'", "\'");
		 String shape = downloadShape(uid);
		 double[] point = PointUtil.convertMC2LL(new double[] {Double.parseDouble(x)/100,Double.parseDouble(y)/100}); 
		 double lon = point[0];
		 double lat = point[1];			 			 
		 if(isNull(uid)) {
			 continue;
		 }
		 String data = uid + "\t" + addr  + "\t" + area + "\t" +  area_name + "\t" +  name + "\t" +  catalogID + "\t" +  std_tag + "\t" +  lon + "\t" +  lat + "\t" +  image + "\t" +  shape;
		 Writer.write(data + "\n");
		 Writer.flush(); 
	 }
	
	 
}
 private String notNull(Object o) {
	 if(o==null) {
		 return "";
	 }else {
		 return o.toString().trim();
	 }
 }
 
 private boolean isNull(Object o) {
	 if(o==null) {
		 return true;
	 }else {
		 if(o.toString().trim().equals("")) {
			 return true;
		 }else {
			 return false;
		 }
	 }
	 
 }
 private String downloadShape(String uid) {
	 StringBuffer shpStr = new StringBuffer();
	 try {
	 String urlStr = "http://map.baidu.com/?reqflag=pcmap&from=webmap&qt=ext&uid="+uid+"&ext_ver=new&l=18";
	 URL url   = new URL(urlStr);
	 HttpURLConnection  huc = (HttpURLConnection)url.openConnection(); 
	 huc.setConnectTimeout(10000); 
	 huc.setReadTimeout(10000); 
	 huc.setRequestMethod("GET"); 
	 InputStream   in = url.openStream(); 
	 InputStreamReader inr = new InputStreamReader(in);
	 BufferedReader br = new BufferedReader(inr);
	 StringBuffer sb = new StringBuffer();
	 String tmpline = "";
	 while((tmpline =br.readLine())!=null) {
		  
		 sb.append(tmpline);
	 }
	 
	 inr.close();
	 in.close();
	 
	 JSONObject obj = JSONObject.parseObject(sb.toString());
	 JSONObject content = obj.getJSONObject("content");
	 String geo = content.getString("geo");
	 if(geo!=null) {
		 if(geo.indexOf("|1-")>0) {
			 geo= geo.substring(geo.indexOf("|1-")+3);
			 geo= geo.substring(0, geo.indexOf(";"));
			String[]  pointStrArr = geo.split(",");
			if(pointStrArr.length%2==0) {
				shpStr.append("[");
				for(int i =0;i<pointStrArr.length;i+=2) {
					if(i!=0) {
						 
						shpStr.append(",");
					}
					double[] point = PointUtil.convertMC2LL(new double[] {Double.parseDouble(pointStrArr[i]),Double.parseDouble(pointStrArr[i+1])});
					shpStr.append("{\"lng\":"+point[0]+",\"lat\":"+point[1]+"}");
					
				}
				shpStr.append("]");
			}
		 }
		 return shpStr.toString();
	 }
	 }catch (Exception e) {
		// TODO: handle exception
	}
	 return "";
 }
 
private String downloadImage(String uid,String imageUrl ) {
	if(isNull(imageUrl)) {
		return "default.jpg";
	}
	
	try {
		 File f  = new File(this.outImagePath+"/"+uid+".jpg") ; 
		 if(!f.exists()) {
	 URL url   = new URL(imageUrl);
	 HttpURLConnection  huc = (HttpURLConnection)url.openConnection(); 
	 huc.setConnectTimeout(10000); 
	 huc.setReadTimeout(10000); 
	 huc.setRequestMethod("GET"); 
	 InputStream   in = url.openStream(); 
 
	 byte[] buffer = new byte[4096]; 
	   int bytes_read; 
	   
	    f.createNewFile();
	    FileOutputStream   fos  = new FileOutputStream(f); 
	 while ((bytes_read = in.read(buffer)) != -1) 
	   { 
		 fos.write(buffer,0,bytes_read); 
	   }
	 fos.flush();
	  fos.close();
	 in.close();
	 
		 }
	}catch (Exception e) {
		// TODO: handle exception

// e.printStackTrace();
return “default.jpg”;
}
return uid+".jpg";
}

public static void main(String[] args) throws IOException {
	// TODO Auto-generated method stub 		
	
	DownloadBD2 dd = new DownloadBD2("E:\\dichanxiaoqu\\村级地名.txt","E:\\data\\");
	try {
		dd.download();
	} catch (Exception e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}	
}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值