Writing a Crawler to Fetch Nationwide Region Data

The company needed region data for the whole country. After searching the web for ages, the only usable source I found was the National Bureau of Statistics site used below. So what now? We're programmers, so let's just write a crawler!



Let's look at a test run:

public static void main(String[] args) throws IOException {
		
		//Build a page processor
		PageProcessor processor = new NetworkProcessor();
		//Build two data extractors, dedicated to pulling out city/region data
		AnalyHandler analy = new CityUrlExtractHandler();
		analy.setAnalyHandler(new CityNameExtractHandler());
		//Build a page data filter
		PageFilter filter = new DefaultPageFilter();
		
		//Build a page object
		Page page = new Page();
		page.setRegex("<td>.*?</td>");//extraction rule for the whole page
		page.setUrl("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html");//URL to crawl
		page.setLevel(1);	//page depth starts at 1
		
		//Build the crawler launcher
		InvokeCenter ic = new InvokeCenter(processor, analy);
		ic.setPageFilter(filter);
		
		//Collect the crawl result
		List<Map<String,String>> result = ic.start(page);
		String fileDir = "C:\\Users\\Administrator\\Desktop\\cityData.txt";
		
		fullDisk(result,fileDir);
	}
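Each extracted row is printed as the crawler goes (the System.out.println(map) call in InvokeCenter.afterProcess further down), so a successful run fills the console with lines roughly like the one below. The values and key order here are illustrative only; the real ID is the 32-character UUID hex produced by createId, and NUMBER counts up from 100002:

{CITY_NAME=北京市, NUMBER=100002, LEVEL=1, ID=<32-char-uuid-hex>, PARENT_ID=null}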


This took a little over two hours to write and it feels pretty decent. Only the InvokeCenter crawler launcher still needs changes so it can support crawling arbitrary sites; I'm not going to do that here, but if you grab my code, feel free to try it yourself!

Alright, here's the full code!

package com.fyrj.compoment.crawler;

/***
 * Represents an abstract web page
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public class Page {
	
	/***
	 * Page URL
	 */
	private String url ; 
	
	/***
	 * Raw page data (HTML source)
	 */
	private String viewData;
	
	/***
	 * Regular expression used to extract data from this page
	 */
	private String regex;
	
	/***
	 * Page depth
	 */
	private int level;

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public String getViewData() {
		return viewData;
	}

	public void setViewData(String viewData) {
		this.viewData = viewData;
	}

	public String getRegex() {
		return regex;
	}

	public void setRegex(String regex) {
		this.regex = regex;
	}

	public int getLevel() {
		return level;
	}

	public void setLevel(int level) {
		this.level = level;
	}
	
	
}


package com.fyrj.compoment.crawler;

import java.util.UUID;

/***
 * Static number and ID generator
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public class NumberAndIdGenerator {
	private static int number = 100001;
	
	private NumberAndIdGenerator() {}
	
	/***
	 * Singleton implemented via a static inner class
	 * @author ying.cai
	 * @email 919126624@qq.com
	 * @version 1.0
	 */
	public static class Inner{
		private static NumberAndIdGenerator instance = new NumberAndIdGenerator();
	}
	
	public static NumberAndIdGenerator getInstence(){
		return Inner.instance;
	}
	/***
	 * Generates a sequential number
	 * @return
	 */
	public static String createNumber(){
		return ++number+"";
	}
	
	/***
	 * Generates an ID
	 * @return
	 */
	public static String createId(){
		return UUID.randomUUID().toString().replaceAll("-", "");
	}
	
}


package com.fyrj.compoment.crawler;

/***
 * Interface for fetching page data
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public interface PageProcessor {
	
	Page resolverPage(Page page);

}


package com.fyrj.compoment.crawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/***
 * Fetches page data over the network
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public class NetworkProcessor implements PageProcessor {

	@Override
	public Page resolverPage(Page page) {
		String urlStr = page.getUrl();
		if( urlStr == null || urlStr.equals("")){
			throw new RuntimeException("page url must not be null!");
		}
		try{
			
			URL url = new URL(urlStr);
			HttpURLConnection urlConnection = (HttpURLConnection)url.openConnection();
			int responsecode=urlConnection.getResponseCode();
			BufferedReader reader = null;
			StringBuffer bufferData = new StringBuffer();
	        if(responsecode==200){
	        	try{
	        		//the site serves GBK-encoded pages
	        		reader=new BufferedReader(new InputStreamReader(urlConnection.getInputStream(),"GBK"));
	        		String lineData = null;
	        		while( null!=(lineData=reader.readLine())){
	        			bufferData.append(lineData);
	        		}
	        	}finally{
	        		if(reader!=null){
	        			reader.close();
	        		}
	        	}
	        }else{
	        	throw new IOException("Failed to fetch page source, server response code: "+responsecode+", URL: "+urlStr);
	        }
	        page.setViewData(bufferData.toString());
		}catch(Exception e){
	    	throw new RuntimeException("Failed to fetch page source, exception occurred:",e);
	    }
	    
        return page;
	}
	
}


package com.fyrj.compoment.crawler;

import java.util.Map;

/***
 * Rule definition for dynamic data matching
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public interface RuleDefinition {
	
	/***
	 * The regex that defines this rule
	 * @return
	 */
	String getRegex();
	
	/***
	 * Parses the matched line data and fills the result map
	 * @param lineData
	 * @param result
	 * @return
	 */
	boolean fullData( String lineData, Map<String,String> result );
}


package com.fyrj.compoment.crawler;

/***
 * Page parsing filter
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public abstract class  PageFilter {
	protected PageFilter pageFilter ; 
	
	public Page doFilter(Page page){
		this.filter(page);
		if(this.pageFilter!=null){
			pageFilter.filter(page);
		}
		return page;
	};
	
	protected abstract void filter(Page page);
}


package com.fyrj.compoment.crawler;

/***
 * A filter specific to this region-data site
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public class DefaultPageFilter extends PageFilter {

	@Override
	public void filter(Page page) {
		if( page.getLevel() >1 ){
			page.setRegex("<td>.*?</td><td>.*?</td>");
			page.setViewData(page.getViewData()
					.replaceAll("<td>名称</td>", "")
					.replaceAll("<td>\\D</td>", "")
					);
		}
		
	}

}


package com.fyrj.compoment.crawler;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/***
 * Data extractor
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public abstract class AnalyHandler {
	
	protected String dataStr;
	protected AnalyHandler analyHandler ; 
	
	//Handlers are invoked in a chain until processing is done
	public Map<String,String> analyStart(){
		Map<String,String> result = new HashMap<>();
		analyDefine(result);
		if( this.analyHandler!=null ){
			analyHandler.setDataStr(dataStr);
			analyHandler.analyDefine(result);
		}
		return result;
	}
	
	protected Map<String,String> analyDefine( Map<String,String> result ){
		//Walk the rule chain until one rule extracts data; if none of them matches, just give up
		List<RuleDefinition> rList = getRuleDefinitionChain();
		for (RuleDefinition ruleDefinition : rList) {
			String regex = ruleDefinition.getRegex();
			List<String> list = new ArrayList<String>();
			Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ);
			Matcher ma = pa.matcher(dataStr);
			while (ma.find()){
				list.add(ma.group());
			}
			if(list.size()>0){
				String lineData = list.get(0);
				boolean fullResult = ruleDefinition.fullData(lineData, result);
				//If this rule extracted something, stop here; otherwise fall through to the next rule
				if(fullResult){
					return result;
				}
			}
		}
		return result;
	}
	
	
	public void setAnalyHandler( AnalyHandler analyHandler){
		this.analyHandler = analyHandler;
	}
	
	public void setDataStr( String dataStr){
		this.dataStr = dataStr;
	};

	abstract List<RuleDefinition> getRuleDefinitionChain();
}


package com.fyrj.compoment.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/***
 * Extracts the city/region name
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public class CityNameExtractHandler extends AnalyHandler{

	@Override
	List<RuleDefinition> getRuleDefinitionChain() {
		List<RuleDefinition> rList = new ArrayList<>();
		//Rule 1: works for first-level (province) entries
		rList.add(new RuleDefinition() {
			@Override
			public String getRegex() {
				return "html'>.*?<br/>";
			}
			
			@Override
			public boolean fullData(String lineData, Map<String, String> result) {
				String cityName = lineData.substring(lineData.lastIndexOf("html'>")+6,lineData.lastIndexOf("<br/>"));
				if(cityName==null || "".equals(cityName)){
					//Nothing extracted, so this rule failed
					return false;
				}
				result.put("CITY_NAME", cityName);
				return true;
			}
		});
		
		//Rule 2: works for intermediate-level entries
		rList.add(new RuleDefinition() {
			@Override
			public String getRegex() {
				return "html'>\\D*?</a>";
			}
			
			@Override
			public boolean fullData(String lineData, Map<String, String> result) {
				String cityName = lineData.substring(lineData.lastIndexOf("html'>")+6,lineData.lastIndexOf("</a>"));
				if(cityName==null || "".equals(cityName)){
					//Nothing extracted, so this rule failed
					return false;
				}
				result.put("CITY_NAME", cityName);
				return true;
			}
		});
		
		//Rule 3: works for the lowest-level entries
		rList.add(new RuleDefinition() {
			@Override
			public String getRegex() {
				return "<td>\\D*?</td>";
			}
			
			@Override
			public boolean fullData(String lineData, Map<String, String> result) {
				String cityName = lineData.substring(lineData.lastIndexOf("<td>")+4,lineData.lastIndexOf("</td>"));
				if(cityName==null || "".equals(cityName)){
					//Nothing extracted, so this rule failed
					return false;
				}
				result.put("CITY_NAME", cityName);
				return true;
			}
		});
		return rList;
	}

}


package com.fyrj.compoment.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/***
 * Extracts the city URL
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public class CityUrlExtractHandler extends AnalyHandler{

	@Override
	List<RuleDefinition> getRuleDefinitionChain() {
		List<RuleDefinition> rList = new ArrayList<>();
		rList.add(new RuleDefinition() {
			@Override
			public String getRegex() {
				return "<a href='.*?\\.html'";
			}
			
			@Override
			public boolean fullData(String lineData, Map<String, String> result) {
				String url = lineData.substring(lineData.lastIndexOf("href='")+6,lineData.lastIndexOf(".html")+5);
				if(url==null || "".equals(url)){
					//Nothing extracted, so this rule failed
					return false;
				}
				result.put("URL",url);
				return true;
			}
		});
		return rList;
	}
}


package com.fyrj.compoment.crawler;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/***
 * Crawler launcher
 * @author ying.cai
 * @email 919126624@qq.com
 * @version 1.0
 */
public class InvokeCenter {
		
	//Page processor, responsible for fetching pages
	private PageProcessor processor;
	
	//Data extraction chain, applied to each matched line of data
	private AnalyHandler analyHandler;
	
	//Page filter, used to adjust the regex and the page data per depth
	private PageFilter pageFilter;
	
	
	public InvokeCenter( PageProcessor processor,AnalyHandler analy ) {
		this.processor = processor;
		this.analyHandler = analy;
	}
	
	public void setPageFilter(PageFilter pageFilter){
		this.pageFilter = pageFilter;
	}
	
	
	public List<Map<String,String>> start( Page page){
		List<Map<String,String>> result = new ArrayList<Map<String,String>>();
		resolverPage(page);
		result = capacityPageData(page,result,null);
		return result;
	} 
	
	public void resolverPage(Page page){
		page = processor.resolverPage(page);
	}
	
	/***
	 * Parses a page recursively, depth first. The crawl depth lives on the Page itself,
	 * since the matching rule may need to change at different depths.
	 * @param page the abstract page to parse
	 * @param result the accumulated result rows
	 * @param parentId ID of the parent row
	 * @return
	 */
	public List<Map<String,String>> capacityPageData(Page page,List<Map<String,String>> result,String parentId){
		//Adjust the data-matching rule; this is done via the page filter
		if( null!=pageFilter ){
			pageFilter.doFilter(page);
		}
		//Extract the page data with the regex
	    List<String> list = new ArrayList<String>();
	    if( page.getRegex()==null || "".equals(page.getRegex()) ){
	    	throw new RuntimeException("page regex must not be null!");
	    }
	    Pattern pa = Pattern.compile(page.getRegex(), Pattern.CANON_EQ);
	    Matcher ma = pa.matcher(page.getViewData());
	    while (ma.find()){
	      list.add(ma.group());
	    }
	    
	    //Now analyze each matched line and turn it into the map structure we need
	    for (String lineStr : list) {
	    	
	    	analyHandler.setDataStr(lineStr);
	    	//Get the data map produced by the chain of extractors
	    	Map<String,String> map = analyHandler.analyStart();
	    	
	    	afterProcess(result, page.getLevel(), parentId, map);
	    	
	    	//If a URL was extracted, recurse into it
	    	if( map.containsKey("URL") ){
	    		//This line really ought to be extracted, since the URL construction rule changes from site to site!
	    		String newUrl = page.getUrl().substring(0,page.getUrl().lastIndexOf("/")+1) + map.get("URL");
	    		
	    		Page newPage = new Page();
	    		newPage.setUrl(newUrl);
	    		newPage.setRegex(page.getRegex());
	    		//Go one level deeper: page depth + 1
	    		newPage.setLevel(page.getLevel()+1);
	    		resolverPage(newPage);
	    		capacityPageData(newPage, result,map.get("ID"));
	    	}
	    }
	    return result;
	}

	/***
	 * Post-processing for each extracted row
	 * @param result
	 * @param level
	 * @param parentId
	 * @param map
	 */
	private void afterProcess(List<Map<String, String>> result, int level, String parentId, Map<String, String> map) {
		if(map.get("CITY_NAME")!=null){
			//Enrich the extracted row a little
			map.put("ID", NumberAndIdGenerator.getInstence().createId());
			map.put("NUMBER", NumberAndIdGenerator.getInstence().createNumber());
			map.put("LEVEL", level+"");
			map.put("PARENT_ID", parentId);
			result.add(map);
			System.out.println(map);
		}
	}
	
}



package com.fyrj.compoment.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;

public class Test {
	
	//column names use backticks so the generated INSERT is valid SQL
	public static final String SQL_TEMPLATE = "insert into table_name (`id`,`name`,`number`,`level`,`parent_id`)"
	+" values ('#ID#','#NAME#','#NUMBER#','#LEVEL#','#PARENT_ID#') ; ";
	
	
	public static void main(String[] args) throws IOException {
		
		//Build a page processor
		PageProcessor processor = new NetworkProcessor();
		//Build two data extractors, dedicated to pulling out city/region data
		AnalyHandler analy = new CityUrlExtractHandler();
		analy.setAnalyHandler(new CityNameExtractHandler());
		//Build a page data filter
		PageFilter filter = new DefaultPageFilter();
		
		//Build a page object
		Page page = new Page();
		page.setRegex("<td>.*?</td>");//extraction rule for the whole page
		page.setUrl("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html");//URL to crawl
		page.setLevel(1);	//page depth starts at 1
		
		//Build the crawler launcher
		InvokeCenter ic = new InvokeCenter(processor, analy);
		ic.setPageFilter(filter);
		
		//Collect the crawl result
		List<Map<String,String>> result = ic.start(page);
		String fileDir = "C:\\Users\\Administrator\\Desktop\\cityData.txt";
		
		fullDisk(result,fileDir);
	}
	
	/***
	 * Writes the result to disk
	 * @param result
	 * @throws IOException 
	 */
	public static void fullDisk(List<Map<String,String>> result,String fileDir) throws IOException{
		BufferedWriter writer = new BufferedWriter(new FileWriter(new File(fileDir)));
		try {
			for (Map<String, String> map : result) {
				writer.write( findFinalStr(map) );
				writer.newLine();
				writer.flush();
			}
		} finally{
			if( writer!=null ){
				writer.close();
			}
		}
	}
	
	public static String findFinalStr( Map<String, String> map ){
		String finalStr = Test.SQL_TEMPLATE.replaceAll("#ID#", map.get("ID"))
				.replaceAll("#NAME#", map.get("CITY_NAME"))
				.replaceAll("#NUMBER#", map.get("NUMBER"))
				.replaceAll("#LEVEL#", map.get("LEVEL"))
				.replaceAll("#PARENT_ID#", map.get("PARENT_ID")==null?"NULL":map.get("PARENT_ID"));
		return finalStr;
	}
}
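For reference, each line that fullDisk writes to cityData.txt is one INSERT statement built from SQL_TEMPLATE, so a first-level row would come out roughly like the line below (the placeholder stands in for the real 32-character UUID hex):

insert into table_name (`id`,`name`,`number`,`level`,`parent_id`) values ('<uuid-hex>','北京市','100002','1','NULL') ;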


Apologies for any rough spots; I didn't put too much thought into this, but it does satisfy the need of building nationwide region data. You can still modify InvokeCenter so it dynamically supports all kinds of sites; a minimal sketch of one way to do that follows.
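For example, the hard-coded child-URL line in capacityPageData could be pulled out into a small strategy interface. This is only a sketch of the idea; the UrlBuilder name and the wiring described in the comments are hypothetical and not part of the code above.

package com.fyrj.compoment.crawler;

/***
 * Hypothetical strategy for building the URL of a child page; not part of the original code.
 */
public interface UrlBuilder {
	/***
	 * Builds the absolute URL of a child page from the current page and the extracted link.
	 */
	String buildChildUrl(Page currentPage, String extractedUrl);
}

/***
 * Default implementation reproducing the behavior currently hard-coded in InvokeCenter:
 * resolve the extracted link relative to the current page's directory.
 */
class RelativeUrlBuilder implements UrlBuilder {
	@Override
	public String buildChildUrl(Page currentPage, String extractedUrl) {
		String base = currentPage.getUrl();
		return base.substring(0, base.lastIndexOf("/") + 1) + extractedUrl;
	}
}

//InvokeCenter would then hold a UrlBuilder field (set via a new setter) and replace
//    String newUrl = page.getUrl().substring(0,page.getUrl().lastIndexOf("/")+1) + map.get("URL");
//with
//    String newUrl = urlBuilder.buildChildUrl(page, map.get("URL"));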




