Java WebMagic爬虫爬取最新全国省市区域信息

1:POM文件依赖

		<!-- MySQL JDBC driver; the crawler loads com.mysql.jdbc.Driver and opens a jdbc:mysql connection. -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.36</version>
		</dependency>
		<!-- Apache Commons Lang; the processor imports org.apache.commons.lang3.math.NumberUtils. -->
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.7</version>
		</dependency>
		<!-- WebMagic crawler framework; the code imports us.codecraft.webmagic classes
		     (Page, Site, Spider, PageProcessor, Pipeline, ResultItems, Task). -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-core</artifactId>
			<version>0.7.3</version>
		</dependency>
		<!-- NOTE(review): no class shown in this article is obviously extension-only;
		     confirm whether webmagic-extension is actually required. -->
		<dependency>
			<groupId>us.codecraft</groupId>
			<artifactId>webmagic-extension</artifactId>
			<version>0.7.3</version>
		</dependency>

2:Main方法

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.math.NumberUtils;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} that crawls the 2017 administrative-division
 * pages of stats.gov.cn (province index, city pages, county/area pages) and,
 * from {@link #main(String[])}, stores the collected regions in the
 * {@code bus_region} table.
 *
 * <p>Each page contributes one result field ({@code "provinces"},
 * {@code "cities"} or {@code "areas"}) holding a list of {@code TargetModel};
 * assembling them into a tree is left to the pipeline.
 *
 * @author LinYingQiang
 * @date 2018-07-18 11:51:08 AM
 * @mobile
 */
public class StatisticsPageProcessor implements PageProcessor {

	static String driver = "com.mysql.jdbc.Driver";
	static String url = "jdbc:mysql://***.***.***:3306/wool_trade?characterEncoding=utf8&useSSL=false";
	static String username = "***";
	static String password = "***";
	static Connection conn = null;

	/** Captures the numeric code that ends a link such as {@code .../11/1101.html}. */
	private static final Pattern TRAILING_ID = Pattern.compile("(\\d+)\\.html$");

	/** Insert statement shared by province, city and area rows. */
	private static final String INSERT_REGION_SQL =
			"insert into `bus_region`(`id`,`name`,`p_id`) VALUES (?, ?, ?)";

	static {
		try {
			Class.forName(driver); // load the JDBC driver through the class loader
			conn = DriverManager.getConnection(url, username, password);
		} catch (ClassNotFoundException | SQLException e) {
			// conn stays null on failure; main() checks for that before writing.
			e.printStackTrace();
		}
	}

	/**
	 * @return crawl settings: 3 retries and a 1500 ms sleep time
	 */
	@Override
	public Site getSite() {
		return Site.me().setRetryTimes(3).setSleepTime(1500);
	}

	/**
	 * Extracts region models from a province index, city or area page and
	 * publishes them under {@code "provinces"}, {@code "cities"} or
	 * {@code "areas"}. Province and city links are queued for crawling; area
	 * links are not followed.
	 *
	 * @param page the downloaded page
	 */
	@Override
	public void process(Page page) {
		String pageUrl = page.getUrl().get();
		if (pageUrl.contains("index.html")) { // province index
			List<String> names = page.getHtml().xpath("tr[@class='provincetr']/td/a/text()").all();
			List<String> links = page.getHtml().xpath("tr[@class='provincetr']/td/a").links().all();
			// Provinces are roots, so their parent id is 0.
			page.putField("provinces", buildModels(page, names, links, 0, true));
		} else if (page.getUrl().regex("http://www\\.stats\\.gov\\.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/[0-9]*.html").match()) { // city level
			List<String> names = page.getHtml().xpath("tr[@class='citytr']/td[2]/a/text()").all();
			List<String> links = page.getHtml().xpath("tr[@class='citytr']/td[2]/a").links().all();
			// The page's own URL carries the parent province code.
			page.putField("cities", buildModels(page, names, links, parseTrailingId(pageUrl), true));
		} else if (page.getUrl().regex("http://www\\.stats\\.gov\\.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/[0-9]*/[0-9]*.html").match()) { // area level
			List<String> names = page.getHtml().xpath("tr[@class='countytr']/td[2]/a/text()").all();
			List<String> links = page.getHtml().xpath("tr[@class='countytr']/td[2]/a").links().all();
			// The page's own URL carries the parent city code.
			page.putField("areas", buildModels(page, names, links, parseTrailingId(pageUrl), false));
		}
	}

	/**
	 * Builds one model per (name, link) pair.
	 *
	 * @param page     page on which follow-up requests are registered
	 * @param names    link texts, in document order
	 * @param links    link targets, expected to be parallel to {@code names}
	 * @param parentId id stored as the parent of every model; may be null when
	 *                 it could not be parsed from the page URL
	 * @param follow   whether each link is added as a target request
	 * @return the models; empty when the two lists differ in length, because
	 *         names and links could then not be paired reliably
	 */
	private static List<TargetModel> buildModels(Page page, List<String> names, List<String> links,
			Integer parentId, boolean follow) {
		List<TargetModel> models = new ArrayList<>();
		if (names.size() != links.size()) {
			System.err.println("Skipping " + page.getUrl().get() + ": " + names.size()
					+ " names but " + links.size() + " links");
			return models;
		}
		for (int i = 0; i < names.size(); i++) {
			String link = links.get(i);
			TargetModel model = new TargetModel();
			Integer id = parseTrailingId(link);
			if (id != null) {
				model.setId(id);
			}
			model.setpId(parentId);
			model.setName(names.get(i));
			model.setUrls(link);
			models.add(model);
			if (follow) {
				page.addTargetRequest(link);
			}
		}
		return models;
	}

	/**
	 * Parses the numeric code that precedes {@code .html} at the end of a URL.
	 *
	 * @param link URL to inspect
	 * @return the code, or null when the URL does not end in
	 *         {@code <digits>.html} or the digits do not fit in an int
	 */
	private static Integer parseTrailingId(String link) {
		Matcher matcher = TRAILING_ID.matcher(link);
		if (!matcher.find()) {
			return null;
		}
		try {
			// The pattern guarantees an all-digit group, so only overflow can fail here.
			return Integer.valueOf(matcher.group(1));
		} catch (NumberFormatException e) {
			return null;
		}
	}

	/**
	 * Binds one region to the shared insert statement and executes it. Regions
	 * without an id or parent id are reported and skipped instead of failing on
	 * unboxing a null.
	 */
	private static void insertRegion(PreparedStatement statement, TargetModel region) throws SQLException {
		if (region.getId() == null || region.getpId() == null) {
			System.err.println("Skipping region without id/parent id: " + region.getName());
			return;
		}
		statement.setInt(1, region.getId());
		statement.setString(2, region.getName());
		statement.setInt(3, region.getpId());
		statement.executeUpdate();
	}

	/**
	 * Runs the crawl to completion, then inserts every province, city and area
	 * collected by the pipeline into {@code bus_region}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		CustomPipeline customPipeline = new CustomPipeline();
		Spider spider = Spider.create(new StatisticsPageProcessor());
		spider.addUrl("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html");
		spider.addPipeline(customPipeline);
		// run() crawls on the calling thread and returns once the spider has
		// finished, so no polling of the spider status is needed.
		spider.run();

		List<TargetModel> provinces = customPipeline.provinces;
		if (provinces == null) {
			System.err.println("No provinces were crawled; nothing to insert.");
			return;
		}
		if (conn == null) {
			System.err.println("No database connection available; see the earlier stack trace.");
			return;
		}

		// Example row:
		// INSERT INTO `wool_trade`.`bus_region` (`id`, `name`, `p_id`) VALUES ('1', '北京市', '0');
		try (PreparedStatement statement = conn.prepareStatement(INSERT_REGION_SQL)) {
			for (TargetModel province : provinces) {
				insertRegion(statement, province);
				// cities of this province
				for (TargetModel city : province.getChilds()) {
					insertRegion(statement, city);
					// areas of this city
					for (TargetModel area : city.getChilds()) {
						insertRegion(statement, area);
					}
				}
			}
		} catch (SQLException e) {
			// Report the failure instead of silently dropping it.
			e.printStackTrace();
		}
	}
}

3:Pipeline类

import java.util.List;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

/**
 * Pipeline that assembles the crawled regions into a province / city / area
 * tree held in {@link #provinces}.
 *
 * <p>It reads the {@code "provinces"}, {@code "cities"} and {@code "areas"}
 * result fields and attaches each city or area to the already-known parent
 * whose id equals the child's parent id.
 *
 * @author LinYingQiang
 * @date 2018-07-18 11:52:12 AM
 * @mobile
 */
public class CustomPipeline implements Pipeline {

	/** Root of the collected tree; null until a page delivered a "provinces" field. */
	List<TargetModel> provinces;

	/**
	 * Merges the result fields of one page into the region tree.
	 *
	 * @param resultItems fields published by the page processor
	 * @param task        the running task (unused)
	 */
	@Override
	public void process(ResultItems resultItems, Task task) {
		List<TargetModel> crawledProvinces = resultItems.get("provinces");
		if (crawledProvinces != null) {
			provinces = crawledProvinces;
		}

		// Cities and areas can only be attached once provinces are known; this
		// also avoids dereferencing a null list when such a page arrives first.
		if (provinces == null || provinces.isEmpty()) {
			return;
		}

		List<TargetModel> cities = resultItems.get("cities");
		if (cities != null) {
			for (TargetModel province : provinces) {
				attachChildren(province, cities);
			}
		}

		List<TargetModel> areas = resultItems.get("areas");
		if (areas != null) {
			for (TargetModel province : provinces) {
				for (TargetModel city : province.getChilds()) {
					attachChildren(city, areas);
				}
			}
		}
	}

	/**
	 * Adds to {@code parent} every candidate whose parent id equals the
	 * parent's id. A parent without an id matches nothing.
	 */
	private static void attachChildren(TargetModel parent, List<TargetModel> candidates) {
		if (parent.getId() == null) {
			return;
		}
		for (TargetModel candidate : candidates) {
			if (parent.getId().equals(candidate.getpId())) {
				parent.getChilds().add(candidate);
			}
		}
	}
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值