利用Gecco爬取(蚂蚁短租网)列表页数据

利用Gecco爬取(蚂蚁短租网)数据

  • 代码运行效果:

<dependency>
			<groupId>com.geccocrawler</groupId>
			<artifactId>gecco</artifactId>
			<version>1.1.0</version>
		</dependency>

		<dependency>
			<groupId>com.geccocrawler</groupId>
			<artifactId>gecco-htmlunit</artifactId>
			<version>1.0.5</version>
		</dependency>

		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.38</version>
		</dependency>

		<dependency>
			<groupId>com.belerweb</groupId>
			<artifactId>pinyin4j</artifactId>
			<version>2.5.0</version>
		</dependency>
		
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.34</version>
		</dependency>
  • 代码工程结构(如下图)

1.蚂蚁短租网:城市列表常量类:

package org.ssgroup.spider.constant;

import java.util.LinkedHashMap;
import java.util.Map;

/**
 * Static catalogue of the cities and location types crawled from the Mayi
 * short-rental site, plus the hand-off flag shared with the crawler threads.
 *
 * @date 2018-11-09
 */
public class MaYiAllCityConstant {
	
	/**
	 * Hand-off flag between the launcher and the list pipeline: the pipeline
	 * sets it to {@code true}, the launcher polls it and resets it to
	 * {@code false} before starting the next batch of cities.
	 * <p>
	 * Declared {@code volatile} because it is written by a crawler thread and
	 * read in a polling loop by another thread; without it the polling thread
	 * is not guaranteed to ever observe the update.
	 */
	public static volatile boolean on = true;
	
	/**
	 * All Mayi cities, grouped by initial-letter range. Insertion order is
	 * preserved; the first key ("holdCity") holds the popular domestic cities.
	 */
	public static Map<String,String[]> ALL_CITY = new LinkedHashMap<String,String[]>();
	/**
	 * Mayi location types, in the order the list pipeline numbers them (1..7).
	 * NOTE(review): some entries carry a trailing space; the pinyin helper trims
	 * its input, but the raw name is what gets stored — confirm whether the
	 * trailing space is intended.
	 */
	public static String[] CITY_LOCATION = new String[] {"商圈","景点 ","行政区 ","车站机场 ","地铁","学校","医院"};
	
	static {
		// Popular domestic cities
		String[] holdCity = new String[] {"北京","上海","青岛","香港","成都","杭州","台北","三亚","大连","广州","西安","重庆","厦门","秦皇岛","屏东","花莲",
				"深圳","南京","苏州","烟台","威海","天津","北海","武汉","葫芦岛","长沙","哈尔滨","沈阳","昆明","营口"};
		ALL_CITY.put("holdCity", holdCity);
		
		// ABCD
		String[] abcd = new String[] {"鞍山","安阳","阿拉善","澳门","安顺","安庆","安康","阿勒泰",
		"北京","北戴河","北海","保定","本溪","包头","白山","宝鸡","蚌埠","博鳌","保山","百色","巴音郭楞","博尔塔拉","保亭","白沙",
		"成都","重庆","长沙","长春","承德","常州","长白山","赤峰","沧州","郴州","长治","潮州","常德","滁州","池州","昌吉","昌江",
		"大连","东戴河","丹东","大理","东莞","大同","德州","德阳","敦煌","东营","迪庆","都江堰","东方","德宏","定西","儋州","大兴安岭"};
		ALL_CITY.put("ABCD", abcd);
		
		// EFGH
		String[] efgh = new String[] {"洱海","峨眉山","恩施","鄂尔多斯","鄂州",
		"福州","佛山","防城港","凤凰","抚顺","阜阳","抚州","阜新",
		"广州","贵阳","高雄","桂林","赣州","甘孜","广元","贵港","鼓浪屿","甘南",
		"杭州","花莲","葫芦岛","哈尔滨","合肥","海口","惠州","湖州","莫干山","呼和浩特","黄山","呼伦贝尔","横店",
		"邯郸","衡水","淮安","衡阳","黑河","汉中","菏泽","红河","河源","黄龙","海西","海北","贺州","淮北","怀化","河池","黄石","海拉尔",
		"淮南","海东","鹤岗","和田"};
		ALL_CITY.put("EFGH", efgh);
		
		// JKLM
		String[] jklm = new String[] {"济南","基隆","嘉义","金门","嘉兴","锦州","吉林","济宁","九江","江门","焦作","景德镇","佳木斯","荆州","嘉峪关","九寨沟","晋城",
		"吉安","揭阳","鸡西","济源",
		"昆明","开封","克拉玛依","库尔勒",
		"丽江","临高","兰州","洛阳","临沂","乐山","廊坊","连云港","柳州","庐山","凉山","聊城","拉萨","临汾","丽水","六盘水","泸州","龙岩","吕梁",
		"陵水","辽源","陇南","林芝","临夏","乐东","临沧",
		"苗栗","马祖","绵阳","茂名","梅州","眉山","马鞍山"};
		ALL_CITY.put("JKLM", jklm);
		
		// NPQR
		String[] npqr = new String[] {"南戴河","南京","南投","宁波","南宁","南昌","南通","南阳","南充","南平","宁德","内江",
		"屏东","澎湖","普陀山","盘锦","平遥","莆田","平顶山","蓬莱","攀枝花","普洱","萍乡","平凉",
		"青岛","千岛湖","秦皇岛","泉州","清远","黔东南","齐齐哈尔","衢州","黔南","黔西南","曲靖","钦州",
		"日照","日喀则"};
		ALL_CITY.put("NPQR", npqr);
		
		// STW
		String[] stw = new String[] {"上海","上海迪士尼","三亚","深圳","苏州","沈阳","石家庄","绍兴","汕头","上饶","韶关","四平","三明","松原","十堰","神农架","遂宁","石河子",
		"绥化","商洛","随州","三沙",
		"台北","天涯海角","同里","天津","台东","台南","台中","桃园","太原","唐山","泰安","泰山","台州","通化","泰州","天水","通辽","吐鲁番","塔城","屯昌",
		"威海","武汉","温州","无锡","乌鲁木齐","潍坊","武夷山","芜湖","文昌","梧州","渭南","文山","万宁","武威","五指山"};
		ALL_CITY.put("STW", stw);
		
		// XYZ
		String[] xyz = new String[] {"香港","西湖","西安","厦门","新北","新竹","西塘","西宁","徐州","西双版纳","新乡","雪乡","咸阳","邢台","湘西","湘潭","信阳","锡林郭勒","许昌","忻州",
		"宣城","襄樊","兴安","宿迁","咸宁","宿州","孝感",
		"烟台","营口","云林","宜兰","扬州","银川","延边","阳江","宜昌","盐城","宜宾","延安","运城","玉溪","伊春","伊犁","雅安","宜春","岳阳","玉林","榆林",
		"益阳","阳朔","洋浦",
		"周庄","郑州","珠海","彰化","舟山","中山","张家口","张家界","漳州","湛江","淄博","遵义","枣庄","镇江","株洲","肇庆","自贡","张掖","中卫","周口",
		"驻马店","昭通","资阳"};
		ALL_CITY.put("XYZ", xyz);
	}
}

2.蚂蚁短租列表页抓取页面代码:

package org.ssgroup.spider.htmlBean;

import java.util.List;

import org.ssgroup.spider.htmlBean.domain.MaYiRoom;
import org.ssgroup.spider.htmlBean.domain.Page;
import org.ssgroup.spider.htmlBean.domain.list.CarOrAirport;
import org.ssgroup.spider.htmlBean.domain.list.Hospital;
import org.ssgroup.spider.htmlBean.domain.list.OfficeAreas;
import org.ssgroup.spider.htmlBean.domain.list.Offices;
import org.ssgroup.spider.htmlBean.domain.list.ScenicArea;
import org.ssgroup.spider.htmlBean.domain.list.School;
import org.ssgroup.spider.htmlBean.domain.list.ShopLoops;
import org.ssgroup.spider.htmlBean.domain.list.SubWayLine;
import org.ssgroup.spider.htmlBean.domain.list.SubWayStation;

import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.annotation.RequestParameter;
import com.geccocrawler.gecco.annotation.Text;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;

/**
 * Gecco bean describing one Mayi short-rental list page. Every field is
 * populated by Gecco from the request or from the CSS path in its annotation.
 *
 * @author HX-011
 * @date 2018-11-09
 */
@Gecco(matchUrl="http://www.mayi.com/{city}/{code}", pipelines="maYiListPipeline")
public class MaYiListHtmlBean implements HtmlBean{
	private static final long serialVersionUID = -5332646457923675928L;
	
	/** The request that produced this page. */
	@Request
	private HttpRequest request;
	
	/** {@code city} segment of the matched URL. */
	@RequestParameter("city")
	private String city;
	
	/** {@code code} segment of the matched URL (the requested page). */
	@RequestParameter("code")
	private String code;
	
	/** Text of the pager's active link. */
	@Text
	@HtmlField(cssPath="#page > a.pg-active")
	private String page;
	
	/** Hidden pager inputs; the pipeline reads the total page count from the first one. */
	@HtmlField(cssPath="#page > input[type=hidden]")
	private List<Page> pages;
	
	/** All room entries listed on the page. */
	@HtmlField(cssPath="#searchRoom > dd")
	private List<MaYiRoom> room;
	
	/** Business districts (location type 1). */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-business.pr60 > div > span")
	private List<ShopLoops> shopLoops;
	
	/** Scenic spots (location type 2). */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-scenic > div > a")
	private List<ScenicArea> scenicAreas;
	
	/** Administrative districts (location type 3). */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-area > a")
	private List<OfficeAreas> officeAreas;

	/** Sub-areas of the administrative districts. */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-area > div")
	private List<Offices> offices;
	
	/** Stations and airports (location type 4). */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-traffic > div > span")
	private List<CarOrAirport> carOrAirport;
	
	/** Subway lines (location type 5). */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-metro > a")
	private List<SubWayLine> subWayLine;

	/** Stations belonging to the subway lines. */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.result-metro > div")
	private List<SubWayStation> subWayStation;
	
	/** Schools (location type 6). */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-school > div > a")
	private List<School> school;
	
	/** Hospitals (location type 7). */
	@HtmlField(cssPath="#position > div.rt-word.position_choose > div.lever.pr60.result-hospital > div > a")
	private List<Hospital> hospital;

	// ---- request metadata ----

	public HttpRequest getRequest() {
		return this.request;
	}

	public void setRequest(HttpRequest request) {
		this.request = request;
	}

	public String getCity() {
		return this.city;
	}

	public void setCity(String city) {
		this.city = city;
	}

	public String getCode() {
		return this.code;
	}

	public void setCode(String code) {
		this.code = code;
	}

	// ---- paging and rooms ----

	public String getPage() {
		return this.page;
	}

	public void setPage(String page) {
		this.page = page;
	}

	public List<Page> getPages() {
		return this.pages;
	}

	public void setPages(List<Page> pages) {
		this.pages = pages;
	}

	public List<MaYiRoom> getRoom() {
		return this.room;
	}

	public void setRoom(List<MaYiRoom> room) {
		this.room = room;
	}

	// ---- location filters ----

	public List<ShopLoops> getShopLoops() {
		return this.shopLoops;
	}

	public void setShopLoops(List<ShopLoops> shopLoops) {
		this.shopLoops = shopLoops;
	}

	public List<ScenicArea> getScenicAreas() {
		return this.scenicAreas;
	}

	public void setScenicAreas(List<ScenicArea> scenicAreas) {
		this.scenicAreas = scenicAreas;
	}

	public List<OfficeAreas> getOfficeAreas() {
		return this.officeAreas;
	}

	public void setOfficeAreas(List<OfficeAreas> officeAreas) {
		this.officeAreas = officeAreas;
	}

	public List<Offices> getOffices() {
		return this.offices;
	}

	public void setOffices(List<Offices> offices) {
		this.offices = offices;
	}

	public List<CarOrAirport> getCarOrAirport() {
		return this.carOrAirport;
	}

	public void setCarOrAirport(List<CarOrAirport> carOrAirport) {
		this.carOrAirport = carOrAirport;
	}

	public List<SubWayLine> getSubWayLine() {
		return this.subWayLine;
	}

	public void setSubWayLine(List<SubWayLine> subWayLine) {
		this.subWayLine = subWayLine;
	}

	public List<SubWayStation> getSubWayStation() {
		return this.subWayStation;
	}

	public void setSubWayStation(List<SubWayStation> subWayStation) {
		this.subWayStation = subWayStation;
	}

	public List<School> getSchool() {
		return this.school;
	}

	public void setSchool(List<School> school) {
		this.school = school;
	}

	public List<Hospital> getHospital() {
		return this.hospital;
	}

	public void setHospital(List<Hospital> hospital) {
		this.hospital = hospital;
	}
}

3.蚂蚁短租列表页 Pipeline:保存列表页数据入库,并继续抓取后续分页:

package org.ssgroup.spider.service;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.ssgroup.spider.Application;
import org.ssgroup.spider.constant.MaYiAllCityConstant;
import org.ssgroup.spider.htmlBean.MaYiListHtmlBean;
import org.ssgroup.spider.htmlBean.domain.MaYiRoom;
import org.ssgroup.spider.htmlBean.domain.list.CarOrAirport;
import org.ssgroup.spider.htmlBean.domain.list.Hospital;
import org.ssgroup.spider.htmlBean.domain.list.Office;
import org.ssgroup.spider.htmlBean.domain.list.OfficeAreas;
import org.ssgroup.spider.htmlBean.domain.list.Offices;
import org.ssgroup.spider.htmlBean.domain.list.ScenicArea;
import org.ssgroup.spider.htmlBean.domain.list.School;
import org.ssgroup.spider.htmlBean.domain.list.ShopLoops;
import org.ssgroup.spider.htmlBean.domain.list.Station;
import org.ssgroup.spider.htmlBean.domain.list.SubWayLine;
import org.ssgroup.spider.htmlBean.domain.list.SubWayStation;
import org.ssgroup.spider.htmlBean.domain.list.Vehicle;
import org.ssgroup.spider.utils.JdbcUtils;
import org.ssgroup.spider.utils.PinYinUtils;

import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;

@PipelineName("maYiListPipeline")
public class MaYiListPipeline implements Pipeline<MaYiListHtmlBean>{
	
	public void process(MaYiListHtmlBean bean) {
		try {
			List<MaYiRoom> rooms = bean.getRoom();
			if(null!=rooms && rooms.size()>0) {
				String cityPinYin = bean.getCity();
				//保存位置类型
				saveLocation(bean, cityPinYin);
				//保存房源
				saveRooms(rooms,cityPinYin);
				
				//继续抓取分页数据
				String pageStr = StringUtils.isNotBlank(bean.getPage())?bean.getPage():"1";
				int page = Integer.parseInt(pageStr) + 1;
				
				int pageCount = bean.getPages().get(0).getPage();
System.out.println("城市拼音【"+bean.getCity()+"】,第【"+pageStr+"】次分页抓取,总分页数【"+pageCount+"】,每一次抓取数量【"+bean.getRoom().size()+"】");
				if(page>0 && page<=pageCount) {
					HttpRequest request = bean.getRequest();
					String nextUrl = request.getUrl();
					if(StringUtils.isNoneBlank(nextUrl)) {
						String baseUrl = StringUtils.substringBeforeLast(nextUrl, "/");
						nextUrl = baseUrl+"/"+page;
System.out.println("分页请求地址【"+nextUrl+"】");
						SchedulerContext.into(request.subRequest(nextUrl));
					}
				}
				
				//如果最后一页抓取完成,执行下一个城市
				if(page==pageCount) {
					MaYiAllCityConstant.on=true;
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * 保存房源数据
	 * @param rooms
	 * @param city
	 * @throws SQLException
	 */
	private void saveRooms(List<MaYiRoom> rooms,String city) throws SQLException{
		Connection conn = null;
		PreparedStatement pstmt = null;
		try {
			conn = JdbcUtils.getConnection();
			conn.setAutoCommit(false);
			String sql = "INSERT INTO rooms(id,price,house_location,original_url,image_url,title,num_room,num_house,city,city_id) " + 
						 "VALUES(?,?,?,?,?,?,?,?,?,?)";
			pstmt = conn.prepareStatement(sql);
			for(MaYiRoom room : rooms) {
				pstmt.setLong(1, room.getRoomId());
				pstmt.setFloat(2, room.getPrice());
				pstmt.setString(3, room.getPosition());
				pstmt.setString(4, room.getOriginalImageUrl());
				pstmt.setString(5, room.getImageUrl());
				pstmt.setString(6, room.getTitle());
				pstmt.setString(7, room.getRooms());
				pstmt.setString(8, room.getHousing());
				pstmt.setString(9, city);
				pstmt.setLong(10, Application.CITY_CACHE.get(city).getId());
				pstmt.addBatch();
			}
			pstmt.executeBatch();
			conn.commit();
		}catch (Exception e) {
			e.printStackTrace();
			conn.rollback();
		}finally {
			JdbcUtils.close(conn, pstmt, null);
		}
	}
	
	private void saveLocation(MaYiListHtmlBean bean,String city) throws Exception{
		Connection conn = null;
		PreparedStatement pstmt = null;
		ResultSet resultSet = null;
		String sql = "INSERT INTO city_location(name,pin_yin,city_id,city_name,city_pin_yin,parent_id,href) " + 
					 "VALUES(?,?,?,?,?,?,?)";
		try {
			conn = JdbcUtils.getConnection();
			//"商圈","景点 ","行政区 ","车站机场 ","地铁"," 学校"," 医院"
			String[] cityLocation = MaYiAllCityConstant.CITY_LOCATION;
			for(int i=1;i<=cityLocation.length;i++) {
				String location = cityLocation[i-1];
				conn.setAutoCommit(false);
				pstmt = conn.prepareStatement(sql,Statement.RETURN_GENERATED_KEYS);
				
				//城市ID
				String tmpLocaltion = PinYinUtils.convertLower(location);
				Long city_id = Application.CITY_LOCATION_CACHE.get(tmpLocaltion).getId();
				switch (i) {
					case 1:	//商圈
						List<ShopLoops> shopLoop = bean.getShopLoops();
						if(null!=shopLoop && shopLoop.size()>0) {
							for(ShopLoops sl : shopLoop) {
								String name = sl.getLocation();
								addBatch(pstmt, name, city, city_id, sl.getHref());
							}
						}
						break;
					case 2:	//景点
						List<ScenicArea> scenicAreas = bean.getScenicAreas();
						if(null!=scenicAreas && scenicAreas.size()>0) {
							for(ScenicArea scenicArea : scenicAreas) {
								addBatch(pstmt, scenicArea.getScenic(), city, city_id, scenicArea.getHref());
							}
						}
						break;
					case 3:	//行政区
						List<OfficeAreas> officeAreas = bean.getOfficeAreas();
						if(null!=officeAreas && officeAreas.size()>0) {
							for(OfficeAreas officeArea : officeAreas) {
								addBatch(pstmt, officeArea.getOfficeAreas(), city, city_id, null);
							}
						}
						break;
					case 4:	//车站机场 
						List<CarOrAirport> carOrAirport = bean.getCarOrAirport();
						if(null!=carOrAirport && carOrAirport.size()>0) {
							for(CarOrAirport ca : carOrAirport) {
								addBatch(pstmt, ca.getName(), city, city_id, null);
							}
						}
						break;
					case 5:	//地铁
						List<SubWayLine> subWayLine = bean.getSubWayLine();
						if(null!=subWayLine && subWayLine.size()>0) {
							for(SubWayLine swl : subWayLine) {
								addBatch(pstmt, swl.getName(), city, city_id, null);
							}
						}
						break;
					case 6:	//学校
						List<School> school = bean.getSchool();
						if(null!=school && school.size()>0) {
							for(School s : school) {
								addBatch(pstmt, s.getName(), city, city_id, s.getHref());
							}
						}
						break;
					case 7:	//医院
						List<Hospital> hospital = bean.getHospital();
						if(null!=hospital && hospital.size()>0) {
							for(Hospital h : hospital) {
								addBatch(pstmt, h.getName(), city, city_id, h.getHref());
							}
						}
						break;
				}
				
				pstmt.executeBatch();
				conn.commit();
				
				//添加子节点
				if(i==3||i==4||i==5) {
					//获取结果  自增ID
					ResultSet rs = pstmt.getGeneratedKeys(); 
					List<Long> list = new ArrayList<Long>();   
					while(rs.next()) {  
						list.add(rs.getLong(1));//取得ID  
					}
					
					conn.setAutoCommit(false);
					pstmt = conn.prepareStatement(sql,Statement.RETURN_GENERATED_KEYS);
			        
					switch (i) {
						case 3:
							List<Offices> offices = bean.getOffices();
							if(null!=offices && offices.size()>0) {
								for(int j=0;j<list.size();j++) {
									Offices os = offices.get(0);
									for(Office office : os.getOffice()) {
										addBatch(pstmt, office.getOffice(), city, list.get(j), office.getHref());
									}
								}
							}
							break;
						case 4:
							List<CarOrAirport> cas = bean.getCarOrAirport();
							if(null!=cas && cas.size()>0) {
								for(int j=0;j<list.size();j++) {
									List<Vehicle> vehicles = cas.get(j).getVehicles();
									if(null!=vehicles && vehicles.size()>0) {
										for(Vehicle vehicle : vehicles) {
											addBatch(pstmt, vehicle.getVehicle(), city, list.get(j), vehicle.getHref());
										}
									}
								}
							}
							break;
						case 5:
							List<SubWayStation> subWayStation = bean.getSubWayStation();
							if(null!=subWayStation && subWayStation.size()>0) {
								for(int j=0;j<list.size();j++) {
									List<Station> stations = subWayStation.get(j).getStation();
									for(Station station : stations) {
										addBatch(pstmt, station.getName(), city, list.get(j), station.getHref());
									}
								}
							}
							break;
					}
					pstmt.executeBatch();
					conn.commit();
				}
			}
		}catch (Exception e) {
			e.printStackTrace();
			conn.rollback();
		}finally {
			JdbcUtils.close(conn, pstmt, resultSet);
		}
	}
	
	private void addBatch(PreparedStatement pstmt,String name,String city,Long city_id,String href) throws SQLException {
		pstmt.setString(1, name);
		pstmt.setString(2, PinYinUtils.convertLower(name));
		pstmt.setLong(3, Application.CITY_CACHE.get(city).getId());
		pstmt.setString(4, Application.CITY_CACHE.get(city).getName());
		//pstmt.setString(5, PinYinUtils.convertLower(Application.CITY_CACHE.get(city).getName()));
		pstmt.setString(5, city);
		
		pstmt.setLong(6, city_id);
		pstmt.setString(7, href);
		pstmt.addBatch();
	}
}

4.数据库连接工具类(使用JDBC):

package org.ssgroup.spider.utils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * Minimal JDBC helper: loads the MySQL driver once and hands out connections.
 * <p>
 * NOTE(review): URL and credentials are hard-coded; move them to external
 * configuration before using this outside a demo.
 */
public class JdbcUtils {
	private static final String USERNAME = "root";
	private static final String PASSWORD = "root";
	private static final String DRIVER = "com.mysql.jdbc.Driver";
	private static final String URL = "jdbc:mysql://192.168.8.110:3306/test";
	
	static {
		try {
			Class.forName(DRIVER);
			// Only confirms that the driver class was loaded; no connection is opened here.
			System.out.println("数据库连接成功!");
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Opens a new connection to the configured database.
	 *
	 * @return a new connection; the caller is responsible for closing it
	 * @throws SQLException if the connection cannot be established
	 */
	public static Connection getConnection() throws SQLException {
		return DriverManager.getConnection(URL, USERNAME, PASSWORD);
	}
	
	/**
	 * Closes the result set, the statement and the connection, in that order.
	 * Each argument may be {@code null}. A failure while closing one resource
	 * no longer prevents the remaining ones from being closed.
	 *
	 * @throws SQLException if closing any of the resources fails
	 */
	public static void close(Connection connection,PreparedStatement pstmt,ResultSet resultSet) throws SQLException {
		try {
			if(null!=resultSet) resultSet.close();
		} finally {
			try {
				if(null!=pstmt) pstmt.close();
			} finally {
				if(null!=connection) connection.close();
			}
		}
	}
}

5.拼音转换工具类:

package org.ssgroup.spider.utils;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.commons.lang3.StringUtils;
 
/**
 * Helpers that convert Chinese text to pinyin via pinyin4j.
 */
public class PinYinUtils {
     
    /**
     * Converts to upper-case pinyin, e.g. 中国人民银行 =====>ZHONGGUORENMINYINHANG
     * @author lance
     */
    public static String convertUpper(String text){
        return convert(text, HanyuPinyinCaseType.UPPERCASE, false);
    }
     
    /**
     * Converts to lower-case pinyin, e.g. 中国人民银行 =====>zhongguorenminyinhang
     * @author lance
     */
    public static String convertLower(String text){
        return convert(text, HanyuPinyinCaseType.LOWERCASE, false);
    }
     
    /**
     * Converts to pinyin with each syllable capitalized,
     * e.g. 中国人民银行 =====>ZhongGuoRenMinYinHang
     * @author lance
     */
    public static String converCapitalize(String text){
        return convert(text, null, true);
    }
     
    /**
     * Returns the initials of the capitalized pinyin (all lower-case ASCII
     * letters are stripped), e.g. 中国人民银行 =====>ZGRMYH
     * @author lance
     */
    public static String capitalizeLetter(String text){
        String c = converCapitalize(text);
        if(StringUtils.isBlank(c)) {
            return "";
        }
         
        return StringUtils.replacePattern(c, "[a-z]", "");
    }
     
    /**
     * Returns the first character of the capitalized pinyin, e.g. 中国人民银行 =====>Z
     * @author lance
     */
    public static String firstLetter(String text){
        String c = converCapitalize(text);
        if(StringUtils.isBlank(c)) {
            return "";
        }
         
        return StringUtils.substring(c, 0, 1);
    }
     
    /**
     * Converts text to toneless pinyin. Characters in U+4E00..U+9FA5 are
     * replaced by their first pinyin reading; every other character, and any
     * character pinyin4j has no reading for, is copied through unchanged.
     * Leading and trailing whitespace of {@code text} is dropped.
     *
     * @param text          text to convert; blank input yields ""
     * @param caseType      case of the pinyin output; when non-null it
     *                      overrides {@code isCapitalize}
     * @param isCapitalize  capitalize each converted unit (only honoured when
     *                      {@code caseType} is null)
     * @return the converted text; partial output if pinyin4j rejects the format
     * @author lance
     */
    public static String convert(String text, HanyuPinyinCaseType caseType, boolean isCapitalize) {
        if(StringUtils.isBlank(text)){
            return "";
        }
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        if(caseType != null) {
            format.setCaseType(caseType);
            isCapitalize = false;
        }
         
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
        char[] input = StringUtils.trimToEmpty(text).toCharArray();
        StringBuilder builder = new StringBuilder();
        try {
            for (char c: input) {
                String unit = null;
                if (isChinese(c)) {
                    String[] readings = PinyinHelper.toHanyuPinyinStringArray(c, format);
                    // Guard against characters without a dictionary entry
                    // (previously an NPE / ArrayIndexOutOfBounds on temp[0]).
                    if(readings != null && readings.length > 0) {
                        unit = readings[0];
                    }
                }
                if(unit == null) {
                    unit = Character.toString(c);
                }
                builder.append(isCapitalize ? StringUtils.capitalize(unit) : unit);
            }
        } catch (BadHanyuPinyinOutputFormatCombination ex) {
            ex.printStackTrace();
        }
 
        return builder.toString();
    }

    /** Same range the former per-character regex {@code [\u4E00-\u9FA5]} matched. */
    private static boolean isChinese(char c) {
        return c >= '\u4E00' && c <= '\u9FA5';
    }
}

6.启动类:

package org.ssgroup.spider;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.ssgroup.spider.constant.MaYiAllCityConstant;
import org.ssgroup.spider.domain.City;
import org.ssgroup.spider.domain.CityLocation;
import org.ssgroup.spider.utils.JdbcUtils;
import org.ssgroup.spider.utils.PinYinUtils;

import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.request.HttpGetRequest;

/**
 * Crawler entry point: loads the city and location-type caches from the
 * database, then crawls the Mayi list pages group by group.
 */
public class Application {
	/** Cities keyed by pinyin; filled by {@link #loadCityToMap()}. */
	public static Map<String,City> CITY_CACHE = new HashMap<String,City>();
	/** Top-level location types keyed by pinyin; filled by {@link #loadCityLocationToMap()}. */
	public static Map<String,CityLocation> CITY_LOCATION_CACHE = new HashMap<String,CityLocation>();
	public static String URL = "http://www.mayi.com";
	
	public static void main(String[] args) throws Exception {
		// One-off seeding of the city table:
		//initCity();
		loadCityToMap();
		
		// One-off seeding of the top-level location types:
		//initCityLocation();
		loadCityLocationToMap();
		
		// Start crawling the list pages.
		init();
	}
	
	/**
	 * Crawls every city of every letter group in order. Before each group it
	 * waits until {@code MaYiAllCityConstant.on} is {@code true} and then
	 * resets it.
	 * <p>
	 * Fixes over the previous version: the group is now selected with the
	 * loop index (it was always {@code cityNames[0]}), the last group is no
	 * longer skipped, and waiting sleeps instead of spinning at full speed.
	 */
	private static void init() {
		String[] cityNames = new String[] {"ABCD","EFGH","JKLM","NPQR","STW","XYZ"};
		Map<String,String[]> allCitys = MaYiAllCityConstant.ALL_CITY;
		for(int i=0;i<cityNames.length;i++) {
			if(!awaitTurn()) {
				return;
			}
			MaYiAllCityConstant.on=false;
			
			String[] citys = allCitys.get(cityNames[i]);
			if(null==citys) {
				continue;
			}
			for(String city : citys) {
				String city_pin_yin = PinYinUtils.convertLower(city);
				String nextUrl = URL+"/"+city_pin_yin;
System.out.println("MaYiIndexPipeline-->"+nextUrl);
				startGecco(nextUrl);
			}
		}
	}
	
	/**
	 * Blocks until the pipeline raises the hand-off flag.
	 *
	 * @return {@code false} if the thread was interrupted while waiting
	 */
	private static boolean awaitTurn() {
		while(!MaYiAllCityConstant.on) {
			try {
				Thread.sleep(200);
			} catch (InterruptedException e) {
				Thread.currentThread().interrupt();
				return false;
			}
		}
		return true;
	}
	
	/**
	 * Runs a Gecco engine for one start URL with a single spider thread and a
	 * 5000 ms interval between requests.
	 * NOTE(review): {@code run()} presumably blocks until the crawl of this
	 * URL has finished — confirm against the Gecco version in use.
	 */
	private static void startGecco(String url) {
		HttpGetRequest start = new HttpGetRequest(url);
		start.setCharset("UTF-8");
		GeccoEngine.create()
		.classpath("org.ssgroup.spider")
		// start URL
		.start(start)
		// number of spider threads
		.thread(1)
		//.debug(true)
		// pause after each request of a spider, in milliseconds
		.interval(5000)
		.run();
	}
	
	/** Seeds {@code city_location} with the top-level location types (parent_id 0). */
	private static void initCityLocation() throws Exception {
		Connection conn = null;
		PreparedStatement pstmt = null;
		try {
			String[] locations = MaYiAllCityConstant.CITY_LOCATION;
			
			conn = JdbcUtils.getConnection();
			conn.setAutoCommit(false);
			String sql = "INSERT INTO city_location(name,pin_yin,parent_id) " + 
						 "VALUES(?,?,?)";
			pstmt = conn.prepareStatement(sql);
			for(String location : locations) {
				pstmt.setString(1, location);
				pstmt.setString(2, PinYinUtils.convertLower(location));
				pstmt.setInt(3, 0);
				pstmt.addBatch();
			}
			pstmt.executeBatch();
			conn.commit();
		}catch (Exception e) {
			e.printStackTrace();
			// conn is still null when getConnection() itself failed.
			if(null!=conn) {
				conn.rollback();
			}
		}finally {
			JdbcUtils.close(conn, pstmt, null);
		}
	}

	/**
	 * Seeds the {@code city} table from every group except "holdCity",
	 * committing once per group. A single connection and statement are reused
	 * for all groups (previously a new pair was opened per group and only the
	 * last one was closed).
	 */
	private static void initCity() throws Exception {
		Connection conn = null;
		PreparedStatement pstmt = null;
		
		try {
			conn = JdbcUtils.getConnection();
			conn.setAutoCommit(false);
			String sql = "INSERT INTO city(name,pin_yin,first_pin_yin,first_last_pin_yin) " + 
						 "VALUES(?,?,?,?)";
			pstmt = conn.prepareStatement(sql);
			
			Map<String,String[]> allCitys = MaYiAllCityConstant.ALL_CITY;
			for(Entry<String,String[]> entry : allCitys.entrySet()) {
				if("holdCity".equals(entry.getKey())){
					continue;
				}
				for(String city : entry.getValue()) {
					pstmt.setString(1, city);
					pstmt.setString(2, PinYinUtils.convertLower(city));
					pstmt.setString(3, PinYinUtils.firstLetter(city).toLowerCase());
					pstmt.setString(4, PinYinUtils.capitalizeLetter(city).toLowerCase());
					pstmt.addBatch();
				}
				pstmt.executeBatch();
				conn.commit();
			}
		}catch (Exception e) {
			e.printStackTrace();
			if(null!=conn) {
				conn.rollback();
			}
		}finally {
			JdbcUtils.close(conn, pstmt, null);
		}
	}
	
	/** Loads every row of {@code city} into {@link #CITY_CACHE}, keyed by pinyin. */
	private static void loadCityToMap() throws Exception {
		Connection conn = null;
		PreparedStatement pstmt = null;
		ResultSet resultSet = null;
		try {
			conn = JdbcUtils.getConnection();
			String sql = "SELECT id,name,pin_yin,first_pin_yin,first_last_pin_yin FROM city";
			pstmt = conn.prepareStatement(sql);
			resultSet = pstmt.executeQuery();
			while(resultSet.next()) {
				String pinYin = resultSet.getString("pin_yin");
				
				City city = new City();
				city.setId(resultSet.getLong("id"));
				city.setName(resultSet.getString("name"));
				city.setPinYin(pinYin);
				city.setFirstPinYin(resultSet.getString("first_pin_yin"));
				city.setFirstLastPinYin(resultSet.getString("first_last_pin_yin"));
				CITY_CACHE.put(pinYin, city);
			}
		}catch (Exception e) {
			e.printStackTrace();
		}finally {
			JdbcUtils.close(conn, pstmt, resultSet);
		}
	}
	
	/** Loads the top-level rows of {@code city_location} (parent_id = 0) into {@link #CITY_LOCATION_CACHE}. */
	private static void loadCityLocationToMap() throws Exception {
		Connection conn = null;
		PreparedStatement pstmt = null;
		ResultSet resultSet = null;
		try {
			conn = JdbcUtils.getConnection();
			String sql = "SELECT id,name,pin_yin FROM city_location where parent_id=0";
			pstmt = conn.prepareStatement(sql);
			resultSet = pstmt.executeQuery();
			while(resultSet.next()) {
				String pinYin = resultSet.getString("pin_yin");
				
				CityLocation cityLocation = new CityLocation();
				cityLocation.setId(resultSet.getLong("id"));
				cityLocation.setName(resultSet.getString("name"));
				cityLocation.setPinYin(pinYin);
				CITY_LOCATION_CACHE.put(pinYin, cityLocation);
			}
		}catch (Exception e) {
			e.printStackTrace();
		}finally {
			JdbcUtils.close(conn, pstmt, resultSet);
		}
	}
}

7.SQL

-- City master table. The crawler loads every row into Application.CITY_CACHE,
-- keyed by pin_yin.
CREATE TABLE `city` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT,
  `name` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市名称',
  `pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市拼音',
  `first_pin_yin` varchar(10) COLLATE utf8_bin DEFAULT NULL COMMENT '首字母简写',
  `first_last_pin_yin` varchar(10) COLLATE utf8_bin DEFAULT NULL COMMENT '首尾字母简写',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='城市表';

-- Location entries per city (business districts, scenic spots, districts, ...).
-- Rows with parent_id = 0 are the top-level location types; other rows point
-- at their parent row through parent_id.
-- city_name / city_pin_yin are widened to 50 to match city.name / city.pin_yin:
-- the crawler writes the full city pinyin here (e.g. "xishuangbanna"), which
-- does not fit in varchar(10).
CREATE TABLE `city_location` (
  `id` bigint(10) NOT NULL AUTO_INCREMENT,
  `name` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市地段名称',
  `pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市地段拼音',
  `city_id` bigint(10) DEFAULT NULL COMMENT '城市ID',
  `city_name` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市名称',
  `city_pin_yin` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '城市拼音',
  `parent_id` bigint(10) DEFAULT NULL COMMENT '父ID',
  `status` int(2) DEFAULT '0' COMMENT '状态',
  `href` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '请求路径',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='城市商业地段表';

-- Room listings scraped from the list pages. `id` is supplied by the crawler
-- (the room id from the page), so it is not AUTO_INCREMENT.
CREATE TABLE `rooms` (
  `id` bigint(10) NOT NULL,
  `price` DOUBLE(10,2) DEFAULT NULL COMMENT '价格',
  `house_location` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '房源地理位置',
  `original_url` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '图片原始地址',
  `image_url` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '缩略图地址',
  `title` varchar(255) COLLATE utf8_bin DEFAULT NULL COMMENT '房源标题',
  `num_room` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '几居室',
  `num_house` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '可住几个人',
  `city` varchar(50) COLLATE utf8_bin DEFAULT NULL COMMENT '属于哪个城市',
  `city_id` bigint(10) NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='房源表';

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
Gecco是什么 Gecco是一款用java语言开发的轻量化的易用的网络爬虫。Gecco整合了jsoup、httpclient、fastjson、spring、htmlunit、redisson等优秀框架,让您只需要配置一些jquery风格的选择器就能很快的写出一个爬虫。Gecco框架有优秀的可扩展性,框架基于开闭原则进行设计,对修改关闭、对扩展开放。同时Gecco基于十分开放的MIT开源协议,无论你是使用者还是希望共同完善Gecco的开发者,欢迎pull request。如果你喜欢这款爬虫框架请star 或者 fork!参考手册架构图: 主要特征  简单易用,使用jquery风格的选择器抽取元素  支持页面中的异步ajax请求  支持页面中的javascript变量抽取  利用Redis实现分布式抓取,参考gecco-redis  支持结合Spring开发业务逻辑,参考gecco-spring  支持htmlunit扩展,参考gecco-htmlunit  支持插件扩展机制  支持下载时UserAgent随机选取  支持下载代理服务器随机选取 使用手册:http://www.geccocrawler.com/tag/sysc/快速入门:@Gecco(matchUrl="https://github.com/{user}/{project}", pipelines="consolePipeline") public class MyGithub implements HtmlBean {     private static final long serialVersionUID = -7127412585200687225L;     @RequestParameter("user")     private String user;     @RequestParameter("project")     private String project;     @Text     @HtmlField(cssPath=".repository-meta-content")     private String title;     @Text     @HtmlField(cssPath=".pagehead-actions li:nth-child(2) .social-count")     private int star;     @Text     @HtmlField(cssPath=".pagehead-actions li:nth-child(3) .social-count")     private int fork;     @Html     @HtmlField(cssPath=".entry-content")     private String readme;     public String getReadme() {         return readme;     }     public void setReadme(String readme) {         this.readme = readme;     }     public String getUser() {         return user;     }     public void setUser(String user) {         this.user = user;     }     public String getProject() {         return project;     }     public void setProject(String project) {         this.project = project;     }     public String getTitle() {         return title;     }     public void setTitle(String title) {         this.title = title;     }     public int getStar() {         return star;     }     public void setStar(int star) {         this.star = star;     }     public int getFork() {         return fork;     }     public void setFork(int fork) {         this.fork = 
fork;     }     public static void main(String[] args) {         GeccoEngine.create()         .classpath("com.geccocrawler.gecco.demo")         .start("https://github.com/xtuhcy/gecco")         .thread(1)         .interval(2000)         .loop(true)         .mobile(false)         .start();     } }demo地址:教您使用java爬虫gecco抓取JD全部商品信息(一)教您使用java爬虫gecco抓取JD全部商品信息(二)教您使用java爬虫gecco抓取JD全部商品信息(三)集成Htmlunit下载页面爬虫的监控一个完整的例子,分页处理,结合spring,mysql入库 标签:网络爬虫  开源爬虫
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值