使用Java及jsoup爬取链家北京二手房房价数据

由于是初次使用Java写爬虫,代码有些繁琐,请大家见谅,也欢迎大家给予指正


首先分析链家北京二手房页面,使用360浏览器的审查元素功能,查看源代码,获取查询标签

如图一级查询所示,此图标签所获取的是链家北京二手房页面下的一级地区地址


由于具体获取有些复杂,故列大致步骤如下


主页 → 一级地区地址(东城,西城,朝阳等)→ 二级地区地址(东城下的安定门,安贞等)→ 获取房屋地址(中间须先获取二级地区地址下的页面总数,并将页码拼接到地址中,作为参数获取本级数据)→ 获取房屋数据

package PachongTest;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;


import pojo.House;

public class LianJiaData {

	
	// Value returned by the most recent executeUpdate() call in SaveDate.
	public static int rows=0;
	public static int i=0;// running count of house records processed; incremented and printed by getHouseMessage
	
	/**
	 * Manual JUnit entry point for trying out individual crawler methods.
	 * Every call below is commented out; uncomment a single line to run that
	 * method against the sample URL shown.
	 */
	@Test
	public void TestJsoup(){
// LianJiaData.getUrlList("https://bj.lianjia.com/ershoufang/");
// LianJiaData.getHomeUrlList("https://bj.lianjia.com//ershoufang/anzhen1/");
// LianJiaData.getPageNum("https://bj.lianjia.com/ershoufang/xisi1/");
// LianJiaData.getPageUrlList("https://bj.lianjia.com/ershoufang/youanmennei11/");
// LianJiaData.getPageHomeUrlList("https://bj.lianjia.com/ershoufang/xidan/pg2/");
// LianJiaData.getConstructionArea("https://bj.lianjia.com/ershoufang/101102529719.html");
// LianJiaData.getInnerArea("http://bj.lianjia.com/ershoufang/101102044367.html");
// LianJiaData.getTimeLeft("https://bj.lianjia.com/ershoufang/101102529719.html");
// LianJiaData.getUnitPrice("http://bj.lianjia.com/ershoufang/101102228890.html");
// LianJiaData.getIntroduction("https://lf.lianjia.com/ershoufang/101102603227.html");
// LianJiaData.getHouseMessage("https://lf.lianjia.com/ershoufang/101102603227.html");
	}

	/**
	 * Scrapes one listing detail page and stores the result in the database.
	 *
	 * <p>A {@code null} link and the placeholder href {@code "javascript:;"} are
	 * skipped without doing any work. For every other link the shared record
	 * counter {@code i} is advanced, a progress line is printed, and the page is
	 * scraped via {@code getHouse} and persisted via {@code SaveDate}.
	 *
	 * @param pageHomeUrl URL of a single listing detail page
	 */
	public static void getHouseMessage(String pageHomeUrl){
		if (pageHomeUrl == null || "javascript:;".equals(pageHomeUrl)) {
			return;
		}

		int current;
		// This method is called concurrently by several worker threads and i++ on a
		// shared static is not atomic, so hold the class lock just for the increment
		// (not for the slow network/database work below).
		synchronized (LianJiaData.class) {
			current = ++i;
		}
		System.out.println("******这是查询的第"+current+"条数据******");

		House house=LianJiaData.getHouse(pageHomeUrl);
		LianJiaData.SaveDate(house);
	}
      
	 
	/**
	 * Collects the detail-page URL of every listing reachable from the Beijing
	 * second-hand housing index.
	 *
	 * <p>Walks three levels: sub-district URLs (from {@code getUrlList}), the
	 * paginated result URLs of each sub-district (from {@code getPageUrlList}),
	 * and the listing links on each result page (from {@code getPageHomeUrlList}).
	 * A zero-based running index is printed for every listing URL collected.
	 *
	 * @return all listing URLs, in crawl order
	 * @throws IOException if fetching the index or a district page fails
	 */
	public static List<String> getAllHomeUrlList() throws IOException {
		List<String> collected = new ArrayList<String>();
		int counter = 0;

		// Sub-district URLs, e.g. "https://bj.lianjia.com//ershoufang/anzhen1/"
		List<String> subDistrictUrls = LianJiaData.getUrlList("https://bj.lianjia.com/ershoufang/");
		for (String subDistrictUrl : subDistrictUrls) {
			// One URL per result page of this sub-district.
			for (String resultPageUrl : LianJiaData.getPageUrlList(subDistrictUrl)) {
				for (String listingUrl : LianJiaData.getPageHomeUrlList(resultPageUrl)) {
					collected.add(listingUrl);
					System.out.println(counter);
					counter++;
				}
			}
		}

		return collected;
	}

	/**
	 * Inserts one house record into the {@code house} table of the local
	 * {@code lianjiadata} MySQL database.
	 *
	 * <p>Storage is best-effort: any failure (driver missing, connection error,
	 * SQL error, missing field value) is printed and the method returns normally.
	 * On success the static field {@code rows} holds the value returned by
	 * {@code executeUpdate()}.
	 *
	 * @param house the scraped record to store; {@code null} is reported and skipped
	 */
	public static void SaveDate(House house) {
		if (house == null) {
			System.err.println("SaveDate: no house data to store");
			return;
		}
		String sql = "INSERT INTO house (introduction,address,price,house_property,house_type,construction_area,inner_area,time_left,unit_price,url)VALUES(?,?,?,?,?,?,?,?,?,?)";
		try {
			// 1. Register the JDBC driver.
			Class.forName("com.mysql.jdbc.Driver");
			// 2. Open the connection and statement; try-with-resources closes both
			//    (statement first, then connection) even when the insert fails.
			try (Connection conn = DriverManager.getConnection("jdbc:mysql:///lianjiadata", "root", "root");
					PreparedStatement ps = conn.prepareStatement(sql)) {
				ps.setString(1, house.getIntroduction());
				ps.setString(2, house.getAddress());
				ps.setInt(3, house.getPrice());
				ps.setInt(4, house.getHouseProperty());
				ps.setString(5, house.getHouseType());
				ps.setDouble(6, house.getConstructionArea());
				ps.setDouble(7, house.getInnerArea());
				ps.setInt(8, house.getTimeLeft());
				ps.setDouble(9, house.getUnitPrice());
				ps.setString(10,house.getUrl());
				rows = ps.executeUpdate();
			}
		} catch (Exception e) {
			// Broad catch kept on purpose: one bad record must not stop the crawl.
			e.printStackTrace();
		}
	}

	/**
	 * Builds a {@code House} from a listing detail page.
	 *
	 * <p>Each field is obtained from its own getter in this class, called in the
	 * order below; every getter is given the same {@code url}.
	 *
	 * @param url URL of the listing detail page
	 * @return a populated {@code House} whose {@code url} property is {@code url}
	 */
	public static House getHouse(String url) {
		House result = new House();
		result.setIntroduction(LianJiaData.getIntroduction(url));         // description
		result.setAddress(LianJiaData.getAddress(url));                   // address
		result.setPrice(LianJiaData.getPrice(url));                       // total price
		result.setHouseProperty(LianJiaData.getHouseProperty(url));       // property-right term
		result.setHouseType(LianJiaData.getHouseType(url));               // layout
		result.setConstructionArea(LianJiaData.getConstructionArea(url)); // gross floor area
		result.setInnerArea(LianJiaData.getInnerArea(url));               // interior area
		result.setTimeLeft(LianJiaData.getTimeLeft(url));                 // building age
		result.setUnitPrice(LianJiaData.getUnitPrice(url));               // price per square metre
		result.setUrl(url);
		return result;
	}

	/**
	 * Collects the absolute URLs of all sub-district listing pages.
	 *
	 * <p>The district links are read from the {@code .sub_nav .section_sub_nav}
	 * navigation of {@code url}. Districts whose href starts with
	 * {@code "/ershoufang/"} are fetched from {@code bj.lianjia.com} and their
	 * sub-districts read from {@code .sub_sub_nav .section_sub_sub_nav}. All other
	 * district hrefs are fetched as-is and their sub-districts read from
	 * {@code .position a}, prefixed with {@code lf.lianjia.com} (per the original
	 * comments these are Yanjiao and Xianghe).
	 *
	 * <p>A sub-district href equal to {@code "/ershoufang/"} is the city-wide root
	 * and is skipped in both branches.
	 *
	 * @param url the city index page, e.g. {@code https://bj.lianjia.com/ershoufang/}
	 * @return absolute sub-district URLs in page order
	 * @throws IOException if the index page or any district page cannot be fetched
	 */
	public static List<String> getUrlList(String url) throws IOException {
		Document indexPage = Jsoup.connect(url).get();
		Elements districtLinks = indexPage.select(".sub_nav").select(".section_sub_nav a");
		List<String> subDistrictUrls = new ArrayList<String>();

		for (Element districtLink : districtLinks) {
			String districtHref = districtLink.attr("href");
			if (districtHref.startsWith("/ershoufang/")) {
				// Relative link: a district hosted on bj.lianjia.com.
				Document districtPage = Jsoup.connect("https://bj.lianjia.com" + districtHref).get();
				Elements subLinks = districtPage.select(".sub_sub_nav").select(".section_sub_sub_nav a");
				for (Element subLink : subLinks) {
					String subHref = subLink.attr("href");
					// Compare the raw href BEFORE adding the host prefix; comparing the
					// prefixed URL can never equal "/ershoufang/", so the root link
					// would slip through and be crawled as if it were a sub-district.
					if (!"/ershoufang/".equals(subHref)) {
						subDistrictUrls.add("https://bj.lianjia.com" + subHref);
					}
				}
			} else {
				// Any other link is fetched unchanged; its sub-district links live
				// under ".position" and belong to lf.lianjia.com.
				Document districtPage = Jsoup.connect(districtHref).get();
				Elements subLinks = districtPage.select(".position a");
				for (Element subLink : subLinks) {
					String subHref = subLink.attr("href");
					if (!"/ershoufang/".equals(subHref)) {
						subDistrictUrls.add("https://lf.lianjia.com" + subHref);
					}
				}
			}
		}
		return subDistrictUrls;
	}

	/**
	 * Collects the listing links found on every result page of a sub-district.
	 *
	 * <p>The result-page URLs come from {@code getPageUrlList(url)}. Failures are
	 * printed and whatever was collected so far is returned.
	 *
	 * @param url sub-district URL (expected to end with {@code "/"})
	 * @return href values of the listing links, possibly empty, never {@code null}
	 */
	public static List<String> getHomeUrlList(String url) {
		List<String> homeUrlList = new ArrayList<String>();
		try {
			List<String> pageUrlList = LianJiaData.getPageUrlList(url);
			for (String pageUrl : pageUrlList) {
				// The original selector "sellListContent" had no leading dot, so jsoup
				// treated it as a tag name and matched nothing. Select the title links
				// inside the list container instead (same ".title a" links that
				// getPageHomeUrlList reads).
				// NOTE(review): assumes the result list carries class "sellListContent" - confirm against the live page.
				Elements links = Jsoup.connect(pageUrl).get().select(".sellListContent .title a");
				for (Element link : links) {
					homeUrlList.add(link.attr("href"));
				}
			}
		} catch (Exception e) {
			// TODO: resume from the failed page instead of giving up
			e.printStackTrace();
		}
		return homeUrlList;
	}

	/**
	 * Returns the number of result pages of a sub-district listing.
	 *
	 * <p>The value is read from the {@code page-data} attribute of the first
	 * {@code .page-box .house-lst-page-box} element, which is expected to look like
	 * {@code {"totalPage":25,"curPage":1}}.
	 *
	 * @param url sub-district URL
	 * @return the total page count, or 0 when the page has no pager, the attribute
	 *         cannot be parsed, or the request fails (the failure is printed)
	 */
	public static Integer getPageNum(String url) {
		try {
			Elements pagers = Jsoup.connect(url).get().select(".page-box").select(".house-lst-page-box");
			if (pagers.isEmpty()) {
				return 0;
			}
			return parseTotalPage(pagers.get(0).attr("page-data"));
		} catch (IOException e) {
			// TODO: resume crawling after a failed request
			e.printStackTrace();
		}
		return 0;
	}

	/**
	 * Extracts the integer after {@code "totalPage":} from a page-data string.
	 * Reads all consecutive digits, so multi-digit page counts are kept intact
	 * (a fixed one-character substring would turn 25 into 2).
	 *
	 * @return the parsed count, or 0 if the key or its digits are missing
	 */
	private static int parseTotalPage(String pageData) {
		if (pageData == null) {
			return 0;
		}
		String key = "\"totalPage\":";
		int keyAt = pageData.indexOf(key);
		if (keyAt < 0) {
			return 0;
		}
		int start = keyAt + key.length();
		while (start < pageData.length() && pageData.charAt(start) == ' ') {
			start++;
		}
		int end = start;
		while (end < pageData.length() && pageData.charAt(end) >= '0' && pageData.charAt(end) <= '9') {
			end++;
		}
		if (end == start) {
			return 0;
		}
		try {
			return Integer.parseInt(pageData.substring(start, end));
		} catch (NumberFormatException tooLarge) {
			// Digit run does not fit in an int; treat as unparsable.
			return 0;
		}
	}

	/**
	 * Builds the URL of every result page of a sub-district by appending
	 * {@code "pg" + pageNumber} to {@code url}, for page numbers 1 up to the count
	 * reported by {@code getPageNum(url)}.
	 *
	 * @param url sub-district URL, e.g. https://bj.lianjia.com/ershoufang/tianningsi1/
	 * @return the page URLs in ascending page order; empty when the count is 0
	 */
	public static List<String> getPageUrlList(String url) {
		int totalPages = LianJiaData.getPageNum(url);
		List<String> pages = new ArrayList<String>();
		int page = 1;
		while (page <= totalPages) {
			pages.add(url + "pg" + page);
			page++;
		}
		return pages;
	}

	/**
	 * Reads the listing links on a single result page.
	 *
	 * <p>Every element matching {@code .title a} contributes its {@code href}
	 * value. If anything fails the stack trace is printed and the links gathered
	 * so far are returned.
	 *
	 * @param url URL of one result page
	 * @return href values in document order, never {@code null}
	 */
	public static List<String> getPageHomeUrlList(String url) {
		List<String> links = new ArrayList<String>();
		try {
			Document page = Jsoup.connect(url).get();
			Elements anchors = page.select(".title a");
			for (int idx = 0; idx < anchors.size(); idx++) {
				links.add(anchors.get(idx).attr("href"));
			}
		} catch (Exception e) {
			e.printStackTrace();
			// TODO: resume crawling after a failure
		}
		return links;
	}

	/**
	 * Returns the text of the first {@code .clear .content} element inside
	 * {@code .baseattribute} on the listing page.
	 *
	 * @param url URL of the listing detail page
	 * @return the introduction text, or {@code null} if the page could not be
	 *         fetched or the element is missing
	 */
	public static String getIntroduction(String url) {
		try {
			Elements contentNodes = Jsoup.connect(url).get()
					.select(".baseattribute")
					.select(".clear .content");
			return contentNodes.get(0).text();
		} catch (Exception ignored) {
			// Best-effort: any failure yields null. TODO: resume crawling
			return null;
		}
	}

	/**
	 * Returns the listing address: the text of the first {@code .areaName .info}
	 * element immediately followed by the text of the first
	 * {@code .areaName .supplement} element (no separator).
	 *
	 * @param url URL of the listing detail page
	 * @return the combined address; only the {@code .info} text when the
	 *         supplement is missing; {@code null} when nothing could be read
	 */
	public static String getAddress(String url) {
		String result = null;
		try {
			Document page = Jsoup.connect(url).get();
			result = page.select(".areaName .info").get(0).text();
			String supplement = page.select(".areaName .supplement").get(0).text();
			result += supplement;
		} catch (Exception ignored) {
			// Best-effort: keep whatever was read before the failure. TODO: resume crawling
		}
		return result;
	}

	/**
	 * Returns the total listing price.
	 *
	 * <p>The page shows the total in the first {@code .price .total} element; the
	 * value is multiplied by 10000 here. The text is parsed as a decimal so that
	 * totals such as "368.5" are kept (integer parsing would reject them and
	 * silently yield 0); whole-number totals give the same result as before.
	 *
	 * @param url URL of the listing detail page
	 * @return the displayed total times 10000, rounded to the nearest integer; 0
	 *         when the page cannot be fetched or holds no parsable price
	 */
	public static Integer getPrice(String url) {
		Integer price = 0;
		try {
			Document doc = Jsoup.connect(url).get();
			Elements totals = doc.select(".price .total");
			if (!totals.isEmpty()) {
				double displayed = Double.parseDouble(totals.get(0).text().trim());
				price = (int) Math.round(displayed * 10000);
			}
		} catch (IOException e) {
			// Network problems are worth seeing; the crawl continues with 0.
			System.err.println("getPrice: failed to fetch " + url + ": " + e);
		} catch (RuntimeException ignored) {
			// Price text is not a number: keep the default 0.
		}
		return price;
	}

	/**
	 * Returns the property-right term of the listing.
	 *
	 * <p>Takes the text of the 13th {@code .base li} element (index 12) and parses
	 * characters 4-5 (zero-based) as an integer.
	 *
	 * @param url URL of the listing detail page
	 * @return the parsed number, or 0 if the page cannot be fetched or parsed
	 */
	public static Integer getHouseProperty(String url) {
		try {
			String itemText = Jsoup.connect(url).get().select(".base li").get(12).text();
			return Integer.parseInt(itemText.substring(4, 6));
		} catch (Exception ignored) {
			// Best-effort: any failure yields 0. TODO: resume crawling
			return 0;
		}
	}

	/**
	 * Returns the layout of the listing: the full text of the first
	 * {@code .base .content li} element.
	 *
	 * @param url URL of the listing detail page
	 * @return the element text, or {@code null} if the page cannot be fetched or
	 *         the element is missing
	 */
	public static String getHouseType(String url) {
		try {
			Elements items = Jsoup.connect(url).get().select(".base .content li");
			return items.get(0).text();
		} catch (Exception ignored) {
			// Best-effort: any failure yields null. TODO: resume crawling
			return null;
		}
	}

	/**
	 * Returns the gross floor area of the listing.
	 *
	 * <p>Reads the text of the 3rd {@code .base li} element (index 2) and parses
	 * the first decimal number in it. The number is read to its actual length;
	 * a fixed-width slice fails on short values (it runs past the end or picks up
	 * the unit character) and truncates long ones.
	 *
	 * @param url URL of the listing detail page
	 * @return the area, or 0.0 when the page cannot be fetched or holds no number
	 */
	public static Double getConstructionArea(String url) {
		Double constructionArea = 0.0;
		try {
			Document doc = Jsoup.connect(url).get();
			Elements items = doc.select(".base li");
			constructionArea = parseConstructionAreaText(items.get(2).text());
		} catch (IOException e) {
			// Network problems are worth seeing; the crawl continues with 0.0.
			System.err.println("getConstructionArea: failed to fetch " + url + ": " + e);
		} catch (RuntimeException ignored) {
			// Element missing or number malformed: keep the default 0.0.
		}
		return constructionArea;
	}

	/** Parses the first run of ASCII digits and dots in {@code text}; 0.0 if there is none. */
	private static double parseConstructionAreaText(String text) {
		int start = 0;
		while (start < text.length() && (text.charAt(start) < '0' || text.charAt(start) > '9')) {
			start++;
		}
		int end = start;
		while (end < text.length()
				&& ((text.charAt(end) >= '0' && text.charAt(end) <= '9') || text.charAt(end) == '.')) {
			end++;
		}
		return end == start ? 0.0 : Double.parseDouble(text.substring(start, end));
	}

	/**
	 * Returns the interior (net) area of the listing.
	 *
	 * <p>Reads the text of the 5th {@code .base li} element (index 4) and parses
	 * the first decimal number in it. The number is read to its actual length;
	 * a fixed-width slice fails on short values and truncates long ones. Text
	 * without any digits yields 0.0.
	 *
	 * @param url URL of the listing detail page
	 * @return the area, or 0.0 when the page cannot be fetched or holds no number
	 */
	public static Double getInnerArea(String url) {
		Double innerArea = 0.0;
		try {
			Document doc = Jsoup.connect(url).get();
			Elements items = doc.select(".base li");
			innerArea = parseInnerAreaText(items.get(4).text());
		} catch (IOException e) {
			// Network problems are worth seeing; the crawl continues with 0.0.
			System.err.println("getInnerArea: failed to fetch " + url + ": " + e);
		} catch (RuntimeException ignored) {
			// Element missing or number malformed: keep the default 0.0.
		}
		return innerArea;
	}

	/** Parses the first run of ASCII digits and dots in {@code text}; 0.0 if there is none. */
	private static double parseInnerAreaText(String text) {
		int start = 0;
		while (start < text.length() && (text.charAt(start) < '0' || text.charAt(start) > '9')) {
			start++;
		}
		int end = start;
		while (end < text.length()
				&& ((text.charAt(end) >= '0' && text.charAt(end) <= '9') || text.charAt(end) == '.')) {
			end++;
		}
		return end == start ? 0.0 : Double.parseDouble(text.substring(start, end));
	}

	/**
	 * Returns the age of the building in years.
	 *
	 * <p>The construction year is the first four characters of the first
	 * {@code .area .subInfo} element. The age is computed against the current
	 * calendar year rather than a hard-coded one, so results stay correct when
	 * the crawler runs in later years.
	 *
	 * @param url URL of the listing detail page
	 * @return current year minus construction year; 0 when the page cannot be
	 *         fetched or the year cannot be parsed
	 */
	public static int getTimeLeft(String url) {
		int timeLeft = 0;
		try {
			Document doc = Jsoup.connect(url).get();
			Elements elements = doc.select(".area .subInfo");
			int buildYear = Integer.parseInt(elements.get(0).text().substring(0, 4));
			// Fully qualified so no additional import is required.
			int currentYear = java.util.Calendar.getInstance().get(java.util.Calendar.YEAR);
			timeLeft = currentYear - buildYear;
		} catch (IOException e) {
			// Network problems are worth seeing; the crawl continues with 0.
			System.err.println("getTimeLeft: failed to fetch " + url + ": " + e);
		} catch (RuntimeException ignored) {
			// Element missing or no leading 4-digit year: keep the default 0.
		}
		return timeLeft;
	}

	/**
	 * Returns the price per square metre of the listing.
	 *
	 * <p>Reads the text of the first {@code .unitPriceValue} element and parses the
	 * first number in it, whatever its width. The earlier fixed-width slicing only
	 * handled 5- and 6-digit prices; a 4-digit price failed to parse and was
	 * silently stored as 0.0.
	 *
	 * @param url URL of the listing detail page
	 * @return the unit price, or 0.0 when the page cannot be fetched or holds no
	 *         number
	 */
	public static Double getUnitPrice(String url) {
		Double unitPrice = 0.0;
		try {
			Document doc = Jsoup.connect(url).get();
			Elements elements = doc.select(".unitPriceValue");
			unitPrice = parseUnitPriceText(elements.get(0).text());
		} catch (IOException e) {
			// Network problems are worth seeing; the crawl continues with 0.0.
			System.err.println("getUnitPrice: failed to fetch " + url + ": " + e);
		} catch (RuntimeException ignored) {
			// Element missing or number malformed: keep the default 0.0.
		}
		return unitPrice;
	}

	/** Parses the first run of ASCII digits and dots in {@code text}; 0.0 if there is none. */
	private static double parseUnitPriceText(String text) {
		int start = 0;
		while (start < text.length() && (text.charAt(start) < '0' || text.charAt(start) > '9')) {
			start++;
		}
		int end = start;
		while (end < text.length()
				&& ((text.charAt(end) >= '0' && text.charAt(end) <= '9') || text.charAt(end) == '.')) {
			end++;
		}
		return end == start ? 0.0 : Double.parseDouble(text.substring(start, end));
	}
}
其中存在的问题:由于初学爬虫,断点续爬还不会写。此段代码获取所有房屋地址的List,并在线程类中引用,再调用本类方法处理。由于每个线程处理的是所有房屋地址List中的一部分(下标范围不同),故各线程run方法中的执行逻辑不同,代码如下

package thread;

import java.io.IOException;
import java.util.List;

import PachongTest.LianJiaData;

/**
 * Starts fifteen worker threads that together scrape and store every listing.
 * Each worker handles one fixed index range of the shared listing-URL list.
 */
public class ThreadDemo {

	public static void main(String[] args){
		Thread[] workers = {
			new Thread01(), new Thread02(), new Thread03(), new Thread04(), new Thread05(),
			new Thread06(), new Thread07(), new Thread08(), new Thread09(), new Thread10(),
			new Thread11(), new Thread12(), new Thread13(), new Thread14(), new Thread15()
		};
		for (Thread worker : workers) {
			worker.start();
		}
	}
}

/**
 * Common behaviour of all workers: process the listing URLs whose index lies in
 * {@code [from, to)}.
 *
 * <p>The URL list is crawled only once and shared by every worker. Previously
 * each worker called {@code LianJiaData.getAllHomeUrlList()} itself, repeating
 * the whole index crawl (and its progress counter) once per thread.
 */
abstract class HouseRangeThread extends Thread {

	/** Listing URLs shared by all workers; created lazily under the class lock. */
	private static List<String> sharedUrls;

	private final int from;
	private final int to;

	/**
	 * @param from first index to process (inclusive)
	 * @param to   end index (exclusive); clamped to the list size at run time
	 */
	HouseRangeThread(int from, int to) {
		this.from = from;
		this.to = to;
	}

	/**
	 * Returns the shared URL list, crawling it on first use. Workers that arrive
	 * while the crawl is running wait on the class lock and then reuse the result.
	 * If the crawl fails, nothing is cached and the next caller tries again.
	 */
	private static synchronized List<String> allUrls() throws IOException {
		if (sharedUrls == null) {
			sharedUrls = LianJiaData.getAllHomeUrlList();
		}
		return sharedUrls;
	}

	@Override
	public void run() {
		try {
			List<String> lian = allUrls();
			// Clamp to the real size: the ranges are fixed, the list length is not,
			// and an unchecked get(i) would kill the thread with IndexOutOfBoundsException.
			int end = Math.min(to, lian.size());
			for (int i = from; i < end; i++) {
				LianJiaData.getHouseMessage(lian.get(i));
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

/** Processes listings 0-1999. */
class Thread01 extends HouseRangeThread{
	Thread01() { super(0, 2000); }
}

/** Processes listings 2000-3999. */
class Thread02 extends HouseRangeThread{
	Thread02() { super(2000, 4000); }
}

/** Processes listings 4000-5999. */
class Thread03 extends HouseRangeThread{
	Thread03() { super(4000, 6000); }
}

/** Processes listings 6000-7999. */
class Thread04 extends HouseRangeThread{
	Thread04() { super(6000, 8000); }
}

/** Processes listings 8000-9999. */
class Thread05 extends HouseRangeThread{
	Thread05() { super(8000, 10000); }
}

/** Processes listings 12000-13999. */
class Thread06 extends HouseRangeThread{
	Thread06() { super(12000, 14000); }
}

/** Processes listings 14000-15999. */
class Thread07 extends HouseRangeThread{
	Thread07() { super(14000, 16000); }
}

/** Processes listings 16000-17999. */
class Thread08 extends HouseRangeThread{
	Thread08() { super(16000, 18000); }
}

/** Processes listings 18000-19999. */
class Thread09 extends HouseRangeThread{
	Thread09() { super(18000, 20000); }
}

/** Processes listings 20000-21999. */
class Thread10 extends HouseRangeThread{
	Thread10() { super(20000, 22000); }
}

/** Processes listings 22000-23999. */
class Thread11 extends HouseRangeThread{
	Thread11() { super(22000, 24000); }
}

/** Processes listings 24000-25999. */
class Thread12 extends HouseRangeThread{
	Thread12() { super(24000, 26000); }
}

/** Processes listings 26000-27999. */
class Thread13 extends HouseRangeThread{
	Thread13() { super(26000, 28000); }
}

/** Processes listings from 28000 to the end of the list. */
class Thread14 extends HouseRangeThread{
	Thread14() { super(28000, Integer.MAX_VALUE); }
}

/** Processes listings 10000-11999. */
class Thread15 extends HouseRangeThread{
	Thread15() { super(10000, 12000); }
}

遇到的问题是:多个线程同时执行时,LianJiaData类getAllHomeUrlList方法中的计数器j会被重复执行多次,导致房屋地址被重复计数,希望有大神指出错误



待解决问题:链接超时  connect timeout

将请求改为 Document doc = Jsoup.connect(url).timeout(3000).get() 设置超时时间后,问题解决了一部分,报错有所减少,但仍然存在

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值