由于是初次使用Java写爬虫,所以代码有些繁琐,请大家见谅,并希望能给予指正
首先分析链家北京二手房页面,使用360浏览器的审查元素功能,查看源代码,获取查询标签
如图一级查询所示,此图标签所获取的是链家北京二手房页面下的一级地区地址
由于具体获取有些复杂,故列大致步骤如下
主页——》一级地区地址(东城,西城,朝阳等)——》二级地区地址(东城下的安定门,安贞等)——》获取房屋地址(中间须获取二级地区地址下的页面页数,并拼接于地址中,作为参数获取本级数据)——》获取房屋数据
package PachongTest;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;
import pojo.House;
/**
 * Crawler for the second-hand house listings of Lianjia (Beijing site).
 *
 * <p>Crawl order: listing home page -> first-level district links -> second-level
 * sub-area links -> paged listing URLs of every sub-area -> house detail URLs ->
 * house fields, which are stored through {@link #SaveDate(House)}.
 *
 * <p>Every member is static. The only shared mutable state is the two public counters
 * below; {@link #i} is incremented under the class lock, {@link #rows} is a plain write.
 */
public class LianJiaData {
    /**
     * Update count returned by the most recent INSERT in {@link #SaveDate(House)}.
     * Written without synchronization, so with concurrent callers it simply holds
     * whichever write happened last.
     */
    public static int rows=0;
    /** Running number of house pages handled by {@link #getHouseMessage(String)}. */
    public static int i=0;

    private static final String BJ_HOST = "https://bj.lianjia.com";
    private static final String LF_HOST = "https://lf.lianjia.com";
    private static final String LISTING_ROOT = "/ershoufang/";
    /** Placeholder href that does not point at a house page. */
    private static final String PLACEHOLDER_LINK = "javascript:;";

    /**
     * Manual test entry point; un-comment one of the sample calls to try a single step.
     */
    @Test
    public void TestJsoup(){
        // LianJiaData.getUrlList("https://bj.lianjia.com/ershoufang/");
        // LianJiaData.getHomeUrlList("https://bj.lianjia.com//ershoufang/anzhen1/");
        // LianJiaData.getPageNum("https://bj.lianjia.com/ershoufang/xisi1/");
        // LianJiaData.getPageUrlList("https://bj.lianjia.com/ershoufang/youanmennei11/");
        // LianJiaData.getPageHomeUrlList("https://bj.lianjia.com/ershoufang/xidan/pg2/");
        // LianJiaData.getConstructionArea("https://bj.lianjia.com/ershoufang/101102529719.html");
        // LianJiaData.getInnerArea("http://bj.lianjia.com/ershoufang/101102044367.html");
        // LianJiaData.getTimeLeft("https://bj.lianjia.com/ershoufang/101102529719.html");
        // LianJiaData.getUnitPrice("http://bj.lianjia.com/ershoufang/101102228890.html");
        // LianJiaData.getIntroduction("https://lf.lianjia.com/ershoufang/101102603227.html");
        // LianJiaData.getHouseMessage("https://lf.lianjia.com/ershoufang/101102603227.html");
    }

    /**
     * Reads the house behind {@code pageHomeUrl} and stores it in the database.
     *
     * <p>{@code null} and the placeholder link {@code javascript:;} are skipped.
     *
     * @param pageHomeUrl URL of a house detail page
     */
    public static void getHouseMessage(String pageHomeUrl){
        if (pageHomeUrl == null || PLACEHOLDER_LINK.equals(pageHomeUrl)) {
            return;
        }
        int index = nextIndex();
        System.out.println("******这是查询的第"+index+"条数据******");
        SaveDate(getHouse(pageHomeUrl));
    }

    /**
     * Increments {@link #i} atomically with respect to other callers of this method.
     * A bare {@code i++} loses updates when several threads call
     * {@link #getHouseMessage(String)} at the same time.
     */
    private static synchronized int nextIndex() {
        i++;
        return i;
    }

    /**
     * Collects the detail-page URL of every house reachable from the Beijing listing root.
     *
     * <p>Prints a running counter for each collected URL. Every call performs the complete
     * crawl again, so callers that need the list more than once should keep the result.
     *
     * @return all house detail URLs in crawl order
     * @throws IOException if the district or sub-area navigation pages cannot be loaded
     */
    public static List<String> getAllHomeUrlList() throws IOException {
        int j = 0;
        List<String> allHomeUrlList = new ArrayList<String>();
        for (String subAreaUrl : getUrlList(BJ_HOST + LISTING_ROOT)) {
            // Each sub-area is split into numbered listing pages.
            for (String pageUrl : getPageUrlList(subAreaUrl)) {
                for (String pageHomeUrl : getPageHomeUrlList(pageUrl)) {
                    allHomeUrlList.add(pageHomeUrl);
                    System.out.println(j++);
                }
            }
        }
        return allHomeUrlList;
    }

    /**
     * Inserts one house row into the {@code house} table and records the update count
     * in {@link #rows}. Failures are printed and otherwise ignored.
     *
     * <p>NOTE(review): the JDBC URL and the root/root credentials are hard-coded, and a
     * new connection is opened for every row.
     *
     * @param house the house to store
     */
    public static void SaveDate(House house) {
        Connection conn = null;
        PreparedStatement ps = null;
        try {
            // Register the driver, then open the connection.
            Class.forName("com.mysql.jdbc.Driver");
            conn = DriverManager.getConnection("jdbc:mysql:///lianjiadata", "root", "root");
            String sql = "INSERT INTO house (introduction,address,price,house_property,house_type,construction_area,inner_area,time_left,unit_price,url)VALUES(?,?,?,?,?,?,?,?,?,?)";
            ps = conn.prepareStatement(sql);
            ps.setString(1, house.getIntroduction());
            ps.setString(2, house.getAddress());
            ps.setInt(3, house.getPrice());
            ps.setInt(4, house.getHouseProperty());
            ps.setString(5, house.getHouseType());
            ps.setDouble(6, house.getConstructionArea());
            ps.setDouble(7, house.getInnerArea());
            ps.setInt(8, house.getTimeLeft());
            ps.setDouble(9, house.getUnitPrice());
            ps.setString(10,house.getUrl());
            rows = ps.executeUpdate();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release in reverse order of acquisition.
            closeStatement(ps);
            closeConnection(conn);
        }
    }

    /** Closes {@code statement} if it was opened, printing (not propagating) any failure. */
    private static void closeStatement(PreparedStatement statement) {
        if (statement == null) {
            return;
        }
        try {
            statement.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** Closes {@code connection} if it was opened, printing (not propagating) any failure. */
    private static void closeConnection(Connection connection) {
        if (connection == null) {
            return;
        }
        try {
            connection.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds a {@link House} from one detail page.
     *
     * <p>The page is downloaded once and every field is parsed from that single document.
     * A field that cannot be read keeps its default ({@code null} for text, zero for
     * numbers); if the page itself cannot be loaded, all fields keep their defaults.
     *
     * @param url URL of the house detail page
     * @return the populated house; never {@code null}
     */
    public static House getHouse(String url) {
        Document doc = fetchOrNull(url);
        House house = new House();
        house.setIntroduction(introductionOf(doc, url));
        house.setAddress(addressOf(doc, url));
        house.setPrice(priceOf(doc, url));
        house.setHouseProperty(housePropertyOf(doc, url));
        house.setHouseType(houseTypeOf(doc, url));
        house.setConstructionArea(constructionAreaOf(doc, url));
        house.setInnerArea(innerAreaOf(doc, url));
        house.setTimeLeft(Integer.valueOf(timeLeftOf(doc, url)));
        house.setUnitPrice(unitPriceOf(doc, url));
        house.setUrl(url);
        return house;
    }

    /**
     * Collects the second-level (sub-area) listing URLs reachable from the listing root.
     *
     * @param url listing root page, e.g. {@code https://bj.lianjia.com/ershoufang/}
     * @return absolute URLs of all sub-area listings
     * @throws IOException if a navigation page cannot be loaded
     */
    public static List<String> getUrlList(String url) throws IOException {
        Document doc = Jsoup.connect(url).get();
        Elements districtLinks = doc.select(".sub_nav").select(".section_sub_nav a");
        List<String> subAreaUrls = new ArrayList<String>();
        for (Element districtLink : districtLinks) {
            String districtHref = districtLink.attr("href");
            if (districtHref.startsWith(LISTING_ROOT)) {
                // Relative link: a district served by the Beijing host.
                Document districtDoc = Jsoup.connect(BJ_HOST + districtHref).get();
                Elements subAreaLinks = districtDoc.select(".sub_sub_nav").select(".section_sub_sub_nav a");
                collectSubAreaUrls(subAreaLinks, BJ_HOST, subAreaUrls);
            } else {
                // Any other link is opened as-is; its sub-area links get the lf host.
                Document districtDoc = Jsoup.connect(districtHref).get();
                Elements subAreaLinks = districtDoc.select(".position a");
                collectSubAreaUrls(subAreaLinks, LF_HOST, subAreaUrls);
            }
        }
        return subAreaUrls;
    }

    /**
     * Appends {@code host + href} for every link except the bare listing root.
     * The root is filtered on the raw href, before the host is prepended; comparing the
     * already-prefixed URL with {@code /ershoufang/} can never match.
     */
    private static void collectSubAreaUrls(Elements links, String host, List<String> target) {
        for (Element link : links) {
            String href = link.attr("href");
            if (!LISTING_ROOT.equals(href)) {
                target.add(host + href);
            }
        }
    }

    /**
     * Collects the house detail URLs from every listing page of one sub-area.
     *
     * @param url sub-area listing URL
     * @return house detail URLs of all pages; empty if nothing could be read
     */
    public static List<String> getHomeUrlList(String url) {
        List<String> homeUrlList = new ArrayList<String>();
        for (String pageUrl : getPageUrlList(url)) {
            // Same link extraction as the main crawl path, so both agree on the selector.
            homeUrlList.addAll(getPageHomeUrlList(pageUrl));
        }
        return homeUrlList;
    }

    /**
     * Reads the total number of listing pages of a sub-area from its pager element.
     *
     * @param url sub-area listing URL
     * @return the page count, or 0 if the page or the pager data cannot be read
     */
    public static Integer getPageNum(String url) {
        Document doc = fetchOrNull(url);
        if (doc == null) {
            return 0;
        }
        Elements pagers = doc.select(".page-box").select(".house-lst-page-box");
        if (pagers.isEmpty()) {
            return 0;
        }
        return parseTotalPage(pagers.get(0).attr("page-data"));
    }

    /**
     * Extracts the complete integer that follows {@code totalPage} and its colon.
     * Reading a single character at a fixed offset truncates any count of 10 or more.
     * Assumes the attribute looks like {@code {"totalPage":N,...}} — TODO confirm
     * against the live page.
     */
    private static int parseTotalPage(String pageData) {
        int keyAt = pageData.indexOf("totalPage");
        if (keyAt < 0) {
            return 0;
        }
        int colonAt = pageData.indexOf(':', keyAt);
        if (colonAt < 0) {
            return 0;
        }
        String digits = leadingNumber(pageData.substring(colonAt + 1).trim(), false);
        if (digits.isEmpty()) {
            return 0;
        }
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            return 0;
        }
    }

    /**
     * Builds the URL of every listing page of a sub-area,
     * e.g. {@code https://bj.lianjia.com/ershoufang/tianningsi1/pg1}.
     *
     * @param url sub-area listing URL
     * @return one URL per page, numbered from 1; empty if the page count is unknown
     */
    public static List<String> getPageUrlList(String url) {
        int pageNum = getPageNum(url);
        List<String> pageUrlList = new ArrayList<String>();
        if (pageNum <= 0) {
            return pageUrlList;
        }
        // Guarantee a separator so "…/xidan" does not become "…/xidanpg1".
        String base = url.endsWith("/") ? url : url + "/";
        for (int page = 1; page <= pageNum; page++) {
            pageUrlList.add(base + "pg" + page);
        }
        return pageUrlList;
    }

    /**
     * Collects the house detail links found on one listing page.
     *
     * @param url listing page URL
     * @return the href of every {@code .title a} link; empty if the page cannot be loaded
     */
    public static List<String> getPageHomeUrlList(String url) {
        List<String> pageHomeUrlList = new ArrayList<String>();
        Document doc = fetchOrNull(url);
        if (doc == null) {
            return pageHomeUrlList;
        }
        for (Element link : doc.select(".title a")) {
            pageHomeUrlList.add(link.attr("href"));
        }
        return pageHomeUrlList;
    }

    /**
     * Reads the introduction text of a house.
     *
     * @param url house detail page URL
     * @return the introduction, or {@code null} if it cannot be read
     */
    public static String getIntroduction(String url) {
        return introductionOf(fetchOrNull(url), url);
    }

    /**
     * Reads the address of a house (area name followed by its supplement text).
     *
     * @param url house detail page URL
     * @return the address, only its first part if the supplement is missing, or
     *         {@code null} if nothing can be read
     */
    public static String getAddress(String url) {
        return addressOf(fetchOrNull(url), url);
    }

    /**
     * Reads the total price of a house.
     *
     * @param url house detail page URL
     * @return the page's total multiplied by 10000, or 0 if it cannot be read
     */
    public static Integer getPrice(String url) {
        return priceOf(fetchOrNull(url), url);
    }

    /**
     * Reads the property-right term of a house.
     *
     * @param url house detail page URL
     * @return the leading number of the 13th {@code .base li} item after its four-character
     *         label, or 0 if it cannot be read
     */
    public static Integer getHouseProperty(String url) {
        return housePropertyOf(fetchOrNull(url), url);
    }

    /**
     * Reads the layout (house type) of a house.
     *
     * @param url house detail page URL
     * @return text of the first {@code .base .content li} item, or {@code null}
     */
    public static String getHouseType(String url) {
        return houseTypeOf(fetchOrNull(url), url);
    }

    /**
     * Reads the construction area of a house.
     *
     * @param url house detail page URL
     * @return the area, or 0.0 if it cannot be read
     */
    public static Double getConstructionArea(String url) {
        return constructionAreaOf(fetchOrNull(url), url);
    }

    /**
     * Reads the inner (usable) area of a house.
     *
     * @param url house detail page URL
     * @return the area, or 0.0 if it cannot be read
     */
    public static Double getInnerArea(String url) {
        return innerAreaOf(fetchOrNull(url), url);
    }

    /**
     * Computes how many years ago the building was constructed.
     *
     * @param url house detail page URL
     * @return current calendar year minus the build year, or 0 if it cannot be read
     */
    public static int getTimeLeft(String url) {
        return timeLeftOf(fetchOrNull(url), url);
    }

    /**
     * Reads the price per square metre of a house.
     *
     * @param url house detail page URL
     * @return the unit price, or 0.0 if it cannot be read
     */
    public static Double getUnitPrice(String url) {
        return unitPriceOf(fetchOrNull(url), url);
    }

    /** Downloads {@code url}; reports the failure and returns {@code null} if that fails. */
    private static Document fetchOrNull(String url) {
        try {
            return Jsoup.connect(url).get();
        } catch (Exception e) {
            reportFailure("page", url, e);
            return null;
        }
    }

    /** Prints a one-line diagnostic instead of silently dropping a failure. */
    private static void reportFailure(String what, String url, Exception cause) {
        System.err.println("[LianJiaData] could not read " + what + " from " + url + ": " + cause);
    }

    /**
     * Returns the longest numeric prefix of {@code text}: digits and, when
     * {@code allowFraction} is set, at most one decimal point. May be empty.
     */
    private static String leadingNumber(String text, boolean allowFraction) {
        int end = 0;
        boolean seenPoint = false;
        while (end < text.length()) {
            char c = text.charAt(end);
            boolean digit = c >= '0' && c <= '9';
            boolean point = allowFraction && c == '.' && !seenPoint;
            if (!digit && !point) {
                break;
            }
            seenPoint = seenPoint || point;
            end++;
        }
        return text.substring(0, end);
    }

    /** Parses the introduction from an already loaded page; {@code null} on any failure. */
    private static String introductionOf(Document doc, String url) {
        if (doc == null) {
            return null;
        }
        try {
            return doc.select(".baseattribute").select(".clear .content").get(0).text();
        } catch (RuntimeException e) {
            reportFailure("introduction", url, e);
            return null;
        }
    }

    /** Parses the address from an already loaded page; keeps the first part if the second is missing. */
    private static String addressOf(Document doc, String url) {
        if (doc == null) {
            return null;
        }
        String address = null;
        try {
            address = doc.select(".areaName .info").get(0).text();
            address = address + doc.select(".areaName .supplement").get(0).text();
        } catch (RuntimeException e) {
            reportFailure("address", url, e);
        }
        return address;
    }

    /**
     * Parses the total price from an already loaded page.
     * The text is read as a decimal so that totals such as "398.5" are not rejected.
     */
    private static Integer priceOf(Document doc, String url) {
        if (doc == null) {
            return 0;
        }
        try {
            String total = doc.select(".price .total").get(0).text();
            return (int) Math.round(Double.parseDouble(total) * 10000);
        } catch (RuntimeException e) {
            reportFailure("price", url, e);
            return 0;
        }
    }

    /** Parses the property-right term from an already loaded page; 0 on any failure. */
    private static Integer housePropertyOf(Document doc, String url) {
        if (doc == null) {
            return 0;
        }
        try {
            String item = doc.select(".base li").get(12).text();
            // Skip the four-character label, then take the whole number that follows.
            return Integer.parseInt(leadingNumber(item.substring(4), false));
        } catch (RuntimeException e) {
            reportFailure("house property", url, e);
            return 0;
        }
    }

    /** Parses the layout text from an already loaded page; {@code null} on any failure. */
    private static String houseTypeOf(Document doc, String url) {
        if (doc == null) {
            return null;
        }
        try {
            return doc.select(".base .content li").get(0).text();
        } catch (RuntimeException e) {
            reportFailure("house type", url, e);
            return null;
        }
    }

    /** Parses the construction area (third {@code .base li} item). */
    private static Double constructionAreaOf(Document doc, String url) {
        return areaAt(doc, 2, "construction area", url);
    }

    /** Parses the inner area (fifth {@code .base li} item). */
    private static Double innerAreaOf(Document doc, String url) {
        return areaAt(doc, 4, "inner area", url);
    }

    /**
     * Parses the number that follows the four-character label of a {@code .base li} item.
     * The full numeric prefix is used; a fixed five-character slice truncates longer
     * values and fails on shorter ones.
     */
    private static Double areaAt(Document doc, int itemIndex, String what, String url) {
        if (doc == null) {
            return 0.0;
        }
        try {
            String item = doc.select(".base li").get(itemIndex).text();
            return Double.parseDouble(leadingNumber(item.substring(4), true));
        } catch (RuntimeException e) {
            reportFailure(what, url, e);
            return 0.0;
        }
    }

    /** Parses the build year (first four characters) and returns its distance to the current year. */
    private static int timeLeftOf(Document doc, String url) {
        if (doc == null) {
            return 0;
        }
        try {
            String info = doc.select(".area .subInfo").get(0).text();
            int buildYear = Integer.parseInt(info.substring(0, 4));
            // Use the real current year; a fixed literal goes stale after that year.
            return Calendar.getInstance().get(Calendar.YEAR) - buildYear;
        } catch (RuntimeException e) {
            reportFailure("build year", url, e);
            return 0;
        }
    }

    /** Parses the numeric prefix of the unit price text, whatever its number of digits. */
    private static Double unitPriceOf(Document doc, String url) {
        if (doc == null) {
            return 0.0;
        }
        try {
            String text = doc.select(".unitPriceValue").get(0).text();
            return Double.parseDouble(leadingNumber(text, true));
        } catch (RuntimeException e) {
            reportFailure("unit price", url, e);
            return 0.0;
        }
    }
}
其中存在的问题是:由于初学爬虫,断点续爬还不会写。此段代码获取到所有房屋地址List后,在线程类中被引用,再调用本类方法进行处理。由于每个线程处理的是所有房屋地址List中的一部分(下标范围不同),故各线程run方法中的执行逻辑(循环范围)不同,代码如下
package thread;
import java.io.IOException;
import java.util.List;
import PachongTest.LianJiaData;
/**
 * Entry point that creates the fifteen crawler worker threads and starts them.
 */
public class ThreadDemo {
    /**
     * Instantiates Thread01 through Thread15 in numeric order, then starts them in the
     * same order. The method returns as soon as the last worker has been started; it
     * does not wait for any of them to finish.
     *
     * @param args ignored
     */
    public static void main(String[] args){
        Thread[] workers = {
            new Thread01(), new Thread02(), new Thread03(), new Thread04(), new Thread05(),
            new Thread06(), new Thread07(), new Thread08(), new Thread09(), new Thread10(),
            new Thread11(), new Thread12(), new Thread13(), new Thread14(), new Thread15()
        };
        for (Thread worker : workers) {
            worker.start();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [0, 2000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread01 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 0;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 2000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [2000, 4000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread02 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 2000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 4000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [4000, 6000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread03 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 4000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 6000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [6000, 8000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread04 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 6000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 8000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [8000, 10000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread05 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 8000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 10000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [12000, 14000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread06 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 12000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 14000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [14000, 16000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread07 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 14000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 16000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [16000, 18000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread08 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 16000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 18000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [18000, 20000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread09 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 18000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 20000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [20000, 22000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread10 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 20000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 22000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [22000, 24000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread11 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 22000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 24000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [24000, 26000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread12 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 24000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 26000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [26000, 28000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread13 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 26000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 28000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores every house from list position 28000 up to
 * the end of the list.
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only the tail of it.
 */
class Thread14 extends Thread{
    /** Processes the tail of the list; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            int index = 28000;
            // Nothing is processed when the list holds 28000 entries or fewer.
            while (index < houseUrls.size()) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
                index++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
/**
 * Worker thread that crawls and stores the houses at list positions [10000, 12000).
 *
 * <p>Each run obtains its own copy of the full URL list through
 * {@code LianJiaData.getAllHomeUrlList()} and processes only its slice.
 * NOTE(review): this means the whole listing is crawled again by this worker;
 * fetching the list once and sharing it would avoid the repeated requests.
 */
class Thread15 extends Thread{
    /** First list position handled by this worker (inclusive). */
    private static final int FROM_INDEX = 10000;
    /** Last list position handled by this worker (exclusive). */
    private static final int TO_INDEX = 12000;

    /** Processes this worker's slice; an I/O failure while listing is printed and ends the run. */
    @Override
    public void run() {
        try {
            List<String> houseUrls = LianJiaData.getAllHomeUrlList();
            // Clamp to the real list size: with the fixed bound alone, get() throws
            // IndexOutOfBoundsException when fewer URLs were collected.
            int end = Math.min(TO_INDEX, houseUrls.size());
            for (int index = FROM_INDEX; index < end; index++) {
                LianJiaData.getHouseMessage(houseUrls.get(index));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
其中遇到的问题是:不同线程执行时,LianJiaData类中getAllHomeUrlList()方法里的计数器j被执行多次(原因是每个线程都各自调用了一次getAllHomeUrlList(),整个地址列表被重复抓取),导致地址计数次数重复,希望有大神指出错误
待解决问题:链接超时 connect timeout
在设置超时时间 Document doc = Jsoup.connect(url).timeout(3000).get() 后解决了一部分,报错有所减少,但仍有超时发生