本文为开发流程记录,排版和言语存在的问题请大家谅解,如有错误或者建议,请指出,谢谢。
之前写过一个使用Python+Beautiful Soup爬故事会,然后存文件的小爬虫,当时没有记录,最近公司不忙,写点东西玩玩,利用JAVA+jsoup+Oracle开发了一个爬取某房产网上的二手房信息并写入数据库小东西。废话不多说,开撸。
1、先分析网址,发现每一页的url都一样,只是页码是pn,那好办,写个循环即可。
2、F12分析源码,发现每一个房源都是一个class为list-item的li标签,房源信息在class为details-item的div标签里,价格信息在class为pro-price的div标签里。
3、到这里,思路已经很清晰了,用循环一个个撸页面,然后再遍历里面class为list-item的li标签并解析class为details-item和pro-price的div标签,最后在将数据塞进数据库就大功告成。
好了,废话不多说,上代码。先导入jsoup和ojdbc的包(文章末链接两个包都有)。
/**
* 爬取房源信息并解析写入数据库
* @param anjukePageNum
*/
static String city, address, community_name, house_type, storey, construction_time, create_time;
static double area, price, unit_price;
static int total = 0;
public static void crawl_AJK(int anjukePageNum) {
String Url = "https://nanjing.anjuke.com/sale/p" + anjukePageNum + "/#filtersort";
Document document;
try {
document = Jsoup.connect(Url).timeout(4000).ignoreContentType(true)
.userAgent("Mozilla\" to \"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0)").get();
Elements elements = document.select("li.list-item");
for (Element element : elements) {
// System.out.println(element);
Element e1 = element.select("div.details-item").first();
Element e2 = element.select("div.details-item").last();
Element e3 = element.select("div.pro-price").first();
Elements infos1 = e1.select("span");
Elements infos2 = e2.select("span");
Elements infos3 = e3.select("span");
String[] info2 = infos2.select("span.comm-address").first().childNode(0).toString().trim()
.split(" ");
System.out.println("crawl:" + total);
total++;
city = "南京";
// System.out.println(info2[1]);
address = info2[1];
// System.out.println(info2[0]);
community_name = info2[0];
for (int i = 0; i < infos1.size() - 1; i++) {
// System.out.println(infos1.get(i).childNode(0).toString().trim());// 平米
switch (i) {
case 0:
house_type = infos1.get(i).childNode(0).toString().trim();
break;
case 1:
area = Double.parseDouble(infos1.get(i).childNode(0).toString().trim().replace("m²", ""));
break;
case 2:
storey = infos1.get(i).childNode(0).toString().trim();
break;
case 3:
construction_time = infos1.get(i).childNode(0).toString().trim();
break;
}
}
for (int i = 0; i < infos3.size(); i++) {
// System.out.println(infos3.get(i).childNode(0).toString().trim().replace("<strong>",
// "")
// .replaceAll("</strong>", "万"));// 价格
switch (i) {
case 0:
price = Double.parseDouble(infos3.get(i).childNode(0).toString().trim().replace("<strong>", "")
.replaceAll("</strong>", ""));
break;
case 1:
unit_price = Double
.parseDouble(infos3.get(i).childNode(0).toString().trim().replace("元/m²", ""));
break;
}
}
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");// 设置日期格式
// 创建OperateOracle对象
DBHelper db = new DBHelper();
// 插入数据
db.AddData(city, address, community_name, house_type, storey, construction_time, area, price,
unit_price, df.format(new Date()));
System.out.println("小区:" + community_name + " 面积:" + area + " 价格:" + price);
System.out.println("------------------");
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
数据库操作DBHelper的相关源码
// 定义连接所需的字符串
private static String USERNAMR = "username";
private static String PASSWORD = "pwd";
private static String DRVIER = "oracle.jdbc.OracleDriver";
private static String URL = "jdbc:oracle:thin:@172.21.204.5:1521:orcl";
// 创建一个数据库连接
Connection connection = null;
// 创建预编译语句对象,一般都是用这个而不用Statement
PreparedStatement pstm = null;
// 创建一个结果集对象
ResultSet rs = null;
/**
* 插入数据
* @param city
* @param address
* @param community_name
* @param house_type
* @param storey
* @param construction_time
* @param area
* @param price
* @param unit_price
* @param create_time
*/
public void AddData(String city, String address, String community_name, String house_type, String storey,
String construction_time, double area, double price, double unit_price, String create_time) {
connection = getConnection();
String sql = "select count(*) from ajk_data where 1 = 1";
int count = 0;
try {
// 计算数据库student表中数据总数
pstm = connection.prepareStatement(sql);
rs = pstm.executeQuery();
while (rs.next()) {
count = rs.getInt(1) + 1;
System.out.println(count + "写入数据库成功!");
}
String sqlStr = "insert into ajk_data values(?,?,?,?,?,?,?,?,?,?,?)";
// 执行插入数据操作
pstm = connection.prepareStatement(sqlStr);
pstm.setInt(1, count);
pstm.setString(2, city);
pstm.setString(3, address);
pstm.setString(4, community_name);
pstm.setString(5, house_type);
pstm.setString(6, storey);
pstm.setString(7, construction_time);
pstm.setDouble(8, area);
pstm.setDouble(9, price);
pstm.setDouble(10, unit_price);
pstm.setDate(11, strToDate(create_time));
pstm.executeUpdate();
} catch (SQLException e) {
e.printStackTrace();
} finally {
ReleaseResource();
}
}
/**
* 获取Connection对象
*
* @return
*/
public Connection getConnection() {
try {
Class.forName(DRVIER);
connection = DriverManager.getConnection(URL, USERNAMR, PASSWORD);
System.out.println("成功连接数据库");
} catch (ClassNotFoundException e) {
throw new RuntimeException("class not find !", e);
} catch (SQLException e) {
throw new RuntimeException("get connection error!", e);
}
return connection;
}
/**
* 释放资源
*/
public void ReleaseResource() {
if (rs != null) {
try {
rs.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (pstm != null) {
try {
pstm.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (connection != null) {
try {
connection.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
/**
* @param 返回java.sql.Date格式的
*/
public static java.sql.Date strToDate(String strDate) {
String str = strDate;
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
java.util.Date d = null;
try {
d = format.parse(str);
} catch (Exception e) {
e.printStackTrace();
}
java.sql.Date date = new java.sql.Date(d.getTime());
return date;
}
程序的入口
public static void main(String args[]) {
for (int i = 1; i <= 50; i++) {
try {
crawl_AJK(i);
Thread.sleep(1000);
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
当然还有表结构
create table AJK_DATA
(
id VARCHAR2(100),
city VARCHAR2(100),
address VARCHAR2(100),
community_name VARCHAR2(500),
house_type VARCHAR2(100),
storey VARCHAR2(100),
construction_time VARCHAR2(100),
area NUMBER,
price NUMBER,
unit_price NUMBER,
create_time DATE
)