抓取搜房新房


package soufun;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Calendar;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class NewHouseIDList {
private static Connection mysql_connection;
private static Statement statement;
private static ResultSet resultSet;

/**
 * Crawls result pages 1 to 685 of the SouFun Shenzhen new-house search and stores every
 * listing whose link is not yet present in {@code uchome_z_soufun_newhouse_link}.
 *
 * <p>For each listing the detail link, the property type, the unit price and the cover image
 * URL are read from the listing HTML. A listing whose markup does not have the expected
 * shape is logged and skipped instead of aborting the whole crawl. All SQL goes through
 * prepared statements because every stored value comes from scraped page content.
 *
 * @param args unused
 * @throws ParserException if a page cannot be parsed
 * @throws IOException     if a page URL cannot be opened
 * @throws SQLException    if a statement fails
 */
public static void main(String[] args) throws ParserException, IOException,
        SQLException {
    String mysql_url = "jdbc:mysql://localhost:3306/abcdefg?characterEncoding=UTF-8";
    String username = "root";
    String password = "soufunroot";

    try {
        Class.forName("com.mysql.jdbc.Driver");
        mysql_connection = DriverManager.getConnection(mysql_url, username,
                password);
    } catch (ClassNotFoundException cnfex) {
        // The JDBC driver class could not be loaded.
        System.err.println("装载 JDBC/ODBC 驱动程序失败。");
        cnfex.printStackTrace();
        System.exit(1); // terminate program
    } catch (SQLException sqlex) {
        // The database connection could not be established.
        System.err.println("无法连接数据库");
        sqlex.printStackTrace();
        System.exit(1); // terminate program
    }
    System.out.println("连接数据库成功");

    java.sql.PreparedStatement countStmt = null;
    java.sql.PreparedStatement insertStmt = null;
    try {
        countStmt = mysql_connection
                .prepareStatement("select count(id) from uchome_z_soufun_newhouse_link where link=?");
        insertStmt = mysql_connection
                .prepareStatement("insert into uchome_z_soufun_newhouse_link(link,`物业类型`,`单价`,house_face_img_url,date) values(?,?,?,?,now())");

        for (int i = 1; i <= 685; i++) {
            String new_house_url = "http://newhouse.sz.soufun.com/house/%C9%EE%DB%DA_________________"
                    + i + "_.htm";
            HttpURLConnection connection = (HttpURLConnection) new URL(
                    new_house_url).openConnection();
            try {
                Parser pageParser;
                try {
                    pageParser = new Parser(connection);
                } catch (Exception e) {
                    // Best effort: a page that cannot be opened is skipped.
                    System.out.println(e.getMessage());
                    continue;
                }
                pageParser.setEncoding("GB2312");
                NodeList listings = pageParser
                        .extractAllNodesThatMatch(new HasAttributeFilter(
                                "class", "searchListNoraml"));

                for (int j = 0; j < listings.size(); j++) {
                    String wuye_type = "";
                    String house_price = "";

                    // Re-parse only this listing's HTML so the filters below see one entry.
                    Parser parser = new Parser();
                    parser.setInputHTML(listings.elementAt(j).toHtml());
                    parser.setEncoding("GB2312");

                    NodeList matches = parser
                            .extractAllNodesThatMatch(new HasAttributeFilter(
                                    "class", "photo"));
                    if (matches == null || matches.size() == 0) {
                        parser.reset();
                        matches = parser
                                .extractAllNodesThatMatch(new HasAttributeFilter(
                                        "class", "phototuig"));
                    }
                    if (matches == null || matches.size() == 0) {
                        // Neither photo container exists: nothing to read the link from.
                        System.out.println("获取楼盘链接失败:[" + j + "]"
                                + new_house_url);
                        continue;
                    }

                    Node photoFirstChild = matches.elementAt(0).getFirstChild();
                    Node node_a = photoFirstChild == null ? null
                            : photoFirstChild.getNextSibling();
                    if (node_a == null) {
                        continue;
                    }
                    if (!(node_a instanceof LinkTag)) {
                        System.out.println("获取楼盘链接失败:[" + j + "]"
                                + new_house_url);
                        continue;
                    }
                    String house_link = ((LinkTag) node_a).getLink();

                    Node linkFirstChild = node_a.getFirstChild();
                    Node imageNode = linkFirstChild == null ? null
                            : linkFirstChild.getNextSibling();
                    if (!(imageNode instanceof org.htmlparser.tags.ImageTag)) {
                        System.out.println("获取楼盘链接失败:[" + j + "]"
                                + new_house_url);
                        continue;
                    }
                    String house_face_img_url = ((org.htmlparser.tags.ImageTag) imageNode)
                            .getAttribute("src");

                    System.out.println(new_house_url);
                    System.out.println("[" + i + "]:" + house_link + ":"
                            + house_face_img_url);
                    System.out
                            .println("------------------------------------------------------------");

                    parser.reset();
                    matches = parser
                            .extractAllNodesThatMatch(new HasAttributeFilter(
                                    "class", "purpose"));
                    if (matches != null && matches.size() > 0) {
                        wuye_type = matches.elementAt(0).toPlainTextString();
                        wuye_type = wuye_type.replaceAll("物业类型:", "");

                        parser.reset();
                        matches = parser
                                .extractAllNodesThatMatch(new HasAttributeFilter(
                                        "class", "price_type"));
                        if (matches != null && matches.size() > 0) {
                            house_price = matches.elementAt(0)
                                    .toPlainTextString();
                            // Prices quoted in units of 10,000 yuan are expanded to yuan.
                            if (matches.elementAt(0).getParent()
                                    .toPlainTextString().indexOf("万元") >= 0) {
                                house_price = house_price + "0000";
                            }
                            try {
                                Integer.parseInt(house_price);
                            } catch (NumberFormatException e) {
                                // Non-numeric price text is stored as 0.
                                house_price = "0";
                            }
                        }
                    } else {
                        // Alternative listing layout: the link is rebuilt from the numeric id
                        // found between "housing/" and the following ".htm".
                        parser.reset();
                        matches = parser
                                .extractAllNodesThatMatch(new HasAttributeFilter(
                                        "id", "houselist"));
                        if (matches != null && matches.size() > 0) {
                            String str_temp = matches.elementAt(0).toHtml();
                            if (str_temp.indexOf(".htm") >= 0) {
                                int idStart = str_temp.indexOf("housing/");
                                int idEnd = idStart < 0 ? -1 : str_temp
                                        .indexOf(".htm", idStart + 8);
                                String houseId = idEnd < 0 ? "" : str_temp
                                        .substring(idStart + 8, idEnd);
                                if (houseId.matches("[0-9]+")) {
                                    house_link = "http://newhouse.sz.soufun.com/house/"
                                            + houseId + ".htm";
                                } else {
                                    System.out.println("获取楼盘链接失败:[" + j
                                            + "]" + new_house_url);
                                    continue;
                                }
                            }
                        }

                        parser.reset();
                        matches = parser
                                .extractAllNodesThatMatch(new HasAttributeFilter(
                                        "class", "font16"));
                        if (matches != null && matches.size() > 0) {
                            house_price = matches.elementAt(0)
                                    .toPlainTextString();
                            if (matches.elementAt(0).getParent()
                                    .toPlainTextString().indexOf("万元") >= 0) {
                                house_price = house_price + "0000";
                            }
                        }
                    }

                    // Insert only when the link has not been stored before.
                    boolean isNewLink = false;
                    countStmt.setString(1, house_link);
                    ResultSet countResult = countStmt.executeQuery();
                    try {
                        if (countResult.next()) {
                            isNewLink = countResult.getInt(1) == 0;
                        }
                    } finally {
                        countResult.close();
                    }
                    if (isNewLink) {
                        insertStmt.setString(1, house_link);
                        insertStmt.setString(2, wuye_type);
                        insertStmt.setString(3, house_price);
                        // String.valueOf keeps the literal "null" that string concatenation
                        // used to store; NewHouseDetail.main trims this column without a
                        // null check.
                        insertStmt.setString(4,
                                String.valueOf(house_face_img_url));
                        insertStmt.execute();
                    }
                }
            } finally {
                connection.disconnect();
            }
        }
    } finally {
        try {
            if (insertStmt != null) {
                insertStmt.close();
            }
        } finally {
            try {
                if (countStmt != null) {
                    countStmt.close();
                }
            } finally {
                mysql_connection.close();
            }
        }
    }
}

/**
 * Removes every occurrence of a fixed list of patterns (line feed, carriage return, the
 * whitespace literals below and the colon) from the given text.
 *
 * @param str text to clean; must not be null
 * @return the text with all listed patterns removed
 */
public static String replaceString(String str) {
    // Applied in this order; each entry is used as a regular expression by replaceAll.
    final String[] patterns = { "\n", "\r", " ", " ", ":", " " };
    String cleaned = str;
    for (int idx = 0; idx < patterns.length; idx++) {
        cleaned = cleaned.replaceAll(patterns[idx], "");
    }
    return cleaned;
}
}


package soufun;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Calendar;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.LinkStringFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.DefinitionList;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;
import org.json.simple.parser.ContainerFactory;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;

public class NewHouseDetail {
private static String mysql_url = "jdbc:mysql://localhost:3306/abcdefg?characterEncoding=UTF-8";
private static Connection mysql_connection;
private static Statement link_statement;
private static Statement statement;
private static ResultSet link_resultSet;
private static ResultSet resultSet;
private static String query;
private static String soufun_agents_img_path = "d:/soufun/soufun_agents_img/";

/**
 * Scrapes the detail page of every row in uchome_z_soufun_newhouse_link whose type is 0 and
 * writes the extracted fields back to that row.
 *
 * For each link the method builds one UPDATE statement in the static field "query": it starts
 * with "set type=1,update_time=now()" and appends one ",column='value'" fragment per field
 * found on the page. A row whose page lacks the expected title node is marked type=2 instead.
 * After the update it stores the cover image (savePicture), converts names to ids
 * (dataConversion) and calls NewHousePriceTrend.getNewHousePriceTrend and
 * NewHouseImg.getNewHouseImg for the same link.
 *
 * NOTE(review): every value is concatenated into the SQL text. replaceString strips single
 * quotes from most scraped values, but str_link, soufun_id and the MapABC coordinates are
 * appended as-is -- consider a PreparedStatement.
 * NOTE(review): most n_list.elementAt(0) calls below are not preceded by a size check, so a
 * page missing one of the expected nodes presumably fails with an exception -- confirm.
 *
 * @param args unused
 * @throws ParserException declared by the HTML parser calls
 * @throws IOException     declared by the URL / connection calls
 * @throws SQLException    declared by the JDBC calls
 */
public static void main(String[] args) throws ParserException, IOException,
SQLException {
String username = "root";
String password = "soufunroot";
// NOTE(review): several of the locals declared below (house_id, str_02, lsf, node_a..node_k,
// house_link, source_agent_img_url, floor, house_style, fileInfo) are never used in this method.
String str_link = null, house_id = null;
String str_01 = null, str_02 = null;
HasAttributeFilter attrfilter = null;
LinkStringFilter lsf = null;
NodeList n_list = null;
Node node_a = null, node_b = null, node_c = null, node_d = null, node_e = null, node_f = null, node_g = null, node_h = null, node_i = null, node_j = null, node_k = null;
String house_link = null, project_img_link = null, soufun_id = null;
String source_agent_img_url = null;
String str_temp = null;
String house_face_img_url = null, floor = null, wuye_type = null, area = null, sub_area = null, house_style = null;
String[] fileInfo = null, mapabc_xy = null;
try {
Class.forName("com.mysql.jdbc.Driver");
mysql_connection = DriverManager.getConnection(mysql_url, username,
password);
}
// Handle failure to load the JDBC driver class.
catch (ClassNotFoundException cnfex) {
System.err.println("装载 JDBC/ODBC 驱动程序失败。");
cnfex.printStackTrace();
System.exit(1); // terminate program
}
// Handle failure to connect to the database.
catch (SQLException sqlex) {
System.err.println("无法连接数据库");
sqlex.printStackTrace();
System.exit(1); // terminate program
}
System.out.println("连接数据库成功");

// Two statements: link_statement keeps the row cursor open while "statement" runs the
// per-row queries and updates (also used by dataConversion and savePicture).
link_statement = mysql_connection.createStatement();
statement = mysql_connection.createStatement();
query = "select link,id,house_face_img_url,`物业类型` from uchome_z_soufun_newhouse_link where type=0";
link_resultSet = link_statement.executeQuery(query);
if (link_resultSet.next()) {
do {
str_link = link_resultSet.getString(1).trim();
house_face_img_url = link_resultSet.getString(3).trim();
wuye_type = link_resultSet.getString(4).trim();
// Start of the UPDATE for this row; column assignments are appended below.
query = "update uchome_z_soufun_newhouse_link set type=1,update_time=now()";
URL server = new URL(str_link);
HttpURLConnection connection = (HttpURLConnection) server
.openConnection();
Parser parser = null;
try {
parser = new Parser(connection);
} catch (Exception e) {
// The page could not be opened: log and move on to the next row
// (continue jumps to the do-while condition, i.e. link_resultSet.next()).
System.out.println(e.getMessage());
continue;
}
parser.setEncoding("GB2312");
// The id of the title node depends on whether the link contains "office" or "shop".
if (str_link.indexOf("office") >= 0)
attrfilter = new HasAttributeFilter("id", "officexq_8");
else if (str_link.indexOf("shop") >= 0)
attrfilter = new HasAttributeFilter("id", "shopxq_8");
else
attrfilter = new HasAttributeFilter("id", "newszxq_19");
n_list = parser.extractAllNodesThatMatch(attrfilter);
if (n_list == null || n_list.size() == 0) {
// Title node not found: mark the row as type=2 and skip it.
System.out.println("抓取" + str_link + "失败");
query = "update uchome_z_soufun_newhouse_link set type=2,update_time=now()";
query = query + " where link='" + str_link + "'";
statement.execute(query);
continue;
}
System.out
.println(link_resultSet.getString(2).trim()
+ ":["
+ str_link
+ "]:开始---------------------------------------------------------");
// Project name: plain text of the title node's first child.
str_01 = n_list.elementAt(0).getFirstChild()
.toPlainTextString();
str_01 = replaceString(str_01);
System.out.println("楼盘名称:" + str_01);
query = query + ",楼盘名称='" + str_01 + "'";
// HTML of the title node's siblings, re-parsed separately for status and alias.
str_temp = n_list.elementAt(0).getParent().getChildren()
.toHtml();

Parser parser_temp = new Parser();
parser_temp.setInputHTML(str_temp);
parser_temp.setEncoding("GB2312");
attrfilter = new HasAttributeFilter("class", "mingzil012_0511");
n_list = parser_temp.extractAllNodesThatMatch(attrfilter);
str_01 = n_list.elementAt(0).toPlainTextString();
str_01 = replaceString(str_01);
System.out.println("状态:" + str_01);
query = query + ",状态='" + str_01 + "'";

// Alias: text between <b class="ffleft"> (18 characters long) and the first "</b>".
// NOTE(review): indexOf("</b>") finds the first "</b>" in str_temp, which is not
// necessarily the one closing this tag -- confirm against the page markup.
if (str_temp.indexOf("<b class=\"ffleft\">") >= 0) {
str_01 = str_temp.substring(str_temp
.indexOf("<b class=\"ffleft\">") + 18, str_temp
.indexOf("</b>"));
str_01 = replaceString(str_01);
System.out.println("楼盘别名:" + str_01);
query = query + ",楼盘别名='" + str_01 + "'";
}

// Album link; soufun_id is the file name of that link without ".htm".
parser.reset();
attrfilter = new HasAttributeFilter("id", "newszxq_26");
n_list = parser.extractAllNodesThatMatch(attrfilter);
project_img_link = ((LinkTag) n_list.elementAt(0)).getLink();
System.out.println("楼盘相册:" + project_img_link);
soufun_id = project_img_link
.substring(project_img_link.lastIndexOf("/") + 1,
project_img_link.indexOf(".htm"));
query = query + ",soufun_id='" + soufun_id + "'";
System.out.println("楼盘id:" + soufun_id);

// Table newszxq_41: each cell is matched by its label; the label is cut off with a
// fixed substring offset and the rest is stored as the value.
parser.reset();
attrfilter = new HasAttributeFilter("id", "newszxq_41");
TableTag tag = (TableTag) parser.extractAllNodesThatMatch(
attrfilter).elementAt(0);
TableRow[] rows = tag.getRows();
for (int j = 0; j < rows.length; j++) {
TableRow tr = (TableRow) rows[j];
TableColumn[] td = tr.getColumns();
for (int k = 0; k < td.length; k++) {
str_01 = replaceString(td[k].toPlainTextString());
if (str_01.trim().indexOf("物业类别") >= 0) {
str_01 = str_01.substring(4);
System.out.println("物业类别:" + str_01);
query = query + ",物业类别='" + str_01 + "'";
} else if (str_01.trim().indexOf("项目特色") >= 0) {
str_01 = str_01.substring(4);
System.out.println("项目特色:" + str_01);
query = query + ",项目特色='" + str_01 + "'";
} else if (str_01.trim().indexOf("建筑类别") >= 0) {
str_01 = str_01.substring(4);
System.out.println("建筑类别:" + str_01);
query = query + ",建筑类别='" + str_01 + "'";
} else if (str_01.trim().indexOf("装修状况") >= 0) {
str_01 = str_01.substring(4);
str_01 = str_01.replaceAll("\\[装修相册\\]", "");
str_01 = str_01.replaceAll("\\[建材卖场\\]", "");
System.out.println("装修状况:" + str_01);
query = query + ",装修状况='" + str_01 + "'";
} else if (str_01.trim().indexOf("环线位置") >= 0) {
str_01 = str_01.substring(4);
System.out.println("环线位置:" + str_01);
query = query + ",环线位置='" + str_01 + "'";
} else if (str_01.trim().indexOf("所属商圈") >= 0) {
str_01 = str_01.substring(4);
// Kept for dataConversion, which maps it to a zone id.
sub_area = str_01;
System.out.println("所属商圈:" + str_01);
query = query + ",所属商圈='" + str_01 + "'";
}
}
}

// Table newszxq_42: address and traffic.
parser.reset();
attrfilter = new HasAttributeFilter("id", "newszxq_42");
tag = (TableTag) parser.extractAllNodesThatMatch(attrfilter)
.elementAt(0);
rows = tag.getRows();
for (int j = 0; j < rows.length; j++) {
TableRow tr = (TableRow) rows[j];
TableColumn[] td = tr.getColumns();
for (int k = 0; k < td.length; k++) {
str_01 = replaceString(td[k].toPlainTextString());
if (str_01.trim().indexOf("物业地址") >= 0) {
str_01 = str_01.substring(4);
str_01 = str_01.replaceAll("\\[交通图\\]", "");
System.out.println("物业地址:" + str_01);
query = query + ",物业地址='" + str_01 + "'";
} else if (str_01.trim().indexOf("交通状况") >= 0) {
str_01 = str_01.substring(4);
System.out.println("交通状况:" + str_01);
// NOTE(review): the column 交通状况 is assigned a second time further down
// from node dli013, so the UPDATE can contain it twice.
query = query + ",交通状况='" + str_01 + "'";
}
}
}

// Table newszxq_43: dates, ratios, household count, licence, fees, management company.
// NOTE(review): the branches for 容积率, 绿化率 and 物业费 (3-character labels) and
// 预售许可证 (5-character label) all cut at offset 4, unlike 户数 (offset 2) and
// 开发商 (offset 3) which use the label length -- confirm against the page text
// whether these offsets drop or keep a character.
parser.reset();
attrfilter = new HasAttributeFilter("id", "newszxq_43");
tag = (TableTag) parser.extractAllNodesThatMatch(attrfilter)
.elementAt(0);
rows = tag.getRows();
for (int j = 0; j < rows.length; j++) {
TableRow tr = (TableRow) rows[j];
TableColumn[] td = tr.getColumns();
for (int k = 0; k < td.length; k++) {
str_01 = replaceString(td[k].toPlainTextString());
if (str_01.trim().indexOf("开盘时间") >= 0) {
str_01 = str_01.substring(4);
str_01 = str_01.replaceAll("\\[开盘详情\\]", "");
System.out.println("开盘时间:" + str_01);
query = query + ",开盘时间='" + str_01 + "'";
} else if (str_01.trim().indexOf("入住时间") >= 0) {
str_01 = str_01.substring(4);
str_01 = str_01.replaceAll("\\[入住详情\\]", "");
System.out.println("入住时间:" + str_01);
query = query + ",入住时间='" + str_01 + "'";
} else if (str_01.trim().indexOf("容积率") >= 0) {
str_01 = str_01.substring(4);
str_01 = str_01.replaceAll("\\[关于容积率\\]", "");
System.out.println("容积率:" + str_01);
query = query + ",容积率='" + str_01 + "'";
} else if (str_01.trim().indexOf("绿化率") >= 0) {
str_01 = str_01.substring(4);
str_01 = str_01.replaceAll("\\[关于绿化率\\]", "");
System.out.println("绿化率:" + str_01);
query = query + ",绿化率='" + str_01 + "'";
} else if (str_01.trim().indexOf("户数") >= 0) {
str_01 = str_01.substring(2);
str_01 = str_01.replaceAll("\\[房源详情\\]", "");
System.out.println("户数:" + str_01);
query = query + ",户数='" + str_01 + "'";
} else if (str_01.trim().indexOf("预售许可证") >= 0) {
str_01 = str_01.substring(4);
str_01 = str_01.replaceAll("\\[更多\\]", "");
System.out.println("预售许可证:" + str_01);
query = query + ",预售许可证='" + str_01 + "'";
} else if (str_01.trim().indexOf("物业费") >= 0) {
str_01 = str_01.substring(4);
str_01 = str_01.replaceAll("\\[点评物业\\]", "");
System.out.println("物业费:" + str_01);
query = query + ",物业费='" + str_01 + "'";
} else if (str_01.trim().indexOf("物业公司") >= 0) {
str_01 = str_01.substring(4);
System.out.println("物业公司:" + str_01);
query = query + ",物业公司='" + str_01 + "'";
}
}
}

// The node two siblings after table newszxq_43 is cast to a table and read for
// developer, sales address and price.
// NOTE(review): the cast assumes that sibling is always a TableTag -- confirm.
tag = (TableTag) tag.getNextSibling().getNextSibling();
rows = tag.getRows();
for (int j = 0; j < rows.length; j++) {
TableRow tr = (TableRow) rows[j];
TableColumn[] td = tr.getColumns();
for (int k = 0; k < td.length; k++) {
str_01 = replaceString(td[k].toPlainTextString());
if (str_01.trim().indexOf("开发商") >= 0) {
str_01 = str_01.substring(3);
str_01 = str_01.replaceAll("\\[相关网站\\]", "");
System.out.println("开发商:" + str_01);
query = query + ",开发商='" + str_01 + "'";
} else if (str_01.trim().indexOf("售楼地址") >= 0) {
str_01 = str_01.substring(4);
System.out.println("售楼地址:" + str_01);
query = query + ",售楼地址='" + str_01 + "'";
} else if (str_01.trim().indexOf("房价") >= 0
|| str_01.trim().indexOf("二手房销售电话") >= 0) {
// Keep the text after the label; "二手房销售电话" is 7 characters but offset 8 is used.
// NOTE(review): confirm whether the extra character is intended.
if (str_01.trim().indexOf("二手房销售电话") >= 0)
str_01 = str_01.substring(str_01.trim()
.indexOf("二手房销售电话") + 8);
else
str_01 = str_01.substring(str_01.trim()
.indexOf("房价") + 2);
str_01 = str_01.replaceAll("\\[房价走势\\]", "");
str_01 = str_01.replaceAll("\\[我要纠错\\]", "");
str_01 = str_01.replaceAll("\\[房贷计算器\\]", "");
System.out.println("房价:" + str_01);
query = query + ",房价='" + str_01 + "'";
}
}
}

// Free-text sections dli011 .. dli017: each node's plain text is stored in one column.
parser.reset();
attrfilter = new HasAttributeFilter("id", "dli011");
n_list = parser.extractAllNodesThatMatch(attrfilter);
str_01 = replaceString(n_list.elementAt(0).toPlainTextString());
System.out.println("项目介绍:" + str_01);
query = query + ",项目介绍='" + str_01 + "'";

parser.reset();
attrfilter = new HasAttributeFilter("id", "dli012");
n_list = parser.extractAllNodesThatMatch(attrfilter);
str_01 = replaceString(n_list.elementAt(0).toPlainTextString());
System.out.println("项目配套:" + str_01);
query = query + ",项目配套='" + str_01 + "'";

parser.reset();
attrfilter = new HasAttributeFilter("id", "dli013");
n_list = parser.extractAllNodesThatMatch(attrfilter);
str_01 = replaceString(n_list.elementAt(0).toPlainTextString());
System.out.println("交通状况:" + str_01);
query = query + ",交通状况='" + str_01 + "'";

parser.reset();
attrfilter = new HasAttributeFilter("id", "dli014");
n_list = parser.extractAllNodesThatMatch(attrfilter);
str_01 = replaceString(n_list.elementAt(0).toPlainTextString());
System.out.println("建材装修:" + str_01);
query = query + ",建材装修='" + str_01 + "'";

parser.reset();
attrfilter = new HasAttributeFilter("id", "dli015");
n_list = parser.extractAllNodesThatMatch(attrfilter);
str_01 = replaceString(n_list.elementAt(0).toPlainTextString());
System.out.println("楼层状况:" + str_01);
query = query + ",楼层状况='" + str_01 + "'";

parser.reset();
attrfilter = new HasAttributeFilter("id", "dli016");
n_list = parser.extractAllNodesThatMatch(attrfilter);
str_01 = replaceString(n_list.elementAt(0).toPlainTextString());
System.out.println("车位信息:" + str_01);
query = query + ",车位信息='" + str_01 + "'";

parser.reset();
attrfilter = new HasAttributeFilter("id", "dli017");
n_list = parser.extractAllNodesThatMatch(attrfilter);
str_01 = replaceString(n_list.elementAt(0).toPlainTextString());
System.out.println("相关信息:" + str_01);
query = query + ",相关信息='" + str_01 + "'";

// District: text of the sixth child node (first child plus five siblings) of the
// mbx_span1 node; kept in "area" for dataConversion.
parser.reset();
attrfilter = new HasAttributeFilter("class", "mbx_span1");
n_list = parser.extractAllNodesThatMatch(attrfilter);
str_01 = replaceString(n_list.elementAt(0).getFirstChild()
.getNextSibling().getNextSibling().getNextSibling()
.getNextSibling().getNextSibling().toPlainTextString());
area = str_01;
System.out.println("所属城区:" + str_01);
query = query + ",所属城区='" + str_01 + "'";

// ThreadSleep and MapABC are defined outside this file; presumably a pause between
// requests and a coordinate lookup returning {x, y} -- verify against those classes.
ThreadSleep.setThreadSleep();
mapabc_xy = MapABC.getMapABCXY(soufun_id, "深圳");
query = query + ",mapabc_x='" + mapabc_xy[0] + "'";
System.out.println(mapabc_xy[0]);
query = query + ",mapabc_y='" + mapabc_xy[1] + "'";
System.out.println(mapabc_xy[1]);

// Finish the UPDATE: add the row filter, double every backslash and drop line breaks.
query = query + " where link='" + str_link + "'";
query = query.replace("\\", "\\\\").replaceAll("\r", "")
.replaceAll("\n", "");
try {
statement.execute(query);
} catch (java.sql.SQLException e) {
System.out.println(query);
System.out.println("数据保存出错:" + e.getMessage());
// NOTE(review): the program stops with exit status 0 although the save failed.
System.exit(0);
}
// NOTE(review): house_face_img_url was already dereferenced by trim() when it was read
// from the result set, so it cannot be null at this point.
// The target path passed below is the URL text starting at the "/" that follows ".com".
if (house_face_img_url != null)
savePicture(soufun_id, "0", "封面图", "楼盘相册封面图",
house_face_img_url, house_face_img_url
.substring(house_face_img_url
.indexOf(".com/") + 4), "1", "",
"", "", "", "");
dataConversion(str_link, wuye_type, area, sub_area);
ThreadSleep.setThreadSleep();
NewHousePriceTrend.getNewHousePriceTrend(str_link, soufun_id);
ThreadSleep.setThreadSleep();
NewHouseImg.getNewHouseImg(str_link, soufun_id);
System.out
.println(link_resultSet.getString(2).trim()
+ ":["
+ str_link
+ "]:结束---------------------------------------------------------");
} while (link_resultSet.next());
}
}

/**
 * Converts the scraped property type, district and business-area names of one listing into
 * ids and stores them on the listing row identified by {@code str_link}.
 *
 * <p>Property type: "住宅" becomes 8, "别墅" becomes 14, anything else 0. District and
 * business area are looked up by name in {@code uchome_z_zone}; when the business area is
 * unknown, the zone named "其它" under the resolved district is used. A district named
 * "其他" or "其它" yields id 0 for both. Nothing is written when neither a property type nor
 * a district is supplied.
 *
 * <p>All values are bound through prepared statements because they originate from scraped
 * page text. The shared connection opened by {@code main} must already exist.
 *
 * @param str_link  link of the listing row to update
 * @param wuye_type property type name, may be null or blank
 * @param area      district name without the trailing "区", may be null or blank
 * @param sub_area  business-area name, may be null
 * @throws SQLException if a lookup or the update fails
 */
public static void dataConversion(String str_link, String wuye_type,
        String area, String sub_area) throws SQLException {
    System.out.println("数据转换:开始");
    StringBuilder sql = new StringBuilder(
            "update uchome_z_soufun_newhouse_link set update_time=now()");
    List<String> params = new LinkedList<String>();

    if (wuye_type != null && wuye_type.trim().length() > 0) {
        String typeName = wuye_type.trim();
        if (typeName.equals("住宅")) {
            wuye_type = "8";
        } else if (typeName.equals("别墅")) {
            wuye_type = "14";
        } else {
            wuye_type = "0";
        }
        sql.append(",`物业类型ID`=?");
        params.add(wuye_type);
        System.out.println("wuye_type:" + wuye_type);
    }

    if (area != null && area.trim().length() > 0) {
        String areaId = lookupZoneId(
                "select zoneid from uchome_z_zone where zonename=?",
                area + "区");
        if (areaId != null) {
            area = areaId;
        }

        // A null business area cannot match any zone name, so go straight to the fallback.
        String subAreaId = sub_area == null ? null : lookupZoneId(
                "select zoneid from uchome_z_zone where zonename=?",
                sub_area);
        if (subAreaId == null) {
            subAreaId = lookupZoneId(
                    "select zoneid from uchome_z_zone where zonename='其它' and parent_zoneid=?",
                    area);
        }
        if (subAreaId != null) {
            sub_area = subAreaId;
        }

        // The district lookup appends "区", so a district called "other" is never found
        // above and still carries its name here.
        if (area.trim().equals("其他") || area.trim().equals("其它")) {
            area = "0";
            sub_area = "0";
        }
        sql.append(",`城区ID`=?");
        params.add(area);
        sql.append(",`商圈ID`=?");
        // String.valueOf keeps the literal "null" the concatenated SQL used to write when
        // no business area could be resolved.
        params.add(String.valueOf(sub_area));
        System.out.println("area:" + area);
        System.out.println("sub_area:" + sub_area);
    }

    if (!params.isEmpty()) {
        sql.append(" where `link`=?");
        params.add(str_link);
        java.sql.PreparedStatement update = mysql_connection
                .prepareStatement(sql.toString());
        try {
            int position = 1;
            for (String value : params) {
                update.setString(position++, value);
            }
            update.execute();
        } finally {
            update.close();
        }
    }
    System.out.println("数据转换:结束");
}

/** Runs a single-column lookup and returns the trimmed first value, or null if none. */
private static String lookupZoneId(String sql, String... args)
        throws SQLException {
    java.sql.PreparedStatement lookup = mysql_connection
            .prepareStatement(sql);
    try {
        for (int i = 0; i < args.length; i++) {
            lookup.setString(i + 1, args[i]);
        }
        ResultSet rows = lookup.executeQuery();
        try {
            if (!rows.next()) {
                return null;
            }
            String id = rows.getString(1);
            return id == null ? null : id.trim();
        } finally {
            rows.close();
        }
    } finally {
        lookup.close();
    }
}

/**
 * Inserts one image record into {@code uchome_z_soufun_newhouse_img} unless a row with the
 * same {@code soufun_id} and {@code img_small_url} already exists.
 *
 * <p>Values are bound through prepared statements because URLs and titles come from scraped
 * pages. Each argument is bound via {@code String.valueOf}, so a null argument is stored and
 * compared as the text "null", exactly as the former concatenated SQL did; this keeps the
 * duplicate check effective for such rows. The shared connection opened by {@code main} must
 * already exist.
 *
 * @param soufun_id             listing id the image belongs to
 * @param xiangce_id            album id
 * @param xiangce_name          album name
 * @param img_title             image title
 * @param img_small_url         thumbnail URL, also the duplicate key
 * @param target_img_small_path local path for the thumbnail
 * @param is_face_img           flag text marking the cover image
 * @param img_big_url           full-size image URL
 * @param target_img_big_path   local path for the full-size image
 * @param house_style_type      floor-plan type
 * @param house_style_name      floor-plan name
 * @param house_sytle_area      floor-plan area
 * @throws SQLException if the duplicate check or the insert fails
 */
public static void savePicture(String soufun_id, String xiangce_id,
        String xiangce_name, String img_title, String img_small_url,
        String target_img_small_path, String is_face_img,
        String img_big_url, String target_img_big_path,
        String house_style_type, String house_style_name,
        String house_sytle_area) throws SQLException {
    boolean skipInsert;
    java.sql.PreparedStatement countStmt = mysql_connection
            .prepareStatement("select count(id) from uchome_z_soufun_newhouse_img where soufun_id=? and img_small_url=?");
    try {
        countStmt.setString(1, String.valueOf(soufun_id));
        countStmt.setString(2, String.valueOf(img_small_url));
        ResultSet countResult = countStmt.executeQuery();
        try {
            // No row at all is treated like "already present": nothing is inserted.
            skipInsert = !countResult.next() || countResult.getInt(1) != 0;
        } finally {
            countResult.close();
        }
    } finally {
        countStmt.close();
    }
    if (skipInsert) {
        return;
    }

    String[] values = { soufun_id, xiangce_id, xiangce_name, img_title,
            img_small_url, target_img_small_path, is_face_img,
            img_big_url, target_img_big_path, house_style_type,
            house_style_name, house_sytle_area };
    java.sql.PreparedStatement insertStmt = mysql_connection
            .prepareStatement("insert into uchome_z_soufun_newhouse_img(soufun_id,xiangce_id,xiangce_name,img_title,img_small_url,target_img_small_path,is_face_img,img_big_url,target_img_big_path,house_style_type,house_style_name,house_sytle_area,create_time) values(?,?,?,?,?,?,?,?,?,?,?,?,now())");
    try {
        for (int i = 0; i < values.length; i++) {
            insertStmt.setString(i + 1, String.valueOf(values[i]));
        }
        insertStmt.execute();
    } finally {
        insertStmt.close();
    }
}

/**
 * Removes every occurrence of a fixed list of patterns (line feed, carriage return, tab, the
 * whitespace literals below, the colon and the single quote) from the given text.
 *
 * @param str text to clean; must not be null
 * @return the text with all listed patterns removed
 */
public static String replaceString(String str) {
    // Applied in this order; each entry is used as a regular expression by replaceAll.
    final String[] patterns = { "\n", "\r", "\t", " ", " ", ":", "'", " " };
    String cleaned = str;
    for (int idx = 0; idx < patterns.length; idx++) {
        cleaned = cleaned.replaceAll(patterns[idx], "");
    }
    return cleaned;
}
}


package soufun;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Calendar;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class NewHouseImg {
private static Connection mysql_connection;
private static Statement area_statement;
private static Statement statement;
private static ResultSet area_resultSet;
private static ResultSet resultSet;

/**
 * Entry point with an empty body: launching this class directly does nothing.
 *
 * @param args unused
 */
public static void main(String[] args) throws ParserException, IOException,
SQLException {

}

public static void getNewHouseImg(String new_house_link, String soufun_id)
throws ParserException, IOException, SQLException {
String mysql_url = "jdbc:mysql://localhost:3306/abcdefg?characterEncoding=UTF-8";
String username = "root";
String password = "soufunroot";
String query = "";
try {
Class.forName("com.mysql.jdbc.Driver");
mysql_connection = DriverManager.getConnection(mysql_url, username,
password);
}
// 捕获加载驱动程序异常
catch (ClassNotFoundException cnfex) {
System.err.println("装载 JDBC/ODBC 驱动程序失败。");
cnfex.printStackTrace();
System.exit(1); // terminate program
}
// 捕获连接数据库异常
catch (SQLException sqlex) {
System.err.println("无法连接数据库");
sqlex.printStackTrace();
System.exit(1); // terminate program
}
System.out.println("连接数据库成功");
area_statement = mysql_connection.createStatement();
statement = mysql_connection.createStatement();
NodeList n_list = null, n_list_style = null, n_list_temp = null;
Node node_a = null;
String img_small_url = null, img_big_url = null, img_link = null, img_title = null, img_face_url = null, house_style_url = null, house_style_name = null, house_style_type = null, house_area = null, xiangce_name = null;
int img_page = 0, temp_i = 0;
Boolean find_face_img = false;
URL server = null;
HttpURLConnection connection = null;
Parser parser = null;
HasAttributeFilter attrfilter = null;

for (int k = 0; k <= 7; k++) {
if (k == 0) {
server = new URL(new_house_link + "/photo/list_90" + k + "_"
+ soufun_id + ".htm");
connection = (HttpURLConnection) server.openConnection();
try {
parser = new Parser(connection);
} catch (Exception e) {
System.out.println(e.getMessage());
continue;
}
attrfilter = new HasAttributeFilter("class", "hxtlist0531");
n_list_style = parser.extractAllNodesThatMatch(attrfilter)
.elementAt(0).getChildren();
temp_i = 0;
for (int l = 0; l < n_list_style.size(); l++) {
if (n_list_style.elementAt(l).getClass().getName().equals(
"org.htmlparser.tags.LinkTag")) {
temp_i = temp_i + 1;
house_style_url = ((LinkTag) n_list_style.elementAt(l))
.getLink();
if (house_style_url.indexOf("_1_1") >= 0)
house_style_type = "一居室";
else if (house_style_url.indexOf("_2_1") >= 0)
house_style_type = "二居室";
else if (house_style_url.indexOf("_3_1") >= 0)
house_style_type = "三居室";
else if (house_style_url.indexOf("_4_1") >= 0)
house_style_type = "四居室";
else if (house_style_url.indexOf("_5_1") >= 0)
house_style_type = "五居室";
else if (house_style_url.indexOf("_6_1") >= 0)
house_style_type = "六居室";
else if (house_style_url.indexOf("_7_1") >= 0)
house_style_type = "六居以上";
System.out.println(house_style_url);
server = new URL(house_style_url);
connection = (HttpURLConnection) server
.openConnection();
try {
parser = new Parser(connection);
} catch (Exception e) {
System.out.println(e.getMessage());
continue;
}
attrfilter = new HasAttributeFilter("class", "link_00");
n_list = parser.extractAllNodesThatMatch(attrfilter);
if (n_list.size() > 0) {
img_page = (int) Math.ceil(Double
.parseDouble(n_list.elementAt(0)
.toPlainTextString().trim()) / 12);
}
for (int m = 1; m <= img_page; m++) {
find_face_img = false;
house_style_url = house_style_url.replaceAll(
"_1.htm", "_" + m + ".htm");
server = new URL(house_style_url);
connection = (HttpURLConnection) server
.openConnection();
try {
parser = new Parser(connection);
} catch (Exception e) {
System.out.println(e.getMessage());
continue;
}
attrfilter = new HasAttributeFilter("class",
"container");
n_list = parser
.extractAllNodesThatMatch(attrfilter);
for (int n = 0; n < n_list.size(); n++) {
if (n_list
.elementAt(n)
.getFirstChild()
.getClass()
.getName()
.equals("org.htmlparser.nodes.TextNode")) {
node_a = n_list.elementAt(n)
.getFirstChild().getNextSibling()
.getFirstChild().getNextSibling();
img_small_url = ((org.htmlparser.tags.ImageTag) node_a)
.getImageURL();
img_link = ((org.htmlparser.tags.LinkTag) node_a
.getParent()).getLink();

// 图片详情页:开始
server = new URL(img_link);
connection = (HttpURLConnection) server
.openConnection();
try {
parser = new Parser(connection);
} catch (Exception e) {
System.out.println(e.getMessage());
continue;
}
attrfilter = new HasAttributeFilter(
"class", "photosearch_bt mingzi18");
parser.setEncoding("gb2312");
n_list_temp = parser
.extractAllNodesThatMatch(attrfilter);
img_title = n_list_temp.elementAt(0)
.toPlainTextString();
img_title = replaceString(img_title).trim();
if (img_title.trim().equals("") == false) {
img_title = img_title.substring(0,
img_title.length() - 1);
img_title = img_title + "平米";
parser.reset();
attrfilter = new HasAttributeFilter(
"class", "zuozhetopnr");
n_list_temp = parser
.extractAllNodesThatMatch(attrfilter);
for (int o = 0; o < n_list_temp.size(); o++) {
house_style_name = n_list_temp
.elementAt(o)
.toPlainTextString();
if (house_style_name
.indexOf("户型图:") >= 0) {
house_area = house_style_name
.substring(house_style_name
.lastIndexOf(" ") + 6);
if (house_area != null
&& house_area.trim()
.equals("") == false)
house_area = house_area
.substring(
0,
house_area
.length() - 1);
house_style_name = house_style_name
.substring(
house_style_name
.indexOf("户型图:") + 4,
house_style_name
.indexOf(" "));
System.out
.println(house_style_name);
break;
}
}
parser.reset();
attrfilter = new HasAttributeFilter(
"id", "imgDetail");
n_list_temp = parser
.extractAllNodesThatMatch(attrfilter);
img_big_url = ((org.htmlparser.tags.ImageTag) n_list_temp
.elementAt(0)).getImageURL();
System.out.println(img_big_url);
} else {
System.out.println("抓取" + img_link
+ "失败");
continue;
}
// 图片详情页:结束

if (img_face_url != null)
if (img_face_url.trim().equals(
img_small_url.trim())
&& find_face_img)
System.out.print("此图像是:相册封面图");
System.out.println("[" + k + "]:"
+ img_title + ":" + img_small_url
+ ":" + house_area);
query = "select count(id) from uchome_z_soufun_newhouse_img where soufun_id='"
+ soufun_id
+ "' and img_small_url='"
+ img_small_url + "'";
resultSet = statement.executeQuery(query);
if (resultSet.next()) {
if (resultSet.getString(1).trim()
.equals("0")) {
query = "insert into uchome_z_soufun_newhouse_img(soufun_id,xiangce_id,xiangce_name,img_title,img_small_url,target_img_small_path,is_face_img,img_big_url,target_img_big_path,house_style_type,house_style_name,house_sytle_area,create_time) values('"
+ soufun_id
+ "','"
+ "900"
+ "','"
+ "户型图"
+ "','"
+ img_title
+ "','"
+ img_small_url
+ "','"
+ img_small_url
.substring(img_small_url
.indexOf(".com/") + 4)
+ "','"
+ (img_face_url
.trim()
.equals(
img_small_url
.trim())
&& find_face_img ? 1
: 0)
+ "','"
+ img_big_url
+ "','"
+ img_big_url
.substring(img_big_url
.indexOf(".com/") + 4)
+ "','"
+ house_style_type
+ "','"
+ house_style_name
+ "','"
+ house_area
+ "',now())";
statement.execute(query);
}
}
} else {
if (temp_i == 1 && m == 1) {
node_a = n_list.elementAt(n)
.getFirstChild();
img_small_url = ((org.htmlparser.tags.ImageTag) node_a)
.getImageURL();
img_title = ((org.htmlparser.tags.ImageTag) node_a)
.getAttribute("title");
img_face_url = img_small_url;
find_face_img = true;
}
}
}
}
}
}
} else {
server = new URL(new_house_link + "/photo/list_90" + k + "_"
+ soufun_id + ".htm");
connection = (HttpURLConnection) server.openConnection();
try {
parser = new Parser(connection);
} catch (Exception e) {
System.out.println(e.getMessage());
continue;
}
attrfilter = new HasAttributeFilter("class", "link_00");
n_list = parser.extractAllNodesThatMatch(attrfilter);
if (n_list.size() > 0) {
img_page = (int) Math.ceil(Double.parseDouble(n_list
.elementAt(0).toPlainTextString().trim()) / 12);
}
for (int j = 1; j <= img_page; j++) {
find_face_img = false;
server = new URL(new_house_link + "/photo/list_90" + k
+ "_" + soufun_id + "_" + j + ".htm");
connection = (HttpURLConnection) server.openConnection();
try {
parser = new Parser(connection);
} catch (Exception e) {
System.out.println(e.getMessage());
continue;
}
attrfilter = new HasAttributeFilter("class", "container");
n_list = parser.extractAllNodesThatMatch(attrfilter);
for (int i = 0; i < n_list.size(); i++) {
if (n_list.elementAt(i).getFirstChild().getClass()
.getName().equals(
"org.htmlparser.nodes.TextNode")) {
node_a = n_list.elementAt(i).getFirstChild()
.getNextSibling().getFirstChild()
.getNextSibling();
img_small_url = ((org.htmlparser.tags.ImageTag) node_a)
.getImageURL();
img_big_url = img_small_url.replaceAll("_s", "");
img_title = ((org.htmlparser.tags.ImageTag) node_a)
.getAttribute("title");
if (img_face_url.trim()
.equals(img_small_url.trim())
&& find_face_img)
System.out.print("此图像是:相册封面图");
if (k == 1)
xiangce_name = "交通图";
else if (k == 2)
xiangce_name = "外景图";
else if (k == 3)
xiangce_name = "实景图";
else if (k == 4)
xiangce_name = "效果图";
else if (k == 5)
xiangce_name = "样板间";
else if (k == 6)
xiangce_name = "规划图";
else if (k == 7)
xiangce_name = "周边配套图";
System.out.println("[" + k + "]:" + img_title + ":"
+ img_small_url);
query = "select count(id) from uchome_z_soufun_newhouse_img where soufun_id='"
+ soufun_id
+ "' and img_small_url='"
+ img_small_url + "'";
resultSet = statement.executeQuery(query);
if (resultSet.next()) {
if (resultSet.getString(1).trim().equals("0")) {
query = "insert into uchome_z_soufun_newhouse_img(soufun_id,xiangce_id,xiangce_name,img_title,img_small_url,target_img_small_path,is_face_img,img_big_url,target_img_big_path,house_style_type,house_style_name,house_sytle_area,create_time) values('"
+ soufun_id
+ "','"
+ "90"
+ k
+ "','"
+ xiangce_name
+ "','"
+ img_title
+ "','"
+ img_small_url
+ "','"
+ img_small_url
.substring(img_small_url
.indexOf(".com/") + 4)
+ "','"
+ (img_face_url.trim().equals(
img_small_url.trim())
&& find_face_img ? 1 : 0)
+ "','"
+ img_big_url
+ "','"
+ img_big_url.substring(img_big_url
.indexOf(".com/") + 4)
+ "','"
+ ""
+ "','"
+ ""
+ "','"
+ "" + "',now())";
statement.execute(query);
}
}
} else {
if (j == 1) {
node_a = n_list.elementAt(i).getFirstChild();
img_small_url = ((org.htmlparser.tags.ImageTag) node_a)
.getImageURL();
img_title = ((org.htmlparser.tags.ImageTag) node_a)
.getAttribute("title");
img_face_url = img_small_url;
find_face_img = true;
}
}
}
}
}
}
}

/**
 * Strips line breaks, blank characters and ASCII colons from scraped text.
 *
 * <p>Removed tokens: LF, CR, the plain space, the no-break space (U+00A0), the
 * ideographic space (U+3000), the literal HTML entity {@code &nbsp;} and {@code ':'}.
 * Matching is literal; no regular expressions are involved.
 *
 * <p>NOTE(review): the previous implementation removed the plain space three times in a
 * row. The repeated calls are presumed to be damaged copies of other blank tokens, so
 * NBSP, the ideographic space and the {@code &nbsp;} entity are stripped as well;
 * confirm against real page text.
 *
 * @param str text to clean; may be {@code null}
 * @return the cleaned text, or an empty string when {@code str} is {@code null}
 */
public static String replaceString(String str) {
    if (str == null) {
        return "";
    }
    // The entity goes first so that it is removed as a unit.
    return str.replace("&nbsp;", "")
            .replace("\n", "")
            .replace("\r", "")
            .replace(" ", "")
            .replace("\u00A0", "")
            .replace("\u3000", "")
            .replace(":", "");
}
}


package soufun;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Calendar;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.NotFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Scrapes the price-history table of one new-house listing and mirrors it into the
 * {@code uchome_z_soufun_newhouse_pricetrend} MySQL table.
 *
 * <p>The page {@code <new_house_link>/house/<soufun_id>/fangjia.htm} is fetched, the element
 * with {@code id="priceListOpen"} is located, and every table row after the first one is
 * inserted, or updated when a record with the same {@code soufun_id} and
 * {@code record_time} already exists.
 */
public class NewHousePriceTrend {
    // NOTE(review): connection settings, including the password, are hard-coded here
    // exactly as before; they should come from configuration.
    private static final String MYSQL_URL =
            "jdbc:mysql://localhost:3306/abcdefg?characterEncoding=UTF-8";
    private static final String MYSQL_USERNAME = "root";
    private static final String MYSQL_PASSWORD = "soufunroot";

    private static final String COUNT_SQL =
            "select count(id) from uchome_z_soufun_newhouse_pricetrend"
                    + " where soufun_id=? and record_time=?";
    private static final String INSERT_SQL =
            "insert into uchome_z_soufun_newhouse_pricetrend"
                    + "(soufun_id,record_time,highest_price,average_price,lowest_price,"
                    + "price_describe,update_time) values(?,?,?,?,?,?,now())";
    private static final String UPDATE_SQL =
            "update uchome_z_soufun_newhouse_pricetrend"
                    + " set highest_price=?,average_price=?,lowest_price=?,price_describe=?,"
                    + "update_time=now() where soufun_id=? and record_time=?";

    /** Cells read per row, in order: time, highest, average, lowest price, description. */
    private static final int CELLS_PER_ROW = 5;

    /** Entry point; intentionally does nothing. */
    public static void main(String[] args) throws ParserException, IOException,
            SQLException {

    }

    /**
     * Downloads the price-history page of a listing and stores its rows in MySQL.
     *
     * <p>Scraped text is bound through prepared statements instead of being concatenated
     * into the SQL text, and the database connection, statements and result sets are
     * closed before the method returns. The process still terminates with exit code 1
     * when the JDBC driver cannot be loaded or the database cannot be reached.
     *
     * @param new_house_link base URL of the listing site; {@code "/house/"} is appended
     * @param soufun_id listing identifier, used in the URL and as the record key
     * @throws ParserException if the page cannot be opened or parsed
     * @throws IOException if the page URL is malformed or the connection cannot be opened
     * @throws SQLException if a query, insert or update fails
     */
    public static void getNewHousePriceTrend(String new_house_link,
            String soufun_id) throws ParserException, IOException, SQLException {
        try (Connection db = openConnection()) {
            System.out.println("连接数据库成功");
            String[][] rows = fetchPriceRows(new_house_link, soufun_id);
            if (rows.length == 0) {
                return;
            }
            try (PreparedStatement count = db.prepareStatement(COUNT_SQL);
                    PreparedStatement insert = db.prepareStatement(INSERT_SQL);
                    PreparedStatement update = db.prepareStatement(UPDATE_SQL)) {
                for (String[] row : rows) {
                    if (row == null) {
                        // Row without any cell: there is no record time to key on.
                        continue;
                    }
                    printRow(row);
                    saveRow(count, insert, update, soufun_id, row);
                }
            }
        }
    }

    /** Loads the MySQL driver and opens the connection; exits the JVM on failure. */
    private static Connection openConnection() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            return DriverManager.getConnection(MYSQL_URL, MYSQL_USERNAME, MYSQL_PASSWORD);
        } catch (ClassNotFoundException cnfex) {
            // Driver class missing from the classpath.
            System.err.println("装载 JDBC/ODBC 驱动程序失败。");
            cnfex.printStackTrace();
            System.exit(1); // terminate program
        } catch (SQLException sqlex) {
            // Database unreachable or credentials rejected.
            System.err.println("无法连接数据库");
            sqlex.printStackTrace();
            System.exit(1); // terminate program
        }
        throw new IllegalStateException("unreachable: System.exit(1) was requested");
    }

    /**
     * Fetches the price page and returns the cleaned cell texts of its data rows.
     *
     * <p>All text is copied out of the parsed nodes before the HTTP connection is
     * released. Each returned row holds exactly {@link #CELLS_PER_ROW} strings; cells
     * missing from the page become empty strings rather than inheriting the previous
     * row's values. A row without cells is returned as {@code null}.
     *
     * @return the data rows, or an empty array when the price table is not on the page
     */
    private static String[][] fetchPriceRows(String newHouseLink, String soufunId)
            throws ParserException, IOException {
        URL server = new URL(newHouseLink + "/house/" + soufunId + "/fangjia.htm");
        HttpURLConnection connection = (HttpURLConnection) server.openConnection();
        try {
            Parser parser = new Parser(connection);
            parser.setEncoding("GB2312");
            NodeList matches = parser.extractAllNodesThatMatch(
                    new HasAttributeFilter("id", "priceListOpen"));
            if (matches == null || matches.size() == 0) {
                return new String[0][];
            }
            // The table is expected to be the second child of the matched element.
            Node firstChild = matches.elementAt(0).getFirstChild();
            Node candidate = firstChild == null ? null : firstChild.getNextSibling();
            if (!(candidate instanceof TableTag)) {
                System.err.println("未找到价格走势表格: " + server);
                return new String[0][];
            }
            return extractRows((TableTag) candidate);
        } finally {
            connection.disconnect();
        }
    }

    /** Converts every table row after the first into an array of cleaned cell texts. */
    private static String[][] extractRows(TableTag table) {
        TableRow[] rows = table.getRows();
        // Row 0 is skipped, as before; presumably it is the header row.
        String[][] result = new String[Math.max(0, rows.length - 1)][];
        for (int j = 1; j < rows.length; j++) {
            TableColumn[] cells = rows[j].getColumns();
            if (cells.length == 0) {
                continue; // leaves result[j - 1] as null
            }
            String[] values = new String[CELLS_PER_ROW];
            for (int k = 0; k < CELLS_PER_ROW; k++) {
                values[k] = k < cells.length
                        ? replaceString(cells[k].toPlainTextString())
                        : "";
            }
            result[j - 1] = values;
        }
        return result;
    }

    /** Echoes one scraped row to standard output on a single line. */
    private static void printRow(String[] row) {
        System.out.print("记录时间" + row[0]);
        System.out.print("最高价" + row[1]);
        System.out.print("均价" + row[2]);
        System.out.print("最低价" + row[3]);
        System.out.println("价格描述" + row[4]);
    }

    /** Inserts the row, or updates the record with the same listing id and time. */
    private static void saveRow(PreparedStatement count, PreparedStatement insert,
            PreparedStatement update, String soufunId, String[] row) throws SQLException {
        count.setString(1, soufunId);
        count.setString(2, row[0]);
        boolean exists;
        try (ResultSet found = count.executeQuery()) {
            if (!found.next()) {
                return;
            }
            exists = found.getInt(1) != 0;
        }
        if (exists) {
            update.setString(1, row[1]);
            update.setString(2, row[2]);
            update.setString(3, row[3]);
            update.setString(4, row[4]);
            update.setString(5, soufunId);
            update.setString(6, row[0]);
            update.executeUpdate();
        } else {
            insert.setString(1, soufunId);
            insert.setString(2, row[0]);
            insert.setString(3, row[1]);
            insert.setString(4, row[2]);
            insert.setString(5, row[3]);
            insert.setString(6, row[4]);
            insert.executeUpdate();
        }
    }

    /**
     * Strips line breaks, blank characters and ASCII colons from scraped text.
     *
     * <p>Removed tokens: LF, CR, the plain space, the no-break space (U+00A0), the
     * ideographic space (U+3000), the literal HTML entity {@code &nbsp;} and {@code ':'}.
     * Matching is literal; no regular expressions are involved.
     *
     * <p>NOTE(review): the previous implementation removed the plain space three times in
     * a row. The repeated calls are presumed to be damaged copies of other blank tokens,
     * so NBSP, the ideographic space and the {@code &nbsp;} entity are stripped as well;
     * confirm against real page text.
     *
     * @param str text to clean; may be {@code null}
     * @return the cleaned text, or an empty string when {@code str} is {@code null}
     */
    public static String replaceString(String str) {
        if (str == null) {
            return "";
        }
        // The entity goes first so that it is removed as a unit.
        return str.replace("&nbsp;", "")
                .replace("\n", "")
                .replace("\r", "")
                .replace(" ", "")
                .replace("\u00A0", "")
                .replace("\u3000", "")
                .replace(":", "");
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值