Java使用xpath获取58同城数据
package common;
/**
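* Fetches a rental-listing page (Ganji / 58.com) and analyzes it: extracts the title, description and image URLs with XPath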
* sunwengang 2017-08-13 20:00
*/
import cn.wanghaomiao.xpath.model.JXDocument;
import com.jimi.house.common.utils.CheckString;
import com.jimi.house.modules.apartment.entity.ShareHouse;
import org.apache.log4j.Logger;
import java.io.*;
import java.net.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo crawler: downloads a single rental-listing page over HTTP and uses XPath
 * (via {@code JXDocument}) to pull out the listing title, description, image URLs
 * and lease mode, logging each intermediate result.
 */
public class URLDemo {
    private static final Logger logger = Logger.getLogger(URLDemo.class);

    /** Maximum number of image URLs kept from one listing. */
    private static final int MAX_IMAGES = 6;

    /**
     * Entry point: fetches the hard-coded listing URL, parses it and logs the extracted fields.
     * Any failure (network, XPath syntax, unexpected node type) is logged rather than propagated.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Address of the listing page to crawl.
        // The query string originally contained "¶ms=", which is "&params=" after the
        // "&para" prefix was mis-decoded as an HTML entity; the ampersand form is restored here.
        String strurl = "https://3g.ganji.com/sz_zufang/36880262146970x.shtml?gjcity=sz&cookie=|||4160337230276824358417&apptype=12&fzbref=0&key=&pubid=58168182&params=rankjxzfbestm2099^desc&trackkey=36880262146970_86156089-e0f5-4d83-b2f1-56ff15f0f641_20190126092658_1548466018254&fcinfotype=gz&jingxuan=1";
        try {
            String html = fetchHtml(strurl);
            // The demo logs everything at ERROR level so the output is visible under any log4j config.
            logger.error("【解析源url】:" + strurl);
            logger.error("【日志信息】" + html);

            // Build the XPath document.
            // Example of a more complex expression supported by the library:
            // //div[@id='post_list']/div[./div/div/span[@class='article_view']/a/num()>1000]/div/h3/allText()
            JXDocument jxDocument = new JXDocument(html);
            if (!strurl.contains("m.58.com")) {
                // Title: all text under the left part of the listing header.
                String title = "";
                String xpath = "//div[@class='house-header cont-padding']/div[@class='house-header-left']/allText()";
                List<Object> rs = jxDocument.sel(xpath);
                logger.error("【原始标题】:" + rs);
                if (CheckString.isNotEmpty(rs)) {
                    title = (String) rs.get(0);
                    title = title.replaceAll("小区:", "");
                    logger.error("【处理之后的title】:" + title);
                }

                // Description: text of the paragraphs in the "configure" section,
                // with the boilerplate contact sentence stripped.
                String remark = "";
                String xpath3 = "//div[@class='configure']/p/allText()";
                List<Object> titleList = jxDocument.sel(xpath3);
                logger.error("【原始描述】:" + titleList);
                if (CheckString.isNotEmpty(titleList)) {
                    remark = ((String) titleList.get(0)).replaceAll("联系我时,请说是在58同城上看到的,谢谢", "");
                }

                // Images: src attribute of every slide image, capped at MAX_IMAGES.
                String xpath4 = "//div[@class='swiper-slide']/img/@src";
                List<Object> rs4 = jxDocument.sel(xpath4);
                // Guard the size() call: the emptiness check below implies the list may be null.
                logger.error("【原始图片】:" + (rs4 == null ? 0 : rs4.size()) + ":具体值:" + rs4);
                List<String> imgUrls = new ArrayList<>();
                if (CheckString.isNotEmpty(rs4)) {
                    int limit = Math.min(rs4.size(), MAX_IMAGES);
                    for (int i = 0; i < limit; i++) {
                        imgUrls.add((String) rs4.get(i));
                    }
                    logger.error("【imgUrls的长度是】:" + imgUrls.size());
                }

                // Lease mode is derived from the cleaned title by the project helper.
                String leaseMode = "";
                logger.error("【房子模式】:" + CheckString.getLaseMode("村"));
                if (CheckString.isNotEmpty(title)) {
                    leaseMode = CheckString.getLaseMode(title);
                    logger.error("【处理之后的leaseMode】:" + leaseMode);
                }
            }
        } catch (Exception e) {
            // Top-level boundary of the demo: report through the logger, keeping the stack trace.
            logger.error("Failed to fetch or parse " + strurl, e);
        }
    }

    /**
     * Downloads the page at {@code pageUrl} and returns its body decoded as UTF-8.
     * The reader is closed and the connection disconnected even when reading fails.
     *
     * @param pageUrl absolute HTTP(S) URL of the page
     * @return the response body, with lines separated by {@code '\n'}
     * @throws IOException if the URL is malformed or the request/read fails
     */
    private static String fetchHtml(String pageUrl) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(pageUrl).openConnection();
        try {
            conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
            conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.6; Windows NT)");
            conn.setDoInput(true);
            conn.setInstanceFollowRedirects(true);
            try (BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
                StringBuilder builder = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    // Keep the line break: dropping it would fuse tokens that the markup
                    // splits across lines (e.g. a tag name and its first attribute).
                    builder.append(line).append('\n');
                }
                return builder.toString();
            }
        } finally {
            conn.disconnect();
        }
    }
}
☛注意:使用allText()可以获取该节点下所有的纯文本(包含子节点中的文本)
☛注意:// 表示从任意层级匹配(选取所有符合条件的后代节点),/ 表示只匹配下一层的直接子节点;// 可以在任意位置使用,/ 只能一层一层往下写
String xpath = "//div[@class='house-header cont-padding']/div[@class='house-header-left']/allText()";