# Java 做网页数据爬取
首先数据爬取,会有出现很多种可能,比如需要爬取的网页设置了防爬取,或者做数据权限验证,需要用户登录
这里我简单讲下,基本的数据爬虫怎么分析,需要提前准备做哪些工作
1:需要爬取的网页地址 :http://219.233.250.38/aqi/Home/Index
2:需要爬取网页的数据块
3:分析网页数据结构(结构存在变化,需要做兼容适配)
4:获取相应节点下的数据
首先我以上海空气质量实时发布系统为题
进入页面我们看页面结构,还有所需要的数据部分
本次数据爬取需要获取的内容是:1:实时空气质量;2:污染物实时浓度变化;3:空气质量预报
爬取中遇到的问题
1:污染物实时浓度变化,页面直接获取不到数据,需要单独调用接口:http://219.233.250.38/aqi/Home/PollutantRealtimeChanging
2:调用时对方设置了拦截规则
3:页面结构变化,导致有时数据无法爬取
下面看下代码情况
/**
 * Scrapes the Shanghai real-time air-quality publishing site
 * (http://219.233.250.38/aqi/Home/Index): publish time, current AQI,
 * primary pollutant, quality label, 24h AQI and the forecast table.
 * Pollutant concentration changes are not present in the page HTML and
 * are fetched from a separate JSON endpoint via {@link #doPostJson}.
 * Sample code: extracted values are not yet persisted anywhere.
 */
public void shangHaiAirQualityForecast () {
    // Page to scrape, plus the JSON endpoint for pollutant realtime changes.
    String indexUrl = "http://219.233.250.38/aqi/Home/Index";
    String pollutantUrl = "http://219.233.250.38/aqi/Home/PollutantRealtimeChanging";
    try {
        HttpURLConnection connection = (HttpURLConnection) new URL (indexUrl).openConnection ();
        connection.connect ();
        // Read the whole response into one string. StringBuilder avoids the
        // O(n^2) `total += line` concatenation of the original, and
        // try-with-resources guarantees the stream is closed even on failure
        // (the original only closed the reader on the success path).
        StringBuilder page = new StringBuilder ();
        try (BufferedReader reader = new BufferedReader (
                new InputStreamReader (connection.getInputStream (), "UTF-8"))) {
            String line;
            while ((line = reader.readLine ()) != null) {
                page.append (line);
            }
        }
        Document doc = Jsoup.parse (page.toString ());
        Elements rows = doc.getElementsByClass ("row clearfix ");
        // The page layout occasionally changes: the block we need is the 2nd
        // "row clearfix " element when there are 3 of them, the 3rd otherwise.
        Element block = (rows.size () == 3) ? rows.get (1) : rows.get (2);
        Document doc1 = Jsoup.parse (String.valueOf (block));
        // Publish time / AQI / primary pollutant / quality label are cut out of
        // the raw element HTML with the string-splitting helpers below.
        Elements columnElements = doc1.getElementsByClass ("col-md-12 column");
        String timePoint = stringReplace (spanDivSplit (String.valueOf (columnElements.get (0))));
        String aqi = pSplit (String.valueOf (doc1.getElementById ("cr-aqicolor01")));
        String primaryPollutant = subSplit (String.valueOf (doc1.getElementById ("cr-ContaminantsData")));
        String qualityLabel = pSplit2 (String.valueOf (doc1.getElementById ("qulaitylabel01")));
        Elements aqiElements = doc1.getElementsByClass ("col-md-8 col-xs-9 column");
        String aqi24Hours = pSplit (String.valueOf (aqiElements.get (1)));
        // Pollutant concentration changes cannot be scraped from the page
        // itself; they come from a dedicated JSON endpoint instead.
        String resultString = doPostJson (pollutantUrl, "{}");
        JsonArray jarray = (JsonArray) new JsonParser ().parse (resultString);
        for (int i = 0; i < jarray.size (); i++) {
            JsonObject item = jarray.get (i).getAsJsonObject ();
            String itemid = item.get ("itemid").getAsString ();
            Float value = item.get ("value").getAsFloat ();
            // TODO: persist itemid/value — this sample only extracts them.
        }
        // Forecast publish time and the forecast table rows.
        String timePoint1 = stringReplace (pSplit1 (String.valueOf (columnElements.get (6))));
        String[] forecastRows = trSplit (String.valueOf (doc1.getElementsByClass ("table").get (1)));
    } catch (Exception e) {
        // The original called `e.getMessage()` and discarded the result,
        // silently swallowing every failure. At least report it.
        e.printStackTrace ();
    }
}
/**
 * Sends an HTTP POST request with a JSON body and returns the response body.
 *
 * @param url  target URL
 * @param json JSON payload sent as the request entity
 * @return the response body decoded as UTF-8, or "" if the request failed
 */
public static String doPostJson(String url, String json) {
    String resultString = "";
    // try-with-resources closes both client and response on every path.
    // The original leaked the HttpClient and called response.close() in a
    // finally block without a null check — an NPE whenever execute() threw.
    try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
        HttpPost httpPost = new HttpPost(url);
        //httpPost.setHeader("", "");
        httpPost.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
        try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
            resultString = EntityUtils.toString(response.getEntity(), "utf-8");
        }
    } catch (Exception e) {
        // Best-effort by design: callers treat "" as "no data".
        e.printStackTrace();
    }
    return resultString;
}
/**
 * Normalizes a Chinese publish-time fragment such as "1月2日3时发布"
 * (month/day/hour, no year) into "yyyy-M-d H:00:00", prefixing the current
 * calendar year because the page omits it.
 *
 * @param string time fragment scraped from the page
 * @return normalized timestamp string, e.g. "2024-1-2 3:00:00"
 */
public static String stringReplace (String string) {
    // Current year in the system default zone. The original formatted a
    // java.util.Date with SimpleDateFormat and split on "-" just to get
    // this value; java.time.Year is direct and thread-safe.
    int year = java.time.Year.now ().getValue ();
    string = string.replace ("月", "-");
    string = string.replace ("日", " ");
    // "X时发布" ("published at X o'clock") and a bare "X时" both map to "X:00:00".
    if (string.contains ("时发布")) {
        string = string.replace ("时发布", ":00:00");
    } else {
        string = string.replace ("时", ":00:00");
    }
    return year + "-" + string.trim ();
}
/**
 * Extracts the text sitting between the fixed
 * {@code <span style="font-size: 14px">} open tag and the first
 * {@code </span>} close tag of the given element HTML.
 */
public static String spanDivSplit (String input) {
    String beforeClose = input.split ("</span>")[0];
    String[] pieces = beforeClose.split ("<span style=\"font-size: 14px\">");
    return pieces[1];
}
/**
 * Extracts the text after the second {@code >} and before the first
 * {@code </p>} — i.e. the inner text of a {@code <p>} nested one level
 * inside another tag.
 */
public static String pSplit (String input) {
    String beforePClose = input.split ("</p>")[0];
    String[] pieces = beforePClose.split (">");
    return pieces[2];
}
/**
 * Splits table HTML into row fragments: keeps everything before the first
 * {@code </tbody>} and breaks it apart on {@code </tr>}.
 */
public static String[] trSplit (String input) {
    String tableBody = input.split("</tbody>")[0];
    return tableBody.split ("</tr>");
}
/**
 * Extracts a pollutant name that may contain a {@code <sub>} subscript
 * (e.g. {@code O<sub>3</sub>} becomes "O3"). Falls back to a plain
 * {@code <p>}-style extraction when no subscript is present.
 */
public static String subSplit (String input) {
    String beforeP = input.split ("</p>")[0];
    if (beforeP.contains ("</sub>")) {
        // Pull the subscript text out of <sub>…</sub>, then whatever text
        // precedes the <sub> tag (after the last > if there is one).
        String beforeSubClose = input.split ("</sub>")[0];
        String[] parts = beforeSubClose.split ("<sub>");
        String subscript = parts[1];
        String[] head = parts[0].split (">");
        String prefix = (head.length > 1) ? head[1] : head[0];
        return prefix + subscript;
    }
    return beforeP.split (">")[1];
}
/**
 * Extracts the inner text of the SECOND {@code <p>} element in the given
 * HTML fragment. (Same shape as pSplit2 but targets chunk index 1 —
 * candidates for merging into one parameterized helper.)
 */
public static String pSplit1 (String input) {
    String secondChunk = input.split ("</p>")[1];
    return secondChunk.split (">")[1];
}
/**
 * Extracts the inner text of the FIRST {@code <p>} element in the given
 * HTML fragment (text after the first {@code >}, before {@code </p>}).
 */
public static String pSplit2 (String input) {
    String firstChunk = input.split ("</p>")[0];
    return firstChunk.split (">")[1];
}
以上就是获取数据的代码,之后将需要的数据入库即可
下次分享需要登录验证的网页数据爬取