前言
项目需要获取7日天气预报,但找不到免费好用的接口。搜索一番后发现用简单的爬虫就可以实现;在完成Python版本后,想试试能否用Java来实现,一番学习后完成了需求,结果如下
工具准备
在Maven项目中添加相应依赖,这里使用httpclient+jsoup的组合来完成
httpclient用来发送请求,而jsoup用来解析请求结果
两者的详细介绍参考文末引文
<!-- Apache HttpClient: used to send the HTTP request that downloads the page -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<!-- jsoup: used to parse the downloaded HTML and select elements from it -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
爬取数据
首先看到中国天气网的界面如下(随着时间段的不同,界面可能显示不同)
打开控制台看到目标url,url最后的数字标识地区代码,这是我们爬虫的入口,请求头中的User-Agent属性标识自己使用的浏览器。
新建HtmlUtil类用来发送请求,后续爬取数据时都先通过此类获取页面的HTML内容
package com.ljp.springandpython.utils;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Helper for downloading a web page as text.
 */
public class HtmlUtil {
    /**
     * Sends an HTTP GET to {@code url} and returns the response body as a string.
     *
     * <p>The body is decoded with the charset declared by the response; UTF-8 is used
     * when the response declares none. This method is best-effort: it never throws for
     * a bad URL or a failed request, it returns an empty string instead.
     *
     * @param url absolute URL of the page to download
     * @return the response body, or an empty string when {@code url} is null, blank or
     *         malformed, when the request fails, or when the response has no body
     */
    public static String getResult(String url) {
        if (url == null || url.trim().isEmpty()) {
            return "";
        }
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build();
             CloseableHttpResponse response = httpClient.execute(new HttpGetConfig(url))) {
            // A response without a body has a null entity, which EntityUtils rejects.
            if (response.getEntity() == null) {
                return "";
            }
            // UTF-8 is only the fallback charset, used to avoid garbled text.
            return EntityUtils.toString(response.getEntity(), java.nio.charset.StandardCharsets.UTF_8);
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException covers a URL that cannot be parsed into a URI.
            java.util.logging.Logger.getLogger(HtmlUtil.class.getName())
                    .log(java.util.logging.Level.WARNING, "Failed to fetch " + url, e);
            return "";
        }
    }
}
/**
 * An {@link HttpGet} that is created with this project's default request settings:
 * 10-second timeouts and a desktop Firefox {@code User-Agent} header.
 */
class HttpGetConfig extends HttpGet {
    /** Timeout in milliseconds, shared by all three timeout settings. */
    private static final int TIMEOUT_MILLIS = 1000 * 10;

    /** Browser identification sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0";

    /**
     * Creates a GET request for {@code url} and applies the default settings.
     *
     * @param url the URL to request
     */
    public HttpGetConfig(String url) {
        super(url);
        RequestConfig timeouts = RequestConfig.custom()
                .setConnectionRequestTimeout(TIMEOUT_MILLIS)
                .setConnectTimeout(TIMEOUT_MILLIS)
                .setSocketTimeout(TIMEOUT_MILLIS)
                .build();
        setConfig(timeouts);
        setHeader("User-Agent", USER_AGENT);
    }
}
获取日期和星期
在界面中审查元素,找到“4日(今天)”元素,发现它是<h1>标签;另外可以发现,页面中最先出现的h1标签正是这些日期标题,因此可以获取全部的h1标签,只取前7个,再从中提取出日期和星期
jsoup中的select方法可以方便的提取页面中的元素,更多用法参照 jsoup中select的用法
// Download the forecast page and parse it into a DOM tree.
String result=HtmlUtil.getResult("http://www.weather.com.cn/weather/101110200.shtml");
Document document= Jsoup.parse(result);
Elements elements;
// Date and day label: both come from the first <h1> headings on the page.
elements=document.select("h1");
List<String> dateList=new ArrayList<>();
List<String> dayList=new ArrayList<>();
// Read at most 7 headings. If the page has fewer (for example the download failed and
// result is empty) the loop simply ends early instead of throwing.
int dayCount=Math.min(7,elements.size());
for (int i = 0; i < dayCount; i++) {
    String text=elements.get(i).text();
    int length=text.length();
    if (length<4) {
        // Too short to have the "<date>(<day>)" shape; keep the raw text rather than crash.
        dateList.add(text);
        dayList.add("");
        continue;
    }
    // A heading looks like "4日(今天)": the date is everything before the opening
    // bracket, and the two-character day label sits between the brackets.
    dateList.add(text.substring(0,length-4));
    dayList.add(text.substring(length-3,length-1));
}
System.out.println(dateList);
System.out.println(dayList);
获取天气描述
获取天气较为简单,发现天气位于p标签内,class属性为“wea”,接上文代码
// Weather description: the text of every <p> whose class attribute is exactly "wea".
elements=document.select("p[class=wea]");
List<String> weatherList=new ArrayList<>();
for (int idx = 0; idx < elements.size(); idx++) {
    Element weatherCell=elements.get(idx);
    String description=weatherCell.text();
    weatherList.add(description);
}
System.out.println(weatherList);
获取温度范围
温度范围类似,不再赘述
// Temperatures: inside each <p class="tem">, the high is read from <span> and the low from <i>.
elements=document.select("p[class=tem]");
List<String> highTempList=new ArrayList<>();
List<String> lowTempList=new ArrayList<>();
for (Element item : elements) {
    String high=item.select("span").text();
    // The unit is appended to the <span> value here. When the page shows no high
    // temperature, store an empty string rather than a bare "℃".
    highTempList.add(high.isEmpty() ? "" : high+"℃");
    lowTempList.add(item.select("i").text());
}
System.out.println(highTempList);
System.out.println(lowTempList);
获取风向及风力
风向和风力有多个子标签嵌套,仔细分析结构
// Wind direction and wind force, read from every <p class="win">.
elements=document.select("p[class=win]");
List<String> windDirectionList1=new ArrayList<>();
List<String> windDirectionList2=new ArrayList<>();
List<String> windSpeedList=new ArrayList<>();
for (Element item : elements) {
    // The direction <span>s sit inside the <em> tag; the direction itself is stored in
    // each span's title attribute, which attr() reads.
    Elements spans=item.select("em span");
    // A cell may carry fewer than two direction spans; use "" for a missing one
    // instead of failing with IndexOutOfBoundsException.
    windDirectionList1.add(spans.size()>0 ? spans.get(0).attr("title") : "");
    windDirectionList2.add(spans.size()>1 ? spans.get(1).attr("title") : "");
    windSpeedList.add(item.select("i").text());
}
System.out.println(windDirectionList1);
System.out.println(windDirectionList2);
System.out.println(windSpeedList);
完整代码
package com.ljp.springandpython.utils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.boot.test.context.SpringBootTest;
import java.util.*;
/**
 * Manual smoke test: downloads the weather.com.cn 7-day forecast page, extracts the
 * date, day label, weather, temperatures and wind for each day, and prints them.
 * It makes a live network request and contains no assertions.
 */
@SpringBootTest
class HtmlUtilTest {
    @Test
    void test1(){
        String result=HtmlUtil.getResult("http://www.weather.com.cn/weather/101110200.shtml");
        Document document= Jsoup.parse(result);
        Elements elements;

        // Date and day label: both come from the first <h1> headings on the page.
        elements=document.select("h1");
        List<String> dateList=new ArrayList<>();
        List<String> dayList=new ArrayList<>();
        // Read at most 7 headings; a page with fewer (e.g. an empty result after a
        // failed download) ends the loop early instead of throwing.
        int headingCount=Math.min(7,elements.size());
        for (int i = 0; i < headingCount; i++) {
            String text=elements.get(i).text();
            int length=text.length();
            if (length<4) {
                // Too short to have the "<date>(<day>)" shape; keep the raw text.
                dateList.add(text);
                dayList.add("");
                continue;
            }
            // The date is everything before the opening bracket; the two-character
            // day label sits between the brackets.
            dateList.add(text.substring(0,length-4));
            dayList.add(text.substring(length-3,length-1));
        }
        System.out.println(dateList);
        System.out.println(dayList);

        // Weather description: the text of every <p> whose class is exactly "wea".
        elements=document.select("p[class=wea]");
        List<String> weatherList=new ArrayList<>();
        for (Element item : elements) {
            weatherList.add(item.text());
        }
        System.out.println(weatherList);

        // Temperatures: the high is read from <span>, the low from <i>.
        elements=document.select("p[class=tem]");
        List<String> highTempList=new ArrayList<>();
        List<String> lowTempList=new ArrayList<>();
        for (Element item : elements) {
            String high=item.select("span").text();
            // The unit is appended here; with no high on the page, store "" rather
            // than a bare "℃".
            highTempList.add(high.isEmpty() ? "" : high+"℃");
            lowTempList.add(item.select("i").text());
        }
        System.out.println(highTempList);
        System.out.println(lowTempList);

        // Wind direction and wind force.
        elements=document.select("p[class=win]");
        List<String> windDirectionList1=new ArrayList<>();
        List<String> windDirectionList2=new ArrayList<>();
        List<String> windSpeedList=new ArrayList<>();
        for (Element item : elements) {
            // The direction <span>s sit inside the <em> tag; the direction itself is
            // stored in each span's title attribute.
            Elements spans=item.select("em span");
            // Use "" for a missing span instead of failing with IndexOutOfBoundsException.
            windDirectionList1.add(spans.size()>0 ? spans.get(0).attr("title") : "");
            windDirectionList2.add(spans.size()>1 ? spans.get(1).attr("title") : "");
            windSpeedList.add(item.select("i").text());
        }
        System.out.println(windDirectionList1);
        System.out.println(windDirectionList2);
        System.out.println(windSpeedList);

        // Assemble one row per day. The row count is capped by the shortest column so
        // a partially parsed page cannot cause an out-of-range read.
        int rowCount=7;
        for (List<String> column : Arrays.asList(dateList,dayList,weatherList,highTempList,
                lowTempList,windDirectionList1,windDirectionList2,windSpeedList)) {
            rowCount=Math.min(rowCount,column.size());
        }
        List<Map<String,String>> list=new ArrayList<>();
        for (int j = 0; j < rowCount; j++) {
            // LinkedHashMap keeps the fields in insertion order when printed.
            Map<String,String> map=new LinkedHashMap<>();
            map.put("date",dateList.get(j));
            map.put("day",dayList.get(j));
            map.put("weather",weatherList.get(j));
            map.put("highTemp",highTempList.get(j));
            map.put("lowTemp",lowTempList.get(j));
            map.put("windDirection1",windDirectionList1.get(j));
            map.put("windDirection2",windDirectionList2.get(j));
            map.put("windSpeed",windSpeedList.get(j));
            list.add(map);
        }
        list.forEach(System.out::println);
    }
}
运行结果: