废话先不多说,看下代码:
使用的依赖(JSON 工具库 fastjson 的 jar,以及 HttpClient 的 jar):
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
</dependency>
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import com.alibaba.fastjson.JSONArray;
/**
 * Scrapes China's epidemic data from a web page using Apache HttpClient.
 *
 * @author Yuer
 */
public class HttpClientChina {

    /** Page whose HTML embeds the area statistics as a script assignment. */
    private static final String SOURCE_URL =
            "https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0";

    /** Browser-like User-Agent header sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0";

    /**
     * Captures the text that follows {@code window.getAreaStat = } up to (not including) the
     * closing brace that is immediately followed by {@code catch}. Compiled once because the
     * expression never changes; the dot is escaped so it only matches a literal {@code .}.
     */
    private static final Pattern AREA_STAT_PATTERN =
            Pattern.compile("window\\.getAreaStat = (.*?)\\}(?=catch)");

    /**
     * Runs the scraper once; the parsed array is printed by {@link #parseHtml1()}.
     *
     * @param args unused
     * @throws Exception if the request or the parsing fails
     */
    public static void main(String[] args) throws Exception {
        parseHtml1();
    }

    /**
     * Downloads the page, extracts the embedded {@code window.getAreaStat} payload with a regular
     * expression and parses it as a JSON array. The parsed array is also printed to standard
     * output.
     *
     * @return the parsed array, or {@code null} when the HTTP status is not 200 (in which case a
     *     failure message is printed) or when the page does not contain the expected payload
     * @throws IOException if executing the request or reading the response body fails
     */
    public static JSONArray parseHtml1() throws IOException {
        HttpGet get = new HttpGet(SOURCE_URL);
        get.setHeader("User-Agent", USER_AGENT);

        // try-with-resources releases the connection and the client's pool on every path,
        // including early returns and exceptions.
        try (CloseableHttpClient client = HttpClientBuilder.create().build();
                CloseableHttpResponse response = client.execute(get)) {
            if (response.getStatusLine().getStatusCode() != 200) {
                System.out.println("请求失败");
                return null;
            }

            HttpEntity resEntity = response.getEntity();
            String message = EntityUtils.toString(resEntity, StandardCharsets.UTF_8);

            // The data embedded in the HTML looks like JSON, so it is pulled out with a regex.
            Matcher totalMatcher = AREA_STAT_PATTERN.matcher(message);
            if (!totalMatcher.find()) {
                return null;
            }

            // The payload is a list of areas; iterate over the returned array to persist the
            // entries (for example into a database).
            JSONArray array = JSONArray.parseArray(totalMatcher.group(1));
            System.out.println(array);
            return array;
        }
    }
}
最开始使用 jsoup 和 socket 去获取数据,效果不太理想,后来决定改用 HttpClient。目前待解决的问题是:将该 JSON 数组的数据转为 Excel 以便于查看,但是尝试了很久还没有成功,留待以后改进。使用的 jar 如下:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.10-FINAL</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>