用到httpclient包跟jsoup包
要处理的URL:https://news.ecnu.edu.cn/cf/4c/c1833a118604/page.psp
爬取范围:从 c1833a118604 开始,页面编号逐次减 1,共 99 个页面(c1833a118604 —— c1833a118506)
首先对URL做处理,从中提取出页面编号:
/**
 * Extracts the numeric page id embedded in the hard-coded start URL.
 *
 * <p>The URL is split on {@code "/"}; the element at index 5 is the article
 * segment ({@code "c1833a118604"}). The digits that follow the first
 * {@code 'a'} in that segment are parsed as the page id.
 *
 * @return the page id parsed from the start URL
 */
public static int subUrl() {
    final String startUrl = "https://news.ecnu.edu.cn/cf/4c/c1833a118604/page.psp";
    // Index 5 of the split result is the path segment that carries the page id.
    final String segment = startUrl.split("/")[5];
    // Everything after the first 'a' is the numeric part of the segment.
    final int separatorIndex = segment.indexOf("a");
    final String digits = segment.substring(separatorIndex + 1);
    return Integer.parseInt(digits);
}
然后抓取页面信息到本地:
/**
 * Downloads a run of news pages and stores each one as a local HTML file.
 *
 * <p>Starting from the page number returned by {@code SubUrl.subUrl()}, the
 * crawler fetches 99 pages, decrementing the page number after each one. Every
 * response body is parsed with jsoup and the inner HTML of its {@code <html>}
 * element is written, UTF-8 encoded, to {@code sunbeam/result<page>.html}.
 */
public class HttpRequest {

    /**
     * Runs the crawl.
     *
     * @param args unused
     * @throws Exception if a request, reading a response body, or writing a file fails;
     *     the crawl stops at the first failure
     */
    public static void main(String[] args) throws Exception {
        int page = SubUrl.subUrl();
        // A single client is shared by all requests and closed when the crawl ends,
        // instead of creating (and leaking) a new client on every iteration.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            for (int i = 0; i < 99; i++) {
                String url = "https://news.ecnu.edu.cn/cf/4c/c1833a" + page + "/page.psp";
                HttpGet httpGet = new HttpGet(url);

                String content;
                // try-with-resources releases the connection even if reading the body throws.
                try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                    HttpEntity entity = response.getEntity();
                    content = EntityUtils.toString(entity, "utf-8");
                }

                Document document = Jsoup.parse(content);
                Elements elements = document.getElementsByTag("html");
                String html = elements.html();

                // Target file for this page.
                String fileName = "sunbeam//result" + page + ".html";
                File file = new File(fileName);
                File fileParent = file.getParentFile();
                if (!fileParent.exists()) {
                    // Create the output directory on first use.
                    fileParent.mkdirs();
                }

                // FileOutputStream creates the file if it is missing, so no separate
                // createNewFile() call is needed; the writer is closed even on failure.
                try (OutputStreamWriter osw =
                        new OutputStreamWriter(new FileOutputStream(file), "UTF-8")) {
                    osw.write(html);
                }

                page--;
            }
        }
    }
}