选择163新闻网站的某一篇新闻(https://news.163.com/18/0920/13/DS5ARO3R0001899O.html)进行爬取。
如果该新闻已不存在,读者可参考本文另选一篇新闻进行爬取。
import com.xucj.jsoup.Httpclientutil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
/**
 * Downloads one NetEase (163) news page and prints the fields extracted from it
 * to standard output: title, the first two space-separated tokens of the
 * time/source line, the body paragraphs, and the responsible editor.
 *
 * <p>Every field is optional: when a selector matches nothing the field is
 * skipped instead of aborting the whole run.
 */
public class New163Spider {

    /** Address of the article to crawl. */
    private static final String NEWS_URL =
            "https://news.163.com/18/0920/13/DS5ARO3R0001899O.html";

    /**
     * Fetches {@link #NEWS_URL}, parses it with jsoup and prints the extracted fields.
     *
     * @param args unused
     * @throws IOException if the HTTP request fails
     */
    public static void main(String[] args) throws IOException {
        String html = Httpclientutil.getHtmlByUrl(NEWS_URL);
        if (html == null) {
            // getHtmlByUrl returns null for any non-200 response; Jsoup.parse
            // would reject a null input, so stop here with a readable message.
            System.err.println("Failed to download page: " + NEWS_URL);
            return;
        }
        Document document = Jsoup.parse(html);

        // 1. News title. first() yields null on an empty match, whereas get(0) would throw.
        Element title = document.select("#epContentLeft h1").first();
        if (title != null) {
            System.out.println(title.text());
        }

        // 2. News time and 3. news source: the first and second space-separated
        // tokens of the ".post_time_source" text.
        // NOTE(review): which token holds what depends on the page layout - confirm.
        Element timeAndSource = document.select(".post_time_source").first();
        if (timeAndSource != null) {
            String[] tokens = timeAndSource.text().split(" ");
            System.out.println(tokens[0]);
            if (tokens.length > 1) {
                System.out.println(tokens[1]);
            }
        }

        // 4. News content: one line per paragraph. Element.text() already returns
        // text without markup, so no manual <p> stripping is needed. select()
        // never returns null and iterating an empty result is a no-op.
        for (Element paragraph : document.select(".end-text>p")) {
            System.out.println(paragraph.text());
        }

        // 5. Responsible editor.
        Element editor = document.select("[class=ep-source cDGray] .ep-editor").first();
        if (editor != null) {
            System.out.println(editor.text());
        }
    }
}
Httpclientutil工具类:
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Small HTTP helper built on Apache HttpClient.
 */
public class Httpclientutil {

    /**
     * Issues an HTTP GET for {@code url} and returns the response body decoded as UTF-8.
     *
     * <p>The client and the response are both closed before this method returns,
     * whether it completes normally or throws.
     *
     * @param url address to fetch
     * @return the response body when the status code is 200; {@code null} for any
     *         other status code
     * @throws IOException if executing the request or reading the body fails
     */
    public static String getHtmlByUrl(String url) throws IOException {
        // Build the request first so that an invalid URL fails before any client is created.
        HttpGet httpGet = new HttpGet(url);
        // Optional request headers (e.g. User-Agent) can be set on httpGet here.

        // try-with-resources closes the response first and then the client,
        // releasing the underlying connection on every exit path.
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            if (response.getStatusLine().getStatusCode() == 200) {
                return EntityUtils.toString(response.getEntity(), "UTF-8");
            }
            return null;
        }
    }
}