利用Jsoup、正则抽取网页的标题、作者、发布时间以及正文
源网站
利用Jsoup进行抽取
/**
 * Demo entry point: downloads one article page and prints the extracted
 * title, publish time, author and first non-empty paragraph, in that order.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String articleUrl = "https://auto.gasgoo.com/a/70125507.html";
    final String html = getHTML(articleUrl, "UTF-8");

    // Each field is extracted and printed before the next one is extracted,
    // so output produced so far survives a failure in a later extractor.
    String title = getTitle(html);
    System.out.println(title);

    String publishTime = getTime(html);
    System.out.println(publishTime);

    String author = getAuthor(html);
    System.out.println(author);

    String firstParagraph = getContent(html).get(0);
    System.out.println(firstParagraph);
}
/**
 * Extracts the publish-time text from an article page using Jsoup.
 *
 * <p>Two page layouts are handled. If the document contains at least one
 * element with class {@code userInfo}, the text of the first {@code <span>}
 * inside the first such element is returned. Otherwise the text of the second
 * {@code <span>} inside the first element with class {@code pageInfo} is
 * returned.
 *
 * @param sourceStr HTML source of the page
 * @return the text of the selected {@code <span>}
 * @throws IndexOutOfBoundsException if the selected container or span is absent
 */
public static String getTime(String sourceStr) {
    Document page = Jsoup.parse(sourceStr);

    // The layout decides both which container to look in and which span to take.
    boolean hasUserInfo = !page.getElementsByClass("userInfo").isEmpty();
    String containerClass = hasUserInfo ? "userInfo" : "pageInfo";
    int spanIndex = hasUserInfo ? 0 : 1;

    return page.getElementsByClass(containerClass)
            .get(0)
            .getElementsByTag("span")
            .get(spanIndex)
            .text();
}
/**
 * Extracts the author name from an article page using Jsoup.
 *
 * <p>The site uses more than one markup style for the author, so two layouts
 * are handled:
 * <ul>
 *   <li>If an element with class {@code userInfo} exists, the {@code <a>}
 *       elements inside the first one are inspected: with exactly two links
 *       the second link's text is returned, otherwise the first link's.</li>
 *   <li>Otherwise the text of the first {@code <a>} inside the first element
 *       with class {@code scribe} is returned.</li>
 * </ul>
 *
 * <p>The {@code scribe} element is only looked up in the fallback branch, so
 * pages that have {@code userInfo} but no {@code scribe} element are handled
 * instead of failing up front.
 *
 * @param sourceStr HTML source of the page
 * @return the text of the selected link
 * @throws IndexOutOfBoundsException if the selected layout contains no
 *         {@code <a>} element, or if neither container is present
 */
public static String getAuthor(String sourceStr) {
    Document doc = Jsoup.parse(sourceStr);

    Elements userInfo = doc.getElementsByClass("userInfo");
    if (!userInfo.isEmpty()) {
        Elements links = userInfo.get(0).getElementsByTag("a");
        // With exactly two links the second one is used; in every other case the first.
        int authorIndex = (links.size() == 2) ? 1 : 0;
        return links.get(authorIndex).text();
    }

    // Fallback layout: the author link sits inside the "scribe" element.
    return doc.getElementsByClass("scribe").get(0).getElementsByTag("a").get(0).text();
}
/**
 * Extracts the article title from a page using Jsoup.
 *
 * <p>If the document contains an element with class {@code detailed}, the
 * first {@code <h1>} inside the first such element is used; otherwise the
 * first {@code <h1>} inside the first element matched by the class string
 * {@code "articleNR fl"}.
 *
 * @param sourceStr HTML source of the page
 * @return the text of the selected {@code <h1>}
 * @throws IndexOutOfBoundsException if the selected container or heading is absent
 */
public static String getTitle(String sourceStr) {
    Document page = Jsoup.parse(sourceStr);

    // NOTE(review): "articleNR fl" contains a space, i.e. two class tokens passed as
    // one class name; whether this matches may depend on the Jsoup version — confirm.
    String wrapperClass = page.getElementsByClass("detailed").isEmpty()
            ? "articleNR fl"
            : "detailed";

    return page.getElementsByClass(wrapperClass)
            .get(0)
            .getElementsByTag("h1")
            .get(0)
            .text();
}
/**
 * Extracts the article body as a list of paragraph texts.
 *
 * <p>The body container is located by its id {@code ArticleContent}; the text
 * of every {@code <p>} element inside it is collected in document order, and
 * paragraphs whose text is empty are skipped.
 *
 * @param sourceStr HTML source of the page
 * @return the non-empty paragraph texts; an empty list when the page has no
 *         element with id {@code ArticleContent}
 */
public static List<String> getContent(String sourceStr) {
    List<String> paragraphs = new ArrayList<>();
    Document document = Jsoup.parse(sourceStr);

    // getElementById yields null when the id is absent; return the empty list
    // rather than dereferencing null.
    Element content = document.getElementById("ArticleContent");
    if (content == null) {
        return paragraphs;
    }

    for (Element paragraph : content.getElementsByTag("p")) {
        String text = paragraph.text();
        if (!text.isEmpty()) {
            paragraphs.add(text);
        }
    }
    return paragraphs;
}
/**
 * Downloads the source of a web page as a string.
 *
 * <p>The request carries a fixed desktop-browser {@code User-Agent} header.
 * The response is read line by line and every line is re-terminated with
 * {@code "\r\n"}, so the result ends with a line terminator whenever at least
 * one line was read.
 *
 * <p>This method is best-effort and never throws: any failure (malformed URL,
 * non-HTTP URL, unknown charset, I/O error) is reported via
 * {@code printStackTrace()} and whatever was read up to that point (possibly
 * the empty string) is returned.
 *
 * @param pageURL absolute URL of the page to fetch
 * @param charset name of the charset used to decode the response body
 * @return the page source, or the part of it read before a failure
 */
public static String getHTML(String pageURL, String charset) {
    StringBuilder pageHTML = new StringBuilder();
    HttpURLConnection connection = null;
    try {
        URL url = new URL(pageURL);
        connection = (HttpURLConnection) url.openConnection();
        connection.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36");
        // try-with-resources closes the reader (and the underlying stream) even
        // when reading fails part-way through.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), charset))) {
            String line;
            while ((line = br.readLine()) != null) {
                pageHTML.append(line).append("\r\n");
            }
        }
    } catch (Exception e) {
        // Deliberately broad: callers rely on this method never throwing.
        e.printStackTrace();
    } finally {
        // Release the connection on the failure path as well as on success.
        if (connection != null) {
            connection.disconnect();
        }
    }
    return pageHTML.toString();
}
利用正则表达式进行抽取
/**
 * Extracts the article id from a page URL, i.e. the last path segment without
 * its {@code .shtml} or {@code .html} extension.
 *
 * <p>The {@code .shtml} form is tried first; the {@code .html} form is only
 * evaluated when the first yields an empty string. The dot before the
 * extension is escaped so that it matches a literal {@code '.'} only, not an
 * arbitrary character.
 *
 * <p>NOTE(review): this relies on {@code RegexTools.getRegexResult} returning
 * the captured group, and an empty string when nothing matches — confirm
 * against that helper.
 *
 * @param url page URL
 * @return the extracted id, or whatever {@code RegexTools.getRegexResult}
 *         returns for the {@code .html} pattern when the {@code .shtml}
 *         pattern gives an empty string
 */
public String getId(String url) {
    String shtmlId = RegexTools.getRegexResult(url, ".+\\/(.*?)\\.shtml");
    if (!shtmlId.equals("")) {
        return shtmlId;
    }
    return RegexTools.getRegexResult(url, ".+\\/(.*?)\\.html");
}
/**
 * Extracts the article title from the page source with regular expressions.
 *
 * <p>Two patterns are evaluated: an {@code <h1>} following a
 * {@code <div class="detailed">}, and an {@code <h1>} directly inside a
 * {@code <div class="articleNR fl">}. The first result is preferred unless it
 * is the empty string.
 *
 * @param sourceStr HTML source of the page
 * @return the result of the first pattern if non-empty, otherwise the result
 *         of the second pattern
 */
public String getTitle(String sourceStr) {
    String fromDetailed =
            RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"detailed\">.*<h1>(.*?)</h1>");
    String fromArticle =
            RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"articleNR\\sfl\">\\s*<h1>(.*?)</h1>");

    return fromDetailed.equals("") ? fromArticle : fromDetailed;
}
/**
 * Extracts the publish-time text from the page source with regular expressions.
 *
 * <p>Two patterns are evaluated: the first {@code <span>} inside a
 * {@code <div class="userInfo">}, and the {@code <span>} that sits immediately
 * before a {@code <span class="scribe">}. The first result is preferred unless
 * it is the empty string.
 *
 * @param sourceStr HTML source of the page
 * @return the result of the first pattern if non-empty, otherwise the result
 *         of the second pattern
 */
public String getTime(String sourceStr) {
    String fromUserInfo = RegexTools.getRegexResult(
            sourceStr, "<div\\sclass=\"userInfo\">.*?<span>(.*?)</span>.*?</div>");
    String beforeScribe = RegexTools.getRegexResult(
            sourceStr, "</a>\\s*</span>\\s*<span>(.*?)</span>\\s*<span\\sclass=\"scribe\">");

    return fromUserInfo.equals("") ? beforeScribe : fromUserInfo;
}
/**
 * Extracts the author from the page source with regular expressions.
 *
 * <p>The site uses several markup styles for the author, so four patterns are
 * evaluated, in this priority order:
 * <ol>
 *   <li>an {@code <a ... class="editor">} inside {@code <div class="userInfo">};</li>
 *   <li>any {@code <a>} inside {@code <div class="userInfo">};</li>
 *   <li>plain text between {@code <div class="userInfo">} and the next {@code <span>};</li>
 *   <li>the text after an {@code <img>} inside the link of {@code <span class="scribe">}.</li>
 * </ol>
 * The first non-empty result among the first three is returned; otherwise the
 * fourth result is returned as-is.
 *
 * @param sourceStr HTML source of the page
 * @return the highest-priority non-empty result, or the fourth pattern's result
 */
public String getAuthor(String sourceStr) {
    // All four patterns are evaluated up front, highest priority first.
    String[] candidates = {
        RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"userInfo\">.*?<a.*?class=\"editor\">(.*?)</a>.*?</div>"),
        RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"userInfo\">.*?<a.*?>(.*?)</a>.*?</div>"),
        RegexTools.getRegexResult(sourceStr, "<div\\sclass=\"userInfo\">\\s+(.*?)\\s+<span>"),
        RegexTools.getRegexResult(sourceStr, "<span\\sclass=\"scribe\">\\s*<a\\shref=.*?>\\s*<img\\s.*?>(.*?)\\s*</a>")
    };

    int last = candidates.length - 1;
    for (int i = 0; i < last; i++) {
        if (!candidates[i].equals("")) {
            return candidates[i];
        }
    }
    // Nothing earlier matched: the last pattern's result is returned unchecked.
    return candidates[last];
}
/**
 * Extracts the article body as a list of paragraph texts.
 *
 * <p>The body container is located by its id {@code ArticleContent}; the text
 * of every {@code <p>} element inside it is collected in document order, and
 * paragraphs whose text is empty are skipped.
 *
 * @param sourceStr HTML source of the page
 * @return the non-empty paragraph texts; an empty list when the page has no
 *         element with id {@code ArticleContent}
 */
public List<String> getContent(String sourceStr) {
    List<String> paragraphs = new ArrayList<>();
    Document document = Jsoup.parse(sourceStr);

    // getElementById yields null when the id is absent; return the empty list
    // rather than dereferencing null.
    Element content = document.getElementById("ArticleContent");
    if (content == null) {
        return paragraphs;
    }

    for (Element paragraph : content.getElementsByTag("p")) {
        String text = paragraph.text();
        if (!text.isEmpty()) {
            paragraphs.add(text);
        }
    }
    return paragraphs;
}
完!