// Java利用正则表达式获取网页端的标签内容
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page and extracts the text of its {@code <title>} element
 * using regular expressions.
 *
 * <p>Constructing an instance fetches the page at the given URL and prints the
 * extracted title to standard output. The helper methods
 * ({@link #getHtmlSource(String)}, {@link #getTitle(String)} and
 * {@link #outTag(String)}) may also be called individually.
 */
public class GetHtmlTitle {

    /**
     * Matches one {@code <title>} element and captures its inner content in group 1.
     * Case-insensitive so {@code <TITLE>} is accepted, tolerant of attributes on the
     * opening tag, and DOTALL so the content may span line breaks.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title\\b[^>]*>(.*?)</title\\s*>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches any single markup tag, including one that wraps across a line break. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

    /**
     * Fetches the page at {@code htmlUrl} and prints its title to standard output.
     *
     * @param htmlUrl absolute URL of the page to read
     */
    public GetHtmlTitle(String htmlUrl) {
        System.out.println("------------开始读取网页(" + htmlUrl + ")-----------");
        String htmlSource = getHtmlSource(htmlUrl);
        // The trailing "\n" adds a blank separator line; the original printed a literal "/n".
        System.out.println("------------读取网页(" + htmlUrl + ")结束-----------\n");
        System.out.println("------------分析(" + htmlUrl + ")结果如下-----------\n");
        String title = getTitle(htmlSource);
        System.out.println("网站标题: " + title);
    }

    /**
     * Reads the content behind {@code htmlUrl}, decoding it as UTF-8.
     *
     * <p>The content is read line by line and the lines are concatenated without
     * their line terminators. Failures are reported rather than thrown: a malformed
     * URL prints a message to standard output, any other I/O error prints its stack
     * trace, and whatever was read up to that point (possibly nothing) is returned.
     *
     * @param htmlUrl absolute URL of the page to read
     * @return the page content with line terminators removed; empty if nothing could be read
     */
    public String getHtmlSource(String htmlUrl) {
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(htmlUrl);
            // try-with-resources closes the reader (and the network stream) even when a read fails.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(url.openStream(), "UTF-8"))) {
                String line;
                while ((line = in.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (MalformedURLException e) {
            System.out.println("你输入的URL格式有问题!请仔细输入");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Extracts the text of every {@code <title>} element found in {@code htmlSource}.
     *
     * <p>If several title elements are present their contents are concatenated in
     * document order. Any markup nested inside a title is stripped via
     * {@link #outTag(String)}.
     *
     * @param htmlSource HTML text to scan; may be {@code null}
     * @return the concatenated title text, or an empty string when there is no title
     *         or {@code htmlSource} is {@code null}
     */
    public String getTitle(String htmlSource) {
        if (htmlSource == null) {
            return "";
        }
        StringBuilder title = new StringBuilder();
        Matcher matcher = TITLE_PATTERN.matcher(htmlSource);
        while (matcher.find()) {
            // Group 1 is the content between the opening and closing title tags.
            title.append(matcher.group(1));
        }
        return outTag(title.toString());
    }

    /**
     * Removes every markup tag ({@code <...>}) from {@code s}.
     *
     * @param s text possibly containing tags; may be {@code null}
     * @return {@code s} with all tags removed, or an empty string when {@code s} is {@code null}
     */
    public String outTag(String s) {
        if (s == null) {
            return "";
        }
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /**
     * Demo entry point: prints the title of a sample article.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String htmlUrl = "https://blog.csdn.net/zty1317313805/article/details/80097511";
        new GetHtmlTitle(htmlUrl);
    }
}