简单地从网上获取小说内容:
package com.company.web;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page and prints the text of its {@code <title>} and
 * {@code <p>} elements to standard output.
 *
 * <p>The download happens in the constructor. The extraction helpers
 * ({@link #getTitle(String)}, {@link #getPLabel(String)}, {@link #outTag(String)})
 * are regex based and work on any HTML string; they are not a real HTML parser.
 */
public class HttpRequest {

    /** Name of the charset used to decode the response body. */
    static String charsetName = "gbk";

    /** Matches a whole {@code <title>...</title>} element, in any letter case, across lines. */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title(?:\\s[^>]*)?>.*?</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /**
     * Matches a whole {@code <p ...>...</p>} element. The tag name must be followed by
     * whitespace or {@code >} so that {@code <pre>}, {@code <param>}, {@code <path>} and
     * similar tags are not mistaken for paragraphs.
     */
    private static final Pattern PARAGRAPH_PATTERN = Pattern.compile(
            "<p(?:\\s[^>]*)?>(.*?)</p>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Matches any single tag, i.e. the shortest {@code <...>} run. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>", Pattern.DOTALL);

    /**
     * Fetches the page at {@code htmlUrl}, then prints its title text followed by its
     * paragraph text. If the page cannot be read, an error message and the stack trace
     * are printed instead; no exception escapes the constructor.
     *
     * @param htmlUrl address of the page to download
     */
    HttpRequest(String htmlUrl) {
        try {
            String html = fetchHtml(htmlUrl);
            // Static helpers are used here so that the constructor never calls an
            // overridable method on a partially constructed instance.
            String title = extractTitle(html);
            String paragraphs = extractParagraphs(html);
            System.out.println(title);
            System.out.println(paragraphs);
        } catch (IOException e) {
            System.out.println("无法打开网页!!");
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole response body of {@code htmlUrl}, decoded with {@link #charsetName}.
     * Lines are concatenated without any separator, so the result contains no line breaks.
     *
     * @param htmlUrl address of the page to download
     * @return the page content as a single line
     * @throws IOException if the URL is malformed, the charset is unsupported,
     *                     or the connection cannot be opened or read
     */
    private static String fetchHtml(String htmlUrl) throws IOException {
        URLConnection connection = new URL(htmlUrl).openConnection();
        StringBuilder html = new StringBuilder();
        // try-with-resources closes the stream and the reader on every path,
        // including when reading fails half-way.
        try (InputStream in = connection.getInputStream();
             BufferedReader reader = new BufferedReader(new InputStreamReader(in, charsetName))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
        }
        return html.toString();
    }

    /**
     * Returns the text inside every {@code <title>} element of {@code html}.
     * Multiple titles are concatenated in document order with no separator.
     *
     * @param html HTML source to search
     * @return the title text with all tags removed, or an empty string if there is no title
     * @throws NullPointerException if {@code html} is {@code null}
     */
    public String getTitle(String html) {
        return extractTitle(html);
    }

    /**
     * Returns the text inside every {@code <p>} element of {@code html}.
     * Each paragraph is preceded by a newline character, so a non-empty result
     * always starts with {@code "\n"}.
     *
     * @param html HTML source to search
     * @return the paragraph text with all tags removed, or an empty string if there are no paragraphs
     * @throws NullPointerException if {@code html} is {@code null}
     */
    public String getPLabel(String html) {
        return extractParagraphs(html);
    }

    /**
     * Removes every tag (any {@code <...>} run) from {@code str}.
     *
     * @param str text that may contain tags
     * @return {@code str} without tags
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public String outTag(String str) {
        return stripTags(str);
    }

    /** Collects all title elements of {@code html} and strips their tags. */
    private static String extractTitle(String html) {
        StringBuilder matches = new StringBuilder();
        Matcher matcher = TITLE_PATTERN.matcher(html);
        while (matcher.find()) {
            matches.append(matcher.group());
        }
        return stripTags(matches.toString());
    }

    /** Collects all paragraph elements of {@code html}, one per line, and strips their tags. */
    private static String extractParagraphs(String html) {
        StringBuilder matches = new StringBuilder();
        Matcher matcher = PARAGRAPH_PATTERN.matcher(html);
        while (matcher.find()) {
            matches.append('\n').append(matcher.group());
        }
        return stripTags(matches.toString());
    }

    /** Deletes every match of {@link #TAG_PATTERN} from {@code text}. */
    private static String stripTags(String text) {
        return TAG_PATTERN.matcher(text).replaceAll("");
    }

    public static void main(String[] args) {
        // Address of the page to download.
        String htmlUrl = "https://www.x23qb.com/book/1081/228497.html";
        new HttpRequest(htmlUrl);
    }
}
效果图: