此种方法只适用于网页源代码中已包含所要提取信息的情况。
此例中，我们所要获取信息的网页是：“https://blog.csdn.net/”，其中又包含不同的文章，程序会再进一步获取各篇文章的信息。
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.net.ssl.*;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.List;
/**
 * Entry point of the crawler: collects article links from a listing page and
 * hands them to {@link testThread} for processing.
 *
 * <p>SECURITY NOTE(review): loading this class installs a trust-all
 * {@code SSLSocketFactory} and an accept-all {@code HostnameVerifier} as the
 * JVM-wide defaults of {@link HttpsURLConnection}. That disables TLS
 * certificate and host-name validation for every connection using those
 * defaults and should not be shipped in production code. The behavior is kept
 * here because the rest of the program may rely on it.
 */
public class test {

    /** Browser-like user agent sent with the listing-page request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                    + "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36";

    static {
        try {
            trustAllHttpsCertificates();
            HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {
                @Override
                public boolean verify(String urlHostName, SSLSession session) {
                    // Accept every host name (see the security note on the class).
                    return true;
                }
            });
        } catch (Exception e) {
            // Best effort: keep the class loadable, but report the failure
            // instead of swallowing it silently.
            e.printStackTrace();
        }
    }

    /**
     * Installs an {@code SSLSocketFactory} that accepts any certificate chain
     * as the default for {@link HttpsURLConnection}.
     *
     * @throws NoSuchAlgorithmException if no provider supports the "SSL" context
     * @throws KeyManagementException   if the context cannot be initialized
     */
    private static void trustAllHttpsCertificates()
            throws NoSuchAlgorithmException, KeyManagementException {
        TrustManager[] trustAllCerts = new TrustManager[] {new TrustAllManager()};
        SSLContext sc = SSLContext.getInstance("SSL");
        sc.init(null, trustAllCerts, null);
        HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Trust manager whose checks never reject a certificate chain. */
    private static class TrustAllManager implements X509TrustManager {

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            // The X509TrustManager contract requires a non-null (possibly empty) array.
            return new X509Certificate[0];
        }

        @Override
        public void checkServerTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: every server chain is accepted.
        }

        @Override
        public void checkClientTrusted(X509Certificate[] certs, String authType)
                throws CertificateException {
            // Intentionally empty: every client chain is accepted.
        }
    }

    /**
     * Fetches the page at {@code Url} and returns the absolute link of every
     * {@code <a>} found under an {@code <h2>} inside an element with class
     * {@code title}.
     *
     * @param Url address of the listing page to fetch
     * @return the absolute article URLs in document order; links that cannot be
     *         resolved to an absolute URL are skipped
     * @throws IOException if the page cannot be fetched
     */
    public static List<String> getText(String Url) throws IOException {
        List<String> urlList = new ArrayList<>();
        // Fetch and parse the page source.
        Document document = Jsoup.connect(Url)
                .timeout(4000)
                .ignoreContentType(true)
                .userAgent(USER_AGENT)
                .get();
        Elements urlNode = document.getElementsByClass("title").select("h2").select("a");
        for (Element element : urlNode) {
            // The "abs:" prefix asks jsoup for the absolute form of the href.
            String link = element.attr("abs:href");
            if (!link.isEmpty()) {
                urlList.add(link);
            }
        }
        return urlList;
    }

    /**
     * Collects the article links from the CSDN blog home page and processes them.
     *
     * @param args unused
     * @throws IOException if the home page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        List<String> urlist = getText("https://blog.csdn.net/");
        testThread testThread = new testThread(urlist);
        // run() executes the work on the calling thread; start() would be
        // required to execute it on a new thread.
        testThread.run();
    }
}
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Visits each article URL, saves the text of its {@code #article_content}
 * paragraphs to a {@code .txt} file under {@link #Path}, and schedules a
 * {@link DownloadTask} for every image found in those paragraphs.
 */
public class testThread extends Thread {

    /** Number of worker threads used for image downloads. */
    private static final int DOWNLOAD_THREADS = 9;

    /** Characters that are not allowed in Windows file names. */
    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /** Pattern removed from the article text by {@link #FileterHtml(String)}; compiled once. */
    private static final Pattern SPECIAL = Pattern.compile(" ", Pattern.CASE_INSENSITIVE);

    /** Article URLs to process. */
    List<String> urlist;

    /**
     * @param urlist article URLs to process
     */
    public testThread(List<String> urlist) {
        this.urlist = urlist;
    }

    /** Output directory for the generated text files. */
    public static String Path = "D:\\CSDN\\";

    /**
     * Creates {@code fileName} (and its parent directory) if it does not exist.
     * Failures are reported on stderr and not propagated.
     *
     * @param fileName file to create
     * @throws Exception declared for compatibility; failures are caught and printed
     */
    public static void createFile(File fileName) throws Exception {
        try {
            File parent = fileName.getParentFile();
            if (parent != null && !parent.exists()) {
                // createNewFile() fails when the directory is missing.
                parent.mkdirs();
            }
            if (!fileName.exists()) {
                fileName.createNewFile();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code content} to {@code fileName} as UTF-8, replacing any
     * previous content. Failures are reported on stderr and not propagated.
     *
     * @param content  text to write
     * @param fileName destination file
     * @throws Exception declared for compatibility; failures are caught and printed
     */
    public static void writeTxtFile(String content, File fileName) throws Exception {
        // try-with-resources closes the stream even when write() fails.
        try (FileOutputStream out = new FileOutputStream(fileName)) {
            out.write(content.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Processes every URL in {@link #urlist}. A failure on one article is
     * printed and does not stop the remaining articles.
     */
    @Override
    public void run() {
        if (urlist == null || urlist.isEmpty()) {
            return;
        }
        // One pool shared by all articles instead of a new pool per article.
        ExecutorService pool = Executors.newFixedThreadPool(DOWNLOAD_THREADS);
        try {
            for (String url : urlist) {
                try {
                    processArticle(url, pool);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        } finally {
            // Already-submitted downloads still run to completion after shutdown().
            pool.shutdown();
        }
    }

    /** Fetches one article, queues its image downloads and writes its text file. */
    private void processArticle(String url, ExecutorService pool) throws Exception {
        Document document = Jsoup.connect(url).timeout(6000).get();
        Elements paragraphs = document.select("#article_content").select("p");

        for (Element image : paragraphs.select("img")) {
            // "abs:" resolves relative image paths against the page URL.
            String src = image.attr("abs:src");
            if (!src.isEmpty()) {
                pool.execute(new DownloadTask(src));
            }
        }

        File file = new File(Path + toFileName(document.title()) + ".txt");
        createFile(file);
        System.out.println("创建文件:" + file.getPath());
        writeTxtFile(FileterHtml(paragraphs.html()), file);
    }

    /** Turns a page title into a name that is safe to use as a file name. */
    private static String toFileName(String title) {
        String cleaned = ILLEGAL_FILE_CHARS.matcher(title.trim()).replaceAll("_");
        return cleaned.isEmpty() ? "untitled" : cleaned;
    }

    /**
     * Removes markup that is not wanted in the saved text: {@code <br>} becomes
     * a line break; {@code </br>}, {@code <strong>}, {@code </strong>} and
     * {@code &nbsp;} entities are dropped, as is every match of the
     * single-character pattern {@link #SPECIAL}.
     *
     * @param content HTML fragment to clean
     * @return the cleaned text
     */
    public String FileterHtml(String content) {
        String stripped = content
                .replace("<br>", "\n")
                .replace("<strong>", "")
                .replace("</br>", "")
                .replace("</strong>", "")
                // NOTE(review): html() output encodes non-breaking spaces as the
                // entity "&nbsp;", which the character pattern below cannot match.
                .replace("&nbsp;", "");
        return SPECIAL.matcher(stripped).replaceAll("");
    }
}
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Downloads the resource at one HTTP(S) URL into the output directory, naming
 * the file after the last path segment of the URL plus a {@code .jpg} suffix.
 * All failures are reported on stderr; {@link #run()} never throws for a bad URL.
 */
public class DownloadTask implements Runnable {

    /** Directory the downloaded files are written to. */
    private static final String OUTPUT_DIR = "D:\\CSDN\\";

    /** Connect and read timeout, so a stalled server cannot block a worker forever. */
    private static final int TIMEOUT_MILLIS = 6000;

    /** URL of the resource to download. */
    String str;

    /**
     * @param str URL of the resource to download
     */
    public DownloadTask(String str) {
        this.str = str;
    }

    /** Downloads {@link #str}; skips empty and non-HTTP(S) URLs. */
    @Override
    public void run() {
        if (str == null || str.isEmpty()) {
            System.err.println("Skipping empty image URL");
            return;
        }
        HttpURLConnection conn = null;
        try {
            URL url = new URL(str);
            String protocol = url.getProtocol();
            // Only HTTP(S) connections can be cast to HttpURLConnection; this also
            // keeps page-supplied "file:" URLs from reading local files.
            if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
                System.err.println("Skipping non-HTTP image URL: " + str);
                return;
            }
            conn = (HttpURLConnection) url.openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);

            String file = OUTPUT_DIR + fileNameFor(str) + ".jpg";
            java.io.File parent = new java.io.File(file).getParentFile();
            if (parent != null) {
                // FileOutputStream does not create missing directories.
                parent.mkdirs();
            }

            // The input is opened first so a failed request leaves no empty file;
            // both streams are closed automatically, even on error.
            try (InputStream in = conn.getInputStream();
                 FileOutputStream out = new FileOutputStream(file)) {
                byte[] buf = new byte[1024 + 16];
                int size;
                while (-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
            }

            // Download finished.
            String name = Thread.currentThread().getName();
            System.out.println(name + "下载" + file);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Derives a file-system-safe base name from a URL: the query string and
     * fragment are dropped, the last path segment is taken, and characters that
     * are illegal in file names are replaced by underscores.
     */
    private static String fileNameFor(String url) {
        String path = url;
        int query = path.indexOf('?');
        if (query >= 0) {
            path = path.substring(0, query);
        }
        int fragment = path.indexOf('#');
        if (fragment >= 0) {
            path = path.substring(0, fragment);
        }
        String name = path.substring(path.lastIndexOf('/') + 1)
                .replaceAll("[\\\\/:*?\"<>|]", "_");
        if (name.isEmpty()) {
            // No usable segment (URL ends with '/'): fall back to a hash-based name.
            name = "image_" + Integer.toHexString(url.hashCode());
        }
        return name;
    }
}