A Small Web Novel Crawler, 2.0
A couple of days ago I wrote a web novel crawler that could only download one chapter at a time. It was painfully slow and not much use; if you're interested, you can find it here. I've now improved it to download with multiple threads. Without further ado, here's the code.
The packages used are as follows:
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
A few notes:
Jsoup.parse will occasionally throw an exception, so a single chapter can fail to download; a retry sketch follows these notes.
When first trying the program out, you can comment out ftmp.deleteOnExit(); that way each chapter's individual file is kept.
You can raise the thread count, but doing so also raises the odds that Jsoup.parse throws an exception.
file_merge() was lifted from the web and is not my own work. If you're interested, see here (Java: multiple threads writing to the same file).
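Since the parse failures are intermittent, one workable fix is to retry the fetch a few times before giving up. Here is a minimal sketch; parseWithRetry, the attempt count, and the back-off are my own illustration, not part of the program below:

private static Document parseWithRetry(String url, int timeoutMillis, int maxAttempts)
        throws IOException, InterruptedException {
    IOException last = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            return Jsoup.parse(new URL(url), timeoutMillis); // same call the downloader uses
        } catch (IOException e) {
            last = e; // remember the failure and try again
            Thread.sleep(1000L * attempt); // simple linear back-off between attempts
        }
    }
    throw last; // every attempt failed; surface the last exception
}

With it, the Jsoup.parse(new URL(url), 30000) call inside run() would become parseWithRetry(url, 30000, 3).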
public static void main(String[] args) throws IOException, InterruptedException, ExecutionException {
    mdown1();
}

public static void mdown1() throws IOException, InterruptedException {
    final String chapter = "https://www.xxxxxx.org/xxxxx/"; // chapter index page of the novel
    Document document = Jsoup.parse(new URL(chapter), 30000);
    String bookname = document.getElementById("info").child(0).text(); // book title
    final Elements e2 = document.getElementById("list").getElementsByAttribute("href"); // chapter list
    final int tnum = 20; // run 20 threads
    final CountDownLatch downLatch = new CountDownLatch(e2.size()); // counts down once per chapter
    ExecutorService service = Executors.newFixedThreadPool(tnum); // thread pool
    ArrayList<File> files = new ArrayList<File>(); // all chapter files, in order, for the final merge
    int chapter_num = 0;
    for (; chapter_num < e2.size(); chapter_num++) {
        Element e = e2.get(chapter_num);
        final String title = e.text();
        final String url = chapter + e.attr("href");
        final FileWriter fwtmp = new FileWriter("d:/tmp/tt_" + chapter_num + ".txt"); // save path
        final BufferedWriter bwtmp = new BufferedWriter(fwtmp);
        File ftmp = new File("d:/tmp/tt_" + chapter_num + ".txt");
        ftmp.deleteOnExit(); // note: the per-chapter files are deleted on exit; comment this out to keep them
        files.add(ftmp);
        service.execute(new Runnable() {
            @Override
            public void run() {
                try {
                    System.out.println("Downloading " + title);
                    Document doc = Jsoup.parse(new URL(url), 30000);
                    Element content = doc.getElementById("content"); // locate the chapter body by its id
                    // write the chapter title
                    bwtmp.write("\r\n================" + title + "================\r\n");
                    // write the chapter body
                    for (TextNode tn : content.textNodes()) {
                        String tmp = tn.text().replace(" ", "");
                        if (tmp.contains("www")) continue; // skip ad lines
                        bwtmp.write(" " + tmp + "\r\n");
                    }
                    bwtmp.flush();
                    System.out.println(title + " finished");
                } catch (MalformedURLException e) {
                    e.printStackTrace();
                } catch (IOException e) {
                    e.printStackTrace();
                } finally {
                    downLatch.countDown(); // count down even on failure so await() cannot hang
                    try {
                        bwtmp.close(); // also closes the underlying fwtmp
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        });
    }
    downLatch.await(); // wait until every chapter has finished
    service.shutdown();
    System.out.println("Total chapters: " + chapter_num);
    // merge the chapters into one book
    String bookpath = "d:/tmp/" + bookname + ".txt"; // full path of the merged file
    file_merge(files, bookpath);
    System.out.println("Merge finished!");
}
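As an aside, the same "wait for everything" step can be done with Futures instead of a CountDownLatch (note that main() already declares ExecutionException). This is a hypothetical sketch; downloadChapter() stands in for the body of run() above, and it also needs imports for java.util.List and java.util.concurrent.Future:

List<Future<?>> tasks = new ArrayList<Future<?>>();
for (Element e : e2) {
    final String title = e.text();
    final String url = chapter + e.attr("href");
    tasks.add(service.submit(new Runnable() {
        @Override
        public void run() {
            downloadChapter(title, url); // hypothetical: same work as run() above
        }
    }));
}
for (Future<?> f : tasks) {
    f.get(); // blocks until that chapter is done; wraps task failures in ExecutionException
}
service.shutdown();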
public static void file_merge(ArrayList<File> files, String dstFilePath) {
    File fdst = new File(dstFilePath);
    FileOutputStream out = null;
    FileChannel outChannel = null;
    try {
        out = new FileOutputStream(fdst, true);
        outChannel = out.getChannel();
        // write position: begin at the current end of the destination file
        long start = fdst.length();
        for (File file : files) {
            FileInputStream in = new FileInputStream(file);
            FileChannel inChannel = in.getChannel();
            // pull file.length() bytes from inChannel into outChannel at position start
            outChannel.transferFrom(inChannel, start, file.length());
            start += file.length();
            in.close(); // also closes inChannel
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (outChannel != null) outChannel.close();
            if (out != null) out.close();
        } catch (Exception e2) {
            // ignore failures while closing
        }
    }
}
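For reference, the same merge can be written more compactly with java.nio.file.Files (Java 7+). This is a sketch of an equivalent, not the version used above; file_merge_nio is my own name, and it needs imports for java.nio.file.Files, java.nio.file.Paths and java.nio.file.StandardOpenOption:

public static void file_merge_nio(ArrayList<File> files, String dstFilePath) throws IOException {
    for (File file : files) {
        byte[] bytes = Files.readAllBytes(file.toPath()); // read one chapter fully into memory
        Files.write(Paths.get(dstFilePath), bytes,
                StandardOpenOption.CREATE, StandardOpenOption.APPEND); // append it to the book
    }
}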