无聊的星期一，老姐要我找一本网上的小说，我看网上没有可下载的版本，就写了一个爬虫把内容写入 txt 文件，很简单。
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Calendar;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Crawls a serialized novel on tywx.com chapter by chapter and saves every chapter
 * (title followed by body text) into a single local text file.
 *
 * <p>Crawling starts at a fixed first-chapter page and follows each page's
 * "next chapter" link until a page is missing (HTTP 404), a page does not contain the
 * chapter header element, or a page offers no further link.
 *
 * @author memode
 */
public class Test_1 {

    /** Base URL of the novel; the href of each next-chapter link is appended to it. */
    private static final String BASE_URL = "http://www.tywx.com/ty109892/";

    /** Numeric page id of the first chapter to download. */
    private static final int FIRST_PAGE = 5846664;

    /** Destination file; it is truncated and rewritten on every run. */
    private static final String OUTPUT_PATH = "c:\\test.txt";

    /** Browser-like user agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.21 (KHTML, like Gecko) Chrome/19.0.1042.0 Safari/535.21";

    /** Timeout of the status probe, in milliseconds. */
    private static final int PROBE_TIMEOUT_MILLIS = 15000;

    /** Timeout of the actual page download, in milliseconds. */
    private static final int FETCH_TIMEOUT_MILLIS = 5000;

    /**
     * Downloads all chapters reachable from the first chapter page and writes them,
     * UTF-8 encoded, to {@link #OUTPUT_PATH}. Progress (chapter title and next URL),
     * the number of chapters written and the elapsed time are printed to standard output.
     *
     * <p>I/O failures (network or file) do not propagate: they are reported on the
     * console and end the crawl; chapters written so far are kept.
     */
    public static void atiricle() {
        long startMillis = System.currentTimeMillis();
        int chapter = 0; // number of chapters written so far
        String pageUrl = BASE_URL + FIRST_PAGE + ".html";
        File file = new File(OUTPUT_PATH);

        // try-with-resources guarantees the stream is flushed and closed on every exit path.
        // An explicit charset keeps the Chinese text intact regardless of the platform default.
        try (PrintStream ps = new PrintStream(new FileOutputStream(file), false, "UTF-8")) {
            while (true) {
                // Probe the status first. ignoreHttpErrors(true) is required: by default jsoup
                // throws on 4xx/5xx, so the status code could never be inspected.
                // NOTE(review): this costs one extra request per chapter; a single
                // Connection.Response (statusCode() + parse()) would avoid it once
                // org.jsoup.Connection is imported.
                int status = Jsoup.connect(pageUrl)
                        .userAgent(USER_AGENT)
                        .timeout(PROBE_TIMEOUT_MILLIS)
                        .ignoreHttpErrors(true)
                        .execute()
                        .statusCode();
                if (status == 404) {
                    break; // page does not exist: end of the novel
                }

                // Any other HTTP error surfaces here as an IOException.
                Document newsdoc = Jsoup.connect(pageUrl)
                        .userAgent(USER_AGENT)
                        .timeout(FETCH_TIMEOUT_MILLIS)
                        .get();

                // A page without the "kfyd" element is not treated as a chapter page.
                if (null == newsdoc.getElementsByAttributeValue("class", "kfyd").first()) {
                    break;
                }

                // Chapter title.
                String title = newsdoc.getElementsByAttributeValue("class", "kfyd").first()
                        .select("h1").text();
                System.out.println(title);

                // Link to the next chapter. Working on the element list (instead of first())
                // yields "" rather than a NullPointerException when #thumb or the link is absent.
                // This must happen before the content extraction below, which detaches nodes.
                String nextHref = newsdoc.getElementsByAttributeValue("id", "thumb")
                        .select("#pager_next").attr("href");

                // Chapter body.
                String body = cleanHtml(
                        newsdoc.getElementsByAttributeValue("id", "content").select("div").remove().html());

                chapter++;
                // Write title and body.
                ps.append(title + "\n\n");
                ps.append(body + "\n");

                if (nextHref.isEmpty()) {
                    break; // no further chapter advertised
                }
                String nextUrl = BASE_URL + nextHref;
                if (nextUrl.equals(pageUrl)) {
                    break; // a page linking to itself would loop forever
                }
                pageUrl = nextUrl;
                System.out.println(pageUrl);
            }
            // PrintStream never throws on write errors; it only records them.
            if (ps.checkError()) {
                System.err.println("write error: " + file);
            }
        } catch (IOException e) {
            System.out.println("网络异常 net error!");
            System.err.println(e); // keep the cause visible instead of discarding it
        }

        System.out.println("已下载" + chapter + "章");
        System.out.println(formatElapsed(System.currentTimeMillis() - startMillis));
    }

    /**
     * Turns the raw HTML of a chapter body into text for the output file: script elements
     * are blanked out, a pair of {@code <br>} tags separated by a newline becomes a single
     * newline, and non-breaking spaces become ordinary spaces.
     *
     * @param html inner HTML of the content element
     * @return the filtered text
     */
    private static String cleanHtml(String html) {
        return html
                .replaceAll("<.*?script[^>]*?>[\\s\\S]*?<\\/.*?script.*?>*", " ") // strip scripts
                .replaceAll("(?i)<br[^>]*>\n<br>", "\n") // collapse double line breaks
                // The serialized HTML is expected to carry non-breaking spaces as the &nbsp;
                // entity; the raw character is handled as well in case it is emitted as-is.
                .replace("&nbsp;", " ")
                .replace('\u00A0', ' ');
    }

    /**
     * Formats a duration as total minutes, seconds and milliseconds. Plain arithmetic is
     * used because a Calendar would apply the default time zone offset and wrap at one hour.
     *
     * @param elapsedMillis duration in milliseconds
     * @return the message printed at the end of a run
     */
    private static String formatElapsed(long elapsedMillis) {
        long minutes = elapsedMillis / 60000L;
        long seconds = (elapsedMillis / 1000L) % 60L;
        long millis = elapsedMillis % 1000L;
        return "耗时: " + minutes + "分 " + seconds + "秒 " + millis + " 毫秒";
    }

    /**
     * Program entry point; runs the crawler once.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        atiricle();
    }
}