packagexyz.yangchaojie.JSOUP.service;importjava.io.File;importjava.io.FileOutputStream;importjava.io.IOException;importjava.io.RandomAccessFile;importjava.util.List;importorg.jsoup.Jsoup;importorg.jsoup.nodes.Document;public class CrawTextThread extendsThread {
ListUrlList;public CrawTextThread(ListurlList) {this.UrlList =urlList;
}
String rule= "";
String rule_title= "h1";
String rule_content= "content";public static String PATH = "D:\\JSOUP\\";/*** 创建文件
*
*@paramfileName
*@return
*/
public static void createFile(File fileName) throwsException {try{if (!fileName.exists()) {
fileName.createNewFile();
}
}catch(Exception e) {
e.printStackTrace();
}
}public static void writeTxtFile(String content, File fileName) throwsException {
RandomAccessFile mm= null;
FileOutputStream o= null;try{
o= newFileOutputStream(fileName);
o.write(content.getBytes("UTF-8"));
o.close();
}catch(Exception e) {
e.printStackTrace();
}finally{if (mm != null) {
mm.close();
}
}
}
@Overridepublic voidrun() {
currentThread().setName("一个都别跑:");
String title;
String content;for(String url : UrlList) {try{
Document document= Jsoup.connect(url).timeout(6000).get();
title= document.select("h1").toString();
content= document.select("#content").html();
System.out.println("线程:"+currentThread().getName()+"爬取URL—>"+url);
File file= new File(PATH+title.replaceAll("
", "").replaceAll("
", "")+".txt");createFile(file);
System.out.println("创建文件:"+file.getPath());
writeTxtFile(FileterHtml(content), file);
}catch(IOException e) {
e.printStackTrace();
}catch(Exception e) {
e.printStackTrace();
}
}
}public staticString FileterHtml(String str) {return str.replaceAll(" ", "").replaceAll("
", "\r\n");
}
}