package lib;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Crawls an HTML page and the pages linked from its first classed table,
 * then stores the collected row texts both as a serialized list and as a
 * plain-text file.
 */
public class Main {

    /** Marker line appended after the rows of each followed sub-page. */
    private static final String SECTION_SEPARATOR = "*********************************************";

    /** Connection/read timeout handed to jsoup, in milliseconds. */
    private static final int FETCH_TIMEOUT_MILLIS = 600000;

    /**
     * Recursively collects the text of every table row reachable from {@code url}.
     *
     * <p>The page is fetched, the first {@code table} element carrying a
     * {@code class} attribute is selected, and every row except the first one
     * is processed: its text is appended to the result and, if the row
     * contains a link, the linked page is crawled the same way. The rows of a
     * followed page are terminated by a line of asterisks.
     *
     * <p>Links are resolved against the directory part of {@code url}
     * (everything up to and including its last {@code '/'}).
     *
     * @param url address of the page to crawl
     * @return the collected row texts and separator lines, in visiting order;
     *         empty if the page could not be fetched or has no matching table
     */
    public static List<String> listAll(String url) {
        List<String> list = new ArrayList<>();

        Document doc;
        try {
            doc = Jsoup.connect(url).timeout(FETCH_TIMEOUT_MILLIS).get();
        } catch (IOException | RuntimeException e) {
            // Best effort: report the failure and contribute nothing for this
            // page instead of dereferencing a null document below.
            e.printStackTrace();
            return list;
        }

        Elements tables = doc.select("table[class]");
        if (tables.isEmpty()) {
            // Nothing to extract; avoids an IndexOutOfBoundsException on get(0).
            return list;
        }

        Elements rows = tables.get(0).select("tr");
        String baseUrl = url.substring(0, url.lastIndexOf("/") + 1);

        // Index 0 is skipped on purpose, as in the original traversal.
        for (int j = 1; j < rows.size(); j++) {
            list.add(rows.get(j).text());

            String href = rows.get(j).select("td").select("a").attr("href");
            if (href.trim().length() > 0) {
                String childUrl = baseUrl + href;
                System.out.println("continue:" + childUrl);
                list.addAll(listAll(childUrl));
                list.add(SECTION_SEPARATOR);
            }
        }
        return list;
    }

    /**
     * Returns the last whitespace-separated token of {@code line}.
     *
     * @param line text to split on runs of whitespace
     * @return the last token, or an empty string if splitting yields no tokens
     */
    private static String lastToken(String line) {
        String[] parts = line.split("\\s+");
        return parts.length == 0 ? "" : parts[parts.length - 1];
    }

    /**
     * Crawls a fixed start URL, serializes the collected list to a binary
     * file, and writes a text file containing the last token of every
     * collected line (separator lines are written unchanged).
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/13.html"; // URL to fetch
        long start = System.currentTimeMillis();
        List<String> list = listAll(url);

        // try-with-resources closes both streams even when writing fails.
        try (FileOutputStream fileOut = new FileOutputStream(new File("/序列化存放目录/tibet.bin"));
                ObjectOutputStream objectOutputStream = new ObjectOutputStream(fileOut)) {
            objectOutputStream.writeObject(list);
            objectOutputStream.flush();
        } catch (IOException e2) {
            e2.printStackTrace();
        }

        System.out.println("耗时:" + (System.currentTimeMillis() - start));
        System.out.println("开始写入文档");

        // NOTE(review): PrintWriter(File) encodes with the platform default
        // charset; confirm that is what consumers of data.txt expect.
        try (PrintWriter out = new PrintWriter(new File("/文件存放目录/data.txt"))) {
            for (String e : list) {
                if (!e.contains("***")) {
                    out.println(lastToken(e));
                } else {
                    out.println(e);
                }
            }
            out.flush();
        } catch (FileNotFoundException e1) {
            // The output file could not be opened; report it rather than
            // failing later on a null writer.
            e1.printStackTrace();
        }
    }
}