// 现在获取数据大多都是用 Python，但是 Java 也可以。
package test;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * @ProjectName: spider01
 * @Package: test
 * @ClassName: LYGGongGongPiingTai
 * @Author:
 * @Description: Small jsoup-based scraper for spzx.lyg.gov.cn. It prints the links found
 *               on the first few numbered index pages, then prints the paragraph text of
 *               one sample detail page.
 * @Date: 2021/9/18 9:52 AM
 * @Version: 1.0
 */
public class LYGGongGongPiingTai {

    /** Site root; relative hrefs found in a page are resolved against it. */
    private static final String BASE_URL = "http://spzx.lyg.gov.cn/";

    /** Directory holding the numbered index pages ({@code 1.html}, {@code 2.html}, ...). */
    private static final String INDEX_URL_PREFIX = BASE_URL + "lygweb/jyxx/001007/001007002/";

    /** Detail page fetched by {@link #getlygweb()}. */
    private static final String SAMPLE_DETAIL_URL =
            INDEX_URL_PREFIX + "20210819/7570f01b-b069-437c-b343-c10988cfe082.html";

    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** First index page number that is crawled (inclusive). */
    private static final int FIRST_PAGE = 1;

    /** Last index page number that is crawled (inclusive). */
    private static final int LAST_PAGE = 4;

    /**
     * Downloads the response body of {@code url} as a string.
     *
     * <p>I/O failures are reported on {@code System.err} instead of being silently dropped.
     * The method still returns {@code null} in that case, so callers must check the result.
     *
     * @param url address to fetch
     * @return the response body, or {@code null} if the request failed with an
     *         {@link IOException}
     */
    public static String getHtml(String url) {
        try {
            return Jsoup.connect(url)
                    .ignoreContentType(true) // accept any Content-Type, not only HTML/XML
                    .timeout(TIMEOUT_MILLIS)
                    .execute()
                    .body();
        } catch (IOException e) {
            System.err.println("Failed to fetch " + url + ": " + e);
            return null;
        }
    }

    /**
     * Collects the targets of every {@code <a target="_blank">} link in {@code html}.
     *
     * <p>Relative hrefs are resolved against the site root; hrefs that are already absolute
     * are returned as they are. Anchors without a resolvable href are skipped.
     *
     * @param html page markup; may be {@code null} or empty (for example after a failed
     *             {@link #getHtml(String)} call)
     * @return a new mutable list of absolute URLs in document order; empty if {@code html}
     *         is {@code null} or contains no matching links
     */
    public static List<String> getList(String html) {
        List<String> list = new ArrayList<String>();
        if (html == null || html.isEmpty()) {
            return list;
        }
        // Parsing with a base URI lets absUrl() resolve relative hrefs correctly instead of
        // blindly prefixing the host (which produced "//" or "http://host/http://...").
        Document document = Jsoup.parse(html, BASE_URL);
        for (Element element : document.select("a")) {
            if (!"_blank".equals(element.attr("target"))) {
                continue;
            }
            String url = element.absUrl("href");
            if (!url.isEmpty()) {
                list.add(url);
            }
        }
        return list;
    }

    /**
     * Prints the links found on index pages {@code FIRST_PAGE} through {@code LAST_PAGE}.
     * A page that cannot be downloaded yields an empty list rather than aborting the run.
     */
    private static void geturl() {
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            System.out.println("当前运行页数" + page);
            String html = getHtml(INDEX_URL_PREFIX + page + ".html");
            System.out.println(getList(html));
        }
    }

    /**
     * Extracts the text of every {@code <p>} element in {@code html}.
     *
     * @param html page markup; may be {@code null} or empty
     * @return a new mutable list with one entry per paragraph, in document order; empty if
     *         {@code html} is {@code null} or contains no paragraphs
     */
    private static List<String> getlygwebList(String html) {
        List<String> list = new ArrayList<String>();
        if (html == null || html.isEmpty()) {
            return list;
        }
        Document document = Jsoup.parse(html);
        for (Element element : document.select("p")) {
            list.add(element.text());
        }
        return list;
    }

    /** Downloads the sample detail page and prints the text of its paragraphs. */
    private static void getlygweb() {
        String html = getHtml(SAMPLE_DETAIL_URL);
        System.out.println(getlygwebList(html));
    }

    /**
     * Runs both demo steps in order.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Step 1: collect the links from the index pages.
        geturl();
        // Step 2: simple text extraction from one detail page.
        getlygweb();
    }
}