一个简单的java爬网页
主要使用了jsoup工具
这是百度到的jsoup中文使用手册,很方便的一款java爬虫工具,maven引入依赖即可
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
爬取页面
Jsoup.connect("要爬取的页面的网址").get() 返回的 Document 对象就是爬取到的网页,再通过 select("截取规则") 对页面内容进行信息过滤。
页面分析很重要:若页面中有多条记录,应先找出这些记录共有的最小一级结构,再按这一结构编写过滤规则。
我爬取的这个页面需要携带含有用户信息的cookie,并且从分页的第一页开始爬取(page=页码数,这是我观察到的规则),所以使用
Document doc = Jsoup.connect("http://******?page=" + i + "***").cookie("登录保存的sessionid", "session值").get(); 其中的i为页码数
同样也通过select过滤页面
第一次写博客,微末之见。
// A highlighted block
package com.jr.success;
import com.jr.demo.pojo.TestDemo;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* @author: jr
* @date: created in 2019/5/9 10:38
*/
public class PageTest {
/**
 * Entry point: determines how many listing pages exist, crawls all of them, prints the
 * collected rows, and then crawls the detail page of every row by its id.
 *
 * <p>Any {@link IOException} raised while crawling is printed as a stack trace and ends the run.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    try {
        // Total number of listing pages, read from the first page.
        int totalPages = jsoupPageNumTest();
        // Crawl every listing page into row objects.
        List<TestDemo> rows = jsoupPageTest(totalPages);
        System.out.println("testDemos = " + rows);
        // Visit the detail page of each row; the detail data is only printed, not stored yet.
        for (int idx = 0; idx < rows.size(); idx++) {
            jsoupSonPageTest(rows.get(idx).getId());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Fetches the first listing page and reads the total number of pages from its pager element.
 *
 * <p>The text of {@code div[id="datatable-tabletools_info"]} is split on single spaces and the
 * second token is parsed as the page count (the author describes the text as "共 *** 页",
 * separated by one space).
 *
 * @return the total number of listing pages
 * @throws IOException if the page cannot be fetched, or if the pager text has no second token
 *     or that token is not an integer
 */
public static int jsoupPageNumTest() throws IOException {
    // The page count is taken from the first page of the listing.
    Document docOne = Jsoup.connect("http://******?page=1***").cookie("登录保存的sessionid", "session值").get();
    String text = docOne.select("div[id=\"datatable-tabletools_info\"]").text();
    System.out.println("text = " + text);

    String[] s = text.split(" ");
    if (s.length < 2) {
        // Report through the declared exception type instead of an ArrayIndexOutOfBoundsException,
        // so the caller's IOException handling also covers an unexpected page layout.
        throw new IOException("Unexpected pager text, cannot locate page count: '" + text + "'");
    }

    final int pageNum;
    try {
        pageNum = Integer.parseInt(s[1]);
    } catch (NumberFormatException e) {
        throw new IOException("Page count is not a number in pager text: '" + text + "'", e);
    }
    System.out.println("pageNum = " + pageNum);
    return pageNum;
}
/**
 * Crawls the first-level listing pages {@code 1..pageNum} and turns every table row into a
 * {@link TestDemo}.
 *
 * <p>For each {@code tr} under {@code tbody}, column 0 supplies the entity id (read from the
 * {@code data-entityId} attribute of an {@code input}); columns 1-9 supply the text of the
 * cell's {@code div} and are stored in the matching field. Any further column is only printed
 * under the label {@code oldClose}. The listing table has no header names, so the field names
 * were chosen by the author.
 *
 * @param pageNum number of the last listing page to crawl (pages start at 1)
 * @return one {@link TestDemo} per table row, in page and row order
 * @throws IOException if any listing page cannot be fetched
 */
public static List<TestDemo> jsoupPageTest(int pageNum) throws IOException {
    List<TestDemo> collected = new ArrayList<TestDemo>();
    int page = 1;
    while (page <= pageNum) {
        // One request per listing page.
        Document listing = Jsoup.connect("http://********?page=" + page + "*****")
                .cookie("登录保存的sessionid", "session值")
                .get();
        for (Element row : listing.select("tbody").select("tr")) {
            Elements cells = row.select("td");
            TestDemo record = new TestDemo();
            for (int col = 0; col < cells.size(); col++) {
                if (col == 0) {
                    // The id is looked up across the row's cells, not only inside the first one.
                    String id = cells.select("input[data-entityId]").attr("data-entityId");
                    System.out.println("id = " + id);
                    record.setId(id);
                    continue;
                }
                // Every other column is read the same way; only the target field differs.
                String value = cells.get(col).select("td div").text();
                String label;
                switch (col) {
                    case 1:
                        record.setAid(value);
                        label = "aid";
                        break;
                    case 2:
                        record.setAname(value);
                        label = "aname";
                        break;
                    case 3:
                        record.setAtype(value);
                        label = "atype";
                        break;
                    case 4:
                        record.setAresource(value);
                        label = "aresource";
                        break;
                    case 5:
                        record.setRecordDate(value);
                        label = "recordDate";
                        break;
                    case 6:
                        record.setApplicationDate(value);
                        label = "applicationDate";
                        break;
                    case 7:
                        record.setMediator(value);
                        label = "mediator";
                        break;
                    case 8:
                        record.setStatus(value);
                        label = "status";
                        break;
                    case 9:
                        record.setWrit(value);
                        label = "writ";
                        break;
                    default:
                        // Columns past the ninth are printed but not stored.
                        label = "oldClose";
                        break;
                }
                System.out.println(label + " = " + value);
            }
            // The row is complete; it could be persisted to a file or database from here.
            collected.add(record);
        }
        page++;
    }
    return collected;
}
/**
 * Crawls the detail page of one entity together with the pages nested beneath it, printing
 * every named field it encounters.
 *
 * <p>Steps performed:
 * <ol>
 *   <li>Fetch the detail page for {@code entityId} and select {@code div form}.</li>
 *   <li>For each {@code content-box} module, print {@code name= text} for every
 *       {@code div[name]} inside its {@code form-group} blocks.</li>
 *   <li>For each {@code div[ng-include]} in the tab content, drop the first and last character
 *       of the attribute value (presumably surrounding quotes), print the result, and dispatch
 *       on the third dot-separated segment of it (cut at the first {@code '@'}):
 *       {@code list1}, {@code list2} or {@code list3}. Other keys are ignored.</li>
 * </ol>
 *
 * <p>An {@code ng-include} value that is too short to unquote, or that has fewer than three
 * dot-separated segments, is reported on {@code System.err} and skipped instead of aborting
 * the crawl with an unchecked exception.
 *
 * @param entityId id of the entity whose detail page is crawled
 * @throws IOException if a page cannot be fetched; for {@code list1} entries an
 *     {@link HttpStatusException} is only propagated after all three candidate addresses failed
 */
public static void jsoupSonPageTest(String entityId) throws IOException {
    Document sonDoc = Jsoup.connect("******" + entityId + "***").cookie("登录保存的sessionid", "session值").get();
    Elements sonElements1 = sonDoc.select("div form");
    System.out.println("==================================================================");

    // Entity ids found under each list; collected here but not consumed yet.
    List<String> list1 = new ArrayList<String>();
    List<String> list2 = new ArrayList<String>();
    List<String> list3 = new ArrayList<String>();

    // Split the page into modules, then each module into form-group blocks.
    Elements eles = sonElements1.select("div[class=\"content-box\"]");
    for (Element ele : eles) {
        Elements ele_info = ele.select("div[class=\"form-group\"]");
        for (Element element : ele_info) {
            Elements names = element.select("div[name]");
            for (Element na : names) {
                String name = na.attr("name");
                System.out.println(name + "= " + na.text());
            }
        }
        System.out.println("==================================================================");
    }
    System.out.println("==================================================================");

    // Lowest level: links to the nested pages are carried in ng-include attributes.
    Elements eleSon = sonElements1.select("div[class=\"tab-content\"]").select("div[name]");
    Elements select = eleSon.select("div[ng-include]");
    for (Element element : select) {
        String attr = element.attr("ng-include");
        if (attr.length() < 2) {
            // substring(1, length - 1) would throw for such a value.
            System.err.println("Skipping ng-include value too short to unquote: '" + attr + "'");
            continue;
        }
        // Attribute value without its first and last character; computed once and reused.
        String path = attr.substring(1, attr.length() - 1);
        System.out.println(path);
        System.out.println(" ==================================================================================== ");

        String[] split1 = path.split("\\.");
        if (split1.length < 3) {
            System.err.println("Skipping ng-include value without a third dot-separated segment: '" + path + "'");
            continue;
        }
        // Take the segment up to the first '@'. indexOf avoids the empty array that
        // String.split returns when the segment consists only of '@' characters.
        String segment = split1[2];
        int at = segment.indexOf('@');
        String split2 = at < 0 ? segment : segment.substring(0, at);
        System.out.println("split2 = " + split2);

        if ("list1".equals(split2)) {
            String link = "http://******" + path;
            Document docTest = Jsoup.connect(link).cookie("登录保存的sessionid", "session值").get();
            Elements inputs = docTest.select("input[data-entityId]");
            for (Element input : inputs) {
                String inputEntityId = input.attr("data-entityId");
                System.out.println("inputEntityId = " + inputEntityId);
                list1.add(inputEntityId);
                // Three candidate addresses exist; a wrong one answers with an
                // HttpStatusException, so they are tried one after another.
                try {
                    String link2 = "******" + inputEntityId + "****";
                    list1Page(link2);
                } catch (HttpStatusException e) {
                    try {
                        String link2 = "******" + inputEntityId + "****";
                        // NOTE(review): list1sPage is not defined in the visible part of this
                        // class — confirm it exists or which method was intended.
                        list1sPage(link2);
                    } catch (HttpStatusException e1) {
                        String link2 = "*******" + inputEntityId + "***";
                        list1Page(link2);
                    }
                }
            }
        } else if ("list2".equals(split2)) {
            String link = "*****" + path;
            List<String> inputEntityIds = getInputEntityId(link);
            if (inputEntityIds != null) {
                for (String inputEntityId : inputEntityIds) {
                    list2.add(inputEntityId);
                    String link2 = "******" + inputEntityId + "***";
                    otherPage(link2);
                }
            }
        } else if ("list3".equals(split2)) {
            // Here the nested page provides the ids used to build the redirect address.
            String link = "******" + path;
            List<String> inputEntityIds = getInputEntityId(link);
            if (inputEntityIds != null) {
                for (String inputEntityId : inputEntityIds) {
                    System.out.println("inputEntityId = " + inputEntityId);
                    list3.add(inputEntityId);
                    String link2 = "*********&entityId=" + inputEntityId + "";
                    otherPage(link2);
                }
            }
        }
    }
    System.out.println("==============================================");
    System.out.println();
}
/**
 * Crawls a last-level page that is treated as a single module.
 *
 * <p>The page is split by {@code div[class="form-group"]}; for every {@code div[name]} found
 * in a group, a line {@code name= text} is printed. A separator line is printed at the end.
 *
 * @param link address of the page to fetch
 * @throws IOException if the page cannot be fetched
 * @throws HttpStatusException if the server answers with an error status; callers use this to
 *     try an alternative address
 */
public static void list1Page(String link) throws IOException, HttpStatusException {
    Document page = Jsoup.connect(link).cookie("登录保存的sessionid", "session值").get();
    Elements groups = page.select("div[class=\"form-group\"]");
    for (int g = 0; g < groups.size(); g++) {
        Elements namedFields = groups.get(g).select("div[name]");
        for (int f = 0; f < namedFields.size(); f++) {
            Element field = namedFields.get(f);
            System.out.println(field.attr("name") + "= " + field.text());
        }
    }
    System.out.println("==================================================================");
}
/**
 * Crawls one of the other last-level pages.
 *
 * <p>These pages are first split into modules by {@code div[class="content-box"]}, each module
 * is split by {@code div[class="form-group"]}, and for every {@code div[name]} in a group a
 * line {@code name= text} is printed.
 *
 * @param link address of the page to fetch
 * @throws IOException if the page cannot be fetched
 */
public static void otherPage(String link) throws IOException {
    Document page = Jsoup.connect(link).cookie("登录保存的sessionid", "session值").get();
    // Module level.
    Elements modules = page.select("div[class=\"content-box\"]");
    for (int m = 0; m < modules.size(); m++) {
        Elements groups = modules.get(m).select("div[class=\"form-group\"]");
        for (int g = 0; g < groups.size(); g++) {
            Elements namedFields = groups.get(g).select("div[name]");
            for (int f = 0; f < namedFields.size(); f++) {
                Element field = namedFields.get(f);
                System.out.println(field.attr("name") + "= " + field.text());
            }
        }
    }
}
/**
 * Fetches a page and collects the {@code data-entityId} value of every
 * {@code input[data-entityId]} on it, printing each value as it is found.
 *
 * @param link address of the page to fetch
 * @return the entity ids in document order; empty (never {@code null}) when the page has none
 * @throws IOException if the page cannot be fetched
 */
private static List<String> getInputEntityId(String link) throws IOException {
    Elements idInputs = Jsoup.connect(link)
            .cookie("登录保存的sessionid", "session值")
            .get()
            .select("input[data-entityId]");
    List<String> ids = new ArrayList<String>();
    for (int idx = 0; idx < idInputs.size(); idx++) {
        String id = idInputs.get(idx).attr("data-entityId");
        System.out.println("inputEntityId = " + id);
        ids.add(id);
    }
    return ids;
}
}