第一步:导入jar
jsoup-1.8.1.jar
第一种:不需要登录,不需要传参,爬取数据:
/**
 * Fetches the Baidu News home page and prints the text of every child of the
 * first child of the first element whose class is {@code hotnews}.
 *
 * <p>This page needs neither a login nor request parameters. If the page
 * cannot be fetched, or the expected markup is missing, a diagnostic is
 * written and the method returns instead of failing with an unchecked
 * exception.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Address to request.
    String url = "http://news.baidu.com/";
    // Connection object for that address.
    Connection conn = Jsoup.connect(url);
    Document dt;
    try {
        // Fetch the page and parse it into a document.
        dt = conn.get();
    } catch (IOException e) {
        // Nothing to parse without a document; stop here rather than
        // dereferencing a null document further down.
        e.printStackTrace();
        return;
    }
    // All elements whose class is "hotnews".
    Elements et1 = dt.getElementsByClass("hotnews");
    // The markup is outside our control: make sure the container and its
    // first child exist before indexing into them.
    if (et1.isEmpty() || et1.get(0).children().isEmpty()) {
        System.err.println("No hotnews content found at " + url);
        return;
    }
    // Walk the children of the first child of the first hotnews element.
    for (Element temp : et1.get(0).child(0).children()) {
        System.out.println(temp.text());
    }
}
第二种:不需要登录,但需要传参,爬取数据:
/**
 * Posts a sentence to a word-segmentation web page with form parameters and
 * prints the text of the element whose id is {@code result}.
 */
public class Test2 {
    /**
     * Sends the POST request, parses the response body as HTML and prints the
     * text of the {@code result} element. I/O failures are reported via the
     * stack trace; a response without a {@code result} element is reported on
     * standard error.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Address to request.
        String url = "http://www.78901.net/participle/?ac=done";
        Connection conn = Jsoup.connect(url);
        // Form parameters sent with the request.
        conn.data("source", "我们是不是世界上最可爱的人");
        conn.data("do_fork", "1");
        conn.data("do_unit", "1");
        conn.data("Submit", "分词");
        try {
            // Send as POST and obtain the raw response.
            Connection.Response res = conn.method(Connection.Method.POST).execute();
            // Response body as a string.
            String resBody = res.body();
            // Parse the string into a document.
            Document dt = Jsoup.parse(resBody);
            // getElementById returns null when no such element exists.
            Element et = dt.getElementById("result");
            if (et == null) {
                System.err.println("No element with id \"result\" in the response from " + url);
                return;
            }
            System.out.println(et.text());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
第三种:需要登录,需要传参,爬取数据:
需要抓包程序Fiddler辅助(用于抓取浏览器登录请求的表头、cookies和表单参数)
FiddlerSetup-5.0.20173.50948.exe
/**
 * Logs in to liepin.com by replaying a captured login request (headers,
 * cookies and form fields), then fetches the default-resume page with the
 * cookies returned by the login response and prints it.
 *
 * <p>Output, in order: the login response body wrapped as HTML, the resume
 * page document, and the class name of every element that carries both the
 * {@code icons32} and {@code icons32-clipboard} classes. If either request
 * fails, the stack trace is printed and the method returns.
 *
 * <p>NOTE(review): the account name, password hash and session cookies are
 * hard-coded below; they look like values captured from one real session and
 * should be supplied externally before this is shared or reused.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // URL the login form is submitted to.
    String url = "https://passport.liepin.com/c/login.json?__mn__=user_login";
    Connection conn = Jsoup.connect(url);

    // Request headers replayed from the captured browser request.
    // Host and Content-Length are deliberately not set: they are derived per
    // request by the HTTP layer, and the captured values (passport host,
    // 93-byte body) would be wrong for the GET to c.liepin.com further down.
    Map<String, String> headMap = new HashMap<String, String>();
    headMap.put("Connection", "keep-alive");
    headMap.put("Accept", "application/json, text/javascript, */*; q=0.01");
    headMap.put("Origin", "https://passport.liepin.com");
    headMap.put("X-Requested-With", "XMLHttpRequest");
    headMap.put("X-Alt-Referer", "https://www.liepin.com/");
    headMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36");
    headMap.put("Content-Type", "application/x-www-form-urlencoded");
    headMap.put("Referer", "https://passport.liepin.com/ajaxproxy.html");
    // Only advertise gzip: a body compressed with an encoding the client
    // cannot decode (e.g. br) would come back unreadable.
    headMap.put("Accept-Encoding", "gzip");
    headMap.put("Accept-Language", "zh-CN,zh;q=0.9");
    conn.headers(headMap);

    // Cookies replayed from the captured browser request.
    Map<String, String> ckMap = new HashMap<String, String>();
    ckMap.put("abtest", "0");
    ckMap.put("__uuid", "1538983647573.34");
    ckMap.put("_uuid", "5292D6DCFFA84819242C7BB1ABB83B4D");
    // NOTE(review): "2f566a3" and "2f566a36" carry the same value; one of
    // the two keys may be a transcription slip - confirm against the capture.
    ckMap.put("2f566a3", "cdc5746f42806e63ab6864e0d09b475f");
    ckMap.put("user_kind", "0");
    ckMap.put("is_lp_user", "true");
    ckMap.put("c_flag", "a6aeea41dd6386851883e2ef2cb9a75c");
    ckMap.put("need_bind_tel", "false");
    ckMap.put("gr_user_id", "8c86904b-9d0d-4ae8-909f-659ec2d7dc7a");
    ckMap.put("imClientId", "688f4029e0b428c0f3037b9a20b21a71");
    ckMap.put("imId", "688f4029e0b428c09918c151b38ebcd5");
    ckMap.put("_fecdn_", "1");
    ckMap.put("__tlog", "1539073468015.35%7C00000000%7C00000000%7C00000000%7C00000000");
    ckMap.put("Hm_lvt_a2647413544f5a04f00da7eee0d5e200", "1538983648,1539073468");
    ckMap.put("_mscid", "00000000");
    ckMap.put("2f566a36", "cdc5746f42806e63ab6864e0d09b475f");
    ckMap.put("user_vip", "0");
    ckMap.put("new_user", "false");
    ckMap.put("gr_session_id_bad1b2d9162fab1f80dde1897f7a2972", "6706e5be-5b3a-4efa-ad68-f63143da711b");
    ckMap.put("gr_cs1_6706e5be-5b3a-4efa-ad68-f63143da711b", "UniqueKey%3A92796b64014b83861a667e00ad0d9493");
    ckMap.put("user_photo", "55557f3b28ee44a8919620ce01a.gif");
    ckMap.put("openChatWin", "");
    ckMap.put("verifycode", "7021bb9b5e7b4170a5cc51e651bc9ab0");
    ckMap.put("__session_seq", "5");
    ckMap.put("Hm_lpvt_a2647413544f5a04f00da7eee0d5e200", "1539074615");
    conn.cookies(ckMap);

    // Login form fields. The password is sent in the already-encoded form
    // that was captured, so no decoding is needed here.
    conn.data("user_pwd", "1dfcfc1ee80bda7ae9ee7cf0d4a1d704");
    conn.data("version", "");
    conn.data("user_login", "13551868112");
    conn.data("chk_remember_pwd", "on");

    Connection.Response res;
    try {
        // POST the login form. ignoreContentType(true) lets a non-HTML
        // response through (the endpoint name suggests JSON).
        res = conn.ignoreContentType(true).method(Connection.Method.POST).execute();
    } catch (IOException e) {
        // Without a login response there are no session cookies to reuse.
        e.printStackTrace();
        return;
    }
    // Print the login response body, wrapped as an HTML document.
    String resBody = res.body();
    Document dt = Jsoup.parse(resBody);
    System.out.println(dt.html());

    // Page to crawl once logged in.
    String newUrl = "https://c.liepin.com/resume/getdefaultresume/";
    Connection conn2 = Jsoup.connect(newUrl);
    conn2.headers(headMap);
    // Reuse the cookies handed back by the login response.
    conn2.cookies(res.cookies());

    Document dd;
    try {
        // Execute the GET exactly once and parse that single response.
        dd = conn2.ignoreContentType(true).method(Connection.Method.GET).execute().parse();
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    // Elements carrying both classes; a compound selector is used because a
    // class-name lookup takes one class name, not a space-separated list.
    Elements el = dd.select(".icons32.icons32-clipboard");
    System.out.println(dd);
    for (Element ss : el) {
        System.out.println(ss.className());
    }
}