爬虫

第一步:导入jar
jsoup-1.8.1.jar

第一种:不需要登录,不需要传参,爬取数据:

/**
 * Fetches the Baidu News front page and prints the text of every child of
 * the first child of the first element whose class is {@code hotnews}.
 *
 * <p>Nothing is printed to standard output when the page cannot be fetched
 * or when it contains no such element; a diagnostic goes to standard error
 * instead.
 *
 * @param args unused
 */
public static void main(String[] args) {
	// Address to request.
	String url = "http://news.baidu.com/";
	Document dt;
	try {
		// Issue a GET request and parse the response into a document.
		dt = Jsoup.connect(url).get();
	} catch (IOException e) {
		// Without a document there is nothing to parse, so stop here
		// instead of dereferencing a null document below.
		e.printStackTrace();
		return;
	}
	// All elements carrying the class "hotnews".
	Elements et1 = dt.getElementsByClass("hotnews");
	// The page layout is outside our control: guard both index lookups so a
	// changed layout yields a message rather than an IndexOutOfBoundsException.
	if (et1.isEmpty() || et1.get(0).children().isEmpty()) {
		System.err.println("No hotnews entries found at " + url);
		return;
	}
	// Walk the children of the first child of the first hotnews element.
	for (Element temp : et1.get(0).child(0).children()) {
		System.out.println(temp.text());
	}
}

第二种:不需要登录,但需要传参,爬取数据:

// Calls a word-segmentation web page with form parameters and prints the
// text of the element that holds the result.
public class Test2 {

	/**
	 * Sends a POST request with four form fields to the segmentation page,
	 * parses the response body as HTML and prints the text of the element
	 * whose id is {@code result}.
	 *
	 * <p>If the request fails the stack trace is printed; if the response
	 * has no {@code result} element a diagnostic goes to standard error.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		// Address to request.
		String url = "http://www.78901.net/participle/?ac=done";
		Connection conn = Jsoup.connect(url);
		// Form fields submitted with the request.
		conn.data("source", "我们是不是世界上最可爱的人");
		conn.data("do_fork", "1");
		conn.data("do_unit", "1");
		conn.data("Submit", "分词");
		try {
			// Submit as POST and keep the raw response.
			Connection.Response res = conn.method(Connection.Method.POST).execute();
			// Response body as a string (HTML here; could be JSON for other endpoints).
			String resBody = res.body();
			Document dt = Jsoup.parse(resBody);
			// getElementById returns null when no element has that id, so
			// check before reading its text.
			Element et = dt.getElementById("result");
			if (et == null) {
				System.err.println("No element with id \"result\" in the response from " + url);
				return;
			}
			System.out.println(et.text());
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

第三种:需要登录,需要传参,爬取数据:
需要抓包程序Fiddler辅助(用来抓取浏览器登录时提交的URL、表头、cookies和表单参数,下面代码中的值均来自抓包结果)
FiddlerSetup-5.0.20173.50948.exe

/**
 * Logs in to liepin.com with a captured login request, then fetches the
 * default-resume page using the cookies returned by the login response.
 *
 * <p>Steps: POST the login form (with captured headers and cookies) and
 * print the response; GET the resume page once with the login cookies;
 * print the page and the class name of every element that carries both
 * classes {@code icons32} and {@code icons32-clipboard}.
 *
 * <p>If either request fails, the stack trace is printed and the method
 * returns without continuing.
 *
 * @param args unused
 */
public static void main(String[] args) {
	// URL the login form is submitted to.
	String url = "https://passport.liepin.com/c/login.json?__mn__=user_login";
	Connection conn = Jsoup.connect(url);

	// Request headers copied from the captured browser request.
	// Host and Content-Length are intentionally not set: they depend on the
	// actual URL and body of each request, and this map is reused below for
	// a GET to a different host, where the captured values would be wrong.
	Map<String, String> headMap = new HashMap<String, String>();
	headMap.put("Connection", "keep-alive");
	headMap.put("Accept", "application/json, text/javascript, */*; q=0.01");
	headMap.put("Origin", "https://passport.liepin.com");
	headMap.put("X-Requested-With", "XMLHttpRequest");
	headMap.put("X-Alt-Referer", "https://www.liepin.com/");
	headMap.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36");
	headMap.put("Content-Type", "application/x-www-form-urlencoded");
	headMap.put("Referer", "https://passport.liepin.com/ajaxproxy.html");
	// NOTE(review): this advertises deflate and br; confirm the jsoup
	// version in use can decode every encoding listed here.
	headMap.put("Accept-Encoding", "gzip, deflate, br");
	headMap.put("Accept-Language", "zh-CN,zh;q=0.9");
	conn.headers(headMap);

	// Cookies copied from the captured browser session.
	Map<String, String> ckMap = new HashMap<String, String>();
	ckMap.put("abtest", "0");
	ckMap.put("__uuid", "1538983647573.34");
	ckMap.put("_uuid", "5292D6DCFFA84819242C7BB1ABB83B4D");
	// NOTE(review): "2f566a3" looks like a truncated copy of "2f566a36"
	// below (same value) — confirm against the capture before removing.
	ckMap.put("2f566a3", "cdc5746f42806e63ab6864e0d09b475f");
	ckMap.put("user_kind", "0");
	ckMap.put("is_lp_user", "true");
	ckMap.put("c_flag", "a6aeea41dd6386851883e2ef2cb9a75c");
	ckMap.put("need_bind_tel", "false");
	ckMap.put("gr_user_id", "8c86904b-9d0d-4ae8-909f-659ec2d7dc7a");
	ckMap.put("imClientId", "688f4029e0b428c0f3037b9a20b21a71");
	ckMap.put("imId", "688f4029e0b428c09918c151b38ebcd5");
	ckMap.put("_fecdn_", "1");
	ckMap.put("__tlog", "1539073468015.35%7C00000000%7C00000000%7C00000000%7C00000000");
	ckMap.put("Hm_lvt_a2647413544f5a04f00da7eee0d5e200", "1538983648,1539073468");
	ckMap.put("_mscid", "00000000");
	ckMap.put("2f566a36", "cdc5746f42806e63ab6864e0d09b475f");
	ckMap.put("user_vip", "0");
	ckMap.put("new_user", "false");
	ckMap.put("gr_session_id_bad1b2d9162fab1f80dde1897f7a2972", "6706e5be-5b3a-4efa-ad68-f63143da711b");
	ckMap.put("gr_cs1_6706e5be-5b3a-4efa-ad68-f63143da711b", "UniqueKey%3A92796b64014b83861a667e00ad0d9493");
	ckMap.put("user_photo", "55557f3b28ee44a8919620ce01a.gif");
	ckMap.put("openChatWin", "");
	ckMap.put("verifycode", "7021bb9b5e7b4170a5cc51e651bc9ab0");
	ckMap.put("__session_seq", "5");
	ckMap.put("Hm_lpvt_a2647413544f5a04f00da7eee0d5e200", "1539074615");
	conn.cookies(ckMap);

	// Login form fields. The password is sent exactly as captured (already
	// hashed by the site's page), so it is not transformed here.
	// NOTE(review): credentials are hard-coded in source; move them to
	// configuration or arguments before sharing or committing this code.
	conn.data("user_pwd", "1dfcfc1ee80bda7ae9ee7cf0d4a1d704");
	conn.data("version", "");
	conn.data("user_login", "13551868112");
	conn.data("chk_remember_pwd", "on");

	Connection.Response res;
	try {
		// Submit the login as POST; the endpoint answers with JSON, so the
		// content-type check must be disabled.
		res = conn.ignoreContentType(true).method(Connection.Method.POST).execute();
	} catch (IOException e) {
		// No login response means no session cookies: stop instead of
		// dereferencing a null response below.
		e.printStackTrace();
		return;
	}
	// Print the login response body (wrapped in a parsed document, as before).
	Document dt = Jsoup.parse(res.body());
	System.out.println(dt.html());

	// Page to scrape once logged in.
	String newUrl = "https://c.liepin.com/resume/getdefaultresume/";
	Connection conn2 = Jsoup.connect(newUrl);
	conn2.headers(headMap);
	// Authenticate with the cookies the login response handed back.
	conn2.cookies(res.cookies());

	Document dd;
	try {
		// Fetch the page exactly once with GET and parse that response;
		// previously the same URL was requested twice and the first
		// response was discarded.
		Connection.Response res2 = conn2.ignoreContentType(true).method(Connection.Method.GET).execute();
		dd = res2.parse();
	} catch (IOException e) {
		e.printStackTrace();
		return;
	}
	System.out.println(dd);
	// getElementsByClass expects a single class name, so passing
	// "icons32 icons32-clipboard" never matched; a CSS selector expresses
	// "has both classes".
	Elements el = dd.select(".icons32.icons32-clipboard");
	for (Element ss : el) {
		System.out.println(ss.className());
	}
}
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值