java扒取微博网络数据

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

	// Profile page to scrape. Other profile IDs that were tried: 3081728031, 5723344072.
	String strURL = "https://weibo.com/u/3502967407";
	URL url = new URL(strURL);
	HttpURLConnection httpConn = (HttpURLConnection) url.openConnection();
		// Both timeouts default to 0 (wait forever); bound them so a stalled server cannot hang the caller.
		httpConn.setConnectTimeout(10000);
		httpConn.setReadTimeout(15000);
		// Key point 1: the page is only served with a valid Cookie header. Without it the response is a
		// nearly empty page that looks nothing like what the browser's developer tools (F12) show.
		// The value below was copied from a browser session.
		// NOTE(review): this is a hard-coded session cookie; it will expire and should not live in
		// source control. Consider loading it from configuration instead.
		String c="SINAGLOBAL=2525797642447.1143.1576751690811; UOR=,,login.sina.com.cn; TC-V5-G0=595b7637c272b28fccec3e9d529f251a; SSOLoginState=1585210218; Ugrow-G0=7e0e6b57abe2c2f76f677abd9a9ed65d; wvr=6; _s_tentry=weibo.com; Apache=7211436044072.67.1585211180994; ULV=1585211181930:3:2:1:7211436044072.67.1585211180994:1583116766636; SUB=_2AkMp3ULYf8PxqwJRmfkcyG7la4R0ygjEieKfgbMDJRMxHRl-yT9jqk8GtRB6Al1sKDCUM-bsv44hS2JWofGDBG0WLLhQ; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5gfVwXwLLzATj6ArcV1q7i; TC-Page-G0=62b98c0fc3e291bc0c7511933c1b13ad|1585565168|1585565167";
		httpConn.setRequestProperty("Cookie", c);
		// NOTE(review): "charset" is sent as a literal request header name here; the standard header
		// for this purpose is "Accept-Charset". Kept unchanged to preserve the request as originally sent.
		httpConn.setRequestProperty("charset", "utf-8");
		// Read the whole response body as UTF-8 text. try-with-resources closes the reader
		// (and the underlying connection stream) even if reading fails part-way.
		StringBuilder contentBuf = new StringBuilder();
		try (BufferedReader bufReader = new BufferedReader(
				new InputStreamReader(httpConn.getInputStream(), "utf-8"))) {
			String line;
			while ((line = bufReader.readLine()) != null) {
				// readLine() strips the line terminator, so lines are joined with no separator.
				contentBuf.append(line);
			}
		}

		String buf = contentBuf.toString();
		System.out.println(buf);
		Document document = Jsoup.parse(buf);
		// Key point 2: the visible page content is not in the static HTML. It is delivered inside
		// several <script> blocks, each carrying a JSON-like payload whose "html" field holds an
		// escaped HTML fragment. Different parts of the page come from different scripts, so the
		// important step is picking the right script; here that is done by looking for the module
		// id in the text that precedes the "html" field.
		Elements elements = document.select("script");
		for (Element element : elements) {
			// Limit -1 keeps trailing empty strings, so index 0 always exists.
			String s1 = element.data().split("<script>FM.view", -1)[0];

			// Split once around the "html" field. With limit -1 a payload that ends right after the
			// marker yields an empty second part instead of a missing one.
			String[] parts = s1.split("\"html\":\"", -1);
			if (parts.length < 2) {
				// This script carries no "html" payload.
				continue;
			}

			String head = parts[0];
			boolean isHeaderModule = head.contains("Pl_Official_Headerv6__1");
			boolean isCounterModule = head.contains("Pl_Core_T8CustomTriColumn__3");
			if (!isHeaderModule && !isCounterModule) {
				continue;
			}

			// Undo the string escaping: drop literal \t, \n, \r sequences and unescape \" and \/.
			String content = parts[1]
					.replaceAll("(\\\\t|\\\\n|\\\\r)", "")
					.replaceAll("\\\\\"", "\"")
					.replaceAll("\\\\/", "/");
			// Drop the last 13 characters when the payload is longer than that.
			// NOTE(review): presumably this removes the payload's closing tail — confirm against a live response.
			int end = content.length() <= 13 ? content.length() : content.length() - 13;
			content = content.substring(0, end);
			Document fragment = Jsoup.parse(content);

			if (isHeaderModule) {
				// Profile header: nickname text and avatar image URL.
				Elements headerphoto = fragment.getElementsByClass("photo");
				Elements username = fragment.getElementsByClass("username");
				String nickName = username.text();
				String img_url = headerphoto.attr("src");
				// These values could be stored on a model object here, e.g. w.setNickname(nickName) / w.setImg_url(img_url).
				System.out.println(nickName);
				System.out.println(img_url);
			}

			if (isCounterModule) {
				// Counter column: the numbers use one of several font-size classes, so try each in turn.
				Elements data = fragment.getElementsByClass("W_f14");
				if (data.size() == 0) {
					data = fragment.getElementsByClass("W_f16");
				}
				if (data.size() == 0) {
					data = fragment.getElementsByClass("W_f18");
				}
				// The value of interest is the second counter; skip it when the markup does not
				// provide one instead of failing with IndexOutOfBoundsException.
				if (data.size() > 1) {
					String fun = data.get(1).text();
					// Could be stored on a model object here, e.g. w.setFan_num(fun).
					System.out.println(fun);
				}
			}
		}

以上代码仅供借鉴,请勿完全照搬;粘贴时外层的闭合大括号(以及类、方法声明)可能不完整。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值