爬今日头条文章

之前做过一个爬取今日头条的需求,觉得这段代码挺有用的,就记录在这里。

	/**
	 * Looks up the author information for the given URL via {@code getMapByUrl}.
	 *
	 * <p>No exception escapes this method: on any failure a map holding a single
	 * {@code "MyError"} entry with a user-facing message is returned instead.
	 *
	 * @param url address passed unchanged to {@code getMapByUrl}
	 * @return whatever {@code getMapByUrl} returns on success, otherwise a map
	 *         containing only the {@code "MyError"} message
	 * @author XiaoMingHui
	 * @date 2017-7-31 3:37:30 PM
	 */
	@Override
	public Map<String, Object> getAuthorInformation(String url) {
		// Filled only on the failure paths below.
		Map<String, Object> failure = new HashMap<>();
		try {
			return getMapByUrl(url);
		} catch (UnknownHostException e) {
			// Treated as a bad URL supplied by the caller, hence only debug-level logging.
			logger.debug(
					"Class: ArticleSourceServiceImpl, Method: getAuthorInformation, 抓取数据的URL输入有误", e);
			failure.put("MyError", "所输入的URL有误,导致无法正确的得到数据, 请输入正确的URL");
		} catch (IOException e) {
			logger.error(
					"Class: ArticleSourceServiceImpl, Method: getAuthorInformation, io流抛出异常", e);
			failure.put("MyError", "未知的错误发生了,请联系后台工作人员。");
		} catch (Exception e) {
			// Last-resort boundary so callers always receive a map rather than an exception.
			logger.error(
					"Class: ArticleSourceServiceImpl, Method: getAuthorInformation, 抛出异常", e);
			failure.put("MyError", "未知的错误发生了,请联系后台工作人员。");
		}
		return failure;
	}


	
	/**
	 * Fetches a Toutiao author home page and extracts the author information
	 * embedded in it as a map.
	 *
	 * <p>The page is requested with a desktop-browser {@code User-Agent} header and
	 * the cookies supplied by {@code getCookies()}. The text between
	 * {@code "var userInfo = "} and the next {@code ";"} in the returned HTML is
	 * handed to {@code JsonUtil.toBean} for conversion.
	 *
	 * @param url address of the author's home page
	 * @return the map built by {@code JsonUtil.toBean} from the embedded
	 *         {@code userInfo} text
	 * @throws UnknownHostException presumably raised by the HTTP fetch when the host
	 *         in {@code url} cannot be resolved (the caller reports it as a bad URL)
	 * @throws IOException if the page cannot be fetched, or if it contains no
	 *         {@code var userInfo = ...;} assignment
	 * @author XiaoMingHui
	 * @date 2017-8-1 2:19:58 PM
	 */
	private Map<String, Object> getMapByUrl(String url) throws IOException, UnknownHostException {
		String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) "
				+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36";

		// Send the request header and cookies with the request. In the original these
		// two calls had been absorbed into line comments, so neither was ever applied.
		Connection connection = Jsoup.connect(url)
				.header("User-Agent", userAgent)
				.cookies(getCookies());

		Document content = connection.get();
		String html = content.html();

		// Cut the author information out of the page source by plain string matching.
		// NOTE(review): the match ends at the first ';' after the marker, so a ';'
		// inside the embedded data would truncate it - confirm against real pages.
		String json = StringUtils.substringBetween(html, "var userInfo = ", ";");
		if (json == null) {
			// Marker not present (layout change, block page, wrong URL): fail explicitly
			// instead of handing null to the JSON converter.
			throw new IOException("No \"var userInfo = ...;\" assignment found in page: " + url);
		}
		return JsonUtil.toBean(json, Map.class);
	}

/**
	 * Builds the cookies intended to accompany requests to Toutiao.
	 *
	 * <p>All values are hard-coded; a fresh mutable map is created on every call.
	 * This method was declared twice with identical bodies, which does not compile,
	 * so only a single definition is kept.
	 *
	 * <p>NOTE(review): the values look like they were captured from one browser
	 * session in 2017 and may no longer be accepted - confirm before relying on them.
	 *
	 * @return a new map from cookie name to cookie value
	 * @author XiaoMingHui
	 * @date 2017-8-1 2:21:44 PM
	 */
	private Map<String, String> getCookies() {
		Map<String, String> cookies = new HashMap<>();
		cookies.put("UM_distinctid",
				"15d604b19ca3a2-0e6085b9900ead-1c197450-1fa400-15d604b19cb7b8");
		cookies.put("uuid", "w:a881896fbbd446bf9fc7c0c97434e78f");
		cookies.put("OUTFOX_SEARCH_USER_ID_NCOO", "1631109872.7671177");
		cookies.put("csrftoken", "59bd6e9979d9a0e447159c09c69ec182");
		// The WEATHER_CITY cookie ("%E5%8C%97%E4%BA%AC") was already disabled in the original.
		cookies.put("_ga", "GA1.2.1128248025.1501125011");
		cookies.put("_gid", "GA1.2.1377010915.1501125011");
		cookies.put("__utmt", "1");
		cookies.put("__utma",
				"24953151.1128248025.1501125011.1501125474.1501125474.1");
		cookies.put("__utmb", "24953151.8.10.1501125474");
		cookies.put("__utmc", "24953151");
		cookies.put("__utmz",
				"24953151.15011254741.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)");
		cookies.put("tt_webid", "6444852342776890893");
		cookies.put("CNZZDATA1259612802", "478646998-1500557369-%7C1501121800");
		return cookies;
	}

 

	

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 5
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值