之前做过一个抓取今日头条作者信息的需求,觉得挺有用的,就记录下来。
/**
 * Looks up author information for the given URL by delegating to {@code getMapByUrl}.
 *
 * <p>Failures are never propagated to the caller: any exception is logged and reported
 * through a map holding a single {@code "MyError"} entry with a user-facing message.
 *
 * @param url address passed through to {@code getMapByUrl}
 * @return whatever {@code getMapByUrl} returns on success, otherwise a map whose
 *         {@code "MyError"} entry describes the failure
 */
@Override
public Map<String, Object> getAuthorInformation(String url) {
    final String logPrefix = "Class: ArticleSourceServiceImpl, Method: getAuthorInformation, ";
    final String unknownFailure = "未知的错误发生了,请联系后台工作人员。";

    // Only handed back when the lookup fails; a successful lookup returns directly.
    Map<String, Object> failure = new HashMap<>();
    try {
        return getMapByUrl(url);
    } catch (UnknownHostException e) {
        // Host could not be resolved: reported to the user as a bad URL, logged at debug level.
        failure.put("MyError", "所输入的URL有误,导致无法正确的得到数据, 请输入正确的URL");
        logger.debug(logPrefix + "抓取数据的URL输入有误", e);
    } catch (IOException e) {
        failure.put("MyError", unknownFailure);
        logger.error(logPrefix + "io流抛出异常", e);
    } catch (Exception e) {
        // Boundary catch-all so callers always get a map rather than an exception.
        failure.put("MyError", unknownFailure);
        logger.error(logPrefix + "抛出异常", e);
    }
    return failure;
}
/**
 * Fetches a Toutiao author home page and turns the author information embedded in it
 * into a map.
 *
 * <p>The page is requested with a browser-like {@code User-Agent} header and the cookies
 * supplied by {@link #getCookies()}. The text between {@code "var userInfo = "} and the
 * next {@code ";"} in the returned HTML is then handed to {@code JsonUtil.toBean}.
 *
 * @param url address of the author's home page
 * @return the author information parsed from the page
 * @throws UnknownHostException propagated from the HTTP request (declared separately so
 *         the caller can report it as a bad URL)
 * @throws IOException if the request fails, or the page contains no
 *         {@code var userInfo = ...;} fragment
 */
private Map<String, Object> getMapByUrl(String url) throws IOException, UnknownHostException {
    Connection connection = Jsoup.connect(url);

    // Request header: present the request as coming from a desktop browser.
    String tKey = "User-Agent";
    String tValue = "Mozilla/5.0 (Windows NT 6.1; WOW64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrom"
            + "e/60.0.3100.0 Safari/537.36";
    connection.header(tKey, tValue);

    // Cookies sent along with the request.
    connection.cookies(getCookies());

    Document content = connection.get();
    String html = content.html();

    // Cut the author information out of the page source by plain string matching.
    // NOTE(review): the match ends at the first ';' after the marker, so a ';' inside the
    // JSON value itself would truncate it — confirm against real page content.
    String json = StringUtils.substringBetween(html, "var userInfo = ", ";");
    if (json == null) {
        // substringBetween yields null when either delimiter is missing; surface that as an
        // I/O failure instead of passing null on to the JSON parser.
        throw new IOException("No 'var userInfo = ...;' fragment found in page: " + url);
    }
    return JsonUtil.toBean(json, Map.class);
}
/**
 * Builds the fixed set of cookies used when requesting Toutiao pages.
 *
 * <p>A new mutable map is created on every call, so callers may modify the result freely.
 *
 * <p>NOTE(review): all values are hard-coded literals (they look like a captured browser
 * session, including a {@code csrftoken}) — confirm they are still accepted by the site
 * and whether they should live in configuration instead.
 *
 * @return a new map from cookie name to cookie value
 */
private Map<String, String> getCookies() {
    Map<String, String> cookies = new HashMap<>();
    cookies.put("UM_distinctid",
            "15d604b19ca3a2-0e6085b9900ead-1c197450-1fa400-15d604b19cb7b8");
    cookies.put("uuid", "w:a881896fbbd446bf9fc7c0c97434e78f");
    cookies.put("OUTFOX_SEARCH_USER_ID_NCOO", "1631109872.7671177");
    cookies.put("csrftoken", "59bd6e9979d9a0e447159c09c69ec182");
    // cookies.put("WEATHER_CITY", "%E5%8C%97%E4%BA%AC");
    cookies.put("_ga", "GA1.2.1128248025.1501125011");
    cookies.put("_gid", "GA1.2.1377010915.1501125011");
    cookies.put("__utmt", "1");
    cookies.put("__utma",
            "24953151.1128248025.1501125011.1501125474.1501125474.1");
    cookies.put("__utmb", "24953151.8.10.1501125474");
    cookies.put("__utmc", "24953151");
    cookies.put("__utmz",
            "24953151.15011254741.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)");
    cookies.put("tt_webid", "6444852342776890893");
    cookies.put("CNZZDATA1259612802", "478646998-1500557369-%7C1501121800");
    return cookies;
}