1、获取列表
需要分析正确的接口url
请求参数:
start=180
&pageSize=90
&cityId=763
&salary=0,0
&workExperience=-1
&education=-1
&companyType=-1
&employmentType=-1
&jobWelfareTag=-1
&kw=Java%E5%BC%80%E5%8F%91 这里汉字需要转码
&kt=3 缺少该参数时请求会失败;推测 kt 表示关键词类型(keyword type,3 可能代表按职位名搜索),具体含义未确认,欢迎指正!!
public static void main(String[] args) {
    // Fetch one page of the job-list JSON from zhaopin's search API.
    // start/pageSize control paging, cityId is the city code, kw is the
    // URL-encoded keyword ("Java开发"); kt=3 is required by the API
    // (meaning unconfirmed — the request fails without it).
    String url = "https://fe-api.zhaopin.com/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3";
    Connection connect = Jsoup.connect(url).timeout(30000);
    connect.header("authority", "fe-api.zhaopin.com");
    connect.header("accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    // FIX: dropped the "path" header — it is an HTTP/2 pseudo-header shown by
    // browser devtools, not a real request header, so sending it does nothing.
    // FIX: dropped "Accept-Encoding: gzip, deflate, br" — jsoup cannot decode
    // Brotli ("br"); advertising it invites a Brotli-compressed body that shows
    // up as mojibake. jsoup negotiates and decompresses gzip on its own.
    connect.header("accept-language", "zh-CN,zh;q=0.9");
    connect.header("cache-control", "no-cache");
    connect.header("upgrade-insecure-requests", "1");
    connect.header("user-agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36")
            .ignoreContentType(true); // response is JSON, not HTML
    connect.method(Method.GET);
    try {
        Response response = connect.execute();
        System.out.println(response.body());
    } catch (IOException e1) {
        e1.printStackTrace(); // demo code; a real crawler should retry/log instead
    }
}
2、获得单个职位详情,在爬取中遇到html乱码问题
第一版:HTML乱码
public static void main(String[] args) {
String url = "https://jobs.zhaopin.com/CC322742114J00246383604.htm";
Connection connect = Jsoup.connect(url).timeout(30000);
connect.header("Host", "jobs.zhaopin.com");
connect.header("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
connect.header("path",
"/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3&=0");
connect.header("Accept-Encoding", "gzip, deflate, br");
connect.header("Accept-Language", "zh-CN,zh;q=0.9");
connect.header("Cache-Control", "no-cache");
connect.header("Connection", "keep-alive");
connect.header("upgrade-insecure-requests", "1");
connect.header("user-agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36")
.ignoreContentType(true);
connect.method(Method.GET);
try {
Response response = connect.execute();
Document parse = response.parse();
System.out.println(parse.toString());
String html = parse.html();
int index = html.indexOf("__INITIAL_STATE__=");不会使用正则
String msglist = html.substring(index, html.indexOf("</script>", index)).replace("__INITIAL_STATE__=", "");
System.out.println(msglist);
} catch (IOException e1) {
e1.printStackTrace();
}
}
第二版:网上找的解决乱码方法 ,但是无效
public static void main(String[] args) throws IOException {
    // Second attempt using HttpURLConnection directly.
    // FIX: the original "repair" — new String(doc.getBytes("ISO8859-1"), "UTF-8")
    // — only fixes a mislabeled charset; it can never undo a *compressed*
    // (gzip/Brotli) response body, which is why it had no effect. The right
    // move is to decompress the stream before parsing, and to actually use
    // the parsed result (the original discarded it and printed nothing).
    String urlstr = "http://jobs.zhaopin.com/CC322742114J00246383604.htm";
    URL url = new URL(urlstr);
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    connection.setRequestMethod("GET");
    connection.addRequestProperty("Host", "jobs.zhaopin.com");
    connection.addRequestProperty("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    connection.addRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
    connection.addRequestProperty("Cache-Control", "no-cache");
    connection.addRequestProperty("Connection", "keep-alive");
    connection.addRequestProperty("upgrade-insecure-requests", "1");
    connection.addRequestProperty("user-agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36");
    try {
        // No Accept-Encoding was requested, but some servers compress anyway:
        // honor the Content-Encoding response header before parsing.
        java.io.InputStream in = connection.getInputStream();
        if ("gzip".equalsIgnoreCase(connection.getContentEncoding())) {
            in = new java.util.zip.GZIPInputStream(in); // FIX: decompress instead of byte-juggling
        }
        Document parse = Jsoup.parse(in, "UTF-8", urlstr);
        System.out.println(parse.toString()); // FIX: original parsed but never used the result
    } catch (IOException e1) {
        e1.printStackTrace();
    } finally {
        connection.disconnect(); // FIX: release the underlying connection
    }
}
最终解决方法:把第一个版本中的 connect.header("Accept-Encoding", "gzip, deflate, br"); 注释掉,就不会再出现乱码。原因:jsoup(至少旧版本)不支持 Brotli(br)解压,手动声明该请求头后服务器可能返回 Brotli 压缩的响应体,jsoup 按文本解析自然就是乱码;不设置时 jsoup 默认只声明 gzip,并且能自行解压。
之所以会有第二版,是因为 Jsoup.connect(url) 返回的 Connection 无法直接为响应指定字符编码;但实际上乱码的根源是压缩格式而不是字符编码,所以改编码的办法注定无效。
爬虫是有时效性的,以上有错误,望指出!!!