Step 1: Analyze the web page
Find the pattern in the URLs and work out how each page maps to its URL. You also need to work out which request headers are required to get the response you want, and what each of them does.
Pagination is usually controlled by a URL parameter.
51job pagination:
// Keyword "大数据" (big data): 1177 pages, 58848 job postings in total
for (int i = 1; i <= 1177; i++) {
    String url = "https://search.51job.com/list/000000,000000,0000,01,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2," + i + ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
    getHtml(url);
}
Baidu Images pagination:
url='https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord='+word+'&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word='+word+'&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&cg=girl&rn=30&gsm=&1575687587077=&pn='+page
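A minimal Java sketch of walking these pages (assumptions: the keyword is already URL-encoded, pn advances by the page size rn=30, only some of the query parameters are carried over, and getSource is the CrawlerUtil helper shown later in this post):
String word = "%E5%A4%A7%E6%95%B0%E6%8D%AE";   // URL-encoded keyword, assumption: "大数据"
for (int page = 0; page < 10; page++) {        // number of pages is an assumption
    int pn = page * 30;                        // pn is the record offset; rn=30 records per page
    String pageUrl = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ie=utf-8&oe=utf-8"
            + "&queryWord=" + word + "&word=" + word + "&istype=2&rn=30&pn=" + pn;
    String json = CrawlerUtil.getSource(pageUrl);
    // parse the returned JSON for the image URLs here
}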
Getting past anti-crawler measures
This is usually solved by setting request headers.
Disguising the request as a browser is a must:
con.setRequestProperty("user-agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
Other request headers:
To get past JD.com's anti-crawler check, set "Referer" to "https://list.jd.com/list.html":
URL urlObj = new URL(url);
// 1. Establish the HTTP connection
HttpURLConnection con = (HttpURLConnection) urlObj.openConnection();
// 2. Configure the request headers
con.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
con.setRequestProperty("Sec-Fetch-Mode", "no-cors");
con.setRequestProperty("Referer", "https://list.jd.com/list.html");
// 3. Open the connection
con.connect();
For Baidu Images, the anti-crawler check is passed by setting the request header:
Referer: http://image.baidu.com
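On the HttpURLConnection from the snippet above, that is one more header set before connect():
con.setRequestProperty("Referer", "http://image.baidu.com");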
Step 2: With the page downloaded, extract the useful information
Using the jsoup library
package com.jsoup.gethtml;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
public class Get51Job {
    public static void getHtml(String url) {
        try {
            Document doc = Jsoup.connect(url).get();
            // Each job title link on the list page: <p class="t1"><a href="...">
            Elements links = doc.select("p.t1 a");
            for (Element link : links) {
                String jobUrl = link.attr("href");
                getDetails(jobUrl);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public static void getDetails(String jobUrl) {
        // Example detail pages:
        // https://jobs.51job.com/shenzhen-nsq/117347710.html?s=01&t=0
        // https://jobs.51job.com/hangzhou/112946708.html?s=01&t=0
        // https://jobs.51job.com/guangzhou/114204332.html?s=01&t=0
        try {
            String txt = "";
            Document doc = Jsoup.connect(jobUrl).get();
            // Job title and salary, e.g. "维保销售主任 10-15万/年" (Maintenance Sales Supervisor, 100-150k RMB/year)
            Elements h1s = doc.select("div.cn h1");
            Elements money = doc.select("div.cn strong");
            for (Element e : h1s) {
                txt += e.text() + "`";
            }
            for (Element e : money) {
                txt += e.text() + "`";
            }
            // Company name, e.g. 通力电梯有限公司 (KONE Elevators)
            Elements comp = doc.select("a.catn");
            for (Element e : comp) {
                txt += e.text() + "`";
            }
            // Location | experience | headcount | post date, e.g. "合肥 | 无工作经验 | 招1人 | 09-24发布"
            Elements addrs = doc.select("p.msg");
            int y = 0;
            for (Element e : addrs) {
                String[] split = e.text().split("\\|");
                for (String s : split) {
                    y++;
                    txt += s + "`";
                }
            }
            // Pad to a fixed number of columns so every record has the same width
            if (y < 7) {
                for (int i = 0; i < 7 - y; i++) {
                    txt += "`";
                }
            }
            // Benefits, e.g. social insurance and housing fund, supplementary medical insurance, free shuttle, company trips, meal allowance, performance bonus, regular check-ups
            Elements welfares = doc.select("span.sp4");
            for (int i = 0; i < welfares.size(); i++) {
                if (i == welfares.size() - 1) {
                    txt += welfares.get(i).text();
                    continue;
                }
                txt += welfares.get(i).text() + "|";
            }
            txt += ",";
            // Company information
            Elements comp1 = doc.select("p.at");
            for (int i = 0; i < comp1.size(); i++) {
                if (i == comp1.size() - 1) {
                    txt += comp1.get(i).text();
                    continue;
                }
                txt += comp1.get(i).text() + "`";
            }
            // Only keep records with enough parsed fields
            if (y > 4) {
                txt += "\n";
                writeTxt(txt, "D://reptile/51jobs_new2.txt");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    public static void writeTxt(String str, String path) throws IOException {
        File txt = new File(path);
        if (!txt.exists()) {
            txt.createNewFile();
        }
        byte[] bytes = str.getBytes();
        int b = bytes.length;   // length in bytes, not in characters
        FileOutputStream fos = new FileOutputStream(path, true);   // true = append
        fos.write(bytes, 0, b);
        fos.close();
    }
    public static void main(String[] args) {
        // Keyword "大数据": 1177 result pages in total
        for (int i = 1; i <= 1177; i++) {
            String url = "https://search.51job.com/list/000000,000000,0000,01,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2," + i + ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=";
            getHtml(url);
        }
    }
}
JSON-formatted responses
The request utility CrawlerUtil:
package com.hopu.bigdata.home;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
public class CrawlerUtil {
    public static String getSource(String url) {
        return getSource(url, "utf8");
    }
    public static String getSource(String url, String charset) {
        String src = "";
        try {
            URL urlObj = new URL(url);
            // 1. Establish the HTTP connection
            HttpURLConnection con = (HttpURLConnection) urlObj.openConnection();
            // 2. Configure the request headers
            con.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
            // 3. Open the connection
            con.connect();
            // Check whether the request succeeded
            if (con.getResponseCode() == 200) {
                // 4. On success, read the page source
                InputStream is = con.getInputStream();
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                byte[] bytes = new byte[1 << 20];
                int len;
                while ((len = is.read(bytes)) != -1) {
                    baos.write(bytes, 0, len);
                }
                // characters = bytes + charset
                src = new String(baos.toByteArray(), charset);
                baos.close();
                is.close();
                return src;
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return src;
    }
    public static String getSource(String url, String charset, String header) {
        String src = "";
        try {
            URL urlObj = new URL(url);
            // 1. Establish the HTTP connection
            HttpURLConnection con = (HttpURLConnection) urlObj.openConnection();
            // 2. Configure the request headers
            con.setRequestProperty("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36");
            con.setRequestProperty("Sec-Fetch-Mode", "no-cors");
            // Use the caller-supplied Referer, e.g. "https://list.jd.com/list.html" for JD
            con.setRequestProperty("Referer", header);
            // 3. Open the connection
            con.connect();
            // Check whether the request succeeded
            if (con.getResponseCode() == 200) {
                // 4. On success, read the page source
                InputStream is = con.getInputStream();
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                byte[] bytes = new byte[1 << 20];
                int len;
                while ((len = is.read(bytes)) != -1) {
                    baos.write(bytes, 0, len);
                }
                // characters = bytes + charset
                src = new String(baos.toByteArray(), charset);
                baos.close();
                is.close();
                return src;
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return src;
    }
}
Use the utility to fetch the JSON response, then extract values from it:
public static void getJobs(String url) {
    // e.g. http://s.cjol.com/service/joblistjson.aspx?&ListType=2&page=2
    String jsonStr = CrawlerUtil.getSource(url);
    // A JSONObject behaves like a Map: use get() to read a property's value.
    JSONObject jsonObject = new JSONObject(jsonStr);
    Object jobListHtml = jsonObject.get("JobListHtml");
    String html = jobListHtml.toString();
    Document doc = Jsoup.parse(html);
    Elements li = doc.select("li");
    int i = 1;
    String txt = "";
    for (Element l : li) {
        txt += l.text();
        // Every 8 <li> elements make up one job record
        if (i % 8 == 0) {
            txt += "\n";
        } else {
            txt += "\t";
        }
        i++;
    }
    try {
        writeTxt(txt, "D://jobs.txt");
    } catch (IOException e) {
        e.printStackTrace();
    }
}
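To crawl more than one page, the same page parameter from Step 1 applies; a sketch (the total page count here is an assumption):
for (int page = 1; page <= 10; page++) {   // 10 pages is an assumption; read the real total from the site
    getJobs("http://s.cjol.com/service/joblistjson.aspx?&ListType=2&page=" + page);
}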
Step 3: Storage
TXT file:
public static void writeTxt(String str, String path) throws IOException {
    File txt = new File(path);
    if (!txt.exists()) {
        txt.createNewFile();
    }
    byte[] bytes = str.getBytes();
    int b = bytes.length;   // length in bytes, not in characters
    FileOutputStream fos = new FileOutputStream(path, true);   // true = append
    fos.write(bytes, 0, b);
    fos.close();
}
Writing to a database
MySQL
HBase
...
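A minimal JDBC sketch for the MySQL route (the class name JobDao, the jobs table, its columns, and the connection settings are all assumptions); it could replace the writeTxt call in getDetails:
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
public class JobDao {
    // Connection settings are assumptions; adjust to your own database
    private static final String JDBC_URL = "jdbc:mysql://localhost:3306/reptile?useUnicode=true&characterEncoding=utf8";
    public static void saveJob(String title, String salary, String company, String city) {
        // Hypothetical table: jobs(title, salary, company, city)
        String sql = "INSERT INTO jobs (title, salary, company, city) VALUES (?, ?, ?, ?)";
        try (Connection conn = DriverManager.getConnection(JDBC_URL, "root", "password");
             PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setString(1, title);
            ps.setString(2, salary);
            ps.setString(3, company);
            ps.setString(4, city);
            ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}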
Summary
Crawling is actually fairly simple, just tedious: every site has its own pitfalls to step around and a pile of small problems to work through. Of course, practice makes perfect.