本文是对上一篇程序的补充:同时登录多个账号采集所有信息,并分别保存到不同的表中。下面只贴出修改过的代码,其余部分请参考上一篇博客。
由于各个账号的数据需要相互区分,调用方法时会额外传入一个账号标记(数组下标)作为参数。全局变量及方法调用的变化如下:
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Calendar;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;
public static String responseCookie[] = {"","",""};
public static int i[]={0,0,0},x[]={0,0,0};//i是用来记录发布的职位数,x是用来记录报名人员的人数
/**
 * Logs in with each hard-coded account in turn, writes that account's job list to
 * {@code C:\resume\DoumiCareer(<index>)<year>.<month>.<day>.xlsx}, and then calls
 * {@link #GetAllEnrollInfo(int)} to collect the applicants of those jobs.
 *
 * <p>The account index {@code j} is also the index into the static arrays
 * {@code responseCookie}, {@code i} and {@code x}, so those arrays must have at least as many
 * slots as there are login strings below.
 *
 * @param args unused
 * @throws IOException if a network request or file operation fails
 * @throws RowsExceededException if the sheet row limit is exceeded while adding cells
 * @throws WriteException if the workbook cannot be written
 * @throws BiffException if the job workbook cannot be read back by {@code GetAllEnrollInfo}
 */
public static void main(String[] args) throws IOException, RowsExceededException, WriteException, BiffException {
    Calendar c = Calendar.getInstance();
    int year = c.get(Calendar.YEAR);
    int month = c.get(Calendar.MONTH) + 1; // Calendar.MONTH is zero-based
    int date = c.get(Calendar.DATE);
    // One form-encoded login body per account.
    String ss[] = {"phone_login=123&passwd_login=123",
                   "phone_login=456&passwd_login=456",
                   "phone_login=789&passwd_login=789"};
    for (int j = 0; j < ss.length; j++) {
        // The file name carries the account index and today's date.
        String path = "C:\\resume\\DoumiCareer(" + j + ")" + year + "." + month + "." + date + ".xlsx";
        WritableWorkbook wb = Workbook.createWorkbook(new File(path));
        try {
            WritableSheet ws = wb.createSheet("Sheet1", 0);
            // Header row.
            ws.addCell(new Label(0, 0, "职位名"));
            ws.addCell(new Label(1, 0, "职位ID"));
            ws.addCell(new Label(2, 0, "发布城市"));
            ws.addCell(new Label(3, 0, "报名链接"));
            i[j]++; // the next free row is now the one after the header

            // GetCookie stores the session cookie for this account and returns the first listing page.
            StringBuffer str = GetCookie(ss[j], j);
            // Parse the current page; the return value is the next-page link, or null on the last page.
            String hrefs = LookAllCareer(str, ws, wb, j);
            while (hrefs != null) {
                // The link is taken from HTML, so "&amp;" has to become "&".
                String href = hrefs.replaceAll("amp;", "");
                System.out.println(href);
                hrefs = LookAllCareer(GetHTML(href, j), ws, wb, j);
            }
            wb.write();
        } finally {
            // Release the file handle even when login or scraping fails part-way.
            wb.close();
        }
        // Re-reads the workbook written above and collects the applicants of every job in it.
        GetAllEnrollInfo(j);
    }
}
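//Fetches the page content behind the given link, sending the stored Cookie with the request
//Returns the page content obtained with the Cookie and the URL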
/**
 * Downloads the page at {@code href}, sending the cookie stored for account {@code r}.
 *
 * @param href absolute URL to fetch; it must be an HTTP(S) URL because the connection is cast
 *             to {@link HttpURLConnection}
 * @param r    account index into {@code responseCookie}
 * @return the response body decoded as UTF-8, each line terminated by the platform line separator
 * @throws UnsupportedEncodingException declared for signature compatibility with existing callers
 * @throws IOException if the connection cannot be opened or read
 */
public static StringBuffer GetHTML(String href,int r) throws UnsupportedEncodingException, IOException{
    URL url = new URL(href);
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    // Send the cookie obtained at login so the server treats the request as authenticated.
    connection.setRequestProperty("Cookie", responseCookie[r]);

    String lineSeparator = System.getProperty("line.separator");
    StringBuffer content = new StringBuffer();
    BufferedReader br = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));
    try {
        for (String line = br.readLine(); line != null; line = br.readLine()) {
            content.append(line).append(lineSeparator);
        }
    } finally {
        // Always release the underlying stream, also when reading fails half-way.
        br.close();
    }
    return content;
}
//Obtains the global Cookie value and returns the page content of the employer home page
//Fills in the global cookie variable
/**
 * Posts the login form, stores every cookie the server sets into {@code responseCookie[r]},
 * and then fetches {@code https://vip.doumi.com/managecenter} with those cookies.
 *
 * @param str form-encoded login body (for example {@code phone_login=...&passwd_login=...})
 * @param r   account index into {@code responseCookie}; cookies are appended to the existing value
 * @return the body of the management-center page decoded as UTF-8, each line terminated by the
 *         platform line separator
 * @throws IOException if either request fails
 */
public static StringBuffer GetCookie(String str,int r) throws IOException{
    URL url = new URL("https://vip.doumi.com/employer/user/ajaxlogin");
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    connection.setDoOutput(true); // allow a request body
    connection.setRequestMethod("POST");
    connection.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
    connection.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
    connection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
    connection.setRequestProperty("Cache-Control", "max-age=0");
    connection.setRequestProperty("Connection", "keep-alive");
    connection.setRequestProperty("Host", "vip.doumi.com");
    connection.setRequestProperty("Referer", "http://www.doumi.com/wh/");
    connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36");

    OutputStream os = connection.getOutputStream();
    try {
        os.write(str.getBytes("UTF-8"));
        os.flush();
    } finally {
        os.close();
    }

    // Collect the "name=value" part of every Set-Cookie header. Header index 0 is skipped, as in
    // the original loop.
    for (int idx = 1; ; idx++) {
        String key = connection.getHeaderFieldKey(idx);
        if (key == null) {
            break;
        }
        if (!key.equalsIgnoreCase("set-cookie")) {
            continue;
        }
        String cookieVal = connection.getHeaderField(idx);
        // A cookie without attributes has no ';'; use the whole value instead of failing in substring.
        int semicolon = cookieVal.indexOf(';');
        if (semicolon >= 0) {
            cookieVal = cookieVal.substring(0, semicolon);
        }
        responseCookie[r] = responseCookie[r] + cookieVal + ";";
    }
    // Kept from the original flow; the status value itself is not used.
    connection.getResponseCode();

    // Second request: load the management center as the logged-in user.
    URL url1 = new URL("https://vip.doumi.com/managecenter");
    HttpURLConnection connection1 = (HttpURLConnection) url1.openConnection();
    connection1.setRequestProperty("Cookie", responseCookie[r]);

    String lineSeparator = System.getProperty("line.separator");
    StringBuffer content1 = new StringBuffer();
    BufferedReader br1 = new BufferedReader(new InputStreamReader(connection1.getInputStream(), "UTF-8"));
    try {
        for (String line = br1.readLine(); line != null; line = br1.readLine()) {
            content1.append(line).append(lineSeparator);
        }
    } finally {
        br1.close();
    }
    return content1;
}
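//Extracts the details of every job on the page, writes them as cells into the DoumiCareer sheet, and returns the next-page link as a string
//Matches against the page content and returns the address of the next page; called repeatedly to collect all published jobs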
/**
 * Scans one job-listing page, appends one row per recognised job to {@code sheet}
 * (title, second captured field, third captured field, link of the first action button),
 * and reports where the next listing page is.
 *
 * @param strb  HTML of the listing page
 * @param sheet sheet that receives the rows, starting at row {@code i[ll]}
 * @param wb    workbook owning {@code sheet}; not used by this method
 * @param ll    account index into the static row counter {@code i}
 * @return the href that follows the active pagination item, or {@code null} when the page has
 *         no pagination box or no such link
 * @throws RowsExceededException if the sheet cannot take more rows
 * @throws WriteException if a cell cannot be added
 * @throws IOException declared for callers; not thrown by the code visible here
 */
public static String LookAllCareer(StringBuffer strb,WritableSheet sheet,WritableWorkbook wb,int ll) throws RowsExceededException, WriteException, IOException{
    // Whole pagination box of the page.
    Pattern pagerPattern = Pattern.compile("<div class=\"pageBox\">[\\s]*?<ul class=\"pagination\">[\\s\\S]*?</ul>[\\s]*?</div>");
    // The three parts of one job entry: title, detail list, action buttons.
    Pattern titlePattern = Pattern.compile("<div class=\"bList-item-t\">[\\s]*?<a.*?>([^\n]*)</a>");
    Pattern detailPattern = Pattern.compile("<div class=\"bList-item-mid\">[\\s]*?<ul.*?>[\\s]*?<li>[\\s\\S]*?</li>[\\s]*?<li>[\\s\\S]*?</li>[\\s\\S]*?<li class=\"pubTime\">[\\s\\S]*?</li>[\\s]*?<li>[\\s\\S]*?</li>[\\s]*?<li>[\\s\\S]*?</li>[\\s]*?</ul>[\\s]*?</div>");
    Pattern actionPattern = Pattern.compile("<div class=\"bList-item-opBtn\">[\\s]*?<a.*?>([^\n]*)</a>[\\s]*?<a.*?>([^\n]*)</a>[\\s]*?</div>");
    // Sub-patterns applied inside a single entry.
    Pattern plainItemPattern = Pattern.compile("<li>[\\s]*?<span>([^\n]*)</span>([\\s\\S]*?)[\\s]*?</li>");
    Pattern emItemPattern = Pattern.compile("<li>[\\s]*?<span>([^\n]*)</span>[\\s]*?<em.*?>([^\n]*)</em>[\\s]*?</li>");
    Pattern anchorHrefPattern = Pattern.compile("<a" + "[^<>]*?\\s" + "href" + "=['\"]?(.*?)['\"]?(\\s.*?)?>");
    // Link of the pagination item that directly follows the active one.
    Pattern nextLinkPattern = Pattern.compile("<li class=\"active\">[\\s\\S]*?</li><li><a href"+"=['\"]?(.*?)['\"]?(\\s.*?)?>([\\s\\S])*?</a></li>");

    Matcher pagerMatcher = pagerPattern.matcher(strb);
    Matcher titleMatcher = titlePattern.matcher(strb);
    Matcher detailMatcher = detailPattern.matcher(strb);
    Matcher actionMatcher = actionPattern.matcher(strb);

    // The three matchers advance in lock-step, one step per job entry.
    while (titleMatcher.find() && detailMatcher.find() && actionMatcher.find()) {
        String detailHtml = detailMatcher.group(0);
        Matcher plainItem = plainItemPattern.matcher(detailHtml);
        Matcher emItem = emItemPattern.matcher(detailHtml);
        Matcher anchorHref = anchorHrefPattern.matcher(actionMatcher.group(0));
        if (!(plainItem.find() && emItem.find() && anchorHref.find())) {
            // Entry does not have the expected inner structure; no row is written for it.
            continue;
        }
        int row = i[ll];
        sheet.addCell(new Label(0, row, titleMatcher.group(1)));
        sheet.addCell(new Label(1, row, plainItem.group(2)));
        sheet.addCell(new Label(2, row, emItem.group(2)));
        sheet.addCell(new Label(3, row, anchorHref.group(1)));
        i[ll]++;
    }

    if (!pagerMatcher.find()) {
        return null;
    }
    Matcher nextLink = nextLinkPattern.matcher(pagerMatcher.group(0));
    return nextLink.find() ? nextLink.group(1) : null;
}
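//Reads the DoumiCareer sheet, collects the applicants of every job, and writes them into the DoumiResume sheet
//Reads each job's enrollment-management link from the Excel sheet, opens that page, and collects the applicant information from it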
/**
 * Reads back today's {@code DoumiCareer(<y>)} workbook, downloads the enrollment page of every
 * job row in it, and writes the applicants found to today's {@code DoumiResume(<y>)} workbook.
 *
 * <p>Rows {@code 1 .. i[y]-1} of the job sheet are processed, so {@code i[y]} must still hold the
 * value left behind by the job-listing pass of the same run.
 *
 * @param y account index; selects the file names, the cookie and the counters {@code i}/{@code x}
 * @throws BiffException if the job workbook cannot be parsed
 * @throws IOException if a file or network operation fails
 * @throws RowsExceededException if the applicant sheet cannot take more rows
 * @throws WriteException if the applicant workbook cannot be written
 */
public static void GetAllEnrollInfo(int y) throws BiffException, IOException, RowsExceededException, WriteException{
    Calendar c = Calendar.getInstance();
    int year = c.get(Calendar.YEAR);
    int month = c.get(Calendar.MONTH) + 1; // Calendar.MONTH is zero-based
    int date = c.get(Calendar.DATE);
    String path = "C:\\resume\\DoumiCareer(" + y + ")" + year + "." + month + "." + date + ".xlsx";
    String setpath = "C:\\resume\\DoumiResume(" + y + ")" + year + "." + month + "." + date + ".xlsx";

    // Every resource is closed in a finally block so a failure on one job page does not leak
    // the file handles.
    InputStream readfile = new FileInputStream(path);
    try {
        Workbook rexcel = Workbook.getWorkbook(readfile);
        try {
            Sheet st = rexcel.getSheet(0); // sheets can be fetched by name or by zero-based index
            WritableWorkbook wb = Workbook.createWorkbook(new File(setpath));
            try {
                WritableSheet ws = wb.createSheet("Sheet1", 0);
                // Header row.
                ws.addCell(new Label(0, 0, "姓名"));
                ws.addCell(new Label(1, 0, "性别"));
                ws.addCell(new Label(2, 0, "年龄"));
                ws.addCell(new Label(3, 0, "联系电话"));
                x[y]++; // the next free row is now the one after the header

                for (int j = 1; j < i[y]; j++) {
                    Cell cell1 = st.getCell(1, j); // column 1: job id (pid)
                    Cell cell3 = st.getCell(3, j); // column 3: enrollment-management link
                    System.out.println(cell3.getContents());
                    StringBuffer content = GetHTML(cell3.getContents(), y);
                    GetSingleEnrollInfo(cell1.getContents(), content, wb, ws, y);
                }
                wb.write();
            } finally {
                wb.close();
            }
        } finally {
            rexcel.close();
        }
    } finally {
        readfile.close();
    }
}
//Matches all applicant information in the content of an enrollment-management page
/**
 * Extracts applicants from one enrollment-management page and appends a row (name, gender, age,
 * phone) to {@code sheet} for each applicant whose enrollment-time text contains "1天前".
 *
 * <p>When the phone cell still shows the "查看电话" placeholder, the number is requested through
 * {@link #CheckTelphone(String, String, int)}; otherwise the cell text is used as is.
 *
 * @param pid     job id the page belongs to; forwarded to {@code CheckTelphone}
 * @param content HTML of the enrollment-management page
 * @param wb      workbook owning {@code sheet}; not used by this method
 * @param sheet   sheet that receives the rows, starting at row {@code x[y]}
 * @param y       account index into the static row counter {@code x} and the cookie array
 * @throws IOException if the phone lookup request fails
 * @throws RowsExceededException if the sheet cannot take more rows
 * @throws WriteException if a cell cannot be added
 */
public static void GetSingleEnrollInfo(String pid,StringBuffer content,WritableWorkbook wb,WritableSheet sheet,int y) throws IOException, RowsExceededException, WriteException{
    // Groups 1..3: name, gender, age.
    Pattern q1 = Pattern.compile("<em.*?>[\\s]*?<span.*?>[\\s]*?([\\S]*?)[\\s]*?</span>[\\s]*?</em>[\\s]*?<span.*?></span>[\\s]*?<span class=\"mr5\">([^\n]*?)</span>[\\s]*?<span class=\"mr5\">([^\n]*?)</span>");
    // Group 1: aid taken from the "read-phone-<aid>" class; group 2: phone cell; group 3: following cell.
    Pattern q2 = Pattern.compile("<td class=\"read-phone-"+"([\\S]*?)\">[\\s]*([\\s\\S]*?)[\\s]*</td>[\\s]*?<td>[\\s]*([\\s\\S]*?)[\\s]*</td>");
    // Applied to group 3 when it contains markup: the text before the b-ico-time div.
    Pattern q22 = Pattern.compile("([^\n]*?)[\\s]*<div class=\"b-ico-time\">([\\s\\S]*?)</div>");

    Matcher m1 = q1.matcher(content);
    Matcher m2 = q2.matcher(content);
    // Both matchers advance in lock-step, one step per applicant.
    while (m1.find() && m2.find()) {
        // Enrollment-time text.
        String timeCell = m2.group(3);
        String bmtime;
        if (timeCell.indexOf("div") > -1) {
            Matcher m22 = q22.matcher(timeCell);
            bmtime = m22.find() ? m22.group(1) : "";
        } else {
            bmtime = timeCell;
        }

        // Only applicants whose time text contains "1天前" are recorded.
        if (bmtime.indexOf("1天前") < 0) {
            continue;
        }

        int row = x[y];
        Label label0 = new Label(0, row, m1.group(1));
        Label label1 = new Label(1, row, m1.group(2));
        Label label2 = new Label(2, row, m1.group(3));
        sheet.addCell(label0);
        sheet.addCell(label1);
        sheet.addCell(label2);

        String phoneCell = m2.group(2);
        String phone;
        if (phoneCell.indexOf("查看电话") > -1) {
            // The number is hidden behind a button; request it with the aid/pid pair.
            phone = CheckTelphone(pid, m2.group(1), y);
        } else {
            phone = phoneCell;
        }
        sheet.addCell(new Label(3, row, phone));
        x[y]++;
    }
}
//Uses the pid and aid to obtain the phone number
/**
 * Posts {@code aid} and {@code pid} to the read-phone endpoint with the cookie of account
 * {@code y} and extracts the phone number from the first line of the response.
 *
 * @param pid job id
 * @param aid applicant id; also the key looked up inside the response's {@code "data"} object
 * @param y   account index into {@code responseCookie}
 * @return the value captured for {@code aid}, or {@code null} when the response is empty or does
 *         not have the expected shape
 * @throws IOException if the request fails
 */
public static String CheckTelphone(String pid,String aid,int y) throws IOException{
    URL url = new URL("https://vip.doumi.com/employer/manage/readphone");
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    connection.setRequestProperty("Cookie", responseCookie[y]); // authenticated session
    connection.setDoOutput(true); // sending a body is what makes this request a POST

    String str = "aid=" + aid + "&pid=" + pid;
    OutputStreamWriter out = new OutputStreamWriter(connection.getOutputStream(), "UTF-8");
    try {
        out.write(str);
        out.flush();
    } finally {
        out.close();
    }

    // Only the first response line is inspected, as before.
    String line;
    BufferedReader br = new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));
    try {
        line = br.readLine();
    } finally {
        br.close();
    }
    if (line == null) {
        // Empty response body: nothing to match against.
        return null;
    }

    // '{' must be escaped in a regex; aid is quoted so it is always matched literally.
    Pattern p = Pattern.compile("\"data\":\\{\"" + Pattern.quote(aid) + "\":\"([\\S]*?)\"}}");
    Matcher m = p.matcher(line);
    return m.find() ? m.group(1) : null;
}