iSelenium

专注自动化测试

使用htmlunit爬取某招聘网站部分招聘要求内容

主代码部分:

package selenium.crawler;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;

import org.testng.annotations.Test;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

import common.ExcelMethod;

/**
 * Crawls job-search result pages with HtmlUnit, visits every job posting
 * linked from them and writes the extracted text fields into a spreadsheet
 * through {@link ExcelMethod}, one posting per row and one field per column.
 */
public class zhilianzhaopin {

    /**
     * Runs the whole crawl: collects the posting URLs, extracts the fields of
     * each posting and stores them in sheet "Sheet1".
     *
     * @throws IOException if a search-result page cannot be fetched
     */
    @Test
    public void test() throws IOException {
        // try-with-resources guarantees the client is closed even when the crawl fails.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setUseInsecureSSL(false);

            List<String> urlList = getZlzpUrl(webClient);

            ExcelMethod ds = new ExcelMethod();
            ds.readExcel();
            try {
                int row = 0;
                for (String url : urlList) {
                    List<String> baseInfoList = new ArrayList<>();
                    getResumeInfo(webClient, url, baseInfoList);
                    for (int i = 0; i < baseInfoList.size(); i++) {
                        ds.setValueIntoCell("Sheet1", i, row, baseInfoList.get(i));
                    }
                    row++;
                    System.out.println(baseInfoList.toString());
                }
                System.out.println("--------------END---------------");
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Flush and close in all cases so rows collected before a
                // failing page are still written to disk.
                ds.closeFile();
            }
        }
    }

    /**
     * Loads one posting page and appends the extracted text fields to
     * {@code baseInfoList}.
     *
     * <p>Two {@code div} elements are inspected, identified by their exact
     * {@code class} attribute: {@code "top-fixed-box"} contributes the text of
     * its first {@code h1} and {@code h2}; {@code "terminalpage clearfix"}
     * contributes the text of every {@code li} in its first {@code ul} plus
     * the first and third {@code p} of its nested detail block.
     *
     * @param webClient    client used to fetch the page
     * @param url          address of the posting page
     * @param baseInfoList output list that receives the extracted strings
     * @throws IOException           if the page cannot be fetched
     * @throws MalformedURLException if {@code url} is not a valid URL
     */
    private void getResumeInfo(WebClient webClient, String url, List<String> baseInfoList)
            throws IOException, MalformedURLException {
        HtmlPage page = webClient.getPage(url);
        List<DomElement> bodyElementList = page.getElementsByTagName("body");
        List<HtmlElement> divElementList = bodyElementList.get(0).getElementsByTagName("div");
        for (HtmlElement htmlElement : divElementList) {
            String classAttr = htmlElement.getAttribute("class");
            // Constant-first equals() is null-safe and replaces the former
            // `classAttr != null || classAttr != ""` guard, which was always true.
            if ("top-fixed-box".equals(classAttr)) {
                List<HtmlElement> innerDivs = htmlElement.getElementsByTagName("div");
                HtmlElement firstDiv = innerDivs.get(0);
                String h1Text = firstDiv.getElementsByTagName("h1").get(0).getTextContent();
                String h2Text = firstDiv.getElementsByTagName("h2").get(0).getTextContent();
                baseInfoList.add(h1Text);
                baseInfoList.add(h2Text);
            } else if ("terminalpage clearfix".equals(classAttr)) {
                List<HtmlElement> innerDivs = htmlElement.getElementsByTagName("div");
                HtmlElement firstDiv = innerDivs.get(0);

                List<HtmlElement> firstUlElementList = firstDiv.getElementsByTagName("ul");
                List<HtmlElement> firstLiElementList = firstUlElementList.get(0).getElementsByTagName("li");
                for (HtmlElement liElement : firstLiElementList) {
                    baseInfoList.add(liElement.getTextContent());
                }

                List<HtmlElement> firstdivElementList = firstDiv.getElementsByTagName("div");
                List<HtmlElement> detaildivElementList = firstdivElementList.get(0).getElementsByTagName("div");
                List<HtmlElement> pElementList = detaildivElementList.get(0).getElementsByTagName("p");
                // Step of 2 with an upper bound of 3: takes paragraphs 0 and 2 only.
                for (int i = 0; i < 3; i += 2) {
                    baseInfoList.add(pElementList.get(i).getTextContent());
                }
            }
        }
    }

    /**
     * Walks search-result pages 1 to 12 and collects the {@code href} of the
     * first link in each result table.
     *
     * @param webClient client used to fetch the result pages
     * @return the posting URLs in the order they were found
     * @throws IOException           if a result page cannot be fetched
     * @throws MalformedURLException if the search URL is malformed
     */
    private List<String> getZlzpUrl(WebClient webClient) throws IOException, MalformedURLException {
        List<String> urlList = new ArrayList<>();
        for (int pageNum = 1; pageNum <= 12; pageNum++) {
            String url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E9%80%89%E6%8B%A9%E5%9C%B0%E5%8C%BA&kw=%E8%87%AA%E5%8A%A8%E5%8C%96%E6%B5%8B%E8%AF%95%E5%B7%A5%E7%A8%8B%E5%B8%88&p="
                    + pageNum
                    + "&isadv=0";
            HtmlPage page = webClient.getPage(url);
            DomElement talbleElement = page.getElementById("newlist_list_content_table");
            List<HtmlElement> talbleElementList = talbleElement.getElementsByTagName("table");
            System.out.println("size: " + talbleElementList.size());
            // Starts at index 1: the first table is skipped (presumably the
            // column-header table -- confirm against the live page).
            for (int i = 1; i < talbleElementList.size(); i++) {
                List<HtmlElement> trElementList = talbleElementList.get(i).getElementsByTagName("tr");
                List<HtmlElement> divElementList = trElementList.get(0).getElementsByTagName("div");
                List<HtmlElement> aElementList = divElementList.get(0).getElementsByTagName("a");
                String htef = aElementList.get(0).getAttribute("href");
                System.out.println(pageNum + "---" + i + "---" + htef);
                urlList.add(htef);
            }
        }

        return urlList;
    }
}


写Excel的代码:

package common;

import java.io.File;
import jxl.Workbook;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.Label;
import jxl.write.WriteException;

/**
 * Small helper around the jxl API: copies an existing workbook to a second
 * file, lets callers write text cells into the copy, then saves and closes
 * both workbooks.
 *
 * <p>The workbook handles are static, so every instance shares the same open
 * workbook pair.
 */
public class ExcelMethod {

    /** Source workbook opened read-only by {@link #readExcel()}. */
    static Workbook wbook;
    /** Writable copy of {@link #wbook}; all cell writes go here. */
    static WritableWorkbook wwbCopy;
    /** Not used inside this class. */
    static String ExecutedTestCasesSheet;
    /** First sheet of the writable copy, set by {@link #readExcel()}. */
    static WritableSheet shSheet;

    /**
     * Opens {@code D:\testSampleData.xls} and creates the writable copy
     * {@code D:\testSampleDataCopy.xls}. Failures are printed, not rethrown.
     */
    public void readExcel() {
        try {
            wbook = Workbook.getWorkbook(new File("D:\\testSampleData.xls"));
            wwbCopy = Workbook.createWorkbook(new File("D:\\testSampleDataCopy.xls"), wbook);
            shSheet = wwbCopy.getSheet(0);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes a text cell into the named sheet of the writable copy.
     *
     * @param strSheetName  name of the target sheet
     * @param iColumnNumber zero-based column index
     * @param iRowNumber    zero-based row index
     * @param strData       text to store in the cell
     * @throws WriteException       declared for callers; failures while adding
     *                               the cell are printed and not propagated
     * @throws IllegalStateException if no writable workbook is open, i.e.
     *                               {@link #readExcel()} was not called or failed
     */
    public void setValueIntoCell(String strSheetName, int iColumnNumber, int iRowNumber, String strData)
            throws WriteException {
        if (wwbCopy == null) {
            // Previously surfaced as a bare NullPointerException with no hint of the cause.
            throw new IllegalStateException("No writable workbook is open; readExcel() must succeed first");
        }
        WritableSheet wshTemp = wwbCopy.getSheet(strSheetName);
        Label labTemp = new Label(iColumnNumber, iRowNumber, strData);

        try {
            wshTemp.addCell(labTemp);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Saves the writable copy and closes both workbooks. Every close is
     * attempted even if an earlier step fails; failures are printed, not
     * rethrown.
     */
    public void closeFile() {
        try {
            if (wwbCopy != null) {
                try {
                    wwbCopy.write();
                } finally {
                    // Release the output file even when write() fails.
                    wwbCopy.close();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The source workbook used to stay open whenever write()/close() threw.
            if (wbook != null) {
                wbook.close();
            }
        }
    }

    /** Manual smoke test: writes three sample results into column 5 of "Sheet1". */
    public static void main(String[] args) throws WriteException {
        ExcelMethod ds = new ExcelMethod();
        ds.readExcel();
        ds.setValueIntoCell("Sheet1", 5, 1, "PASS");
        ds.setValueIntoCell("Sheet1", 5, 2, "FAIL");
        ds.setValueIntoCell("Sheet1", 5, 3, "PASS");
        ds.closeFile();
    }
}


结果截图:




阅读更多
个人分类: htmlunit
想对作者说点什么? 我来说一句

Python获取招聘网站数据

2017年07月10日 2KB 下载

没有更多推荐了,返回首页

不良信息举报

使用htmlunit爬取某招聘网站部分招聘要求内容

最多只允许输入30个字

加入CSDN,享受更精准的内容推荐,与500万程序员共同成长!
关闭
关闭