关闭

java 抓取拍拍贷网站数据

标签: java htmlunit 拍拍贷 爬虫
3222人阅读 评论(35) 收藏 举报
分类:
 
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Scrapes loan listings from the ppdai.com "high risk" investment list
 * (http://invest.ppdai.com/loan/list_riskhigh?monthgroup=&rate=0&didibid=)
 * and appends one CSV row per loan to {@link #FILE_PATH}.
 *
 * <p>The crawl walks the paginated listing, opens every listed user's profile
 * page, then opens the detail page of each loan shown on that profile.
 *
 * @author yesf
 */
public class Main {
    /** CSV column titles; also the labels that {@link #formatData} strips out of the scraped text. */
    private final static String[] headerTitles = new String[]{"用户", "借入信用", "借出信用", "性别年龄", "目前身份", "次数", "借款金额", "年利率", "期限", "还款方式", "投标人数", "状态"};
    private final static String FILE_PATH = "./正常数据_高风险收益区.csv";
    private final static WebClient CLIENT = new WebClient(BrowserVersion.CHROME);

    /** Full-width colon (U+FF1A), written as an escape so it survives punctuation normalisation. */
    private final static char FULL_WIDTH_COLON = '\uFF1A';

    /**
     * Crawls the listing page by page until the "next page" link is missing or
     * disabled, writing every loan found to the CSV file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        CLIENT.getOptions().setJavaScriptEnabled(false);
        CLIENT.getOptions().setCssEnabled(false);
        try {
            // Other listings that use the same page layout:
            //   http://invest.ppdai.com/loan/list_safe_s0_p1?Rate=0
            //   http://invest.ppdai.com/loan/list_riskmiddle?monthgroup=&rate=0&didibid=
            HtmlPage page = CLIENT.getPage("http://invest.ppdai.com/loan/list_riskhigh?monthgroup=&rate=0&didibid=");
            for (;;) {
                parseContent(page);
                List<?> nextLinks = page.getByXPath("//a[@class='nextpage']");
                if (nextLinks.isEmpty()) {
                    // No pager at all (single page or changed layout): stop instead of throwing.
                    break;
                }
                HtmlAnchor anchor = (HtmlAnchor) nextLinks.get(0);
                // On the last page the link is present but disabled.
                if (anchor.getAttribute("href").equals("javascript:void(0)")) {
                    break;
                }
                // The href is site-relative, so it is resolved against the listing host by hand.
                page = CLIENT.getPage("http://invest.ppdai.com" + anchor.getHrefAttribute());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the client even if the crawl dies with an Error.
            CLIENT.close();
        }
    }

    /**
     * Processes one listing page: for every user entry, loads the user's profile and
     * the detail page of each loan on it, and appends one CSV row per loan.
     * Entries whose pages lack an expected element are skipped with a message on
     * stderr rather than aborting the whole crawl.
     *
     * @param page a listing page
     * @throws IOException if a profile or detail page cannot be downloaded
     */
    private static void parseContent(HtmlPage page) throws IOException {
        List<?> userInfos = page.getByXPath("//p[@class='userInfo clearfix']");
        for (Object userInfo : userInfos) {
            HtmlAnchor userLink = firstChildAnchor((HtmlElement) userInfo);
            if (userLink == null) {
                System.err.println("Skipping user entry without a profile link");
                continue;
            }
            String userName = userLink.asText();
            HtmlPage userPage = CLIENT.getPage(userLink.getHrefAttribute());

            String honorLi = firstText(userPage, "//li[@class='honor_li']");
            String userLi = firstText(userPage, "//li[@class='user_li']");
            if (honorLi == null || userLi == null) {
                System.err.println("Skipping user " + userName + ": profile page is missing expected elements");
                continue;
            }
            honorLi = stripLineBreaks(honorLi);
            userLi = stripLineBreaks(userLi);

            List<?> loanTitles = userPage.getByXPath("//div[@class='borrowlist_tit']");
            List<?> loanBodies = userPage.getByXPath("//div[@class='borrow_list']");
            // The two lists are paired by position; never index past the shorter one.
            int loanCount = Math.min(loanTitles.size(), loanBodies.size());
            for (int i = 0; i < loanCount; i++) {
                HtmlDivision loanTitle = (HtmlDivision) loanTitles.get(i);
                HtmlDivision loanBody = (HtmlDivision) loanBodies.get(i);
                // Commas are dropped so they cannot be mistaken for CSV separators.
                String text = stripLineBreaks(loanBody.asText().replace(",", ""));
                String status = extractStatus(text);

                HtmlAnchor detailLink = firstChildAnchor(loanTitle);
                if (detailLink == null) {
                    System.err.println("Skipping a loan of user " + userName + ": no detail link");
                    continue;
                }
                HtmlPage userDetailPage = CLIENT.getPage("http://www.ppdai.com" + detailLink.getHrefAttribute());
                String successCnt = firstText(userDetailPage, "//span[@class='bidinfo']");
                String detail = firstText(userDetailPage, "//div[@class='newLendDetailMoneyLeft']");
                String payWay = firstText(userDetailPage, "//div[@class='item w260']");
                String biaoCnt = firstText(userDetailPage, "//div[@class='item w164']");
                if (successCnt == null || detail == null || payWay == null || biaoCnt == null) {
                    System.err.println("Skipping a loan of user " + userName + ": detail page is missing expected elements");
                    continue;
                }
                detail = detail.replace(",", "");

                String sourceData = "用户: " + userName + " " + honorLi + " 性别年龄: " + userLi + " 次数: " + successCnt + " " + detail + " " + payWay + biaoCnt + status;
                writeToCsv(FILE_PATH, formatData(sourceData));
            }
        }
    }

    /**
     * Returns the first child element of {@code parent} if it is an anchor.
     *
     * @param parent element whose first child is inspected
     * @return the anchor, or {@code null} if there is no first child or it is not an anchor
     */
    private static HtmlAnchor firstChildAnchor(HtmlElement parent) {
        Object child = parent.getFirstElementChild();
        return child instanceof HtmlAnchor ? (HtmlAnchor) child : null;
    }

    /**
     * Returns the text of the first element matching {@code xpath}.
     *
     * @param page  page to search
     * @param xpath XPath expression expected to select elements
     * @return the element text, or {@code null} if nothing matches
     */
    private static String firstText(HtmlPage page, String xpath) {
        List<?> matches = page.getByXPath(xpath);
        if (matches.isEmpty()) {
            return null;
        }
        return ((HtmlElement) matches.get(0)).asText();
    }

    /** Removes every CR and LF character from {@code text}. */
    private static String stripLineBreaks(String text) {
        return text.replace("\r", "").replace("\n", "");
    }

    /**
     * Extracts the span running from the "状态" label up to the following "借入信用" label.
     *
     * @param text flattened text of one loan entry
     * @return the span including the leading label, or {@code ""} if either label is
     *         missing or they do not appear in that order
     */
    private static String extractStatus(String text) {
        int start = indexOfLabel(text, "状态", 0);
        if (start == -1) {
            return "";
        }
        // Searching from start guarantees end > start, so substring cannot throw.
        int end = indexOfLabel(text, "借入信用", start);
        return end == -1 ? "" : text.substring(start, end);
    }

    /**
     * Finds {@code label} followed by a colon, at or after {@code fromIndex}.
     * Both the ASCII and the full-width colon are accepted.
     * NOTE(review): which colon the site actually emits is not visible here - confirm.
     *
     * @return index of the earliest match, or -1 if there is none
     */
    private static int indexOfLabel(String text, String label, int fromIndex) {
        int ascii = text.indexOf(label + ':', fromIndex);
        int wide = text.indexOf(label + FULL_WIDTH_COLON, fromIndex);
        if (ascii == -1) {
            return wide;
        }
        if (wide == -1) {
            return ascii;
        }
        return Math.min(ascii, wide);
    }

    /**
     * Turns the labelled text of one loan into a CSV row: every colon (ASCII or
     * full-width) becomes a comma, and every column title is removed so that only
     * the values remain.
     *
     * @param data labelled text such as {@code "用户: name ..."}
     * @return the comma-separated values; the row starts with a comma because the
     *         first label is removed but its colon is converted
     */
    private static String formatData(String data) {
        String row = data.replace(':', ',').replace(FULL_WIDTH_COLON, ',');
        for (String headerTitle : headerTitles) {
            row = row.replace(headerTitle, "");
        }
        return row;
    }

    /**
     * Appends one row to the CSV file, writing the header line first when the file
     * is new or still empty. I/O failures are reported on stderr and otherwise
     * ignored, so one failed write does not stop the crawl.
     *
     * @param filePath path of the CSV file; missing parent directories are created
     * @param data     one already formatted row, without line terminator
     */
    private static void writeToCsv(String filePath, String data) {
        File file = new File(filePath);
        File parent = file.getParentFile();
        // getParentFile() is null for a bare file name such as "out.csv".
        if (parent != null) {
            parent.mkdirs();
        }
        // File.length() is 0 for a missing file too, so this covers "new" and "empty".
        boolean needsHeader = file.length() == 0L;
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file, true))) {
            if (needsHeader) {
                // The leading comma lines the titles up with the rows from formatData.
                StringBuilder header = new StringBuilder();
                for (String title : headerTitles) {
                    header.append(',').append(title);
                }
                writer.write(header + "\r\n");
            }
            writer.write(data + "\r\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}

生成的目标文件如下:


该功能的核心技术是 htmlunit 开源框架:它用纯 java 实现了浏览器内核引擎,可以模拟用户操作,自动完成鼠标点击、表单提交、文件下载等。朋友们可以自由发挥,尽情发挥你们的想象。

5
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:5219次
    • 积分:97
    • 等级:
    • 排名:千里之外
    • 原创:3篇
    • 转载:1篇
    • 译文:1篇
    • 评论:36条
    文章分类
    最新评论