import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Scrapes the ppdai.com "high risk" loan listing
 * (http://invest.ppdai.com/loan/list_riskhigh?monthgroup=&amp;rate=0&amp;didibid=),
 * follows each listed user to their loan detail pages, and appends one CSV row
 * per loan to {@link #FILE_PATH}.
 *
 * @author yesf
 */
public class Main {
    /**
     * CSV column titles. The same strings are also the labels that
     * {@link #formatData(String)} strips out of the scraped text.
     */
    private static final String[] headerTitles = new String[]{
        "用户", "借入信用", "借出信用", "性别年龄", "目前身份", "次数",
        "借款金额", "年利率", "期限", "还款方式", "投标人数", "状态"};

    /** Output file; rows are appended, so repeated runs accumulate. */
    private static final String FILE_PATH = "./正常数据_高风险收益区.csv";

    /** Single shared browser instance used for every page fetch. */
    private static final WebClient CLIENT = new WebClient(BrowserVersion.CHROME);

    /**
     * Walks the listing page by page, scraping every page, until there is no
     * usable "next page" link. Any failure is printed and ends the crawl; the
     * web client is closed in all cases.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        CLIENT.getOptions().setJavaScriptEnabled(false);
        CLIENT.getOptions().setCssEnabled(false);
        try {
            // Alternative listing URLs noted by the original author:
            //   http://invest.ppdai.com/loan/list_safe_s0_p1?Rate=0
            //   http://invest.ppdai.com/loan/list_riskmiddle?monthgroup=&rate=0&didibid=
            HtmlPage page = CLIENT.getPage(
                "http://invest.ppdai.com/loan/list_riskhigh?monthgroup=&rate=0&didibid=");
            for (;;) {
                parseContent(page);
                List<?> nextLinks = page.getByXPath("//a[@class='nextpage']");
                if (nextLinks.isEmpty()) {
                    // No pager link at all: treat it as the last page instead of failing.
                    break;
                }
                HtmlAnchor next = (HtmlAnchor) nextLinks.get(0);
                if ("javascript:void(0)".equals(next.getAttribute("href"))) {
                    // A next link with this href marks the last page.
                    break;
                }
                page = CLIENT.getPage("http://invest.ppdai.com" + next.getHrefAttribute());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            CLIENT.close();
        }
    }

    /**
     * Scrapes one listing page: for every user entry it opens the user's page,
     * then every loan detail page linked from it, and appends one CSV row per
     * loan. Entries whose expected elements are missing are reported on
     * {@code System.err} and skipped rather than aborting the crawl.
     *
     * @param page a listing page
     * @throws IOException if fetching a linked page fails
     */
    private static void parseContent(HtmlPage page) throws IOException {
        List<?> userBlocks = page.getByXPath("//p[@class='userInfo clearfix']");
        for (Object node : userBlocks) {
            HtmlAnchor userLink = firstChildAnchor((HtmlElement) node);
            if (userLink == null) {
                System.err.println("Skipping user entry without a leading link on " + page.getUrl());
                continue;
            }
            String userName = userLink.asText();
            HtmlPage userPage = CLIENT.getPage(userLink.getHrefAttribute());

            String honorLi = firstText(userPage, "//li[@class='honor_li']");
            String userLi = firstText(userPage, "//li[@class='user_li']");
            if (honorLi == null || userLi == null) {
                continue;
            }
            honorLi = stripLineBreaks(honorLi);
            userLi = stripLineBreaks(userLi);

            List<?> loanTitles = userPage.getByXPath("//div[@class='borrowlist_tit']");
            List<?> loanBodies = userPage.getByXPath("//div[@class='borrow_list']");
            // The two lists are paired by position; never index past the shorter one.
            int loanCount = Math.min(loanTitles.size(), loanBodies.size());
            for (int i = 0; i < loanCount; i++) {
                HtmlElement title = (HtmlElement) loanTitles.get(i);
                HtmlElement body = (HtmlElement) loanBodies.get(i);

                // Commas are dropped so the scraped text cannot add CSV columns.
                String text = stripLineBreaks(body.asText().replace(",", ""));
                String status = extractStatus(text);

                HtmlAnchor detailLink = firstChildAnchor(title);
                if (detailLink == null) {
                    System.err.println("Skipping loan without a detail link on " + userPage.getUrl());
                    continue;
                }
                HtmlPage userDetailPage =
                    CLIENT.getPage("http://www.ppdai.com" + detailLink.getHrefAttribute());

                String successCnt = firstText(userDetailPage, "//span[@class='bidinfo']");
                String detail = firstText(userDetailPage, "//div[@class='newLendDetailMoneyLeft']");
                String payWay = firstText(userDetailPage, "//div[@class='item w260']");
                String biaoCnt = firstText(userDetailPage, "//div[@class='item w164']");
                if (successCnt == null || detail == null || payWay == null || biaoCnt == null) {
                    continue;
                }
                detail = detail.replace(",", "");

                String sourceData = "用户: " + userName + " " + honorLi + " 性别年龄: " + userLi
                    + " 次数: " + successCnt + " " + detail + " " + payWay + biaoCnt + status;
                writeToCsv(FILE_PATH, formatData(sourceData));
            }
        }
    }

    /**
     * Returns the first element child of {@code parent} if it is an anchor,
     * otherwise {@code null}.
     */
    private static HtmlAnchor firstChildAnchor(HtmlElement parent) {
        Object child = parent.getFirstElementChild();
        return (child instanceof HtmlAnchor) ? (HtmlAnchor) child : null;
    }

    /**
     * Returns the text of the first node matching {@code xpath}, or {@code null}
     * (after printing a warning) when nothing matches.
     */
    private static String firstText(HtmlPage page, String xpath) {
        List<?> matches = page.getByXPath(xpath);
        if (matches.isEmpty()) {
            System.err.println("No element matches " + xpath + " on " + page.getUrl());
            return null;
        }
        return ((HtmlElement) matches.get(0)).asText();
    }

    /** Removes every carriage return and line feed from {@code text}. */
    private static String stripLineBreaks(String text) {
        return text.replace("\r", "").replace("\n", "");
    }

    /**
     * Returns the part of {@code text} from the "状态:" label up to the next
     * "借入信用:" label, or an empty string when that span is not present.
     * Full-width colons are normalized to ASCII first so either form matches.
     */
    private static String extractStatus(String text) {
        String normalized = text.replace('\uFF1A', ':');
        int start = normalized.indexOf("状态:");
        if (start == -1) {
            return "";
        }
        // Search from start so an earlier occurrence of the end label cannot
        // produce an inverted range.
        int end = normalized.indexOf("借入信用:", start);
        return (end == -1) ? "" : normalized.substring(start, end);
    }

    /**
     * Turns the labelled text built by {@link #parseContent(HtmlPage)} into a
     * CSV row: every colon becomes a comma and every column title is removed.
     *
     * @param data labelled text such as {@code "用户: name ..."}
     * @return the comma-separated row
     */
    private static String formatData(String data) {
        // The original applied the ASCII-colon replacement twice; the second
        // pass is assumed to have been meant for the full-width colon.
        data = data.replace(':', ',').replace('\uFF1A', ',');
        for (String headerTitle : headerTitles) {
            data = data.replace(headerTitle, "");
        }
        return data;
    }

    /**
     * Appends {@code data} as one line to the file at {@code filePath},
     * creating parent directories as needed and writing the header line first
     * when the file is missing or empty. I/O failures are printed, not thrown.
     * The file is written in the platform default charset.
     *
     * @param filePath path of the CSV file
     * @param data     one already-formatted CSV row, without line terminator
     */
    private static void writeToCsv(String filePath, String data) {
        File file = new File(filePath);
        if (!file.exists()) {
            // A bare file name has no parent; only create directories that exist in the path.
            File parent = file.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
        }
        // File.length() is 0 for a missing file as well as an empty one.
        boolean needsHeader = file.length() == 0L;
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file, true))) {
            if (needsHeader) {
                // Each title is preceded by a comma, matching data rows, which
                // also start with a comma once the "用户" label is stripped.
                StringBuilder header = new StringBuilder();
                for (String title : headerTitles) {
                    header.append(",").append(title);
                }
                writer.write(header + "\r\n");
            }
            writer.write(data + "\r\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
生成的目标文件如下:
功能实现的核心技术采用 HtmlUnit 开源框架,它是纯 Java 实现的浏览器内核引擎,可以模拟用户操作,自动完成鼠标点击、表单提交、下载文件等……朋友们自由发挥哈,发挥你们的想象。