一、需求
最近要为公司做百度健康排位抓取工具,就是通过一批搜索词得到百度健康展现出来的医院列表及其位置
比如说我在百度健康搜索“北京人流”,得到如下图所示的列表,需要取得搜索出来的每家医院及其所在位置,例如:“北京奥北医院”对应“左1”,“中国人民解放军第二炮兵总医院”对应“右1”,
以此类推,只需要第一页就行
二、代码实现
我使用了Jsoup+htmlunit相结合的方式获取页面和解析页面,采用Apache POI读取搜索词Excel并导出结果Excel
package com.huaxia.yanfa.export;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Looks up a list of search keywords on Baidu Health and exports, for every
 * keyword, the hospitals shown on the first result page together with their
 * position ("左N" for the left column, "右N" for the right column).
 *
 * <p>Keywords are read from {@code searchkeys.xlsx} and the result is written
 * to {@code 百度健康排位.xls}; both paths are relative to the working directory.
 */
public class Exports {

    /** Input workbook holding the search keywords (first sheet, first column). */
    private static final String SEARCH_KEYS_FILE = "searchkeys.xlsx";

    /** Output workbook receiving the keyword / position / hospital rows. */
    private static final String OUTPUT_FILE = "百度健康排位.xls";

    /**
     * Baidu Health search URL without the keyword; the URL-encoded keyword is
     * appended as the {@code wd} parameter. {@code provName} is the
     * percent-encoded form of "北京".
     */
    private static final String SEARCH_URL_PREFIX =
            "http://jiankang.baidu.com/juhe?aType=2&source=0&sessionID=1380510305320866&provID=1&provName=%E5%8C%97%E4%BA%AC&avoid_enter_submit=&wd=";

    /** Map keys used for one result row. */
    private static final String KEY_SEARCH = "searchKey";
    private static final String KEY_POSITION = "weizhi";
    private static final String KEY_HOSPITAL = "hospitalName";

    /**
     * Reads the search keywords from the first sheet of {@code searchkeys.xlsx}.
     *
     * <p>Row 0 is skipped (the loop starts at row index 1), and rows whose
     * first cell is missing or blank are ignored. Keywords are returned
     * trimmed so stray whitespace in the sheet does not end up in the search
     * URL.
     *
     * @return the keywords in sheet order; empty (never {@code null}) when the
     *         file is missing or cannot be read
     */
    public List<String> readSearchKeys() {
        List<String> list = new ArrayList<String>();
        try {
            File inFile = new File(SEARCH_KEYS_FILE);
            if (!inFile.isFile()) {
                // Make the otherwise silent "nothing to do" case visible.
                System.err.println("未找到搜索词文件: " + inFile.getAbsolutePath());
                return list;
            }
            FileInputStream is = new FileInputStream(inFile);
            try {
                System.out.println("开始读取搜索词");
                XSSFWorkbook wb = new XSSFWorkbook(is);
                XSSFSheet sheet = wb.getSheetAt(0);
                // Index of the last row that contains data.
                int totalRowNum = sheet.getLastRowNum();
                for (int i = 1; i <= totalRowNum; i++) {
                    XSSFRow row = sheet.getRow(i);
                    if (row == null) {
                        continue;
                    }
                    XSSFCell cell = row.getCell(0);
                    if (cell == null) {
                        continue;
                    }
                    String keyword = cell.toString().trim();
                    if (keyword.length() > 0) {
                        list.add(keyword);
                    }
                }
                System.out.println("读取搜索词完毕");
            } finally {
                // Always release the file handle, even if parsing fails.
                is.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }

    /**
     * Fetches the Baidu Health result page for every keyword, extracts the
     * hospitals of the left and right column and writes them to
     * {@code 百度健康排位.xls}.
     *
     * <p>Any exception is printed to standard error; in that case the output
     * file may not have been written.
     */
    public void writeExcel() {
        // NOTE(review): the WebClient is never released. The release method
        // depends on the HtmlUnit version in use (closeAllWindows() vs close());
        // add the matching call in a finally block once the version is confirmed.
        WebClient webClient = new WebClient();
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());
        try {
            List<Map<String, String>> list = new ArrayList<Map<String, String>>();
            for (String key : readSearchKeys()) {
                String urls = SEARCH_URL_PREFIX + URLEncoder.encode(key, "UTF-8");
                HtmlPage htmlPage = webClient.getPage(urls);
                // Parse the DOM as rendered by HtmlUnit.
                Document doc = Jsoup.parse(htmlPage.asXml());

                // Left column.
                collectHospitals(doc.select("div.card-detail"), key, "左", list);

                // Right column; the container may be absent for some keywords.
                Element elright = doc.getElementById("health-right");
                if (elright != null) {
                    collectHospitals(elright.select("div.card-info"), key, "右", list);
                }
            }
            export(list);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends one row per hospital card to {@code out}, numbering the
     * hospitals from 1 within the given column.
     *
     * <p>Cards without an {@code a.card-hospital-name} link are skipped and do
     * not consume a position number.
     */
    private static void collectHospitals(Elements cards, String key, String side,
            List<Map<String, String>> out) {
        int position = 1;
        for (Element card : cards) {
            Elements title = card.select("a.card-hospital-name");
            if (title.isEmpty()) {
                continue;
            }
            // text() already contains the text of nested <em> highlight tags,
            // so no separate handling of <em> children is needed.
            String name = title.get(0).text().replace(" ", "");
            Map<String, String> map = new HashMap<String, String>();
            map.put(KEY_SEARCH, key);
            map.put(KEY_POSITION, side + position);
            map.put(KEY_HOSPITAL, name);
            out.add(map);
            position++;
        }
    }

    /** Writes a header row followed by one row per result to the output workbook. */
    private static void export(List<Map<String, String>> list) throws Exception {
        HSSFWorkbook hwb = new HSSFWorkbook();
        HSSFSheet sheet = hwb.createSheet("IP");
        HSSFCellStyle style = hwb.createCellStyle();
        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);

        String[] head = { "搜索词", "位置", "医院名称" };
        // Map key supplying the value of the column at the same index in head.
        String[] columns = { KEY_SEARCH, KEY_POSITION, KEY_HOSPITAL };

        HSSFRow headerRow = sheet.createRow(0);
        for (int j = 0; j < head.length; j++) {
            HSSFCell cell = headerRow.createCell(j);
            cell.setCellValue(head[j]);
            cell.setCellStyle(style);
        }

        int rowIndex = 1;
        for (Map<String, String> map : list) {
            HSSFRow dataRow = sheet.createRow(rowIndex++);
            for (int j = 0; j < columns.length; j++) {
                String value = map.get(columns[j]);
                dataRow.createCell(j).setCellValue(value == null ? "" : value);
            }
        }

        FileOutputStream out = new FileOutputStream(new File(OUTPUT_FILE));
        try {
            hwb.write(out);
        } finally {
            // Release the file handle even if writing fails.
            out.close();
        }
    }

    /** Runs the export with the default input and output files. */
    public static void main(String[] args) {
        new Exports().writeExcel();
    }
}