通过搜索词获取百度健康排位

一、需求

       最近要为公司做百度健康排位抓取工具,就是通过一批搜索词得到百度健康展现出来的医院列表及其位置

  比如说我在百度健康搜索“北京人流”,得到如下图所示的列表,需要取得搜索出来的每家医院及其位置,比如得到“北京奥北医院”对应“左1”,“中国人民解放军第二炮兵总医院”对应“右1”,

以此类推,只需要第一页就行

二、代码实现

      我使用了Jsoup+htmlunit相结合的方式获取页面和解析页面,采用Apache POI读取搜索词Excel并导出结果Excel

     

package com.huaxia.yanfa.export;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Scrapes Baidu Health search results for a batch of keywords and exports the
 * hospital ranking (left/right column position) of each result to an Excel file.
 *
 * <p>Keywords are read from the first column of {@code searchkeys.xlsx} in the
 * working directory; results are written to {@code 百度健康排位.xls} in the
 * working directory.
 */
public class Exports {

	/** Baidu Health search URL; the URL-encoded keyword is appended as the {@code wd} parameter. */
	private static final String SEARCH_URL_PREFIX = "http://jiankang.baidu.com/juhe?aType=2&source=0&sessionID=1380510305320866&provID=1&provName=%E5%8C%97%E4%BA%AC&avoid_enter_submit=&wd=";

	/** Map key holding the search keyword of a result row. */
	private static final String KEY_SEARCH = "searchKey";

	/** Map key holding the position label (e.g. "左1") of a result row. */
	private static final String KEY_POSITION = "weizhi";

	/** Map key holding the hospital name of a result row. */
	private static final String KEY_HOSPITAL = "hospitalName";

	/**
	 * Reads the search keywords from the first sheet of {@code searchkeys.xlsx}.
	 *
	 * <p>Row 0 is skipped (the loop starts at row 1, presumably because row 0 is
	 * a header row). Only the first cell of each row is read; null rows, null
	 * cells and blank values are ignored. Values are trimmed before being stored.
	 *
	 * @return the keywords in sheet order; an empty list when the file is missing
	 *         or cannot be read (the error is printed, not thrown)
	 */
	public List<String> readSearchKeys() {
		List<String> list = new ArrayList<String>();
		File inFile = new File("searchkeys.xlsx");
		if (!inFile.isFile()) {
			// isFile() already implies exists(); report instead of silently returning nothing.
			System.err.println("未找到搜索词文件: " + inFile.getAbsolutePath());
			return list;
		}

		FileInputStream is = null;
		try {
			is = new FileInputStream(inFile);
			System.out.println("开始读取搜索词");
			XSSFWorkbook wb = new XSSFWorkbook(is);
			XSSFSheet sheet = wb.getSheetAt(0);
			// Index of the last row in the sheet.
			int totalRowNum = sheet.getLastRowNum();
			for (int i = 1; i <= totalRowNum; i++) {
				XSSFRow row = sheet.getRow(i);
				if (row == null) {
					continue;
				}
				XSSFCell cell = row.getCell(0);
				if (cell == null) {
					continue;
				}
				// Store the trimmed value so stray whitespace does not end up in the search URL.
				String key = cell.toString().trim();
				if (key.length() > 0) {
					list.add(key);
				}
			}
			System.out.println("读取搜索词完毕");
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (is != null) {
				try {
					is.close();
				} catch (Exception ignored) {
					// Nothing useful can be done if closing the input file fails.
				}
			}
		}
		return list;
	}

	/**
	 * Fetches the Baidu Health result page for every keyword, extracts the
	 * hospitals listed in the left and right columns together with their
	 * 1-based position, and writes all rows to {@code 百度健康排位.xls}.
	 *
	 * <p>A failure while processing one keyword is printed and that keyword is
	 * skipped; the remaining keywords are still processed and exported.
	 */
	public void writeExcel() {
		// NOTE(review): the WebClient is never closed. The method to call differs
		// between HtmlUnit versions (closeAllWindows() vs close()) - add the one
		// matching the project's HtmlUnit version.
		WebClient webClient = new WebClient();
		webClient.setAjaxController(new NicelyResynchronizingAjaxController());

		List<Map<String, String>> list = new ArrayList<Map<String, String>>();
		for (String key : readSearchKeys()) {
			try {
				String url = SEARCH_URL_PREFIX + URLEncoder.encode(key, "UTF-8");
				HtmlPage htmlPage = webClient.getPage(url);
				Document doc = Jsoup.parse(htmlPage.asXml());

				// Collect into a per-keyword list first so a failure half-way
				// through a page does not leave partial rows in the export.
				List<Map<String, String>> found = new ArrayList<Map<String, String>>();

				// Left column positions.
				collectRankings(key, "左", doc.select("div.card-detail"), found);

				// Right column positions; the container may be absent from the page.
				Element rightColumn = doc.getElementById("health-right");
				if (rightColumn != null) {
					collectRankings(key, "右", rightColumn.select("div.card-info"), found);
				}
				list.addAll(found);
			} catch (Exception e) {
				System.err.println("抓取搜索词失败: " + key);
				e.printStackTrace();
			}
		}

		try {
			exportToExcel(list);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Appends one result row per card to {@code out}, numbering the cards from 1.
	 *
	 * @param key   the search keyword the cards were found for
	 * @param side  position prefix, "左" for the left column or "右" for the right
	 * @param cards the card elements of one column, in page order
	 * @param out   list receiving the result rows
	 */
	private void collectRankings(String key, String side, Elements cards, List<Map<String, String>> out) {
		int position = 1;
		for (Element card : cards) {
			Element nameLink = card.select("a.card-hospital-name").first();
			if (nameLink == null) {
				// NOTE(review): cards without a hospital-name link are skipped and
				// not counted as a position - confirm this matches the page layout.
				continue;
			}
			// Element.text() already includes the text of nested tags such as <em>,
			// so no separate handling of highlighted fragments is needed. (The former
			// hasAttr("em") check tested for an attribute, not a tag, and never matched.)
			String hospitalName = nameLink.text().replace(" ", "");

			Map<String, String> map = new HashMap<String, String>();
			map.put(KEY_SEARCH, key);
			map.put(KEY_POSITION, side + position);
			map.put(KEY_HOSPITAL, hospitalName);
			out.add(map);
			position++;
		}
	}

	/**
	 * Writes the header row and all result rows to {@code 百度健康排位.xls}.
	 *
	 * @param rows result rows keyed by searchKey / weizhi / hospitalName;
	 *             a missing value is written as an empty string
	 * @throws java.io.IOException if the output file cannot be created or written
	 */
	private void exportToExcel(List<Map<String, String>> rows) throws java.io.IOException {
		HSSFWorkbook hwb = new HSSFWorkbook();
		HSSFSheet sheet = hwb.createSheet("IP");
		HSSFCellStyle style = hwb.createCellStyle();
		style.setAlignment(HSSFCellStyle.ALIGN_CENTER);

		String[] head = { "搜索词", "位置", "医院名称" };
		// Map keys in the same order as the header columns.
		String[] keys = { KEY_SEARCH, KEY_POSITION, KEY_HOSPITAL };

		HSSFRow row = sheet.createRow(0);
		for (int j = 0; j < head.length; j++) {
			HSSFCell cell = row.createCell(j);
			cell.setCellValue(head[j]);
			cell.setCellStyle(style);
		}

		// Data rows start right below the header.
		int rowIndex = 1;
		for (Map<String, String> map : rows) {
			row = sheet.createRow(rowIndex++);
			for (int j = 0; j < keys.length; j++) {
				String value = map.get(keys[j]);
				row.createCell(j).setCellValue(value == null ? "" : value);
			}
		}

		FileOutputStream out = new FileOutputStream(new File("百度健康排位.xls"));
		try {
			hwb.write(out);
		} finally {
			out.close();
		}
	}

	/** Runs the scrape-and-export with the files in the working directory. */
	public static void main(String[] args) {
		new Exports().writeExcel();
	}

}


 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值