抓取某网站上的医院信息,帮一位同学写的,完全是现学现卖,使用jsoup解析返回的HTML代码
HttpRequestProxy.java
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import org.htmlparser.util.ParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HttpRequestProxy {

    /** Province/district entries shown to the user; list index == menu number. */
    private static List<MZinfo> mzinfos = new ArrayList<MZinfo>();
    /** Hospital-grade entries shown to the user; list index == menu number. */
    private static List<MZinfo> levelinfo = new ArrayList<MZinfo>();
    /** City entries under the chosen district; list index == menu number. */
    private static List<MZinfo> cityinfo = new ArrayList<MZinfo>();

    /**
     * Downloads the body of a URL as text.
     *
     * @param urlString target URL; "http://" is prepended when no scheme is given
     * @param charset   charset used to decode the response bytes
     * @param timeout   connect/read timeout in milliseconds
     * @return the page text (lines joined with CRLF), or {@code null} when the
     *         URL is empty or the server did not answer 200 OK
     * @throws IOException if reading the response body fails
     */
    public static String getWebContent(String urlString, final String charset,
            int timeout) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }
        URL url = new URL(urlString);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestProperty(
                "User-Agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
        conn.setRequestProperty("Accept", "text/html");
        conn.setConnectTimeout(timeout);
        // Fix: without a read timeout a stalled server would block forever.
        conn.setReadTimeout(timeout);
        try {
            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                conn.disconnect();
                return null;
            }
        } catch (IOException e) {
            e.printStackTrace();
            conn.disconnect();
            return null;
        }
        // Fix: try-with-resources guarantees the stream is closed even when
        // readLine() throws (the original leaked the reader and connection
        // on a mid-read IOException).
        StringBuilder sb = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
        } finally {
            conn.disconnect();
        }
        return sb.toString();
    }

    /**
     * Convenience overload: fetches with iso-8859-1 (byte-preserving; callers
     * re-decode the bytes as UTF-8) and a 5 second timeout.
     */
    public static String getWebContent(String urlString) throws IOException {
        return getWebContent(urlString, "iso-8859-1", 5000);
    }

    /**
     * Scrapes every result page reachable from {@code url} and writes one CSV
     * line per hospital (name, departments, grade) to a timestamped file in
     * the working directory.
     */
    public static void getHospitolInfo(String url) {
        int pagesSum = 1;
        String path = "./result_" + System.currentTimeMillis() + ".csv";
        String s = null;
        try {
            s = getWebContent(url);
            if (s != null) {
                // The site serves UTF-8 but the fetch decoded it as latin-1
                // (byte-preserving); re-interpret the raw bytes as UTF-8.
                s = new String(s.getBytes("iso-8859-1"), "utf8");
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        // Fix: the original fed a possibly-null string to Jsoup.parse (NPE).
        if (s == null) {
            System.out.println("Failed to fetch: " + url);
            return;
        }
        Document doc = Jsoup.parse(s);
        Elements pageEle = doc.select("div[class=page]>a");
        if (pageEle != null && !"".equals(pageEle.text().trim())) {
            for (Element pages : pageEle) {
                if ("最后一页".equals(pages.text())) {
                    // The "last page" link ends in "...<n>.html"; strip the
                    // non-digits out of the 3 chars before the extension to
                    // recover the page count.
                    String lastUrl = pages.attr("href");
                    int a = lastUrl.lastIndexOf(".");
                    if (a >= 3) { // guard: need 3 chars before the '.'
                        String str = lastUrl.substring(a - 3, a);
                        String digits =
                                Pattern.compile("[^0-9]").matcher(str).replaceAll("").trim();
                        if (!digits.isEmpty()) {
                            pagesSum = Integer.parseInt(digits);
                        }
                    }
                    System.out.println("數據頁數:" + pagesSum);
                }
            }
        }
        // Fix: try-with-resources replaces the original null-unsafe finally
        // block (fw.close() would NPE if FileOutputStream construction failed).
        try (BufferedWriter fw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), "GBK"))) {
            // Base URL minus the trailing page digit, so "<fir><i>.html"
            // addresses page i. Hoisted out of the loop (loop-invariant).
            String fir = url.substring(0, url.lastIndexOf(".") - 1);
            for (int i = 1; i <= pagesSum; i++) {
                System.out.println("當前正在處理第" + i + "頁的數據");
                String page = getWebContent(fir + i + ".html");
                if (page == null) {
                    continue; // skip pages that failed to download
                }
                page = new String(page.getBytes("iso-8859-1"), "utf8");
                doc = Jsoup.parse(page);
                for (Element element : doc.select("div[class=part-list]")) {
                    String title = element.select("h4 > a").text();
                    String keshi = element.select("p > a[target=_self]").text();
                    String dengji = element.select("h4").text();
                    int index = dengji.lastIndexOf("(");
                    int last = dengji.lastIndexOf(")");
                    // Fix: guard the substring — a heading without "(grade)"
                    // used to throw and abort the entire export.
                    String grade = (index >= 0 && last > index)
                            ? dengji.substring(index + 1, last)
                            : "";
                    fw.write(title + "," + keshi + "," + grade + "\n");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Interactive console driver: pick a hospital grade, then a district,
     * optionally a city, and export the matching hospitals to CSV. Repeats
     * until the user answers "N".
     */
    public static void main(String[] args) throws IOException, ParserException {
        // Fix: one Scanner for the whole session; the original created a new
        // Scanner(System.in) for every prompt.
        Scanner in = new Scanner(System.in);
        while (true) {
            System.out.println("获取医院分级信息");
            getHostLevel();
            System.out.println("分级信息获取完毕。请输入分级编号:");
            int levelBianhao = in.nextInt();
            // Fix: bounds-check user input instead of risking
            // IndexOutOfBoundsException.
            if (levelBianhao < 0 || levelBianhao >= levelinfo.size()) {
                System.out.println("Invalid number, try again.");
                continue;
            }
            System.out.println("开始初始化地区数据....");
            MZinfo mzinfoLevel = levelinfo.get(levelBianhao);
            String s = getWebContent(mzinfoLevel.getUrl());
            getNative(s);
            System.out.println("请输入地区编号:");
            int bianhao = in.nextInt();
            if (bianhao < 0 || bianhao >= mzinfos.size()) {
                System.out.println("Invalid number, try again.");
                continue;
            }
            System.out.println("是否按照城市顯示醫院信息[Y/N]");
            String flag = in.next();
            MZinfo mzinfo = mzinfos.get(bianhao);
            // Fix: the original tested "Y".endsWith(flag), which is also true
            // for an empty token; compare for (case-insensitive) equality.
            if ("Y".equalsIgnoreCase(flag)) {
                System.out.println("开始获取城市信息");
                getCityHospital(mzinfo);
                System.out.println("城市信息获取成功,请输入城市编号");
                int cityNum = in.nextInt();
                if (cityNum >= 0 && cityNum < cityinfo.size()) {
                    mzinfo = cityinfo.get(cityNum);
                }
            }
            System.out.println("開始抓取信息:");
            getHospitolInfo(mzinfo.getUrl());
            System.out.println("抓取信息成功,是否继续[Y/N]");
            if ("N".equalsIgnoreCase(in.next())) {
                break;
            }
            // Note: Runtime.exec("cmd cls") was removed — it cannot clear this
            // console (would need "cmd /c cls" with inherited IO) and it
            // leaked the spawned process.
        }
        in.close();
    }

    /**
     * Lists the cities of the given district, stores them in
     * {@link #cityinfo}, and prints "index:name" so the user can pick one.
     */
    private static void getCityHospital(MZinfo mzinfo) throws IOException {
        // Fix: clear previous results so menu indices stay aligned with the
        // printed list (the static list used to keep growing across rounds).
        cityinfo.clear();
        String s = getWebContent(mzinfo.getUrl());
        if (s == null) {
            return; // fetch failed; caller sees an empty city list
        }
        s = new String(s.getBytes("iso-8859-1"), "utf8");
        Document doc = Jsoup.parse(s);
        for (Element el : doc.select("div[class=find-hospital]>h4")) {
            if (mzinfo.getDiqu().equals(el.select("h4>a").text())) {
                Elements ele = el.select(" h4 > div > ul >li");
                for (int i = 0; i < ele.size(); i++) {
                    Element element = ele.get(i);
                    MZinfo city = new MZinfo();
                    city.setDiqu(element.select("a").text());
                    city.setUrl(element.select("a").attr("href"));
                    cityinfo.add(city);
                    System.out.println(" " + i + ":" + city.getDiqu());
                }
            }
        }
    }

    /**
     * Parses the province/district list out of an already-fetched page,
     * stores it in {@link #mzinfos}, and prints "index:name" per entry.
     */
    private static void getNative(String s) throws IOException {
        // Fix: clear previous results so menu indices stay valid (see above).
        mzinfos.clear();
        if (s == null) {
            return; // fetch failed upstream; nothing to parse
        }
        s = new String(s.getBytes("iso-8859-1"), "utf8");
        Document doc = Jsoup.parse(s);
        Elements ele = doc.select("div[class=find-hospital]>h4");
        // Loop over provinces/districts.
        for (int i = 0; i < ele.size(); i++) {
            Elements link = ele.get(i).select("h4>a");
            MZinfo mZinfo = new MZinfo();
            mZinfo.setDiqu(link.text());
            mZinfo.setUrl(link.attr("href"));
            mzinfos.add(mZinfo);
            System.out.println(i + ":" + mZinfo.getDiqu());
        }
    }

    /**
     * Fetches the hospital-grade tabs from the entry page, stores them in
     * {@link #levelinfo}, and prints "index:name" per grade.
     */
    private static void getHostLevel() throws IOException {
        // Fix: clear previous results so menu indices stay valid (see above).
        levelinfo.clear();
        String s = getWebContent("http://hospital.qqyy.com/list-p110000c0a110108k0v1r0d0n0.html");
        if (s == null) {
            return; // fetch failed; the caller will show an empty menu
        }
        s = new String(s.getBytes("iso-8859-1"), "utf8");
        Document doc = Jsoup.parse(s);
        Elements ele = doc.select("div[class=find-departments-tab tab2]>span>a");
        // Loop over the grade tabs.
        for (int i = 0; i < ele.size(); i++) {
            Element element = ele.get(i);
            System.out.println(i + ":" + element.text());
            MZinfo mZinfo = new MZinfo();
            mZinfo.setDiqu(element.text());
            mZinfo.setUrl(element.attr("href"));
            levelinfo.add(mZinfo);
        }
    }
}
MZinfo.java
/**
 * Simple value holder for one scraped link: a display label ("diqu" — a
 * region, grade, or city name) plus the URL it points at.
 */
public class MZinfo {

    /** Display name of the region / grade / city this entry represents. */
    private String diqu;

    /** Link target associated with {@link #diqu}. */
    private String url;

    /** @return the display label, or {@code null} if it was never set */
    public String getDiqu() {
        return this.diqu;
    }

    /** @param diqu the display label to store */
    public void setDiqu(String diqu) {
        this.diqu = diqu;
    }

    /** @return the link target, or {@code null} if it was never set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the link target to store */
    public void setUrl(String url) {
        this.url = url;
    }
}