抓取某网站上的医院信息,帮一位同学写的,完全是现学现卖,使用jsoup解析返回的HTML代码
HttpRequestProxy.java
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import org.htmlparser.util.ParserException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class HttpRequestProxy {

    /** Province/district entries shown to the user; list index == menu number. */
    private static List<MZinfo> mzinfos = new ArrayList<MZinfo>();
    /** Hospital-grade entries shown to the user; list index == menu number. */
    private static List<MZinfo> levelinfo = new ArrayList<MZinfo>();
    /** City entries under the chosen district; list index == menu number. */
    private static List<MZinfo> cityinfo = new ArrayList<MZinfo>();

    /**
     * Downloads the body of a URL as text.
     *
     * @param urlString target URL; "http://" is prepended when no scheme is given
     * @param charset   charset used to decode the response bytes
     * @param timeout   connect/read timeout in milliseconds
     * @return the page text (lines joined with CRLF), or {@code null} when the
     *         URL is empty or the server did not answer 200 OK
     * @throws IOException if reading the response body fails
     */
    public static String getWebContent(String urlString, final String charset,
            int timeout) throws IOException {
        if (urlString == null || urlString.length() == 0) {
            return null;
        }
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }
        URL url = new URL(urlString);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestProperty(
                "User-Agent",
                "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
        conn.setRequestProperty("Accept", "text/html");
        conn.setConnectTimeout(timeout);
        // Fix: without a read timeout a stalled server would block forever.
        conn.setReadTimeout(timeout);
        try {
            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                conn.disconnect();
                return null;
            }
        } catch (IOException e) {
            e.printStackTrace();
            conn.disconnect();
            return null;
        }
        // Fix: try-with-resources guarantees the stream is closed even when
        // readLine() throws (the original leaked the reader and connection
        // on a mid-read IOException).
        StringBuilder sb = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
        } finally {
            conn.disconnect();
        }
        return sb.toString();
    }

    /**
     * Convenience overload: fetches with iso-8859-1 (byte-preserving; callers
     * re-decode the bytes as UTF-8) and a 5 second timeout.
     */
    public static String getWebContent(String urlString) throws IOException {
        return getWebContent(urlString, "iso-8859-1", 5000);
    }

    /**
     * Scrapes every result page reachable from {@code url} and writes one CSV
     * line per hospital (name, departments, grade) to a timestamped file in
     * the working directory.
     */
    public static void getHospitolInfo(String url) {
        int pagesSum = 1;
        String path = "./result_" + System.currentTimeMillis() + ".csv";
        String s = null;
        try {
            s = getWebContent(url);
            if (s != null) {
                // The site serves UTF-8 but the fetch decoded it as latin-1
                // (byte-preserving); re-interpret the raw bytes as UTF-8.
                s = new String(s.getBytes("iso-8859-1"), "utf8");
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        // Fix: the original fed a possibly-null string to Jsoup.parse (NPE).
        if (s == null) {
            System.out.println("Failed to fetch: " + url);
            return;
        }
        Document doc = Jsoup.parse(s);
        Elements pageEle = doc.select("div[class=page]>a");
        if (pageEle != null && !"".equals(pageEle.text().trim())) {
            for (Element pages : pageEle) {
                if ("最后一页".equals(pages.text())) {
                    // The "last page" link ends in "...<n>.html"; strip the
                    // non-digits out of the 3 chars before the extension to
                    // recover the page count.
                    String lastUrl = pages.attr("href");
                    int a = lastUrl.lastIndexOf(".");
                    if (a >= 3) { // guard: need 3 chars before the '.'
                        String str = lastUrl.substring(a - 3, a);
                        String digits =
                                Pattern.compile("[^0-9]").matcher(str).replaceAll("").trim();
                        if (!digits.isEmpty()) {
                            pagesSum = Integer.parseInt(digits);
                        }
                    }
                    System.out.println("數據頁數:" + pagesSum);
                }
            }
        }
        // Fix: try-with-resources replaces the original null-unsafe finally
        // block (fw.close() would NPE if FileOutputStream construction failed).
        try (BufferedWriter fw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), "GBK"))) {
            // Base URL minus the trailing page digit, so "<fir><i>.html"
            // addresses page i. Hoisted out of the loop (loop-invariant).
            String fir = url.substring(0, url.lastIndexOf(".") - 1);
            for (int i = 1; i <= pagesSum; i++) {
                System.out.println("當前正在處理第" + i + "頁的數據");
                String page = getWebContent(fir + i + ".html");
                if (page == null) {
                    continue; // skip pages that failed to download
                }
                page = new String(page.getBytes("iso-8859-1"), "utf8");
                doc = Jsoup.parse(page);
                for (Element element : doc.select("div[class=part-list]")) {
                    String title = element.select("h4 > a").text();
                    String keshi = element.select("p > a[target=_self]").text();
                    String dengji = element.select("h4").text();
                    int index = dengji.lastIndexOf("(");
                    int last = dengji.lastIndexOf(")");
                    // Fix: guard the substring — a heading without "(grade)"
                    // used to throw and abort the entire export.
                    String grade = (index >= 0 && last > index)
                            ? dengji.substring(index + 1, last)
                            : "";
                    fw.write(title + "," + keshi + "," + grade + "\n");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Interactive console driver: pick a hospital grade, then a district,
     * optionally a city, and export the matching hospitals to CSV. Repeats
     * until the user answers "N".
     */
    public static void main(String[] args) throws IOException, ParserException {
        // Fix: one Scanner for the whole session; the original created a new
        // Scanner(System.in) for every prompt.
        Scanner in = new Scanner(System.in);
        while (true) {
            System.out.println("获取医院分级信息");
            getHostLevel();
            System.out.println("分级信息获取完毕。请输入分级编号:");
            int levelBianhao = in.nextInt();
            // Fix: bounds-check user input instead of risking
            // IndexOutOfBoundsException.
            if (levelBianhao < 0 || levelBianhao >= levelinfo.size()) {
                System.out.println("Invalid number, try again.");
                continue;
            }
            System.out.println("开始初始化地区数据....");
            MZinfo mzinfoLevel = levelinfo.get(levelBianhao);
            String s = getWebContent(mzinfoLevel.getUrl());
            getNative(s);
            System.out.println("请输入地区编号:");
            int bianhao = in.nextInt();
            if (bianhao < 0 || bianhao >= mzinfos.size()) {
                System.out.println("Invalid number, try again.");
                continue;
            }
            System.out.println("是否按照城市顯示醫院信息[Y/N]");
            String flag = in.next();
            MZinfo mzinfo = mzinfos.get(bianhao);
            // Fix: the original tested "Y".endsWith(flag), which is also true
            // for an empty token; compare for (case-insensitive) equality.
            if ("Y".equalsIgnoreCase(flag)) {
                System.out.println("开始获取城市信息");
                getCityHospital(mzinfo);
                System.out.println("城市信息获取成功,请输入城市编号");
                int cityNum = in.nextInt();
                if (cityNum >= 0 && cityNum < cityinfo.size()) {
                    mzinfo = cityinfo.get(cityNum);
                }
            }
            System.out.println("開始抓取信息:");
            getHospitolInfo(mzinfo.getUrl());
            System.out.println("抓取信息成功,是否继续[Y/N]");
            if ("N".equalsIgnoreCase(in.next())) {
                break;
            }
            // Note: Runtime.exec("cmd cls") was removed — it cannot clear this
            // console (would need "cmd /c cls" with inherited IO) and it
            // leaked the spawned process.
        }
        in.close();
    }

    /**
     * Lists the cities of the given district, stores them in
     * {@link #cityinfo}, and prints "index:name" so the user can pick one.
     */
    private static void getCityHospital(MZinfo mzinfo) throws IOException {
        // Fix: clear previous results so menu indices stay aligned with the
        // printed list (the static list used to keep growing across rounds).
        cityinfo.clear();
        String s = getWebContent(mzinfo.getUrl());
        if (s == null) {
            return; // fetch failed; caller sees an empty city list
        }
        s = new String(s.getBytes("iso-8859-1"), "utf8");
        Document doc = Jsoup.parse(s);
        for (Element el : doc.select("div[class=find-hospital]>h4")) {
            if (mzinfo.getDiqu().equals(el.select("h4>a").text())) {
                Elements ele = el.select(" h4 > div > ul >li");
                for (int i = 0; i < ele.size(); i++) {
                    Element element = ele.get(i);
                    MZinfo city = new MZinfo();
                    city.setDiqu(element.select("a").text());
                    city.setUrl(element.select("a").attr("href"));
                    cityinfo.add(city);
                    System.out.println(" " + i + ":" + city.getDiqu());
                }
            }
        }
    }

    /**
     * Parses the province/district list out of an already-fetched page,
     * stores it in {@link #mzinfos}, and prints "index:name" per entry.
     */
    private static void getNative(String s) throws IOException {
        // Fix: clear previous results so menu indices stay valid (see above).
        mzinfos.clear();
        if (s == null) {
            return; // fetch failed upstream; nothing to parse
        }
        s = new String(s.getBytes("iso-8859-1"), "utf8");
        Document doc = Jsoup.parse(s);
        Elements ele = doc.select("div[class=find-hospital]>h4");
        // Loop over provinces/districts.
        for (int i = 0; i < ele.size(); i++) {
            Elements link = ele.get(i).select("h4>a");
            MZinfo mZinfo = new MZinfo();
            mZinfo.setDiqu(link.text());
            mZinfo.setUrl(link.attr("href"));
            mzinfos.add(mZinfo);
            System.out.println(i + ":" + mZinfo.getDiqu());
        }
    }

    /**
     * Fetches the hospital-grade tabs from the entry page, stores them in
     * {@link #levelinfo}, and prints "index:name" per grade.
     */
    private static void getHostLevel() throws IOException {
        // Fix: clear previous results so menu indices stay valid (see above).
        levelinfo.clear();
        String s = getWebContent("http://hospital.qqyy.com/list-p110000c0a110108k0v1r0d0n0.html");
        if (s == null) {
            return; // fetch failed; the caller will show an empty menu
        }
        s = new String(s.getBytes("iso-8859-1"), "utf8");
        Document doc = Jsoup.parse(s);
        Elements ele = doc.select("div[class=find-departments-tab tab2]>span>a");
        // Loop over the grade tabs.
        for (int i = 0; i < ele.size(); i++) {
            Element element = ele.get(i);
            System.out.println(i + ":" + element.text());
            MZinfo mZinfo = new MZinfo();
            mZinfo.setDiqu(element.text());
            mZinfo.setUrl(element.attr("href"));
            levelinfo.add(mZinfo);
        }
    }
}
MZinfo.java
/**
 * Simple value holder for one scraped link: a display label ("diqu" — a
 * region, grade, or city name) plus the URL it points at.
 */
public class MZinfo {

    /** Display name of the region / grade / city this entry represents. */
    private String diqu;

    /** Link target associated with {@link #diqu}. */
    private String url;

    /** @return the display label, or {@code null} if it was never set */
    public String getDiqu() {
        return this.diqu;
    }

    /** @param diqu the display label to store */
    public void setDiqu(String diqu) {
        this.diqu = diqu;
    }

    /** @return the link target, or {@code null} if it was never set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the link target to store */
    public void setUrl(String url) {
        this.url = url;
    }
}