简单的网页内容抓取实例(携程酒店)

网页抓取的方法有很多种,这里介绍一种简单的方法:虽然暴力,但能快速获取有规律的网页内容。

比如携程酒店的网页内容,希望得到以下基本信息:

酒店名称

英文名称

城市

省份

地址

纬度

经度

经纬度(String 类型)

电话

酒店星级


这里是代码(带测试样例)

package webTextGrabber;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

/**
 * Extracts basic hotel facts (names, city, province, address, coordinates,
 * telephone, star rating) from the raw HTML source of a hotel page by looking
 * for fixed text markers and slicing the text that follows them.
 *
 * <p>An instance is a mutable holder for the values parsed from one page. It may
 * be reused for several pages: {@link #setAll(String)} clears every field before
 * parsing, and the optional fields (English name, telephone, stars) are reset
 * when their marker is absent so that values from a previous page never leak
 * into the next one.
 */
public class WebContent {
    // hotelId, hotelUrl, cityId can be obtained at CtripUtil class
    private String hotelName;
    private String hotelEname;
    private String cityName;
    private String provinceName;
    private String address;
    private double lat;
    private double lng;
    private String coordinates;
    private String tel;
    private int hotelStars;

    /**
     * Downloads the page at {@code url} and returns its content as one string.
     * The response is decoded as UTF-8 and the lines are concatenated without
     * line separators.
     *
     * @param url address of the page to download
     * @return the page content with line breaks removed
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public String getUrlSource(String url) throws IOException {
        URL webpage = new URL(url);
        URLConnection connection = webpage.openConnection();
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even
        // when readLine() fails part-way through the download.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                page.append(inputLine);
            }
        }
        return page.toString();
    }

    /**
     * Parses every field from the given page source. All fields are cleared
     * first, so after a failure the instance holds only the fields parsed before
     * the failing step and nothing from an earlier page.
     *
     * @param str page source as returned by {@link #getUrlSource(String)}
     * @throws Exception if any required field cannot be parsed; the original
     *         failure is available as the cause
     */
    public void setAll(String str) throws Exception {
        reset();
        try {
            setHotelName(str);
            setHotelEname(str);
            setCityName(str);
            setProvinceName(str);
            setAddress(str);
            setLat(str);
            setLng(str);
            setCoordinates();
            setTel(str);
            setHotelStars(str);
        } catch (Exception e) {
            throw new Exception("Failed to parse hotel page source", e);
        }
    }

    /**
     * Parses the hotel name: the element text following the {@code cn_n} marker.
     *
     * @param str page source
     * @return the parsed hotel name
     * @throws Exception if the marker or the surrounding tag delimiters are missing
     */
    public String setHotelName(String str) throws Exception {
        try {
            hotelName = elementTextAt(str, requireIndex(str, "cn_n", 0), "cn_n");
        } catch (Exception e) {
            throw new Exception(e);
        }
        return hotelName;
    }

    /**
     * Parses the English hotel name: the element text following the
     * {@code en_n} marker. The field is optional; when the marker is absent the
     * stored value is reset to {@code null}.
     *
     * @param str page source
     * @return the parsed English name, or {@code null} if the page has none
     * @throws Exception if the marker is present but the tag delimiters are missing
     */
    public String setHotelEname(String str) throws Exception {
        try {
            int index = str.indexOf("en_n");
            if (index == -1) {
                // Reset instead of keeping the value parsed from a previous page.
                hotelEname = null;
                return hotelEname;
            }
            hotelEname = elementTextAt(str, index, "en_n");
        } catch (Exception e) {
            throw new Exception(e);
        }
        return hotelEname;
    }

    /**
     * Parses the city name: the text that starts 5 characters after the first
     * {@code city} marker and ends one character before the next {@code >}.
     *
     * @param str page source
     * @return the parsed city name
     * @throws IllegalArgumentException if the marker or the closing {@code >} is missing
     */
    public String setCityName(String str) {
        int index = requireIndex(str, "city", 0);
        int end = requireIndex(str, ">", index) - 1;
        cityName = slice(str, index + 5, end, "city");
        return cityName;
    }

    /**
     * Parses the province name: the text that starts 9 characters after the
     * first {@code province} marker and ends at the next {@code ;}.
     *
     * @param str page source
     * @return the parsed province name
     * @throws IllegalArgumentException if the marker or the terminating {@code ;} is missing
     */
    public String setProvinceName(String str) {
        int index = requireIndex(str, "province", 0);
        int end = requireIndex(str, ";", index);
        provinceName = slice(str, index + 9, end, "province");
        return provinceName;
    }

    /**
     * Builds the full address: province (plus city when it differs from the
     * province) followed by the text that starts 5 characters after the last
     * address marker and ends at the next {@code ;}.
     *
     * @param str page source
     * @return the assembled address
     * @throws IllegalStateException if province or city has not been parsed yet
     * @throws IllegalArgumentException if the address marker or the terminating
     *         {@code ;} is missing
     */
    public String setAddress(String str) {
        if (provinceName == null || cityName == null) {
            throw new IllegalStateException(
                    "setProvinceName and setCityName must be called before setAddress");
        }
        int index = str.lastIndexOf("酒店地址");
        if (index == -1) {
            throw new IllegalArgumentException("Address marker not found in page source");
        }
        int end = requireIndex(str, ";", index);
        String street = slice(str, index + 5, end, "酒店地址");

        // Avoid repeating the name when province and city are the same.
        String region = provinceName.equals(cityName) ? provinceName : provinceName + cityName;
        address = region + street;
        return address;
    }

    /**
     * Parses the latitude from the {@code content=} attribute that follows the
     * {@code latitude} marker.
     *
     * @param str page source
     * @return the parsed latitude
     * @throws IllegalArgumentException if the markers are missing or the value is
     *         not a number
     */
    public double setLat(String str) {
        lat = parseMetaNumber(str, "latitude");
        return lat;
    }

    /**
     * Parses the longitude from the {@code content=} attribute that follows the
     * {@code longitude} marker.
     *
     * @param str page source
     * @return the parsed longitude
     * @throws IllegalArgumentException if the markers are missing or the value is
     *         not a number
     */
    public double setLng(String str) {
        lng = parseMetaNumber(str, "longitude");
        return lng;
    }

    /**
     * Formats the currently stored latitude and longitude as {@code "lat, lng"}.
     *
     * @return the formatted coordinate pair
     */
    public String setCoordinates() {
        coordinates = lat + ", " + lng;
        return coordinates;
    }

    /**
     * Parses the telephone number that follows the telephone marker. The number
     * is read as the run of ASCII digits and hyphens after the marker, so
     * numbers of any length are captured in full. The field is optional; when
     * the marker is absent the stored value is reset to {@code null}.
     *
     * @param str page source
     * @return the parsed telephone number, or {@code null} if the page has none
     * @throws Exception if {@code str} cannot be inspected (for example it is {@code null})
     */
    public String setTel(String str) throws Exception {
        try {
            int index = str.indexOf("电话0");
            if (index == -1) {
                // Reset instead of keeping the value parsed from a previous page.
                tel = null;
                return tel;
            }
            // The marker ends with the leading '0' of the number, which is part of it.
            int start = index + 2;
            int end = start;
            while (end < str.length() && isTelChar(str.charAt(end))) {
                end++;
            }
            tel = str.substring(start, end);
        } catch (Exception e) {
            throw new Exception(e);
        }
        return tel;
    }

    /**
     * Parses the star rating: the two characters that directly follow the
     * {@code hotel_stars} marker, read as an integer. The field is optional;
     * when the marker is absent the stored value is reset to {@code 0}.
     *
     * @param str page source
     * @return the parsed star rating, or {@code 0} if the page has none
     * @throws Exception if the marker is present but is not followed by a two
     *         character integer
     */
    public int setHotelStars(String str) throws Exception {
        try {
            int index = str.indexOf("hotel_stars");
            if (index == -1) {
                // Reset instead of keeping the value parsed from a previous page.
                hotelStars = 0;
                return hotelStars;
            }
            hotelStars = Integer.parseInt(str.substring(index + 11, index + 13));
        } catch (Exception e) {
            throw new Exception(e);
        }
        return hotelStars;
    }

    /** @return the parsed hotel name, or {@code null} if none has been parsed */
    public String getHotelName() {
        return hotelName;
    }

    /** @return the parsed English hotel name, or {@code null} if none has been parsed */
    public String getHotelEname() {
        return hotelEname;
    }

    /** @return the parsed city name, or {@code null} if none has been parsed */
    public String getCityName() {
        return cityName;
    }

    /** @return the parsed province name, or {@code null} if none has been parsed */
    public String getProvinceName() {
        return provinceName;
    }

    /** @return the assembled address, or {@code null} if none has been parsed */
    public String getAddress() {
        return address;
    }

    /** @return the parsed latitude */
    public double getLat() {
        return lat;
    }

    /** @return the parsed longitude */
    public double getLng() {
        return lng;
    }

    /** @return the formatted {@code "lat, lng"} pair, or {@code null} if not yet built */
    public String getCoordinates() {
        return coordinates;
    }

    /** @return the parsed telephone number, or {@code null} if none has been parsed */
    public String getTel() {
        return tel;
    }

    /** @return the parsed star rating, or {@code 0} if none has been parsed */
    public int getHotelStars() {
        return hotelStars;
    }

    /** Clears every parsed field back to its initial value. */
    private void reset() {
        hotelName = null;
        hotelEname = null;
        cityName = null;
        provinceName = null;
        address = null;
        lat = 0;
        lng = 0;
        coordinates = null;
        tel = null;
        hotelStars = 0;
    }

    /** Returns the index of {@code marker} at or after {@code from}, failing loudly if absent. */
    private static int requireIndex(String str, String marker, int from) {
        int index = str.indexOf(marker, from);
        if (index == -1) {
            throw new IllegalArgumentException(
                    "Marker \"" + marker + "\" not found in page source");
        }
        return index;
    }

    /** Returns {@code str[start, end)}, rejecting ranges that indicate an unexpected page layout. */
    private static String slice(String str, int start, int end, String marker) {
        if (start < 0 || end < start || end > str.length()) {
            throw new IllegalArgumentException(
                    "Malformed value after marker \"" + marker + "\"");
        }
        return str.substring(start, end);
    }

    /** Returns the text between the first {@code >} and the first {@code <} found at or after {@code index}. */
    private static String elementTextAt(String str, int index, String marker) {
        int start = requireIndex(str, ">", index) + 1;
        int end = requireIndex(str, "<", index);
        return slice(str, start, end, marker);
    }

    /**
     * Parses the number stored in the {@code content=} attribute after {@code marker}.
     * The value starts 9 characters after {@code content=} and ends 2 characters
     * before the next {@code />}.
     */
    private static double parseMetaNumber(String str, String marker) {
        int index = requireIndex(str, marker, 0);
        int start = requireIndex(str, "content=", index) + 9;
        int end = requireIndex(str, "/>", index) - 2;
        return Double.parseDouble(slice(str, start, end, marker));
    }

    /** Tells whether {@code c} may appear inside a telephone number (ASCII digit or hyphen). */
    private static boolean isTelChar(char c) {
        return (c >= '0' && c <= '9') || c == '-';
    }

    /**
     * Sample run: downloads a fixed list of hotel pages and prints the fields
     * parsed from each one. Pages that are empty or contain the text checked
     * below are skipped (presumably a verification/captcha page - confirm
     * against the site). Pages that fail to parse are reported and skipped.
     *
     * @param args unused
     * @throws IOException if a page cannot be downloaded
     */
    public static void main(final String[] args) throws IOException {
        final List<String> list = new ArrayList<String>();
        list.add("http://hotels.ctrip.com/hotel/427952.html");
        list.add("http://hotels.ctrip.com/hotel/671.html");
        list.add("http://hotels.ctrip.com/hotel/2005959.html");
        list.add("http://hotels.ctrip.com/hotel/481810.html");
        list.add("http://hotels.ctrip.com/hotel/2104633.html");
        list.add("http://hotels.ctrip.com/hotel/1481502.html");
        list.add("http://hotels.ctrip.com/hotel/1720124.html");
        list.add("http://hotels.ctrip.com/hotel/2165407.html");
        list.add("http://hotels.ctrip.com/hotel/1636803.html");
        list.add("http://hotels.ctrip.com/hotel/371188.html");

        final WebContent wc = new WebContent();

        for (String hotelUrl : list) {
            String webinfo = wc.getUrlSource(hotelUrl);
            if (webinfo == null || webinfo.length() == 0 || webinfo.indexOf("验证") != -1) {
                continue;
            }
            try {
                wc.setAll(webinfo);
            } catch (Exception e) {
                e.printStackTrace();
                // Do not print a half-parsed record for a page that failed.
                continue;
            }
            System.out.println(wc.getHotelName());
            System.out.println(wc.getHotelEname());
            System.out.println(wc.getCityName());
            System.out.println(wc.getProvinceName());
            System.out.println(wc.getAddress());
            System.out.println(wc.getLat());
            System.out.println(wc.getLng());
            System.out.println(wc.getCoordinates());
            System.out.println(wc.getTel());
            System.out.println(wc.getHotelStars());
        }
    }
}


这里是输出结果:

北京金隅喜来登酒店
Sheraton Beijing Dongcheng Hotel
北京
北京
北京北三环东路36号
39.97346873
116.4163028
39.97346873, 116.4163028
010-57988888
5
上海大厦
Broadway Mansions Hotel
上海
上海
上海北苏州路20号
31.250007605066
121.49663745291
31.250007605066, 121.49663745291
021-63246260
5
宁国伯爵王朝大酒店
Bojue Dynasty Hotel
宁国
安徽
安徽宁国宁阳西路155号
30.606036618238
118.97454952709
30.606036618238, 118.97454952709
0563-4188888
5
合肥天鹅湖大酒店
Swan Lake Hotel
合肥
安徽
安徽合肥政务文化新区东流路888号
31.823922092928
117.23604371154
31.823922092928, 117.23604371154
0551-6353666
5
宁国都市阳光酒店

宁国
安徽
安徽宁国中溪北路8号
30.633710023238
118.98375012382
30.633710023238, 118.98375012382
0563-4101788
3
达拉特旗东达假日酒店
Dongda Holiday Hotel
达拉特旗
内蒙古
内蒙古达拉特旗树林召西街南侧
40.402906010224
110.00925942075
40.402906010224, 110.00925942075
0477-3963888
3
大理和舀田园度假酒店

大理市
云南
云南大理市城北村(0872-2475995)
25.861913102242
100.14410073281
25.861913102242, 100.14410073281
0872-2475995
3
喀纳斯贾登峪回家休闲酒店(酒店区)
Connectedhome
布尔津
新疆
新疆布尔津喀纳斯贾登峪生活区游客接待基地一区
48.501779520655
87.157369327333
48.501779520655, 87.157369327333
0906-6327598
3
欣得酒店(北京石佛营店)

北京
北京
北京石佛营东里99号
39.941142006731
116.51243647991
39.941142006731, 116.51243647991
010-85814122
3
青岛颐中皇冠假日酒店
Crowne Plaza Qingdao
青岛
山东
山东青岛香港中路76号
36.070022690161
120.40615095949
36.070022690161, 120.40615095949
0532-8571888
3

当然,在具体的工作学习使用中,可以将数据存成相应的数据格式来保存在数据库中。

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值