Jsoup爬虫新手一

获取这个页面的信息http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html

第一次写的,省,市,县,镇,村,嵌套好几层循环,总是read timed out错误

 

import java.sql.Connection;
import java.sql.DriverManager;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Util_Spider_Gov_01 {
	//频繁报错,一次抓不完
	private static final int YEAR = 2016;

	private static String startUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html";
	
	private static int total = 1;
	
	public static void main(String[] args) throws Exception {

		Document document_province;

		try {
			document_province = Jsoup.connect(startUrl)
					.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko").timeout(5000)
					.get();
		} catch (Exception e) {
			document_province = Jsoup.connect(startUrl)
					.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko").timeout(5000)
					.get();
			e.printStackTrace();
		}

		Elements elements_provinces = document_province.body().select(".provincetr > td > a");

		// Connection connection = getConnection();

		// PreparedStatement ps_province_insert = connection.prepareStatement("insert
		// into province(name) value(?)");
		// PreparedStatement ps_province_select = connection.prepareStatement("select *
		// from province where name = ?");

		int i = 1;
		for (Element province : elements_provinces) {

			String url_province = province.absUrl("href");
			Document document_city;
			try {
				document_city = Jsoup.connect(url_province)
						.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
						.timeout(2000).get();
			} catch (Exception e) {
				try {
					document_city = Jsoup.connect(url_province)
							.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
							.timeout(2000).get();
					System.out.println("省份连接一级错误:");
				} catch (Exception e1) {
					document_city = Jsoup.connect(url_province)
							.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
							.timeout(2000).get();
					// TODO Auto-generated catch block
					e1.printStackTrace();
				}
				e.printStackTrace();
			}

			Elements elements_cities = document_city.body().select(".citytr");

			for (Element city : elements_cities) {

				Elements city_info = city.select("a[href]");

				Element city_markCode_element = city_info.get(0);
				String city_markCode = city_markCode_element.text();

				Element city_name_element = city_info.get(1);
				String city_name = city_name_element.text();

				String url_county = city_name_element.absUrl("href");

				Document document_county;
				try {
					document_county = Jsoup.connect(url_county)
							.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
							.timeout(2000).get();
				} catch (Exception e) {
					// TODO Auto-generated catch block
					try {
						document_county = Jsoup.connect(url_county)
								.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
								.timeout(2000).get();
					} catch (Exception e1) {
						document_county = Jsoup.connect(url_county)
								.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
								.timeout(2000).get();
						// TODO Auto-generated catch block
						e1.printStackTrace();
					}
					e.printStackTrace();
				}

				Elements elements_counties = document_county.body().select(".countytr");

				for (Element county : elements_counties) {

					Elements county_info = county.select("a[href]");
					if (county_info.size() > 0) {

						Element county_markCode_element = county_info.get(0);
						String county_markCode = county_markCode_element.text();

						Element county_name_element = county_info.get(1);
						String county_name = county_name_element.text();

						String url_town = county_name_element.absUrl("href");

						Document document_town;
						try {
							document_town = Jsoup.connect(url_town)
									.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
									.timeout(2000).get();
						} catch (Exception e) {
							try {
								document_town = Jsoup.connect(url_town)
										.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
										.timeout(2000).get();
							} catch (Exception e1) {
								document_town = Jsoup.connect(url_town)
										.userAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
										.timeout(2000).get();
								// TODO Auto-generated catch block
								e1.printStackTrace();
							}
							// TODO Auto-generated catch block
							e.printStackTrace();
						}
						Elements elements_town = document_town.body().select(".towntr");

						for (Element town : elements_town) {

							Elements town_info = town.select("a[href]");

							Element town_markCode_element = town_info.get(0);
							String town_markCode = town_markCode_element.text();

							Element town_name_element = town_info.get(1);
							String town_name = town_name_element.text();

							String url_village = town_name_element.absUrl("href");

							Document document_village;
							try {
								document_village = Jsoup.connect(url_village)
										.userAgent(
												"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
										.timeout(2000).get();
							} catch (Exception e) {
								try {
									document_village = Jsoup.connect(url_village)
											.userAgent(
													"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
											.timeout(2000).get();
								} catch (Exception e1) {
									document_village = Jsoup.connect(url_village)
											.userAgent(
													"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")
											.timeout(2000).get();
									// TODO Auto-generated catch block
									e1.printStackTrace();
								}
								// TODO Auto-generated catch block
								e.printStackTrace();
							}

							Elements elements_village = document_village.body().select(".villagetr");

							for (Element village : elements_village) {

								Elements village_info = village.select("td");
								Element village_markCode_element = village_info.get(0);
								Element village_classCode_element = village_info.get(1);
								Element village_name_element = village_info.get(2);
								System.out.println("第"+total+"个村庄名字:"+village_name_element.text());
								System.out.println("=============");
								total++;
								
							}
						}
					}
				}
			}
		}
		System.out.println("抓完了");
	}

	public static Connection getConnection() throws Exception {
		Class.forName("com.mysql.jdbc.Driver");
		Connection connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/villagecount", "root", "root");
		return connection;
	}
}

 

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值