项目中用到的包结构
项目使用Jsoup进行网络连接与网页解析,使用dbutils进行dao操作,使用c3p0进行数据库连接(连接池)的管理
源代码下载地址:http://download.csdn.net/detail/chen1chen2chen3/9598202 (点击打开链接)
爬虫程序的入口:
- package com.crawlercity.main;
- import org.jsoup.nodes.Document;
- import com.crawlercity.util.HttpUtils;
- import com.crawlercity.util.JsoupUtils;
- public class Main {
- public static void main(String[] args) {
- String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";// 初始解析网页地址
- // 设置代理ip
- HttpUtils.setProxyIp();
- Document document = JsoupUtils.getDocument(url);// 得到的document一定是正常 的document
- JsoupUtils.analysisDocument(document);
- }
- }
用于动态ip代理的工具类HttpUtils
- package com.crawlercity.util;
- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.Random;
/**
 * Helper that routes the JVM's HTTP traffic through a proxy chosen at random
 * from a list bundled on the classpath.
 */
public class HttpUtils {

    /** Classpath location of the proxy list; one {@code host:port} entry per line. */
    private static final String PROXY_LIST_RESOURCE = "/proxyip.txt";

    /**
     * Picks a random entry from the proxy list and installs it through the
     * {@code http.proxyHost} / {@code http.proxyPort} system properties.
     *
     * <p>Blank or malformed lines are skipped up front, so no retry is needed.
     * A missing, unreadable or empty list is reported with an exception instead
     * of retrying forever.
     *
     * @throws IllegalStateException if the proxy list is missing, cannot be read,
     *         or contains no usable {@code host:port} entry
     */
    public static void setProxyIp() {
        List<String> candidates = loadProxyEntries();
        if (candidates.isEmpty()) {
            throw new IllegalStateException(
                    "No usable host:port entry found in classpath resource " + PROXY_LIST_RESOURCE);
        }

        String entry = candidates.get(new Random().nextInt(candidates.size()));
        // Split on the LAST colon so everything before it is treated as the host.
        int separator = entry.lastIndexOf(':');
        String proxyIp = entry.substring(0, separator);
        String proxyPort = entry.substring(separator + 1);

        System.setProperty("http.maxRedirects", "50");
        System.setProperty("proxySet", "true");
        System.setProperty("http.proxyHost", proxyIp);
        System.setProperty("http.proxyPort", proxyPort);
        System.out.println("设置代理ip为:" + proxyIp + "端口号为:" + proxyPort);
    }

    /**
     * Reads the proxy list and returns every trimmed line that has a non-empty
     * host and a non-empty port around its last colon.
     */
    private static List<String> loadProxyEntries() {
        java.io.InputStream stream = HttpUtils.class.getResourceAsStream(PROXY_LIST_RESOURCE);
        if (stream == null) {
            throw new IllegalStateException(
                    "Proxy list not found on classpath: " + PROXY_LIST_RESOURCE);
        }

        List<String> entries = new ArrayList<>();
        // try-with-resources closes the reader (and the underlying stream) on every path.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String candidate = line.trim();
                int separator = candidate.lastIndexOf(':');
                if (separator > 0 && separator < candidate.length() - 1) {
                    entries.add(candidate);
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException(
                    "Failed to read proxy list " + PROXY_LIST_RESOURCE, e);
        }
        return entries;
    }
}
用于获取document对象的工具类JsoupUtils
- public static Document getDocument(String url) {
- try {
- Document document = Jsoup.connect(url).timeout(70).get();
- if(document == null || document.toString().trim().equals("")) {// 表示ip被拦截或者其他情况
- System.out.println("出现ip被拦截或者其他情况");
- HttpUtils.setProxyIp();
- getDocument(url);
- }
- return document;
- } catch (Exception e) { // 链接超时等其他情况
- System.out.println("出现链接超时等其他情况");
- HttpUtils.setProxyIp();// 换代理ip
- getDocument(url);// 继续爬取网页
- }
- return getDocument(url);
- }
用于解析html文档的工具类 JsoupUtils
- public static void analysisDocument(Document document) {
- try {
- String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/";
- CityInfo cityInfo1 = new CityInfo();
- CityInfo cityInfo2 = new CityInfo();
- CityInfo cityInfo3 = new CityInfo();
- CityInfo cityInfo4 = new CityInfo();
- CityInfo cityInfo5 = new CityInfo();
- // 将类型为1(省)的数据分析并且插入
- Elements elements1 = document.getElementsByAttributeValue("class", "provincetr");
- for(Element element1 : elements1) {
- Elements elements2 = element1.getElementsByTag("a");
- for(Element element2 : elements2) {
- cityInfo1.setName(element2.text());
- cityInfo1.setParentId(0);
- cityInfo1.setType(1);
- cityInfo1.setUrl(baseUrl + element2.attr("href"));
- // System.out.println("cityInfo1" + cityInfo1.toString());
- int key1 = DBUtils.insertCityInfo(cityInfo1);
- Document document2 = getDocument(cityInfo1.getUrl());
- Elements elements3 = document2.getElementsByAttributeValue("class", "citytr");
- for(Element element3 : elements3) {
- Elements elements4 = element3.getElementsByTag("a");
- if(elements4.toString().trim().equals("")) {
- Elements diffElements = element3.getElementsByTag("td");
- cityInfo2.setCode(diffElements.get(0).text());
- cityInfo2.setName(diffElements.get(1).text());
- cityInfo2.setParentId(key1);
- cityInfo2.setType(2);
- continue;
- }
- cityInfo2.setCode(elements4.get(0).text());
- cityInfo2.setName(elements4.get(1).text());
- cityInfo2.setUrl(baseUrl + elements4.get(1).attr("href"));
- cityInfo2.setParentId(key1);
- cityInfo2.setType(2);
- /*System.out.println("cityInfo2" + cityInfo2.toString());*/
- int key2 = DBUtils.insertCityInfo(cityInfo2);
- Document document3 = getDocument(cityInfo2.getUrl());
- Elements elements5 = document3.getElementsByAttributeValue("class", "countytr");
- for(Element element5 : elements5) {
- Elements elements6 = element5.getElementsByTag("a");
- if(elements6.toString().trim().equals("")) {
- Elements diffElements = element5.getElementsByTag("td");
- cityInfo3.setCode(diffElements.get(0).text());
- cityInfo3.setName(diffElements.get(1).text());
- cityInfo3.setParentId(key2);
- cityInfo3.setType(3);
- continue;
- }
- cityInfo3.setCode(elements6.get(0).text());
- cityInfo3.setName(elements6.get(1).text());
- String cityInfo2Url = cityInfo2.getUrl();
- cityInfo3.setUrl(cityInfo2Url.substring(0, cityInfo2Url.lastIndexOf("/") + 1) + elements6.get(1).attr("href"));
- cityInfo3.setParentId(key2);
- cityInfo3.setType(3);
- /* System.out.println("cityInfo3" + cityInfo3.toString());*/
- int key3 = DBUtils.insertCityInfo(cityInfo3);
- Document document4 = getDocument(cityInfo3.getUrl());
- Elements elements7 = document4.getElementsByAttributeValue("class", "towntr");
- for(Element element7 : elements7) {
- Elements elements8 = element7.getElementsByTag("a");
- System.out.println(elements8.toString());
- if(elements8.toString().trim().equals("")) {// 表示没有a标签
- Elements diffElements = element7.getElementsByTag("td");
- cityInfo4.setCode(diffElements.get(0).text());
- cityInfo4.setName(diffElements.get(1).text());
- cityInfo4.setParentId(key3);
- cityInfo4.setType(4);
- continue;
- }
- cityInfo4.setCode(elements8.get(0).text());
- cityInfo4.setName(elements8.get(1).text());
- String cityInfo3Url = cityInfo3.getUrl();
- cityInfo4.setUrl(cityInfo3Url.substring(0, cityInfo3Url.lastIndexOf("/") + 1) + elements8.get(1).attr("href"));
- cityInfo4.setParentId(key3);
- cityInfo4.setType(4);
- // System.out.println("cityInfo4" + cityInfo4.toString());
- int key4 = DBUtils.insertCityInfo(cityInfo4);
- Document document5 = getDocument(cityInfo4.getUrl());
- Elements elements9 = document5.getElementsByAttributeValue("class", "villagetr");
- for(Element element8 : elements9) {
- Elements elements10 = element8.getElementsByTag("td");
- cityInfo5.setCode(elements10.get(0).text());
- cityInfo5.setName(elements10.get(2).text());
- cityInfo5.setParentId(key4);
- cityInfo5.setType(5);
- /*System.out.println("cityInfo5" + cityInfo5.toString());*/
- DBUtils.insertCityInfo(cityInfo5);
- }
- }
- }
- }
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
用于dao操作的工具类DBUtils
- package com.crawlercity.util;
- import java.sql.Connection;
- import java.sql.ResultSet;
- import java.sql.SQLException;
- import java.sql.Statement;
- import javax.sql.DataSource;
- import org.apache.commons.dbutils.QueryRunner;
- import org.apache.commons.dbutils.handlers.ScalarHandler;
- import com.crawlercity.model.CityInfo;
- import com.mchange.v2.c3p0.ComboPooledDataSource;
- public class DBUtils {
- private static DataSource ds = null;
- public static Connection getConnection() {
- if(ds == null) {
- ds = new ComboPooledDataSource();
- }
- try {
- return ds.getConnection();
- } catch (SQLException e) {
- e.printStackTrace();
- }
- return null;
- }
- public static DataSource getDataSource() {
- return ds == null ? new ComboPooledDataSource() : ds;
- }
- public static void releaseSource(Connection conn, Statement st, ResultSet rs) {
- try {
- if(rs != null && !rs.isClosed()) {
- rs.close();
- }
- if(st != null && !st.isClosed()) {
- st.close();
- }
- if(conn != null && !conn.isClosed()) {
- conn.close();
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- public static int insertCityInfo(CityInfo cityInfo) {
- Connection connection = DBUtils.getConnection();
- QueryRunner qr = new QueryRunner();
- String sql1 = "insert into cityinfo values (?,?,?,?,?,?)";
- // 返回主键
- String sql2 = "SELECT LAST_INSERT_ID()";
- try {
- int result = qr.update(connection, sql1, null, cityInfo.getParentId(), cityInfo.getType(), cityInfo.getName(), cityInfo.getCode(), cityInfo.getUrl());
- int key = Integer.parseInt(qr.query(connection, sql2, new ScalarHandler<>()).toString());
- releaseSource(connection, null, null);
- return key;
- } catch (SQLException e) {
- e.printStackTrace();
- }
- return 0;
- }
- }
写代码的过程中遇到了一些问题,例如:Jsoup如何在设置编码的同时设置连接超时,如何在超时或者动态ip代理无效的时候重新获取动态ip代理,如何在解析html失败后继续解析等。
通过这次编程发现自己在java网络方面的知识还是有待提高,以后继续努力!