网络爬虫爬取全国省市区(动态ip代理的获取,实现对ip限制的突破)

项目中用到的包结构

项目使用Jsoup进行网络连接与网页解析,使用dbutils进行dao操作,使用c3p0进行数据库连接的管理

源代码下载地址:http://download.csdn.net/detail/chen1chen2chen3/9598202 (点击打开链接)


爬虫程序的入口:

[java]  view plain  copy
  1. package com.crawlercity.main;  
  2.   
  3.   
  4. import org.jsoup.nodes.Document;  
  5.   
  6. import com.crawlercity.util.HttpUtils;  
  7. import com.crawlercity.util.JsoupUtils;  
  8.   
  9. public class Main {  
  10.     public static void main(String[] args) {  
  11.         String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";// 初始解析网页地址  
  12.         // 设置代理ip  
  13.         HttpUtils.setProxyIp();  
  14.         Document document = JsoupUtils.getDocument(url);// 得到的document一定是正常 的document  
  15.         JsoupUtils.analysisDocument(document);  
  16.           
  17.     }  
  18. }  

用于动态ip代理的工具类HttpUtils

[java]  view plain  copy
  1. package com.crawlercity.util;  
  2.   
  3.   
  4. import java.io.BufferedReader;  
  5. import java.io.IOException;  
  6. import java.io.InputStreamReader;  
  7. import java.util.ArrayList;  
  8. import java.util.List;  
  9. import java.util.Random;  
  10.   
  /**
   * Utility for routing the JVM's HTTP traffic through a randomly chosen proxy.
   *
   * <p>Proxy candidates are read from the classpath resource {@code /proxyip.txt},
   * one {@code host:port} entry per line.
   */
  public class HttpUtils {

      /** Classpath location of the proxy list. */
      private static final String PROXY_LIST_RESOURCE = "/proxyip.txt";

      /**
       * Picks a random valid entry from the proxy list and installs it through the
       * {@code http.proxyHost} / {@code http.proxyPort} system properties.
       *
       * <p>Malformed lines are discarded before the random pick, so no retry is needed.
       * If the resource is missing, unreadable or contains no usable entry, a message is
       * printed and the current proxy settings are left untouched; the method never throws.
       */
      public static void setProxyIp() {
          List<String[]> candidates = loadProxies();
          if (candidates.isEmpty()) {
              // Retrying cannot help here: the list itself has nothing to offer.
              System.out.println("没有可用的代理ip,保持当前网络设置");
              return;
          }

          String[] hostAndPort = candidates.get(new Random().nextInt(candidates.size()));
          String proxyIp = hostAndPort[0];
          String proxyPort = hostAndPort[1];

          System.setProperty("http.maxRedirects", "50");
          System.getProperties().setProperty("proxySet", "true");
          System.getProperties().setProperty("http.proxyHost", proxyIp);
          System.getProperties().setProperty("http.proxyPort", proxyPort);

          System.out.println("设置代理ip为:" + proxyIp + "端口号为:" + proxyPort);
      }

      /**
       * Reads the proxy list and returns every well-formed entry as a {host, port} pair.
       *
       * @return the parsed entries; empty when the resource is absent or holds no valid line
       */
      private static List<String[]> loadProxies() {
          List<String[]> proxies = new ArrayList<>();
          java.io.InputStream in = HttpUtils.class.getResourceAsStream(PROXY_LIST_RESOURCE);
          if (in == null) {
              return proxies;
          }
          try (BufferedReader reader = new BufferedReader(
                  new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
              String line;
              while ((line = reader.readLine()) != null) {
                  String entry = line.trim();
                  // Split on the last colon; both the host and the port part must be non-empty.
                  int colon = entry.lastIndexOf(':');
                  if (colon <= 0 || colon == entry.length() - 1) {
                      continue;
                  }
                  proxies.add(new String[] {entry.substring(0, colon), entry.substring(colon + 1)});
              }
          } catch (IOException e) {
              // Keep whatever was read before the failure.
              e.printStackTrace();
          }
          return proxies;
      }
  }

用于获取document对象的工具类JsoupUtils

[java]  view plain  copy
  1. public static Document getDocument(String url) {  
  2.             try {  
  3.                 Document document = Jsoup.connect(url).timeout(70).get();  
  4.                   
  5.                 if(document == null || document.toString().trim().equals("")) {// 表示ip被拦截或者其他情况  
  6.                     System.out.println("出现ip被拦截或者其他情况");  
  7.                     HttpUtils.setProxyIp();  
  8.                     getDocument(url);  
  9.                 }  
  10.                   
  11.                 return document;  
  12.             } catch (Exception e) { // 链接超时等其他情况  
  13.                 System.out.println("出现链接超时等其他情况");  
  14.                 HttpUtils.setProxyIp();// 换代理ip  
  15.                 getDocument(url);// 继续爬取网页  
  16.             }  
  17.             return getDocument(url);  
  18.         }  

用于解析html文档的工具类 JsoupUtils

[java]  view plain  copy
  1. public static void analysisDocument(Document document) {  
  2.         try {  
  3.             String baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/";  
  4.             CityInfo cityInfo1 = new CityInfo();  
  5.             CityInfo cityInfo2 = new CityInfo();  
  6.             CityInfo cityInfo3 = new CityInfo();  
  7.             CityInfo cityInfo4 = new CityInfo();  
  8.             CityInfo cityInfo5 = new CityInfo();  
  9.             // 将类型为1(省)的数据分析并且插入  
  10.             Elements elements1 = document.getElementsByAttributeValue("class""provincetr");  
  11.             for(Element element1 : elements1) {  
  12.                 Elements elements2 = element1.getElementsByTag("a");  
  13.                 for(Element element2 : elements2) {  
  14.                     cityInfo1.setName(element2.text());  
  15.                     cityInfo1.setParentId(0);  
  16.                     cityInfo1.setType(1);  
  17.                     cityInfo1.setUrl(baseUrl + element2.attr("href"));  
  18. //                  System.out.println("cityInfo1" + cityInfo1.toString());  
  19.                     int key1 = DBUtils.insertCityInfo(cityInfo1);  
  20.                     Document document2 = getDocument(cityInfo1.getUrl());  
  21.                     Elements elements3 = document2.getElementsByAttributeValue("class""citytr");  
  22.                     for(Element element3 : elements3) {  
  23.                         Elements elements4 = element3.getElementsByTag("a");  
  24.                         if(elements4.toString().trim().equals("")) {  
  25.                             Elements diffElements = element3.getElementsByTag("td");  
  26.                             cityInfo2.setCode(diffElements.get(0).text());  
  27.                             cityInfo2.setName(diffElements.get(1).text());  
  28.                             cityInfo2.setParentId(key1);  
  29.                             cityInfo2.setType(2);  
  30.                             continue;  
  31.                         }  
  32.                         cityInfo2.setCode(elements4.get(0).text());  
  33.                         cityInfo2.setName(elements4.get(1).text());  
  34.                         cityInfo2.setUrl(baseUrl + elements4.get(1).attr("href"));  
  35.                         cityInfo2.setParentId(key1);  
  36.                         cityInfo2.setType(2);  
  37.                         /*System.out.println("cityInfo2" + cityInfo2.toString());*/  
  38.                         int key2 = DBUtils.insertCityInfo(cityInfo2);  
  39.                         Document document3 = getDocument(cityInfo2.getUrl());  
  40.                         Elements elements5 = document3.getElementsByAttributeValue("class""countytr");  
  41.                         for(Element element5 : elements5) {  
  42.                             Elements elements6 = element5.getElementsByTag("a");  
  43.                             if(elements6.toString().trim().equals("")) {  
  44.                                 Elements diffElements = element5.getElementsByTag("td");  
  45.                                 cityInfo3.setCode(diffElements.get(0).text());  
  46.                                 cityInfo3.setName(diffElements.get(1).text());  
  47.                                 cityInfo3.setParentId(key2);  
  48.                                 cityInfo3.setType(3);  
  49.                                 continue;  
  50.                             }  
  51.                             cityInfo3.setCode(elements6.get(0).text());  
  52.                             cityInfo3.setName(elements6.get(1).text());  
  53.                             String cityInfo2Url = cityInfo2.getUrl();  
  54.                             cityInfo3.setUrl(cityInfo2Url.substring(0, cityInfo2Url.lastIndexOf("/") + 1) + elements6.get(1).attr("href"));  
  55.                             cityInfo3.setParentId(key2);  
  56.                             cityInfo3.setType(3);  
  57.                     /*      System.out.println("cityInfo3" + cityInfo3.toString());*/  
  58.                             int key3 = DBUtils.insertCityInfo(cityInfo3);  
  59.                             Document document4 = getDocument(cityInfo3.getUrl());  
  60.                             Elements elements7 = document4.getElementsByAttributeValue("class""towntr");  
  61.                             for(Element element7 : elements7) {  
  62.                                 Elements elements8 = element7.getElementsByTag("a");  
  63.                                 System.out.println(elements8.toString());  
  64.                                 if(elements8.toString().trim().equals("")) {// 表示没有a标签  
  65.                                     Elements diffElements = element7.getElementsByTag("td");  
  66.                                     cityInfo4.setCode(diffElements.get(0).text());  
  67.                                     cityInfo4.setName(diffElements.get(1).text());  
  68.                                     cityInfo4.setParentId(key3);  
  69.                                     cityInfo4.setType(4);  
  70.                                     continue;  
  71.                                 }  
  72.                                 cityInfo4.setCode(elements8.get(0).text());  
  73.                                 cityInfo4.setName(elements8.get(1).text());  
  74.                                 String cityInfo3Url = cityInfo3.getUrl();  
  75.                                 cityInfo4.setUrl(cityInfo3Url.substring(0, cityInfo3Url.lastIndexOf("/") + 1) + elements8.get(1).attr("href"));  
  76.                                 cityInfo4.setParentId(key3);  
  77.                                 cityInfo4.setType(4);  
  78. //                              System.out.println("cityInfo4" + cityInfo4.toString());  
  79.                                 int key4 = DBUtils.insertCityInfo(cityInfo4);  
  80.                                 Document document5 = getDocument(cityInfo4.getUrl());  
  81.                                 Elements elements9 = document5.getElementsByAttributeValue("class""villagetr");  
  82.                                 for(Element element8 : elements9) {  
  83.                                     Elements elements10 = element8.getElementsByTag("td");  
  84.                                     cityInfo5.setCode(elements10.get(0).text());  
  85.                                     cityInfo5.setName(elements10.get(2).text());  
  86.                                     cityInfo5.setParentId(key4);  
  87.                                     cityInfo5.setType(5);  
  88.                                     /*System.out.println("cityInfo5" + cityInfo5.toString());*/  
  89.                                     DBUtils.insertCityInfo(cityInfo5);  
  90.                                 }  
  91.                             }  
  92.                         }  
  93.                     }  
  94.                 }  
  95.             }  
  96.               
  97.         } catch (Exception e) {  
  98.             e.printStackTrace();  
  99.         }  

用于dao操作的工具类DBUtils

[java]  view plain  copy
  1. package com.crawlercity.util;  
  2.   
  3. import java.sql.Connection;  
  4. import java.sql.ResultSet;  
  5. import java.sql.SQLException;  
  6. import java.sql.Statement;  
  7.   
  8. import javax.sql.DataSource;  
  9.   
  10. import org.apache.commons.dbutils.QueryRunner;  
  11. import org.apache.commons.dbutils.handlers.ScalarHandler;  
  12.   
  13. import com.crawlercity.model.CityInfo;  
  14. import com.mchange.v2.c3p0.ComboPooledDataSource;  
  15.   
  16. public class DBUtils {  
  17.   
  18.     private static DataSource ds = null;  
  19.   
  20.     public static Connection getConnection() {  
  21.         if(ds == null) {  
  22.             ds = new ComboPooledDataSource();  
  23.         }  
  24.         try {  
  25.             return ds.getConnection();  
  26.         } catch (SQLException e) {  
  27.             e.printStackTrace();  
  28.         }  
  29.         return null;  
  30.     }  
  31.   
  32.     public static DataSource getDataSource() {  
  33.         return ds == null ? new ComboPooledDataSource() : ds;  
  34.     }  
  35.       
  36.   
  37.     public static void releaseSource(Connection conn, Statement st, ResultSet rs) {  
  38.         try {  
  39.             if(rs != null && !rs.isClosed()) {  
  40.                 rs.close();  
  41.             }  
  42.             if(st != null && !st.isClosed()) {  
  43.                 st.close();  
  44.             }  
  45.             if(conn != null && !conn.isClosed()) {  
  46.                 conn.close();  
  47.             }  
  48.         } catch (Exception e) {  
  49.             e.printStackTrace();  
  50.         }  
  51.     }  
  52.       
  53.     public static int insertCityInfo(CityInfo cityInfo) {  
  54.           
  55.         Connection connection = DBUtils.getConnection();  
  56.         QueryRunner qr = new QueryRunner();  
  57.         String sql1 = "insert into cityinfo values (?,?,?,?,?,?)";  
  58.         // 返回主键  
  59.         String sql2 = "SELECT LAST_INSERT_ID()";  
  60.           
  61.         try {  
  62.             int result = qr.update(connection, sql1, null, cityInfo.getParentId(), cityInfo.getType(), cityInfo.getName(), cityInfo.getCode(), cityInfo.getUrl());  
  63.             int key = Integer.parseInt(qr.query(connection, sql2, new ScalarHandler<>()).toString());  
  64.             releaseSource(connection, null, null);  
  65.             return key;  
  66.         } catch (SQLException e) {  
  67.             e.printStackTrace();  
  68.         }  
  69.         return 0;  
  70.     }  
  71. }  

写代码的过程中出现了一些问题如: Jsoup如何在设置编码的同时设置连接超时,如何在超时或者动态ip代理无效的时候重新获取动态ip代理,如何在解析html失败后继续解析等。

通过这次编程发现自己在java网络方面的知识还有待提高,以后继续努力!

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值