2021-08-11 转载-java爬虫

在这个数据为王的时代,爬虫的应用越来越广泛。对于一个萌新程序员来说,如果你要做爬虫,那么Python是你的不二之选。但是对于那些老腊肉的Java程序员(抑或你是程序媛)来说,想使用Java做爬虫也不是不行,只是没有Python那么方便。身为一块Java老腊肉,我在此记录一下自己用Java做网络爬虫时使用的工具类。

在pom.xml文件中引入commons-lang3 依赖:


   
   
  <dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.6</version>
  </dependency>

 SpiderHttpUtils 工具类完整代码如下: 


   
   
  import java.io.BufferedInputStream;
  import java.io.BufferedReader;
  import java.io.ByteArrayOutputStream;
  import java.io.InputStreamReader;
  import java.io.UnsupportedEncodingException;
  import java.net.HttpURLConnection;
  import java.net.URL;
  import java.net.URLConnection;
  import java.net.URLEncoder;
  import java.security.cert.CertificateException;
  import java.security.cert.X509Certificate;
  import java.util.Map;
  import java.util.logging.Level;
  import java.util.logging.Logger;
  import javax.net.ssl.HttpsURLConnection;
  import javax.net.ssl.SSLContext;
  import javax.net.ssl.SSLSocketFactory;
  import javax.net.ssl.TrustManager;
  import javax.net.ssl.X509TrustManager;
  import org.apache.commons.lang3.StringUtils;
  19. public class SpiderHttpUtils {
  20. public static String sendGet(boolean isHttps, String requestUrl, Map<String, String> params,
  21. Map<String, String> headers, String charSet) {
  22. if (StringUtils.isBlank(requestUrl)) {
  23. return "";
  24. }
  25. if (StringUtils.isBlank(charSet)) {
  26. charSet = "UTF-8";
  27. }
  28. URL url = null;
  29. URLConnection conn = null;
  30. BufferedReader br = null;
  31. try {
  32. // 创建连接
  33. url = new URL(requestUrl + "?" + requestParamsBuild(params));
  34. if (isHttps) {
  35. conn = getHttpsUrlConnection(url);
  36. } else {
  37. conn = (HttpURLConnection) url.openConnection();
  38. }
  39. // 设置请求头通用属性
  40. // 指定客户端能够接收的内容类型
  41. conn.setRequestProperty( "Accept", "*/*");
  42. // 设置连接的状态为长连接
  43. conn.setRequestProperty( "Connection", "keep-alive");
  44. // 设置发送请求的客户机系统信息
  45. conn.setRequestProperty( "User-Agent",
  46. "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
  47. // 设置请求头自定义属性
  48. if ( null != headers && headers.size() > 0) {
  49. for (Map.Entry<String, String> entry : headers.entrySet()) {
  50. conn.setRequestProperty(entry.getKey(), entry.getValue());
  51. }
  52. }
  53. // 设置其他属性
  54. // conn.setUseCaches(false);//不使用缓存
  55. // conn.setReadTimeout(10000);// 设置读取超时时间
  56. // conn.setConnectTimeout(10000);// 设置连接超时时间
  57. // 建立实际连接
  58. conn.connect();
  59. // 读取请求结果
  60. br = new BufferedReader( new InputStreamReader(conn.getInputStream(), charSet));
  61. String line = null;
  62. StringBuilder sb = new StringBuilder();
  63. while ((line = br.readLine()) != null) {
  64. sb.append(line);
  65. }
  66. return sb.toString();
  67. } catch (Exception exception) {
  68. return "";
  69. } finally {
  70. try {
  71. if (br != null) {
  72. br.close();
  73. }
  74. } catch (Exception e) {
  75. e.printStackTrace();
  76. }
  77. }
  78. }
  79. public static String requestParamsBuild(Map<String, String> map) {
  80. String result = "";
  81. if ( null != map && map.size() > 0) {
  82. StringBuffer sb = new StringBuffer();
  83. for (Map.Entry<String, String> entry : map.entrySet()) {
  84. try {
  85. String value = URLEncoder.encode(entry.getValue(), "UTF-8");
  86. sb.append(entry.getKey() + "=" + value + "&");
  87. } catch (UnsupportedEncodingException e) {
  88. e.printStackTrace();
  89. }
  90. }
  91. result = sb.substring( 0, sb.length() - 1);
  92. }
  93. return result;
  94. }
  95. private static HttpsURLConnection getHttpsUrlConnection(URL url) throws Exception {
  96. HttpsURLConnection httpsConn = (HttpsURLConnection) url.openConnection();
  97. // 创建SSLContext对象,并使用我们指定的信任管理器初始化
  98. TrustManager[] tm = { new X509TrustManager() {
  99. public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
  100. // 检查客户端证书
  101. }
  102. public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
  103. // 检查服务器端证书
  104. }
  105. public X509Certificate[] getAcceptedIssuers() {
  106. // 返回受信任的X509证书数组
  107. return null;
  108. }
  109. } };
  110. SSLContext sslContext = SSLContext.getInstance( "SSL", "SunJSSE");
  111. sslContext.init( null, tm, new java.security.SecureRandom());
  112. // 从上述SSLContext对象中得到SSLSocketFactory对象
  113. SSLSocketFactory ssf = sslContext.getSocketFactory();
  114. httpsConn.setSSLSocketFactory(ssf);
  115. return httpsConn;
  116. }
  117. public static byte[] getFileAsByte( boolean isHttps, String requestUrl) {
  118. if (StringUtils.isBlank(requestUrl)) {
  119. return new byte[ 0];
  120. }
  121. URL url = null;
  122. URLConnection conn = null;
  123. BufferedInputStream bi = null;
  124. try {
  125. // 创建连接
  126. url = new URL(requestUrl);
  127. if (isHttps) {
  128. conn = getHttpsUrlConnection(url);
  129. } else {
  130. conn = (HttpURLConnection) url.openConnection();
  131. }
  132. // 设置请求头通用属性
  133. // 指定客户端能够接收的内容类型
  134. conn.setRequestProperty( "accept", "*/*");
  135. // 设置连接的状态为长连接
  136. conn.setRequestProperty( "Connection", "keep-alive");
  137. // 设置发送请求的客户机系统信息
  138. conn.setRequestProperty( "User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");
  139. // 设置其他属性
  140. conn.setConnectTimeout( 3000); // 设置连接超时时间
  141. conn.setDoOutput( true);
  142. conn.setDoInput( true);
  143. // 建立实际连接
  144. conn.connect();
  145. // 读取请求结果
  146. bi = new BufferedInputStream(conn.getInputStream());
  147. ByteArrayOutputStream outStream = new ByteArrayOutputStream();
  148. byte[] buffer = new byte[ 2048];
  149. int len = 0;
  150. while ((len = bi.read(buffer)) != - 1) {
  151. outStream.write(buffer, 0, len);
  152. }
  153. bi.close();
  154. byte[] data = outStream.toByteArray();
  155. return data;
  156. } catch (Exception exception) {
  157. return new byte[ 0];
  158. } finally {
  159. try {
  160. if (bi != null) {
  161. bi.close();
  162. }
  163. } catch (Exception e) {
  164. e.printStackTrace();
  165. }
  166. }
  167. }
  168. }

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值