Android爬虫,获取网页源码的两种方式,Apache HttpClient,HttpURLConnection

在这里插入图片描述

Apache HttpClient

    implementation group: 'cz.msebera.android' , name: 'httpclient', version: '4.4.1.1'
package com.example.administrator.myapplication.Douyin;

import java.io.IOException;
import java.net.URI;
import java.util.Map;

import cz.msebera.android.httpclient.client.methods.CloseableHttpResponse;
import cz.msebera.android.httpclient.client.methods.HttpGet;
import cz.msebera.android.httpclient.client.utils.URIBuilder;
import cz.msebera.android.httpclient.impl.client.CloseableHttpClient;
import cz.msebera.android.httpclient.impl.client.HttpClients;
import cz.msebera.android.httpclient.util.EntityUtils;

/**
 * Small helper for issuing HTTP GET requests with Apache HttpClient and
 * returning the response body as a string.
 */
public class HttpClientUtil {
    /**
     * Performs an HTTP GET against {@code url}, appending {@code param} as query parameters.
     *
     * @param url   the target address
     * @param param query parameters to append; may be {@code null} for none
     * @return the response body when the status code is 200; otherwise an empty string
     *         (including when any exception occurs, which is printed and swallowed)
     */
    public static String doGet(String url, Map<String, String> param) {
        // Create the HttpClient instance.
        CloseableHttpClient httpclient = HttpClients.createDefault();
        String resultString = "";
        CloseableHttpResponse response = null;
        try {
            // Build the URI, adding each query parameter.
            URIBuilder builder = new URIBuilder(url);
            if (param != null) {
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    builder.addParameter(entry.getKey(), entry.getValue());
                }
            }
            URI uri = builder.build();
            // Create the HTTP GET request.
            HttpGet httpGet = new HttpGet(uri);
            // Set browser-like request headers so the request looks like a browser visit.
            httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
            // Advertise only gzip/deflate: a server honoring "br" or "sdch" would send a body
            // that this client cannot decompress, yielding garbled text.
            httpGet.setHeader("Accept-Encoding", "gzip, deflate");
            httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.8");
            httpGet.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
            // Execute the request.
            response = httpclient.execute(httpGet);
            // Only read the body when the status code is 200.
            if (response.getStatusLine().getStatusCode() == 200) {
                resultString = EntityUtils.toString(response.getEntity(), "UTF-8");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close each resource independently so a failure closing the response
            // does not prevent the client from being closed.
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            try {
                httpclient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return resultString;
    }

    /**
     * Performs an HTTP GET against {@code url} without query parameters.
     *
     * @param url the target address
     * @return the response body when the status code is 200; otherwise an empty string
     */
    public static String doGet(String url) {
        return doGet(url, null);
    }
}

HttpURLConnection

package com.example.administrator.myapplication.Douyin;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Fetches HTML data over HTTP using {@link HttpURLConnection}.
 */
public class HtmlService {

    /**
     * Downloads the resource at {@code path} with a GET request and decodes it as UTF-8.
     *
     * @param path the address to fetch
     * @return the response body decoded as a UTF-8 string
     * @throws Exception if the URL is malformed, the connection fails, or reading fails
     */
    public static String getHtml(String path) throws Exception {
        // Create a URL object from the network address.
        URL url = new URL(path);
        // Open the connection; the result is cast to HttpURLConnection.
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            // Set the request method.
            conn.setRequestMethod("GET");
            // Timeout for establishing the connection to the host (milliseconds).
            conn.setConnectTimeout(5000);
            // Timeout for reading data from the host (milliseconds).
            conn.setReadTimeout(5000);
            // Read the HTML bytes from the connection's input stream
            // (readInputStream closes the stream).
            InputStream inStream = conn.getInputStream();
            byte[] data = readInputStream(inStream);
            // Decode the bytes with the given charset.
            return new String(data, "utf-8");
        } finally {
            // Release the connection even when the request or the read fails.
            conn.disconnect();
        }
    }

    /**
     * Reads the input stream to its end and returns the bytes read.
     * The stream is always closed, whether reading succeeds or fails.
     *
     * @param inStream the stream to drain
     * @return all bytes read from the stream
     * @throws Exception if reading from or closing the stream fails
     */
    public static byte[] readInputStream(InputStream inStream) throws Exception {
        try {
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = inStream.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }
            return outStream.toByteArray();
        } finally {
            // Close in finally so a failed read does not leak the stream.
            inStream.close();
        }
    }

}

主程序

 button.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View v) {
                // Fetch on a separate thread so the click handler returns immediately.
                new Thread() {
                    @Override
                    public void run() {
                        // Snapshot the result in a final local: the posted runnable then shows
                        // exactly this fetch's result even if another click overwrites the
                        // shared html field before it runs.
                        final String result = HttpClientUtil.doGet(url);
                        html = result;
                        Log.e(TAG, "run: " + result);
                        // Hand the UI update to the handler
                        // (presumably bound to the main thread; verify where handler is created).
                        handler.post(new Runnable() {
                            @Override
                            public void run() {
                                textView.setText(result);
                                textView.setTextColor(getResources().getColor(R.color.colorPrimary));
                            }
                        });
                    }
                }.start();
            }
        });

        button4.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View v) {
                // Run the HttpURLConnection-based fetch on its own thread.
                Runnable fetchTask = new Runnable() {
                    @Override
                    public void run() {
                        try {
                            final String pageSource = HtmlService.getHtml(url);
                            // Post the text update through the handler once the page is fetched.
                            Runnable showResult = new Runnable() {
                                @Override
                                public void run() {
                                    textView.setText(pageSource);
                                    textView.setTextColor(getResources().getColor(R.color.colorBrown));
                                }
                            };
                            handler.post(showResult);
                        } catch (Exception e) {
                            // Failures are only printed; the text view is left unchanged.
                            e.printStackTrace();
                        }
                    }
                };
                new Thread(fetchTask).start();
            }
        });
  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
使用HttpURLConnection获取网页内容的示例代码如下:

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class HttpUrlConnectionExample {
    public static void main(String[] args) {
        try {
            URL url = new URL("http://www.example.com");
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("GET");
            conn.setRequestProperty("User-Agent", "Mozilla/5.0");
            BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            String inputLine;
            StringBuffer response = new StringBuffer();
            while ((inputLine = in.readLine()) != null) {
                response.append(inputLine);
            }
            in.close();
            System.out.println(response.toString());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
```

使用HttpClient获取网页内容的示例代码如下:

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClientBuilder;

public class HttpClientExample {
    public static void main(String[] args) {
        try {
            HttpClient httpClient = HttpClientBuilder.create().build();
            HttpGet request = new HttpGet("http://www.example.com");
            HttpResponse response = httpClient.execute(request);
            BufferedReader rd = new BufferedReader(new InputStreamReader(response.getEntity().getContent()));
            String line = "";
            StringBuffer responseContent = new StringBuffer();
            while ((line = rd.readLine()) != null) {
                responseContent.append(line);
            }
            rd.close();
            System.out.println(responseContent.toString());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
```

这两种方式都可以获取网页的内容,只不过使用的类和方法略有不同。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值