Android爬虫，获取网页源码的两种方式，Apache HttpClient，HttpURLConnection

最新推荐文章于 2023-02-23 15:20:05 发布

张雨zy

最新推荐文章于 2023-02-23 15:20:05 发布

阅读量1.2k

点赞数 1

分类专栏：爬虫

本文链接：https://blog.csdn.net/yu540135101/article/details/86768690

版权

爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

在这里插入图片描述

Apache HttpClient

    implementation group: 'cz.msebera.android', name: 'httpclient', version: '4.4.1.1'

package com.example.administrator.myapplication.Douyin;

import java.io.IOException;
import java.net.URI;
import java.util.Map;

import cz.msebera.android.httpclient.client.methods.CloseableHttpResponse;
import cz.msebera.android.httpclient.client.methods.HttpGet;
import cz.msebera.android.httpclient.client.utils.URIBuilder;
import cz.msebera.android.httpclient.impl.client.CloseableHttpClient;
import cz.msebera.android.httpclient.impl.client.HttpClients;
import cz.msebera.android.httpclient.util.EntityUtils;

/**
 * Small helper around Apache HttpClient (Android repackaging) for fetching a page with GET.
 */
public class HttpClientUtil {
    /**
     * Issues a GET request and returns the response body as text.
     *
     * @param url   base URL of the page to fetch
     * @param param query parameters to append to {@code url}; may be {@code null} for none
     * @return the response body when the server answers with status 200; an empty string when
     *         the status is anything else or when any exception occurs (the exception is
     *         printed, not rethrown)
     */
    public static String doGet(String url, Map<String, String> param) {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        String resultString = "";
        CloseableHttpResponse response = null;
        try {
            // Build the request URI, appending the optional query parameters.
            URIBuilder builder = new URIBuilder(url);
            if (param != null) {
                for (Map.Entry<String, String> entry : param.entrySet()) {
                    builder.addParameter(entry.getKey(), entry.getValue());
                }
            }
            URI uri = builder.build();

            HttpGet httpGet = new HttpGet(uri);
            // Browser-like headers so the request looks like it comes from a desktop browser.
            httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httpGet.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7");
            // Advertise only the encodings the default client can decode. Offering "sdch" or
            // "br" lets the server reply with a body that would be read as still-compressed
            // bytes and returned as garbage.
            httpGet.setHeader("Accept-Encoding", "gzip, deflate");
            httpGet.setHeader("Accept-Language", "zh-cn,zh;q=0.8");
            httpGet.setHeader("User-Agent",
                    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");

            response = httpclient.execute(httpGet);
            // Only a 200 response is treated as success; "UTF-8" is the fallback charset
            // passed to EntityUtils for decoding the body.
            if (response.getStatusLine().getStatusCode() == 200) {
                resultString = EntityUtils.toString(response.getEntity(), "UTF-8");
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close each resource independently so a failure closing the response
            // cannot prevent the client from being closed.
            closeReportingFailure(response);
            closeReportingFailure(httpclient);
        }
        return resultString;
    }

    /**
     * Issues a GET request without query parameters.
     *
     * @param url URL of the page to fetch
     * @return same contract as {@link #doGet(String, Map)}
     */
    public static String doGet(String url) {
        return doGet(url, null);
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any I/O failure. */
    private static void closeReportingFailure(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

HttpURLConnection

package com.example.administrator.myapplication.Douyin;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Fetches the HTML source of a page using {@link HttpURLConnection}.
 */
public class HtmlService {

    /**
     * Downloads the resource at {@code path} with a GET request and decodes the body as UTF-8.
     *
     * @param path URL of the page to fetch
     * @return the response body decoded as UTF-8
     * @throws Exception if the URL is malformed, the connection cannot be opened or times out,
     *                   or reading the body fails
     */
    public static String getHtml(String path) throws Exception {
        URL url = new URL(path);
        // For an http URL, openConnection() returns an HttpURLConnection.
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            // Maximum time to wait for the connection to be established (milliseconds).
            conn.setConnectTimeout(5000);
            // Maximum time to wait for data while reading from the server (milliseconds).
            conn.setReadTimeout(5000);
            // readInputStream() drains and closes the response stream.
            byte[] data = readInputStream(conn.getInputStream());
            return new String(data, "utf-8");
        } finally {
            // Release the connection on both success and failure.
            conn.disconnect();
        }
    }

    /**
     * Reads {@code inStream} to the end and returns everything that was read.
     * The stream is always closed before this method returns, including when reading fails.
     *
     * @param inStream stream to drain; closed by this method
     * @return all bytes read from the stream
     * @throws Exception if reading from or closing the stream fails
     */
    public static byte[] readInputStream(InputStream inStream) throws Exception {
        try {
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            while ((len = inStream.read(buffer)) != -1) {
                outStream.write(buffer, 0, len);
            }
            return outStream.toByteArray();
        } finally {
            inStream.close();
        }
    }

}

主程序

 button.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View v) {
                // The blocking HTTP fetch runs on its own thread rather than in the click callback.
                new Thread() {
                    @Override
                    public void run() {
                        // Capture this click's result in a final local so the posted runnable
                        // shows exactly what this thread fetched, even if another click
                        // overwrites the shared variable in the meantime.
                        final String result = HttpClientUtil.doGet(url);
                        // html is declared outside this snippet; keep it updated as before.
                        html = result;
                        Log.e(TAG, "run: " + result);
                        // Hand the view update to the handler (presumably bound to the UI
                        // thread; confirm where handler is created).
                        handler.post(new Runnable() {
                            @Override
                            public void run() {
                                textView.setText(result);
                                textView.setTextColor(getResources().getColor(R.color.colorPrimary));
                            }
                        });
                    }
                }.start();
            }
        });

        button4.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View v) {
                // Fetch the page with HtmlService on a separate thread, then pass the
                // result to the handler so the text view is updated from there.
                Runnable fetchTask = new Runnable() {
                    @Override
                    public void run() {
                        try {
                            final String pageSource = HtmlService.getHtml(url);
                            Runnable showResult = new Runnable() {
                                @Override
                                public void run() {
                                    textView.setText(pageSource);
                                    textView.setTextColor(getResources().getColor(R.color.colorBrown));
                                }
                            };
                            handler.post(showResult);
                        } catch (Exception e) {
                            // A failed fetch is only reported; the view is left untouched.
                            e.printStackTrace();
                        }
                    }
                };
                new Thread(fetchTask).start();
            }
        });

张雨zy

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
Android爬虫，获取网页源码的两种方式，Apache HttpClient，HttpURLConnection

Apache HttpClient implementation group: 'cz.msebera.android', name: 'httpclient', version: '4.4.1.1' package com.example.administrator.myapplication.Douyin; import java.io.IOException; import ...
复制链接

扫一扫

专栏目录