Java 网络爬虫示例(使用 jsoup 解析网页内容)。

package com.thinkyun.community.util.reptile;

import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

/**
 * Minimal web-scraping helper: downloads a page over HTTP(S), extracts the inner HTML of the
 * first element matching a CSS selector, and reports the URL of the first image inside it.
 *
 * <p>All members are static; the class holds no state.
 */
public final class Reptile {

    static final Logger logger = LoggerFactory.getLogger("Reptile");

    /** Charset name for GBK-encoded pages. */
    public static final String GBK = "GBK";

    /** Charset name for UTF-8-encoded pages. */
    public static final String UTF8 = "UTF-8";

    /** HTTP POST request method. */
    public static final String POST = "POST";

    /** HTTP GET request method. */
    public static final String GET = "GET";

    /**
     * Fetches {@code urlPath} and extracts the content selected by {@code element}.
     *
     * <p>This method never throws. Any I/O or runtime failure is logged and the map built so far
     * is returned, so callers should check which keys are present:
     * <ul>
     *   <li>{@code "resultCode"} - the HTTP status code (Integer); present once a response
     *       status has been read.</li>
     *   <li>{@code "content"} - inner HTML of the first element matching {@code element};
     *       present only for a 200 response in which the selector matched.</li>
     *   <li>{@code "firstImage"} - the {@code src} attribute of the first {@code <img>} inside
     *       the content; present only when the content contains at least one image.</li>
     * </ul>
     *
     * @param urlPath       URL of the page to fetch
     * @param cookies       value for the {@code Cookie} request header, or {@code null} for none
     * @param element       CSS selector identifying the content element to extract
     * @param encode        name of the charset used to decode the response body
     * @param requestMethod HTTP request method, e.g. {@link #GET} or {@link #POST}
     * @return a mutable map with the keys described above; never {@code null}
     */
    public static Map<String, Object> request(String urlPath, String cookies, String element, String encode, String requestMethod) {
        Map<String, Object> map = new HashMap<>();
        HttpURLConnection connection = null;
        try {
            logger.info("进入爬虫接口");
            URL url = new URL(urlPath);
            connection = (HttpURLConnection) url.openConnection();
            connection.setRequestMethod(requestMethod);
            // No request body is ever written, so doOutput is deliberately left at its default
            // (false); doInput already defaults to true.
            if (cookies != null) {
                connection.setRequestProperty("Cookie", cookies);
            }
            // Reading the status code is what actually sends the request.
            int code = connection.getResponseCode();
            map.put("resultCode", code);
            if (code == HttpURLConnection.HTTP_OK) {
                try (BufferedInputStream inputStream = new BufferedInputStream(connection.getInputStream())) {
                    // Read the whole body as bytes, then decode it with the caller's charset.
                    String txt = new String(IOUtils.toByteArray(inputStream), encode);
                    extractContent(txt, element, map);
                }
            }
        } catch (IOException | RuntimeException e) {
            // Best-effort contract: log with the stack trace and return whatever was collected.
            logger.error("接口异常", e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
            logger.info("接口结束");
        }
        return map;
    }

    /**
     * Parses {@code pageHtml}, stores the inner HTML of the first element matching
     * {@code selector} under {@code "content"} and, if that content contains an image, the
     * first image's {@code src} under {@code "firstImage"}. Stores nothing when the selector
     * matches no element.
     */
    private static void extractContent(String pageHtml, String selector, Map<String, Object> result) {
        Document document = Jsoup.parseBodyFragment(pageHtml);
        Element target = document.selectFirst(selector);
        if (target == null) {
            logger.warn("No element matches selector '{}'", selector);
            return;
        }
        // html() keeps the markup; text() would strip it down to plain text.
        String html = target.html();
        result.put("content", html);

        Elements images = Jsoup.parseBodyFragment(html).select("img");
        if (images.isEmpty()) {
            logger.warn("No <img> found inside selector '{}'", selector);
            return;
        }
        result.put("firstImage", images.get(0).attr("src"));
    }

    /** Manual smoke test: fetches a sample article and logs what was extracted. */
    public static void main(String[] args) {
        Map<String, Object> result = request("https://mp.weixin.qq.com/s?__biz=MjM5NTY1MjY0MQ==&mid=2650743162&idx=2&sn=3016639b1ef0eed00cc567b1c6117d9a&chksm=befeb43489893d22eae2e6724e2b91304caa60c8d152c9e6bc511366908c2e1d5b1d165a2793&scene=0#rd",
                null,
                "div#js_content", UTF8, GET);
        logger.info("resultCode={}, firstImage={}", result.get("resultCode"), result.get("firstImage"));
    }
}

 

        <!-- 网页解析 -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值