使用 Java 爬取微信公众号文章详情,并解决微信图片跨域(无法直接引用)的问题

第一步先爬取文章

第二步:将文章中原本的微信图片下载到本地,上传到七牛云,再把文章里的图片地址替换为七牛云上的地址

 

  <!-- Apache HttpClient. Presumably what the HttpTool helper (not shown in this article) is built on; confirm. -->
  <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.13</version>
        </dependency>
        <!-- Multipart support for HttpClient. No direct usage is visible in the classes shown here. -->
        <!-- NOTE(review): version 4.5.12 differs from httpclient 4.5.13 above; consider aligning them. -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpmime</artifactId>
            <version>4.5.12</version>
        </dependency>
        <!-- Gson JSON library. No usage is visible in the classes shown here. -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.6</version>
        </dependency>
        <!-- jsoup HTML parser. SpiderUtil uses Jsoup.parse and Element.select to find and rewrite <img> tags. -->
        <!-- NOTE(review): DownLoadImg also calls UUID.fastUUID() and QiNiuUploadUtil, which none of the
             dependencies listed here appear to provide; the libraries supplying them must be declared too. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
/**
 * Fetches a WeChat Official Account article and rewrites its lazily-loaded
 * images so that they point at re-hosted copies instead of the WeChat image
 * servers.
 *
 * <p>All methods are static. Failures are reported through the returned string
 * (a user-facing message or an empty string) rather than through exceptions.
 */
public class SpiderUtil {

    // Scheme + host that every accepted article link must point at.
    private static final String WX_DOMAIN = "https://mp.weixin.qq.com";


    /**
     * Manual smoke test: fetches one sample article and prints the rewritten HTML.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://mp.weixin.qq.com/s/Cw-QuhaqruojDQM-Ttvjzg";
        String resp = getActicle(url);
        System.out.println(resp);
    }

    /**
     * Downloads the article behind {@code url} and returns its HTML with the
     * images re-hosted.
     *
     * @param url link to a WeChat Official Account article
     * @return the rewritten HTML document; or a user-facing error message when
     *         the link is rejected by {@link #checkUrl(String)} or the response
     *         is empty; or an empty string when the HTML could not be processed
     */
    public static String getActicle(String url) {
        // Reject links that are missing or do not point at the WeChat article host.
        String msg = checkUrl(url);
        if (msg != null) {
            return msg;
        }
        String resp = HttpTool.get(url, getWxHeaderMap());
        // Validate the response BEFORE parsing it, so that an empty/failed fetch
        // never reaches the parser or triggers image downloads.
        if (resp == null || resp.trim().length() == 0) {
            return "文章获取失败,请检查链接是否正确";
        }
        return getWxActicleContent(resp);
    }

    /**
     * Checks whether {@code url} is an acceptable article link.
     *
     * @param url the link entered by the user, may be {@code null}
     * @return {@code null} when the link is acceptable, otherwise a user-facing
     *         message describing why it was rejected
     */
    public static String checkUrl(String url) {
        if (url == null) {
            return "请输入文章链接";
        }
        if (!isOnWxDomain(url)) {
            return "请输入微信公众号文章链接";
        }
        return null;
    }

    /**
     * Returns whether {@code url} starts with {@link #WX_DOMAIN} and the host
     * part really ends there.
     *
     * <p>A bare prefix match would also accept look-alike hosts such as
     * {@code https://mp.weixin.qq.com.evil.com/} or
     * {@code https://mp.weixin.qq.com@evil.com/}, so the character following
     * the prefix must begin the path, query or fragment. As a consequence an
     * explicit port ({@code :443}) is not accepted.
     */
    private static boolean isOnWxDomain(String url) {
        if (!url.startsWith(WX_DOMAIN)) {
            return false;
        }
        if (url.length() == WX_DOMAIN.length()) {
            return true;
        }
        char next = url.charAt(WX_DOMAIN.length());
        return next == '/' || next == '?' || next == '#';
    }

    /**
     * Parses the article HTML and, for every {@code <img>} carrying a
     * {@code data-src} attribute, sets {@code src} to the URL returned by
     * {@link DownLoadImg#downloadPicture(String)}.
     *
     * <p>An image whose re-hosting fails is left untouched, so one bad image
     * no longer discards the whole article.
     *
     * @param resp raw HTML of the article
     * @return the serialized document after rewriting, or an empty string when
     *         {@code resp} cannot be parsed or processed
     */
    public static String getWxActicleContent(String resp) {
        try {
            Element document = Jsoup.parse(resp);
            Elements lazyImages = document.select("img[data-src]");
            for (Element img : lazyImages) {
                String hostedUrl = rehostImage(img.attr("data-src"));
                // Only overwrite src when we actually have a replacement URL.
                if (hostedUrl != null) {
                    img.attr("src", hostedUrl);
                }
            }
            return document.toString();
        } catch (Exception e) {
            return "";
        }
    }

    /**
     * Re-hosts a single image and returns its new URL, or {@code null} when the
     * source URL is blank or the download/upload failed.
     */
    private static String rehostImage(String imgUrl) {
        if (imgUrl == null || imgUrl.trim().length() == 0) {
            return null;
        }
        try {
            return DownLoadImg.downloadPicture(imgUrl);
        } catch (RuntimeException e) {
            // Best effort: keep the article, report the image that could not be moved.
            System.err.println("Failed to re-host image " + imgUrl + ": " + e);
            return null;
        }
    }

    /**
     * Builds the request headers sent when fetching an article.
     *
     * <p>No {@code If-Modified-Since} header is sent: this client keeps no
     * cached copy, so a conditional request could only produce a body-less
     * {@code 304 Not Modified} response.
     *
     * @return a new, mutable map of header name to header value, in insertion order
     */
    public static Map<String, String> getWxHeaderMap() {
        Map<String, String> map = new LinkedHashMap<>();
        map.put("Accept", "text/html, application/xhtml+xml, image/jxr, */*");
        // NOTE(review): assumes HttpTool transparently decodes gzip/deflate bodies - confirm.
        map.put("Accept-Encoding", "gzip, deflate");
        map.put("Accept-Language", "zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3");
        map.put("Host", "mp.weixin.qq.com");
        map.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko");
        return map;
    }

}

 

/**
 * Downloads a remote image, uploads it through {@code QiNiuUploadUtil} and
 * additionally keeps a copy on the local disk.
 */
public class DownLoadImg {

    // Directory that receives the local copy of every downloaded image.
    private static final String LOCAL_DIR = "D:/m2/";

    /**
     * Downloads the image at {@code urlList}, uploads its bytes under a freshly
     * generated name and returns whatever the uploader returns.
     *
     * <p>The local copy is best effort: if it cannot be written, the failure is
     * reported on standard error and the upload result is still returned.
     *
     * @param urlList URL of the image to download (a single URL, despite the name)
     * @return the value returned by {@code QiNiuUploadUtil.upload}, or
     *         {@code null} when the URL is malformed or the download/upload
     *         failed with an {@link IOException}
     */
    public static String downloadPicture(String urlList) {
        String filename = UUID.fastUUID().toString();
        String uploadUrl = null;
        try {
            // MalformedURLException is an IOException, so one handler covers both.
            byte[] content = readAll(new URL(urlList));
            uploadUrl = new QiNiuUploadUtil().upload(content, filename, true);
            saveLocalCopy(content, filename);
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("Download返回的filname=" + uploadUrl);
        return uploadUrl;
    }

    /**
     * Reads the complete response body of {@code url} into memory. The stream is
     * closed even when reading fails.
     */
    private static byte[] readAll(URL url) throws IOException {
        try (java.io.InputStream in = url.openStream();
             ByteArrayOutputStream output = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[1024];
            int length;
            while ((length = in.read(buffer)) != -1) {
                output.write(buffer, 0, length);
            }
            return output.toByteArray();
        }
    }

    /**
     * Writes {@code content} to {@link #LOCAL_DIR} under {@code filename}.
     * The file is only created once the download has succeeded, and a write
     * failure is reported instead of propagated.
     */
    private static void saveLocalCopy(byte[] content, String filename) {
        String path = LOCAL_DIR + filename;
        try (FileOutputStream out = new FileOutputStream(new File(path))) {
            out.write(content);
        } catch (IOException e) {
            System.err.println("Could not save local copy to " + path + ": " + e);
        }
    }

    /**
     * Manual smoke test: re-hosts one sample image and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String picture = downloadPicture("https://mmbiz.qpic.cn/mmbiz_png/7WbP8ZjskNqv1Wyx18gicMDiciaibkbZic6q3HqhSAdvrFEmAsg65cmE51rrsumhS6DK0f1ibHKHKEPibO6TbibK0gZ4GQ/640?wx_fmt=png&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1");
        System.out.println(picture);
    }

}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值