爬取公众号的文章，同时处理图片不显示问题（JAVA）

最新推荐文章于 2024-06-13 16:09:52 发布

黯然神伤888

最新推荐文章于 2024-06-13 16:09:52 发布

阅读量394

点赞数

本文链接：https://blog.csdn.net/dante1987/article/details/120178527

版权

JAVA 同时被 3 个专栏收录

28 篇文章 0 订阅

订阅专栏

spring boot

16 篇文章 0 订阅

订阅专栏

IntelliJ IDEA

9 篇文章 0 订阅

订阅专栏

Java代码下载（IDEA）

Maven引入

<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
</dependency>

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
</dependency>

写在application.properties配置文件中

upload.path=G:/imgs
getUrl=https://mp.weixin.qq.com/s/fdllA87IDpUZ34OFBVZdWw
wexinUrl=https://mp.weixin.qq.com


定义Controller

/**
 * Fetches the configured official-account article, stores every image that carries a
 * {@code data-src} address in the local upload directory, and rewrites the image tags so that
 * they are served through {@code /crawler/readImg/{fileName}} on this server.
 */
@RestController
@RequestMapping("/crawler")
public class CrawlerController {
    /** Connect and read timeout, in milliseconds, applied to every image download. */
    private static final int IMAGE_TIMEOUT_MILLIS = 5 * 1000;

    /** Size, in bytes, of the buffer used when copying streams. */
    private static final int BUFFER_SIZE = 8 * 1024;

    // Directory in which downloaded images are stored; set in the configuration file.
    @Value("${upload.path}")
    private String path;
    // Address of the article to crawl; set in the configuration file.
    @Value("${getUrl}")
    private String url;
    // Base address of the official-account site. Not used yet: the account QR-code image
    // (id "js_pc_qr_code_img") is intentionally left unhandled; it would presumably need this
    // prefix prepended to its src attribute.
    @Value("${wexinUrl}")
    private String wexinUrl;

    /**
     * Downloads the configured article and returns its HTML with the images pointing at this
     * server.
     *
     * @return the rewritten HTML; the unmodified HTML if no image could be localized; or
     *         {@code null} if the request failed or did not answer with status 200
     */
    @RequestMapping("/getContent")
    public String getContent() {
        String content = null;

        // Build the GET request for the configured article address.
        HttpGet httpGet = new HttpGet(url);

        // try-with-resources closes the response first and the client second, and never touches
        // a response that was not obtained (the old finally block dereferenced null when
        // execute() itself threw).
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse execute = httpClient.execute(httpGet)) {
            // Only a 200 answer carries the article.
            if (execute.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = execute.getEntity();
                // Assigned before the rewrite so the raw page is still returned if the
                // rewrite fails unexpectedly.
                content = EntityUtils.toString(entity, "utf8");
                content = localizeImages(content);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Stores every image that has a {@code data-src} address locally and points its
     * {@code data-src} and {@code src} attributes at {@code /crawler/readImg/}.
     * An image that cannot be downloaded is reported and left untouched; the remaining images
     * are still processed.
     *
     * @param html the article HTML as received
     * @return the re-serialized document if at least one image was rewritten, otherwise
     *         {@code html} unchanged
     */
    private String localizeImages(String html) {
        Document doc = Jsoup.parse(html);
        Elements img = doc.select("img");

        File imgDir = new File(path);
        if (!img.isEmpty() && !imgDir.exists()) {
            imgDir.mkdirs();
        }

        boolean rewritten = false;
        for (int i = 0; i < img.size(); i++) {
            String imgUrl = img.get(i).attr("data-src");
            if (imgUrl == null || imgUrl.isEmpty()) {
                continue;
            }

            File imgFile = null;
            try {
                // createTempFile reserves a name that is guaranteed to be unused. A bare
                // millisecond timestamp could repeat, and the old "skip if the file exists"
                // check then made two different images share one file.
                imgFile = File.createTempFile(
                        DateTimeUitls.getString("yyyyMMddHHmmssSS") + "_", ".png", imgDir);
                downloadImage(imgUrl, imgFile);
            } catch (IOException e) {
                e.printStackTrace();
                if (imgFile != null) {
                    // Best effort: do not keep an empty or truncated image around.
                    imgFile.delete();
                }
                continue;
            }

            // A plain file-system path may not be reachable from the browser, so the image is
            // exposed through the readImg endpoint instead.
            String localUrl = "/crawler/readImg/" + imgFile.getName();
            img.get(i).attr("data-src", localUrl).attr("src", localUrl);
            rewritten = true;
        }

        // Serialize once after the loop instead of once per image.
        return rewritten ? doc.outerHtml() : html;
    }

    /**
     * Downloads {@code imgUrl} into {@code target}.
     *
     * @param imgUrl absolute address of the image
     * @param target file that receives the image bytes
     * @throws IOException if the address is malformed, the connection fails or times out, or
     *                     the file cannot be written
     */
    private void downloadImage(String imgUrl, File target) throws IOException {
        URLConnection con = new URL(imgUrl).openConnection();
        con.setConnectTimeout(IMAGE_TIMEOUT_MILLIS);
        // Without a read timeout a stalled server would block the request thread indefinitely.
        con.setReadTimeout(IMAGE_TIMEOUT_MILLIS);
        try (InputStream in = con.getInputStream();
             OutputStream os = new FileOutputStream(target)) {
            copy(in, os);
        }
    }

    /**
     * Streams a previously stored image to the client.
     * If the image cannot be opened, a 404 response with a short text message is sent instead.
     *
     * @param fileName name of a file directly inside the upload directory
     * @param response servlet response that receives the image bytes
     */
    @RequestMapping("/readImg/{fileName}")
    public void readImg(@PathVariable("fileName") String fileName, HttpServletResponse response) {
        FileInputStream hFile;
        try {
            // Open the file before touching the response, so the error path below can still
            // choose the writer and the content type.
            hFile = openStoredImage(fileName);
        } catch (IOException e) {
            // Missing or rejected file: answered with the error page, not logged.
            sendImageError(response);
            return;
        }

        try (InputStream in = hFile) {
            // Every file is stored with a .png suffix regardless of its real format, so the
            // generic type is kept.
            response.setContentType("image/*");
            OutputStream toClient = response.getOutputStream();
            // Copy in chunks: available() is not a reliable file size and a single read() is
            // not guaranteed to fill the buffer.
            copy(in, toClient);
            toClient.close();
        } catch (IOException e) {
            // The output stream is already in use here, so getWriter() would throw
            // IllegalStateException; the failure can only be reported on the server side.
            e.printStackTrace();
        }
    }

    /**
     * Opens {@code fileName} inside the upload directory.
     *
     * @param fileName requested file name
     * @return an open stream on the stored file
     * @throws IOException if the name resolves outside the upload directory or the file cannot
     *                     be opened
     */
    private FileInputStream openStoredImage(String fileName) throws IOException {
        File baseDir = new File(path).getCanonicalFile();
        File requested = new File(baseDir, fileName).getCanonicalFile();
        // The name comes straight from the request path; refuse anything (such as ".." or a
        // name containing a separator) that does not resolve to a direct child of the
        // upload directory.
        if (!baseDir.equals(requested.getParentFile())) {
            throw new IOException("Image name is outside the upload directory: " + fileName);
        }
        return new FileInputStream(requested);
    }

    /**
     * Sends the "cannot open image" message to the client with status 404.
     *
     * @param response servlet response that has not been written to yet
     */
    private void sendImageError(HttpServletResponse response) {
        try {
            response.setStatus(HttpServletResponse.SC_NOT_FOUND);
            // The content type must be set before getWriter(); a charset set afterwards is
            // ignored and the message would be sent in the container's default encoding.
            response.setContentType("text/html;charset=utf8");
            PrintWriter toClient = response.getWriter();
            toClient.write("无法打开图片!");
            toClient.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }

    /**
     * Copies all remaining bytes of {@code in} to {@code out}. Neither stream is closed.
     *
     * @param in  source stream
     * @param out destination stream
     * @throws IOException if reading or writing fails
     */
    private static void copy(InputStream in, OutputStream out) throws IOException {
        byte[] buffer = new byte[BUFFER_SIZE];
        int len;
        while ((len = in.read(buffer)) != -1) {
            out.write(buffer, 0, len);
        }
    }
}

时间格式工具类

/**
 * Helper for rendering the current time as text.
 */
public class DateTimeUitls {
    /**
     * Formats the current system time.
     *
     * @param pattern a {@link SimpleDateFormat} pattern, for example {@code "yyyyMMddHHmmssSS"}
     * @return the current time rendered with {@code pattern}
     */
    public static String getString(String pattern){
        // A new formatter is built on every call, so no formatter instance is shared.
        SimpleDateFormat formatter = new SimpleDateFormat(pattern);
        Date now = new Date();
        return formatter.format(now);
    }
}