Java代码下载(IDEA)
Maven引入
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
写在 application.properties 配置文件中:
upload.path=G:/imgs
getUrl=https://mp.weixin.qq.com/s/fdllA87IDpUZ34OFBVZdWw
wexinUrl=https://mp.weixin.qq.com
定义Controller @RestController @RequestMapping("/crawler") public class CrawlerController { //存储图片的路径,写在配置文件里面 @Value("${upload.path}") private String path; //公众号文章地址,写在配置文件里面 @Value("${getUrl}") private String url; //公众号总地址 @Value("${wexinUrl}") private String wexinUrl; @RequestMapping("/getContent") public String getContent() { String content = null; String imgDir = path; // 输入网址,创建发起Get请求的对象 HttpGet httpGet = new HttpGet(url); // 创建httpClient对象,类似于打开浏览器 CloseableHttpClient httpClient = HttpClients.createDefault(); // 类似于浏览器输入网址后,按回车 CloseableHttpResponse execute = null; try { execute = httpClient.execute(httpGet); // 解析获取数据,判断状态码是不是200 if (execute.getStatusLine().getStatusCode() == 200) { HttpEntity entity = execute.getEntity(); content = EntityUtils.toString(entity, "utf8"); Document doc = Jsoup.parse(content); //找到图片标签 Elements img = doc.select("img"); for (int i = 0; i < img.size(); i++) { // 图片地址 String imgUrl = img.get(i).attr("data-src"); File sf = new File(imgDir); if (!sf.exists()) { sf.mkdirs(); } // 这里是一个公众号的二维码的图片,先不处理了 // String id = img.get(i).attr("id"); // if ("js_pc_qr_code_img".equalsIgnoreCase(id)) { // imgUrl = wexinUrl + img.get(i).attr("src"); // } if (imgUrl != null && !imgUrl.equals("")) { String fileName = DateTimeUitls.getString("yyyyMMddHHmmssSS") + ".png"; String imgPath = imgDir + File.separator + fileName; File imgFile = new File(imgPath); if (!imgFile.exists()) { // 下载图片 // 构造URL URL url = new URL(imgUrl); // 打开连接 URLConnection con = url.openConnection(); //设置请求超时为5s con.setConnectTimeout(5 * 1000); // 输入流 InputStream in = con.getInputStream(); // 1K的数据缓冲 byte[] bs = new byte[1024]; // 读取到的数据长度 int len; // 输出的文件流 OutputStream os = new FileOutputStream(imgPath); // 开始读取 while ((len = in.read(bs)) != -1) { os.write(bs, 0, len); } os.close(); in.close(); } //重新赋值为本地路径, // img.get(i).attr("data-src", imgPath); // img.get(i).attr("src", imgPath); //上面访问图片可能访问不到,建议定义访问图片的请求方法,所以修改成下面的的路径访问方式 img.get(i).attr("data-src", "/crawler/readImg/" + 
fileName); img.get(i).attr("src", "/crawler/readImg/" + fileName); //导出 html content = doc.outerHtml(); } } } } catch (Exception e) { e.printStackTrace(); } finally { try { execute.close(); } catch (IOException e) { e.printStackTrace(); } try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } return content; } //建议定义访问图片的请求方法 @RequestMapping("/readImg/{fileName}") public void readImg(@PathVariable("fileName") String fileName, HttpServletResponse response) { try { // fileImage 为服务器存储的实际路径 如 c:\aa\bb.jpg String fileImage = path + File.separator + fileName; FileInputStream hFile = new FileInputStream(fileImage); // 以byte流的方式打开文件 int i = hFile.available(); // 得到文件大小 byte data[] = new byte[i]; hFile.read(data); // 读数据 hFile.close(); response.setContentType("image/*"); // 设置返回的文件类型 OutputStream toClient = response.getOutputStream(); // 得到向客户端输出二进制数据的对象 toClient.write(data); // 输出数据 toClient.close(); } catch (IOException e) { // 错误处理 PrintWriter toClient; try { // 得到向客户端输出文本的对象 toClient = response.getWriter(); response.setContentType("text/html;charset=utf8"); toClient.write("无法打开图片!"); toClient.close(); } catch (IOException e1) { e1.printStackTrace(); } } } }
时间格式工具类
/**
 * Helper for rendering the current time as text.
 */
public class DateTimeUitls {

    /**
     * Formats the current date and time.
     *
     * @param pattern a {@link SimpleDateFormat} pattern, for example {@code "yyyyMMddHHmmssSS"}
     * @return the current time rendered with {@code pattern}
     */
    public static String getString(String pattern) {
        // A new formatter is built on every call, so no formatter instance is shared between threads.
        final SimpleDateFormat formatter = new SimpleDateFormat(pattern);
        final Date now = new Date();
        final String rendered = formatter.format(now);
        return rendered;
    }
}