第一步:先爬取文章内容。
第二步:将文章中原本的微信图片下载到本地,上传到七牛云,并把图片地址替换为七牛云上的地址。
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.5.12</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.6</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
/**
 * Fetches a WeChat official-account article page and rewrites its lazy-loaded
 * images so that they point at re-hosted copies produced by
 * {@code DownLoadImg.downloadPicture}.
 *
 * <p>All public methods report problems by returning a user-facing message
 * string instead of throwing.
 */
public class SpiderUtil {
    /** Scheme-and-host prefix every accepted article link must start with. */
    private static final String WX_DOMAIN = "https://mp.weixin.qq.com";

    /** Host an accepted link must parse to; also sent as the {@code Host} header. */
    private static final String WX_HOST = "mp.weixin.qq.com";

    /**
     * Manual smoke test: fetches one sample article and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "https://mp.weixin.qq.com/s/Cw-QuhaqruojDQM-Ttvjzg";
        String resp = getActicle(url);
        System.out.println(resp);
    }

    /**
     * Downloads the article at {@code url} and returns its HTML with image
     * sources rewritten.
     *
     * @param url link to a WeChat official-account article
     * @return the rewritten HTML; a validation message from {@link #checkUrl(String)}
     *         when the link is rejected; or a failure message when the response is
     *         {@code null} or blank
     */
    public static String getActicle(String url) {
        String msg = checkUrl(url);
        if (msg != null) {
            return msg;
        }

        String resp = HttpTool.get(url, getWxHeaderMap());
        // Validate the response before parsing it, so an empty or missing page
        // never triggers HTML parsing or image downloads.
        if (resp == null || resp.trim().length() == 0) {
            return "文章获取失败,请检查链接是否正确";
        }
        return getWxActicleContent(resp);
    }

    /**
     * Checks that {@code url} is an article link on the WeChat official-account host.
     *
     * @param url link supplied by the caller; may be {@code null}
     * @return {@code null} when the link is acceptable, otherwise a user-facing
     *         message describing why it was rejected
     */
    public static String checkUrl(String url) {
        if (url == null) {
            return "请输入文章链接";
        }
        if (!url.startsWith(WX_DOMAIN)) {
            return "请输入微信公众号文章链接";
        }
        // The prefix test alone also accepts look-alike hosts such as
        // "https://mp.weixin.qq.com.evil.example/" or
        // "https://mp.weixin.qq.com@evil.example/", so confirm the parsed host.
        try {
            String host = new java.net.URI(url).getHost();
            if (!WX_HOST.equalsIgnoreCase(host)) {
                return "请输入微信公众号文章链接";
            }
        } catch (java.net.URISyntaxException e) {
            // A link that cannot be parsed cannot be a valid article link.
            return "请输入微信公众号文章链接";
        }
        return null;
    }

    /**
     * Parses the article HTML and, for every {@code <img>} carrying a
     * {@code data-src} attribute, re-hosts the image and stores the new address
     * in {@code src}.
     *
     * @param resp raw HTML of the article page
     * @return the serialized document after rewriting, or an empty string when
     *         parsing or rewriting throws
     */
    public static String getWxActicleContent(String resp) {
        try {
            Element document = Jsoup.parse(resp);
            Elements images = document.select("img[data-src]");
            for (Element image : images) {
                String imgUrl = image.attr("data-src");
                if (imgUrl == null || imgUrl.trim().isEmpty()) {
                    // Nothing to download for an empty lazy-load address.
                    continue;
                }
                String newSrc = DownLoadImg.downloadPicture(imgUrl);
                if (newSrc == null) {
                    // Re-hosting failed; leave the element untouched instead of
                    // writing a null src attribute.
                    continue;
                }
                image.attr("src", newSrc);
            }
            return document.toString();
        } catch (Exception e) {
            // Best-effort: callers receive an empty string when the page cannot be processed.
            return "";
        }
    }

    /**
     * Builds the request headers sent when fetching an article page.
     *
     * @return a new mutable map of header name to header value, in insertion order
     */
    public static Map<String, String> getWxHeaderMap() {
        Map<String, String> map = new LinkedHashMap<>();
        map.put("Accept", "text/html, application/xhtml+xml, image/jxr, */*");
        map.put("Accept-Encoding", "gzip, deflate");
        map.put("Accept-Language", "zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3");
        map.put("Host", WX_HOST);
        // NOTE(review): a fixed If-Modified-Since is sent although nothing is cached
        // locally; confirm the server never answers 304 with an empty body.
        map.put("If-Modified-Since", "Sat, 04 Jan 2020 12:23:43 GMT");
        map.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko");
        return map;
    }
}
/**
 * Downloads a remote image, uploads its bytes through {@code QiNiuUploadUtil}
 * and keeps a copy on the local disk.
 */
public class DownLoadImg {
    /**
     * Downloads the image at {@code urlList}, uploads it, and writes the same
     * bytes to a local file named after a freshly generated UUID.
     *
     * <p>The local file is opened before the download starts, so a missing
     * target directory aborts the whole operation.
     *
     * @param urlList absolute URL of the image to fetch (a single URL, despite the name)
     * @return the value returned by {@code QiNiuUploadUtil#upload}; {@code null}
     *         when the URL is malformed or an I/O error occurs before the upload
     *         completes. If only the local write fails after a successful upload,
     *         the upload result is still returned.
     */
    public static String downloadPicture(String urlList) {
        String filename = UUID.fastUUID().toString();
        // NOTE(review): hard-coded Windows directory; the file is stored without an extension.
        String path = "D:/m2/" + filename;
        String uploadUrl = null;
        try {
            URL url = new URL(urlList);
            // try-with-resources closes both streams even when reading, uploading
            // or writing throws.
            try (DataInputStream dataInputStream = new DataInputStream(url.openStream());
                 FileOutputStream fileOutputStream = new FileOutputStream(new File(path))) {
                ByteArrayOutputStream output = new ByteArrayOutputStream();
                byte[] buffer = new byte[1024];
                int length;
                // read() returns -1 at end of stream.
                while ((length = dataInputStream.read(buffer)) != -1) {
                    output.write(buffer, 0, length);
                }
                byte[] content = output.toByteArray();
                uploadUrl = new QiNiuUploadUtil().upload(content, filename, true);
                fileOutputStream.write(content);
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("Download返回的filname=" + uploadUrl);
        return uploadUrl;
    }

    /**
     * Manual smoke test: re-hosts one sample image and prints the result.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String picture = downloadPicture("https://mmbiz.qpic.cn/mmbiz_png/7WbP8ZjskNqv1Wyx18gicMDiciaibkbZic6q3HqhSAdvrFEmAsg65cmE51rrsumhS6DK0f1ibHKHKEPibO6TbibK0gZ4GQ/640?wx_fmt=png&tp=webp&wxfrom=5&wx_lazy=1&wx_co=1");
        System.out.println(picture);
    }
}