1.Jsoup简介:
Jsoup是一款Java的HTML解析框架,它可以连接URL地址并获取HTML文本内容。Jsoup提供了强大的API,可以通过类似jQuery的方式,按DOM结构、标签名、类名等获取页面内的数据元素.
2.上案例:
package com.lnbdy.sms.controller;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestHeader;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import com.alibaba.fastjson.JSON;
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiParam;
/**
 * REST controller that downloads a WeChat article page with Jsoup and returns
 * selected parts of it (title, source, content HTML, image addresses) as JSON.
 */
@RestController
@RequestMapping("/api/wechat")
public class WechatContentController {

    /**
     * Fetches the page at the URL carried by the request body and extracts its fields.
     *
     * <p>The following parts of the document are read:
     * <ul>
     *   <li>title: combined text of all {@code h2} elements</li>
     *   <li>source: combined text of elements with class {@code profile_nickname}</li>
     *   <li>content: inner HTML of elements with class {@code rich_media_content}</li>
     *   <li>image addresses: every non-empty {@code data-src} attribute of {@code img} elements</li>
     * </ul>
     *
     * @param wechatContentparam request body; only its {@code getUrl()} value is read
     * @return the populated {@code WechatContent} serialized to a JSON string
     * @throws IllegalStateException if the page cannot be downloaded; the original
     *         {@link IOException} is attached as the cause
     */
    @ApiOperation(value = "获取微信内容页中的数据")
    @PostMapping("/content")
    public String getContent(@RequestBody WechatContent wechatContentparam) {
        // WeChat has several URL forms; the one used here is the share-page URL
        // that can be connected to directly.
        // NOTE(review): the URL comes straight from the request body and is fetched
        // server-side; consider restricting the allowed hosts.
        String url = wechatContentparam.getUrl();

        Document doc;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException e) {
            // Previously the exception was only printed and the method then fell off
            // the end without returning, which does not compile. Surface the failure
            // to the caller instead, keeping the root cause.
            throw new IllegalStateException("Failed to fetch WeChat page: " + url, e);
        }

        // Copy the page fields into the custom WechatContent object.
        Elements title = doc.getElementsByTag("h2");
        Elements source = doc.getElementsByClass("profile_nickname");
        Elements content = doc.getElementsByClass("rich_media_content");

        WechatContent wechatContent = new WechatContent();
        wechatContent.setTitle(title.text());
        wechatContent.setContent(content.html());
        wechatContent.setSource(source.text());
        wechatContent.setImgurls(extractImageUrls(doc.getElementsByTag("img")));

        return JSON.toJSONString(wechatContent);
    }

    /** Collects the non-empty {@code data-src} attribute values of the given elements, in order. */
    private static List<String> extractImageUrls(Elements images) {
        List<String> imgurls = new ArrayList<String>();
        for (Element image : images) {
            String dataSrc = image.attr("data-src");
            // Compare by content: `!= ""` tests reference identity and would let
            // empty values through.
            if (dataSrc != null && !dataSrc.isEmpty()) {
                imgurls.add(dataSrc);
            }
        }
        return imgurls;
    }
}