首先添加 flexmark 依赖(Markdown 解析框架,本文用它把抓取到的 HTML 转成 Markdown;service 中还用到了 Jsoup,项目中若未引入需一并添加):
<!-- flexmark: Markdown parsing framework (Markdown <-> HTML conversion). The scraping service uses its FlexmarkHtmlConverter to turn the fetched HTML into Markdown. -->
<dependency>
<groupId>com.vladsch.flexmark</groupId>
<artifactId>flexmark-all</artifactId>
<version>0.62.2</version>
</dependency>
html中添加url的输入框(我使用的是layui2.7.6.js):
<!-- Scraper input: the two links call piliangReptile() (batch) and reptile() (single article); both read the URL from the #zhuaqu textarea. -->
<div class="layui-form-pane">
<div class="layui-form-item layui-form-text">
<label class="layui-form-label">
<a href="javascript:void(0)" onclick="piliangReptile()"><i class="fa fa-github" style="color:red;font-size: x-large;"></i>批量抓取文章</a>
<a href="javascript:void(0)" style="margin-left: 80%;" onclick="reptile()"><i class="fa fa-github" style="color:red;font-size: x-large;"></i>抓取单篇文章</a>
</label>
<div class="layui-input-block">
<textarea id="zhuaqu" class="layui-textarea"
placeholder=" 抓取单篇文章URL示例:https://blog.csdn.net/qq3892997/article/details/128954768(文章ID)。 列表抓取URL示例:https://blog.csdn.net/qq3892997/article/list/2(页码非必填)"></textarea>
</div>
</div>
</div>
JS:
// Scrape a single article: POST the URL typed into #zhuaqu to /homeReptile
// and show the message returned by the server.
function reptile(){
    var loadIndex = layer.load(2, {
        shade: [0.1,'#fff'] // white backdrop at 10% opacity
    });
    // Trim so stray whitespace/newlines from the textarea are not sent as part of the URL
    var url = ($('#zhuaqu').val() || '').trim();
    $.post('/homeReptile', {url: url}, function(data){
        layer.close(loadIndex); // dismiss the loading layer
        layer.msg(data.message);
    }).fail(function(){
        // Without this handler the loading layer would stay open forever on an HTTP/network error
        layer.close(loadIndex);
        layer.msg('请求失败,请稍后重试');
    });
}
// Batch scrape: send the list-page URL typed into #zhuaqu to /piliangReptile (a GET endpoint)
// and show the summary message returned by the server.
function piliangReptile(){
    var loadIndex = layer.load(2, {
        shade: [0.1,'#fff'] // white backdrop at 10% opacity
    });
    // Trim so stray whitespace/newlines from the textarea are not sent as part of the URL
    var url = ($('#zhuaqu').val() || '').trim();
    $.get('/piliangReptile', {url: url}, function(data){
        layer.close(loadIndex); // dismiss the loading layer
        layer.msg(data.message);
    }).fail(function(){
        // Without this handler the loading layer would stay open forever on an HTTP/network error
        layer.close(loadIndex);
        layer.msg('请求失败,请稍后重试');
    });
}
controller:
/**
 * Scrapes a single article from the submitted URL.
 *
 * <p>Original author's note: the home-page statistics (articles, IP users, messages) and the
 * server/redis monitoring endpoints are in HomeController.
 *
 * @param url address of the article page; rejected when blank
 * @return the result of {@code articleService.reptile(url)}, or an error result for a blank URL
 */
@PostMapping(value = "/homeReptile")
public ResponseResult reptile(String url) {
    boolean hasUrl = !StringUtils.isBlank(url);
    return hasUrl ? articleService.reptile(url) : ResponseResult.error("无效的URL");
}
/**
 * Scrapes every article linked from an article-list page.
 *
 * <p>The page is downloaded with {@code getWebpageContent}, the region between the
 * {@code <div class="article-list">} marker and {@code </main>} is split into non-empty lines,
 * and every line that contains {@code article/details/} and is directly followed by a line
 * containing {@code <span class} is treated as an article link. The link is the text between the
 * first pair of double quotes on that line; only links containing
 * {@code https://blog.csdn.net/} are passed to {@code articleService.reptile}.
 *
 * @param url address of the article-list page
 * @return a result with code 200 whose message reports how many articles succeeded and failed,
 *         or an error result when the URL is blank or the list region cannot be located
 */
@GetMapping(value = "/piliangReptile")
public ResponseResult piliangReptile(String url) {
    // Same rejection as the single-article endpoint
    if (url == null || url.trim().isEmpty()) {
        return ResponseResult.error("无效的URL");
    }
    String content = getWebpageContent(url); // "" when the download failed

    // Locate the region that holds the article list
    int start = content.indexOf("<div class=\"article-list\">");
    int end = content.indexOf("</main>");
    // end < start would make substring() throw, so treat it as "not found" as well
    if (start < 0 || end < start) {
        return ResponseResult.error("未找到文章列表,请检查URL");
    }

    // Keep only the non-empty lines, trimmed
    ArrayList<String> lines = new ArrayList<>();
    for (String line : content.substring(start, end).split("\n")) {
        String trimmed = line.trim();
        if (!trimmed.isEmpty()) {
            lines.add(trimmed);
        }
    }

    int succint = 0;
    int errint = 0;
    // Stop one short of the end: a candidate line is only accepted together with its successor
    for (int i = 0; i + 1 < lines.size(); i++) {
        String line = lines.get(i);
        if (!line.contains("article/details/") || !lines.get(i + 1).contains("<span class")) {
            continue;
        }
        String[] parts = line.split("\"");
        if (parts.length < 2) {
            continue; // no quoted value on this line, nothing to extract
        }
        String aurl = parts[1];
        if (!aurl.contains("https://blog.csdn.net/")) {
            continue;
        }
        try {
            ResponseResult result = articleService.reptile(aurl);
            if (result != null && result.getCode() != null && result.getCode() == 200) {
                succint++;
            } else {
                errint++;
            }
        } catch (RuntimeException e) {
            // One article that cannot be fetched must not abort the rest of the batch
            log.error("抓取文章失败:" + aurl, e);
            errint++;
        }
    }

    String summary = "成功抓取" + succint + "篇文章,失败" + errint + "篇!";
    log.info(summary);
    return new ResponseResult(200, summary, url);
}
/**
 * Downloads the page at the given address and returns its body as a string.
 *
 * <p>The response is decoded as UTF-8 and every line is terminated with {@code "\n"}.
 * Any failure (malformed URL, non-HTTP URL, I/O error) is printed and reported to the caller
 * as an empty string rather than an exception.
 *
 * @param url address of the page to download
 * @return the page content, or {@code ""} when it could not be retrieved
 */
public static String getWebpageContent(String url) {
    HttpURLConnection conn = null;
    try {
        conn = (HttpURLConnection) new URL(url).openConnection();
        conn.connect();
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader even when readLine() throws midway
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\n");
            }
        }
        return sb.toString();
    } catch (Exception e) {
        e.printStackTrace();
        return "";
    } finally {
        // Release the connection on every path, not only after a successful read
        if (conn != null) {
            conn.disconnect();
        }
    }
}
使用到的统一返回类:
package com.shiyi.common;
import io.swagger.annotations.ApiModel;
import io.swagger.annotations.ApiModelProperty;
import lombok.Data;
import java.util.HashMap;
import java.util.Map;
import static com.shiyi.common.ResultCode.*;
/**
* <p> 统一返回结果类 </p>
*
* @description :
* @author : blue
*/
@ApiModel(value = "统一返回结果类")
@Data
public class ResponseResult {
/**
* 消息内容
*/
@ApiModelProperty(value = "响应消息", required = false)
private String message;
/**
* 响应码:参考`ResultCode`
*/
@ApiModelProperty(value = "响应码", required = true)
private Integer code;
/**
* 响应中的数据
*/
@ApiModelProperty(value = "响应数据", required = false)
private Object data;
@ApiModelProperty(value = "响应数据", required = false)
private Map<String,Object> extra = new HashMap<>();
public ResponseResult putExtra(String key, Object value) {
this.extra.put(key, value);
return this;
}
public static ResponseResult error(String message) {
return new ResponseResult(FAILURE.getCode(), message, null);
}
public static ResponseResult error() {
return new ResponseResult(FAILURE.getCode(), ERROR.getDesc(), null);
}
public static ResponseResult error(Integer code, String message) {
return new ResponseResult(code, message, null);
}
public static ResponseResult success() {
return new ResponseResult(SUCCESS.getCode(), SUCCESS.getDesc(), null);
}
public static ResponseResult success(Object data) {
return new ResponseResult(SUCCESS.getCode(),SUCCESS.getDesc(), data);
}
public static ResponseResult success(String message, Object data) {
return new ResponseResult(SUCCESS.getCode(), message, data);
}
public static ResponseResult success(Integer code, String message, Object data) {
return new ResponseResult(code, message, data);
}
public static ResponseResult success(Integer code, String message) {
return new ResponseResult(code, message,null);
}
public ResponseResult(Integer code, String msg, Object data) {
this.code = code;
this.message = msg;
this.data = data;
}
}
service:
/**
 * Scrapes one article page and stores it as a reposted blog article.
 *
 * <p>The page is fetched with Jsoup; the elements with classes {@code title-article},
 * {@code tag-link} and {@code article_content} supply the title, the tags and the body.
 * The body HTML is converted to Markdown with flexmark, the article is inserted, and each
 * tag name found on the page is linked to the article (creating the tag when it does not
 * exist yet). When the page has no tags, the article title is used as its single tag.
 *
 * @param url address of the article page
 * @return a success result once the article and its tags are stored, or an error result
 *         ("文章抓取失败") when the page has no title or no body element
 * @throws BusinessException wrapping the {@code IOException} when the page cannot be fetched
 */
@Override
@Transactional(rollbackFor = Exception.class)
public ResponseResult reptile(String url) {
    try {
        Document document = Jsoup.connect(url).get();
        Elements title = document.getElementsByClass("title-article");
        Elements tags = document.getElementsByClass("tag-link");
        Elements content = document.getElementsByClass("article_content");
        // title.get(0) below would throw on a page without a title element, so reject it here too
        if (title.isEmpty() || StringUtils.isBlank(content.toString())) {
            return ResponseResult.error("文章抓取失败");
        }

        // The scraped body is HTML; mark code blocks as Java before converting to Markdown
        String newContent = content.get(0).toString().replace("<code>", "<code class=\"lang-java\">");
        MutableDataSet options = new MutableDataSet();
        String markdown = FlexmarkHtmlConverter.builder(options).build().convert(newContent)
                .replace("lang-java", "java");

        // Populate the entity. The author (user id 7) and the cover image (IMG_URL_API) are
        // fixed values that have to be adapted to the target installation.
        BlogArticle entity = BlogArticle.builder().userId(7L).contentMd(markdown)
                .categoryId(OTHER_CATEGORY_ID).isOriginal(YesOrNoEnum.NO.getCode()).originalUrl(url)
                .title(title.get(0).text()).avatar(IMG_URL_API).content(newContent).build();
        baseMapper.insert(entity);

        // Resolve every tag shown on the page to a local tag id
        List<Long> tagsId = new ArrayList<>();
        tags.forEach(item -> {
            Long tagId = findOrCreateTagId(item.text());
            // The same tag may appear more than once on the page; the link table has a
            // unique key on (article_id, tag_id), so each id must be added only once
            if (!tagsId.contains(tagId)) {
                tagsId.add(tagId);
            }
        });

        if (tagsId.isEmpty()) {
            // The source article has no tags: fall back to the title so it can be edited later.
            // NOTE(review): b_tags.name is varchar(32) in the published schema while titles may
            // be longer — confirm how over-long titles should be handled.
            log.info("文章抓取成功,但是无标签,已使用标题作为标签:{}", JSON.toJSONString(entity));
            tagsId.add(findOrCreateTagId(entity.getTitle()));
        }

        tagsMapper.saveArticleTags(entity.getId(), tagsId);
        log.info("文章抓取成功,内容为:{}", JSON.toJSONString(entity));
    } catch (IOException e) {
        throw new BusinessException(e);
    }
    return ResponseResult.success();
}

/**
 * Returns the id of the tag with the given name, inserting the tag first when no such tag exists.
 * The id of a newly inserted tag is read back from the entity after the insert.
 */
private Long findOrCreateTagId(String name) {
    Tags tag = tagsMapper.selectOne(new QueryWrapper<Tags>().eq(SqlConf.NAME, name));
    if (tag == null) {
        tag = Tags.builder().name(name).build();
        tagsMapper.insert(tag);
    }
    return tag.getId();
}
文章实体类:
package com.shiyi.entity;
import com.baomidou.mybatisplus.annotation.*;
import com.fasterxml.jackson.annotation.JsonFormat;
import com.shiyi.util.DateUtils;
import io.swagger.annotations.ApiModel;
import io.swagger.annotations.ApiModelProperty;
import lombok.*;
import java.io.Serializable;
import java.util.Date;
/**
* <p>
* Blog article entity, mapped to table {@code b_article}.
* </p>
*
* <p>
* Accessors, equals/hashCode, a builder and both constructors are generated by the Lombok
* annotations below; the swagger annotation values are the published field descriptions.
* </p>
*
* @author blue
* @since 2021-08-18
*/
@Data
@EqualsAndHashCode(callSuper = false)
@TableName("b_article")
@ApiModel(value="BlogArticle对象", description="博客文章表")
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class BlogArticle implements Serializable {
private static final long serialVersionUID=1L;
// Primary key; IdType.AUTO, i.e. generated by the database
@ApiModelProperty(value = "主键id")
@TableId(value = "id", type = IdType.AUTO)
private Long id;
// Id of the owning user
@ApiModelProperty(value = "用户id")
private Long userId;
// Id of the category
@ApiModelProperty(value = "分类id")
private Long categoryId;
// Article title
@ApiModelProperty(value = "文章标题")
private String title;
// Cover image address
@ApiModelProperty(value = "文章封面地址")
private String avatar;
// Short summary
@ApiModelProperty(value = "文章简介")
private String summary;
// Article body
@ApiModelProperty(value = "文章内容")
private String content;
// Article body, Markdown version
@ApiModelProperty(value = "文章内容MD版")
private String contentMd;
// Publish state: 0 = taken down, 1 = published
@ApiModelProperty(value = "发布状态 0:下架;1:上架")
private Integer isPublish;
// Private article flag: 0 = no, 1 = yes
@ApiModelProperty(value = "是否是私密文章 0 否 1是")
private Integer isSecret;
// Pinned flag: 0 = no, 1 = yes
@ApiModelProperty(value = "是否置顶 0否 1是")
private Integer isStick;
// Origin flag: 0 = reposted, 1 = original
@ApiModelProperty(value = "是否原创 0:转载 1:原创")
private Integer isOriginal;
// Address the article was reposted from
@ApiModelProperty(value = "转发地址")
private String originalUrl;
// Read count. NOTE(review): the published b_article DDL declares `quantity` as bigint — confirm Integer is wide enough
@ApiModelProperty(value = "文章阅读量")
private Integer quantity;
// Free-form remark
@ApiModelProperty(value = "说明")
private String remark;
// SEO keywords
@ApiModelProperty(value = "SEO关键词")
private String keywords;
// Creation time; marked for fill on INSERT and serialized with DateUtils.FORMAT_STRING in GMT+8
@ApiModelProperty(value = "创建时间")
@TableField(fill = FieldFill.INSERT)
@JsonFormat(pattern = DateUtils.FORMAT_STRING,timezone="GMT+8")
private Date createTime;
// Last update time; marked for fill on UPDATE and serialized with DateUtils.FORMAT_STRING in GMT+8
@ApiModelProperty(value = "最后更新时间")
@TableField(fill = FieldFill.UPDATE)
@JsonFormat(pattern = DateUtils.FORMAT_STRING,timezone="GMT+8")
private Date updateTime;
}
标签实体:
package com.shiyi.entity;
import com.baomidou.mybatisplus.annotation.*;
import java.util.Date;
import java.io.Serializable;
import com.fasterxml.jackson.annotation.JsonFormat;
import com.shiyi.util.DateUtils;
import io.swagger.annotations.ApiModel;
import io.swagger.annotations.ApiModelProperty;
import lombok.*;
/**
* <p>
* Blog tag entity, mapped to table {@code b_tags}.
* </p>
*
* <p>
* Accessors, equals/hashCode, a builder and the all-args/no-args constructors are generated by
* the Lombok annotations below.
* </p>
*
* @author blue
* @since 2021-09-09
*/
@Data
@EqualsAndHashCode(callSuper = false)
@TableName("b_tags")
@ApiModel(value="Tags对象", description="博客标签表")
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class Tags implements Serializable {
private static final long serialVersionUID=1L;
// Primary key; IdType.AUTO, i.e. generated by the database
@ApiModelProperty(value = "主键id")
@TableId(value = "id", type = IdType.AUTO)
private Long id;
// Tag name
@ApiModelProperty(value = "标签名称")
private String name;
// Sort order
@ApiModelProperty(value = "排序")
private int sort;
// Click count
@ApiModelProperty(value = "点击量")
private int clickVolume;
// Creation time; marked for fill on INSERT and serialized with DateUtils.FORMAT_STRING in GMT+8
@ApiModelProperty(value = "创建时间")
@TableField(fill = FieldFill.INSERT)
@JsonFormat(pattern = DateUtils.FORMAT_STRING,timezone="GMT+8")
private Date createTime;
// Last update time; marked for fill on UPDATE and serialized with DateUtils.FORMAT_STRING in GMT+8
@ApiModelProperty(value = "最后更新时间")
@TableField(fill = FieldFill.UPDATE)
@JsonFormat(pattern = DateUtils.FORMAT_STRING,timezone="GMT+8")
private Date updateTime;
// Not a table column (exist = false); presumably the number of articles using this tag, filled by queries — confirm
@TableField(exist = false)
private int articleCount;
// Convenience constructor that sets only the id and the click count
public Tags(Long id, int clickVolume) {
this.id = id;
this.clickVolume = clickVolume;
}
}
文章实体对应的mysql8.0的脚本:
-- blog.b_article definition
-- Article table backing the BlogArticle entity.
-- NOTE(review): the AUTO_INCREMENT=429 table option looks like the counter value captured when this definition was exported — confirm whether it is wanted on a fresh install.
CREATE TABLE `b_article` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键id',
`user_id` bigint DEFAULT NULL COMMENT '用户id',
`category_id` bigint DEFAULT NULL COMMENT '分类id',
`title` varchar(150) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '' COMMENT '文章标题',
`avatar` varchar(200) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '文章封面地址',
`summary` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '' COMMENT '文章简介',
-- NOTE(review): the column comment below says "at most two hundred characters", yet the column is mediumtext and receives the full article HTML — confirm which is intended
`content` mediumtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '文章内容 (最多两百字)',
`content_md` mediumtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci COMMENT '文章内容md版',
`is_secret` int DEFAULT '0' COMMENT '是否是私密文章 0 否 1是',
`is_stick` int DEFAULT '0' COMMENT '是否置顶 0否 1是',
`is_publish` int DEFAULT '0' COMMENT '是否发布 0:下架 1:发布',
`is_original` int DEFAULT NULL COMMENT '是否原创 0:转载 1:原创',
`original_url` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT '转载地址',
`quantity` bigint DEFAULT '0' COMMENT '文章阅读量',
`remark` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT '' COMMENT '说明',
`create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
`keywords` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL COMMENT 'seo关键词',
`update_time` datetime DEFAULT NULL COMMENT '修改时间',
PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=429 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='博客文章表';
标签实体mysql8.0脚本:
-- blog.b_tags definition
-- Tag table backing the Tags entity; `name` is indexed (tag_name) but not declared unique.
CREATE TABLE `b_tags` (
`id` bigint NOT NULL AUTO_INCREMENT COMMENT '主键id',
`name` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL DEFAULT '' COMMENT '标签名称',
`click_volume` int DEFAULT '0',
-- `sort` is NOT NULL and declares no DEFAULT
`sort` int NOT NULL COMMENT '排序',
`create_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
`update_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '最后更新时间',
PRIMARY KEY (`id`) USING BTREE,
KEY `tag_name` (`name`) USING BTREE COMMENT '博客标签名称'
) ENGINE=InnoDB AUTO_INCREMENT=289 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC COMMENT='博客标签表';
文章与标签关联表:
-- blog.b_article_tag definition
-- Link table between articles and tags; each (article_id, tag_id) pair may appear only once.
-- article_id / tag_id are bigint so they can hold every value of b_article.id / b_tags.id,
-- which are both declared as bigint.
CREATE TABLE `b_article_tag` (
  `id` int NOT NULL AUTO_INCREMENT,
  `article_id` bigint NOT NULL COMMENT '文章id',
  `tag_id` bigint NOT NULL COMMENT '标签id',
  PRIMARY KEY (`id`) USING BTREE,
  UNIQUE KEY `fk_article_tag_1` (`article_id`,`tag_id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=1308 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci ROW_FORMAT=DYNAMIC;