-- State table for extraction tasks (time window, paging position, authors).
-- The sequence extract_task_temp_731_id_seq referenced by the id default
-- must already exist when this statement runs.
CREATE TABLE extract_task_temp
(
    id integer NOT NULL DEFAULT nextval('extract_task_temp_731_id_seq'::regclass),
    task_init_time timestamp with time zone, -- time the extraction task was initialised
    task_current_time timestamp with time zone, -- extraction time of the current task
    task_next_time timestamp with time zone, -- extraction time of the next task
    create_time timestamp with time zone DEFAULT now(),
    update_time timestamp with time zone, -- modification time
    task_type integer, -- task type: 1 = article, 2 = reply
    website_id integer, -- site type id
    start_size integer, -- paging start offset
    limit_size integer, -- number of records fetched per batch
    cid integer, -- customer id
    authors text, -- author nicknames
    interval_time integer, -- interval between runs, in minutes
    -- id is sequence-backed and NOT NULL; declare it as the key so rows are
    -- uniquely addressable and duplicates are rejected by the database.
    CONSTRAINT extract_task_temp_pkey PRIMARY KEY (id)
);
package com.cyyun.mobile.tools;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Resource;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.log4j.Logger;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import com.cyyun.mobile.dao.ICommentAccountTempDao;
import com.cyyun.mobile.dao.IExtractTaskTempDao;
import com.cyyun.mobile.pojo.CommentAccountTemp;
import com.cyyun.mobile.pojo.ExtractTaskTemp;
import com.cyyun.mobile.service.ExtractTaskTempService;
import com.cyyun.mobile.tools.httpconnection.HttpUrlConnection;
import com.cyyun.mobile.tools.json.JsonEntity;
import com.cyyun.mobile.tools.json.JsonEntityArray;
import com.twmacinta.util.MD5;
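/**
 * Data extraction task: a Spring component whose scheduled {@code execute()}
 * method runs the article and reply extraction tasks.
 *
 * @author zhangzm
 */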
@Component
public class ExtractTask {
// Class-wide log4j logger, shared by the instance and static methods.
static Logger log = Logger.getLogger(ExtractTask.class);
// Injected DAO for extract-task rows; not referenced by the methods visible in this class.
@Resource
IExtractTaskTempDao iExtractTaskTempDao;
// Injected DAO for comment-account rows; not referenced by the methods visible in this class.
@Resource
ICommentAccountTempDao iCommentAccountTempDao;
// Injected service used for every task/account query and update performed by this class.
@Resource
ExtractTaskTempService extractTaskTempService;
/**
 * Loads the extraction tasks to process.
 *
 * @return whatever the service returns for a {@code null} query argument, or
 *         {@code null} when the lookup throws (the failure is logged)
 */
public List<ExtractTaskTemp> getExtractTaskTemp() {
    try {
        return extractTaskTempService.queryExtractTaskTemp(null);
    } catch (Exception ex) {
        // Best effort: report the failure and let the caller see "no tasks".
        log.error(ex.getMessage(), ex);
        return null;
    }
}
/**
 * Scheduled entry point (the cron expression fires at second 0 of every minute).
 * Loads all tasks and dispatches each one: task type "1" goes to
 * {@link #createTask(ExtractTaskTemp)}, every other type goes to
 * {@link #createReplyTask(ExtractTaskTemp)}.
 *
 * A failure in one task is logged and does not prevent the remaining tasks
 * from running.
 */
@Scheduled(cron = "0 0/1 * * * ?")
public void execute() {
    List<ExtractTaskTemp> extractTaskTemps = getExtractTaskTemp();
    if (CollectionUtils.isEmpty(extractTaskTemps)) {
        log.warn("extractTaskTemps isEmpty");
        return;
    }
    for (ExtractTaskTemp task : extractTaskTemps) {
        if (task == null) {
            continue;
        }
        try {
            // String comparison keeps the dispatch independent of the getter's numeric type.
            if ("1".equals(String.valueOf(task.getTaskType()))) {
                createTask(task);
            } else {
                createReplyTask(task);
            }
        } catch (Exception ex) {
            // Isolate the failure: an unchecked exception here used to abort the
            // whole loop and silently skip every task after the failing one.
            log.error("execute extract task failed, taskType=" + task.getTaskType(), ex);
        }
    }
}
/**
 * Fetches one batch of reply ids for the task's current time window and
 * hands them to the service for storage.
 *
 * The task is initialised first via {@link #initTask(ExtractTaskTemp)}. When the
 * remote interface returns an empty array, the task's authors are cleared and
 * its start size reset, so the next run re-initialises the task.
 *
 * @param bean task to process; a {@code null} bean or a {@code null} task type
 *             is logged and ignored
 */
public void createReplyTask(ExtractTaskTemp bean) {
    if (bean == null) {
        log.warn("ExtractTaskTemp is null ");
        return;
    }
    if (null == bean.getTaskType()) {
        log.warn("ExtractTaskTemp getTaskType is null ");
        return;
    }
    initTask(bean);
    // initTask swallows its own failures, so the window may still be unset here.
    if (bean.getTaskCurrentTime() == null || bean.getTaskNextTime() == null) {
        log.warn("ExtractTaskTemp time window is not set, skip reply task");
        return;
    }
    HttpUrlConnection connection = new HttpUrlConnection();
    Map<String, String> dataMap = new HashMap<String, String>();
    String url = Constant.GET_ARTICLE_REPLY_URL;
    dataMap.put("order", "rid");
    dataMap.put("desc", "asc");
    dataMap.put("cid", String.valueOf(bean.getCid()));
    dataMap.put("limit", String.valueOf(bean.getLimitSize()));
    dataMap.put("fid", String.valueOf(bean.getWebsiteId()));
    dataMap.put("authors", bean.getAuthors());
    dataMap.put("from", String.valueOf(bean.getTaskCurrentTime().getTime()));
    dataMap.put("to", String.valueOf(bean.getTaskNextTime().getTime()));
    // NOTE(review): unlike createTask, no "start" offset is sent although
    // startSize is advanced below - confirm how this interface pages.
    StringBuilder logBuilder = new StringBuilder();
    // Log the window that is actually requested ("from" is taskCurrentTime).
    logBuilder.append("开始时间为:").append(bean.getTaskCurrentTime())
            .append("结束时间为:").append(bean.getTaskNextTime())
            .append(" url :" + url).append(" dataMap :" + dataMap);
    log.info(logBuilder.toString());
    String response;
    try {
        response = connection.readData(dataMap, url);
    } catch (Exception e) {
        log.error(e.getMessage(), e);
        return;
    }
    if (StringUtils.isBlank(response)) {
        // Nothing to parse; leave the task untouched so the next run retries.
        log.warn("get response is blank " + url + " " + dataMap);
        return;
    }
    JsonEntityArray array = "[]".equals(response) ? null : new JsonEntityArray(response);
    if (array == null || array.size() == 0) {
        // Empty result (however the empty array is spelled): clear the authors
        // so that the next run re-initialises the task.
        bean.setAuthors(null);
        bean.setStartSize(0);
        extractTaskTempService.updateExtractTaskTemp(bean);
        log.warn("get response is null " + url + " " + dataMap);
        return;
    }
    List<Map<String, Object>> addArticleBeans = new ArrayList<Map<String, Object>>();
    for (int i = 0; i < array.size(); i++) {
        JsonEntity jsonE = array.getJsonEntity(i);
        String rid = jsonE.getString("rid");
        Map<String, Object> map = new HashMap<String, Object>();
        map.put("rid", Integer.valueOf(rid));
        addArticleBeans.add(map);
    }
    try {
        bean.setStartSize(bean.getStartSize() + addArticleBeans.size());
        extractTaskTempService
                .updateExtractTaskTempAndArticleReplyTemp(bean,
                        addArticleBeans);
    } catch (Exception e) {
        // Two-argument form: log4j prints no stack trace when the Throwable
        // is passed as the message.
        log.error(e.getMessage(), e);
    }
}
/**
 * Joins the nicknames of the given accounts into one comma-separated string.
 *
 * {@code null} accounts and accounts whose nickname is {@code null} or blank
 * are skipped, so the result never contains the literal text "null" or empty
 * entries.
 *
 * @param accountTemps accounts to read nicknames from; may be {@code null} or empty
 * @return the nicknames joined with "," (no trailing comma), or an empty
 *         string when there is nothing to join
 */
public String getCommentAccountTempName(
        List<CommentAccountTemp> accountTemps) {
    StringBuilder names = new StringBuilder();
    if (CollectionUtils.isEmpty(accountTemps)) {
        return names.toString();
    }
    try {
        for (CommentAccountTemp account : accountTemps) {
            if (account == null) {
                continue;
            }
            String name = account.getCommentNickname();
            if (StringUtils.isBlank(name)) {
                continue;
            }
            // Separator goes before every entry except the first.
            if (names.length() > 0) {
                names.append(',');
            }
            names.append(name);
        }
    } catch (Exception e) {
        // Keep whatever was collected so far; this method never throws.
        log.error(e.getMessage(), e);
    }
    return names.toString();
}
/**
 * Initialises the task: sets the start time, the end time, and the task start
 * time in the account table (each end time in the task table equals the start
 * time in the account table). Times that appear in the account table have the
 * form: initial time + interval * n.
 *
 * Nothing is done when the task already has authors. Otherwise accounts are
 * looked up for the task's initial time; if none are found, they are looked up
 * for the task's next time and the window advances from there. The collected
 * nicknames are stored on the task and the task is persisted together with
 * the accounts. Failures are logged and never thrown from the service calls.
 *
 * @param bean task to initialise; {@code null} is logged and ignored
 */
public void initTask(ExtractTaskTemp bean) {
    if (bean == null) {
        log.warn("ExtractTaskTemp is null ");
        return;
    }
    if (StringUtils.isNotBlank(bean.getAuthors())) {
        return;
    }
    List<CommentAccountTemp> accountTemps = null;
    try {
        accountTemps = extractTaskTempService.queryCommentAccountTemp(
                buildAccountQuery(bean, bean.getTaskInitTime()));
    } catch (Exception e) {
        log.error(e.getMessage(), e);
    }
    if (CollectionUtils.isEmpty(accountTemps)) {
        try {
            accountTemps = extractTaskTempService.queryCommentAccountTemp(
                    buildAccountQuery(bean, bean.getTaskNextTime()));
            // Only advance the window once the lookup itself has succeeded.
            moveWindow(bean, bean.getTaskNextTime());
        } catch (Exception e) {
            log.error(e.getMessage(), e);
        }
    } else {
        moveWindow(bean, bean.getTaskInitTime());
    }
    try {
        bean.setAuthors(getCommentAccountTempName(accountTemps));
        extractTaskTempService.updateExtractTaskTempAndCommentAccountTemp(
                bean, accountTemps);
    } catch (Exception e) {
        // Two-argument form: log4j prints no stack trace when the Throwable
        // is passed as the message.
        log.error(e.getMessage(), e);
    }
}

/** Builds the account lookup parameters for the task's website and the given task time. */
private Map<String, Object> buildAccountQuery(ExtractTaskTemp bean, Date taskTime) {
    Map<String, Object> query = new HashMap<String, Object>();
    query.put("websiteId", bean.getWebsiteId());
    query.put("deleteFlag", 1);
    query.put("taskTime", taskTime);
    return query;
}

/** Resets paging and sets the task window to [windowStart, windowStart + interval minutes]. */
private void moveWindow(ExtractTaskTemp bean, Date windowStart) {
    bean.setStartSize(0);
    bean.setTaskCurrentTime(windowStart);
    bean.setTaskNextTime(DateUtils.addMinutes(windowStart, bean.getIntervalTime()));
}
/**
 * Extracts article ids for the task's current time window, page by page.
 *
 * The task is initialised first via {@link #initTask(ExtractTaskTemp)}. Each
 * page is requested with the task's current start offset; the returned ids are
 * handed to the service and the offset advances by the number of ids received.
 * Paging stops when the interface reports a count of 0 or returns no items
 * (the authors are then cleared so the next run re-initialises the task), or
 * when a request, response or persist step fails (the task is left as stored
 * so the next scheduled run retries).
 *
 * @param bean task to process; a {@code null} bean or a {@code null} task type
 *             is logged and ignored
 */
public void createTask(ExtractTaskTemp bean) {
    if (bean == null) {
        log.warn("ExtractTaskTemp is null ");
        return;
    }
    if (null == bean.getTaskType()) {
        log.warn("ExtractTaskTemp getTaskType is null ");
        return;
    }
    initTask(bean);
    // initTask swallows its own failures, so the window may still be unset here.
    if (bean.getTaskCurrentTime() == null || bean.getTaskNextTime() == null) {
        log.warn("ExtractTaskTemp time window is not set, skip article task");
        return;
    }
    String url = Constant.GET_ARTICLE_URL;
    while (true) {
        HttpUrlConnection connection = new HttpUrlConnection();
        Map<String, String> dataMap = new HashMap<String, String>();
        dataMap.put("action", "full");
        dataMap.put("sort", "6");
        dataMap.put("order", "1");
        dataMap.put("start", String.valueOf(bean.getStartSize()));
        dataMap.put("cid", String.valueOf(bean.getCid()));
        dataMap.put("limit", String.valueOf(bean.getLimitSize()));
        dataMap.put("fid", String.valueOf(bean.getWebsiteId()));
        dataMap.put("authors", bean.getAuthors());
        dataMap.put("from",
                String.valueOf(bean.getTaskCurrentTime().getTime()));
        dataMap.put("to", String.valueOf(bean.getTaskNextTime().getTime()));
        StringBuilder logBuilder = new StringBuilder();
        // Log the window that is actually requested ("from" is taskCurrentTime).
        logBuilder.append("开始时间为:").append(bean.getTaskCurrentTime())
                .append("结束时间为:").append(bean.getTaskNextTime())
                .append(" url :" + url).append(" dataMap :" + dataMap);
        log.info(logBuilder.toString());
        String response;
        try {
            response = connection.readData(dataMap, url);
        } catch (Exception e) {
            log.error(e.getMessage(), e);
            return;
        }
        if (StringUtils.isBlank(response)) {
            log.warn("get response is blank " + url + " " + dataMap);
            return;
        }
        JsonEntity jsonEntity = new JsonEntity(response);
        int result;
        try {
            result = Integer.parseInt(jsonEntity.getString("count"));
        } catch (NumberFormatException e) {
            // Missing or non-numeric count: treat as a bad response, not as "done".
            log.error("invalid count in response " + url + " " + dataMap, e);
            return;
        }
        if (result == 0) {
            log.error("抽取文件数据为0条 ");
            resetArticleWindow(bean);
            return;
        }
        JsonEntityArray array = jsonEntity.getJsonEntityArray("items");
        List<Map<String, Object>> addArticleBeans = new ArrayList<Map<String, Object>>();
        if (array != null) {
            for (int i = 0; i < array.size(); i++) {
                JsonEntity jsonE = array.getJsonEntity(i);
                String aid = jsonE.getString("aid");
                Map<String, Object> map = new HashMap<String, Object>();
                map.put("aid", Integer.valueOf(aid));
                addArticleBeans.add(map);
            }
        }
        if (addArticleBeans.isEmpty()) {
            // A non-zero count with no items would leave the offset unchanged
            // and repeat the identical request forever; treat it as the end
            // of the window instead.
            log.warn("count is " + result + " but no items returned " + url
                    + " " + dataMap);
            resetArticleWindow(bean);
            return;
        }
        try {
            bean.setStartSize(bean.getStartSize()
                    + addArticleBeans.size());
            extractTaskTempService.updateExtractTaskTempAndArticleTemp(
                    bean, addArticleBeans);
        } catch (Exception e) {
            // Stop paging: continuing would skip the page that was not stored.
            log.error(e.getMessage(), e);
            return;
        }
    }
}

/** Marks the current window as finished: clears the authors, resets paging and persists the task. */
private void resetArticleWindow(ExtractTaskTemp bean) {
    bean.setAuthors(null);
    bean.setStartSize(0);
    extractTaskTempService.updateExtractTaskTemp(bean);
}
/**
 * Looks up every GUID in the set using the customer id {@code Constant.CID}.
 *
 * @param guids GUIDs to look up; a {@code null} or empty set is a no-op
 */
public void getArticleByGuid(Set<String> guids) {
    if (CollectionUtils.isEmpty(guids)) {
        return;
    }
    for (String each : guids) {
        getArticleByGuid(Constant.CID, each);
    }
}
/**
 * Requests the article with the given GUID from the article interface and
 * logs an error when the reported count is 0. The response is not used
 * further and nothing is returned.
 *
 * @param cid  customer id sent as "cid"; blank values are logged and ignored
 * @param guid article GUID sent as "guid"; blank values are logged and ignored
 */
public void getArticleByGuid(String cid, String guid) {
    if (StringUtils.isBlank(cid)) {
        log.warn("cid is null ");
        return;
    }
    if (StringUtils.isBlank(guid)) {
        // Previously reported "cid is null" for a blank guid as well.
        log.warn("guid is null ");
        return;
    }
    HttpUrlConnection connection = new HttpUrlConnection();
    Map<String, String> dataMap = new HashMap<String, String>();
    String url = Constant.GET_ARTICLE_URL;
    dataMap.put("cid", cid);
    dataMap.put("guid", guid);
    String response;
    try {
        response = connection.readData(dataMap, url);
    } catch (Exception e) {
        log.error(e.getMessage(), e);
        return;
    }
    if (StringUtils.isBlank(response)) {
        log.warn("get response is blank " + url + " " + dataMap);
        return;
    }
    // Parse the JSON response.
    JsonEntity jsonEntity = new JsonEntity(response);
    int result = Integer.parseInt(jsonEntity.getString("count"));
    if (result == 0) {
        log.error("获取0条数据");
    }
}
/**
 * Static test helper mirroring {@code getArticleByGuid(String, String)}:
 * requests the article with the given GUID and logs an error when the
 * reported count is 0.
 *
 * @param cid  customer id sent as "cid"; blank values are logged and ignored
 * @param guid article GUID sent as "guid"; blank values are logged and ignored
 */
public static void testGetArticleByGuid(String cid, String guid) {
    if (StringUtils.isBlank(cid)) {
        log.warn("cid is null ");
        return;
    }
    if (StringUtils.isBlank(guid)) {
        // Previously reported "cid is null" for a blank guid as well.
        log.warn("guid is null ");
        return;
    }
    HttpUrlConnection connection = new HttpUrlConnection();
    Map<String, String> dataMap = new HashMap<String, String>();
    // The request URL was never declared here; use the same endpoint as the
    // instance method this helper mirrors.
    String url = Constant.GET_ARTICLE_URL;
    dataMap.put("cid", cid);
    dataMap.put("guid", guid);
    String response;
    try {
        response = connection.readData(dataMap, url);
    } catch (Exception e) {
        log.error(e.getMessage(), e);
        return;
    }
    if (StringUtils.isBlank(response)) {
        log.warn("get response is blank " + url + " " + dataMap);
        return;
    }
    // Parse the JSON response.
    JsonEntity jsonEntity = new JsonEntity(response);
    int result = Integer.parseInt(jsonEntity.getString("count"));
    if (result == 0) {
        log.error("获取0条数据");
    }
}
/**
 * Fetches the content of one article.
 *
 * @param aid article id, sent to the content interface as "aid"
 * @return the "content" field of the JSON response, or {@code null} when the
 *         request throws (the failure is logged) or the response is blank
 */
public String getArticleContent(Long aid) {
    HttpUrlConnection http = new HttpUrlConnection();
    Map<String, String> params = new HashMap<String, String>();
    String endpoint = Constant.GET_ARTICLE_CONTENT_URL;
    params.put("aid", String.valueOf(aid));
    String body;
    try {
        body = http.readData(params, endpoint);
    } catch (Exception ex) {
        log.error(ex.getMessage(), ex);
        return null;
    }
    if (StringUtils.isBlank(body)) {
        return null;
    }
    // Non-blank response: parse it and hand back the content field.
    return new JsonEntity(body).getString("content");
}
/**
 * Manual smoke test for the article interface: sends a fixed sample query and
 * logs how many items came back. Failures are logged, not thrown.
 */
public static void testSpidArticle() {
    HttpUrlConnection connection = new HttpUrlConnection();
    Map<String, String> dataMap = new HashMap<String, String>();
    // The request URL was never declared here. The parameters match the ones
    // createTask sends to the article interface - NOTE(review): confirm endpoint.
    String url = Constant.GET_ARTICLE_URL;
    dataMap.put("start", "0");
    dataMap.put("cid", "731");
    dataMap.put("limit", "5");
    dataMap.put("action", "full");
    dataMap.put("sort", "6");
    dataMap.put("order", "1");
    // NOTE(review): createTask sends this filter as "authors" - confirm the key.
    dataMap.put("author", "品味咖啡");
    // The parameters used to be cleared right here, so the request went out empty.
    try {
        String response = connection.readData(dataMap, url);
        JsonEntity jsonEntity = new JsonEntity(response);
        JsonEntityArray array = jsonEntity.getJsonEntityArray("items");
        log.info("testSpidArticle items: " + (array == null ? 0 : array.size()));
    } catch (Exception e) {
        log.error(e.getMessage(), e);
    }
}
/**
 * Manual smoke test for the reply interface: sends a fixed sample query and
 * logs how many entries came back. Failures are logged, not thrown.
 */
public static void testArticleReply() {
    HttpUrlConnection connection = new HttpUrlConnection();
    Map<String, String> dataMap = new HashMap<String, String>();
    // The request URL was never declared here. The response is parsed as a bare
    // array, as createReplyTask does - NOTE(review): confirm endpoint.
    String url = Constant.GET_ARTICLE_REPLY_URL;
    dataMap.put("start", "0");
    dataMap.put("cid", Constant.CID);
    dataMap.put("limit", "10");
    // NOTE(review): createReplyTask sends this filter as "authors" - confirm the key.
    dataMap.put("author", "最爱看九爷");
    try {
        String response = connection.readData(dataMap, url);
        JsonEntityArray array = new JsonEntityArray(response);
        log.info("testArticleReply items: " + array.size());
    } catch (Exception e) {
        log.error(e.getMessage(), e);
    }
}
/**
 * Returns the hexadecimal MD5 digest of a URL.
 *
 * @param url text to hash
 * @return the hex digest; the argument itself when it is {@code null} or
 *         blank; {@code null} when hashing throws (the failure is logged)
 */
public static String getMD5Url(String url) {
    if (StringUtils.isNotBlank(url)) {
        try {
            MD5 digest = new MD5();
            digest.Update(url);
            return digest.asHex();
        } catch (Exception ex) {
            log.error("md5 加密异常", ex);
            return null;
        }
    }
    // Blank input is handed back unchanged.
    return url;
}
/**
 * Command-line entry point; the body is empty, so running it does nothing.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
}
}
最好把传输数据接口的参数都配置到数据库中,
每次任务都记录好当前任务的参数。这样当宕机或者重启的时候,有利于保存当前的查询参数,也有利于下一次的查询。