接口:
/**
 * Contract for turning links into preview information (title, description, site logo).
 */
public interface UrlDiscover {
/**
 * Main entry point: given a piece of text, parses out a map of
 * web link -> (title, description, site logo).
 *
 * @param content text that may contain links
 * @return map keyed by link; annotated nullable
 */
@Nullable
Map<String, UrlInfo> getUrlContentMap(String content);
/**
 * Parses the title, description and site logo for a single link.
 *
 * @param url link to inspect
 * @return the extracted info; annotated nullable
 */
@Nullable
UrlInfo getContent(String url);
/**
 * Extracts the title from a parsed page.
 *
 * @param document parsed page
 * @return the title; annotated nullable
 */
@Nullable
String getTitle(Document document);
/**
 * Extracts the description from a parsed page.
 *
 * @param document parsed page
 * @return the description; annotated nullable
 */
@Nullable
String getDescription(Document document);
/**
 * Extracts the image (site logo) for a page.
 *
 * @param url      link of the page
 * @param document parsed page
 * @return the image location; annotated nullable
 */
@Nullable
String getImage(String url, Document document);
}
AbstractUrlDiscover
@Slf4j
public abstract class AbstractUrlDiscover implements UrlDiscover {
//链接识别的正则
private static final Pattern PATTERN = Pattern.compile("((http|https)://)?(www.)?([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:/~+#-]*[\\w@?^=%&/~+#-])?");
@Nullable
@Override
public Map<String, UrlInfo> getUrlContentMap(String content) {
if (StrUtil.isBlank(content)) {
return new HashMap<>();
}
List<String> matchList = ReUtil.findAll(PATTERN, content, 0);
//并行请求
List<CompletableFuture<Pair<String, UrlInfo>>> futures = matchList.stream().map(match -> CompletableFuture.supplyAsync(() -> {
UrlInfo urlInfo = getContent(match);
return Objects.isNull(urlInfo) ? null : Pair.of(match, urlInfo);
})).collect(Collectors.toList());
CompletableFuture<List<Pair<String, UrlInfo>>> future = FutureUtils.sequenceNonNull(futures);
//结果组装
return future.join().stream().collect(Collectors.toMap(Pair::getFirst, Pair::getSecond, (a, b) -> a));
}
@Nullable
@Override
public UrlInfo getContent(String url) {
Document document = getUrlDocument(assemble(url));
if (Objects.isNull(document)) {
return null;
}
return UrlInfo.builder()
.title(getTitle(document))
.description(getDescription(document))
.image(getImage(assemble(url), document)).build();
}
/***
* 没有协议头的增加一个协议头
* @param url
* @return
*/
private String assemble(String url) {
if (!StrUtil.startWith(url, "http")) {
return "http://" + url;
}
return url;
}
/**
*Jsoup 解析网站 拿到页面
* @param matchUrl
* @return
*/
protected Document getUrlDocument(String matchUrl) {
try {
Connection connect = Jsoup.connect(matchUrl);
connect.timeout(2000);
return connect.get();
} catch (Exception e) {
log.error("find error:url:{}", matchUrl, e);
}
return null;
}
/**
* 判断链接是否有效
* 输入链接
* 返回true或者false
*/
public static boolean isConnect(String href) {
//请求地址
URL url;
//请求状态码
int state;
//下载链接类型
String fileType;
try {
url = new URL(href);
HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
state = httpURLConnection.getResponseCode();
fileType = httpURLConnection.getHeaderField("Content-Disposition");
//如果成功200,缓存304,移动302都算有效链接,并且不是下载链接
if ((state == 200 || state == 302 || state == 304) && fileType == null) {
return true;
}
httpURLConnection.disconnect();
} catch (Exception e) {
return false;
}
return false;
}
}
AbstractUrlDiscover子类一:通用网站解析
/**
 * Generic page parser: reads the title from {@code <title>}, the description from the
 * description/keywords meta tags and the logo from the icon {@code <link>} elements.
 */
public class CommonUrlDiscover extends AbstractUrlDiscover {

    /**
     * @param document parsed page
     * @return the document title
     */
    @Nullable
    @Override
    public String getTitle(Document document) {
        return document.title();
    }

    /**
     * Returns the meta description (falling back to the meta keywords), cut at the first
     * Chinese full stop so that only one sentence is kept.
     *
     * @param document parsed page
     * @return the first sentence of the description; the whole text when it has no full stop;
     *         the blank text unchanged when neither meta tag has content
     */
    @Nullable
    @Override
    public String getDescription(Document document) {
        String description = document.head().select("meta[name=description]").attr("content");
        String keywords = document.head().select("meta[name=keywords]").attr("content");
        String content = StrUtil.isNotBlank(description) ? description : keywords;
        if (StrUtil.isBlank(content)) {
            return content;
        }
        // indexOf returns -1 when there is no full stop; substring(0, -1) would throw.
        int sentenceEnd = content.indexOf("。");
        return sentenceEnd < 0 ? content : content.substring(0, sentenceEnd);
    }

    /**
     * Locates the site logo.
     *
     * @param url      URL of the page
     * @param document parsed page
     * @return {@code url} itself when it already points at a favicon; otherwise the icon href
     *         when it is directly reachable, else the href appended to {@code url};
     *         {@code null} when the page declares no icon link
     */
    @Nullable
    @Override
    public String getImage(String url, Document document) {
        String image = document.select("link[type=image/x-icon]").attr("href");
        // Fall back to any link whose rel attribute ends with "icon".
        String href = StrUtil.isEmpty(image) ? document.select("link[rel$=icon]").attr("href") : image;
        // The URL itself already points at the logo.
        if (StrUtil.containsAny(url, "favicon")) {
            return url;
        }
        // No icon declared: report "no image" instead of returning the page URL as a logo.
        if (StrUtil.isBlank(href)) {
            return null;
        }
        // The icon is directly reachable (absolute or protocol-relative href).
        if (isConnect(!StrUtil.startWith(href, "http") ? "http:" + href : href)) {
            return href;
        }
        return StrUtil.format("{}/{}", url, StrUtil.removePrefix(href, "/"));
    }
}
AbstractUrlDiscover子类二:优先级调用处理器。自身不做解析,只按优先级依次委托给其他子类,返回第一个非空结果
/**
 * Chain of {@link UrlDiscover} delegates tried in priority order; for every piece of
 * information the first non-blank answer wins.
 */
public class PrioritizedUrlDiscover extends AbstractUrlDiscover {

    /** Delegates in priority order. */
    private final List<UrlDiscover> urlDiscovers = new ArrayList<>(2);

    /** Registers the WeChat parser first and the generic parser as fallback. */
    public PrioritizedUrlDiscover() {
        urlDiscovers.add(new WxUrlDiscover());
        urlDiscovers.add(new CommonUrlDiscover());
    }

    /**
     * @param document parsed page
     * @return the first non-blank title produced by a delegate, or {@code null} if none
     */
    @Nullable
    @Override
    public String getTitle(Document document) {
        // The stream is lazy, so later delegates are only asked when earlier ones gave nothing.
        return urlDiscovers.stream()
                .map(delegate -> delegate.getTitle(document))
                .filter(StrUtil::isNotBlank)
                .findFirst()
                .orElse(null);
    }

    /**
     * @param document parsed page
     * @return the first non-blank description produced by a delegate, or {@code null} if none
     */
    @Nullable
    @Override
    public String getDescription(Document document) {
        return urlDiscovers.stream()
                .map(delegate -> delegate.getDescription(document))
                .filter(StrUtil::isNotBlank)
                .findFirst()
                .orElse(null);
    }

    /**
     * @param url      URL of the page
     * @param document parsed page
     * @return the first non-blank image produced by a delegate, or {@code null} if none
     */
    @Nullable
    @Override
    public String getImage(String url, Document document) {
        return urlDiscovers.stream()
                .map(delegate -> delegate.getImage(url, document))
                .filter(StrUtil::isNotBlank)
                .findFirst()
                .orElse(null);
    }
}
AbstractUrlDiscover子类三:微信url解析
/**
 * Parser for WeChat links: reads the Open Graph {@code og:*} properties of the page.
 */
public class WxUrlDiscover extends AbstractUrlDiscover {

    /**
     * @param document parsed page
     * @return content of the {@code og:title} property
     */
    @Nullable
    @Override
    public String getTitle(Document document) {
        return propertyContent(document, "og:title");
    }

    /**
     * @param document parsed page
     * @return content of the {@code og:description} property
     */
    @Nullable
    @Override
    public String getDescription(Document document) {
        return propertyContent(document, "og:description");
    }

    /**
     * @param url      URL of the page (unused here)
     * @param document parsed page
     * @return content of the {@code og:image} property when that link is reachable, else {@code null}
     */
    @Nullable
    @Override
    public String getImage(String url, Document document) {
        String imageHref = propertyContent(document, "og:image");
        if (isConnect(imageHref)) {
            return imageHref;
        }
        return null;
    }

    /** Reads the {@code content} attribute of the elements whose {@code property} equals the given value. */
    private static String propertyContent(Document document, String property) {
        return document.getElementsByAttributeValue("property", property).attr("content");
    }
}
美团CompletableFuture工具类
/**
 * Meituan's CompletableFuture helper class; reference article:
 * https://mp.weixin.qq.com/s/GQGidprakfticYnbVYVYGQ
 */
public class FutureUtils {

    /**
     * Creates a future that is already completed exceptionally.
     *
     * @param ex the failure cause
     * @return a failed future
     */
    public static <T> CompletableFuture<T> failed(Throwable ex) {
        CompletableFuture<T> completableFuture = new CompletableFuture<>();
        completableFuture.completeExceptionally(ex);
        return completableFuture;
    }

    /**
     * Creates a future that is already completed with {@code result}.
     *
     * @param result the value
     * @return a completed future
     */
    public static <T> CompletableFuture<T> success(T result) {
        CompletableFuture<T> completableFuture = new CompletableFuture<>();
        completableFuture.complete(result);
        return completableFuture;
    }

    /**
     * Converts {@code List<CompletableFuture<T>>} into {@code CompletableFuture<List<T>>}.
     *
     * @param completableFutures futures to wait for
     * @return future of all results in iteration order
     */
    public static <T> CompletableFuture<List<T>> sequence(Collection<CompletableFuture<T>> completableFutures) {
        return allDone(completableFutures)
                .thenApply(v -> completableFutures.stream()
                        .map(CompletableFuture::join)
                        .collect(Collectors.toList())
                );
    }

    /**
     * Converts {@code List<CompletableFuture<List<T>>>} into {@code CompletableFuture<List<T>>}.
     * Mostly used for paged queries.
     *
     * @param completableFutures futures to wait for
     * @return future of the concatenated lists
     */
    public static <T> CompletableFuture<List<T>> sequenceList(Collection<CompletableFuture<List<T>>> completableFutures) {
        return allDone(completableFutures)
                .thenApply(v -> completableFutures.stream()
                        .flatMap(listFuture -> listFuture.join().stream())
                        .collect(Collectors.toList())
                );
    }

    /**
     * Converts {@code List<CompletableFuture<Map<K, V>>>} into {@code CompletableFuture<Map<K, V>>}.
     *
     * @param completableFutures futures to wait for
     * @param mergeFunction      merge strategy applied when the same key occurs more than once
     * @return future of the merged map
     */
    public static <K, V> CompletableFuture<Map<K, V>> sequenceMap(
            Collection<CompletableFuture<Map<K, V>>> completableFutures, BinaryOperator<V> mergeFunction) {
        return allDone(completableFutures)
                .thenApply(v -> completableFutures.stream().map(CompletableFuture::join)
                        .flatMap(map -> map.entrySet().stream())
                        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, mergeFunction)));
    }

    /**
     * Converts {@code List<CompletableFuture<T>>} into {@code CompletableFuture<List<T>>},
     * dropping {@code null} results.
     *
     * @param completableFutures futures to wait for
     * @return future of all non-null results in iteration order
     */
    public static <T> CompletableFuture<List<T>> sequenceNonNull(Collection<CompletableFuture<T>> completableFutures) {
        return sequence(completableFutures, Objects::nonNull);
    }

    /**
     * Converts {@code List<CompletableFuture<List<T>>>} into {@code CompletableFuture<List<T>>},
     * dropping {@code null} values: both {@code null} lists and {@code null} elements are skipped.
     * Mostly used for paged queries.
     *
     * @param completableFutures futures to wait for
     * @return future of the concatenated non-null elements
     */
    public static <T> CompletableFuture<List<T>> sequenceListNonNull(Collection<CompletableFuture<List<T>>> completableFutures) {
        return allDone(completableFutures)
                .thenApply(v -> completableFutures.stream()
                        .map(CompletableFuture::join)
                        // A future that yielded no list at all is treated like an empty page.
                        .filter(Objects::nonNull)
                        .flatMap(list -> list.stream().filter(Objects::nonNull))
                        .collect(Collectors.toList())
                );
    }

    /**
     * Converts {@code List<CompletableFuture<T>>} into {@code CompletableFuture<List<T>>},
     * keeping only the results accepted by {@code filterFunction}.
     *
     * @param completableFutures futures to wait for
     * @param filterFunction     custom filter strategy
     * @return future of the accepted results in iteration order
     */
    public static <T> CompletableFuture<List<T>> sequence(Collection<CompletableFuture<T>> completableFutures,
                                                          Predicate<? super T> filterFunction) {
        return allDone(completableFutures)
                .thenApply(v -> completableFutures.stream()
                        .map(CompletableFuture::join)
                        .filter(filterFunction)
                        .collect(Collectors.toList())
                );
    }

    /**
     * Converts {@code List<CompletableFuture<List<T>>>} into {@code CompletableFuture<List<T>>},
     * keeping only the elements accepted by {@code filterFunction}.
     *
     * @param completableFutures futures to wait for
     * @param filterFunction     custom filter strategy
     * @return future of the concatenated accepted elements
     */
    public static <T> CompletableFuture<List<T>> sequenceList(Collection<CompletableFuture<List<T>>> completableFutures,
                                                              Predicate<? super T> filterFunction) {
        return allDone(completableFutures)
                .thenApply(v -> completableFutures.stream()
                        .flatMap(listFuture -> listFuture.join().stream().filter(filterFunction))
                        .collect(Collectors.toList())
                );
    }

    /**
     * Converts a list of {@code CompletableFuture<Map<K, V>>} into one
     * {@code CompletableFuture<Map<K, V>>} by merging all maps. On a key conflict the newer
     * value overwrites the older one.
     *
     * @param completableFutures futures to wait for
     * @return future of the merged map
     */
    public static <K, V> CompletableFuture<Map<K, V>> sequenceMap(
            Collection<CompletableFuture<Map<K, V>>> completableFutures) {
        return sequenceMap(completableFutures, (a, b) -> b);
    }

    /** Returns a future that completes once every future in the collection has completed. */
    private static CompletableFuture<Void> allDone(Collection<? extends CompletableFuture<?>> completableFutures) {
        return CompletableFuture.allOf(completableFutures.toArray(new CompletableFuture<?>[0]));
    }
}