Url解析组件模式重构(不影响旧版本使用)

This commit is contained in:
zhaoqichao 2023-07-06 10:23:44 +08:00
parent b516e51bc9
commit 21ea09cd4c
6 changed files with 246 additions and 0 deletions

View File

@ -0,0 +1,22 @@
package com.abin.mallchat.common.common.utils.chain;
import com.abin.mallchat.common.common.utils.discover.domain.UrlInfo;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
import java.util.Map;
/**
 * Description: Manual smoke test for the URL handler chain.
 * Author: achao
 * Date: 2023/7/6 9:29
 */
public class Application {
    /**
     * Feeds a sample message containing several links to a {@link PrioritizedUrlHandler}
     * and prints the resulting link-to-info map to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Sample text mixing a bare domain, a URL with a port, a plain URL and an article link.
        final String sampleMessage = "其中包含一个URL www.baidu.com,一个带有端口号的URL http://www.jd.com:80, 一个带有路径的URL http://mallchat.cn, 还有美团技术文章https://mp.weixin.qq.com/s/hwTf4bDck9_tlFpgVDeIKg ";
        final PrioritizedUrlHandler urlHandler = new PrioritizedUrlHandler();
        final Map<String, UrlInfo> previews = urlHandler.getUrlContentMap(sampleMessage);
        System.out.println(previews);
    }
}

View File

@ -0,0 +1,96 @@
package com.abin.mallchat.common.common.utils.chain;
import cn.hutool.core.util.ReUtil;
import cn.hutool.core.util.StrUtil;
import com.abin.mallchat.common.common.utils.FutureUtils;
import com.abin.mallchat.common.common.utils.discover.domain.UrlInfo;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.data.util.Pair;
import javax.annotation.Nullable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.CompletableFuture;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Description: Template base class for link handlers. It finds the links in a piece of text,
 * downloads each page, and delegates extraction of title / description / image to subclasses.
 * Author: achao
 * Date: 2023/7/6 9:12
 */
@Slf4j
public abstract class FactoryUrlHandler extends UrlHandler {
    // Regex used to recognise links in free text; the scheme and the "www" prefix are optional.
    private static final Pattern PATTERN = Pattern.compile("((http|https)://)?(www.)?([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:/~+#-]*[\\w@?^=%&/~+#-])?");

    /**
     * Extracts every link found in {@code content} and resolves its page information.
     *
     * <p>Links are fetched concurrently, one {@link CompletableFuture} per match, and this
     * method blocks until all of them have finished. Links whose page could not be fetched
     * are expected to be left out of the result (this relies on
     * {@code FutureUtils.sequenceNonNull} discarding {@code null} results — confirm against
     * that helper).
     *
     * @param content the text to scan; blank or {@code null} yields an empty map
     * @return a map from the matched link text (exactly as it appears in {@code content}) to
     *         its extracted info; when the same link occurs more than once the first result
     *         is kept
     */
    @Override
    @Nullable
    public Map<String, UrlInfo> getUrlContentMap(String content) {
        if (StrUtil.isBlank(content)) {
            return new HashMap<>();
        }
        List<String> matchList = ReUtil.findAll(PATTERN, content, 0);
        // Fetch all links in parallel. No executor is passed, so supplyAsync uses its default one.
        List<CompletableFuture<Pair<String, UrlInfo>>> futures = matchList.stream().map(match -> CompletableFuture.supplyAsync(() -> {
            UrlInfo urlInfo = getContent(match);
            return Objects.isNull(urlInfo) ? null : Pair.of(match, urlInfo);
        })).collect(Collectors.toList());
        CompletableFuture<List<Pair<String, UrlInfo>>> future = FutureUtils.sequenceNonNull(futures);
        // Assemble the result; duplicates of the same link keep the first value.
        return future.join().stream().collect(Collectors.toMap(Pair::getFirst, Pair::getSecond, (a, b) -> a));
    }

    /**
     * Downloads a single link and builds its {@link UrlInfo}.
     *
     * @param url the link as matched in the text, with or without a scheme
     * @return the extracted info, or {@code null} when the page could not be fetched
     */
    private UrlInfo getContent(String url) {
        // Links written without a scheme (e.g. "www.example.com") are fetched over plain http.
        String fetchUrl = StrUtil.startWith(url, "http") ? url : "http://" + url;
        Document document = getUrlDocument(fetchUrl);
        if (Objects.isNull(document)) {
            // The fetch failed (already logged). Skip extraction so the subclass hooks are never
            // handed a null document, which would fail the whole batch in join().
            return null;
        }
        return UrlInfo.builder()
                .title(getTitle(document))
                .description(getDescription(document))
                .image(getImage(fetchUrl, document))
                .build();
    }

    /**
     * Fetches and parses the page behind {@code matchUrl} with a 2000 ms timeout.
     *
     * @param matchUrl the absolute URL to fetch
     * @return the parsed document, or {@code null} when the request or parsing fails
     *         (the failure is logged, not rethrown)
     */
    protected Document getUrlDocument(String matchUrl) {
        try {
            Connection connect = Jsoup.connect(matchUrl);
            connect.timeout(2000);
            return connect.get();
        } catch (Exception e) {
            log.error("find error:url:{}", matchUrl, e);
        }
        return null;
    }

    /**
     * Extracts the title of the linked page.
     *
     * @param document the parsed page
     * @return the title, possibly {@code null}
     */
    @Nullable
    abstract String getTitle(Document document);

    /**
     * Extracts the description of the linked page.
     *
     * @param document the parsed page
     * @return the description, possibly {@code null}
     */
    @Nullable
    abstract String getDescription(Document document);

    /**
     * Extracts the logo / preview image of the linked page.
     *
     * @param url      the URL the page was fetched from (always carries a scheme)
     * @param document the parsed page
     * @return the image reference, possibly {@code null}
     */
    @Nullable
    abstract String getImage(String url, Document document);
}

View File

@ -0,0 +1,39 @@
package com.abin.mallchat.common.common.utils.chain;
import cn.hutool.core.util.StrUtil;
import com.abin.mallchat.common.common.utils.discover.UrlDiscover;
import com.abin.mallchat.common.common.utils.discover.domain.UrlInfo;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
import java.util.List;
import java.util.Map;
/**
 * Description: Handler that combines several extractors by priority. The {@link WxUrlHandler}
 * result is preferred, and the {@code CommonUrlHandler} result is used when it is blank.
 * Author: achao
 * Date: 2023/7/6 9:36
 */
public class PrioritizedUrlHandler extends FactoryUrlHandler {
    private final FactoryUrlHandler commonUrlHandler = new CommonUrlHandler();
    private final FactoryUrlHandler wxUrlHandler = new WxUrlHandler();

    /**
     * Returns the title from the preferred handler, falling back to the common handler.
     *
     * @param document the parsed page
     * @return the first non-blank title, or whatever the common handler returns
     */
    @Nullable
    @Override
    String getTitle(Document document) {
        // Evaluate the preferred extractor once; the fallback runs only when it is blank.
        String preferred = wxUrlHandler.getTitle(document);
        return StrUtil.isBlank(preferred) ? commonUrlHandler.getTitle(document) : preferred;
    }

    /**
     * Returns the description from the preferred handler, falling back to the common handler.
     *
     * @param document the parsed page
     * @return the first non-blank description, or whatever the common handler returns
     */
    @Nullable
    @Override
    String getDescription(Document document) {
        String preferred = wxUrlHandler.getDescription(document);
        return StrUtil.isBlank(preferred) ? commonUrlHandler.getDescription(document) : preferred;
    }

    /**
     * Returns the image from the preferred handler, falling back to the common handler.
     *
     * @param url      the URL the page was fetched from
     * @param document the parsed page
     * @return the first non-blank image reference, or whatever the common handler returns
     */
    @Nullable
    @Override
    String getImage(String url, Document document) {
        String preferred = wxUrlHandler.getImage(url, document);
        return StrUtil.isBlank(preferred) ? commonUrlHandler.getImage(url, document) : preferred;
    }
}

View File

@ -0,0 +1,25 @@
package com.abin.mallchat.common.common.utils.chain;
import com.abin.mallchat.common.common.utils.discover.domain.UrlInfo;
import org.jsoup.nodes.Document;
import javax.annotation.Nullable;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * Description: Abstract contract for handlers that process the set of URLs found in a message.
 * Author: achao
 * Date: 2023/7/6 8:58
 */
public abstract class UrlHandler {
/**
 * Extracts all links contained in a message and assembles them into a map.
 *
 * @param content the message text to scan for links
 * @return a map whose values describe the links found; annotated {@code @Nullable}, so
 *         callers should be prepared for {@code null}
 */
@Nullable
abstract Map<String,UrlInfo> getUrlContentMap(String content);
}

View File

@ -0,0 +1,32 @@
package com.abin.mallchat.common.common.utils.chain;
import cn.hutool.core.util.StrUtil;
import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
/**
 * Description: Extractor that reads a page's Open Graph metadata, i.e. the {@code content}
 * attribute of elements whose {@code property} is {@code og:title}, {@code og:description}
 * or {@code og:image}. Presumably aimed at WeChat article pages — confirm.
 * Author: achao
 * Date: 2023/7/6 9:34
 */
public class WxUrlHandler extends FactoryUrlHandler {
    // Delimiter that ends the first sentence of a description.
    // NOTE(review): the original code searched for an empty string, which always matches at
    // index 0 and therefore truncated every description to "". The Chinese full stop is
    // assumed to be the intended delimiter — confirm.
    private static final String SENTENCE_END = "。";

    /**
     * Reads the {@code og:title} value.
     *
     * @param document the parsed page, may be {@code null}
     * @return the title, an empty string when the tag is absent, or {@code null} when
     *         {@code document} is {@code null}
     */
    @Nullable
    @Override
    public String getTitle(Document document) {
        return ogContent(document, "og:title");
    }

    /**
     * Reads the {@code og:description} value, shortened to its first sentence.
     *
     * @param document the parsed page, may be {@code null}
     * @return the text before the first sentence delimiter; the full description when no
     *         delimiter is present (or it is the first character); the blank value unchanged
     *         when the tag is absent; {@code null} when {@code document} is {@code null}
     */
    @Nullable
    @Override
    public String getDescription(Document document) {
        String description = ogContent(document, "og:description");
        if (StrUtil.isBlank(description)) {
            return description;
        }
        int end = description.indexOf(SENTENCE_END);
        // Truncate only when a delimiter was actually found after some text. indexOf returns -1
        // when it is missing, which would make substring throw.
        return end > 0 ? description.substring(0, end) : description;
    }

    /**
     * Reads the {@code og:image} value.
     *
     * @param url      the URL the page was fetched from (not used by this handler)
     * @param document the parsed page, may be {@code null}
     * @return the image reference, an empty string when the tag is absent, or {@code null}
     *         when {@code document} is {@code null}
     */
    @Nullable
    @Override
    public String getImage(String url, Document document) {
        return ogContent(document, "og:image");
    }

    /** Returns the {@code content} attribute of the element with the given {@code property}. */
    private static String ogContent(Document document, String property) {
        if (document == null) {
            return null;
        }
        return document.getElementsByAttributeValue("property", property).attr("content");
    }
}

View File

@ -0,0 +1,32 @@
package com.abin.mallchat.common.common.utils.chain.dto;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Description: Holder for the information extracted from a link.
 * Author: achao
 * Date: 2023/7/6 8:54
 */
@Data
@Builder
@AllArgsConstructor
@NoArgsConstructor
public class UrlInfo {
/**
 * Title of the linked page.
 **/
String title;
/**
 * Description of the linked page.
 **/
String description;
/**
 * Logo of the linked website.
 **/
String image;
}