mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 16:37:17 +08:00
[Feature] add for new
This commit is contained in:
@@ -156,3 +156,8 @@
|
||||
| 1 | O | 敏感词添加 | 2023-06-06 23:51:58 | 幸运/幸运儿/17年前/1条/1梯两户/1比1/年检/幸存/幸运/幸运儿/恶搞/游戏机/日/草 |
|
||||
| 2 | A | 敏感词添加 | 2023-06-06 23:51:58 | SB |
|
||||
|
||||
# release_0.3.2
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:-----|:--------|:--------------------|:-------|
|
||||
| 1 | O | 中文繁简体样式 | 2023-06-07 23:51:58 | 调整实现策略 |
|
||||
|
||||
4
pom.xml
4
pom.xml
@@ -25,8 +25,8 @@
|
||||
<project.compiler.level>1.7</project.compiler.level>
|
||||
|
||||
<!--============================== INTER ==============================-->
|
||||
<heaven.version>0.1.154</heaven.version>
|
||||
<opencc4j.version>1.7.2</opencc4j.version>
|
||||
<heaven.version>0.2.7</heaven.version>
|
||||
<opencc4j.version>1.8.1</opencc4j.version>
|
||||
|
||||
<!--============================== OTHER ==============================-->
|
||||
<junit.version>4.13.1</junit.version>
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 核心方法
|
||||
* @since 0.3.2
|
||||
*/
|
||||
public interface ISensitiveWord {
|
||||
|
||||
/**
|
||||
* 返回所有对应的敏感词
|
||||
* @param string 原始字符串
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.0.1
|
||||
* @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式
|
||||
*/
|
||||
List<IWordResult> findAll(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 返回第一个对应的敏感词
|
||||
* @param string 原始字符串
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.3.2
|
||||
*/
|
||||
IWordResult findFirst(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 替换所有敏感词内容
|
||||
*
|
||||
* ps: 这里可以添加优化。
|
||||
*
|
||||
* @param target 目标字符串
|
||||
* @param context 上下文
|
||||
* @return 替换后结果
|
||||
* @since 0.3.2
|
||||
*/
|
||||
String replace(final String target,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 包含
|
||||
* @param string 字符串
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.3.2
|
||||
*/
|
||||
boolean contains(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
}
|
||||
@@ -1,5 +1,6 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
import com.github.houbb.sensitive.word.bs.SensitiveWordContext;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
|
||||
import java.util.Map;
|
||||
@@ -222,4 +223,19 @@ public interface IWordContext {
|
||||
*/
|
||||
ICharFormat charFormat();
|
||||
|
||||
/**
|
||||
* 获取 wordMap 策略
|
||||
* @return 策略
|
||||
* @since 0.3.2
|
||||
*/
|
||||
IWordMap wordMap();
|
||||
|
||||
/**
|
||||
* 设置 wordMap 策略
|
||||
* @param wordMap 策略
|
||||
* @return this
|
||||
* @since 0.3.2
|
||||
*/
|
||||
IWordContext wordMap(IWordMap wordMap);
|
||||
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ import java.util.List;
|
||||
* @author binbin.hou
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public interface IWordMap extends ISensitiveCheck {
|
||||
public interface IWordMap {
|
||||
|
||||
|
||||
/**
|
||||
@@ -32,38 +32,4 @@ public interface IWordMap extends ISensitiveCheck {
|
||||
boolean contains(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 返回所有对应的敏感词
|
||||
* @param string 原始字符串
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.0.1
|
||||
* @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式
|
||||
*/
|
||||
List<IWordResult> findAll(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 返回第一个对应的敏感词
|
||||
* @param string 原始字符串
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.0.1
|
||||
*/
|
||||
IWordResult findFirst(final String string,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 替换所有敏感词内容
|
||||
*
|
||||
* ps: 这里可以添加优化。
|
||||
*
|
||||
* @param target 目标字符串
|
||||
* @param context 上下文
|
||||
* @return 替换后结果
|
||||
* @since 0.0.2
|
||||
*/
|
||||
String replace(final String target,
|
||||
final IWordContext context);
|
||||
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import com.github.houbb.heaven.support.handler.IHandler;
|
||||
import com.github.houbb.heaven.util.common.ArgUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.sensitive.word.api.*;
|
||||
import com.github.houbb.sensitive.word.core.SensitiveWords;
|
||||
import com.github.houbb.sensitive.word.support.allow.WordAllows;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.support.check.impl.SensitiveChecks;
|
||||
@@ -81,6 +82,12 @@ public class SensitiveWordBs {
|
||||
private int sensitiveCheckNumLen = 8;
|
||||
|
||||
//------------------------------------------------------------- 基本属性 END
|
||||
/**
|
||||
* 脱敏策略
|
||||
* @since 0.3.2
|
||||
*/
|
||||
private ISensitiveWord sensitiveWord = SensitiveWords.defaults();
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
*
|
||||
@@ -114,12 +121,6 @@ public class SensitiveWordBs {
|
||||
*/
|
||||
private IWordContext context = SensitiveWordContext.newInstance();
|
||||
|
||||
public SensitiveWordBs sensitiveWordReplace(ISensitiveWordReplace sensitiveWordReplace) {
|
||||
ArgUtil.notNull(sensitiveWordReplace, "sensitiveWordReplace");
|
||||
this.sensitiveWordReplace = sensitiveWordReplace;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 新建验证实例
|
||||
* <p>
|
||||
@@ -132,6 +133,7 @@ public class SensitiveWordBs {
|
||||
return new SensitiveWordBs();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 初始化
|
||||
*
|
||||
@@ -182,6 +184,7 @@ public class SensitiveWordBs {
|
||||
// 额外配置
|
||||
context.sensitiveCheckNumLen(sensitiveCheckNumLen);
|
||||
context.sensitiveWordReplace(sensitiveWordReplace);
|
||||
context.wordMap(wordMap);
|
||||
|
||||
return context;
|
||||
}
|
||||
@@ -202,6 +205,24 @@ public class SensitiveWordBs {
|
||||
wordMap.initWordMap(results);
|
||||
}
|
||||
|
||||
public SensitiveWordBs sensitiveWord(ISensitiveWord sensitiveWord) {
|
||||
ArgUtil.notNull(sensitiveWord, "sensitiveWord");
|
||||
|
||||
this.sensitiveWord = sensitiveWord;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置替换策略
|
||||
* @param sensitiveWordReplace 替换
|
||||
* @return 结果
|
||||
*/
|
||||
public SensitiveWordBs sensitiveWordReplace(ISensitiveWordReplace sensitiveWordReplace) {
|
||||
ArgUtil.notNull(sensitiveWordReplace, "sensitiveWordReplace");
|
||||
this.sensitiveWordReplace = sensitiveWordReplace;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置禁止的实现
|
||||
* @param wordDeny 禁止的实现
|
||||
@@ -348,7 +369,7 @@ public class SensitiveWordBs {
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public boolean contains(final String target) {
|
||||
return wordMap.contains(target, context);
|
||||
return sensitiveWord.contains(target, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -390,7 +411,7 @@ public class SensitiveWordBs {
|
||||
public <R> List<R> findAll(final String target, final IWordResultHandler<R> handler) {
|
||||
ArgUtil.notNull(handler, "handler");
|
||||
|
||||
List<IWordResult> wordResults = wordMap.findAll(target, context);
|
||||
List<IWordResult> wordResults = sensitiveWord.findAll(target, context);
|
||||
return CollectionUtil.toList(wordResults, new IHandler<IWordResult, R>() {
|
||||
@Override
|
||||
public R handle(IWordResult wordResult) {
|
||||
@@ -412,7 +433,7 @@ public class SensitiveWordBs {
|
||||
public <R> R findFirst(final String target, final IWordResultHandler<R> handler) {
|
||||
ArgUtil.notNull(handler, "handler");
|
||||
|
||||
IWordResult wordResult = wordMap.findFirst(target, context);
|
||||
IWordResult wordResult = sensitiveWord.findFirst(target, context);
|
||||
return handler.handle(wordResult);
|
||||
}
|
||||
|
||||
@@ -424,7 +445,7 @@ public class SensitiveWordBs {
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public String replace(final String target) {
|
||||
return wordMap.replace(target, context);
|
||||
return sensitiveWord.replace(target, context);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------------ 公开方法 END
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.ICharFormat;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.*;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
|
||||
import java.util.Map;
|
||||
@@ -36,6 +34,7 @@ public class SensitiveWordContext implements IWordContext {
|
||||
* 敏感词信息
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@Deprecated
|
||||
private Map sensitiveWordMap;
|
||||
|
||||
/**
|
||||
@@ -98,6 +97,22 @@ public class SensitiveWordContext implements IWordContext {
|
||||
*/
|
||||
private ICharFormat charFormat;
|
||||
|
||||
/**
|
||||
* 单词 map 信息
|
||||
*
|
||||
* @since 0.3.2
|
||||
*/
|
||||
private IWordMap wordMap;
|
||||
|
||||
public IWordMap wordMap() {
|
||||
return wordMap;
|
||||
}
|
||||
|
||||
public SensitiveWordContext wordMap(IWordMap wordMap) {
|
||||
this.wordMap = wordMap;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ISensitiveWordReplace sensitiveWordReplace() {
|
||||
return sensitiveWordReplace;
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
package com.github.houbb.sensitive.word.core;
|
||||
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.sensitive.word.api.*;
|
||||
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 抽象实现
|
||||
*
|
||||
* @since 0.3.2
|
||||
*/
|
||||
public abstract class AbstractSensitiveWord implements ISensitiveWord {
|
||||
|
||||
/**
|
||||
* 执行全部替换
|
||||
* @param string 字符串
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.3.2
|
||||
*/
|
||||
protected abstract List<IWordResult> doFindAll(String string, IWordContext context);
|
||||
|
||||
/**
|
||||
* 替换
|
||||
* @param target 目标字符串
|
||||
* @param allList 敏感词列表
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.3.2
|
||||
*/
|
||||
protected String doReplace(String target, List<IWordResult> allList, IWordContext context) {
|
||||
// 根据 index 直接分割
|
||||
|
||||
final ISensitiveWordReplace replace = context.sensitiveWordReplace();
|
||||
// 是否需要对 allList 排序?
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
|
||||
// 注意边界
|
||||
int startIndex = 0;
|
||||
char[] chars = target.toCharArray();
|
||||
|
||||
for(IWordResult wordResult : allList) {
|
||||
final int itemStartIx = wordResult.startIndex();
|
||||
final int itemEndIx = wordResult.endIndex();
|
||||
|
||||
// 脱敏的左边
|
||||
if(startIndex < itemStartIx) {
|
||||
stringBuilder.append(chars, startIndex, itemStartIx-startIndex);
|
||||
}
|
||||
|
||||
// 脱敏部分
|
||||
String word = wordResult.word();
|
||||
ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance()
|
||||
.sensitiveWord(word)
|
||||
.wordLength(word.length());
|
||||
String replacedText = replace.replace(replaceContext);
|
||||
stringBuilder.append(replacedText);
|
||||
|
||||
// 更新结尾
|
||||
startIndex = Math.max(startIndex, itemEndIx);
|
||||
}
|
||||
|
||||
// 最后部分
|
||||
if (startIndex < chars.length) {
|
||||
stringBuilder.append(chars, startIndex, chars.length-startIndex);
|
||||
}
|
||||
|
||||
return stringBuilder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<IWordResult> findAll(String string, IWordContext context) {
|
||||
if(StringUtil.isEmpty(string)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
return doFindAll(string, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IWordResult findFirst(String string, IWordContext context) {
|
||||
//TODO: 这个是懒惰的实现,性能一般。也可以调整为 FAST_OVER 模式。
|
||||
List<IWordResult> allList = findAll(string, context);
|
||||
if(CollectionUtil.isNotEmpty(allList)) {
|
||||
return allList.get(0);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String replace(String target, IWordContext context) {
|
||||
if(StringUtil.isEmpty(target)) {
|
||||
return target;
|
||||
}
|
||||
|
||||
List<IWordResult> allList = findAll(target, context);
|
||||
if(CollectionUtil.isEmpty(allList)) {
|
||||
return target;
|
||||
}
|
||||
|
||||
return doReplace(target, allList, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean contains(String string, IWordContext context) {
|
||||
//1. 第一个存在
|
||||
IWordResult firstResult = this.findFirst(string, context);
|
||||
return firstResult != null;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,84 @@
|
||||
package com.github.houbb.sensitive.word.core;
|
||||
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.heaven.util.io.FileUtil;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWord;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordResult;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl;
|
||||
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext;
|
||||
import com.github.houbb.sensitive.word.support.result.WordResult;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 默认实现
|
||||
*
|
||||
* @since 0.3.2
|
||||
*/
|
||||
public class SensitiveWord extends AbstractSensitiveWord {
|
||||
|
||||
/**
|
||||
* 0.3.2
|
||||
*/
|
||||
private static final ISensitiveWord INSTANCE = new SensitiveWord();
|
||||
|
||||
public static ISensitiveWord getInstance() {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<IWordResult> doFindAll(String string, IWordContext context) {
|
||||
return innerSensitiveWords(string, ValidModeEnum.FAIL_OVER, context);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取敏感词列表
|
||||
*
|
||||
* @param text 文本
|
||||
* @param modeEnum 模式
|
||||
* @return 结果列表
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private List<IWordResult> innerSensitiveWords(final String text,
|
||||
final ValidModeEnum modeEnum,
|
||||
final IWordContext context) {
|
||||
//1. 是否存在敏感词,如果比存在,直接返回空列表
|
||||
final ISensitiveCheck sensitiveCheck = context.sensitiveCheck();
|
||||
List<IWordResult> resultList = Guavas.newArrayList();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
SensitiveCheckResult checkResult = sensitiveCheck.sensitiveCheck(text, i, ValidModeEnum.FAIL_OVER, context);
|
||||
|
||||
// 命中
|
||||
int wordLength = checkResult.index();
|
||||
if (wordLength > 0) {
|
||||
// 保存敏感词
|
||||
String sensitiveWord = text.substring(i, i + wordLength);
|
||||
|
||||
// 添加去重
|
||||
WordResult wordResult = WordResult.newInstance()
|
||||
.startIndex(i)
|
||||
.endIndex(i+wordLength)
|
||||
.word(sensitiveWord);
|
||||
resultList.add(wordResult);
|
||||
|
||||
// 快速返回
|
||||
if (ValidModeEnum.FAIL_FAST.equals(modeEnum)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// 增加 i 的步长
|
||||
// 为什么要-1,因为默认就会自增1
|
||||
// TODO: 这里可以根据字符串匹配算法优化。
|
||||
i += wordLength - 1;
|
||||
}
|
||||
}
|
||||
|
||||
return resultList;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.github.houbb.sensitive.word.core;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWord;
|
||||
|
||||
/**
|
||||
* 策略工具类
|
||||
* @since 0.3.2
|
||||
*/
|
||||
public final class SensitiveWords {
|
||||
|
||||
private SensitiveWords(){}
|
||||
|
||||
/**
|
||||
* 默认策略
|
||||
* @return 策略
|
||||
*/
|
||||
public static ISensitiveWord defaults() {
|
||||
return SensitiveWord.getInstance();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,90 @@
|
||||
package com.github.houbb.sensitive.word.support.check.impl;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
|
||||
/**
|
||||
* 抽象实现策略
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.3.2
|
||||
*/
|
||||
@ThreadSafe
|
||||
public abstract class AbstractSensitiveCheck implements ISensitiveCheck {
|
||||
|
||||
/**
|
||||
* 当前字符串是否符合规范
|
||||
* @param mappingChar 当前字符
|
||||
* @param index 下标
|
||||
* @param rawText 原始文本
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.3.2
|
||||
*/
|
||||
protected abstract boolean isCharCondition(char mappingChar,
|
||||
int index,
|
||||
String rawText,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 这里指定一个阈值条件
|
||||
* @param index 当前下标
|
||||
* @param rawText 原始文本
|
||||
* @param stringBuilder 缓存
|
||||
* @param context 上下文
|
||||
* @return 是否满足条件
|
||||
* @since 0.3.2
|
||||
*/
|
||||
protected abstract boolean isStringCondition(int index,
|
||||
String rawText,
|
||||
final StringBuilder stringBuilder,
|
||||
final IWordContext context);
|
||||
|
||||
/**
|
||||
* 获取校验类
|
||||
* @return 类
|
||||
* @since 0.3.2
|
||||
*/
|
||||
protected abstract Class<? extends ISensitiveCheck> getSensitiveCheckClass();
|
||||
|
||||
@Override
|
||||
public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex,
|
||||
ValidModeEnum validModeEnum,
|
||||
IWordContext context) {
|
||||
// 采用 ThreadLocal 应该可以提升性能,减少对象的创建。
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
int actualLength = 0;
|
||||
// 前一个条件
|
||||
for(int i = beginIndex; i < txt.length(); i++) {
|
||||
char currentChar = txt.charAt(i);
|
||||
|
||||
// 映射处理
|
||||
char mappingChar = context.charFormat().format(currentChar, context);
|
||||
|
||||
// 符合条件
|
||||
boolean currentCondition = isCharCondition(mappingChar, i, txt, context);
|
||||
if(currentCondition) {
|
||||
stringBuilder.append(currentChar);
|
||||
|
||||
// 匹配
|
||||
if(isStringCondition(i, txt, stringBuilder, context)) {
|
||||
actualLength = stringBuilder.length();
|
||||
|
||||
// 是否遍历全部匹配的模式
|
||||
if(ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// 处理结果
|
||||
return SensitiveCheckResult.of(actualLength, getSensitiveCheckClass());
|
||||
}
|
||||
|
||||
}
|
||||
@@ -24,7 +24,7 @@ import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
* @since 0.0.9
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveCheckEmail implements ISensitiveCheck {
|
||||
public class SensitiveCheckEmail extends AbstractSensitiveCheck {
|
||||
|
||||
/**
|
||||
* @since 0.3.0
|
||||
@@ -36,48 +36,19 @@ public class SensitiveCheckEmail implements ISensitiveCheck {
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
// 这里偷懒直接使用 String 拼接,然后结合正则表达式。
|
||||
// DFA 本质就可以做正则表达式,这样实现不免性能会差一些。
|
||||
// 后期如果有想法,对 DFA 进一步深入学习后,将进行优化。
|
||||
for(int i = beginIndex; i < txt.length(); i++) {
|
||||
char currentChar = txt.charAt(i);
|
||||
char mappingChar = context.charFormat()
|
||||
.format(currentChar, context);
|
||||
|
||||
if(CharUtil.isEmilChar(mappingChar)) {
|
||||
lengthCount++;
|
||||
stringBuilder.append(currentChar);
|
||||
|
||||
if(isCondition(stringBuilder.toString())) {
|
||||
actualLength = lengthCount;
|
||||
|
||||
// 是否遍历全部匹配的模式
|
||||
if(ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return SensitiveCheckResult.of(actualLength, SensitiveCheckEmail.class);
|
||||
protected boolean isCharCondition(char mappingChar, int index, String rawText, IWordContext context) {
|
||||
return CharUtil.isEmilChar(mappingChar);
|
||||
}
|
||||
|
||||
/**
|
||||
* 这里指定一个阈值条件
|
||||
* @param string 长度
|
||||
* @return 是否满足条件
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private boolean isCondition(final String string) {
|
||||
@Override
|
||||
protected boolean isStringCondition(int index, String rawText, StringBuilder stringBuilder, IWordContext context) {
|
||||
String string = stringBuilder.toString();
|
||||
return RegexUtil.isEmail(string);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Class<? extends ISensitiveCheck> getSensitiveCheckClass() {
|
||||
return SensitiveCheckEmail.class;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -2,9 +2,7 @@ package com.github.houbb.sensitive.word.support.check.impl;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
|
||||
/**
|
||||
* 敏感词监测实现
|
||||
@@ -14,7 +12,7 @@ import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveCheckNum implements ISensitiveCheck {
|
||||
public class SensitiveCheckNum extends AbstractSensitiveCheck {
|
||||
|
||||
/**
|
||||
* @since 0.3.0
|
||||
@@ -26,51 +24,20 @@ public class SensitiveCheckNum implements ISensitiveCheck {
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
for (int i = beginIndex; i < txt.length(); i++) {
|
||||
char c = txt.charAt(i);
|
||||
char charKey = context.charFormat().format(c, context);
|
||||
|
||||
// 如果是数字
|
||||
// 满足进入的条件
|
||||
if (Character.isDigit(charKey)) {
|
||||
lengthCount++;
|
||||
|
||||
// 满足结束的条件
|
||||
boolean isCondition = isCondition(lengthCount, context);
|
||||
if (isCondition) {
|
||||
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
|
||||
actualLength = lengthCount;
|
||||
|
||||
// 这里确实需要一种验证模式,主要是为了最大匹配从而达到最佳匹配的效果。
|
||||
if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 直接跳出循环
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return SensitiveCheckResult.of(actualLength, SensitiveCheckNum.class);
|
||||
protected boolean isCharCondition(char mappingChar, int index, String rawText, IWordContext context) {
|
||||
return Character.isDigit(mappingChar);
|
||||
}
|
||||
|
||||
/**
|
||||
* 这里指定一个阈值条件
|
||||
* TODO: 这里有一个问题,会把一些 url 中的数字替换掉。
|
||||
* @param lengthCount 长度
|
||||
* @param context 上下文
|
||||
* @return 是否满足条件
|
||||
* @since 0.0.5
|
||||
*/
|
||||
protected boolean isCondition(final int lengthCount,
|
||||
final IWordContext context) {
|
||||
return lengthCount >= context.sensitiveCheckNumLen();
|
||||
@Override
|
||||
protected boolean isStringCondition(int index, String rawText, StringBuilder stringBuilder, IWordContext context) {
|
||||
int bufferLen = stringBuilder.length();
|
||||
|
||||
return bufferLen >= context.sensitiveCheckNumLen();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Class<? extends ISensitiveCheck> getSensitiveCheckClass() {
|
||||
return SensitiveCheckNum.class;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -5,9 +5,7 @@ import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.heaven.util.util.regex.RegexUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
|
||||
/**
|
||||
* URL 正则表达式检测实现。
|
||||
@@ -22,7 +20,7 @@ import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
* @since 0.0.9
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveCheckUrl implements ISensitiveCheck {
|
||||
public class SensitiveCheckUrl extends AbstractSensitiveCheck {
|
||||
|
||||
/**
|
||||
* @since 0.3.0
|
||||
@@ -34,50 +32,24 @@ public class SensitiveCheckUrl implements ISensitiveCheck {
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
// 这里偷懒直接使用 String 拼接,然后结合正则表达式。
|
||||
// DFA 本质就可以做正则表达式,这样实现不免性能会差一些。
|
||||
// 后期如果有想法,对 DFA 进一步深入学习后,将进行优化。
|
||||
for(int i = beginIndex; i < txt.length(); i++) {
|
||||
char currentChar = txt.charAt(i);
|
||||
char mappingChar = context.charFormat()
|
||||
.format(currentChar, context);
|
||||
|
||||
if(CharUtil.isWebSiteChar(mappingChar)
|
||||
&& lengthCount <= AppConst.MAX_WEB_SITE_LEN) {
|
||||
lengthCount++;
|
||||
stringBuilder.append(currentChar);
|
||||
|
||||
if(isCondition(stringBuilder.toString())) {
|
||||
actualLength = lengthCount;
|
||||
|
||||
// 是否遍历全部匹配的模式
|
||||
if(ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return SensitiveCheckResult.of(actualLength, SensitiveCheckUrl.class);
|
||||
protected boolean isCharCondition(char mappingChar, int index, String rawText, IWordContext context) {
|
||||
return CharUtil.isWebSiteChar(mappingChar);
|
||||
}
|
||||
|
||||
/**
|
||||
* 这里指定一个阈值条件
|
||||
* @param string 长度
|
||||
* @return 是否满足条件
|
||||
* @since 0.0.12
|
||||
*/
|
||||
private boolean isCondition(final String string) {
|
||||
@Override
|
||||
protected boolean isStringCondition(int index, String rawText, StringBuilder stringBuilder, IWordContext context) {
|
||||
int bufferLen = stringBuilder.length();
|
||||
if(bufferLen > AppConst.MAX_WEB_SITE_LEN) {
|
||||
return false;
|
||||
}
|
||||
|
||||
String string = stringBuilder.toString();
|
||||
return RegexUtil.isWebSite(string);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Class<? extends ISensitiveCheck> getSensitiveCheckClass() {
|
||||
return SensitiveCheckUrl.class;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,14 +1,8 @@
|
||||
package com.github.houbb.sensitive.word.support.check.impl;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 敏感词监测实现
|
||||
@@ -16,7 +10,7 @@ import java.util.Map;
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveCheckWord implements ISensitiveCheck {
|
||||
public class SensitiveCheckWord extends AbstractSensitiveCheck {
|
||||
|
||||
/**
|
||||
* @since 0.3.0
|
||||
@@ -28,94 +22,18 @@ public class SensitiveCheckWord implements ISensitiveCheck {
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
Map nowMap = context.sensitiveWordMap();
|
||||
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
for (int i = beginIndex; i < txt.length(); i++) {
|
||||
// 获取当前的 map 信息
|
||||
nowMap = getNowMap(nowMap, context, txt, i);
|
||||
|
||||
if (ObjectUtil.isNotNull(nowMap)) {
|
||||
lengthCount++;
|
||||
|
||||
// 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测
|
||||
boolean isEnd = isEnd(nowMap);
|
||||
if (isEnd) {
|
||||
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
|
||||
// eg: 敏感词 敏感词xxx
|
||||
// 如果是 【敏感词x】也会被匹配。
|
||||
actualLength = lengthCount;
|
||||
|
||||
// 这里确实需要一种验证模式,主要是为了最大匹配从而达到最佳匹配的效果。
|
||||
if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 直接跳出循环
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return SensitiveCheckResult.of(actualLength, SensitiveCheckWord.class);
|
||||
protected boolean isCharCondition(char mappingChar, int index, String rawText, IWordContext context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断是否结束
|
||||
* BUG-FIX: 避免出现敏感词库中没有的文字。
|
||||
* @param map map 信息
|
||||
* @return 是否结束
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private static boolean isEnd(final Map map) {
|
||||
if(ObjectUtil.isNull(map)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Object value = map.get(AppConst.IS_END);
|
||||
if(ObjectUtil.isNull(value)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (boolean)value;
|
||||
@Override
|
||||
protected boolean isStringCondition(int index, String rawText, StringBuilder stringBuilder, IWordContext context) {
|
||||
return context.wordMap().contains(stringBuilder.toString(), context);
|
||||
}
|
||||
/**
|
||||
* 获取当前的 Map
|
||||
* @param nowMap 原始的当前 map
|
||||
* @param context 上下文
|
||||
* @param txt 文本信息
|
||||
* @param index 下标
|
||||
* @return 实际的当前 map
|
||||
* @since 0.0.7
|
||||
*/
|
||||
private Map getNowMap(Map nowMap,
|
||||
final IWordContext context,
|
||||
final String txt,
|
||||
final int index) {
|
||||
char c = txt.charAt(index);
|
||||
char mappingChar = context.charFormat().format(c, context);
|
||||
|
||||
// 这里做一次重复词的处理
|
||||
//TODO: 这里可以优化,是否获取一次。
|
||||
Map currentMap = (Map) nowMap.get(mappingChar);
|
||||
// 启用忽略重复&当前下标不是第一个
|
||||
if(context.ignoreRepeat()
|
||||
&& index > 0) {
|
||||
char preChar = txt.charAt(index-1);
|
||||
char preMappingChar = context.charFormat()
|
||||
.format(preChar, context);
|
||||
|
||||
// 直接赋值为上一个 map
|
||||
if(preMappingChar == mappingChar) {
|
||||
currentMap = nowMap;
|
||||
}
|
||||
}
|
||||
|
||||
return currentMap;
|
||||
@Override
|
||||
protected Class<? extends ISensitiveCheck> getSensitiveCheckClass() {
|
||||
return SensitiveCheckWord.class;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,11 +1,16 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
|
||||
import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
|
||||
import com.github.houbb.opencc4j.util.ZhConverterUtil;
|
||||
import com.github.houbb.sensitive.word.api.ICharFormat;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 忽略中文样式
|
||||
* @author binbin.hou
|
||||
@@ -22,9 +27,12 @@ public class IgnoreChineseStyleFormat implements ICharFormat {
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
String string = String.valueOf(original);
|
||||
String simple = ZhConvertBootstrap.newInstance(new CharSegment()).toSimple(string);
|
||||
return simple.charAt(0);
|
||||
List<String> mappingList = ZhConverterUtil.toSimple(original);
|
||||
if(CollectionUtil.isEmpty(mappingList)) {
|
||||
return original;
|
||||
}
|
||||
|
||||
return mappingList.get(0).charAt(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,265 +1,265 @@
|
||||
package com.github.houbb.sensitive.word.support.map;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.heaven.util.io.FileUtil;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.sensitive.word.api.*;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl;
|
||||
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext;
|
||||
import com.github.houbb.sensitive.word.support.result.WordResult;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveWordMap implements IWordMap {
|
||||
|
||||
/**
|
||||
* 脱敏单词 map
|
||||
*
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private Map innerWordMap;
|
||||
|
||||
/**
|
||||
* 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
|
||||
*
|
||||
* @param collection 敏感词库集合
|
||||
* @since 0.0.1
|
||||
* <p>
|
||||
* 使用对象代码 map 的这种一直递归。
|
||||
* 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html
|
||||
* https://blog.csdn.net/chenssy/article/details/26961957
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public synchronized void initWordMap(Collection<String> collection) {
|
||||
// 避免扩容带来的消耗
|
||||
Map newInnerWordMap = new HashMap(collection.size());
|
||||
|
||||
for (String key : collection) {
|
||||
if (StringUtil.isEmpty(key)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 用来按照相应的格式保存敏感词库数据
|
||||
char[] chars = key.toCharArray();
|
||||
final int size = chars.length;
|
||||
|
||||
// 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中
|
||||
Map currentMap = newInnerWordMap;
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
// 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值
|
||||
char charKey = chars[i];
|
||||
// 如果集合存在
|
||||
Object wordMap = currentMap.get(charKey);
|
||||
|
||||
// 如果集合存在
|
||||
if (ObjectUtil.isNotNull(wordMap)) {
|
||||
// 直接将获取到的 map 当前当前 map 进行继续的操作
|
||||
currentMap = (Map) wordMap;
|
||||
} else {
|
||||
//不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一
|
||||
Map<String, Boolean> newWordMap = new HashMap<>(8);
|
||||
newWordMap.put(AppConst.IS_END, false);
|
||||
|
||||
// 将新的节点放入当前 map 中
|
||||
currentMap.put(charKey, newWordMap);
|
||||
|
||||
// 将新节点设置为当前节点,方便下一次节点的循环。
|
||||
currentMap = newWordMap;
|
||||
}
|
||||
|
||||
// 判断是否为最后一个,添加是否结束的标识。
|
||||
if (i == size - 1) {
|
||||
currentMap.put(AppConst.IS_END, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 最后更新为新的 map,保证更新过程中旧的数据可用
|
||||
this.innerWordMap = newInnerWordMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否包含
|
||||
* (1)直接遍历所有
|
||||
* (2)如果遇到,则直接返回 true
|
||||
*
|
||||
* @param string 字符串
|
||||
* @return 是否包含
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@Override
|
||||
public boolean contains(String string, final IWordContext context) {
|
||||
if (StringUtil.isEmpty(string)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < string.length(); i++) {
|
||||
SensitiveCheckResult checkResult = sensitiveCheck(string, i, ValidModeEnum.FAIL_FAST, context);
|
||||
// 快速返回
|
||||
if (checkResult.index() > 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* 返回所有对应的敏感词
|
||||
* (1)结果是有序的
|
||||
* (2)为了保留所有的下标,结果从 v0.1.0 之后不再去重。
|
||||
*
|
||||
* @param string 原始字符串
|
||||
* @return 结果
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@Override
|
||||
public List<IWordResult> findAll(String string, final IWordContext context) {
|
||||
return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public IWordResult findFirst(String string, final IWordContext context) {
|
||||
List<IWordResult> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context);
|
||||
|
||||
if (CollectionUtil.isEmpty(stringList)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return stringList.get(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String replace(String target, final IWordContext context) {
|
||||
if(StringUtil.isEmpty(target)) {
|
||||
return target;
|
||||
}
|
||||
|
||||
return this.replaceSensitiveWord(target, context);
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取敏感词列表
|
||||
*
|
||||
* @param text 文本
|
||||
* @param modeEnum 模式
|
||||
* @return 结果列表
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private List<IWordResult> getSensitiveWords(final String text, final ValidModeEnum modeEnum,
|
||||
final IWordContext context) {
|
||||
//1. 是否存在敏感词,如果比存在,直接返回空列表
|
||||
if (StringUtil.isEmpty(text)) {
|
||||
return Guavas.newArrayList();
|
||||
}
|
||||
|
||||
List<IWordResult> resultList = Guavas.newArrayList();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
SensitiveCheckResult checkResult = sensitiveCheck(text, i, ValidModeEnum.FAIL_OVER, context);
|
||||
// 命中
|
||||
int wordLength = checkResult.index();
|
||||
if (wordLength > 0) {
|
||||
// 保存敏感词
|
||||
String sensitiveWord = text.substring(i, i + wordLength);
|
||||
|
||||
// 添加去重
|
||||
WordResult wordResult = WordResult.newInstance()
|
||||
.startIndex(i)
|
||||
.endIndex(i+wordLength)
|
||||
.word(sensitiveWord);
|
||||
resultList.add(wordResult);
|
||||
|
||||
// 快速返回
|
||||
if (ValidModeEnum.FAIL_FAST.equals(modeEnum)) {
|
||||
break;
|
||||
}
|
||||
|
||||
// 增加 i 的步长
|
||||
// 为什么要-1,因为默认就会自增1
|
||||
// TODO: 这里可以根据字符串匹配算法优化。
|
||||
i += wordLength - 1;
|
||||
}
|
||||
}
|
||||
|
||||
return resultList;
|
||||
}
|
||||
|
||||
/**
|
||||
* 直接替换敏感词,返回替换后的结果
|
||||
* @param target 文本信息
|
||||
* @param context 上下文
|
||||
* @return 脱敏后的字符串
|
||||
* @since 0.0.2
|
||||
*/
|
||||
private String replaceSensitiveWord(final String target,
|
||||
final IWordContext context) {
|
||||
if(StringUtil.isEmpty(target)) {
|
||||
return target;
|
||||
}
|
||||
// 用于结果构建
|
||||
StringBuilder resultBuilder = new StringBuilder(target.length());
|
||||
|
||||
for (int i = 0; i < target.length(); i++) {
|
||||
char currentChar = target.charAt(i);
|
||||
// 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词
|
||||
SensitiveCheckResult checkResult = sensitiveCheck(target, i, ValidModeEnum.FAIL_OVER, context);
|
||||
|
||||
// 敏感词
|
||||
int wordLength = checkResult.index();
|
||||
if(wordLength > 0) {
|
||||
// 是否执行替换
|
||||
Class checkClass = checkResult.checkClass();
|
||||
String string = target.substring(i, i+wordLength);
|
||||
if(SensitiveCheckUrl.class.equals(checkClass)
|
||||
&& FileUtil.isImage(string)) {
|
||||
// 直接使用原始内容,避免 markdown 图片转换失败
|
||||
resultBuilder.append(string);
|
||||
} else {
|
||||
// 创建上下文
|
||||
ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance()
|
||||
.sensitiveWord(string)
|
||||
.wordLength(wordLength);
|
||||
String replaceStr = context.sensitiveWordReplace().replace(replaceContext);
|
||||
|
||||
resultBuilder.append(replaceStr);
|
||||
}
|
||||
|
||||
// 直接跳过敏感词的长度
|
||||
i += wordLength-1;
|
||||
} else {
|
||||
// 普通词
|
||||
resultBuilder.append(currentChar);
|
||||
}
|
||||
}
|
||||
|
||||
return resultBuilder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// 默认执行敏感词操作
|
||||
context.sensitiveWordMap(innerWordMap);
|
||||
|
||||
// 责任链模式调用
|
||||
return context.sensitiveCheck()
|
||||
.sensitiveCheck(txt, beginIndex, validModeEnum, context);
|
||||
}
|
||||
|
||||
}
|
||||
//package com.github.houbb.sensitive.word.support.map;
|
||||
//
|
||||
//import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
//import com.github.houbb.heaven.util.guava.Guavas;
|
||||
//import com.github.houbb.heaven.util.io.FileUtil;
|
||||
//import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
//import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
//import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
//import com.github.houbb.sensitive.word.api.*;
|
||||
//import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
//import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
//import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
//import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl;
|
||||
//import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext;
|
||||
//import com.github.houbb.sensitive.word.support.result.WordResult;
|
||||
//
|
||||
//import java.util.Collection;
|
||||
//import java.util.HashMap;
|
||||
//import java.util.List;
|
||||
//import java.util.Map;
|
||||
//
|
||||
///**
|
||||
// * 敏感词 map
|
||||
// *
|
||||
// * @author binbin.hou
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
//@ThreadSafe
|
||||
//public class SensitiveWordMap implements IWordMap {
|
||||
//
|
||||
// /**
|
||||
// * 脱敏单词 map
|
||||
// *
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
// private Map innerWordMap;
|
||||
//
|
||||
// /**
|
||||
// * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
|
||||
// *
|
||||
// * @param collection 敏感词库集合
|
||||
// * @since 0.0.1
|
||||
// * <p>
|
||||
// * 使用对象代码 map 的这种一直递归。
|
||||
// * 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html
|
||||
// * https://blog.csdn.net/chenssy/article/details/26961957
|
||||
// */
|
||||
// @Override
|
||||
// @SuppressWarnings("unchecked")
|
||||
// public synchronized void initWordMap(Collection<String> collection) {
|
||||
// // 避免扩容带来的消耗
|
||||
// Map newInnerWordMap = new HashMap(collection.size());
|
||||
//
|
||||
// for (String key : collection) {
|
||||
// if (StringUtil.isEmpty(key)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// // 用来按照相应的格式保存敏感词库数据
|
||||
// char[] chars = key.toCharArray();
|
||||
// final int size = chars.length;
|
||||
//
|
||||
// // 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中
|
||||
// Map currentMap = newInnerWordMap;
|
||||
//
|
||||
// for (int i = 0; i < size; i++) {
|
||||
// // 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值
|
||||
// char charKey = chars[i];
|
||||
// // 如果集合存在
|
||||
// Object wordMap = currentMap.get(charKey);
|
||||
//
|
||||
// // 如果集合存在
|
||||
// if (ObjectUtil.isNotNull(wordMap)) {
|
||||
// // 直接将获取到的 map 当前当前 map 进行继续的操作
|
||||
// currentMap = (Map) wordMap;
|
||||
// } else {
|
||||
// //不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一
|
||||
// Map<String, Boolean> newWordMap = new HashMap<>(8);
|
||||
// newWordMap.put(AppConst.IS_END, false);
|
||||
//
|
||||
// // 将新的节点放入当前 map 中
|
||||
// currentMap.put(charKey, newWordMap);
|
||||
//
|
||||
// // 将新节点设置为当前节点,方便下一次节点的循环。
|
||||
// currentMap = newWordMap;
|
||||
// }
|
||||
//
|
||||
// // 判断是否为最后一个,添加是否结束的标识。
|
||||
// if (i == size - 1) {
|
||||
// currentMap.put(AppConst.IS_END, true);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // 最后更新为新的 map,保证更新过程中旧的数据可用
|
||||
// this.innerWordMap = newInnerWordMap;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 是否包含
|
||||
// * (1)直接遍历所有
|
||||
// * (2)如果遇到,则直接返回 true
|
||||
// *
|
||||
// * @param string 字符串
|
||||
// * @return 是否包含
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
// @Override
|
||||
// public boolean contains(String string, final IWordContext context) {
|
||||
// if (StringUtil.isEmpty(string)) {
|
||||
// return false;
|
||||
// }
|
||||
//
|
||||
// for (int i = 0; i < string.length(); i++) {
|
||||
// SensitiveCheckResult checkResult = sensitiveCheck(string, i, ValidModeEnum.FAIL_FAST, context);
|
||||
// // 快速返回
|
||||
// if (checkResult.index() > 0) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// return false;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 返回所有对应的敏感词
|
||||
// * (1)结果是有序的
|
||||
// * (2)为了保留所有的下标,结果从 v0.1.0 之后不再去重。
|
||||
// *
|
||||
// * @param string 原始字符串
|
||||
// * @return 结果
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
// @Override
|
||||
// public List<IWordResult> findAll(String string, final IWordContext context) {
|
||||
// return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context);
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public IWordResult findFirst(String string, final IWordContext context) {
|
||||
// List<IWordResult> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context);
|
||||
//
|
||||
// if (CollectionUtil.isEmpty(stringList)) {
|
||||
// return null;
|
||||
// }
|
||||
//
|
||||
// return stringList.get(0);
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public String replace(String target, final IWordContext context) {
|
||||
// if(StringUtil.isEmpty(target)) {
|
||||
// return target;
|
||||
// }
|
||||
//
|
||||
// return this.replaceSensitiveWord(target, context);
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 获取敏感词列表
|
||||
// *
|
||||
// * @param text 文本
|
||||
// * @param modeEnum 模式
|
||||
// * @return 结果列表
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
// private List<IWordResult> getSensitiveWords(final String text, final ValidModeEnum modeEnum,
|
||||
// final IWordContext context) {
|
||||
// //1. 是否存在敏感词,如果比存在,直接返回空列表
|
||||
// if (StringUtil.isEmpty(text)) {
|
||||
// return Guavas.newArrayList();
|
||||
// }
|
||||
//
|
||||
// List<IWordResult> resultList = Guavas.newArrayList();
|
||||
// for (int i = 0; i < text.length(); i++) {
|
||||
// SensitiveCheckResult checkResult = sensitiveCheck(text, i, ValidModeEnum.FAIL_OVER, context);
|
||||
// // 命中
|
||||
// int wordLength = checkResult.index();
|
||||
// if (wordLength > 0) {
|
||||
// // 保存敏感词
|
||||
// String sensitiveWord = text.substring(i, i + wordLength);
|
||||
//
|
||||
// // 添加去重
|
||||
// WordResult wordResult = WordResult.newInstance()
|
||||
// .startIndex(i)
|
||||
// .endIndex(i+wordLength)
|
||||
// .word(sensitiveWord);
|
||||
// resultList.add(wordResult);
|
||||
//
|
||||
// // 快速返回
|
||||
// if (ValidModeEnum.FAIL_FAST.equals(modeEnum)) {
|
||||
// break;
|
||||
// }
|
||||
//
|
||||
// // 增加 i 的步长
|
||||
// // 为什么要-1,因为默认就会自增1
|
||||
// // TODO: 这里可以根据字符串匹配算法优化。
|
||||
// i += wordLength - 1;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return resultList;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 直接替换敏感词,返回替换后的结果
|
||||
// * @param target 文本信息
|
||||
// * @param context 上下文
|
||||
// * @return 脱敏后的字符串
|
||||
// * @since 0.0.2
|
||||
// */
|
||||
// private String replaceSensitiveWord(final String target,
|
||||
// final IWordContext context) {
|
||||
// if(StringUtil.isEmpty(target)) {
|
||||
// return target;
|
||||
// }
|
||||
// // 用于结果构建
|
||||
// StringBuilder resultBuilder = new StringBuilder(target.length());
|
||||
//
|
||||
// for (int i = 0; i < target.length(); i++) {
|
||||
// char currentChar = target.charAt(i);
|
||||
// // 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词
|
||||
// SensitiveCheckResult checkResult = sensitiveCheck(target, i, ValidModeEnum.FAIL_OVER, context);
|
||||
//
|
||||
// // 敏感词
|
||||
// int wordLength = checkResult.index();
|
||||
// if(wordLength > 0) {
|
||||
// // 是否执行替换
|
||||
// Class checkClass = checkResult.checkClass();
|
||||
// String string = target.substring(i, i+wordLength);
|
||||
// if(SensitiveCheckUrl.class.equals(checkClass)
|
||||
// && FileUtil.isImage(string)) {
|
||||
// // 直接使用原始内容,避免 markdown 图片转换失败
|
||||
// resultBuilder.append(string);
|
||||
// } else {
|
||||
// // 创建上下文
|
||||
// ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance()
|
||||
// .sensitiveWord(string)
|
||||
// .wordLength(wordLength);
|
||||
// String replaceStr = context.sensitiveWordReplace().replace(replaceContext);
|
||||
//
|
||||
// resultBuilder.append(replaceStr);
|
||||
// }
|
||||
//
|
||||
// // 直接跳过敏感词的长度
|
||||
// i += wordLength-1;
|
||||
// } else {
|
||||
// // 普通词
|
||||
// resultBuilder.append(currentChar);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return resultBuilder.toString();
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// // 默认执行敏感词操作
|
||||
// context.sensitiveWordMap(innerWordMap);
|
||||
//
|
||||
// // 责任链模式调用
|
||||
// return context.sensitiveCheck()
|
||||
// .sensitiveCheck(txt, beginIndex, validModeEnum, context);
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
||||
@@ -0,0 +1,181 @@
|
||||
package com.github.houbb.sensitive.word.support.map;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class WordMap implements IWordMap {
|
||||
|
||||
/**
|
||||
* 脱敏单词 map
|
||||
*
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private Map innerWordMap;
|
||||
|
||||
/**
|
||||
* 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
|
||||
*
|
||||
* @param collection 敏感词库集合
|
||||
* @since 0.0.1
|
||||
* <p>
|
||||
* 使用对象代码 map 的这种一直递归。
|
||||
* 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html
|
||||
* https://blog.csdn.net/chenssy/article/details/26961957
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public synchronized void initWordMap(Collection<String> collection) {
|
||||
// 避免扩容带来的消耗
|
||||
Map newInnerWordMap = new HashMap(collection.size());
|
||||
|
||||
for (String key : collection) {
|
||||
if (StringUtil.isEmpty(key)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 用来按照相应的格式保存敏感词库数据
|
||||
char[] chars = key.toCharArray();
|
||||
final int size = chars.length;
|
||||
|
||||
// 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中
|
||||
Map currentMap = newInnerWordMap;
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
// 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值
|
||||
char charKey = chars[i];
|
||||
// 如果集合存在
|
||||
Object wordMap = currentMap.get(charKey);
|
||||
|
||||
// 如果集合存在
|
||||
if (ObjectUtil.isNotNull(wordMap)) {
|
||||
// 直接将获取到的 map 当前当前 map 进行继续的操作
|
||||
currentMap = (Map) wordMap;
|
||||
} else {
|
||||
//不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一
|
||||
Map<String, Boolean> newWordMap = new HashMap<>(8);
|
||||
newWordMap.put(AppConst.IS_END, false);
|
||||
|
||||
// 将新的节点放入当前 map 中
|
||||
currentMap.put(charKey, newWordMap);
|
||||
|
||||
// 将新节点设置为当前节点,方便下一次节点的循环。
|
||||
currentMap = newWordMap;
|
||||
}
|
||||
|
||||
// 判断是否为最后一个,添加是否结束的标识。
|
||||
if (i == size - 1) {
|
||||
currentMap.put(AppConst.IS_END, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 最后更新为新的 map,保证更新过程中旧的数据可用
|
||||
this.innerWordMap = newInnerWordMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否包含
|
||||
* (1)直接遍历所有
|
||||
* (2)如果遇到,则直接返回 true
|
||||
*
|
||||
* @param string 字符串
|
||||
* @return 是否包含
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@Override
|
||||
public boolean contains(String string, final IWordContext context) {
|
||||
if (StringUtil.isEmpty(string)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return innerContainsSensitive(string, context);
|
||||
}
|
||||
|
||||
private boolean innerContainsSensitive(String txt,
|
||||
IWordContext context) {
|
||||
// 初始化为当前的 map
|
||||
Map nowMap = this.innerWordMap;
|
||||
|
||||
// 记录敏感词的长度
|
||||
for (int i = 0; i < txt.length(); i++) {
|
||||
// 获取当前的 map 信息
|
||||
nowMap = getNowMap(nowMap, context, txt, i);
|
||||
|
||||
// 如果不为空,则判断是否为结尾。
|
||||
if (ObjectUtil.isNull(nowMap)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return isEnd(nowMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断是否结束
|
||||
* BUG-FIX: 避免出现敏感词库中没有的文字。
|
||||
* @param map map 信息
|
||||
* @return 是否结束
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private static boolean isEnd(final Map map) {
|
||||
if(ObjectUtil.isNull(map)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Object value = map.get(AppConst.IS_END);
|
||||
if(ObjectUtil.isNull(value)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (boolean)value;
|
||||
}
|
||||
/**
|
||||
* 获取当前的 Map
|
||||
* @param nowMap 原始的当前 map
|
||||
* @param context 上下文
|
||||
* @param txt 文本信息
|
||||
* @param index 下标
|
||||
* @return 实际的当前 map
|
||||
* @since 0.0.7
|
||||
*/
|
||||
private Map getNowMap(Map nowMap,
|
||||
final IWordContext context,
|
||||
final String txt,
|
||||
final int index) {
|
||||
char c = txt.charAt(index);
|
||||
char mappingChar = context.charFormat().format(c, context);
|
||||
|
||||
// 这里做一次重复词的处理
|
||||
//TODO: 这里可以优化,是否获取一次。
|
||||
Map currentMap = (Map) nowMap.get(mappingChar);
|
||||
// 启用忽略重复&当前下标不是第一个
|
||||
if(context.ignoreRepeat()
|
||||
&& index > 0) {
|
||||
char preChar = txt.charAt(index-1);
|
||||
char preMappingChar = context.charFormat().format(preChar, context);
|
||||
|
||||
// 直接赋值为上一个 map
|
||||
if(preMappingChar == mappingChar) {
|
||||
currentMap = nowMap;
|
||||
}
|
||||
}
|
||||
|
||||
return currentMap;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -18,7 +18,7 @@ public final class WordMaps {
|
||||
* @since 0.3.0
|
||||
*/
|
||||
public static IWordMap defaults() {
|
||||
return new SensitiveWordMap();
|
||||
return new WordMap();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ public class SensitiveWordBsUrlTest {
|
||||
List<String> wordList = SensitiveWordBs.newInstance().init().findAll(text);
|
||||
Assert.assertEquals("[www.big-image.png]", wordList.toString());
|
||||
|
||||
Assert.assertEquals(text, SensitiveWordBs.newInstance().init().replace(text));
|
||||
Assert.assertEquals("双击查看大图 *****************查看", SensitiveWordBs.newInstance().init().replace(text));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user