release branch 0.7.0

This commit is contained in:
binbin.hou
2023-06-09 17:21:48 +08:00
parent 258622e8a2
commit 1df1e6aec1
27 changed files with 431 additions and 347 deletions

View File

@@ -184,3 +184,9 @@
|:---|:-----|--------------------------|:--------------------|:-------|
| 1 | O | 性能优化:字符映射统一处理一遍,而不是每次都处理 | 2023-06-09 23:51:58 | |
| 2 | D | 移除废弃的 replaceContext | 2023-06-09 23:51:58 | |
# release_0.7.0
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:-----|---------------------------------------------|:--------------------|:----------------|
| 1 | A | IWordMap 命名调整为 IWordData, 添加 Tree 实现。优化内存占用 | 2023-06-09 23:51:58 | 避免过于限制,放开便于后续拓展 |

View File

@@ -58,7 +58,7 @@
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId>
<version>0.6.0</version>
<version>0.7.0</version>
</dependency>
```
@@ -662,12 +662,16 @@ ps: 不同环境会有差异,但是比例基本稳定。
# 后期 road-map
- [x] wordMap 的抽象,便于拓展
- [x] wordData 的内存占用对比 + 优化
- [x] word 的统一性能优化,移除 string 的生成
- [ ] 用户指定自定义的词组,同时允许指定词组的组合获取,更加灵活
ICharFormat/ISensitiveCheck/Word 方法,允许用户自定义。
- [ ] word check 策略的优化,统一遍历+转换
- [ ] DFA 数据结构的另一种实现
- 同音字处理
- 形近字处理
@@ -678,10 +682,6 @@ ps: 不同环境会有差异,但是比例基本稳定。
- 敏感词标签支持
- [ ] DFA 数据结构的另一种实现
放开 wordMap 策略定义
# 拓展阅读
[敏感词工具实现思路](https://houbb.github.io/2020/01/07/sensitive-word)

15
pom.xml
View File

@@ -6,7 +6,7 @@
<groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId>
<version>0.6.0</version>
<version>0.7.0</version>
<properties>
<!--============================== All Plugins START ==============================-->
@@ -68,6 +68,14 @@
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.0.0</version>
<scope>test</scope>
<optional>true</optional>
</dependency>
</dependencies>
</dependencyManagement>
@@ -91,6 +99,11 @@
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</dependency>
</dependencies>
<build>

View File

@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
:: 版本号信息(需要手动指定)
:::: 旧版本名称
SET version=0.6.0
SET version=0.7.0
:::: 新版本名称
SET newVersion=0.7.0
SET newVersion=0.8.0
:::: 组织名称
SET groupName=com.github.houbb
:::: 项目名称

View File

@@ -223,7 +223,7 @@ public interface IWordContext {
* @return 策略
* @since 0.3.2
*/
IWordMap wordMap();
IWordData wordData();
/**
* 设置 wordMap 策略
@@ -231,6 +231,6 @@ public interface IWordContext {
* @return this
* @since 0.3.2
*/
IWordContext wordMap(IWordMap wordMap);
IWordContext wordData(IWordData wordMap);
}

View File

@@ -11,7 +11,7 @@ import java.util.Collection;
* @author binbin.hou
* @since 0.0.1
*/
public interface IWordMap {
public interface IWordData {
/**
@@ -19,7 +19,7 @@ public interface IWordMap {
* @param collection 集合信息
* @since 0.0.1
*/
void initWordMap(Collection<String> collection);
void initWordData(Collection<String> collection);
/**
* 是否包含敏感词

View File

@@ -10,7 +10,7 @@ import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
import com.github.houbb.sensitive.word.support.check.impl.SensitiveChecks;
import com.github.houbb.sensitive.word.support.deny.WordDenys;
import com.github.houbb.sensitive.word.support.format.CharFormats;
import com.github.houbb.sensitive.word.support.map.WordMaps;
import com.github.houbb.sensitive.word.support.data.WordDatas;
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaces;
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
import com.github.houbb.sensitive.word.utils.InnerWordDataUtils;
@@ -95,31 +95,29 @@ public class SensitiveWordBs {
private ISensitiveWord sensitiveWord = SensitiveWords.defaults();
/**
* 敏感词 map
*
* TODO: 暂时定义为 final后续放开抽象。
* 敏感词 Data
*
* @since 0.0.1
*/
private final IWordMap wordMap = WordMaps.defaults();
private IWordData wordData = WordDatas.defaults();
/**
* 禁止的单词
* @since 0.0.13
*/
private IWordDeny wordDeny = WordDenys.system();
private IWordDeny wordDeny = WordDenys.defaults();
/**
* 允许的单词
* @since 0.0.13
*/
private IWordAllow wordAllow = WordAllows.system();
private IWordAllow wordAllow = WordAllows.defaults();
/**
* 替换策略
* @since 0.3.0
*/
private ISensitiveWordReplace sensitiveWordReplace = SensitiveWordReplaces.chars();
private ISensitiveWordReplace sensitiveWordReplace = SensitiveWordReplaces.defaults();
/**
* 上下文
@@ -191,7 +189,7 @@ public class SensitiveWordBs {
// 额外配置
context.sensitiveCheckNumLen(numCheckLen);
context.sensitiveWordReplace(sensitiveWordReplace);
context.wordMap(wordMap);
context.wordData(wordData);
return context;
}
@@ -209,7 +207,20 @@ public class SensitiveWordBs {
List<String> results = InnerWordDataUtils.getActualDenyList(denyList, allowList, context);
// 便于可以多次初始化
wordMap.initWordMap(results);
wordData.initWordData(results);
}
/**
* 允许指定策略数据
* @param wordData 单词数据
* @return 结果
* @since 0.7.0
*/
public SensitiveWordBs wordData(IWordData wordData) {
ArgUtil.notNull(wordData, "wordData");
this.wordData = wordData;
return this;
}
public SensitiveWordBs sensitiveWord(ISensitiveWord sensitiveWord) {

View File

@@ -3,7 +3,7 @@ package com.github.houbb.sensitive.word.bs;
import com.github.houbb.sensitive.word.api.ICharFormat;
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
/**
@@ -102,14 +102,14 @@ public class SensitiveWordContext implements IWordContext {
*
* @since 0.3.2
*/
private IWordMap wordMap;
private IWordData wordData;
public IWordMap wordMap() {
return wordMap;
public IWordData wordData() {
return wordData;
}
public SensitiveWordContext wordMap(IWordMap wordMap) {
this.wordMap = wordMap;
public SensitiveWordContext wordData(IWordData wordData) {
this.wordData = wordData;
return this;
}

View File

@@ -41,7 +41,7 @@ public final class WordAllows {
* @return 结果
* @since 0.0.13
*/
public static IWordAllow system() {
public static IWordAllow defaults() {
return WordAllowSystem.getInstance();
}

View File

@@ -2,7 +2,7 @@ package com.github.houbb.sensitive.word.support.check.impl;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext;
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
@@ -41,7 +41,7 @@ public class SensitiveCheckWord extends AbstractSensitiveCheck {
// 采用 ThreadLocal 应该可以提升性能,减少对象的创建。
int actualLength = 0;
final IWordMap wordMap = context.wordMap();
final IWordData wordData = context.wordData();
// 前一个条件
StringBuilder stringBuilder = new StringBuilder();
@@ -53,7 +53,7 @@ public class SensitiveCheckWord extends AbstractSensitiveCheck {
stringBuilder.append(mappingChar);
// 判断是否存在
WordContainsTypeEnum wordContainsTypeEnum = wordMap.contains(stringBuilder, innerContext);
WordContainsTypeEnum wordContainsTypeEnum = wordData.contains(stringBuilder, innerContext);
if(WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnum)) {
actualLength = stringBuilder.length();

View File

@@ -0,0 +1,49 @@
package com.github.houbb.sensitive.word.support.data;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
import java.util.Collection;
/**
 * Skeleton implementation of {@code IWordData}.
 *
 * Centralizes the shared empty-input guard and delegates the real work to
 * the {@code doXxx} template hooks implemented by concrete strategies.
 *
 * @since 0.7.0
 */
public abstract class AbstractWordData implements IWordData {

    /**
     * Template hook: load the word collection into the backing structure.
     *
     * @param collection word collection
     */
    protected abstract void doInitWordData(Collection<String> collection);

    /**
     * Template hook: containment check, called only with a non-empty buffer.
     *
     * @param stringBuilder candidate characters
     * @param innerContext inner context
     * @return containment result
     */
    protected abstract WordContainsTypeEnum doContains(StringBuilder stringBuilder, InnerSensitiveContext innerContext);

    @Override
    public void initWordData(Collection<String> collection) {
        // Reserved extension point before delegating to the concrete strategy.
        this.doInitWordData(collection);
    }

    @Override
    public WordContainsTypeEnum contains(StringBuilder stringBuilder, InnerSensitiveContext innerContext) {
        // A null or empty buffer can never match a sensitive word.
        final boolean emptyInput = (stringBuilder == null) || (stringBuilder.length() <= 0);
        if (emptyInput) {
            return WordContainsTypeEnum.NOT_FOUND;
        }

        return this.doContains(stringBuilder, innerContext);
    }

}

View File

@@ -1,10 +1,9 @@
package com.github.houbb.sensitive.word.support.map;
package com.github.houbb.sensitive.word.support.data;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext;
import com.github.houbb.sensitive.word.constant.AppConst;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
@@ -20,7 +19,7 @@ import java.util.Map;
* @since 0.0.1
*/
@ThreadSafe
public class WordMap implements IWordMap {
public class WordDataHashMap extends AbstractWordData {
/**
* 脱敏单词 map
@@ -41,7 +40,7 @@ public class WordMap implements IWordMap {
*/
@Override
@SuppressWarnings("unchecked")
public synchronized void initWordMap(Collection<String> collection) {
public synchronized void doInitWordData(Collection<String> collection) {
// 避免扩容带来的消耗
Map newInnerWordMap = new HashMap(collection.size());
@@ -78,12 +77,10 @@ public class WordMap implements IWordMap {
// 将新节点设置为当前节点方便下一次节点的循环
currentMap = newWordMap;
}
// 判断是否为最后一个添加是否结束的标识
if (i == size - 1) {
currentMap.put(AppConst.IS_END, true);
}
}
// 判断是否为最后一个添加是否结束的标识
currentMap.put(AppConst.IS_END, true);
}
// 最后更新为新的 map保证更新过程中旧的数据可用
@@ -101,13 +98,8 @@ public class WordMap implements IWordMap {
* @since 0.0.1
*/
@Override
public WordContainsTypeEnum contains(final StringBuilder stringBuilder,
public WordContainsTypeEnum doContains(final StringBuilder stringBuilder,
final InnerSensitiveContext innerContext) {
if (stringBuilder == null
|| stringBuilder.length() <= 0) {
return WordContainsTypeEnum.NOT_FOUND;
}
return innerContainsSensitive(stringBuilder, innerContext);
}

View File

@@ -0,0 +1,123 @@
package com.github.houbb.sensitive.word.support.data;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext;
import com.github.houbb.sensitive.word.constant.AppConst;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
/**
 * Sensitive word data backed by a DFA-style character tree (trie).
 *
 * PR: https://github.com/houbb/sensitive-word/pull/33
 *
 * @author xiaochangbai
 * @author binbin.hou
 * @since 0.7.0
 */
@ThreadSafe
public class WordDataTree implements IWordData {

    /**
     * Root of the trie.
     *
     * Replaced atomically at the end of {@link #initWordData(Collection)} so
     * readers keep seeing the old tree while a new one is being built.
     */
    private WordDataTreeNode root;

    @Override
    public synchronized void initWordData(Collection<String> collection) {
        WordDataTreeNode newRoot = new WordDataTreeNode();

        for(String word : collection) {
            if(StringUtil.isEmpty(word)) {
                continue;
            }

            WordDataTreeNode tempNode = newRoot;
            char[] chars = word.toCharArray();
            for (char c : chars) {
                // Fetch the child for this character, creating it on demand.
                WordDataTreeNode subNode = tempNode.getSubNode(c);
                if (subNode == null) {
                    subNode = new WordDataTreeNode();
                    tempNode.addSubNode(c, subNode);
                }

                // Descend and continue with the next character.
                tempNode = subNode;
            }

            // Mark the word end once, after the whole word is inserted.
            tempNode.end(true);
        }

        // Only publish the tree after it is fully built.
        this.root = newRoot;
    }

    @Override
    public WordContainsTypeEnum contains(StringBuilder stringBuilder,
                                         InnerSensitiveContext innerContext) {
        // Fix: guard empty input and an uninitialized tree. Mirrors the
        // empty-input handling of AbstractWordData/WordDataHashMap and avoids
        // an NPE when contains() is called before initWordData().
        if(stringBuilder == null
                || stringBuilder.length() <= 0
                || root == null) {
            return WordContainsTypeEnum.NOT_FOUND;
        }

        WordDataTreeNode nowNode = root;

        int len = stringBuilder.length();
        for(int i = 0; i < len; i++) {
            // Walk down one character; null means no word has this prefix.
            nowNode = getNowMap(nowNode, i, stringBuilder, innerContext);

            if (ObjectUtil.isNull(nowNode)) {
                return WordContainsTypeEnum.NOT_FOUND;
            }
        }

        if(nowNode.end()) {
            return WordContainsTypeEnum.CONTAINS_END;
        }

        return WordContainsTypeEnum.CONTAINS_PREFIX;
    }

    /**
     * Resolve the next tree node for the character at {@code index}.
     *
     * @param nowNode current node
     * @param index character index into the buffer
     * @param stringBuilder text buffer (already char-formatted upstream)
     * @param sensitiveContext inner context
     * @return the next node, or {@code null} when no word continues this way
     * @since 0.7.0
     */
    private WordDataTreeNode getNowMap(WordDataTreeNode nowNode,
                                       final int index,
                                       final StringBuilder stringBuilder,
                                       final InnerSensitiveContext sensitiveContext) {
        final IWordContext context = sensitiveContext.wordContext();

        // Characters are already formatted upstream, no need to re-format.
        char mappingChar = stringBuilder.charAt(index);
        WordDataTreeNode currentMap = nowNode.getSubNode(mappingChar);

        // Repeated-character handling: when ignoreRepeat is enabled and the
        // current char equals its predecessor, stay on the current node
        // instead of descending.
        if(context.ignoreRepeat()
                && index > 0) {
            char preMappingChar = stringBuilder.charAt(index-1);
            if(preMappingChar == mappingChar) {
                currentMap = nowNode;
            }
        }

        return currentMap;
    }

}

View File

@@ -0,0 +1,49 @@
package com.github.houbb.sensitive.word.support.data;
import java.util.HashMap;
import java.util.Map;
/**
 * One node of the sensitive-word character tree.
 *
 * @since 0.7.0
 */
public class WordDataTreeNode {

    /**
     * True when this node terminates a complete sensitive word.
     */
    private boolean end;

    /**
     * Children keyed by the next character; created lazily on first insert.
     */
    private Map<Character, WordDataTreeNode> subNodeMap;

    /**
     * @return whether this node ends a word
     */
    public boolean end() {
        return end;
    }

    /**
     * Set the word-end flag.
     *
     * @param end new flag value
     * @return this node, for chaining
     */
    public WordDataTreeNode end(boolean end) {
        this.end = end;
        return this;
    }

    /**
     * Look up the child node for the given character.
     *
     * @param c next character
     * @return the child node, or {@code null} when absent
     */
    public WordDataTreeNode getSubNode(final char c) {
        final Map<Character, WordDataTreeNode> children = this.subNodeMap;
        return (children == null) ? null : children.get(c);
    }

    /**
     * Register (or replace) the child node for the given character.
     *
     * @param c next character
     * @param subNode child node
     * @return this node, for chaining
     */
    public WordDataTreeNode addSubNode(char c, WordDataTreeNode subNode) {
        if (this.subNodeMap == null) {
            this.subNodeMap = new HashMap<>();
        }

        this.subNodeMap.put(c, subNode);
        return this;
    }

}

View File

@@ -0,0 +1,43 @@
package com.github.houbb.sensitive.word.support.data;
import com.github.houbb.sensitive.word.api.IWordData;
/**
 * Static factory methods for sensitive-word data strategies.
 *
 * @author binbin.hou
 * @since 0.3.0
 */
public final class WordDatas {

    private WordDatas(){}

    /**
     * Default strategy (currently the tree implementation).
     * @return strategy
     * @since 0.3.0
     */
    public static IWordData defaults() {
        return tree();
    }

    /**
     * Tree (trie) implementation.
     * @return tree-backed word data
     * @since 0.7.0
     */
    public static IWordData tree() {
        return new WordDataTree();
    }

    /**
     * Hash-map implementation.
     * @return hash-map-backed word data
     * @since 0.7.0
     */
    public static IWordData hashMap() {
        return new WordDataHashMap();
    }

}

View File

@@ -41,7 +41,7 @@ public final class WordDenys {
* @return 结果
* @since 0.0.13
*/
public static IWordDeny system() {
public static IWordDeny defaults() {
return WordDenySystem.getInstance();
}

View File

@@ -1,265 +0,0 @@
//package com.github.houbb.sensitive.word.support.map;
//
//import com.github.houbb.heaven.annotation.ThreadSafe;
//import com.github.houbb.heaven.util.guava.Guavas;
//import com.github.houbb.heaven.util.io.FileUtil;
//import com.github.houbb.heaven.util.lang.ObjectUtil;
//import com.github.houbb.heaven.util.lang.StringUtil;
//import com.github.houbb.heaven.util.util.CollectionUtil;
//import com.github.houbb.sensitive.word.api.*;
//import com.github.houbb.sensitive.word.constant.AppConst;
//import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
//import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
//import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl;
//import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext;
//import com.github.houbb.sensitive.word.support.result.WordResult;
//
//import java.util.Collection;
//import java.util.HashMap;
//import java.util.List;
//import java.util.Map;
//
///**
// * 敏感词 map
// *
// * @author binbin.hou
// * @since 0.0.1
// */
//@ThreadSafe
//public class SensitiveWordMap implements IWordMap {
//
// /**
// * 脱敏单词 map
// *
// * @since 0.0.1
// */
// private Map innerWordMap;
//
// /**
// * 读取敏感词库将敏感词放入HashSet中构建一个DFA算法模型
// *
// * @param collection 敏感词库集合
// * @since 0.0.1
// * <p>
// * 使用对象代码 map 的这种一直递归。
// * 参考资料https://www.cnblogs.com/AlanLee/p/5329555.html
// * https://blog.csdn.net/chenssy/article/details/26961957
// */
// @Override
// @SuppressWarnings("unchecked")
// public synchronized void initWordMap(Collection<String> collection) {
// // 避免扩容带来的消耗
// Map newInnerWordMap = new HashMap(collection.size());
//
// for (String key : collection) {
// if (StringUtil.isEmpty(key)) {
// continue;
// }
//
// // 用来按照相应的格式保存敏感词库数据
// char[] chars = key.toCharArray();
// final int size = chars.length;
//
// // 每一个新词的循环,直接将结果设置为当前 map所有变化都会体现在结果的 map 中
// Map currentMap = newInnerWordMap;
//
// for (int i = 0; i < size; i++) {
// // 截取敏感词当中的字在敏感词库中字为HashMap对象的Key键值
// char charKey = chars[i];
// // 如果集合存在
// Object wordMap = currentMap.get(charKey);
//
// // 如果集合存在
// if (ObjectUtil.isNotNull(wordMap)) {
// // 直接将获取到的 map 当前当前 map 进行继续的操作
// currentMap = (Map) wordMap;
// } else {
// //不存在则则构建一个新的map同时将isEnd设置为0因为他不是最后一
// Map<String, Boolean> newWordMap = new HashMap<>(8);
// newWordMap.put(AppConst.IS_END, false);
//
// // 将新的节点放入当前 map 中
// currentMap.put(charKey, newWordMap);
//
// // 将新节点设置为当前节点,方便下一次节点的循环。
// currentMap = newWordMap;
// }
//
// // 判断是否为最后一个,添加是否结束的标识。
// if (i == size - 1) {
// currentMap.put(AppConst.IS_END, true);
// }
// }
// }
//
// // 最后更新为新的 map保证更新过程中旧的数据可用
// this.innerWordMap = newInnerWordMap;
// }
//
// /**
// * 是否包含
// * 1直接遍历所有
// * 2如果遇到则直接返回 true
// *
// * @param string 字符串
// * @return 是否包含
// * @since 0.0.1
// */
// @Override
// public boolean contains(String string, final IWordContext context) {
// if (StringUtil.isEmpty(string)) {
// return false;
// }
//
// for (int i = 0; i < string.length(); i++) {
// SensitiveCheckResult checkResult = sensitiveCheck(string, i, ValidModeEnum.FAIL_FAST, context);
// // 快速返回
// if (checkResult.index() > 0) {
// return true;
// }
// }
// return false;
// }
//
// /**
// * 返回所有对应的敏感词
// * 1结果是有序的
// * 2为了保留所有的下标结果从 v0.1.0 之后不再去重。
// *
// * @param string 原始字符串
// * @return 结果
// * @since 0.0.1
// */
// @Override
// public List<IWordResult> findAll(String string, final IWordContext context) {
// return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context);
// }
//
// @Override
// public IWordResult findFirst(String string, final IWordContext context) {
// List<IWordResult> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context);
//
// if (CollectionUtil.isEmpty(stringList)) {
// return null;
// }
//
// return stringList.get(0);
// }
//
// @Override
// public String replace(String target, final IWordContext context) {
// if(StringUtil.isEmpty(target)) {
// return target;
// }
//
// return this.replaceSensitiveWord(target, context);
// }
//
// /**
// * 获取敏感词列表
// *
// * @param text 文本
// * @param modeEnum 模式
// * @return 结果列表
// * @since 0.0.1
// */
// private List<IWordResult> getSensitiveWords(final String text, final ValidModeEnum modeEnum,
// final IWordContext context) {
// //1. 是否存在敏感词,如果比存在,直接返回空列表
// if (StringUtil.isEmpty(text)) {
// return Guavas.newArrayList();
// }
//
// List<IWordResult> resultList = Guavas.newArrayList();
// for (int i = 0; i < text.length(); i++) {
// SensitiveCheckResult checkResult = sensitiveCheck(text, i, ValidModeEnum.FAIL_OVER, context);
// // 命中
// int wordLength = checkResult.index();
// if (wordLength > 0) {
// // 保存敏感词
// String sensitiveWord = text.substring(i, i + wordLength);
//
// // 添加去重
// WordResult wordResult = WordResult.newInstance()
// .startIndex(i)
// .endIndex(i+wordLength)
// .word(sensitiveWord);
// resultList.add(wordResult);
//
// // 快速返回
// if (ValidModeEnum.FAIL_FAST.equals(modeEnum)) {
// break;
// }
//
// // 增加 i 的步长
// // 为什么要-1因为默认就会自增1
// // TODO: 这里可以根据字符串匹配算法优化。
// i += wordLength - 1;
// }
// }
//
// return resultList;
// }
//
// /**
// * 直接替换敏感词,返回替换后的结果
// * @param target 文本信息
// * @param context 上下文
// * @return 脱敏后的字符串
// * @since 0.0.2
// */
// private String replaceSensitiveWord(final String target,
// final IWordContext context) {
// if(StringUtil.isEmpty(target)) {
// return target;
// }
// // 用于结果构建
// StringBuilder resultBuilder = new StringBuilder(target.length());
//
// for (int i = 0; i < target.length(); i++) {
// char currentChar = target.charAt(i);
// // 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词
// SensitiveCheckResult checkResult = sensitiveCheck(target, i, ValidModeEnum.FAIL_OVER, context);
//
// // 敏感词
// int wordLength = checkResult.index();
// if(wordLength > 0) {
// // 是否执行替换
// Class checkClass = checkResult.checkClass();
// String string = target.substring(i, i+wordLength);
// if(SensitiveCheckUrl.class.equals(checkClass)
// && FileUtil.isImage(string)) {
// // 直接使用原始内容,避免 markdown 图片转换失败
// resultBuilder.append(string);
// } else {
// // 创建上下文
// ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance()
// .sensitiveWord(string)
// .wordLength(wordLength);
// String replaceStr = context.sensitiveWordReplace().replace(replaceContext);
//
// resultBuilder.append(replaceStr);
// }
//
// // 直接跳过敏感词的长度
// i += wordLength-1;
// } else {
// // 普通词
// resultBuilder.append(currentChar);
// }
// }
//
// return resultBuilder.toString();
// }
//
// @Override
// public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
// // 默认执行敏感词操作
// context.sensitiveWordMap(innerWordMap);
//
// // 责任链模式调用
// return context.sensitiveCheck()
// .sensitiveCheck(txt, beginIndex, validModeEnum, context);
// }
//
//}

View File

@@ -1,24 +0,0 @@
package com.github.houbb.sensitive.word.support.map;
import com.github.houbb.sensitive.word.api.IWordMap;
/**
 * Static factory for sensitive-word map strategies.
 *
 * @author binbin.hou
 * @since 0.3.0
 */
public final class WordMaps {

    private WordMaps(){}

    /**
     * Default strategy.
     * @return strategy
     * @since 0.3.0
     */
    public static IWordMap defaults() {
        return new WordMap();
    }

}

View File

@@ -31,4 +31,13 @@ public final class SensitiveWordReplaces {
return new SensitiveWordReplaceChar();
}
/**
* 字符,默认为 *
* @return 结果
* @since 0.7.0
*/
public static ISensitiveWordReplace defaults() {
return chars();
}
}

View File

@@ -13,6 +13,7 @@ public class BenchmarkTimesTest {
* 测试基准100+字符串 * 10W次
*
* V0.6.0: 1470ms接近 7.2W QPS
* V0.7.0: 1380ms
*/
@Test
public void onlyWordAndNoReplaceTest() {
@@ -45,6 +46,7 @@ public class BenchmarkTimesTest {
* 测试基准100+字符串 * 10W次
*
* V0.6.0: 2744ms, 约 3.7W QPS
* V0.7.0: 2723ms
*/
@Test
public void onlyWordAndWithReplaceTest() {

View File

@@ -0,0 +1,27 @@
package com.github.houbb.sensitive.word.bs;
import com.github.houbb.sensitive.word.support.data.WordDatas;
import org.junit.Assert;
import org.junit.Test;
/**
 * Verifies that {@code SensitiveWordBs} accepts a user-supplied
 * {@code IWordData} strategy via {@code wordData(...)}.
 *
 * @since 0.7.0
 */
public class SensitiveWordBsDataTest {

    @Test
    public void wordDataConfigTest() {
        // Explicitly configure the tree-backed word data implementation.
        SensitiveWordBs wordBs = SensitiveWordBs.newInstance()
                .wordData(WordDatas.tree())
                .init();

        final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";

        Assert.assertTrue(wordBs.contains(text));
        Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordBs.findAll(text).toString());
    }

}

View File

@@ -106,8 +106,8 @@ public class SensitiveWordBsTest {
@Test
public void configTest() {
SensitiveWordBs wordBs = SensitiveWordBs.newInstance()
.wordDeny(WordDenys.system())
.wordAllow(WordAllows.system())
.wordDeny(WordDenys.defaults())
.wordAllow(WordAllows.defaults())
.init();
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";

View File

@@ -12,8 +12,8 @@ public class MySensitiveTest {
@Test
public void test() {
IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDeny());
IWordAllow wordAllow = WordAllows.chains(WordAllows.system(), new MyWordAllow());
IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny());
IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new MyWordAllow());
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
.wordAllow(wordAllow)
.wordDeny(wordDeny)// 各种其他配置

View File

@@ -10,7 +10,7 @@ public class MyWordDenyChineseTest {
@Test
public void test() {
IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDenyChineseNum());
IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDenyChineseNum());
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
.wordDeny(wordDeny)// 各种其他配置
.init();// init() 初始化敏感词字典

View File

@@ -30,8 +30,8 @@ public class SensitiveWordBsDefineTest {
public void defineChainsTest() {
String text = "这是一个测试。我的自定义敏感词。";
IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDeny());
IWordAllow wordAllow = WordAllows.chains(WordAllows.system(), new MyWordAllow());
IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny());
IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new MyWordAllow());
SensitiveWordBs wordBs = SensitiveWordBs.newInstance()
.wordDeny(wordDeny)

View File

@@ -0,0 +1,49 @@
package com.github.houbb.sensitive.word.memory;
import com.github.houbb.heaven.util.io.StreamUtil;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.support.data.WordDatas;
import org.apache.lucene.util.RamUsageEstimator;
import org.junit.Ignore;
import org.junit.Test;
import java.util.List;
/**
 * Memory footprint comparison between the word-data implementations.
 *
 * @since 0.7.0
 */
@Ignore
public class DataMemoryTest {

    /**
     * Hash-map implementation: ~35.5 MB
     */
    @Test
    public void hashMapTest() {
        List<String> allLines = StreamUtil.readAllLines("/dict.txt");

        // Fix: measure the hash-map implementation explicitly.
        // WordDatas.defaults() resolves to tree(), so the previous code
        // measured the tree here instead of the hash map.
        IWordData wordData = WordDatas.hashMap();
        wordData.initWordData(allLines);

        // Retained size of the whole object graph, human readable (e.g. 2KB).
        String humanSize = RamUsageEstimator.humanSizeOf(wordData);
        System.out.println(humanSize);
    }

    /**
     * Tree implementation: ~33.4 MB
     */
    @Test
    public void treeTest() {
        List<String> allLines = StreamUtil.readAllLines("/dict.txt");

        IWordData wordData = WordDatas.tree();
        wordData.initWordData(allLines);

        // Retained size of the whole object graph, human readable (e.g. 2KB).
        String humanSize = RamUsageEstimator.humanSizeOf(wordData);
        System.out.println(humanSize);
    }

}

View File

@@ -29,7 +29,7 @@ public class SpringSensitiveWordConfig {
@Bean
public SensitiveWordBs sensitiveWordBs() {
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
.wordAllow(WordAllows.chains(WordAllows.system(), myDdWordAllow))
.wordAllow(WordAllows.chains(WordAllows.defaults(), myDdWordAllow))
.wordDeny(myDdWordDeny)
// 各种其他配置
.init();