mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
release branch 0.7.0
This commit is contained in:
@@ -184,3 +184,9 @@
|
||||
|:---|:-----|--------------------------|:--------------------|:-------|
|
||||
| 1 | O | 性能优化:字符映射统一处理一遍,而不是每次都处理 | 2023-06-09 23:51:58 | |
|
||||
| 2 | D | 移除废弃的 replaceContext | 2023-06-09 23:51:58 | |
|
||||
|
||||
# release_0.7.0
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:-----|---------------------------------------------|:--------------------|:----------------|
|
||||
| 1 | A | IWordMap 命名调整为 IWordData, 添加 Tree 实现。优化内存占用 | 2023-06-09 23:51:58 | 避免过于限制,放开便于后续拓展 |
|
||||
14
README.md
14
README.md
@@ -58,7 +58,7 @@
|
||||
<dependency>
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.6.0</version>
|
||||
<version>0.7.0</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@@ -662,12 +662,16 @@ ps: 不同环境会有差异,但是比例基本稳定。
|
||||
|
||||
# 后期 road-map
|
||||
|
||||
- [x] wordMap 的抽象,便于拓展
|
||||
- [x] wordData 的内存占用对比 + 优化
|
||||
|
||||
- [x] word 的统一性能优化,移除 string 的生成
|
||||
- [ ] 用户指定自定义的词组,同时允许指定词组的组合获取,更加灵活
|
||||
|
||||
ICharFormat/ISensitiveCheck/Word 方法,允许用户自定义。
|
||||
|
||||
- [ ] word check 策略的优化,统一遍历+转换
|
||||
|
||||
- [ ] DFA 数据结构的另一种实现
|
||||
|
||||
- 同音字处理
|
||||
|
||||
- 形近字处理
|
||||
@@ -678,10 +682,6 @@ ps: 不同环境会有差异,但是比例基本稳定。
|
||||
|
||||
- 敏感词标签支持
|
||||
|
||||
- [ ] DFA 数据结构的另一种实现
|
||||
|
||||
放开 wordMap 策略定义
|
||||
|
||||
# 拓展阅读
|
||||
|
||||
[敏感词工具实现思路](https://houbb.github.io/2020/01/07/sensitive-word)
|
||||
|
||||
15
pom.xml
15
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.6.0</version>
|
||||
<version>0.7.0</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
@@ -68,6 +68,14 @@
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-core</artifactId>
|
||||
<version>4.0.0</version>
|
||||
<scope>test</scope>
|
||||
<optional>true</optional>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
||||
@@ -91,6 +99,11 @@
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
|
||||
@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
|
||||
|
||||
:: 版本号信息(需要手动指定)
|
||||
:::: 旧版本名称
|
||||
SET version=0.6.0
|
||||
SET version=0.7.0
|
||||
:::: 新版本名称
|
||||
SET newVersion=0.7.0
|
||||
SET newVersion=0.8.0
|
||||
:::: 组织名称
|
||||
SET groupName=com.github.houbb
|
||||
:::: 项目名称
|
||||
|
||||
@@ -223,7 +223,7 @@ public interface IWordContext {
|
||||
* @return 策略
|
||||
* @since 0.3.2
|
||||
*/
|
||||
IWordMap wordMap();
|
||||
IWordData wordData();
|
||||
|
||||
/**
|
||||
* 设置 wordMap 策略
|
||||
@@ -231,6 +231,6 @@ public interface IWordContext {
|
||||
* @return this
|
||||
* @since 0.3.2
|
||||
*/
|
||||
IWordContext wordMap(IWordMap wordMap);
|
||||
IWordContext wordData(IWordData wordMap);
|
||||
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ import java.util.Collection;
|
||||
* @author binbin.hou
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public interface IWordMap {
|
||||
public interface IWordData {
|
||||
|
||||
|
||||
/**
|
||||
@@ -19,7 +19,7 @@ public interface IWordMap {
|
||||
* @param collection 集合信息
|
||||
* @since 0.0.1
|
||||
*/
|
||||
void initWordMap(Collection<String> collection);
|
||||
void initWordData(Collection<String> collection);
|
||||
|
||||
/**
|
||||
* 是否包含敏感词
|
||||
@@ -10,7 +10,7 @@ import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.support.check.impl.SensitiveChecks;
|
||||
import com.github.houbb.sensitive.word.support.deny.WordDenys;
|
||||
import com.github.houbb.sensitive.word.support.format.CharFormats;
|
||||
import com.github.houbb.sensitive.word.support.map.WordMaps;
|
||||
import com.github.houbb.sensitive.word.support.data.WordDatas;
|
||||
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaces;
|
||||
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
|
||||
import com.github.houbb.sensitive.word.utils.InnerWordDataUtils;
|
||||
@@ -95,31 +95,29 @@ public class SensitiveWordBs {
|
||||
private ISensitiveWord sensitiveWord = SensitiveWords.defaults();
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
*
|
||||
* TODO: 暂时定义为 final,后续放开抽象。
|
||||
* 敏感词 Data
|
||||
*
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private final IWordMap wordMap = WordMaps.defaults();
|
||||
private IWordData wordData = WordDatas.defaults();
|
||||
|
||||
/**
|
||||
* 禁止的单词
|
||||
* @since 0.0.13
|
||||
*/
|
||||
private IWordDeny wordDeny = WordDenys.system();
|
||||
private IWordDeny wordDeny = WordDenys.defaults();
|
||||
|
||||
/**
|
||||
* 允许的单词
|
||||
* @since 0.0.13
|
||||
*/
|
||||
private IWordAllow wordAllow = WordAllows.system();
|
||||
private IWordAllow wordAllow = WordAllows.defaults();
|
||||
|
||||
/**
|
||||
* 替换策略
|
||||
* @since 0.3.0
|
||||
*/
|
||||
private ISensitiveWordReplace sensitiveWordReplace = SensitiveWordReplaces.chars();
|
||||
private ISensitiveWordReplace sensitiveWordReplace = SensitiveWordReplaces.defaults();
|
||||
|
||||
/**
|
||||
* 上下文
|
||||
@@ -191,7 +189,7 @@ public class SensitiveWordBs {
|
||||
// 额外配置
|
||||
context.sensitiveCheckNumLen(numCheckLen);
|
||||
context.sensitiveWordReplace(sensitiveWordReplace);
|
||||
context.wordMap(wordMap);
|
||||
context.wordData(wordData);
|
||||
|
||||
return context;
|
||||
}
|
||||
@@ -209,7 +207,20 @@ public class SensitiveWordBs {
|
||||
List<String> results = InnerWordDataUtils.getActualDenyList(denyList, allowList, context);
|
||||
|
||||
// 便于可以多次初始化
|
||||
wordMap.initWordMap(results);
|
||||
wordData.initWordData(results);
|
||||
}
|
||||
|
||||
/**
|
||||
* 允许指定策略数据
|
||||
* @param wordData 单词数据
|
||||
* @return 结果
|
||||
* @since 0.7.0
|
||||
*/
|
||||
public SensitiveWordBs wordData(IWordData wordData) {
|
||||
ArgUtil.notNull(wordData, "wordData");
|
||||
|
||||
this.wordData = wordData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SensitiveWordBs sensitiveWord(ISensitiveWord sensitiveWord) {
|
||||
|
||||
@@ -3,7 +3,7 @@ package com.github.houbb.sensitive.word.bs;
|
||||
import com.github.houbb.sensitive.word.api.ICharFormat;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
import com.github.houbb.sensitive.word.api.IWordData;
|
||||
import com.github.houbb.sensitive.word.support.check.ISensitiveCheck;
|
||||
|
||||
/**
|
||||
@@ -102,14 +102,14 @@ public class SensitiveWordContext implements IWordContext {
|
||||
*
|
||||
* @since 0.3.2
|
||||
*/
|
||||
private IWordMap wordMap;
|
||||
private IWordData wordData;
|
||||
|
||||
public IWordMap wordMap() {
|
||||
return wordMap;
|
||||
public IWordData wordData() {
|
||||
return wordData;
|
||||
}
|
||||
|
||||
public SensitiveWordContext wordMap(IWordMap wordMap) {
|
||||
this.wordMap = wordMap;
|
||||
public SensitiveWordContext wordData(IWordData wordData) {
|
||||
this.wordData = wordData;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ public final class WordAllows {
|
||||
* @return 结果
|
||||
* @since 0.0.13
|
||||
*/
|
||||
public static IWordAllow system() {
|
||||
public static IWordAllow defaults() {
|
||||
return WordAllowSystem.getInstance();
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ package com.github.houbb.sensitive.word.support.check.impl;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
import com.github.houbb.sensitive.word.api.IWordData;
|
||||
import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
|
||||
@@ -41,7 +41,7 @@ public class SensitiveCheckWord extends AbstractSensitiveCheck {
|
||||
|
||||
// 采用 ThreadLocal 应该可以提升性能,减少对象的创建。
|
||||
int actualLength = 0;
|
||||
final IWordMap wordMap = context.wordMap();
|
||||
final IWordData wordData = context.wordData();
|
||||
|
||||
// 前一个条件
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
@@ -53,7 +53,7 @@ public class SensitiveCheckWord extends AbstractSensitiveCheck {
|
||||
stringBuilder.append(mappingChar);
|
||||
|
||||
// 判断是否存在
|
||||
WordContainsTypeEnum wordContainsTypeEnum = wordMap.contains(stringBuilder, innerContext);
|
||||
WordContainsTypeEnum wordContainsTypeEnum = wordData.contains(stringBuilder, innerContext);
|
||||
if(WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnum)) {
|
||||
actualLength = stringBuilder.length();
|
||||
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
package com.github.houbb.sensitive.word.support.data;
|
||||
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordData;
|
||||
import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* 抽象数据
|
||||
*
|
||||
* @since 0.7.0
|
||||
*/
|
||||
public abstract class AbstractWordData implements IWordData {
|
||||
|
||||
/**
|
||||
* 是否包含
|
||||
* @param stringBuilder 字符
|
||||
* @param innerContext 上下文
|
||||
* @return 结果
|
||||
*/
|
||||
protected abstract WordContainsTypeEnum doContains(StringBuilder stringBuilder, InnerSensitiveContext innerContext);
|
||||
|
||||
/**
|
||||
* 初始化
|
||||
* @param collection 数据
|
||||
*/
|
||||
protected abstract void doInitWordData(Collection<String> collection);
|
||||
|
||||
@Override
|
||||
public void initWordData(Collection<String> collection) {
|
||||
//1. 预留
|
||||
|
||||
this.doInitWordData(collection);
|
||||
}
|
||||
|
||||
@Override
|
||||
public WordContainsTypeEnum contains(StringBuilder stringBuilder, InnerSensitiveContext innerContext) {
|
||||
if(stringBuilder == null
|
||||
|| stringBuilder.length() <= 0) {
|
||||
return WordContainsTypeEnum.NOT_FOUND;
|
||||
}
|
||||
|
||||
return doContains(stringBuilder, innerContext);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,10 +1,9 @@
|
||||
package com.github.houbb.sensitive.word.support.map;
|
||||
package com.github.houbb.sensitive.word.support.data;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
|
||||
@@ -20,7 +19,7 @@ import java.util.Map;
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class WordMap implements IWordMap {
|
||||
public class WordDataHashMap extends AbstractWordData {
|
||||
|
||||
/**
|
||||
* 脱敏单词 map
|
||||
@@ -41,7 +40,7 @@ public class WordMap implements IWordMap {
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public synchronized void initWordMap(Collection<String> collection) {
|
||||
public synchronized void doInitWordData(Collection<String> collection) {
|
||||
// 避免扩容带来的消耗
|
||||
Map newInnerWordMap = new HashMap(collection.size());
|
||||
|
||||
@@ -78,12 +77,10 @@ public class WordMap implements IWordMap {
|
||||
// 将新节点设置为当前节点,方便下一次节点的循环。
|
||||
currentMap = newWordMap;
|
||||
}
|
||||
|
||||
// 判断是否为最后一个,添加是否结束的标识。
|
||||
if (i == size - 1) {
|
||||
currentMap.put(AppConst.IS_END, true);
|
||||
}
|
||||
}
|
||||
|
||||
// 判断是否为最后一个,添加是否结束的标识。
|
||||
currentMap.put(AppConst.IS_END, true);
|
||||
}
|
||||
|
||||
// 最后更新为新的 map,保证更新过程中旧的数据可用
|
||||
@@ -101,13 +98,8 @@ public class WordMap implements IWordMap {
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@Override
|
||||
public WordContainsTypeEnum contains(final StringBuilder stringBuilder,
|
||||
public WordContainsTypeEnum doContains(final StringBuilder stringBuilder,
|
||||
final InnerSensitiveContext innerContext) {
|
||||
if (stringBuilder == null
|
||||
|| stringBuilder.length() <= 0) {
|
||||
return WordContainsTypeEnum.NOT_FOUND;
|
||||
}
|
||||
|
||||
return innerContainsSensitive(stringBuilder, innerContext);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
package com.github.houbb.sensitive.word.support.data;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordData;
|
||||
import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
* PR:https://github.com/houbb/sensitive-word/pull/33
|
||||
*
|
||||
* @author xiaochangbai
|
||||
* @author binbin.hou
|
||||
* @since 0.7.0
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class WordDataTree implements IWordData {
|
||||
|
||||
/**
|
||||
* 根节点
|
||||
*/
|
||||
private WordDataTreeNode root;
|
||||
|
||||
@Override
|
||||
public synchronized void initWordData(Collection<String> collection) {
|
||||
WordDataTreeNode newRoot = new WordDataTreeNode();
|
||||
|
||||
for(String word : collection) {
|
||||
if(StringUtil.isEmpty(word)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
WordDataTreeNode tempNode = newRoot;
|
||||
char[] chars = word.toCharArray();
|
||||
for (char c : chars) {
|
||||
// 获取子节点
|
||||
WordDataTreeNode subNode = tempNode.getSubNode(c);
|
||||
if (subNode == null) {
|
||||
subNode = new WordDataTreeNode();
|
||||
// 加入新的子节点
|
||||
tempNode.addSubNode(c, subNode);
|
||||
}
|
||||
|
||||
// 临时节点指向子节点,进入下一次循环
|
||||
tempNode = subNode;
|
||||
}
|
||||
|
||||
// 设置结束标识(循环结束,设置一次即可)
|
||||
tempNode.end(true);
|
||||
}
|
||||
|
||||
// 初始化完成才做替换
|
||||
this.root = newRoot;
|
||||
}
|
||||
|
||||
@Override
|
||||
public WordContainsTypeEnum contains(StringBuilder stringBuilder,
|
||||
InnerSensitiveContext innerContext) {
|
||||
WordDataTreeNode nowNode = root;
|
||||
|
||||
int len = stringBuilder.length();
|
||||
|
||||
for(int i = 0; i < len; i++) {
|
||||
// 获取当前的 map 信息
|
||||
nowNode = getNowMap(nowNode, i, stringBuilder, innerContext);
|
||||
|
||||
// 如果不为空,则判断是否为结尾。
|
||||
if (ObjectUtil.isNull(nowNode)) {
|
||||
return WordContainsTypeEnum.NOT_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
if(nowNode.end()) {
|
||||
return WordContainsTypeEnum.CONTAINS_END;
|
||||
}
|
||||
|
||||
return WordContainsTypeEnum.CONTAINS_PREFIX;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 获取当前的 Map
|
||||
* @param nowNode 当前节点
|
||||
* @param index 下标
|
||||
* @param stringBuilder 文本缓存
|
||||
* @param sensitiveContext 上下文
|
||||
* @return 实际的当前 map
|
||||
* @since 0.0.7
|
||||
*/
|
||||
private WordDataTreeNode getNowMap(WordDataTreeNode nowNode,
|
||||
final int index,
|
||||
final StringBuilder stringBuilder,
|
||||
final InnerSensitiveContext sensitiveContext) {
|
||||
final IWordContext context = sensitiveContext.wordContext();
|
||||
|
||||
// 这里的 char 已经是统一格式化之后的,所以可以不用再次格式化。
|
||||
char mappingChar = stringBuilder.charAt(index);
|
||||
|
||||
// 这里做一次重复词的处理
|
||||
WordDataTreeNode currentMap = nowNode.getSubNode(mappingChar);
|
||||
// 启用忽略重复&当前下标不是第一个
|
||||
if(context.ignoreRepeat()
|
||||
&& index > 0) {
|
||||
char preMappingChar = stringBuilder.charAt(index-1);
|
||||
|
||||
// 直接赋值为上一个 map
|
||||
if(preMappingChar == mappingChar) {
|
||||
currentMap = nowNode;
|
||||
}
|
||||
}
|
||||
|
||||
return currentMap;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
package com.github.houbb.sensitive.word.support.data;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 树节点
|
||||
*
|
||||
* @since 0.7.0
|
||||
*/
|
||||
/**
 * A single node of the sensitive-word DFA tree.
 *
 * <p>Holds an end-of-word flag plus a lazily created map of child nodes,
 * keyed by the next character of a word.</p>
 *
 * @since 0.7.0
 */
public class WordDataTreeNode {

    // True when some keyword terminates exactly at this node.
    private boolean end;

    // Children keyed by the following character; null until first child is added.
    private Map<Character, WordDataTreeNode> subNodeMap;

    /**
     * @return whether a keyword ends at this node
     */
    public boolean end() {
        return end;
    }

    /**
     * Set the end-of-word flag.
     *
     * @param end new flag value
     * @return this node (fluent)
     */
    public WordDataTreeNode end(boolean end) {
        this.end = end;
        return this;
    }

    /**
     * Look up the child node for the given character.
     *
     * @param c next character
     * @return child node, or {@code null} when absent
     */
    public WordDataTreeNode getSubNode(final char c) {
        return subNodeMap == null ? null : subNodeMap.get(c);
    }

    /**
     * Register a child node for the given character, creating the
     * child map on first use.
     *
     * @param c       next character
     * @param subNode child node to attach
     * @return this node (fluent)
     */
    public WordDataTreeNode addSubNode(char c, WordDataTreeNode subNode) {
        Map<Character, WordDataTreeNode> children = this.subNodeMap;
        if (children == null) {
            children = new HashMap<>();
            this.subNodeMap = children;
        }

        children.put(c, subNode);
        return this;
    }

}
|
||||
@@ -0,0 +1,43 @@
|
||||
package com.github.houbb.sensitive.word.support.data;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordData;
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.3.0
|
||||
*/
|
||||
public final class WordDatas {

    private WordDatas(){}

    /**
     * Default strategy (currently the tree implementation).
     * @return strategy
     * @since 0.3.0
     */
    public static IWordData defaults() {
        return tree();
    }

    /**
     * Tree mode (DFA tree, lower memory footprint).
     * @return tree-based word data
     * @since 0.7.0
     */
    public static IWordData tree() {
        return new WordDataTree();
    }

    /**
     * HashMap mode (nested-map DFA).
     * NOTE(review): original javadoc said "tree mode" here — copy-paste error.
     * @return hash-map-based word data
     * @since 0.7.0
     */
    public static IWordData hashMap() {
        return new WordDataHashMap();
    }


}
|
||||
@@ -41,7 +41,7 @@ public final class WordDenys {
|
||||
* @return 结果
|
||||
* @since 0.0.13
|
||||
*/
|
||||
public static IWordDeny system() {
|
||||
public static IWordDeny defaults() {
|
||||
return WordDenySystem.getInstance();
|
||||
}
|
||||
|
||||
|
||||
@@ -1,265 +0,0 @@
|
||||
//package com.github.houbb.sensitive.word.support.map;
|
||||
//
|
||||
//import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
//import com.github.houbb.heaven.util.guava.Guavas;
|
||||
//import com.github.houbb.heaven.util.io.FileUtil;
|
||||
//import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
//import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
//import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
//import com.github.houbb.sensitive.word.api.*;
|
||||
//import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
//import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
//import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
//import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl;
|
||||
//import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext;
|
||||
//import com.github.houbb.sensitive.word.support.result.WordResult;
|
||||
//
|
||||
//import java.util.Collection;
|
||||
//import java.util.HashMap;
|
||||
//import java.util.List;
|
||||
//import java.util.Map;
|
||||
//
|
||||
///**
|
||||
// * 敏感词 map
|
||||
// *
|
||||
// * @author binbin.hou
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
//@ThreadSafe
|
||||
//public class SensitiveWordMap implements IWordMap {
|
||||
//
|
||||
// /**
|
||||
// * 脱敏单词 map
|
||||
// *
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
// private Map innerWordMap;
|
||||
//
|
||||
// /**
|
||||
// * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
|
||||
// *
|
||||
// * @param collection 敏感词库集合
|
||||
// * @since 0.0.1
|
||||
// * <p>
|
||||
// * 使用对象代码 map 的这种一直递归。
|
||||
// * 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html
|
||||
// * https://blog.csdn.net/chenssy/article/details/26961957
|
||||
// */
|
||||
// @Override
|
||||
// @SuppressWarnings("unchecked")
|
||||
// public synchronized void initWordMap(Collection<String> collection) {
|
||||
// // 避免扩容带来的消耗
|
||||
// Map newInnerWordMap = new HashMap(collection.size());
|
||||
//
|
||||
// for (String key : collection) {
|
||||
// if (StringUtil.isEmpty(key)) {
|
||||
// continue;
|
||||
// }
|
||||
//
|
||||
// // 用来按照相应的格式保存敏感词库数据
|
||||
// char[] chars = key.toCharArray();
|
||||
// final int size = chars.length;
|
||||
//
|
||||
// // 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中
|
||||
// Map currentMap = newInnerWordMap;
|
||||
//
|
||||
// for (int i = 0; i < size; i++) {
|
||||
// // 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值
|
||||
// char charKey = chars[i];
|
||||
// // 如果集合存在
|
||||
// Object wordMap = currentMap.get(charKey);
|
||||
//
|
||||
// // 如果集合存在
|
||||
// if (ObjectUtil.isNotNull(wordMap)) {
|
||||
// // 直接将获取到的 map 当前当前 map 进行继续的操作
|
||||
// currentMap = (Map) wordMap;
|
||||
// } else {
|
||||
// //不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一
|
||||
// Map<String, Boolean> newWordMap = new HashMap<>(8);
|
||||
// newWordMap.put(AppConst.IS_END, false);
|
||||
//
|
||||
// // 将新的节点放入当前 map 中
|
||||
// currentMap.put(charKey, newWordMap);
|
||||
//
|
||||
// // 将新节点设置为当前节点,方便下一次节点的循环。
|
||||
// currentMap = newWordMap;
|
||||
// }
|
||||
//
|
||||
// // 判断是否为最后一个,添加是否结束的标识。
|
||||
// if (i == size - 1) {
|
||||
// currentMap.put(AppConst.IS_END, true);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// // 最后更新为新的 map,保证更新过程中旧的数据可用
|
||||
// this.innerWordMap = newInnerWordMap;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 是否包含
|
||||
// * (1)直接遍历所有
|
||||
// * (2)如果遇到,则直接返回 true
|
||||
// *
|
||||
// * @param string 字符串
|
||||
// * @return 是否包含
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
// @Override
|
||||
// public boolean contains(String string, final IWordContext context) {
|
||||
// if (StringUtil.isEmpty(string)) {
|
||||
// return false;
|
||||
// }
|
||||
//
|
||||
// for (int i = 0; i < string.length(); i++) {
|
||||
// SensitiveCheckResult checkResult = sensitiveCheck(string, i, ValidModeEnum.FAIL_FAST, context);
|
||||
// // 快速返回
|
||||
// if (checkResult.index() > 0) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// return false;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 返回所有对应的敏感词
|
||||
// * (1)结果是有序的
|
||||
// * (2)为了保留所有的下标,结果从 v0.1.0 之后不再去重。
|
||||
// *
|
||||
// * @param string 原始字符串
|
||||
// * @return 结果
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
// @Override
|
||||
// public List<IWordResult> findAll(String string, final IWordContext context) {
|
||||
// return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context);
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public IWordResult findFirst(String string, final IWordContext context) {
|
||||
// List<IWordResult> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context);
|
||||
//
|
||||
// if (CollectionUtil.isEmpty(stringList)) {
|
||||
// return null;
|
||||
// }
|
||||
//
|
||||
// return stringList.get(0);
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public String replace(String target, final IWordContext context) {
|
||||
// if(StringUtil.isEmpty(target)) {
|
||||
// return target;
|
||||
// }
|
||||
//
|
||||
// return this.replaceSensitiveWord(target, context);
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 获取敏感词列表
|
||||
// *
|
||||
// * @param text 文本
|
||||
// * @param modeEnum 模式
|
||||
// * @return 结果列表
|
||||
// * @since 0.0.1
|
||||
// */
|
||||
// private List<IWordResult> getSensitiveWords(final String text, final ValidModeEnum modeEnum,
|
||||
// final IWordContext context) {
|
||||
// //1. 是否存在敏感词,如果比存在,直接返回空列表
|
||||
// if (StringUtil.isEmpty(text)) {
|
||||
// return Guavas.newArrayList();
|
||||
// }
|
||||
//
|
||||
// List<IWordResult> resultList = Guavas.newArrayList();
|
||||
// for (int i = 0; i < text.length(); i++) {
|
||||
// SensitiveCheckResult checkResult = sensitiveCheck(text, i, ValidModeEnum.FAIL_OVER, context);
|
||||
// // 命中
|
||||
// int wordLength = checkResult.index();
|
||||
// if (wordLength > 0) {
|
||||
// // 保存敏感词
|
||||
// String sensitiveWord = text.substring(i, i + wordLength);
|
||||
//
|
||||
// // 添加去重
|
||||
// WordResult wordResult = WordResult.newInstance()
|
||||
// .startIndex(i)
|
||||
// .endIndex(i+wordLength)
|
||||
// .word(sensitiveWord);
|
||||
// resultList.add(wordResult);
|
||||
//
|
||||
// // 快速返回
|
||||
// if (ValidModeEnum.FAIL_FAST.equals(modeEnum)) {
|
||||
// break;
|
||||
// }
|
||||
//
|
||||
// // 增加 i 的步长
|
||||
// // 为什么要-1,因为默认就会自增1
|
||||
// // TODO: 这里可以根据字符串匹配算法优化。
|
||||
// i += wordLength - 1;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return resultList;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 直接替换敏感词,返回替换后的结果
|
||||
// * @param target 文本信息
|
||||
// * @param context 上下文
|
||||
// * @return 脱敏后的字符串
|
||||
// * @since 0.0.2
|
||||
// */
|
||||
// private String replaceSensitiveWord(final String target,
|
||||
// final IWordContext context) {
|
||||
// if(StringUtil.isEmpty(target)) {
|
||||
// return target;
|
||||
// }
|
||||
// // 用于结果构建
|
||||
// StringBuilder resultBuilder = new StringBuilder(target.length());
|
||||
//
|
||||
// for (int i = 0; i < target.length(); i++) {
|
||||
// char currentChar = target.charAt(i);
|
||||
// // 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词
|
||||
// SensitiveCheckResult checkResult = sensitiveCheck(target, i, ValidModeEnum.FAIL_OVER, context);
|
||||
//
|
||||
// // 敏感词
|
||||
// int wordLength = checkResult.index();
|
||||
// if(wordLength > 0) {
|
||||
// // 是否执行替换
|
||||
// Class checkClass = checkResult.checkClass();
|
||||
// String string = target.substring(i, i+wordLength);
|
||||
// if(SensitiveCheckUrl.class.equals(checkClass)
|
||||
// && FileUtil.isImage(string)) {
|
||||
// // 直接使用原始内容,避免 markdown 图片转换失败
|
||||
// resultBuilder.append(string);
|
||||
// } else {
|
||||
// // 创建上下文
|
||||
// ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance()
|
||||
// .sensitiveWord(string)
|
||||
// .wordLength(wordLength);
|
||||
// String replaceStr = context.sensitiveWordReplace().replace(replaceContext);
|
||||
//
|
||||
// resultBuilder.append(replaceStr);
|
||||
// }
|
||||
//
|
||||
// // 直接跳过敏感词的长度
|
||||
// i += wordLength-1;
|
||||
// } else {
|
||||
// // 普通词
|
||||
// resultBuilder.append(currentChar);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return resultBuilder.toString();
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// // 默认执行敏感词操作
|
||||
// context.sensitiveWordMap(innerWordMap);
|
||||
//
|
||||
// // 责任链模式调用
|
||||
// return context.sensitiveCheck()
|
||||
// .sensitiveCheck(txt, beginIndex, validModeEnum, context);
|
||||
// }
|
||||
//
|
||||
//}
|
||||
@@ -1,24 +0,0 @@
|
||||
package com.github.houbb.sensitive.word.support.map;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.3.0
|
||||
*/
|
||||
public final class WordMaps {
|
||||
|
||||
private WordMaps(){}
|
||||
|
||||
/**
|
||||
* 默认策略
|
||||
* @return 策略
|
||||
* @since 0.3.0
|
||||
*/
|
||||
public static IWordMap defaults() {
|
||||
return new WordMap();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -31,4 +31,13 @@ public final class SensitiveWordReplaces {
|
||||
return new SensitiveWordReplaceChar();
|
||||
}
|
||||
|
||||
/**
|
||||
* 字符,默认为 *
|
||||
* @return 结果
|
||||
* @since 0.7.0
|
||||
*/
|
||||
public static ISensitiveWordReplace defaults() {
|
||||
return chars();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ public class BenchmarkTimesTest {
|
||||
* 测试基准:100+字符串 * 10W次
|
||||
*
|
||||
* V0.6.0: 1470ms,接近 7.2W QPS
|
||||
* V0.7.0: 1380ms
|
||||
*/
|
||||
@Test
|
||||
public void onlyWordAndNoReplaceTest() {
|
||||
@@ -45,6 +46,7 @@ public class BenchmarkTimesTest {
|
||||
* 测试基准:100+字符串 * 10W次
|
||||
*
|
||||
* V0.6.0: 2744ms, 约 3.7W QPS
|
||||
* V0.7.0: 2723ms
|
||||
*/
|
||||
@Test
|
||||
public void onlyWordAndWithReplaceTest() {
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import com.github.houbb.sensitive.word.support.data.WordDatas;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* <p> project: sensitive-word-SensitiveWordBsConfigTest </p>
|
||||
* <p> create on 2020/1/7 23:43 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 0.7.0
|
||||
*/
|
||||
public class SensitiveWordBsDataTest {
|
||||
|
||||
@Test
|
||||
public void wordDataConfigTest() {
|
||||
SensitiveWordBs wordBs = SensitiveWordBs.newInstance()
|
||||
.wordData(WordDatas.tree())
|
||||
.init();
|
||||
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
Assert.assertTrue(wordBs.contains(text));
|
||||
Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordBs.findAll(text).toString());
|
||||
}
|
||||
|
||||
}
|
||||
@@ -106,8 +106,8 @@ public class SensitiveWordBsTest {
|
||||
@Test
|
||||
public void configTest() {
|
||||
SensitiveWordBs wordBs = SensitiveWordBs.newInstance()
|
||||
.wordDeny(WordDenys.system())
|
||||
.wordAllow(WordAllows.system())
|
||||
.wordDeny(WordDenys.defaults())
|
||||
.wordAllow(WordAllows.defaults())
|
||||
.init();
|
||||
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
@@ -12,8 +12,8 @@ public class MySensitiveTest {
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDeny());
|
||||
IWordAllow wordAllow = WordAllows.chains(WordAllows.system(), new MyWordAllow());
|
||||
IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny());
|
||||
IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new MyWordAllow());
|
||||
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
|
||||
.wordAllow(wordAllow)
|
||||
.wordDeny(wordDeny)// 各种其他配置
|
||||
|
||||
@@ -10,7 +10,7 @@ public class MyWordDenyChineseTest {
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDenyChineseNum());
|
||||
IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDenyChineseNum());
|
||||
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
|
||||
.wordDeny(wordDeny)// 各种其他配置
|
||||
.init();// init() 初始化敏感词字典
|
||||
|
||||
@@ -30,8 +30,8 @@ public class SensitiveWordBsDefineTest {
|
||||
public void defineChainsTest() {
|
||||
String text = "这是一个测试。我的自定义敏感词。";
|
||||
|
||||
IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDeny());
|
||||
IWordAllow wordAllow = WordAllows.chains(WordAllows.system(), new MyWordAllow());
|
||||
IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny());
|
||||
IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new MyWordAllow());
|
||||
|
||||
SensitiveWordBs wordBs = SensitiveWordBs.newInstance()
|
||||
.wordDeny(wordDeny)
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
package com.github.houbb.sensitive.word.memory;
|
||||
|
||||
import com.github.houbb.heaven.util.io.StreamUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordData;
|
||||
import com.github.houbb.sensitive.word.support.data.WordDatas;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 数据内存测试
|
||||
*
|
||||
* @since 0.7.0
|
||||
*/
|
||||
@Ignore
|
||||
public class DataMemoryTest {
|
||||
|
||||
/**
|
||||
* 35.5 MB
|
||||
*/
|
||||
@Test
|
||||
public void hashMapTest() {
|
||||
List<String> allLines = StreamUtil.readAllLines("/dict.txt");
|
||||
IWordData wordData = WordDatas.defaults();
|
||||
|
||||
wordData.initWordData(allLines);
|
||||
|
||||
//计算指定对象及其引用树上的所有对象的综合大小,返回可读的结果,如:2KB
|
||||
String humanSize = RamUsageEstimator.humanSizeOf(wordData);
|
||||
System.out.println(humanSize);
|
||||
}
|
||||
|
||||
|
||||
//33.4 MB
|
||||
@Test
|
||||
public void treeTest() {
|
||||
List<String> allLines = StreamUtil.readAllLines("/dict.txt");
|
||||
IWordData wordData = WordDatas.tree();
|
||||
|
||||
wordData.initWordData(allLines);
|
||||
|
||||
//计算指定对象及其引用树上的所有对象的综合大小,返回可读的结果,如:2KB
|
||||
String humanSize = RamUsageEstimator.humanSizeOf(wordData);
|
||||
System.out.println(humanSize);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -29,7 +29,7 @@ public class SpringSensitiveWordConfig {
|
||||
@Bean
|
||||
public SensitiveWordBs sensitiveWordBs() {
|
||||
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
|
||||
.wordAllow(WordAllows.chains(WordAllows.system(), myDdWordAllow))
|
||||
.wordAllow(WordAllows.chains(WordAllows.defaults(), myDdWordAllow))
|
||||
.wordDeny(myDdWordDeny)
|
||||
// 各种其他配置
|
||||
.init();
|
||||
|
||||
Reference in New Issue
Block a user