diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index 0e6a462..178c077 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -184,3 +184,9 @@ |:---|:-----|--------------------------|:--------------------|:-------| | 1 | O | 性能优化:字符映射统一处理一遍,而不是每次都处理 | 2023-06-09 23:51:58 | | | 2 | D | 移除废弃的 replaceContext | 2023-06-09 23:51:58 | | + +# release_0.7.0 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:-----|---------------------------------------------|:--------------------|:----------------| +| 1 | A | IWordMap 命名调整为 IWordData, 添加 Tree 实现。优化内存占用 | 2023-06-09 23:51:58 | 避免过于限制,放开便于后续拓展 | \ No newline at end of file diff --git a/README.md b/README.md index 2932c9f..16e9760 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ com.github.houbb sensitive-word - 0.6.0 + 0.7.0 ``` @@ -662,12 +662,16 @@ ps: 不同环境会有差异,但是比例基本稳定。 # 后期 road-map -- [x] wordMap 的抽象,便于拓展 +- [x] wordData 的内存占用对比 + 优化 -- [x] word 的统一性能优化,移除 string 的生成 +- [ ] 用户指定自定义的词组,同时允许指定词组的组合获取,更加灵活 + +ICharFormat/ISensitiveCheck/Word 方法,允许用户自定义。 - [ ] word check 策略的优化,统一遍历+转换 +- [ ] DFA 数据结构的另一种实现 + - 同音字处理 - 形近字处理 @@ -678,10 +682,6 @@ ps: 不同环境会有差异,但是比例基本稳定。 - 敏感词标签支持 -- [ ] DFA 数据结构的另一种实现 - -放开 wordMap 策略定义 - # 拓展阅读 [敏感词工具实现思路](https://houbb.github.io/2020/01/07/sensitive-word) diff --git a/pom.xml b/pom.xml index 43277b5..d82fd93 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.6.0 + 0.7.0 @@ -68,6 +68,14 @@ true + + org.apache.lucene + lucene-core + 4.0.0 + test + true + + @@ -91,6 +99,11 @@ junit + + org.apache.lucene + lucene-core + + diff --git a/release.bat b/release.bat index 880cef9..3a66af8 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." 
:: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.6.0 +SET version=0.7.0 :::: 新版本名称 -SET newVersion=0.7.0 +SET newVersion=0.8.0 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index ee9a597..1a1da6c 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -223,7 +223,7 @@ public interface IWordContext { * @return 策略 * @since 0.3.2 */ - IWordMap wordMap(); + IWordData wordData(); /** * 设置 wordMap 策略 @@ -231,6 +231,6 @@ public interface IWordContext { * @return this * @since 0.3.2 */ - IWordContext wordMap(IWordMap wordMap); + IWordContext wordData(IWordData wordMap); } diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordData.java similarity index 91% rename from src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java rename to src/main/java/com/github/houbb/sensitive/word/api/IWordData.java index 548cf57..3095e9a 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordData.java @@ -11,7 +11,7 @@ import java.util.Collection; * @author binbin.hou * @since 0.0.1 */ -public interface IWordMap { +public interface IWordData { /** @@ -19,7 +19,7 @@ public interface IWordMap { * @param collection 集合信息 * @since 0.0.1 */ - void initWordMap(Collection collection); + void initWordData(Collection collection); /** * 是否包含敏感词 diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 01149bf..e17dda7 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -10,7 
+10,7 @@ import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; import com.github.houbb.sensitive.word.support.check.impl.SensitiveChecks; import com.github.houbb.sensitive.word.support.deny.WordDenys; import com.github.houbb.sensitive.word.support.format.CharFormats; -import com.github.houbb.sensitive.word.support.map.WordMaps; +import com.github.houbb.sensitive.word.support.data.WordDatas; import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaces; import com.github.houbb.sensitive.word.support.result.WordResultHandlers; import com.github.houbb.sensitive.word.utils.InnerWordDataUtils; @@ -95,31 +95,29 @@ public class SensitiveWordBs { private ISensitiveWord sensitiveWord = SensitiveWords.defaults(); /** - * 敏感词 map - * - * TODO: 暂时定义为 final,后续放开抽象。 + * 敏感词 Data * * @since 0.0.1 */ - private final IWordMap wordMap = WordMaps.defaults(); + private IWordData wordData = WordDatas.defaults(); /** * 禁止的单词 * @since 0.0.13 */ - private IWordDeny wordDeny = WordDenys.system(); + private IWordDeny wordDeny = WordDenys.defaults(); /** * 允许的单词 * @since 0.0.13 */ - private IWordAllow wordAllow = WordAllows.system(); + private IWordAllow wordAllow = WordAllows.defaults(); /** * 替换策略 * @since 0.3.0 */ - private ISensitiveWordReplace sensitiveWordReplace = SensitiveWordReplaces.chars(); + private ISensitiveWordReplace sensitiveWordReplace = SensitiveWordReplaces.defaults(); /** * 上下文 @@ -191,7 +189,7 @@ public class SensitiveWordBs { // 额外配置 context.sensitiveCheckNumLen(numCheckLen); context.sensitiveWordReplace(sensitiveWordReplace); - context.wordMap(wordMap); + context.wordData(wordData); return context; } @@ -209,7 +207,20 @@ public class SensitiveWordBs { List results = InnerWordDataUtils.getActualDenyList(denyList, allowList, context); // 便于可以多次初始化 - wordMap.initWordMap(results); + wordData.initWordData(results); + } + + /** + * 允许指定策略数据 + * @param wordData 单词数据 + * @return 结果 + * @since 0.7.0 + */ + public SensitiveWordBs 
wordData(IWordData wordData) { + ArgUtil.notNull(wordData, "wordData"); + + this.wordData = wordData; + return this; } public SensitiveWordBs sensitiveWord(ISensitiveWord sensitiveWord) { diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index c14f70f..545cc38 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -3,7 +3,7 @@ package com.github.houbb.sensitive.word.bs; import com.github.houbb.sensitive.word.api.ICharFormat; import com.github.houbb.sensitive.word.api.ISensitiveWordReplace; import com.github.houbb.sensitive.word.api.IWordContext; -import com.github.houbb.sensitive.word.api.IWordMap; +import com.github.houbb.sensitive.word.api.IWordData; import com.github.houbb.sensitive.word.support.check.ISensitiveCheck; /** @@ -102,14 +102,14 @@ public class SensitiveWordContext implements IWordContext { * * @since 0.3.2 */ - private IWordMap wordMap; + private IWordData wordData; - public IWordMap wordMap() { - return wordMap; + public IWordData wordData() { + return wordData; } - public SensitiveWordContext wordMap(IWordMap wordMap) { - this.wordMap = wordMap; + public SensitiveWordContext wordData(IWordData wordData) { + this.wordData = wordData; return this; } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/allow/WordAllows.java b/src/main/java/com/github/houbb/sensitive/word/support/allow/WordAllows.java index 48f6cd1..008647f 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/allow/WordAllows.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/allow/WordAllows.java @@ -41,7 +41,7 @@ public final class WordAllows { * @return 结果 * @since 0.0.13 */ - public static IWordAllow system() { + public static IWordAllow defaults() { return WordAllowSystem.getInstance(); } diff --git 
a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java index a60b802..c7330e1 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/impl/SensitiveCheckWord.java @@ -2,7 +2,7 @@ package com.github.houbb.sensitive.word.support.check.impl; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.sensitive.word.api.IWordContext; -import com.github.houbb.sensitive.word.api.IWordMap; +import com.github.houbb.sensitive.word.api.IWordData; import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext; import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; @@ -41,7 +41,7 @@ public class SensitiveCheckWord extends AbstractSensitiveCheck { // 采用 ThreadLocal 应该可以提升性能,减少对象的创建。 int actualLength = 0; - final IWordMap wordMap = context.wordMap(); + final IWordData wordData = context.wordData(); // 前一个条件 StringBuilder stringBuilder = new StringBuilder(); @@ -53,7 +53,7 @@ public class SensitiveCheckWord extends AbstractSensitiveCheck { stringBuilder.append(mappingChar); // 判断是否存在 - WordContainsTypeEnum wordContainsTypeEnum = wordMap.contains(stringBuilder, innerContext); + WordContainsTypeEnum wordContainsTypeEnum = wordData.contains(stringBuilder, innerContext); if(WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnum)) { actualLength = stringBuilder.length(); diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/AbstractWordData.java b/src/main/java/com/github/houbb/sensitive/word/support/data/AbstractWordData.java new file mode 100644 index 0000000..bc9acf1 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/AbstractWordData.java @@ -0,0 +1,49 @@ +package 
com.github.houbb.sensitive.word.support.data; + +import com.github.houbb.heaven.util.lang.StringUtil; +import com.github.houbb.heaven.util.util.CollectionUtil; +import com.github.houbb.sensitive.word.api.IWordData; +import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext; +import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; + +import java.util.Collection; + +/** + * 抽象数据 + * + * @since 0.7.0 + */ +public abstract class AbstractWordData implements IWordData { + + /** + * 是否包含 + * @param stringBuilder 字符 + * @param innerContext 上下文 + * @return 结果 + */ + protected abstract WordContainsTypeEnum doContains(StringBuilder stringBuilder, InnerSensitiveContext innerContext); + + /** + * 初始化 + * @param collection 数据 + */ + protected abstract void doInitWordData(Collection collection); + + @Override + public void initWordData(Collection collection) { + //1. 预留 + + this.doInitWordData(collection); + } + + @Override + public WordContainsTypeEnum contains(StringBuilder stringBuilder, InnerSensitiveContext innerContext) { + if(stringBuilder == null + || stringBuilder.length() <= 0) { + return WordContainsTypeEnum.NOT_FOUND; + } + + return doContains(stringBuilder, innerContext); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/WordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataHashMap.java similarity index 90% rename from src/main/java/com/github/houbb/sensitive/word/support/map/WordMap.java rename to src/main/java/com/github/houbb/sensitive/word/support/data/WordDataHashMap.java index fd4db3d..93fffbc 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/WordMap.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataHashMap.java @@ -1,10 +1,9 @@ -package com.github.houbb.sensitive.word.support.map; +package com.github.houbb.sensitive.word.support.data; import com.github.houbb.heaven.annotation.ThreadSafe; import 
com.github.houbb.heaven.util.lang.ObjectUtil; import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.sensitive.word.api.IWordContext; -import com.github.houbb.sensitive.word.api.IWordMap; import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext; import com.github.houbb.sensitive.word.constant.AppConst; import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; @@ -20,7 +19,7 @@ import java.util.Map; * @since 0.0.1 */ @ThreadSafe -public class WordMap implements IWordMap { +public class WordDataHashMap extends AbstractWordData { /** * 脱敏单词 map @@ -41,7 +40,7 @@ public class WordMap implements IWordMap { */ @Override @SuppressWarnings("unchecked") - public synchronized void initWordMap(Collection collection) { + public synchronized void doInitWordData(Collection collection) { // 避免扩容带来的消耗 Map newInnerWordMap = new HashMap(collection.size()); @@ -78,12 +77,10 @@ public class WordMap implements IWordMap { // 将新节点设置为当前节点,方便下一次节点的循环。 currentMap = newWordMap; } - - // 判断是否为最后一个,添加是否结束的标识。 - if (i == size - 1) { - currentMap.put(AppConst.IS_END, true); - } } + + // 判断是否为最后一个,添加是否结束的标识。 + currentMap.put(AppConst.IS_END, true); } // 最后更新为新的 map,保证更新过程中旧的数据可用 @@ -101,13 +98,8 @@ public class WordMap implements IWordMap { * @since 0.0.1 */ @Override - public WordContainsTypeEnum contains(final StringBuilder stringBuilder, + public WordContainsTypeEnum doContains(final StringBuilder stringBuilder, final InnerSensitiveContext innerContext) { - if (stringBuilder == null - || stringBuilder.length() <= 0) { - return WordContainsTypeEnum.NOT_FOUND; - } - return innerContainsSensitive(stringBuilder, innerContext); } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataTree.java b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataTree.java new file mode 100644 index 0000000..8d949b0 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataTree.java @@ -0,0 
+1,123 @@ +package com.github.houbb.sensitive.word.support.data; + +import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.heaven.util.lang.ObjectUtil; +import com.github.houbb.heaven.util.lang.StringUtil; +import com.github.houbb.sensitive.word.api.IWordContext; +import com.github.houbb.sensitive.word.api.IWordData; +import com.github.houbb.sensitive.word.api.context.InnerSensitiveContext; +import com.github.houbb.sensitive.word.constant.AppConst; +import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +/** + * 敏感词 map + * PR:https://github.com/houbb/sensitive-word/pull/33 + * + * @author xiaochangbai + * @author binbin.hou + * @since 0.7.0 + */ +@ThreadSafe +public class WordDataTree implements IWordData { + + /** + * 根节点 + */ + private WordDataTreeNode root; + + @Override + public synchronized void initWordData(Collection collection) { + WordDataTreeNode newRoot = new WordDataTreeNode(); + + for(String word : collection) { + if(StringUtil.isEmpty(word)) { + continue; + } + + WordDataTreeNode tempNode = newRoot; + char[] chars = word.toCharArray(); + for (char c : chars) { + // 获取子节点 + WordDataTreeNode subNode = tempNode.getSubNode(c); + if (subNode == null) { + subNode = new WordDataTreeNode(); + // 加入新的子节点 + tempNode.addSubNode(c, subNode); + } + + // 临时节点指向子节点,进入下一次循环 + tempNode = subNode; + } + + // 设置结束标识(循环结束,设置一次即可) + tempNode.end(true); + } + + // 初始化完成才做替换 + this.root = newRoot; + } + + @Override + public WordContainsTypeEnum contains(StringBuilder stringBuilder, + InnerSensitiveContext innerContext) { + WordDataTreeNode nowNode = root; + + int len = stringBuilder.length(); + + for(int i = 0; i < len; i++) { + // 获取当前的 map 信息 + nowNode = getNowMap(nowNode, i, stringBuilder, innerContext); + + // 如果不为空,则判断是否为结尾。 + if (ObjectUtil.isNull(nowNode)) { + return WordContainsTypeEnum.NOT_FOUND; + } + } + + if(nowNode.end()) { + return 
WordContainsTypeEnum.CONTAINS_END; + } + + return WordContainsTypeEnum.CONTAINS_PREFIX; + } + + + /** + * 获取当前的 Map + * @param nowNode 当前节点 + * @param index 下标 + * @param stringBuilder 文本缓存 + * @param sensitiveContext 上下文 + * @return 实际的当前 map + * @since 0.0.7 + */ + private WordDataTreeNode getNowMap(WordDataTreeNode nowNode, + final int index, + final StringBuilder stringBuilder, + final InnerSensitiveContext sensitiveContext) { + final IWordContext context = sensitiveContext.wordContext(); + + // 这里的 char 已经是统一格式化之后的,所以可以不用再次格式化。 + char mappingChar = stringBuilder.charAt(index); + + // 这里做一次重复词的处理 + WordDataTreeNode currentMap = nowNode.getSubNode(mappingChar); + // 启用忽略重复&当前下标不是第一个 + if(context.ignoreRepeat() + && index > 0) { + char preMappingChar = stringBuilder.charAt(index-1); + + // 直接赋值为上一个 map + if(preMappingChar == mappingChar) { + currentMap = nowNode; + } + } + + return currentMap; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataTreeNode.java b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataTreeNode.java new file mode 100644 index 0000000..350adce --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataTreeNode.java @@ -0,0 +1,49 @@ +package com.github.houbb.sensitive.word.support.data; + +import java.util.HashMap; +import java.util.Map; + +/** + * 树节点 + * + * @since 0.7.0 + */ +public class WordDataTreeNode { + + /** + * 关键词结束标识 + */ + private boolean end; + + /** + * 子节点(key是下级字符,value是下级节点) + */ + private Map subNodeMap; + + public boolean end() { + return end; + } + + public WordDataTreeNode end(boolean end) { + this.end = end; + return this; + } + + public WordDataTreeNode getSubNode(final char c) { + if(subNodeMap == null) { + return null; + } + + return subNodeMap.get(c); + } + + public WordDataTreeNode addSubNode(char c, WordDataTreeNode subNode) { + if(this.subNodeMap == null) { + subNodeMap = new HashMap<>(); + } + + subNodeMap.put(c, subNode); + 
return this; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/WordDatas.java b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDatas.java new file mode 100644 index 0000000..28333aa --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDatas.java @@ -0,0 +1,43 @@ +package com.github.houbb.sensitive.word.support.data; + +import com.github.houbb.sensitive.word.api.IWordData; + +/** + * 敏感词数据(IWordData)工厂 + * + * @author binbin.hou + * @since 0.3.0 + */ +public final class WordDatas { + + private WordDatas(){} + + /** + * 默认策略 + * @return 策略 + * @since 0.3.0 + */ + public static IWordData defaults() { + return tree(); + } + + /** + * 树模式 + * @return 树 + * @since 0.7.0 + */ + public static IWordData tree() { + return new WordDataTree(); + } + + /** + * HashMap 模式 + * @return HashMap 实现 + * @since 0.7.0 + */ + public static IWordData hashMap() { + return new WordDataHashMap(); + } + + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/deny/WordDenys.java b/src/main/java/com/github/houbb/sensitive/word/support/deny/WordDenys.java index e80be0e..dfcd524 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/deny/WordDenys.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/deny/WordDenys.java @@ -41,7 +41,7 @@ public final class WordDenys { * @return 结果 * @since 0.0.13 */ - public static IWordDeny system() { + public static IWordDeny defaults() { return WordDenySystem.getInstance(); } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java deleted file mode 100644 index b98c064..0000000 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java +++ /dev/null @@ -1,265 +0,0 @@ -//package com.github.houbb.sensitive.word.support.map; -// -//import 
com.github.houbb.heaven.util.guava.Guavas; -//import com.github.houbb.heaven.util.io.FileUtil; -//import com.github.houbb.heaven.util.lang.ObjectUtil; -//import com.github.houbb.heaven.util.lang.StringUtil; -//import com.github.houbb.heaven.util.util.CollectionUtil; -//import com.github.houbb.sensitive.word.api.*; -//import com.github.houbb.sensitive.word.constant.AppConst; -//import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum; -//import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult; -//import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl; -//import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext; -//import com.github.houbb.sensitive.word.support.result.WordResult; -// -//import java.util.Collection; -//import java.util.HashMap; -//import java.util.List; -//import java.util.Map; -// -///** -// * 敏感词 map -// * -// * @author binbin.hou -// * @since 0.0.1 -// */ -//@ThreadSafe -//public class SensitiveWordMap implements IWordMap { -// -// /** -// * 脱敏单词 map -// * -// * @since 0.0.1 -// */ -// private Map innerWordMap; -// -// /** -// * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型: -// * -// * @param collection 敏感词库集合 -// * @since 0.0.1 -// *

-// * 使用对象代码 map 的这种一直递归。 -// * 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html -// * https://blog.csdn.net/chenssy/article/details/26961957 -// */ -// @Override -// @SuppressWarnings("unchecked") -// public synchronized void initWordMap(Collection collection) { -// // 避免扩容带来的消耗 -// Map newInnerWordMap = new HashMap(collection.size()); -// -// for (String key : collection) { -// if (StringUtil.isEmpty(key)) { -// continue; -// } -// -// // 用来按照相应的格式保存敏感词库数据 -// char[] chars = key.toCharArray(); -// final int size = chars.length; -// -// // 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中 -// Map currentMap = newInnerWordMap; -// -// for (int i = 0; i < size; i++) { -// // 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值 -// char charKey = chars[i]; -// // 如果集合存在 -// Object wordMap = currentMap.get(charKey); -// -// // 如果集合存在 -// if (ObjectUtil.isNotNull(wordMap)) { -// // 直接将获取到的 map 当前当前 map 进行继续的操作 -// currentMap = (Map) wordMap; -// } else { -// //不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一 -// Map newWordMap = new HashMap<>(8); -// newWordMap.put(AppConst.IS_END, false); -// -// // 将新的节点放入当前 map 中 -// currentMap.put(charKey, newWordMap); -// -// // 将新节点设置为当前节点,方便下一次节点的循环。 -// currentMap = newWordMap; -// } -// -// // 判断是否为最后一个,添加是否结束的标识。 -// if (i == size - 1) { -// currentMap.put(AppConst.IS_END, true); -// } -// } -// } -// -// // 最后更新为新的 map,保证更新过程中旧的数据可用 -// this.innerWordMap = newInnerWordMap; -// } -// -// /** -// * 是否包含 -// * (1)直接遍历所有 -// * (2)如果遇到,则直接返回 true -// * -// * @param string 字符串 -// * @return 是否包含 -// * @since 0.0.1 -// */ -// @Override -// public boolean contains(String string, final IWordContext context) { -// if (StringUtil.isEmpty(string)) { -// return false; -// } -// -// for (int i = 0; i < string.length(); i++) { -// SensitiveCheckResult checkResult = sensitiveCheck(string, i, ValidModeEnum.FAIL_FAST, context); -// // 快速返回 -// if (checkResult.index() > 0) { -// return true; -// } -// } -// return false; -// } -// -// /** -// * 返回所有对应的敏感词 -// * (1)结果是有序的 -// * 
(2)为了保留所有的下标,结果从 v0.1.0 之后不再去重。 -// * -// * @param string 原始字符串 -// * @return 结果 -// * @since 0.0.1 -// */ -// @Override -// public List findAll(String string, final IWordContext context) { -// return getSensitiveWords(string, ValidModeEnum.FAIL_OVER, context); -// } -// -// @Override -// public IWordResult findFirst(String string, final IWordContext context) { -// List stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST, context); -// -// if (CollectionUtil.isEmpty(stringList)) { -// return null; -// } -// -// return stringList.get(0); -// } -// -// @Override -// public String replace(String target, final IWordContext context) { -// if(StringUtil.isEmpty(target)) { -// return target; -// } -// -// return this.replaceSensitiveWord(target, context); -// } -// -// /** -// * 获取敏感词列表 -// * -// * @param text 文本 -// * @param modeEnum 模式 -// * @return 结果列表 -// * @since 0.0.1 -// */ -// private List getSensitiveWords(final String text, final ValidModeEnum modeEnum, -// final IWordContext context) { -// //1. 
是否存在敏感词,如果比存在,直接返回空列表 -// if (StringUtil.isEmpty(text)) { -// return Guavas.newArrayList(); -// } -// -// List resultList = Guavas.newArrayList(); -// for (int i = 0; i < text.length(); i++) { -// SensitiveCheckResult checkResult = sensitiveCheck(text, i, ValidModeEnum.FAIL_OVER, context); -// // 命中 -// int wordLength = checkResult.index(); -// if (wordLength > 0) { -// // 保存敏感词 -// String sensitiveWord = text.substring(i, i + wordLength); -// -// // 添加去重 -// WordResult wordResult = WordResult.newInstance() -// .startIndex(i) -// .endIndex(i+wordLength) -// .word(sensitiveWord); -// resultList.add(wordResult); -// -// // 快速返回 -// if (ValidModeEnum.FAIL_FAST.equals(modeEnum)) { -// break; -// } -// -// // 增加 i 的步长 -// // 为什么要-1,因为默认就会自增1 -// // TODO: 这里可以根据字符串匹配算法优化。 -// i += wordLength - 1; -// } -// } -// -// return resultList; -// } -// -// /** -// * 直接替换敏感词,返回替换后的结果 -// * @param target 文本信息 -// * @param context 上下文 -// * @return 脱敏后的字符串 -// * @since 0.0.2 -// */ -// private String replaceSensitiveWord(final String target, -// final IWordContext context) { -// if(StringUtil.isEmpty(target)) { -// return target; -// } -// // 用于结果构建 -// StringBuilder resultBuilder = new StringBuilder(target.length()); -// -// for (int i = 0; i < target.length(); i++) { -// char currentChar = target.charAt(i); -// // 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词 -// SensitiveCheckResult checkResult = sensitiveCheck(target, i, ValidModeEnum.FAIL_OVER, context); -// -// // 敏感词 -// int wordLength = checkResult.index(); -// if(wordLength > 0) { -// // 是否执行替换 -// Class checkClass = checkResult.checkClass(); -// String string = target.substring(i, i+wordLength); -// if(SensitiveCheckUrl.class.equals(checkClass) -// && FileUtil.isImage(string)) { -// // 直接使用原始内容,避免 markdown 图片转换失败 -// resultBuilder.append(string); -// } else { -// // 创建上下文 -// ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance() -// .sensitiveWord(string) -// .wordLength(wordLength); -// String 
replaceStr = context.sensitiveWordReplace().replace(replaceContext); -// -// resultBuilder.append(replaceStr); -// } -// -// // 直接跳过敏感词的长度 -// i += wordLength-1; -// } else { -// // 普通词 -// resultBuilder.append(currentChar); -// } -// } -// -// return resultBuilder.toString(); -// } -// -// @Override -// public SensitiveCheckResult sensitiveCheck(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) { -// // 默认执行敏感词操作 -// context.sensitiveWordMap(innerWordMap); -// -// // 责任链模式调用 -// return context.sensitiveCheck() -// .sensitiveCheck(txt, beginIndex, validModeEnum, context); -// } -// -//} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/map/WordMaps.java b/src/main/java/com/github/houbb/sensitive/word/support/map/WordMaps.java deleted file mode 100644 index f759bc2..0000000 --- a/src/main/java/com/github/houbb/sensitive/word/support/map/WordMaps.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.github.houbb.sensitive.word.support.map; - -import com.github.houbb.sensitive.word.api.IWordMap; - -/** - * 敏感词 map - * - * @author binbin.hou - * @since 0.3.0 - */ -public final class WordMaps { - - private WordMaps(){} - - /** - * 默认策略 - * @return 策略 - * @since 0.3.0 - */ - public static IWordMap defaults() { - return new WordMap(); - } - -} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaces.java b/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaces.java index 552547e..a2f8034 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaces.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/replace/SensitiveWordReplaces.java @@ -31,4 +31,13 @@ public final class SensitiveWordReplaces { return new SensitiveWordReplaceChar(); } + /** + * 字符,默认为 * + * @return 结果 + * @since 0.7.0 + */ + public static ISensitiveWordReplace defaults() { + return chars(); + } + } diff --git 
a/src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkTimesTest.java b/src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkTimesTest.java index c0aa035..d820e3c 100644 --- a/src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkTimesTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkTimesTest.java @@ -13,6 +13,7 @@ public class BenchmarkTimesTest { * 测试基准:100+字符串 * 10W次 * * V0.6.0: 1470ms,接近 7.2W QPS + * V0.7.0: 1380ms */ @Test public void onlyWordAndNoReplaceTest() { @@ -45,6 +46,7 @@ public class BenchmarkTimesTest { * 测试基准:100+字符串 * 10W次 * * V0.6.0: 2744ms, 约 3.7W QPS + * V0.7.0: 2723ms */ @Test public void onlyWordAndWithReplaceTest() { diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsDataTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsDataTest.java new file mode 100644 index 0000000..104d84b --- /dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsDataTest.java @@ -0,0 +1,27 @@ +package com.github.houbb.sensitive.word.bs; + +import com.github.houbb.sensitive.word.support.data.WordDatas; +import org.junit.Assert; +import org.junit.Test; + +/** + *

project: sensitive-word-SensitiveWordBsConfigTest

+ *

create on 2020/1/7 23:43

+ * + * @author Administrator + * @since 0.7.0 + */ +public class SensitiveWordBsDataTest { + + @Test + public void wordDataConfigTest() { + SensitiveWordBs wordBs = SensitiveWordBs.newInstance() + .wordData(WordDatas.tree()) + .init(); + + final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; + Assert.assertTrue(wordBs.contains(text)); + Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordBs.findAll(text).toString()); + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java index 979e25d..72f0a03 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java @@ -106,8 +106,8 @@ public class SensitiveWordBsTest { @Test public void configTest() { SensitiveWordBs wordBs = SensitiveWordBs.newInstance() - .wordDeny(WordDenys.system()) - .wordAllow(WordAllows.system()) + .wordDeny(WordDenys.defaults()) + .wordAllow(WordAllows.defaults()) .init(); final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; diff --git a/src/test/java/com/github/houbb/sensitive/word/bugs/b20211211/MySensitiveTest.java b/src/test/java/com/github/houbb/sensitive/word/bugs/b20211211/MySensitiveTest.java index 1cb2144..69a75cb 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bugs/b20211211/MySensitiveTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bugs/b20211211/MySensitiveTest.java @@ -12,8 +12,8 @@ public class MySensitiveTest { @Test public void test() { - IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDeny()); - IWordAllow wordAllow = WordAllows.chains(WordAllows.system(), new MyWordAllow()); + IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny()); + IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new MyWordAllow()); SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() .wordAllow(wordAllow) 
.wordDeny(wordDeny)// 各种其他配置 diff --git a/src/test/java/com/github/houbb/sensitive/word/bugs/b32/MyWordDenyChineseTest.java b/src/test/java/com/github/houbb/sensitive/word/bugs/b32/MyWordDenyChineseTest.java index 282b4da..bd4c7b9 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bugs/b32/MyWordDenyChineseTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bugs/b32/MyWordDenyChineseTest.java @@ -10,7 +10,7 @@ public class MyWordDenyChineseTest { @Test public void test() { - IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDenyChineseNum()); + IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDenyChineseNum()); SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() .wordDeny(wordDeny)// 各种其他配置 .init();// init() 初始化敏感词字典 diff --git a/src/test/java/com/github/houbb/sensitive/word/define/SensitiveWordBsDefineTest.java b/src/test/java/com/github/houbb/sensitive/word/define/SensitiveWordBsDefineTest.java index 63c982c..88848d3 100644 --- a/src/test/java/com/github/houbb/sensitive/word/define/SensitiveWordBsDefineTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/define/SensitiveWordBsDefineTest.java @@ -30,8 +30,8 @@ public class SensitiveWordBsDefineTest { public void defineChainsTest() { String text = "这是一个测试。我的自定义敏感词。"; - IWordDeny wordDeny = WordDenys.chains(WordDenys.system(), new MyWordDeny()); - IWordAllow wordAllow = WordAllows.chains(WordAllows.system(), new MyWordAllow()); + IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny()); + IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new MyWordAllow()); SensitiveWordBs wordBs = SensitiveWordBs.newInstance() .wordDeny(wordDeny) diff --git a/src/test/java/com/github/houbb/sensitive/word/memory/DataMemoryTest.java b/src/test/java/com/github/houbb/sensitive/word/memory/DataMemoryTest.java new file mode 100644 index 0000000..6e30cae --- /dev/null +++ 
b/src/test/java/com/github/houbb/sensitive/word/memory/DataMemoryTest.java @@ -0,0 +1,49 @@ +package com.github.houbb.sensitive.word.memory; + +import com.github.houbb.heaven.util.io.StreamUtil; +import com.github.houbb.sensitive.word.api.IWordData; +import com.github.houbb.sensitive.word.support.data.WordDatas; +import org.apache.lucene.util.RamUsageEstimator; +import org.junit.Ignore; +import org.junit.Test; + +import java.util.List; + +/** + * 数据内存测试 + * + * @since 0.7.0 + */ +@Ignore +public class DataMemoryTest { + + /** + * 35.5 MB + */ + @Test + public void hashMapTest() { + List allLines = StreamUtil.readAllLines("/dict.txt"); + IWordData wordData = WordDatas.defaults(); + + wordData.initWordData(allLines); + + //计算指定对象及其引用树上的所有对象的综合大小,返回可读的结果,如:2KB + String humanSize = RamUsageEstimator.humanSizeOf(wordData); + System.out.println(humanSize); + } + + + //33.4 MB + @Test + public void treeTest() { + List allLines = StreamUtil.readAllLines("/dict.txt"); + IWordData wordData = WordDatas.tree(); + + wordData.initWordData(allLines); + + //计算指定对象及其引用树上的所有对象的综合大小,返回可读的结果,如:2KB + String humanSize = RamUsageEstimator.humanSizeOf(wordData); + System.out.println(humanSize); + } + +} diff --git a/src/test/java/com/github/houbb/sensitive/word/spring/SpringSensitiveWordConfig.java b/src/test/java/com/github/houbb/sensitive/word/spring/SpringSensitiveWordConfig.java index 85ca8e4..af7b545 100644 --- a/src/test/java/com/github/houbb/sensitive/word/spring/SpringSensitiveWordConfig.java +++ b/src/test/java/com/github/houbb/sensitive/word/spring/SpringSensitiveWordConfig.java @@ -29,7 +29,7 @@ public class SpringSensitiveWordConfig { @Bean public SensitiveWordBs sensitiveWordBs() { SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() - .wordAllow(WordAllows.chains(WordAllows.system(), myDdWordAllow)) + .wordAllow(WordAllows.chains(WordAllows.defaults(), myDdWordAllow)) .wordDeny(myDdWordDeny) // 各种其他配置 .init();