diff --git a/README.md b/README.md index 3903c95..d65c2fe 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ com.github.houbb sensitive-word - 0.0.6 + 0.0.7 ``` @@ -169,9 +169,16 @@ List wordList = SensitiveWordBs.newInstance().findAll(text); Assert.assertEquals("[Ⓕⓤc⒦]", wordList.toString()); ``` -# 后期 road-map +## 忽略重复词 -- 重复词 +```java +final String text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words"; + +List wordList = SensitiveWordBs.newInstance().findAll(text); +Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString()); +``` + +# 后期 road-map - 停顿词 diff --git a/doc/CHANGE_LOG.md b/doc/CHANGE_LOG.md index 1dbe0e9..34e900c 100644 --- a/doc/CHANGE_LOG.md +++ b/doc/CHANGE_LOG.md @@ -55,4 +55,10 @@ | 1 | A | 添加中文繁简体转换支持 | 2020-1-10 09:34:35 | | | 2 | A | 添加英文常见写法转换支持 | 2020-1-10 09:34:35 | | | 3 | A | 新增敏感词 `艹` | 2020-1-10 09:34:35 | | -| 4 | D | 移除单个词 `k买仆办功务动区卖台吨天房本歌滚灾独证踢弓` | 2020-1-10 09:34:35 | | \ No newline at end of file +| 4 | D | 移除单个词 `k买仆办功务动区卖台吨天房本歌滚灾独证踢弓` | 2020-1-10 09:34:35 | | + +# release_0.0.7 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:---|:---|:---|:--| +| 1 | A | 添加忽略重复词支持 | 2020-1-10 09:34:35 | | \ No newline at end of file diff --git a/doc/issues/roadmap/v007-重复词的处理.md b/doc/issues/roadmap/v007-重复词的处理.md index c1f2c6c..db7d9f0 100644 --- a/doc/issues/roadmap/v007-重复词的处理.md +++ b/doc/issues/roadmap/v007-重复词的处理.md @@ -4,4 +4,10 @@ ffffuuuuccckkk f xxx -x 如果和上一个字符一样,则直接忽略。 \ No newline at end of file +x 如果和上一个字符一样,则直接忽略。 + +# 细节 + +当开启的时候,如果在敏感词获取的时候,如果下一个字没有找到,则进行去重。 + +即如果当前字符和上一个字符完全一样,则直接跳过。(仅仅在没有普匹配的场景下) \ No newline at end of file diff --git a/doc/issues/roadmap/v012-停顿词的处理.md b/doc/issues/roadmap/v012-停顿词的处理.md index b4bc7ae..c215dcd 100644 --- a/doc/issues/roadmap/v012-停顿词的处理.md +++ b/doc/issues/roadmap/v012-停顿词的处理.md @@ -1,3 +1,9 @@ +# 标点符号 + +无论中文英文数字,其中特殊符号一定是停顿词。 + +可以这么粗俗的认为。 + # 英文 核心是英文停顿词。 diff --git a/pom.xml b/pom.xml index c2ec8d8..afad7c1 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.0.6 + 0.0.7 diff 
--git a/release.bat b/release.bat index 6209cd6..2f3a21d 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.0.6 +SET version=0.0.7 :::: 新版本名称 -SET newVersion=0.0.7 +SET newVersion=0.0.8 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index 7b7bd5c..29ff5ce 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -29,7 +29,6 @@ public interface IWordContext { */ boolean ignoreNumStyle(); - /** * 设置是否忽略大小写 * @param ignoreCase 是否忽略大小写 @@ -64,12 +63,11 @@ public interface IWordContext { /** * 设置是否忽略中文繁简体格式 * @param ignoreChineseStyle 是否忽略 - * @return 是否 + * @return this * @since 0.0.6 */ IWordContext ignoreChineseStyle(final boolean ignoreChineseStyle); - /** * 获取敏感词信息 * @return 敏感词 @@ -95,7 +93,7 @@ public interface IWordContext { /** * 设置敏感数字检测 * @param sensitiveNumCheck 数字格式检测 - * @return 数字检测 + * @return this * @since 0.0.5 */ IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck); @@ -110,9 +108,24 @@ public interface IWordContext { /** * 设置忽略英文的写法 * @param ignoreEnglishStyle 是否忽略 - * @return 数字检测 + * @return this * @since 0.0.6 */ IWordContext ignoreEnglishStyle(final boolean ignoreEnglishStyle); + /** + * 忽略重复词 + * @return 是否忽略 + * @since 0.0.7 + */ + boolean ignoreRepeat(); + + /** + * 设置忽略重复词 + * @param ignoreRepeat 是否忽略 + * @return this + * @since 0.0.7 + */ + IWordContext ignoreRepeat(final boolean ignoreRepeat); + } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 1669604..da2fa86 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ 
b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -85,6 +85,7 @@ public class SensitiveWordBs { wordContext.ignoreNumStyle(true); wordContext.ignoreChineseStyle(true); wordContext.ignoreEnglishStyle(true); + wordContext.ignoreRepeat(true); // 开启校验 wordContext.sensitiveNumCheck(true); diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index 232709b..9cdf438 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -53,6 +53,12 @@ public class SensitiveWordContext implements IWordContext { */ private boolean ignoreEnglishStyle; + /** + * 忽略重复词 + * @since 0.0.7 + */ + private boolean ignoreRepeat; + /** * 私有化构造器 * @since 0.0.4 @@ -147,16 +153,14 @@ public class SensitiveWordContext implements IWordContext { } @Override - public String toString() { - return "SensitiveWordContext{" + - "ignoreCase=" + ignoreCase + - ", ignoreWidth=" + ignoreWidth + - ", ignoreNumStyle=" + ignoreNumStyle + - ", sensitiveWordMap=" + sensitiveWordMap + - ", sensitiveNumCheck=" + sensitiveNumCheck + - ", ignoreChineseStyle=" + ignoreChineseStyle + - ", ignoreEnglishStyle=" + ignoreEnglishStyle + - '}'; + public boolean ignoreRepeat() { + return ignoreRepeat; + } + + @Override + public SensitiveWordContext ignoreRepeat(boolean ignoreRepeat) { + this.ignoreRepeat = ignoreRepeat; + return this; } } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java index 1a1d48c..b1f27d5 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/SensitiveWordCheck.java @@ -2,7 +2,6 @@ package 
com.github.houbb.sensitive.word.support.check; import com.github.houbb.heaven.annotation.ThreadSafe; import com.github.houbb.heaven.support.instance.impl.Instances; -import com.github.houbb.heaven.util.lang.CharUtil; import com.github.houbb.heaven.util.lang.ObjectUtil; import com.github.houbb.sensitive.word.api.ISensitiveCheck; import com.github.houbb.sensitive.word.api.IWordContext; @@ -29,12 +28,9 @@ public class SensitiveWordCheck implements ISensitiveCheck { int actualLength = 0; for (int i = beginIndex; i < txt.length(); i++) { - char c = txt.charAt(i); - char charKey = Instances.singleton(CharFormatChain.class).format(c, context); + // 获取当前的 map 信息 + nowMap = getNowMap(nowMap, context, txt, i); - // 判断该字是否存在于敏感词库中 - // 并且将 nowMap 替换为新的 map,进入下一层的循环。 - nowMap = (Map) nowMap.get(charKey); if (ObjectUtil.isNotNull(nowMap)) { lengthCount++; @@ -60,4 +56,38 @@ public class SensitiveWordCheck implements ISensitiveCheck { return actualLength; } + /** + * Resolves the map entry to use for the character at {@code index}. + * @param nowMap map reached before this character + * @param context word context supplying the char format and ignore-repeat settings + * @param txt text being scanned + * @param index index of the character to look up + * @return the entry found for the character; {@code nowMap} itself when a repeated character is skipped; otherwise {@code null} + * @since 0.0.7 + */ + private Map getNowMap(Map nowMap, + final IWordContext context, + final String txt, + final int index) { + char c = txt.charAt(index); + char mappingChar = Instances.singleton(CharFormatChain.class).format(c, context); + + // A real entry always wins, so words that contain a doubled character can still advance. + Map currentMap = (Map) nowMap.get(mappingChar); + // Fall back only on a miss, with ignore-repeat enabled and a previous character available. + if(currentMap == null && context.ignoreRepeat() + && index > 0) { + char preChar = txt.charAt(index-1); + char preMappingChar = Instances.singleton(CharFormatChain.class) + .format(preChar, context); + + // Same formatted character as the previous one: skip it by staying on the current map. + if(preMappingChar == mappingChar) { + currentMap = nowMap; + } + } + + return currentMap; + } + } diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsRepeatTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsRepeatTest.java new file mode 100644 index 0000000..16e20df --- /dev/null +++ 
b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsRepeatTest.java @@ -0,0 +1,29 @@ +package com.github.houbb.sensitive.word.bs; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

<p> project: sensitive-word-SensitiveWordBsTest </p>
+ * <p> create on 2020/1/7 23:43 </p>

+ * + * @author Administrator + * @since 0.0.7 + */ +public class SensitiveWordBsRepeatTest { + + /** + * Checks that findAll still reports a sensitive word written with repeated characters. NOTE(review): the method name says ChineseStyle but the input exercises repeated characters; confirm and rename. + * @since 0.0.7 + */ + @Test + public void ignoreChineseStyleTest() { + final String text = "ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦ the bad words"; + + List wordList = SensitiveWordBs.newInstance().findAll(text); + Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString()); + } + +}