From 56df8bd6484a0f5d2332f716162eea31abff4f68 Mon Sep 17 00:00:00 2001 From: houbb Date: Sat, 9 Dec 2023 00:38:10 +0800 Subject: [PATCH] release branch 0.11.0 --- CHANGE_LOG.md | 6 +++ README.md | 39 ++++++++++++++++++- pom.xml | 4 +- release.bat | 4 +- .../word/api/ISensitiveWordCharIgnore.java | 22 +++++++++++ .../sensitive/word/api/IWordContext.java | 4 ++ .../sensitive/word/bs/SensitiveWordBs.java | 15 +++++++ .../word/bs/SensitiveWordContext.java | 14 +++++++ .../word/support/check/WordCheckWord.java | 13 ++++++- .../AbstractSensitiveWordCharIgnore.java | 19 +++++++++ .../ignore/NoneSensitiveWordCharIgnore.java | 16 ++++++++ .../ignore/SensitiveWordCharIgnores.java | 22 +++++++++++ .../SpecialCharSensitiveWordCharIgnore.java | 28 +++++++++++++ src/main/resources/dict.txt | 5 +++ .../word/bs/SensitiveWordBsEmailTest.java | 7 ++++ .../bs/SensitiveWordBsIgnoreCharTest.java | 39 +++++++++++++++++++ 16 files changed, 251 insertions(+), 6 deletions(-) create mode 100644 src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordCharIgnore.java create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/ignore/AbstractSensitiveWordCharIgnore.java create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/ignore/NoneSensitiveWordCharIgnore.java create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/ignore/SensitiveWordCharIgnores.java create mode 100644 src/main/java/com/github/houbb/sensitive/word/support/ignore/SpecialCharSensitiveWordCharIgnore.java create mode 100644 src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsIgnoreCharTest.java diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index cded43a..467fc53 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -212,3 +212,9 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:-----|------------------|:--------------------|:------| | 1 | A | 添加脏词的标签接口,便于后续拓展 | 2023-12-05 23:51:58 | | + +# release_0.11.0 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | 
+|:---|:-----|----------------------|:--------------------|:------| +| 1 | A | 添加忽略字符接口,便于跳过一些干扰的字符 | 2023-12-08 23:51:58 | | \ No newline at end of file diff --git a/README.md b/README.md index f1b6579..491cbc9 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,8 @@ - [支持敏感词的标签接口](https://github.com/houbb/sensitive-word#%E6%95%8F%E6%84%9F%E8%AF%8D%E6%A0%87%E7%AD%BE) +- [支持跳过一些特殊字符,让匹配更灵活](https://github.com/houbb/sensitive-word#%E5%BF%BD%E7%95%A5%E5%AD%97%E7%AC%A6) + ## 变更日志 [CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/CHANGE_LOG.md) @@ -60,7 +62,7 @@ com.github.houbb sensitive-word - 0.10.0 + 0.11.0 ``` @@ -390,6 +392,41 @@ Assert.assertTrue(wordBs.contains(text)); | 10 | enableWordCheck | 是否启用敏感单词检测 | true | | 11 | numCheckLen | 数字检测,自定义指定长度。 | 8 | + +# 忽略字符 + +## 说明 + +我们的敏感词一般都是比较连续的,比如【傻帽】 + +那就有大聪明发现,可以在中间加一些字符,比如【傻!@#$帽】跳过检测,但是骂人等攻击力不减。 + +那么,如何应对这些类似的场景呢? + +我们可以指定特殊字符的跳过集合,忽略掉这些无意义的字符即可。 + +v0.11.0 开始支持 + +## 例子 + +其中 charIgnore 对应的字符策略,用户可以自行灵活定义。 + +```java +final String text = "傻@冒,狗+东西"; + +//默认因为有特殊字符分割,无法识别 +List wordList = SensitiveWordBs.newInstance().init().findAll(text); +Assert.assertEquals("[]", wordList.toString()); + +// 指定忽略的字符策略,可自行实现。 +List wordList2 = SensitiveWordBs.newInstance() + .charIgnore(SensitiveWordCharIgnores.specialChars()) + .init() + .findAll(text); + +Assert.assertEquals("[傻@冒, 狗+东西]", wordList2.toString()); +``` + # 敏感词标签 ## 说明 diff --git a/pom.xml b/pom.xml index d324ece..afa3eea 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.github.houbb sensitive-word - 0.10.0 + 0.11.0 @@ -25,7 +25,7 @@ 1.7 - 0.2.7 + 0.6.0 1.8.1 diff --git a/release.bat b/release.bat index 3373766..0460ad3 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." 
:: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.10.0 +SET version=0.11.0 :::: 新版本名称 -SET newVersion=0.11.0 +SET newVersion=0.12.0 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordCharIgnore.java b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordCharIgnore.java new file mode 100644 index 0000000..1d8cc4d --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWordCharIgnore.java @@ -0,0 +1,22 @@ +package com.github.houbb.sensitive.word.api; + +import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; + +/** + * Strategy that decides whether a single character should be ignored (skipped) during word matching. + * @since 0.11.0 + */ +public interface ISensitiveWordCharIgnore { + + /** + * Returns whether the character at the given index should be ignored. + * @param ix index of the current character within {@code chars} + * @param chars characters of the text being examined + * @param innerContext inner sensitive-word context + * @return {@code true} if the character should be ignored, {@code false} otherwise + */ + boolean ignore(final int ix, + final char[] chars, + InnerSensitiveWordContext innerContext); + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index 44315cf..1fda0ba 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -237,4 +237,8 @@ public interface IWordContext { SensitiveWordContext wordTag(IWordTag wordTag); + ISensitiveWordCharIgnore charIgnore(); + + SensitiveWordContext charIgnore(ISensitiveWordCharIgnore charIgnore); + } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index cf20c9c..467078b 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -15,6 +15,7 @@ import com.github.houbb.sensitive.word.support.combine.check.WordCheckCombines; import 
com.github.houbb.sensitive.word.support.combine.format.WordFormatCombines; import com.github.houbb.sensitive.word.support.data.WordDatas; import com.github.houbb.sensitive.word.support.deny.WordDenys; +import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores; import com.github.houbb.sensitive.word.support.replace.WordReplaces; import com.github.houbb.sensitive.word.support.result.WordResultHandlers; import com.github.houbb.sensitive.word.support.tag.WordTags; @@ -156,6 +157,12 @@ public class SensitiveWordBs { */ private IWordTag wordTag = WordTags.none(); + /** + * 忽略的字符策略 + * @since 0.11.0 + */ + private ISensitiveWordCharIgnore charIgnore = SensitiveWordCharIgnores.defaults(); + /** * 新建验证实例 *

@@ -225,10 +232,18 @@ public class SensitiveWordBs { context.wordReplace(wordReplace); context.wordData(wordData); context.wordTag(wordTag); + context.charIgnore(charIgnore); return context; } + public SensitiveWordBs charIgnore(ISensitiveWordCharIgnore charIgnore) { + ArgUtil.notNull(charIgnore, "charIgnore"); + + this.charIgnore = charIgnore; + return this; + } + public SensitiveWordBs wordTag(IWordTag wordTag) { ArgUtil.notNull(wordTag, "wordTag"); diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index a42730c..6167b5e 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -107,6 +107,12 @@ public class SensitiveWordContext implements IWordContext { */ private IWordTag wordTag; + /** + * 忽略的字符 + * @since 0.11.0 + */ + private ISensitiveWordCharIgnore charIgnore; + public IWordData wordData() { return wordData; } @@ -290,4 +296,12 @@ public class SensitiveWordContext implements IWordContext { return this; } + public ISensitiveWordCharIgnore charIgnore() { + return charIgnore; + } + + public SensitiveWordContext charIgnore(ISensitiveWordCharIgnore charIgnore) { + this.charIgnore = charIgnore; + return this; + } } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java b/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java index e3b4b4e..b19a14c 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java @@ -1,6 +1,7 @@ package com.github.houbb.sensitive.word.support.check; import com.github.houbb.heaven.annotation.ThreadSafe; +import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore; import com.github.houbb.sensitive.word.api.IWordCheck; import 
com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordData; @@ -46,16 +47,26 @@ public class WordCheckWord extends AbstractWordCheck { // 前一个条件 StringBuilder stringBuilder = new StringBuilder(); char[] rawChars = txt.toCharArray(); + + final ISensitiveWordCharIgnore wordCharIgnore = context.charIgnore(); + int tempLen = 0; for(int i = beginIndex; i < rawChars.length; i++) { + // Skip characters the ignore strategy rejects; they still count toward the matched length + if(wordCharIgnore.ignore(i, rawChars, innerContext)) { + tempLen++; + continue; + } + // 映射处理 final char currentChar = rawChars[i]; char mappingChar = formatCharMapping.get(currentChar); stringBuilder.append(mappingChar); + tempLen++; // 判断是否存在 WordContainsTypeEnum wordContainsTypeEnum = wordData.contains(stringBuilder, innerContext); if(WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnum)) { - actualLength = stringBuilder.length(); + actualLength = tempLen; // 是否遍历全部匹配的模式 if(WordValidModeEnum.FAIL_FAST.equals(wordValidModeEnum)) { diff --git a/src/main/java/com/github/houbb/sensitive/word/support/ignore/AbstractSensitiveWordCharIgnore.java b/src/main/java/com/github/houbb/sensitive/word/support/ignore/AbstractSensitiveWordCharIgnore.java new file mode 100644 index 0000000..880dc44 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/ignore/AbstractSensitiveWordCharIgnore.java @@ -0,0 +1,19 @@ +package com.github.houbb.sensitive.word.support.ignore; + +import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore; +import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; + +/** + * Abstract base implementation: {@code ignore} delegates directly to the subclass hook {@code doIgnore}. + * @since 0.11.0 + */ +public abstract class AbstractSensitiveWordCharIgnore implements ISensitiveWordCharIgnore { + + protected abstract boolean doIgnore(int ix, char[] chars, InnerSensitiveWordContext innerContext); + + @Override + public boolean ignore(int ix, char[] chars, InnerSensitiveWordContext innerContext) { + return doIgnore(ix, chars, innerContext); + } + +} diff --git 
a/src/main/java/com/github/houbb/sensitive/word/support/ignore/NoneSensitiveWordCharIgnore.java b/src/main/java/com/github/houbb/sensitive/word/support/ignore/NoneSensitiveWordCharIgnore.java new file mode 100644 index 0000000..631ccc2 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/ignore/NoneSensitiveWordCharIgnore.java @@ -0,0 +1,16 @@ +package com.github.houbb.sensitive.word.support.ignore; + +import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; + +/** + * No-op strategy: never ignores any character ({@code doIgnore} always returns {@code false}). + * @since 0.11.0 + */ +public class NoneSensitiveWordCharIgnore extends AbstractSensitiveWordCharIgnore { + + @Override + protected boolean doIgnore(int ix, char[] chars, InnerSensitiveWordContext innerContext) { + return false; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/ignore/SensitiveWordCharIgnores.java b/src/main/java/com/github/houbb/sensitive/word/support/ignore/SensitiveWordCharIgnores.java new file mode 100644 index 0000000..1b07a58 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/support/ignore/SensitiveWordCharIgnores.java @@ -0,0 +1,22 @@ +package com.github.houbb.sensitive.word.support.ignore; + +import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore; + +/** + * @since 0.11.0 + */ +public class SensitiveWordCharIgnores { + + public static ISensitiveWordCharIgnore specialChars() { + return new SpecialCharSensitiveWordCharIgnore(); + } + + public static ISensitiveWordCharIgnore none() { + return new NoneSensitiveWordCharIgnore(); + } + + public static ISensitiveWordCharIgnore defaults() { + return none(); + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/support/ignore/SpecialCharSensitiveWordCharIgnore.java b/src/main/java/com/github/houbb/sensitive/word/support/ignore/SpecialCharSensitiveWordCharIgnore.java new file mode 100644 index 0000000..cac8605 --- /dev/null +++ 
b/src/main/java/com/github/houbb/sensitive/word/support/ignore/SpecialCharSensitiveWordCharIgnore.java @@ -0,0 +1,28 @@ +package com.github.houbb.sensitive.word.support.ignore; + +import com.github.houbb.heaven.util.lang.StringUtil; +import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; + +import java.util.Set; + +/** + * Ignores every character that appears in the {@code SPECIAL} punctuation/symbol list. + * @since 0.11.0 + */ +public class SpecialCharSensitiveWordCharIgnore extends AbstractSensitiveWordCharIgnore { + + private static final String SPECIAL = "`-=~!@#$%^&*()_+[]{}\\|;:'\",./<>?"; + + private static final Set SET; + + static { + SET = StringUtil.toCharSet(SPECIAL); + } + + @Override + protected boolean doIgnore(int ix, char[] chars, InnerSensitiveWordContext innerContext) { + char c = chars[ix]; + return SET.contains(c); + } + +} diff --git a/src/main/resources/dict.txt b/src/main/resources/dict.txt index 7986e2f..cc81bf5 100644 --- a/src/main/resources/dict.txt +++ b/src/main/resources/dict.txt @@ -65141,3 +65141,8 @@ z以留吧以其以武 龟投 龟毛 𫔰苞价咯 +傻逼 +傻冒 +狗东西 +草你大爷 +操你大爷 \ No newline at end of file diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java index 84b61bf..dcac883 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsEmailTest.java @@ -38,4 +38,11 @@ public class SensitiveWordBsEmailTest { Assert.assertEquals("[123456789, xx.com]", wordList.toString()); } + @Test + public void emailTest() { + final String text = "你我.他你"; + List wordList = SensitiveWordBs.newInstance().init().findAll(text); + Assert.assertEquals("[]", wordList.toString()); + } + } diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsIgnoreCharTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsIgnoreCharTest.java new file mode 100644 index 0000000..81b4174 --- 
/dev/null +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsIgnoreCharTest.java @@ -0,0 +1,39 @@ +package com.github.houbb.sensitive.word.bs; + +import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores; +import org.junit.Assert; +import org.junit.Test; + +import java.util.List; + +/** + *

project: sensitive-word-SensitiveWordBsIgnoreCharTest

+ *

create on 2020/1/7 23:43

+ * + * @author Administrator + * @since 0.11.0 + */ +public class SensitiveWordBsIgnoreCharTest { + + /** + * Special characters inside a word are skipped when a char-ignore strategy is configured. + * @since 0.11.0 + */ + @Test + public void ignoreChineseStyleTest() { + final String text = "傻@冒,狗+东西"; + + // By default the special characters split the words, so nothing is detected + List wordList = SensitiveWordBs.newInstance().init().findAll(text); + Assert.assertEquals("[]", wordList.toString()); + + // With an ignore strategy configured (custom implementations are possible) both words are found + List wordList2 = SensitiveWordBs.newInstance() + .charIgnore(SensitiveWordCharIgnores.specialChars()) + .init() + .findAll(text); + + Assert.assertEquals("[傻@冒, 狗+东西]", wordList2.toString()); + } + +}