From 6132261e77a7c381ce109d3386a847fa764b0f86 Mon Sep 17 00:00:00 2001 From: yds <11232266+yuds11@user.noreply.gitee.com> Date: Fri, 2 May 2025 23:31:43 +0800 Subject: [PATCH] =?UTF-8?q?=E9=BB=91=E7=99=BD=E5=90=8D=E5=8D=95=E5=85=B1?= =?UTF-8?q?=E5=90=8C=E6=A3=80=E6=B5=8B=E6=97=B6=E9=80=BB=E8=BE=91=E7=9A=84?= =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../sensitive/word/api/ISensitiveWord.java | 14 +- .../sensitive/word/bs/SensitiveWordBs.java | 6 +- .../word/core/AbstractSensitiveWord.java | 8 +- .../sensitive/word/core/SensitiveWord.java | 34 ++- .../word/support/check/WordCheckWord.java | 24 +-- .../word/bs/SensitiveWordBsReplaceTest.java | 20 -- .../word/bs/SensitiveWordFailFastTest.java | 200 +++++++++++++++++- 7 files changed, 233 insertions(+), 73 deletions(-) delete mode 100644 src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsReplaceTest.java diff --git a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWord.java b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWord.java index fe921a4..17f246d 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/ISensitiveWord.java @@ -19,7 +19,7 @@ public interface ISensitiveWord { * @see WordValidModeEnum#FAIL_OVER 建议使用全部检测返回模式 */ List findAll(final String string, - final IWordContext context); + final IWordContext context); /** * 返回第一个对应的敏感词 @@ -29,22 +29,20 @@ public interface ISensitiveWord { * @since 0.3.2 */ IWordResult findFirst(final String string, - final IWordContext context); + final IWordContext context); /** * 替换所有敏感词内容 - *

+ * * ps: 这里可以添加优化。 * - * @param target 目标字符串 + * @param target 目标字符串 * @param context 上下文 - * @param replace 替换策略 * @return 替换后结果 * @since 0.3.2 */ String replace(final String target, - final IWordContext context, - final IWordReplace replace); + final IWordContext context); /** * 包含 @@ -54,6 +52,6 @@ public interface ISensitiveWord { * @since 0.3.2 */ boolean contains(final String string, - final IWordContext context); + final IWordContext context); } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 86a48b9..8d10b5b 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -672,12 +672,10 @@ public class SensitiveWordBs implements ISensitiveWordDestroy { * @since 0.2.0 */ public String replace(final String target) { - return this.replace(target,context.wordReplace()); - } - public String replace(final String target, IWordReplace replace) { - return sensitiveWord.replace(target, context, replace); + return sensitiveWord.replace(target, context); } + /** * 获取敏感词的标签 * diff --git a/src/main/java/com/github/houbb/sensitive/word/core/AbstractSensitiveWord.java b/src/main/java/com/github/houbb/sensitive/word/core/AbstractSensitiveWord.java index 28c8866..c3ce135 100644 --- a/src/main/java/com/github/houbb/sensitive/word/core/AbstractSensitiveWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/core/AbstractSensitiveWord.java @@ -38,8 +38,9 @@ public abstract class AbstractSensitiveWord implements ISensitiveWord { * @return 结果 * @since 0.3.2 */ - protected String doReplace(String target, List allList, IWordContext context, IWordReplace replace) { + protected String doReplace(String target, List allList, IWordContext context) { // 根据 index 直接分割 + final IWordReplace replace = context.wordReplace(); // 是否需要对 allList 排序? StringBuilder stringBuilder = new StringBuilder(); @@ -90,7 +91,7 @@ public abstract class AbstractSensitiveWord implements ISensitiveWord { } @Override - public String replace(String target, IWordContext context, IWordReplace replace) { + public String replace(String target, IWordContext context) { if(StringUtil.isEmpty(target)) { return target; } @@ -100,9 +101,8 @@ public abstract class AbstractSensitiveWord implements ISensitiveWord { return target; } - return doReplace(target, allList, context, replace); + return doReplace(target, allList, context); } - @Override public boolean contains(String string, IWordContext context) { //1. 第一个存在 diff --git a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java index 8147d34..7af8705 100644 --- a/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/core/SensitiveWord.java @@ -37,7 +37,7 @@ public class SensitiveWord extends AbstractSensitiveWord { @Override protected IWordResult doFindFirst(String string, IWordContext context) { List wordResults = innerSensitiveWords(string, WordValidModeEnum.FAIL_FAST, context); - if(!CollectionUtil.isEmpty(wordResults)){ + if (!CollectionUtil.isEmpty(wordResults)) { return wordResults.get(0); } return null; @@ -47,14 +47,14 @@ public class SensitiveWord extends AbstractSensitiveWord { /** * 获取敏感词列表 * - * @param text 文本 + * @param text 文本 * @param modeEnum 模式 * @return 结果列表 * @since 0.0.1 */ private List innerSensitiveWords(final String text, - final WordValidModeEnum modeEnum, - final IWordContext context) { + final WordValidModeEnum modeEnum, + final IWordContext context) { //1. 是否存在敏感词,如果比存在,直接返回空列表 final IWordCheck sensitiveCheck = context.sensitiveCheck(); List resultList = Guavas.newArrayList(); @@ -74,38 +74,32 @@ public class SensitiveWord extends AbstractSensitiveWord { // v0.21.0 白名单跳过 WordCheckResult checkResult = sensitiveCheck.sensitiveCheck(i, checkContext); int wordLengthAllow = checkResult.wordLengthResult().wordAllowLen(); - if(wordLengthAllow > 0) { - i += wordLengthAllow-1; - continue; - } + int wordLengthDeny = checkResult.wordLengthResult().wordDenyLen(); - - // 命中 - final WordLengthResult wordLengthResult = checkResult.wordLengthResult(); - int wordLength = wordLengthResult.wordDenyLen(); - if (wordLength > 0) { + //如果命中的白名单长度小于黑名单,则直接对黑名单的敏感词进行保存 + if (wordLengthAllow < wordLengthDeny) { // 保存敏感词 WordResult wordResult = WordResult.newInstance() .startIndex(i) - .endIndex(i+wordLength) + .endIndex(i + wordLengthDeny) .type(checkResult.type()) - .word(wordLengthResult.wordDeny()); + .word(checkResult.wordLengthResult().wordDeny()); //v0.13.0 添加判断 - if(wordResultCondition.match(wordResult, text, modeEnum, context)) { + if (wordResultCondition.match(wordResult, text, modeEnum, context)) { resultList.add(wordResult); // 快速返回 if (WordValidModeEnum.FAIL_FAST.equals(modeEnum)) { break; } } - - - // 增加 i 的步长 // 为什么要-1,因为默认就会自增1 // TODO: 这里可以根据字符串匹配算法优化。 - i += wordLength - 1; + i += wordLengthDeny - 1; + } else { + //如果命中的白名单长度大于黑名单长度,则跳过白名单个字符 + i += Math.max(0, wordLengthAllow - 1); } } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java b/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java index f7779d1..dbd309a 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java @@ -51,7 +51,7 @@ public class WordCheckWord extends AbstractWordCheck { int tempLen = 0; int maxWhite = 0; int maxBlack = 0; - int skipLen=0; + int skipLen = 0; for (int i = beginIndex; i < rawChars.length; i++) { if (wordCharIgnore.ignore(i, rawChars, innerContext) && tempLen != 0) { @@ -67,24 +67,18 @@ public class WordCheckWord extends AbstractWordCheck { WordContainsTypeEnum wordContainsTypeEnumDeny = wordData.contains(stringBuilder, innerContext); if (WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnumAllow)) { - maxWhite += tempLen; - if (!failFast) { - //此处将tempLen设为0,为了防止重复累加 - tempLen = 0; - }else{ + maxWhite = tempLen; + if (failFast) { //为falFast模式,主动设为notFound退出循环 - wordContainsTypeEnumAllow=WordContainsTypeEnum.NOT_FOUND; + wordContainsTypeEnumAllow = WordContainsTypeEnum.NOT_FOUND; } } if (WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnumDeny)) { - maxBlack += tempLen; - if (!failFast) { - //此处将tempLen设为0,为了防止重复累加 - tempLen = 0; - }else{ + maxBlack = tempLen; + if (failFast) { //为falFast模式,主动设为notFound退出循环 - wordContainsTypeEnumDeny=WordContainsTypeEnum.NOT_FOUND; + wordContainsTypeEnumDeny = WordContainsTypeEnum.NOT_FOUND; } } @@ -95,8 +89,8 @@ public class WordCheckWord extends AbstractWordCheck { } String string = stringBuilder.toString(); - String wordAllow = string.substring(0, Math.max(0,maxWhite - skipLen)); - String wordDeny = string.substring(0, Math.max(0,maxBlack - skipLen)); + String wordAllow = string.substring(0, Math.max(0, maxWhite - skipLen)); + String wordDeny = string.substring(0, Math.max(0, maxBlack - skipLen)); return WordLengthResult.newInstance() diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsReplaceTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsReplaceTest.java deleted file mode 100644 index 30ccdc9..0000000 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsReplaceTest.java +++ /dev/null @@ -1,20 +0,0 @@ -package com.github.houbb.sensitive.word.bs; - -import com.github.houbb.sensitive.word.api.IWordReplace; -import com.github.houbb.sensitive.word.replace.MyWordReplace; -import org.junit.Assert; -import org.junit.Test; - -public class SensitiveWordBsReplaceTest { - - @Test - public void defineReplaceTest() { - final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。"; - SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance().init(); - - IWordReplace replace = new MyWordReplace(); - String result = sensitiveWordBs.replace(text, replace); - - Assert.assertEquals("国家旗帜迎风飘扬,教员的画像屹立在***前。", result); - } -} diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordFailFastTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordFailFastTest.java index 6b4258c..498a5aa 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordFailFastTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordFailFastTest.java @@ -1,10 +1,12 @@ package com.github.houbb.sensitive.word.bs; +import com.github.houbb.sensitive.word.api.IWordAllow; import com.github.houbb.sensitive.word.api.IWordDeny; import org.junit.Assert; import org.junit.Test; import java.util.Arrays; +import java.util.Collections; import java.util.List; public class SensitiveWordFailFastTest { @@ -20,12 +22,108 @@ public class SensitiveWordFailFastTest { } }).init(); - String text = "我在我的家里玩我的世界"; + SensitiveWordBs bs1 = SensitiveWordBs.newInstance() + .failFastWordPattern(true) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Collections.singletonList("操你妈"); + } + }) + .wordAllow(new IWordAllow() { + @Override + public List allow() { + return Collections.singletonList("你"); + } + }) + .init(); + + //黑长白短,且初始下标一致 + SensitiveWordBs bs2 = SensitiveWordBs.newInstance() + .failFastWordPattern(true) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Collections.singletonList("大傻逼"); + } + }) + .wordAllow(new IWordAllow() { + @Override + public List allow() { + return Collections.singletonList("大"); + } + }) + .init(); + + + + //白长黑短,且白和黑初始下标不再一起 + SensitiveWordBs bs3 = SensitiveWordBs.newInstance() + .failFastWordPattern(true) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Collections.singletonList("口交"); + } + }) + .wordAllow(new IWordAllow() { + @Override + public List allow() { + return Collections.singletonList("地铁口交易"); + } + }) + .init(); + + + //白长黑短,且白和黑初始下标在一起 + SensitiveWordBs bs4 = SensitiveWordBs.newInstance() + .failFastWordPattern(true) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Collections.singletonList("龟孙"); + } + }) + .wordAllow(new IWordAllow() { + @Override + public List allow() { + return Collections.singletonList("龟孙可"); + } + }) + .init(); + + + + + + + String text = "我在我的家里玩我的世界"; List textList = bs.findAll(text); Assert.assertEquals(Arrays.asList("我的", "我的"), textList); + + String text1 = "操你妈"; + List textList1 = bs1.findAll(text1); + Assert.assertEquals(Collections.singletonList("操你妈"), textList1); + + String text2 = "大傻逼"; + List textList2 = bs2.findAll(text2); + Assert.assertEquals(Collections.singletonList("大傻逼"), textList2); + + + String text3 = "地铁口交易"; + List textList3 = bs3.findAll(text3); + Assert.assertTrue("Expected empty list", textList3.isEmpty()); + + String text4 = "龟孙可"; + List textList4 = bs4.findAll(text4); + Assert.assertTrue("Expected empty list", textList4.isEmpty()); + + } + + @Test public void fallOverTest() { SensitiveWordBs bs = SensitiveWordBs.newInstance() @@ -37,11 +135,109 @@ public class SensitiveWordFailFastTest { } }).init(); - String text = "我在我的家里玩我的世界"; + //黑长白短,且初始下标不一致 + SensitiveWordBs bs1 = SensitiveWordBs.newInstance() + .failFastWordPattern(false) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Collections.singletonList("操你妈"); + } + }) + .wordAllow(new IWordAllow() { + @Override + public List allow() { + return Collections.singletonList("你"); + } + }) + .init(); + + + //黑长白短,且初始下标一致 + SensitiveWordBs bs2 = SensitiveWordBs.newInstance() + .failFastWordPattern(false) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Collections.singletonList("大傻逼"); + } + }) + .wordAllow(new IWordAllow() { + @Override + public List allow() { + return Collections.singletonList("大"); + } + }) + .init(); + + + + //白长黑短,且白和黑初始下标不再一起 + SensitiveWordBs bs3 = SensitiveWordBs.newInstance() + .failFastWordPattern(false) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Collections.singletonList("口交"); + } + }) + .wordAllow(new IWordAllow() { + @Override + public List allow() { + return Collections.singletonList("地铁口交易"); + } + }) + .init(); + + + //白长黑短,且白和黑初始下标在一起 + SensitiveWordBs bs4 = SensitiveWordBs.newInstance() + .failFastWordPattern(false) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Collections.singletonList("龟孙"); + } + }) + .wordAllow(new IWordAllow() { + @Override + public List allow() { + return Collections.singletonList("龟孙可"); + } + }) + .init(); + + + + + + + String text = "我在我的家里玩我的世界"; List textList = bs.findAll(text); Assert.assertEquals(Arrays.asList("我的", "我的世界"), textList); + + String text1 = "操你妈"; + List textList1 = bs1.findAll(text1); + Assert.assertEquals(Collections.singletonList("操你妈"), textList1); + + String text2 = "大傻逼"; + List textList2 = bs2.findAll(text2); + Assert.assertEquals(Collections.singletonList("大傻逼"), textList2); + + + String text3 = "地铁口交易"; + List textList3 = bs3.findAll(text3); + Assert.assertTrue("Expected empty list", textList3.isEmpty()); + + String text4 = "龟孙可"; + List textList4 = bs4.findAll(text4); + Assert.assertTrue("Expected empty list", textList4.isEmpty()); + + } + + }