diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md index 7ffad8d..dd63580 100644 --- a/CHANGE_LOG.md +++ b/CHANGE_LOG.md @@ -419,3 +419,9 @@ | 序号 | 变更类型 | 说明 | 时间 | 备注 | |:---|:-----|------------------------------|:------------------|:---------------------------------------------------| | 1 | A | 修正 tags 匹配问题,黑名单命中时返回对应的黑名单词 | 2025-5-2 20:25:04 | https://github.com/houbb/sensitive-word/issues/105 | + +# release_0.26.0 + +| 序号 | 变更类型 | 说明 | 时间 | 备注 | +|:---|:-----|----------|:------------------|:---------------------------------------------------| +| 1 | A | 支持最长匹配模式 | 2025-5-3 00:58:42 | https://github.com/houbb/sensitive-word/issues/110 | diff --git a/README.md b/README.md index e65a173..c9fa52b 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,8 @@ v0.24.0 开始内置支持对敏感词的分类细化,不过工作量比较大 - [支持黑白名单单个的新增/修改,无需全量初始化](https://github.com/houbb/sensitive-word?tab=readme-ov-file#%E9%92%88%E5%AF%B9%E5%8D%95%E4%B8%AA%E8%AF%8D%E7%9A%84%E6%96%B0%E5%A2%9E%E5%88%A0%E9%99%A4%E6%97%A0%E9%9C%80%E5%85%A8%E9%87%8F%E5%88%9D%E5%A7%8B%E5%8C%96) +- [支持敏感词匹配的两种模式(快速返回 / 最长匹配)](#wordfailfast-敏感词匹配快速失败模式) + ## 变更日志 [CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/CHANGE_LOG.md) @@ -483,6 +485,7 @@ SensitiveWordBs wordBs = SensitiveWordBs.newInstance() .enableUrlCheck(false) .enableIpv4Check(false) .enableWordCheck(true) + .wordFailFast(true) .wordCheckNum(WordChecks.num()) .wordCheckEmail(WordChecks.email()) .wordCheckUrl(WordChecks.url()) @@ -501,29 +504,77 @@ Assert.assertTrue(wordBs.contains(text)); 其中各项配置的说明如下: -| 序号 | 方法 | 说明 | 默认值 | -|:---|:--------------------|:-----------------------------|:------| -| 1 | ignoreCase | 忽略大小写 | true | -| 2 | ignoreWidth | 忽略半角圆角 | true | -| 3 | ignoreNumStyle | 忽略数字的写法 | true | -| 4 | ignoreChineseStyle | 忽略中文的书写格式 | true | -| 5 | ignoreEnglishStyle | 忽略英文的书写格式 | true | -| 6 | ignoreRepeat | 忽略重复词 | false | -| 7 | enableNumCheck | 是否启用数字检测。 | false | -| 8 | enableEmailCheck | 是有启用邮箱检测 | false | -| 9 | enableUrlCheck | 是否启用链接检测 | false | -| 10 | enableIpv4Check | 是否启用IPv4检测 |
false | -| 11 | enableWordCheck | 是否启用敏感单词检测 | true | -| 12 | numCheckLen | 数字检测,自定义指定长度。 | 8 | -| 13 | wordTag | 词对应的标签 | none | -| 14 | charIgnore | 忽略的字符 | none | -| 15 | wordResultCondition | 针对匹配的敏感词额外加工,比如可以限制英文单词必须全匹配 | 恒为真 | -| 16 | wordCheckNum | 数字检测策略(v0.25.0开始支持) | `WordChecks.num()` | -| 17 | wordCheckEmail | 邮箱检测策略(v0.25.0开始支持) | `WordChecks.email()` | -| 18 | wordCheckUrl | URL检测策略(v0.25.0开始支持) | `(WordChecks.url()` | -| 19 | wordCheckIpv4 | ipv4检测策略(v0.25.0开始支持) | `WordChecks.ipv4()` | -| 20 | wordCheckWord | 敏感词检测策略(v0.25.0开始支持) | `WordChecks.word()` | -| 21 | wordReplace | 替换策略 | `WordReplaces.defaults()` | +| 序号 | 方法 | 说明 | 默认值 | +|:---|:--------------------|:-----------------------------|:--------------------------| +| 1 | ignoreCase | 忽略大小写 | true | +| 2 | ignoreWidth | 忽略全角半角 | true | +| 3 | ignoreNumStyle | 忽略数字的写法 | true | +| 4 | ignoreChineseStyle | 忽略中文的书写格式 | true | +| 5 | ignoreEnglishStyle | 忽略英文的书写格式 | true | +| 6 | ignoreRepeat | 忽略重复词 | false | +| 7 | enableNumCheck | 是否启用数字检测。 | false | +| 8 | enableEmailCheck | 是否启用邮箱检测 | false | +| 9 | enableUrlCheck | 是否启用链接检测 | false | +| 10 | enableIpv4Check | 是否启用IPv4检测 | false | +| 11 | enableWordCheck | 是否启用敏感单词检测 | true | +| 12 | numCheckLen | 数字检测,自定义指定长度。 | 8 | +| 13 | wordTag | 词对应的标签 | none | +| 14 | charIgnore | 忽略的字符 | none | +| 15 | wordResultCondition | 针对匹配的敏感词额外加工,比如可以限制英文单词必须全匹配 | 恒为真 | +| 16 | wordCheckNum | 数字检测策略(v0.25.0开始支持) | `WordChecks.num()` | +| 17 | wordCheckEmail | 邮箱检测策略(v0.25.0开始支持) | `WordChecks.email()` | +| 18 | wordCheckUrl | URL检测策略(v0.25.0开始支持) | `WordChecks.url()` | +| 19 | wordCheckIpv4 | ipv4检测策略(v0.25.0开始支持) | `WordChecks.ipv4()` | +| 20 | wordCheckWord | 敏感词检测策略(v0.25.0开始支持) | `WordChecks.word()` | +| 21 | wordReplace | 替换策略 | `WordReplaces.defaults()` | +| 22 | wordFailFast | 敏感词匹配模式是否快速返回(v0.26.0开始支持) | true | + + +## wordFailFast 敏感词匹配快速失败模式 + +### 场景说明 + +v0.26.0 开始支持。 + +默认情况下,wordFailFast=true。匹配时快速返回,性能较好。 + +但是结果有时不太符合人的直觉:命中较短的词后会立即返回,不再继续尝试更长的词。 + +默认如下(其中 text 的取值与下文示例相同): + +```java +SensitiveWordBs
bs2 = SensitiveWordBs.newInstance() + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Arrays.asList("我的世界", "我的"); + } + }).init(); + +List textList2 = bs2.findAll(text); +Assert.assertEquals(Arrays.asList("我的", "我的"), textList2); +``` + +此时会优先匹配较短的【我的】并立即返回,导致更长的【我的世界】不会被匹配到。 + +### 最长匹配模式(wordFailFast=false) + +尽可能找到最长的匹配词。 + +```java +SensitiveWordBs bs = SensitiveWordBs.newInstance() + .wordFailFast(false) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Arrays.asList("我的世界", "我的"); + } + }).init(); + +String text = "他的世界它的世界和她的世界都不是我的也不是我的世界"; +List textList = bs.findAll(text); +Assert.assertEquals(Arrays.asList("我的", "我的世界"), textList); +``` ## 内存资源的释放 diff --git a/release.bat b/release.bat index 145eb7d..e3bb7d6 100644 --- a/release.bat +++ b/release.bat @@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..." :: 版本号信息(需要手动指定) :::: 旧版本名称 -SET version=0.25.1 +SET version=0.26.0 :::: 新版本名称 -SET newVersion=0.26.0 +SET newVersion=0.27.0 :::: 组织名称 SET groupName=com.github.houbb :::: 项目名称 diff --git a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java index 508899d..4147aef 100644 --- a/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordContext.java @@ -12,11 +12,18 @@ public interface IWordContext { /** * 为true时,遇到第一个敏感词词就返回 * 解决issue110 - * @return + * @return 是否 + * @since 0.26.0 */ - boolean failFastWordPattern(); + boolean wordFailFast(); - IWordContext failFastWordPattern(boolean failFastWordPattern); + /** + * word 快速失败 + * @param wordFailFast 快速失败 + * @return this + * @since 0.26.0 + */ + IWordContext wordFailFast(boolean wordFailFast); /** diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java index 8d10b5b..625d15d 100644 ---
a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java @@ -69,7 +69,11 @@ public class SensitiveWordBs implements ISensitiveWordDestroy { */ private boolean ignoreRepeat = false; - private boolean failFastWordPattern = true; + /** + * 单词快速匹配模式 + * @since 0.26.0 + */ + private boolean wordFailFast = true; // 开启校验 @@ -280,7 +284,7 @@ public class SensitiveWordBs implements ISensitiveWordDestroy { context.ignoreChineseStyle(ignoreChineseStyle); context.ignoreEnglishStyle(ignoreEnglishStyle); context.ignoreRepeat(ignoreRepeat); - context.failFastWordPattern(failFastWordPattern); + context.wordFailFast(wordFailFast); // 开启校验 context.enableNumCheck(enableNumCheck); @@ -582,8 +586,15 @@ public class SensitiveWordBs implements ISensitiveWordDestroy { this.ignoreRepeat = ignoreRepeat; return this; } - public SensitiveWordBs failFastWordPattern(boolean failFastWordPattern) { - this.failFastWordPattern = failFastWordPattern; + + /** + * 设置快速返回 + * @param wordFailFast 快速匹配 + * @return this + * @since 0.26.0 + */ + public SensitiveWordBs wordFailFast(boolean wordFailFast) { + this.wordFailFast = wordFailFast; return this; } diff --git a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java index 6f142fd..59cf16f 100644 --- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java +++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordContext.java @@ -1,7 +1,6 @@ package com.github.houbb.sensitive.word.bs; import com.github.houbb.sensitive.word.api.*; -import com.github.houbb.sensitive.word.support.check.WordChecks; /** * 上下文 @@ -13,8 +12,9 @@ public class SensitiveWordContext implements IWordContext { /** * issue110 + * @since 0.26.0 */ - private boolean failFastWordPattern; + private boolean wordFailFast; /** * 忽略大小写 @@ -227,14 +227,14 @@ public class 
SensitiveWordContext implements IWordContext { } @Override - public boolean failFastWordPattern() { - return failFastWordPattern; + public boolean wordFailFast() { + return wordFailFast; } @Override - public IWordContext failFastWordPattern(boolean failFastWordPattern){ - this.failFastWordPattern=failFastWordPattern; + public IWordContext wordFailFast(boolean wordFailFast){ + this.wordFailFast = wordFailFast; return this; } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java b/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java index dbd309a..a105fda 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/check/WordCheckWord.java @@ -5,12 +5,10 @@ import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore; import com.github.houbb.sensitive.word.api.IWordCheck; import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordData; -import com.github.houbb.sensitive.word.api.IWordFormat; import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum; import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; import com.github.houbb.sensitive.word.support.result.WordLengthResult; -import com.github.houbb.sensitive.word.utils.InnerWordFormatUtils; import java.util.Map; @@ -44,7 +42,7 @@ public class WordCheckWord extends AbstractWordCheck { final IWordData wordData = context.wordData(); final IWordData wordDataAllow = context.wordDataAllow(); final ISensitiveWordCharIgnore wordCharIgnore = context.charIgnore(); - final boolean failFast = context.failFastWordPattern(); + final boolean failFast = context.wordFailFast(); StringBuilder stringBuilder = new StringBuilder(); char[] rawChars = txt.toCharArray(); diff --git 
a/src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkBasicTest.java b/src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkBasicTest.java index 476bad0..e88124c 100644 --- a/src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkBasicTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/benchmark/BenchmarkBasicTest.java @@ -24,6 +24,7 @@ public class BenchmarkBasicTest { * 12942ms 第一次优化。 * 12983ms 添加对应的 contains 优化,性能无太大变化。 * + * 【2025-5-3 01:11:31】1193ms 1281 1201 1256 */ @Test public void costTimeTest() { diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordFailFastTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordFailFastTest.java index 498a5aa..8320734 100644 --- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordFailFastTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordFailFastTest.java @@ -9,12 +9,15 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +/** + * @since 0.26.0 + */ public class SensitiveWordFailFastTest { @Test public void failFastTest() { SensitiveWordBs bs = SensitiveWordBs.newInstance() - .failFastWordPattern(true) + .wordFailFast(true) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -23,7 +26,7 @@ public class SensitiveWordFailFastTest { }).init(); SensitiveWordBs bs1 = SensitiveWordBs.newInstance() - .failFastWordPattern(true) + .wordFailFast(true) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -41,7 +44,7 @@ public class SensitiveWordFailFastTest { //黑长白短,且初始下标一致 SensitiveWordBs bs2 = SensitiveWordBs.newInstance() - .failFastWordPattern(true) + .wordFailFast(true) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -60,7 +63,7 @@ public class SensitiveWordFailFastTest { //白长黑短,且白和黑初始下标不再一起 SensitiveWordBs bs3 = SensitiveWordBs.newInstance() - .failFastWordPattern(true) + .wordFailFast(true) .wordDeny(new IWordDeny() { @Override public List 
deny() { @@ -78,7 +81,7 @@ public class SensitiveWordFailFastTest { //白长黑短,且白和黑初始下标在一起 SensitiveWordBs bs4 = SensitiveWordBs.newInstance() - .failFastWordPattern(true) + .wordFailFast(true) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -127,7 +130,7 @@ public class SensitiveWordFailFastTest { @Test public void fallOverTest() { SensitiveWordBs bs = SensitiveWordBs.newInstance() - .failFastWordPattern(false) + .wordFailFast(false) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -138,7 +141,7 @@ public class SensitiveWordFailFastTest { //黑长白短,且初始下标不一致 SensitiveWordBs bs1 = SensitiveWordBs.newInstance() - .failFastWordPattern(false) + .wordFailFast(false) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -156,7 +159,7 @@ public class SensitiveWordFailFastTest { //黑长白短,且初始下标一致 SensitiveWordBs bs2 = SensitiveWordBs.newInstance() - .failFastWordPattern(false) + .wordFailFast(false) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -175,7 +178,7 @@ public class SensitiveWordFailFastTest { //白长黑短,且白和黑初始下标不再一起 SensitiveWordBs bs3 = SensitiveWordBs.newInstance() - .failFastWordPattern(false) + .wordFailFast(false) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -193,7 +196,7 @@ public class SensitiveWordFailFastTest { //白长黑短,且白和黑初始下标在一起 SensitiveWordBs bs4 = SensitiveWordBs.newInstance() - .failFastWordPattern(false) + .wordFailFast(false) .wordDeny(new IWordDeny() { @Override public List deny() { @@ -238,6 +241,32 @@ public class SensitiveWordFailFastTest { } + @Test + public void fallOverTest2() { + SensitiveWordBs bs = SensitiveWordBs.newInstance() + .wordFailFast(false) + .wordDeny(new IWordDeny() { + @Override + public List deny() { + return Arrays.asList("我的世界", "我的"); + } + }).init(); + + String text = "他的世界它的世界和她的世界都不是我的也不是我的世界"; + List textList = bs.findAll(text); + Assert.assertEquals(Arrays.asList("我的", "我的世界"), textList); + + SensitiveWordBs bs2 = SensitiveWordBs.newInstance() + .wordDeny(new 
IWordDeny() { + @Override + public List deny() { + return Arrays.asList("我的世界", "我的"); + } + }).init(); + + List textList2 = bs2.findAll(text); + Assert.assertEquals(Arrays.asList("我的", "我的"), textList2); + } }