Merge branch 'houbb:master' into master

This commit is contained in:
k9999dot
2025-05-02 16:36:36 +08:00
committed by GitHub
23 changed files with 606 additions and 164 deletions

View File

@@ -393,3 +393,30 @@
|:---|:-----|----------------|:--------------------|:--------------| |:---|:-----|----------------|:--------------------|:--------------|
| 1 | A | 内置支持多个单词标签实现策略 | 2024-12-22 14:08:20 | 强化单词标签能力,方便复用 | | 1 | A | 内置支持多个单词标签实现策略 | 2024-12-22 14:08:20 | 强化单词标签能力,方便复用 |
| 2 | O | 升级 heaven 依赖 | 2024-12-22 14:08:20 | | | 2 | O | 升级 heaven 依赖 | 2024-12-22 14:08:20 | |
# release_0.24.1
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:-----|------------|:------------------|:---------------|
| 1 | F | 删除时添加同步锁优化 | 2025-2-2 15:30:26 | 涉及到接口调整 PR-100 |
# release_0.24.2
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:-----|---------------------|:------------------|:-------------------|
| 1 | O | findFirst 真实实现,性能优化 | 2025-2-2 15:30:26 | PR-99 |
| 2 | O | 黑白名单遍历统一优化,性能优化 | 2025-2-2 15:30:26 | PR-99 涉及到原始结果返回值调整 |
# release_0.25.0
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:-----|:----------------------|:-------------------|:-----|
| 1 | A | wordCheck 策略支持用户自定义 | 2025-2-17 12:06:45 | https://github.com/houbb/sensitive-word/issues/101 |
| 2 | A | wordCheckUrlNoPrefix | 2025-2-17 12:06:45 | https://github.com/houbb/sensitive-word/issues/101 |

View File

@@ -58,14 +58,6 @@ v0.24.0 开始内置支持对敏感词的分类细化,不过工作量比较大
[CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/CHANGE_LOG.md) [CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/CHANGE_LOG.md)
### V0.23.0
- 结果条件拓展支持 wordTags 和 chains
### V0.24.0
- 初步内置实现单词标签,丰富单词标签内置策略
## 更多资料 ## 更多资料
### 敏感词控台 ### 敏感词控台
@@ -104,7 +96,7 @@ v0.24.0 开始内置支持对敏感词的分类细化,不过工作量比较大
<dependency> <dependency>
<groupId>com.github.houbb</groupId> <groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId> <artifactId>sensitive-word</artifactId>
<version>0.24.0</version> <version>0.25.0</version>
</dependency> </dependency>
``` ```
@@ -370,6 +362,22 @@ Assert.assertEquals("[ⒻⒻⒻfⓤuⓤ⒰cⓒ⒦]", wordList.toString());
## 更多检测策略 ## 更多检测策略
### 说明
v0.25.0 目前的几个策略,也支持用户引导类自定义。所有的策略都是接口,支持用户自定义实现。
| 序号 | 方法 | 说明 | 默认值 |
|:---|:---------------------|:-------------------------------------------|:------|
| 16 | wordCheckNum | 数字检测策略(v0.25.0开始支持) | `WordChecks.num()` |
| 17 | wordCheckEmail | 邮箱检测策略(v0.25.0开始支持) | `WordChecks.email()` |
| 18 | wordCheckUrl | URL检测策略(v0.25.0开始支持),内置还实现了 `urlNoPrefix()` | `WordChecks.url()` |
| 19 | wordCheckIpv4 | ipv4检测策略(v0.25.0开始支持) | `WordChecks.ipv4()` |
| 20 | wordCheckWord | 敏感词检测策略(v0.25.0开始支持) | `WordChecks.word()` |
内置实现:
a) `WordChecks.urlNoPrefix()` 作为 url 的额外实现,可以不需要 `https://`、`http://` 前缀。
### 邮箱检测 ### 邮箱检测
邮箱等个人信息,默认未启用。 邮箱等个人信息,默认未启用。
@@ -418,6 +426,21 @@ Assert.assertEquals("[https://www.baidu.com]", wordList.toString());
Assert.assertEquals("点击链接 ********************* 查看答案", sensitiveWordBs.replace(text)); Assert.assertEquals("点击链接 ********************* 查看答案", sensitiveWordBs.replace(text));
``` ```
v0.25.0 内置支持不需要 http 协议的前缀检测:
```java
final String text = "点击链接 https://www.baidu.com 查看答案,当然也可以是 baidu.com、www.baidu.com";
final SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
.enableUrlCheck(true) // 启用URL检测
.wordCheckUrl(WordChecks.urlNoPrefix()) //指定检测的方式
.init();
List<String> wordList = sensitiveWordBs.findAll(text);
Assert.assertEquals("[www.baidu.com, baidu.com, www.baidu.com]", wordList.toString());
Assert.assertEquals("点击链接 https://************* 查看答案,当然也可以是 *********、*************", sensitiveWordBs.replace(text));
```
### IPV4 检测 ### IPV4 检测
v0.17.0 支持 v0.17.0 支持
@@ -460,6 +483,11 @@ SensitiveWordBs wordBs = SensitiveWordBs.newInstance()
.enableUrlCheck(false) .enableUrlCheck(false)
.enableIpv4Check(false) .enableIpv4Check(false)
.enableWordCheck(true) .enableWordCheck(true)
.wordCheckNum(WordChecks.num())
.wordCheckEmail(WordChecks.email())
.wordCheckUrl(WordChecks.url())
.wordCheckIpv4(WordChecks.ipv4())
.wordCheckWord(WordChecks.word())
.numCheckLen(8) .numCheckLen(8)
.wordTag(WordTags.none()) .wordTag(WordTags.none())
.charIgnore(SensitiveWordCharIgnores.defaults()) .charIgnore(SensitiveWordCharIgnores.defaults())
@@ -474,7 +502,7 @@ Assert.assertTrue(wordBs.contains(text));
其中各项配置的说明如下: 其中各项配置的说明如下:
| 序号 | 方法 | 说明 | 默认值 | | 序号 | 方法 | 说明 | 默认值 |
|:---|:---------------------|:-----------------------------|:------| |:---|:--------------------|:-----------------------------|:------|
| 1 | ignoreCase | 忽略大小写 | true | | 1 | ignoreCase | 忽略大小写 | true |
| 2 | ignoreWidth | 忽略半角圆角 | true | | 2 | ignoreWidth | 忽略半角圆角 | true |
| 3 | ignoreNumStyle | 忽略数字的写法 | true | | 3 | ignoreNumStyle | 忽略数字的写法 | true |
@@ -490,6 +518,12 @@ Assert.assertTrue(wordBs.contains(text));
| 13 | wordTag | 词对应的标签 | none | | 13 | wordTag | 词对应的标签 | none |
| 14 | charIgnore | 忽略的字符 | none | | 14 | charIgnore | 忽略的字符 | none |
| 15 | wordResultCondition | 针对匹配的敏感词额外加工,比如可以限制英文单词必须全匹配 | 恒为真 | | 15 | wordResultCondition | 针对匹配的敏感词额外加工,比如可以限制英文单词必须全匹配 | 恒为真 |
| 16 | wordCheckNum | 数字检测策略(v0.25.0开始支持) | `WordChecks.num()` |
| 17 | wordCheckEmail | 邮箱检测策略(v0.25.0开始支持) | `WordChecks.email()` |
| 18 | wordCheckUrl | URL检测策略(v0.25.0开始支持) | `WordChecks.url()` |
| 19 | wordCheckIpv4 | ipv4检测策略(v0.25.0开始支持) | `WordChecks.ipv4()` |
| 20 | wordCheckWord | 敏感词检测策略(v0.25.0开始支持) | `WordChecks.word()` |
| 21 | wordReplace | 替换策略 | `WordReplaces.defaults()` |
## 内存资源的释放 ## 内存资源的释放

View File

@@ -6,7 +6,7 @@
<groupId>com.github.houbb</groupId> <groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId> <artifactId>sensitive-word</artifactId>
<version>0.24.0</version> <version>0.25.0</version>
<properties> <properties>
<!--============================== All Plugins START ==============================--> <!--============================== All Plugins START ==============================-->

View File

@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
:: 版本号信息(需要手动指定) :: 版本号信息(需要手动指定)
:::: 旧版本名称 :::: 旧版本名称
SET version=0.24.0 SET version=0.25.0
:::: 新版本名称 :::: 新版本名称
SET newVersion=0.25.0 SET newVersion=0.26.0
:::: 组织名称 :::: 组织名称
SET groupName=com.github.houbb SET groupName=com.github.houbb
:::: 项目名称 :::: 项目名称

View File

@@ -31,6 +31,7 @@ public interface IWordCheck {
* @param context 执行上下文 * @param context 执行上下文
* @return 敏感信息对应的长度 * @return 敏感信息对应的长度
* @since 0.0.5 * @since 0.0.5
* @since 0.24.2 为了黑白名单统一,调整了对应的返回值
*/ */
WordCheckResult sensitiveCheck(final int beginIndex, WordCheckResult sensitiveCheck(final int beginIndex,
final InnerSensitiveWordContext context); final InnerSensitiveWordContext context);

View File

@@ -275,4 +275,23 @@ public interface IWordContext {
SensitiveWordContext wordResultCondition(IWordResultCondition wordResultCondition); SensitiveWordContext wordResultCondition(IWordResultCondition wordResultCondition);
IWordCheck wordCheckWord();
SensitiveWordContext wordCheckWord(IWordCheck wordCheckWord);
IWordCheck wordCheckNum();
SensitiveWordContext wordCheckNum(IWordCheck wordCheckNum);
IWordCheck wordCheckEmail();
SensitiveWordContext wordCheckEmail(IWordCheck wordCheckEmail);
IWordCheck wordCheckUrl();
SensitiveWordContext wordCheckUrl(IWordCheck wordCheckUrl);
IWordCheck wordCheckIpv4();
SensitiveWordContext wordCheckIpv4(IWordCheck wordCheckIpv4);
} }

View File

@@ -22,10 +22,10 @@ public interface IWordData extends ISensitiveWordDestroy {
/** /**
* 删除敏感词 * 删除敏感词
* @param word 单词 * @param collection 单词
* @since 0.19.0 * @since 0.19.0
*/ */
void removeWord(String word); void removeWord(Collection<String> collection);
/** /**
* 新增敏感词 * 新增敏感词

View File

@@ -10,6 +10,7 @@ import com.github.houbb.sensitive.word.api.combine.IWordCheckCombine;
import com.github.houbb.sensitive.word.api.combine.IWordFormatCombine; import com.github.houbb.sensitive.word.api.combine.IWordFormatCombine;
import com.github.houbb.sensitive.word.core.SensitiveWords; import com.github.houbb.sensitive.word.core.SensitiveWords;
import com.github.houbb.sensitive.word.support.allow.WordAllows; import com.github.houbb.sensitive.word.support.allow.WordAllows;
import com.github.houbb.sensitive.word.support.check.WordChecks;
import com.github.houbb.sensitive.word.support.combine.allowdeny.WordAllowDenyCombines; import com.github.houbb.sensitive.word.support.combine.allowdeny.WordAllowDenyCombines;
import com.github.houbb.sensitive.word.support.combine.check.WordCheckCombines; import com.github.houbb.sensitive.word.support.combine.check.WordCheckCombines;
import com.github.houbb.sensitive.word.support.combine.format.WordFormatCombines; import com.github.houbb.sensitive.word.support.combine.format.WordFormatCombines;
@@ -182,6 +183,36 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
*/ */
private IWordResultCondition wordResultCondition = WordResultConditions.alwaysTrue(); private IWordResultCondition wordResultCondition = WordResultConditions.alwaysTrue();
/**
* 单词检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckWord = WordChecks.word();
/**
* 数字检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckNum = WordChecks.num();
/**
* email 检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckEmail = WordChecks.email();
/**
* URL 检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckUrl = WordChecks.url();
/**
* ipv4 检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckIpv4 = WordChecks.ipv4();
/** /**
* 新建验证实例 * 新建验证实例
* <p> * <p>
@@ -255,6 +286,13 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
context.enableWordCheck(enableWordCheck); context.enableWordCheck(enableWordCheck);
context.enableIpv4Check(enableIpv4Check); context.enableIpv4Check(enableIpv4Check);
// 校验策略实现配置
context.wordCheckWord(wordCheckWord);
context.wordCheckEmail(wordCheckEmail);
context.wordCheckNum(wordCheckNum);
context.wordCheckUrl(wordCheckUrl);
context.wordCheckIpv4(wordCheckIpv4);
// 额外配置 // 额外配置
context.sensitiveCheckNumLen(numCheckLen); context.sensitiveCheckNumLen(numCheckLen);
context.wordReplace(wordReplace); context.wordReplace(wordReplace);
@@ -370,6 +408,41 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
return this; return this;
} }
/**
 * Sets the strategy used for plain sensitive-word detection.
 *
 * @param wordCheckWord strategy implementation, must not be null
 * @return this, for fluent chaining
 * @since 0.25.0
 */
public SensitiveWordBs wordCheckWord(IWordCheck wordCheckWord) {
    ArgUtil.notNull(wordCheckWord, "wordCheckWord");
    this.wordCheckWord = wordCheckWord;
    return this;
}
/**
 * Sets the strategy used for number detection.
 *
 * @param wordCheckNum strategy implementation, must not be null
 * @return this, for fluent chaining
 * @since 0.25.0
 */
public SensitiveWordBs wordCheckNum(IWordCheck wordCheckNum) {
    ArgUtil.notNull(wordCheckNum, "wordCheckNum");
    this.wordCheckNum = wordCheckNum;
    return this;
}
/**
 * Sets the strategy used for email detection.
 *
 * @param wordCheckEmail strategy implementation, must not be null
 * @return this, for fluent chaining
 * @since 0.25.0
 */
public SensitiveWordBs wordCheckEmail(IWordCheck wordCheckEmail) {
    ArgUtil.notNull(wordCheckEmail, "wordCheckEmail");
    this.wordCheckEmail = wordCheckEmail;
    return this;
}
/**
 * Sets the strategy used for URL detection
 * (e.g. {@code WordChecks.url()} or {@code WordChecks.urlNoPrefix()}).
 *
 * @param wordCheckUrl strategy implementation, must not be null
 * @return this, for fluent chaining
 * @since 0.25.0
 */
public SensitiveWordBs wordCheckUrl(IWordCheck wordCheckUrl) {
    ArgUtil.notNull(wordCheckUrl, "wordCheckUrl");
    this.wordCheckUrl = wordCheckUrl;
    return this;
}
/**
 * Sets the strategy used for IPv4 detection.
 *
 * @param wordCheckIpv4 strategy implementation, must not be null
 * @return this, for fluent chaining
 * @since 0.25.0
 */
public SensitiveWordBs wordCheckIpv4(IWordCheck wordCheckIpv4) {
    ArgUtil.notNull(wordCheckIpv4, "wordCheckIpv4");
    this.wordCheckIpv4 = wordCheckIpv4;
    return this;
}
//-------------------------------------------------------- 基础属性设置 //-------------------------------------------------------- 基础属性设置
/** /**
* 是否启用 ipv4 校验 * 是否启用 ipv4 校验
@@ -642,9 +715,7 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
// 主要原因是二者没有保持一致,初始化的数据和插入的数据没有做相同的格式化 // 主要原因是二者没有保持一致,初始化的数据和插入的数据没有做相同的格式化
List<String> formatList = InnerWordFormatUtils.formatWordList(collection, context); List<String> formatList = InnerWordFormatUtils.formatWordList(collection, context);
for(String word : formatList) { this.wordData.removeWord(formatList);
this.wordData.removeWord(word);
}
} }
/** /**
@@ -701,9 +772,8 @@ public class SensitiveWordBs implements ISensitiveWordDestroy {
// 主要原因是二者没有保持一致,初始化的数据和插入的数据没有做相同的格式化 // 主要原因是二者没有保持一致,初始化的数据和插入的数据没有做相同的格式化
List<String> formatList = InnerWordFormatUtils.formatWordList(collection, context); List<String> formatList = InnerWordFormatUtils.formatWordList(collection, context);
for(String word : formatList) { this.wordDataAllow.removeWord(formatList);
this.wordDataAllow.removeWord(word);
}
} }
/** /**
* 新增敏感词白名单 * 新增敏感词白名单

View File

@@ -1,6 +1,7 @@
package com.github.houbb.sensitive.word.bs; package com.github.houbb.sensitive.word.bs;
import com.github.houbb.sensitive.word.api.*; import com.github.houbb.sensitive.word.api.*;
import com.github.houbb.sensitive.word.support.check.WordChecks;
/** /**
* 上下文 * 上下文
@@ -133,6 +134,36 @@ public class SensitiveWordContext implements IWordContext {
*/ */
private IWordResultCondition wordResultCondition; private IWordResultCondition wordResultCondition;
/**
* 单词检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckWord;
/**
* 数字检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckNum;
/**
* email 检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckEmail;
/**
* URL 检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckUrl;
/**
* ipv4 检测策略
* @since 0.25.0
*/
private IWordCheck wordCheckIpv4;
public IWordData wordData() { public IWordData wordData() {
return wordData; return wordData;
} }
@@ -355,4 +386,49 @@ public class SensitiveWordContext implements IWordContext {
this.wordResultCondition = wordResultCondition; this.wordResultCondition = wordResultCondition;
return this; return this;
} }
public IWordCheck wordCheckWord() {
return wordCheckWord;
}
public SensitiveWordContext wordCheckWord(IWordCheck wordCheckWord) {
this.wordCheckWord = wordCheckWord;
return this;
}
public IWordCheck wordCheckNum() {
return wordCheckNum;
}
public SensitiveWordContext wordCheckNum(IWordCheck wordCheckNum) {
this.wordCheckNum = wordCheckNum;
return this;
}
public IWordCheck wordCheckEmail() {
return wordCheckEmail;
}
public SensitiveWordContext wordCheckEmail(IWordCheck wordCheckEmail) {
this.wordCheckEmail = wordCheckEmail;
return this;
}
public IWordCheck wordCheckUrl() {
return wordCheckUrl;
}
public SensitiveWordContext wordCheckUrl(IWordCheck wordCheckUrl) {
this.wordCheckUrl = wordCheckUrl;
return this;
}
public IWordCheck wordCheckIpv4() {
return wordCheckIpv4;
}
public SensitiveWordContext wordCheckIpv4(IWordCheck wordCheckIpv4) {
this.wordCheckIpv4 = wordCheckIpv4;
return this;
}
} }

View File

@@ -60,7 +60,17 @@ public class WordCheckUrl extends AbstractConditionWordCheck {
// 改为 http:// 或者 https:// 开头 // 改为 http:// 或者 https:// 开头
String string = stringBuilder.toString(); String string = stringBuilder.toString();
return RegexUtil.isUrl(string); return isUrl(string);
}
/**
* 是否为 URL
* @param text 原始文本
* @return 结果
* @since 0.25.0
*/
protected boolean isUrl(final String text) {
return RegexUtil.isUrl(text);
} }
} }

View File

@@ -0,0 +1,32 @@
package com.github.houbb.sensitive.word.support.check;
import com.github.houbb.heaven.util.util.regex.RegexUtil;
import com.github.houbb.sensitive.word.api.IWordCheck;
/**
 * URL check strategy that does NOT require a protocol prefix
 * ({@code http://} / {@code https://}): detection is delegated to
 * {@code RegexUtil.isWebSite(String)} instead of the full-URL regex,
 * so bare domains such as {@code www.example.com} are matched.
 *
 * The parent check's notes also apply (rough web-site handling, image
 * suffixes skipped, abort past length 70 — see {@code WordCheckUrl};
 * NOTE(review): inherited behavior, confirm against the parent class).
 *
 * @author binbin.hou
 * @since 0.25.0
 */
public class WordCheckUrlNoPrefix extends WordCheckUrl {

    /**
     * Shared singleton instance.
     *
     * @since 0.25.0
     */
    private static final IWordCheck INSTANCE = new WordCheckUrlNoPrefix();

    /**
     * Returns the shared singleton instance.
     *
     * @return singleton check implementation
     */
    public static IWordCheck getInstance() {
        return INSTANCE;
    }

    /**
     * Treats any string matching the web-site regex as a URL — no scheme required.
     */
    @Override
    protected boolean isUrl(String text) {
        return RegexUtil.isWebSite(text);
    }
}

View File

@@ -0,0 +1,96 @@
package com.github.houbb.sensitive.word.support.check;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.ISensitiveWordCharIgnore;
import com.github.houbb.sensitive.word.api.IWordCheck;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
import com.github.houbb.sensitive.word.constant.enums.WordTypeEnum;
import com.github.houbb.sensitive.word.support.result.WordLengthResult;
import java.util.Map;
/**
 * Plain sensitive-word check that scans from a start index, probing the
 * allow (white) list and deny (black) list tries in a single traversal of
 * the text instead of two separate passes.
 *
 * NOTE(review): annotated {@code @Deprecated} in the same change that
 * introduced it — presumably superseded by another word-check
 * implementation; confirm before relying on it.
 *
 * @author binbin.hou
 * @since 0.26.0
 */
@Deprecated
public class WordCheckWordMaxLen extends AbstractWordCheck {

    @Override
    protected Class<? extends IWordCheck> getSensitiveCheckClass() {
        return WordCheckWordMaxLen.class;
    }

    /**
     * Grows a candidate string one (format-mapped) character at a time from
     * {@code beginIndex}, accumulating match lengths against both the
     * allow-list and the deny-list word data, and stops as soon as neither
     * trie can extend the current prefix.
     *
     * @param beginIndex   index in the original text to start matching from
     * @param innerContext execution context (original text, char mapping, word data)
     * @return accumulated allow/deny match lengths for this position
     */
    @Override
    protected WordLengthResult getActualLength(int beginIndex, InnerSensitiveWordContext innerContext) {
        final String txt = innerContext.originalText();
        final Map<Character, Character> formatCharMapping = innerContext.formatCharMapping();
        final IWordContext context = innerContext.wordContext();

        final IWordData wordData = context.wordData();
        final IWordData wordDataAllow = context.wordDataAllow();
        final ISensitiveWordCharIgnore wordCharIgnore = context.charIgnore();

        // Candidate prefix built from format-mapped characters.
        StringBuilder stringBuilder = new StringBuilder();

        char[] rawChars = txt.toCharArray();
        int tempLen = 0;   // characters consumed so far (including ignored ones)
        int maxWhite = 0;  // accumulated allow-list (white) match length
        int maxBlack = 0;  // accumulated deny-list (black) match length
        boolean firstCheck = true;

        WordContainsTypeEnum wordContainsTypeEnumAllow = wordDataAllow.contains(stringBuilder, innerContext);
        WordContainsTypeEnum wordContainsTypeEnumDeny = wordData.contains(stringBuilder, innerContext);
        for (int i = beginIndex; i < rawChars.length; i++) {
            // Skip ignorable characters, but only once a match is in progress.
            if (wordCharIgnore.ignore(i, rawChars, innerContext) && tempLen != 0) {
                tempLen++;
                continue;
            }

            // NOTE(review): assumes every raw char has a mapping entry — a
            // missing key would NPE on unboxing; confirm upstream guarantees.
            char mappingChar = formatCharMapping.get(rawChars[i]);
            stringBuilder.append(mappingChar);
            tempLen++;

            // Allow-list probe: on a complete word, record its length and reset.
            if (firstCheck || !WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnumAllow)) {
                wordContainsTypeEnumAllow = wordDataAllow.contains(stringBuilder, innerContext);
                if (WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnumAllow)) {
                    maxWhite += tempLen;

                    wordContainsTypeEnumAllow = WordContainsTypeEnum.NOT_FOUND;
                }
            }

            // Deny-list (black list) probe: same handling as above.
            if (firstCheck || !WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnumDeny)) {
                wordContainsTypeEnumDeny = wordData.contains(stringBuilder, innerContext);
                if (WordContainsTypeEnum.CONTAINS_END.equals(wordContainsTypeEnumDeny)) {
                    maxBlack += tempLen;

                    wordContainsTypeEnumDeny = WordContainsTypeEnum.NOT_FOUND;
                }
            }

            // No longer the first probe.
            firstCheck = false;

            // Neither trie matched the current prefix — stop scanning.
            if (WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnumAllow) &&
                    WordContainsTypeEnum.NOT_FOUND.equals(wordContainsTypeEnumDeny)) {
                break;
            }
        }

        return WordLengthResult.newInstance()
                .wordAllowLen(maxWhite)
                .wordDenyLen(maxBlack);
    }

    @Override
    protected String getType() {
        return WordTypeEnum.WORD.getCode();
    }
}

View File

@@ -77,4 +77,15 @@ public final class WordChecks {
return WordCheckIPV4.getInstance(); return WordCheckIPV4.getInstance();
} }
/**
 * URL check that does not require the {@code http://} / {@code https://}
 * prefix, so bare domains are also detected.
 *
 * Note: this looser detection may collide with package-like names in text.
 *
 * @return check implementation
 * @since 0.25.0
 */
public static IWordCheck urlNoPrefix() {
    return WordCheckUrlNoPrefix.getInstance();
}
} }

View File

@@ -18,19 +18,19 @@ public class WordCheckCombine extends AbstractWordCheckCombine {
List<IWordCheck> wordCheckList = new ArrayList<>(); List<IWordCheck> wordCheckList = new ArrayList<>();
if(context.enableWordCheck()) { if(context.enableWordCheck()) {
wordCheckList.add(WordChecks.word()); wordCheckList.add(context.wordCheckWord());
} }
if(context.enableNumCheck()) { if(context.enableNumCheck()) {
wordCheckList.add(WordChecks.num()); wordCheckList.add(context.wordCheckNum());
} }
if(context.enableEmailCheck()) { if(context.enableEmailCheck()) {
wordCheckList.add(WordChecks.email()); wordCheckList.add(context.wordCheckEmail());
} }
if(context.enableUrlCheck()) { if(context.enableUrlCheck()) {
wordCheckList.add(WordChecks.url()); wordCheckList.add(context.wordCheckUrl());
} }
if(context.enableIpv4Check()) { if(context.enableIpv4Check()) {
wordCheckList.add(WordChecks.ipv4()); wordCheckList.add(context.wordCheckIpv4());
} }
return wordCheckList; return wordCheckList;

View File

@@ -1,6 +1,5 @@
package com.github.houbb.sensitive.word.support.data; package com.github.houbb.sensitive.word.support.data;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.sensitive.word.api.IWordData; import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
@@ -31,9 +30,9 @@ public abstract class AbstractWordData implements IWordData {
/** /**
* 删除敏感词 * 删除敏感词
* @param word 敏感词 * @param collection 集合
*/ */
protected abstract void doRemoveWord(String word); protected abstract void doRemoveWord(Collection<String> collection);
/** /**
* 新增敏感词 * 新增敏感词
@@ -49,12 +48,12 @@ public abstract class AbstractWordData implements IWordData {
} }
@Override @Override
public void removeWord(String word) { public void removeWord(Collection<String> collection) {
if(StringUtil.isEmpty(word)) { if(CollectionUtil.isEmpty(collection)) {
return; return;
} }
doRemoveWord(word); doRemoveWord(collection);
} }
@Override @Override

View File

@@ -89,7 +89,7 @@ public class WordDataHashMap extends AbstractWordData {
} }
@Override @Override
protected void doRemoveWord(String word) { protected void doRemoveWord(Collection<String> collection) {
} }

View File

@@ -4,7 +4,6 @@ import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.lang.ObjectUtil; import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.api.IWordContext; import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordData;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext; import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum; import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
@@ -84,49 +83,7 @@ public class WordDataTree extends AbstractWordData {
this.root = newRoot; this.root = newRoot;
} }
@Override
protected void doRemoveWord(String word) {
WordDataTreeNode tempNode = root;
//需要删除的
Map<Character, WordDataTreeNode> map = new HashMap<>();
char[] chars = word.toCharArray();
int length = chars.length;
for (int i = 0; i < length; i++) {
//不存在第一个词
WordDataTreeNode subNode = tempNode.getSubNode(chars[i]);
if (subNode == null) {
return;
}
if (i == (length - 1)) {
//尾字符判断是否结束
if (!subNode.end()) {
return;
}
if (subNode.getNodeSize() > 0) {
//尾字符下还存在字符,即标识即可
subNode.end(false);
return;
}
}
if (subNode.end()) {
map.clear();
}
map.put(chars[i], tempNode);
tempNode = subNode;
}
for (Map.Entry<Character, WordDataTreeNode> entry : map.entrySet()) {
WordDataTreeNode value = entry.getValue();
//节点只有一个就置空
if (value.getNodeSize() == 1) {
value.clearNode();
return;
}
//多个就删除
value.removeNode(entry.getKey());
}
}
/** /**
* 新增敏感词 * 新增敏感词
@@ -143,6 +100,21 @@ public class WordDataTree extends AbstractWordData {
} }
} }
/**
 * Removes every non-blank word in the collection from the trie.
 *
 * Synchronized so concurrent removals do not corrupt the shared tree
 * structure while nodes are being pruned.
 *
 * @param collection words to remove; empty entries are skipped
 */
@Override
protected synchronized void doRemoveWord(Collection<String> collection) {
    for (String word : collection) {
        if (StringUtil.isEmpty(word)) {
            continue;
        }
        removeWord(this.root, word);
    }
}
/** /**
* 获取当前的 Map * 获取当前的 Map
* @param nowNode 当前节点 * @param nowNode 当前节点
@@ -211,4 +183,48 @@ public class WordDataTree extends AbstractWordData {
tempNode.end(true); tempNode.end(true);
} }
/**
 * Removes a single word from the trie rooted at {@code root}.
 *
 * Walks the word's characters down the tree, recording for each step the
 * parent node whose child entry could be pruned. Whenever an intermediate
 * node terminates a shorter stored word, the record is cleared so those
 * shared prefix nodes are never deleted.
 *
 * @param root trie root node
 * @param word word to delete (expected to be format-normalized by the caller)
 */
private void removeWord(WordDataTreeNode root, String word){
    WordDataTreeNode tempNode = root;
    // Parent nodes whose child entry may need pruning after the walk.
    Map<Character, WordDataTreeNode> map = new HashMap<>();
    char[] chars = word.toCharArray();
    int length = chars.length;
    for (int i = 0; i < length; i++) {
        // Next character not present: the word is not stored — nothing to do.
        WordDataTreeNode subNode = tempNode.getSubNode(chars[i]);
        if (subNode == null) {
            return;
        }

        if (i == (length - 1)) {
            // Last character must terminate a stored word, otherwise no match.
            if (!subNode.end()) {
                return;
            }
            if (subNode.getNodeSize() > 0) {
                // Longer words continue past this node — just clear the end flag.
                subNode.end(false);
                return;
            }
        }

        if (subNode.end()) {
            // A shorter word ends here; keep everything up to this point.
            map.clear();
        }
        map.put(chars[i], tempNode);
        tempNode = subNode;
    }

    for (Map.Entry<Character, WordDataTreeNode> entry : map.entrySet()) {
        WordDataTreeNode value = entry.getValue();
        // Single child: drop the whole child map in one go.
        if (value.getNodeSize() == 1) {
            value.clearNode();
            return;
        }
        // Multiple children: remove only this word's branch.
        value.removeNode(entry.getKey());
    }
}
} }

View File

@@ -1,32 +1,48 @@
package com.github.houbb.sensitive.word.support.result; package com.github.houbb.sensitive.word.support.result;
/**
* 说明:统一让黑白名单一次遍历,性能优化
*
* @since 0.24.2
*/
public class WordLengthResult { public class WordLengthResult {
/**
* 白名单长度
*/
private int wordAllowLen; private int wordAllowLen;
/**
* 黑名单长度
*/
private int wordDenyLen; private int wordDenyLen;
public static WordLengthResult newInstance() {
private WordLengthResult(){}
public static WordLengthResult newInstance(){
return new WordLengthResult(); return new WordLengthResult();
} }
public int wordAllowLen() {
public int wordAllowLen(){
return this.wordAllowLen; return this.wordAllowLen;
} }
public WordLengthResult wordAllowLen(int wordAllowLen){
this.wordAllowLen=wordAllowLen; public WordLengthResult wordAllowLen(int wordAllowLen) {
this.wordAllowLen = wordAllowLen;
return this; return this;
} }
public int wordDenyLen(){ public int wordDenyLen() {
return this.wordDenyLen; return this.wordDenyLen;
} }
public WordLengthResult wordDenyLen(int wordDenyLen){
this.wordDenyLen=wordDenyLen; public WordLengthResult wordDenyLen(int wordDenyLen) {
this.wordDenyLen = wordDenyLen;
return this; return this;
} }
@Override
public String toString() {
return "WordLengthResult{" +
"wordAllowLen=" + wordAllowLen +
", wordDenyLen=" + wordDenyLen +
'}';
}
} }

View File

@@ -71,6 +71,8 @@ public class WordTags {
/** /**
* 根据标准的约定行处理 * 根据标准的约定行处理
* @param lines 行信息 * @param lines 行信息
* @param wordSplit 单词分割
* @param tagSplit 标签分割
* @return 结果 * @return 结果
*/ */
public static IWordTag lines(final Collection<String> lines, final String wordSplit, final String tagSplit) { public static IWordTag lines(final Collection<String> lines, final String wordSplit, final String tagSplit) {

View File

@@ -72,44 +72,37 @@ public class BenchmarkBasicTest {
} }
/** /**
* * 黑白名单一次遍历 优化前300*他们在地铁口交易查10000次26183 * 黑白名单一次遍历 优化前300*他们在地铁口交易查10000次26183
* * 黑白名单一次遍历 优化后300*他们在地铁口交易查10000次15705 * 黑白名单一次遍历 优化后300*他们在地铁口交易查10000次15705
* * @since 0.24.2
*/ */
@Test @Test
public void costTimeOneTraceTest() { public void costTimeOneTraceTest() {
StringBuilder sb=new StringBuilder(); StringBuilder sb = new StringBuilder();
for(int i=0;i<300;i++){ for (int i = 0; i < 300; i++) {
sb.append("他们在地铁口交易").append(i); sb.append("他们在地铁口交易").append(i);
} }
String text = sb.toString(); String text = sb.toString();
// 1W 次 // 1W 次
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance() SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance().wordDeny(new IWordDeny() {
.wordDeny(new IWordDeny() {
@Override @Override
public List<String> deny() { public List<String> deny() {
return Collections.singletonList("口交"); return Collections.singletonList("口交");
} }
}) }).wordAllow(new IWordAllow() {
.wordAllow(new IWordAllow() {
@Override @Override
public List<String> allow() { public List<String> allow() {
return Collections.singletonList("地铁口交易"); return Collections.singletonList("地铁口交易");
} }
}) }).enableWordCheck(true).enableNumCheck(false).enableUrlCheck(false).enableEmailCheck(false).init();
.enableWordCheck(true)
.enableNumCheck(false)
.enableUrlCheck(false)
.enableEmailCheck(false)
.init();
for(int i = 0; i < 10000; i++) { for (int i = 0; i < 10000; i++) {
sensitiveWordBs.findAll(text); sensitiveWordBs.findAll(text);
} }
long end = System.currentTimeMillis(); long end = System.currentTimeMillis();
System.out.println("------------------ COST: " + (end-start)); System.out.println("------------------ COST: " + (end - start));
} }
/** /**

View File

@@ -4,8 +4,10 @@ import com.github.houbb.heaven.util.io.FileUtil;
import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.sensitive.word.data.WordCountDto; import com.github.houbb.sensitive.word.data.WordCountDto;
import com.github.houbb.sensitive.word.support.allow.WordAllows; import com.github.houbb.sensitive.word.support.allow.WordAllows;
import com.github.houbb.sensitive.word.support.check.WordChecks;
import com.github.houbb.sensitive.word.support.deny.WordDenys; import com.github.houbb.sensitive.word.support.deny.WordDenys;
import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores; import com.github.houbb.sensitive.word.support.ignore.SensitiveWordCharIgnores;
import com.github.houbb.sensitive.word.support.replace.WordReplaces;
import com.github.houbb.sensitive.word.support.resultcondition.WordResultConditions; import com.github.houbb.sensitive.word.support.resultcondition.WordResultConditions;
import com.github.houbb.sensitive.word.support.tag.WordTags; import com.github.houbb.sensitive.word.support.tag.WordTags;
import org.junit.Assert; import org.junit.Assert;
@@ -39,46 +41,19 @@ public class SensitiveWordBsConfigTest {
.enableUrlCheck(false) .enableUrlCheck(false)
.enableIpv4Check(false) .enableIpv4Check(false)
.enableWordCheck(true) .enableWordCheck(true)
.wordCheckNum(WordChecks.num())
.wordCheckEmail(WordChecks.email())
.wordCheckUrl(WordChecks.url())
.wordCheckIpv4(WordChecks.ipv4())
.wordCheckWord(WordChecks.word())
.numCheckLen(8) .numCheckLen(8)
.wordTag(WordTags.none()) .wordTag(WordTags.none())
.charIgnore(SensitiveWordCharIgnores.defaults()) .charIgnore(SensitiveWordCharIgnores.defaults())
.wordResultCondition(WordResultConditions.alwaysTrue()) .wordResultCondition(WordResultConditions.alwaysTrue())
.wordAllow(WordAllows.defaults()) .wordAllow(WordAllows.defaults())
.wordDeny(WordDenys.defaults()) .wordDeny(WordDenys.defaults())
.wordReplace(WordReplaces.defaults())
.init(); .init();
// String dir = "D:\\code\\github\\houbb.github.io\\_posts";
// File[] files = new File(dir).listFiles();
//
// Set<String> wordSet = new HashSet<>();
//
// Map<String, Integer> wordCountMap = new HashMap<>();
// for(File file : files) {
// String content = FileUtil.getFileContent(file);
// List<String> allWords = wordBs.findAll(content);
//
// for(String word : allWords) {
// Integer integer = wordCountMap.get(word);
// if(integer == null) {
// integer = 0;
// }
//
// integer++;
// wordCountMap.put(word, integer);
// }
//
// System.out.println(file.getName());
// }
//
//// List<WordCountDto> wordCountDtoList = new ArrayList<>();
// for(Map.Entry<String, Integer> entry : wordCountMap.entrySet()) {
// if(entry.getValue() >= 3) {
// System.out.println(entry.getKey() + " : " + entry.getValue());
// }
// }
// Collections.sort(wordCountDtoList);
// System.out.println(wordCountDtoList);
} }
@Test @Test

View File

@@ -0,0 +1,37 @@
package com.github.houbb.sensitive.word.bs;
import com.github.houbb.sensitive.word.support.check.WordChecks;
import org.junit.Assert;
import org.junit.Test;
import java.util.List;
/**
 * Tests URL detection when no protocol prefix is required.
 *
 * @author Administrator
 * @since 0.25.0
 */
public class SensitiveWordBsUrlNoPrefixTest {

    /**
     * Bare domains (without http/https) must be detected and masked when the
     * {@code WordChecks.urlNoPrefix()} strategy is configured.
     *
     * @since 0.25.0
     */
    @Test
    public void urlNoPrefixTest() {
        final String text = "点击链接 https://www.baidu.com 查看答案,当然也可以是 baidu.com、www.baidu.com";

        // Enable URL checking and switch to the prefix-less detection strategy.
        final SensitiveWordBs bs = SensitiveWordBs.newInstance()
                .enableUrlCheck(true)
                .wordCheckUrl(WordChecks.urlNoPrefix())
                .init();

        final List<String> found = bs.findAll(text);
        Assert.assertEquals("[www.baidu.com, baidu.com, www.baidu.com]", found.toString());
        Assert.assertEquals("点击链接 https://************* 查看答案,当然也可以是 *********、*************", bs.replace(text));
    }
}

View File

@@ -0,0 +1,28 @@
package com.github.houbb.sensitive.word.bs;
import com.github.houbb.sensitive.word.api.IWordDeny;
import org.junit.Assert;
import org.junit.Test;
import java.util.Arrays;
import java.util.List;
/**
 * Tests matching behavior when deny words overlap by prefix
 * ("我的世界" vs "我的").
 */
public class SensitiveWordMaxFirstTest {

    /**
     * Defect fixed: the only assertion was commented out, so this test could
     * never fail. The exact match list depends on the matching strategy
     * (max-first vs min-first) — TODO confirm the expected value — but with
     * "我的" in the deny list and present in the text, at least one match
     * must be found, so that is pinned here.
     */
    @Test
    public void maxFirstTest() {
        SensitiveWordBs bs = SensitiveWordBs.newInstance()
                .wordDeny(new IWordDeny() {
                    @Override
                    public List<String> deny() {
                        return Arrays.asList("我的世界", "我的");
                    }
                }).init();

        String text = "我的世界我的好玩";
        List<String> textList = bs.findAll(text);

        // Overlapping deny words must still yield at least one hit.
        Assert.assertFalse(textList.isEmpty());
    }
}