mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
release branch 0.0.5
This commit is contained in:
17
README.md
17
README.md
@@ -30,6 +30,8 @@
|
||||
|
||||
- 支持英文大小写互换
|
||||
|
||||
- 支持数字各种形式的互换
|
||||
|
||||
## 变更日志
|
||||
|
||||
[CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/doc/CHANGE_LOG.md)
|
||||
@@ -48,7 +50,7 @@
|
||||
<dependency>
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.0.4</version>
|
||||
<version>0.0.5</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@@ -134,9 +136,18 @@ String word = SensitiveWordBs.newInstance().findFirst(text);
|
||||
Assert.assertEquals("fuck", word);
|
||||
```
|
||||
|
||||
# 后期 road-map
|
||||
## 忽略数字的写法
|
||||
|
||||
- 数字的转换处理
|
||||
这里实现了数字常见形式的转换。
|
||||
|
||||
```java
|
||||
final String text = "这个是我的微信:9⓿二肆⁹₈③⑸⒋➃㈤㊄";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[9⓿二肆⁹₈③⑸⒋➃㈤㊄]", wordList.toString());
|
||||
```
|
||||
|
||||
# 后期 road-map
|
||||
|
||||
- 繁简体互换
|
||||
|
||||
|
||||
@@ -41,4 +41,8 @@
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | D | 移除单个字符 `v` | 2020-1-9 09:34:35 | |
|
||||
| 1 | D | 移除单个字符 `v` | 2020-1-9 09:34:35 | |
|
||||
| 2 | D | 移除单个字符 `我` | 2020-1-10 09:34:35 | |
|
||||
| 3 | O | 责任链模式优化代码实现 | 2020-1-10 09:34:35 | |
|
||||
| 4 | A | 支持数字格式化转换 | 2020-1-10 09:34:35 | |
|
||||
| 5 | A | 支持数字敏感词验证 | 2020-1-10 09:34:35 | |
|
||||
@@ -0,0 +1,7 @@
|
||||
# 转换为数字
|
||||
|
||||
所有中文/符号转换为数字。
|
||||
|
||||
# 是否为多个数字的判断
|
||||
|
||||
连续 6 位及以上的数字。
|
||||
7
doc/issues/roadmap/v013-邮箱URL的转换实现.md
Normal file
7
doc/issues/roadmap/v013-邮箱URL的转换实现.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# 是否为邮箱 check
|
||||
|
||||
# 是否为 URL check
|
||||
|
||||
可以直接开辟另一道验证方式。
|
||||
|
||||
直接 regex+全文检索实现。
|
||||
2
pom.xml
2
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.0.5-SNAPSHOT</version>
|
||||
<version>0.0.5</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
|
||||
@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
|
||||
|
||||
:: 版本号信息(需要手动指定)
|
||||
:::: 旧版本名称
|
||||
SET version=0.0.4
|
||||
SET version=0.0.5
|
||||
:::: 新版本名称
|
||||
SET newVersion=0.0.5
|
||||
SET newVersion=0.0.6
|
||||
:::: 组织名称
|
||||
SET groupName=com.github.houbb
|
||||
:::: 项目名称
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
/**
|
||||
* 单词格式化
|
||||
* (1)忽略大小写
|
||||
* (2)忽略全角半角
|
||||
* (3)忽略停顿词
|
||||
* (4)忽略数字转换。
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
public interface ICharFormat {
|
||||
|
||||
/**
|
||||
* 针对 char 格式化
|
||||
* @param original 原始 char
|
||||
* @param context 上下文
|
||||
* @return 格式化后的 char
|
||||
* @since 0.0.5
|
||||
*/
|
||||
char format(final char original,
|
||||
final IWordContext context);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
|
||||
/**
|
||||
* 敏感信息监测接口
|
||||
* (1)敏感词
|
||||
* (2)数字(连续6位及其以上)
|
||||
* (3)邮箱
|
||||
* (4)URL
|
||||
*
|
||||
* 可以使用责任链的模式,循环调用。
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
public interface ISensitiveCheck {
|
||||
|
||||
/**
|
||||
* 检查敏感词数量
|
||||
* <p>
|
||||
* (1)如果未命中敏感词,直接返回 0
|
||||
* (2)命中敏感词,则返回敏感词的长度。
|
||||
* <p>
|
||||
* ps: 这里结果进行优化,
|
||||
* 1. 是否包含敏感词。
|
||||
* 2. 敏感词的长度
|
||||
* 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复)
|
||||
*
|
||||
* @param txt 文本信息
|
||||
* @param beginIndex 开始下标
|
||||
* @param validModeEnum 验证模式
|
||||
* @param context 执行上下文
|
||||
* @return 敏感信息对应的长度
|
||||
* @since 0.0.5
|
||||
*/
|
||||
int checkSensitive(final String txt,
|
||||
final int beginIndex,
|
||||
final ValidModeEnum validModeEnum,
|
||||
final IWordContext context);
|
||||
|
||||
}
|
||||
@@ -1,5 +1,7 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author binbin.hou
|
||||
* @since 0.0.4
|
||||
@@ -20,6 +22,14 @@ public interface IWordContext {
|
||||
*/
|
||||
boolean ignoreWidth();
|
||||
|
||||
/**
|
||||
* 是否忽略数字格式
|
||||
* @return 是否
|
||||
* @since 0.0.5
|
||||
*/
|
||||
boolean ignoreNumStyle();
|
||||
|
||||
|
||||
/**
|
||||
* 设置是否忽略大小写
|
||||
* @param ignoreCase 是否忽略大小写
|
||||
@@ -36,4 +46,41 @@ public interface IWordContext {
|
||||
*/
|
||||
IWordContext ignoreWidth(boolean ignoreWidth);
|
||||
|
||||
/**
|
||||
* 设置是否忽略数字格式
|
||||
* @param ignoreNumStyle 是否忽略数字格式
|
||||
* @return this
|
||||
* @since 0.0.5
|
||||
*/
|
||||
IWordContext ignoreNumStyle(boolean ignoreNumStyle);
|
||||
|
||||
/**
|
||||
* 获取敏感词信息
|
||||
* @return 敏感词
|
||||
* @since 0.0.5
|
||||
*/
|
||||
Map sensitiveWordMap();
|
||||
|
||||
/**
|
||||
* 敏感词信息
|
||||
* @param map map 信息
|
||||
* @return this
|
||||
* @since 0.0.5
|
||||
*/
|
||||
IWordContext sensitiveWordMap(final Map map);
|
||||
|
||||
/**
|
||||
* 敏感数字检测
|
||||
* @return 数字检测
|
||||
* @since 0.0.5
|
||||
*/
|
||||
boolean sensitiveNumCheck();
|
||||
|
||||
/**
|
||||
* 设置敏感数字检测
|
||||
* @return this
|
||||
* @since 0.0.5
|
||||
*/
|
||||
IWordContext sensitiveNumCheck(final boolean sensitiveNumCheck);
|
||||
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ import java.util.List;
|
||||
* @author binbin.hou
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public interface IWordMap {
|
||||
public interface IWordMap extends ISensitiveCheck {
|
||||
|
||||
|
||||
/**
|
||||
|
||||
@@ -79,8 +79,13 @@ public class SensitiveWordBs {
|
||||
*/
|
||||
private static IWordContext buildDefaultContext() {
|
||||
IWordContext wordContext = SensitiveWordContext.newInstance();
|
||||
// 格式统一化
|
||||
wordContext.ignoreCase(true);
|
||||
wordContext.ignoreWidth(true);
|
||||
wordContext.ignoreNumStyle(true);
|
||||
|
||||
// 开启校验
|
||||
wordContext.sensitiveNumCheck(true);
|
||||
|
||||
return wordContext;
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@ package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 上下文
|
||||
* @author binbin.hou
|
||||
@@ -21,6 +23,23 @@ public class SensitiveWordContext implements IWordContext {
|
||||
*/
|
||||
private boolean ignoreWidth;
|
||||
|
||||
/**
|
||||
* 是否忽略数字格式
|
||||
* @since 0.0.5
|
||||
*/
|
||||
private boolean ignoreNumStyle;
|
||||
|
||||
/**
|
||||
* 敏感词信息
|
||||
* @since 0.0.5
|
||||
*/
|
||||
private Map sensitiveWordMap;
|
||||
|
||||
/**
|
||||
* 是否进行敏感数字检测
|
||||
* @since 0.0.5
|
||||
*/
|
||||
private boolean sensitiveNumCheck;
|
||||
/**
|
||||
* 私有化构造器
|
||||
* @since 0.0.4
|
||||
@@ -59,22 +78,37 @@ public class SensitiveWordContext implements IWordContext {
|
||||
return this;
|
||||
}
|
||||
|
||||
private static class ContextHolder {
|
||||
private static final SensitiveWordContext INSTANCE = new SensitiveWordContext();
|
||||
|
||||
static {
|
||||
INSTANCE.ignoreCase(true);
|
||||
INSTANCE.ignoreWidth(true);
|
||||
}
|
||||
@Override
|
||||
public boolean ignoreNumStyle() {
|
||||
return ignoreNumStyle;
|
||||
}
|
||||
|
||||
/**
|
||||
* 默认配置
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static SensitiveWordContext defaultContext() {
|
||||
return ContextHolder.INSTANCE;
|
||||
@Override
|
||||
public SensitiveWordContext ignoreNumStyle(boolean ignoreNumStyle) {
|
||||
this.ignoreNumStyle = ignoreNumStyle;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map sensitiveWordMap() {
|
||||
return sensitiveWordMap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveWordContext sensitiveWordMap(Map sensitiveWordMap) {
|
||||
this.sensitiveWordMap = sensitiveWordMap;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean sensitiveNumCheck() {
|
||||
return sensitiveNumCheck;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SensitiveWordContext sensitiveNumCheck(boolean sensitiveNumCheck) {
|
||||
this.sensitiveNumCheck = sensitiveNumCheck;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -82,6 +116,8 @@ public class SensitiveWordContext implements IWordContext {
|
||||
return "SensitiveWordContext{" +
|
||||
"ignoreCase=" + ignoreCase +
|
||||
", ignoreWidth=" + ignoreWidth +
|
||||
", ignoreNumStyle=" + ignoreNumStyle +
|
||||
", sensitiveNumCheck=" + sensitiveNumCheck +
|
||||
'}';
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ public final class AppConst {
|
||||
* 字典的大小
|
||||
* @since 0.0.1
|
||||
*/
|
||||
public static final int DICT_SIZE = 65711;
|
||||
public static final int DICT_SIZE = 65709;
|
||||
|
||||
/**
|
||||
* 英语词典的大小
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
package com.github.houbb.sensitive.word.support.check;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.support.instance.impl.Instances;
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 敏感词检测责任链模式
|
||||
*
|
||||
* 这里可以提供一个公共的父类。
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveCheckChain implements ISensitiveCheck {
|
||||
|
||||
@Override
|
||||
public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// 初始化责任链
|
||||
List<ISensitiveCheck> sensitiveChecks = Guavas.newArrayList();
|
||||
// 默认添加敏感词校验
|
||||
sensitiveChecks.add(Instances.singleton(SensitiveWordCheck.class));
|
||||
if(context.sensitiveNumCheck()) {
|
||||
sensitiveChecks.add(Instances.singleton(SensitiveNumCheck.class));
|
||||
}
|
||||
|
||||
// 循环调用
|
||||
for(ISensitiveCheck sensitiveCheck : sensitiveChecks) {
|
||||
int result = sensitiveCheck.checkSensitive(txt, beginIndex, validModeEnum, context);
|
||||
|
||||
if(result > 0) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
// 默认返回 0
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,67 @@
|
||||
package com.github.houbb.sensitive.word.support.check;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.support.instance.impl.Instances;
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.format.CharFormatChain;
|
||||
import com.github.houbb.sensitive.word.utils.NumUtils;
|
||||
|
||||
/**
|
||||
* 敏感词监测实现
|
||||
*
|
||||
* 这里可以提供一个公共的父类。
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveNumCheck implements ISensitiveCheck {
|
||||
|
||||
@Override
|
||||
public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
for (int i = beginIndex; i < txt.length(); i++) {
|
||||
char c = txt.charAt(i);
|
||||
char charKey = Instances.singleton(CharFormatChain.class).format(c, context);
|
||||
|
||||
// 如果是数字
|
||||
// 满足进入的条件
|
||||
if (Character.isDigit(charKey)) {
|
||||
lengthCount++;
|
||||
|
||||
// 满足结束的条件
|
||||
boolean isCondition = isCondition(lengthCount);
|
||||
if (isCondition) {
|
||||
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
|
||||
actualLength = lengthCount;
|
||||
|
||||
// 这里确实需要一种验证模式,主要是为了最大匹配从而达到最佳匹配的效果。
|
||||
if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 直接跳出循环
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return actualLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* 这里指定一个阈值条件
|
||||
* @param lengthCount 长度
|
||||
* @return 是否满足条件
|
||||
* @since 0.0.5
|
||||
*/
|
||||
private boolean isCondition(final int lengthCount) {
|
||||
return lengthCount >= 6;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
package com.github.houbb.sensitive.word.support.check;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.support.instance.impl.Instances;
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveCheck;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.format.CharFormatChain;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 敏感词监测实现
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveWordCheck implements ISensitiveCheck {
|
||||
|
||||
@Override
|
||||
public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
Map nowMap = context.sensitiveWordMap();
|
||||
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
for (int i = beginIndex; i < txt.length(); i++) {
|
||||
char c = txt.charAt(i);
|
||||
char charKey = Instances.singleton(CharFormatChain.class).format(c, context);
|
||||
|
||||
// 判断该字是否存在于敏感词库中
|
||||
// 并且将 nowMap 替换为新的 map,进入下一层的循环。
|
||||
nowMap = (Map) nowMap.get(charKey);
|
||||
if (ObjectUtil.isNotNull(nowMap)) {
|
||||
lengthCount++;
|
||||
|
||||
// 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测
|
||||
boolean isEnd = (boolean) nowMap.get(AppConst.IS_END);
|
||||
if (isEnd) {
|
||||
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
|
||||
// eg: 敏感词 敏感词xxx
|
||||
// 如果是 【敏感词x】也会被匹配。
|
||||
actualLength = lengthCount;
|
||||
|
||||
// 这里确实需要一种验证模式,主要是为了最大匹配从而达到最佳匹配的效果。
|
||||
if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 直接跳出循环
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return actualLength;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.support.instance.impl.Instances;
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.ICharFormat;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 格式化责任链
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class CharFormatChain implements ICharFormat {
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
char result = original;
|
||||
|
||||
List<ICharFormat> charFormats = Guavas.newArrayList();
|
||||
if(context.ignoreCase()) {
|
||||
charFormats.add(Instances.singleton(IgnoreCaseCharFormat.class));
|
||||
charFormats.add(Instances.singleton(IgnoreWidthCharFormat.class));
|
||||
charFormats.add(Instances.singleton(IgnoreNumStyleCharFormat.class));
|
||||
}
|
||||
|
||||
// 循环执行
|
||||
for(ICharFormat charFormat : charFormats) {
|
||||
result = charFormat.format(result, context);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.ICharFormat;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
|
||||
/**
|
||||
* 忽略大小写
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class IgnoreCaseCharFormat implements ICharFormat {
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
return Character.toLowerCase(original);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.ICharFormat;
|
||||
import com.github.houbb.sensitive.word.utils.NumUtils;
|
||||
|
||||
/**
|
||||
* 忽略数字的样式
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class IgnoreNumStyleCharFormat implements ICharFormat {
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
return NumUtils.getMappingChar(original);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.ICharFormat;
|
||||
|
||||
/**
|
||||
* 格式化责任链
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class IgnoreWidthCharFormat implements ICharFormat {
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
return CharUtil.toHalfWidth(original);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,6 +1,7 @@
|
||||
package com.github.houbb.sensitive.word.support.map;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.support.instance.impl.Instances;
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
@@ -11,6 +12,7 @@ import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.check.SensitiveCheckChain;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
@@ -116,7 +118,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
}
|
||||
|
||||
for (int i = 0; i < string.length(); i++) {
|
||||
int checkResult = checkSensitiveWord(string, i, ValidModeEnum.FAIL_FAST, context);
|
||||
int checkResult = checkSensitive(string, i, ValidModeEnum.FAIL_FAST, context);
|
||||
// 快速返回
|
||||
if (checkResult > 0) {
|
||||
return true;
|
||||
@@ -176,7 +178,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
|
||||
List<String> resultList = Guavas.newArrayList();
|
||||
for (int i = 0; i < text.length(); i++) {
|
||||
int wordLength = checkSensitiveWord(text, i, ValidModeEnum.FAIL_OVER, context);
|
||||
int wordLength = checkSensitive(text, i, ValidModeEnum.FAIL_OVER, context);
|
||||
|
||||
// 命中
|
||||
if (wordLength > 0) {
|
||||
@@ -203,86 +205,6 @@ public class SensitiveWordMap implements IWordMap {
|
||||
return resultList;
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查敏感词数量
|
||||
* <p>
|
||||
* (1)如果未命中敏感词,直接返回 0
|
||||
* (2)命中敏感词,则返回敏感词的长度。
|
||||
*
|
||||
* ps: 这里结果进行优化,
|
||||
* 1. 是否包含敏感词。
|
||||
* 2. 敏感词的长度
|
||||
* 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复)
|
||||
*
|
||||
* @param txt 文本信息
|
||||
* @param beginIndex 开始下标
|
||||
* @param validModeEnum 验证模式
|
||||
* @param context 执行上下文
|
||||
* @return 敏感词对应的长度
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private int checkSensitiveWord(final String txt, final int beginIndex,
|
||||
final ValidModeEnum validModeEnum,
|
||||
final IWordContext context) {
|
||||
Map nowMap = innerWordMap;
|
||||
|
||||
// 记录敏感词的长度
|
||||
int lengthCount = 0;
|
||||
int actualLength = 0;
|
||||
|
||||
for (int i = beginIndex; i < txt.length(); i++) {
|
||||
char c = txt.charAt(i);
|
||||
char charKey = getActualChar(c, context);
|
||||
|
||||
// 判断该字是否存在于敏感词库中
|
||||
// 并且将 nowMap 替换为新的 map,进入下一层的循环。
|
||||
nowMap = (Map) nowMap.get(charKey);
|
||||
if (ObjectUtil.isNotNull(nowMap)) {
|
||||
lengthCount++;
|
||||
|
||||
// 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测
|
||||
boolean isEnd = (boolean) nowMap.get(AppConst.IS_END);
|
||||
if (isEnd) {
|
||||
// 只在匹配到结束的时候才记录长度,避免不完全匹配导致的问题。
|
||||
// eg: 敏感词 敏感词xxx
|
||||
// 如果是 【敏感词x】也会被匹配。
|
||||
actualLength = lengthCount;
|
||||
|
||||
// 这里确实需要一种验证模式,主要是为了最大匹配从而达到最佳匹配的效果。
|
||||
if (ValidModeEnum.FAIL_FAST.equals(validModeEnum)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 直接跳出循环
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return actualLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取实际对应的符号
|
||||
* @param c 编号
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private char getActualChar(final char c,
|
||||
final IWordContext context) {
|
||||
char resultChar = c;
|
||||
|
||||
if(context.ignoreCase()) {
|
||||
resultChar = Character.toLowerCase(resultChar);
|
||||
}
|
||||
if(context.ignoreWidth()) {
|
||||
resultChar = CharUtil.toHalfWidth(resultChar);
|
||||
}
|
||||
|
||||
return resultChar;
|
||||
}
|
||||
|
||||
/**
|
||||
* 直接替换敏感词,返回替换后的结果
|
||||
* @param target 文本信息
|
||||
@@ -301,7 +223,7 @@ public class SensitiveWordMap implements IWordMap {
|
||||
for (int i = 0; i < target.length(); i++) {
|
||||
char currentChar = target.charAt(i);
|
||||
// 内层直接从 i 开始往后遍历,这个算法的,获取第一个匹配的单词
|
||||
int wordLength = checkSensitiveWord(target, i, ValidModeEnum.FAIL_OVER, context);
|
||||
int wordLength = checkSensitive(target, i, ValidModeEnum.FAIL_OVER, context);
|
||||
|
||||
// 敏感词
|
||||
if(wordLength > 0) {
|
||||
@@ -319,4 +241,14 @@ public class SensitiveWordMap implements IWordMap {
|
||||
return resultBuilder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int checkSensitive(String txt, int beginIndex, ValidModeEnum validModeEnum, IWordContext context) {
|
||||
// 默认执行敏感词操作
|
||||
context.sensitiveWordMap(innerWordMap);
|
||||
|
||||
// 责任链模式调用
|
||||
return Instances.singleton(SensitiveCheckChain.class)
|
||||
.checkSensitive(txt, beginIndex, validModeEnum, context);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@ package com.github.houbb.sensitive.word.utils;
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@@ -51,11 +53,6 @@ public final class NumUtils {
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789";
|
||||
|
||||
/**
|
||||
@@ -104,4 +101,28 @@ public final class NumUtils {
|
||||
return stringBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查敏感词数量
|
||||
* <p>
|
||||
* (1)如果未命中敏感词,直接返回 0
|
||||
* (2)命中敏感词,则返回敏感词的长度。
|
||||
*
|
||||
* ps: 这里结果进行优化,
|
||||
* 1. 是否包含敏感词。
|
||||
* 2. 敏感词的长度
|
||||
* 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复)
|
||||
*
|
||||
* @param txt 文本信息
|
||||
* @param beginIndex 开始下标
|
||||
* @param validModeEnum 验证模式
|
||||
* @param context 执行上下文
|
||||
* @return 敏感数字对应的长度
|
||||
* @since 0.0.5
|
||||
*/
|
||||
private int getSensitiveNumber(final String txt, final int beginIndex,
|
||||
final ValidModeEnum validModeEnum,
|
||||
final IWordContext context) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -32719,7 +32719,6 @@ z以留吧以其以武
|
||||
成都美女上门qq100996803
|
||||
成都锦天
|
||||
成龙记
|
||||
我
|
||||
我qiuqiu446巴636巴8
|
||||
我qq前五位27279接后接61388共十位
|
||||
我xx你
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
package com.github.houbb.sensitive.word.bs;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* <p> project: sensitive-word-SensitiveWordBsTest </p>
|
||||
* <p> create on 2020/1/7 23:43 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 0.0.5
|
||||
*/
|
||||
public class SensitiveWordBsNumTest {
|
||||
|
||||
/**
|
||||
* 返回所有敏感词
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@Test
|
||||
public void findAllTest() {
|
||||
final String text = "这个是我的微信:9989123456";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[9989123456]", wordList.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* 返回所有敏感词
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@Test
|
||||
public void ignoreNumStyleTest() {
|
||||
final String text = "这个是我的微信:9⓿二肆⁹₈③⑸⒋➃㈤㊄";
|
||||
|
||||
List<String> wordList = SensitiveWordBs.newInstance().findAll(text);
|
||||
Assert.assertEquals("[9⓿二肆⁹₈③⑸⒋➃㈤㊄]", wordList.toString());
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user