release branch 0.2.0

This commit is contained in:
houbb
2022-01-15 11:07:13 +08:00
parent e156e73348
commit 368520fc90
15 changed files with 306 additions and 24 deletions

View File

@@ -126,3 +126,10 @@
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:---|:---|:---|:--|
| 1 | F | 自定义敏感词 allow/deny 进行格式化处理 | 2021-12-11 23:51:58 | |
# release_0.2.0
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:---|:---|:---|:--|
| 1 | A | 允许用户自定义替换策略 | 2022-01-15 23:51:58 | |
| 2 | U | 升级二方数据库依赖 | 2022-01-15 23:51:58 | |

View File

@@ -44,9 +44,9 @@
[CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/doc/CHANGE_LOG.md)
v0.1.1 变更:
v0.2.0 变更:
- 敏感词自定义 Allow/Deny 进行格式化处理
- 支持用户自定义替换策略
# 快速开始
@@ -62,7 +62,7 @@ v0.1.1 变更:
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId>
<version>0.1.1</version>
<version>0.2.0</version>
</dependency>
```
@@ -73,6 +73,7 @@ v0.1.1 变更:
| 方法 | 参数 | 返回值| 说明 |
|:---|:---|:---|:---|
| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 |
| replace(String, ISensitiveWordReplace) | 使用指定的替换策略替换敏感词 | 字符串 | 返回脱敏后的字符串 |
| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 |
@@ -170,6 +171,58 @@ String result = SensitiveWordHelper.replace(text, '0');
Assert.assertEquals("0000迎风飘扬000的画像屹立在000前。", result);
```
### 自定义替换策略
V0.2.0 支持该特性。
场景说明:有时候我们希望不同的敏感词有不同的替换结果。比如【游戏】替换为【电子竞技】,【失业】替换为【灵活就业】。
诚然,提前使用字符串的正则替换也可以,不过性能一般。
使用例子:
```java
/**
* 自定义替换策略
* @since 0.2.0
*/
@Test
public void defineReplaceTest() {
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
ISensitiveWordReplace replace = new MySensitiveWordReplace();
String result = SensitiveWordHelper.replace(text, replace);
Assert.assertEquals("国家旗帜迎风飘扬,教员的画像屹立在***前。", result);
}
```
其中 `MySensitiveWordReplace` 是我们自定义的替换策略,实现如下:
```java
public class MySensitiveWordReplace implements ISensitiveWordReplace {
@Override
public String replace(ISensitiveWordReplaceContext context) {
String sensitiveWord = context.sensitiveWord();
// 自定义不同的敏感词替换策略,可以从数据库等地方读取
if("五星红旗".equals(sensitiveWord)) {
return "国家旗帜";
}
if("毛主席".equals(sensitiveWord)) {
return "教员";
}
// 其他默认使用 * 代替
int wordLength = context.wordLength();
return CharUtil.repeat('*', wordLength);
}
}
```
我们针对其中的部分词做固定映射处理,其他的默认转换为 `*`。
# 更多特性
后续的诸多特性,主要是针对各种情况的处理,尽可能地提升敏感词命中率。
@@ -530,8 +583,6 @@ public class SensitiveWordService {
# 后期 road-map
- 停顿词
- 同音字处理
- 形近字处理
@@ -542,7 +593,7 @@ public class SensitiveWordService {
- 敏感词标签支持
- 邮箱后缀检测
- [ ] DFA 数据结构的另一种实现
# 拓展阅读
@@ -552,4 +603,16 @@ public class SensitiveWordService {
[敏感词库优化流程](https://houbb.github.io/2020/01/07/sensitive-word-slim)
[停止词的思考记录](https://houbb.github.io/2020/01/07/sensitive-word-stopword)
[java 如何实现开箱即用的敏感词控台服务?](https://mp.weixin.qq.com/s/rQo75cfMU_OEbTJa0JGMGg)
![WECHAT](WECHAT.png)
# 相关开源库
[heaven 基础工具包](https://github.com/houbb/heaven)
[opencc4j 繁简体转换](https://github.com/houbb/opencc4j)
[pinyin 拼音工具](https://github.com/houbb/pinyin)
[nlp-hanzi-similar 汉字相似度工具](https://github.com/houbb/nlp-hanzi-similar)

BIN
WECHAT.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 107 KiB

View File

@@ -6,7 +6,7 @@
<groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId>
<version>0.1.1</version>
<version>0.2.0</version>
<properties>
<!--============================== All Plugins START ==============================-->
@@ -25,8 +25,8 @@
<project.compiler.level>1.7</project.compiler.level>
<!--============================== INTER ==============================-->
<heaven.version>0.1.148</heaven.version>
<opencc4j.version>1.7.1</opencc4j.version>
<heaven.version>0.1.154</heaven.version>
<opencc4j.version>1.7.2</opencc4j.version>
<!--============================== OTHER ==============================-->
<junit.version>4.13.1</junit.version>
@@ -104,7 +104,7 @@
<source>${project.compiler.level}</source>
<target>${project.compiler.level}</target>
<encoding>${project.build.sourceEncoding}</encoding>
<compilerArgument>-proc:none</compilerArgument>
<!-- <compilerArgument>-proc:none</compilerArgument>-->
</configuration>
</plugin>

View File

@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
:: 版本号信息(需要手动指定)
:::: 旧版本名称
SET version=0.1.1
SET version=0.2.0
:::: 新版本名称
SET newVersion=0.1.2
SET newVersion=0.2.1
:::: 组织名称
SET groupName=com.github.houbb
:::: 项目名称

View File

@@ -0,0 +1,19 @@
package com.github.houbb.sensitive.word.api;
/**
 * Replacement strategy for sensitive words.
 *
 * <p>An implementation decides what text is substituted for a matched
 * sensitive word, based on the information carried by the supplied context.</p>
 *
 * @author binbin.hou
 * @since 0.2.0
 */
public interface ISensitiveWordReplace {
    /**
     * Produces the replacement text for one matched sensitive word.
     *
     * @param context replacement context describing the matched word
     * @return the text to use in place of the matched word
     * @since 0.2.0
     */
    String replace(ISensitiveWordReplaceContext context);
}

View File

@@ -0,0 +1,25 @@
package com.github.houbb.sensitive.word.api;
/**
 * Context handed to a sensitive-word replacement strategy.
 *
 * <p>Exposes the matched sensitive word and its length so that an
 * {@link ISensitiveWordReplace} implementation can build a replacement.</p>
 *
 * @author binbin.hou
 * @since 0.2.0
 */
public interface ISensitiveWordReplaceContext {
    /**
     * Returns the sensitive word being replaced.
     *
     * @return the sensitive word
     * @since 0.2.0
     */
    String sensitiveWord();
    /**
     * Returns the length of the word being replaced.
     *
     * @return the word length
     * @since 0.2.0
     */
    int wordLength();
}

View File

@@ -59,12 +59,13 @@ public interface IWordMap extends ISensitiveCheck {
* ps: 这里可以添加优化。
*
* @param target 目标字符串
* @param replaceChar 替换为的 char
* @param replace 替换策略
* @param context 上下文
* @return 替换后结果
* @since 0.0.2
*/
String replace(final String target, final char replaceChar,
String replace(final String target,
final ISensitiveWordReplace replace,
final IWordContext context);
}

View File

@@ -8,6 +8,7 @@ import com.github.houbb.sensitive.word.api.*;
import com.github.houbb.sensitive.word.support.allow.WordAllows;
import com.github.houbb.sensitive.word.support.deny.WordDenys;
import com.github.houbb.sensitive.word.support.map.SensitiveWordMap;
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceChar;
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
import com.github.houbb.sensitive.word.utils.InnerFormatUtils;
@@ -393,9 +394,23 @@ public class SensitiveWordBs {
* @since 0.0.2
*/
public String replace(final String target, final char replaceChar) {
    // Wrap the character in the char-based strategy and reuse the
    // strategy-accepting overload so both entry points share one code path.
    return replace(target, new SensitiveWordReplaceChar(replaceChar));
}
/**
* 替换所有内容
*
* @param target 目标字符串
* @param replace 替换策略
* @return 替换后结果
* @since 0.2.0
*/
public String replace(final String target, final ISensitiveWordReplace replace) {
statusCheck();
return sensitiveWordMap.replace(target, replaceChar, context);
return sensitiveWordMap.replace(target, replace, context);
}
/**

View File

@@ -1,5 +1,6 @@
package com.github.houbb.sensitive.word.core;
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
import com.github.houbb.sensitive.word.api.IWordResultHandler;
import com.github.houbb.sensitive.word.bs.SensitiveWordBs;
@@ -59,6 +60,18 @@ public final class SensitiveWordHelper {
return WORD_BS.findFirst(target);
}
/**
 * Replaces all sensitive words in the target using the given strategy.
 *
 * <p>Delegates to {@code WORD_BS.replace(target, replace)}.</p>
 *
 * @param target  the string to process
 * @param replace the replacement strategy
 * @return the string after replacement
 * @since 0.2.0
 */
public static String replace(final String target, final ISensitiveWordReplace replace) {
    return WORD_BS.replace(target, replace);
}
/**
* 替换所有内容
*

View File

@@ -9,14 +9,13 @@ import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.heaven.util.util.MapUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordMap;
import com.github.houbb.sensitive.word.api.IWordResult;
import com.github.houbb.sensitive.word.api.*;
import com.github.houbb.sensitive.word.constant.AppConst;
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckChain;
import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl;
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext;
import com.github.houbb.sensitive.word.support.result.WordResult;
import java.util.Collection;
@@ -156,12 +155,12 @@ public class SensitiveWordMap implements IWordMap {
}
@Override
public String replace(String target, char replaceChar, final IWordContext context) {
public String replace(String target, final ISensitiveWordReplace replace, final IWordContext context) {
if(StringUtil.isEmpty(target)) {
return target;
}
return this.replaceSensitiveWord(target, replaceChar, context);
return this.replaceSensitiveWord(target, replace, context);
}
/**
@@ -213,11 +212,13 @@ public class SensitiveWordMap implements IWordMap {
/**
* 直接替换敏感词,返回替换后的结果
* @param target 文本信息
* @param replace 替换策略
* @param context 上下文
* @return 脱敏后的字符串
* @since 0.0.2
*/
private String replaceSensitiveWord(final String target,
final char replaceChar,
final ISensitiveWordReplace replace,
final IWordContext context) {
if(StringUtil.isEmpty(target)) {
return target;
@@ -241,7 +242,12 @@ public class SensitiveWordMap implements IWordMap {
// 直接使用原始内容,避免 markdown 图片转换失败
resultBuilder.append(string);
} else {
String replaceStr = CharUtil.repeat(replaceChar, wordLength);
// 创建上下文
ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance()
.sensitiveWord(string)
.wordLength(wordLength);
String replaceStr = replace.replace(replaceContext);
resultBuilder.append(replaceStr);
}

View File

@@ -0,0 +1,29 @@
package com.github.houbb.sensitive.word.support.replace;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.lang.CharUtil;
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext;
/**
 * Replacement strategy that masks a sensitive word with a single, fixed character.
 *
 * <p>The replacement is obtained from {@code CharUtil.repeat}, called with the
 * configured character and the word length reported by the context.</p>
 *
 * @author binbin.hou
 * @since 0.2.0
 */
@ThreadSafe
public class SensitiveWordReplaceChar implements ISensitiveWordReplace {

    /**
     * Character used to build the replacement; assigned once in the constructor.
     */
    private final char replaceChar;

    /**
     * Creates a strategy bound to the given replacement character.
     *
     * @param replaceChar the character used for masking
     */
    public SensitiveWordReplaceChar(char replaceChar) {
        this.replaceChar = replaceChar;
    }

    /**
     * Builds the replacement for the word described by the context.
     *
     * @param context replacement context; only its word length is read
     * @return the result of {@code CharUtil.repeat(replaceChar, wordLength)}
     */
    @Override
    public String replace(ISensitiveWordReplaceContext context) {
        return CharUtil.repeat(this.replaceChar, context.wordLength());
    }

}

View File

@@ -0,0 +1,57 @@
package com.github.houbb.sensitive.word.support.replace;
import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext;
/**
 * Default, mutable implementation of the sensitive-word replacement context.
 *
 * <p>Values are populated through the fluent setters, each of which returns
 * this instance so calls can be chained after {@link #newInstance()}.</p>
 *
 * @author binbin.hou
 * @since 0.2.0
 */
public class SensitiveWordReplaceContext implements ISensitiveWordReplaceContext {

    /**
     * The sensitive word.
     * @since 0.2.0
     */
    private String sensitiveWord;

    /**
     * The word length.
     * @since 0.2.0
     */
    private int wordLength;

    /**
     * Creates a new, empty context.
     *
     * @return a fresh instance
     */
    public static SensitiveWordReplaceContext newInstance() {
        return new SensitiveWordReplaceContext();
    }

    @Override
    public String sensitiveWord() {
        return this.sensitiveWord;
    }

    /**
     * Sets the sensitive word.
     *
     * @param sensitiveWord the sensitive word
     * @return this instance, for chaining
     */
    public SensitiveWordReplaceContext sensitiveWord(String sensitiveWord) {
        this.sensitiveWord = sensitiveWord;
        return this;
    }

    @Override
    public int wordLength() {
        return this.wordLength;
    }

    /**
     * Sets the word length.
     *
     * @param wordLength the word length
     * @return this instance, for chaining
     */
    public SensitiveWordReplaceContext wordLength(int wordLength) {
        this.wordLength = wordLength;
        return this;
    }

    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("SensitiveWordReplaceContext{");
        text.append("sensitiveWord='").append(sensitiveWord).append('\'');
        text.append(", wordLength=").append(wordLength);
        text.append('}');
        return text.toString();
    }

}

View File

@@ -1,6 +1,8 @@
package com.github.houbb.sensitive.word.core;
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
import com.github.houbb.sensitive.word.api.IWordResult;
import com.github.houbb.sensitive.word.replace.MySensitiveWordReplace;
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
import org.junit.Assert;
import org.junit.Test;
@@ -148,4 +150,18 @@ public class SensitiveWordHelperTest {
Assert.assertEquals("", word);
}
/**
 * Verifies replacement with a user-defined strategy: words with a fixed
 * mapping are substituted by that mapping, the remaining one is masked.
 * @since 0.2.0
 */
@Test
public void defineReplaceTest() {
    final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
    final String expected = "国家旗帜迎风飘扬,教员的画像屹立在***前。";

    final String actual = SensitiveWordHelper.replace(text, new MySensitiveWordReplace());

    Assert.assertEquals(expected, actual);
}
}

View File

@@ -0,0 +1,31 @@
package com.github.houbb.sensitive.word.replace;
import com.github.houbb.heaven.util.lang.CharUtil;
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext;
/**
 * Sample user-defined replacement strategy used by the tests.
 *
 * <p>Two specific words are mapped to fixed replacements; every other word
 * is replaced by {@code CharUtil.repeat('*', wordLength)}.</p>
 *
 * @author binbin.hou
 * @since 0.2.0
 */
public class MySensitiveWordReplace implements ISensitiveWordReplace {

    @Override
    public String replace(ISensitiveWordReplaceContext context) {
        final String word = context.sensitiveWord();

        // Per-word mappings; a real implementation could load these from a database or similar.
        final String replacement;
        if ("五星红旗".equals(word)) {
            replacement = "国家旗帜";
        } else if ("毛主席".equals(word)) {
            replacement = "教员";
        } else {
            // Everything else falls back to '*' masking based on the word length.
            replacement = CharUtil.repeat('*', context.wordLength());
        }
        return replacement;
    }

}