mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 00:17:35 +08:00
release branch 0.2.0
This commit is contained in:
@@ -126,3 +126,10 @@
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | F | 自定义敏感词 allow/deny 进行格式化处理 | 2021-12-11 23:51:58 | |
|
||||
|
||||
# release_0.2.0
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:---|:---|:---|:--|
|
||||
| 1 | A | 允许用户自定义替换策略 | 2022-01-15 23:51:58 | |
|
||||
| 2 | U | 升级二方数据库依赖 | 2022-01-15 23:51:58 | |
|
||||
|
||||
77
README.md
77
README.md
@@ -44,9 +44,9 @@
|
||||
|
||||
[CHANGE_LOG.md](https://github.com/houbb/sensitive-word/blob/master/doc/CHANGE_LOG.md)
|
||||
|
||||
v0.1.1 变更:
|
||||
v0.2.0 变更:
|
||||
|
||||
- 敏感词自定义 Allow/Deny 进行格式化处理
|
||||
- 支持用户自定义替换策略
|
||||
|
||||
# 快速开始
|
||||
|
||||
@@ -62,7 +62,7 @@ v0.1.1 变更:
|
||||
<dependency>
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.1.1</version>
|
||||
<version>0.2.0</version>
|
||||
</dependency>
|
||||
```
|
||||
|
||||
@@ -73,6 +73,7 @@ v0.1.1 变更:
|
||||
| 方法 | 参数 | 返回值| 说明 |
|
||||
|:---|:---|:---|:---|
|
||||
| contains(String) | 待验证的字符串 | 布尔值 | 验证字符串是否包含敏感词 |
|
||||
| replace(String, ISensitiveWordReplace) | 使用指定的替换策略替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| replace(String, char) | 使用指定的 char 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| replace(String) | 使用 `*` 替换敏感词 | 字符串 | 返回脱敏后的字符串 |
|
||||
| findAll(String) | 待验证的字符串 | 字符串列表 | 返回字符串中所有敏感词 |
|
||||
@@ -170,6 +171,58 @@ String result = SensitiveWordHelper.replace(text, '0');
|
||||
Assert.assertEquals("0000迎风飘扬,000的画像屹立在000前。", result);
|
||||
```
|
||||
|
||||
### 自定义替换策略
|
||||
|
||||
V0.2.0 支持该特性。
|
||||
|
||||
场景说明:有时候我们希望不同的敏感词有不同的替换结果。比如【游戏】替换为【电子竞技】,【失业】替换为【灵活就业】。
|
||||
|
||||
诚然,提前使用字符串的正则替换也可以,不过性能一般。
|
||||
|
||||
使用例子:
|
||||
|
||||
```java
|
||||
/**
|
||||
* 自定义替换策略
|
||||
* @since 0.2.0
|
||||
*/
|
||||
@Test
|
||||
public void defineReplaceTest() {
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
ISensitiveWordReplace replace = new MySensitiveWordReplace();
|
||||
String result = SensitiveWordHelper.replace(text, replace);
|
||||
|
||||
Assert.assertEquals("国家旗帜迎风飘扬,教员的画像屹立在***前。", result);
|
||||
}
|
||||
```
|
||||
|
||||
其中 `MySensitiveWordReplace` 是我们自定义的替换策略,实现如下:
|
||||
|
||||
```java
|
||||
public class MySensitiveWordReplace implements ISensitiveWordReplace {
|
||||
|
||||
@Override
|
||||
public String replace(ISensitiveWordReplaceContext context) {
|
||||
String sensitiveWord = context.sensitiveWord();
|
||||
// 自定义不同的敏感词替换策略,可以从数据库等地方读取
|
||||
if("五星红旗".equals(sensitiveWord)) {
|
||||
return "国家旗帜";
|
||||
}
|
||||
if("毛主席".equals(sensitiveWord)) {
|
||||
return "教员";
|
||||
}
|
||||
|
||||
// 其他默认使用 * 代替
|
||||
int wordLength = context.wordLength();
|
||||
return CharUtil.repeat('*', wordLength);
|
||||
}
|
||||
|
||||
}
|
||||
```
|
||||
|
||||
我们针对其中的部分词做固定映射处理,其他的默认转换为 `*`。
|
||||
|
||||
# 更多特性
|
||||
|
||||
后续的诸多特性,主要是针对各种情况的处理,尽可能地提升敏感词命中率。
|
||||
@@ -530,8 +583,6 @@ public class SensitiveWordService {
|
||||
|
||||
# 后期 road-map
|
||||
|
||||
- 停顿词
|
||||
|
||||
- 同音字处理
|
||||
|
||||
- 形近字处理
|
||||
@@ -542,7 +593,7 @@ public class SensitiveWordService {
|
||||
|
||||
- 敏感词标签支持
|
||||
|
||||
- 邮箱后缀检测
|
||||
- [ ] DFA 数据结构的另一种实现
|
||||
|
||||
# 拓展阅读
|
||||
|
||||
@@ -552,4 +603,16 @@ public class SensitiveWordService {
|
||||
|
||||
[敏感词库优化流程](https://houbb.github.io/2020/01/07/sensitive-word-slim)
|
||||
|
||||
[停止词的思考记录](https://houbb.github.io/2020/01/07/sensitive-word-stopword)
|
||||
[java 如何实现开箱即用的敏感词控台服务?](https://mp.weixin.qq.com/s/rQo75cfMU_OEbTJa0JGMGg)
|
||||
|
||||

|
||||
|
||||
# 相关开源库
|
||||
|
||||
[heaven 基础工具包](https://github.com/houbb/heaven)
|
||||
|
||||
[opencc4j 繁简体转换](https://github.com/houbb/opencc4j)
|
||||
|
||||
[pinyin 拼音工具](https://github.com/houbb/pinyin)
|
||||
|
||||
[nlp-hanzi-similar 汉字相似度工具](https://github.com/houbb/nlp-hanzi-similar)
|
||||
|
||||
BIN
WECHAT.png
Normal file
BIN
WECHAT.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 107 KiB |
8
pom.xml
8
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.1.1</version>
|
||||
<version>0.2.0</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
@@ -25,8 +25,8 @@
|
||||
<project.compiler.level>1.7</project.compiler.level>
|
||||
|
||||
<!--============================== INTER ==============================-->
|
||||
<heaven.version>0.1.148</heaven.version>
|
||||
<opencc4j.version>1.7.1</opencc4j.version>
|
||||
<heaven.version>0.1.154</heaven.version>
|
||||
<opencc4j.version>1.7.2</opencc4j.version>
|
||||
|
||||
<!--============================== OTHER ==============================-->
|
||||
<junit.version>4.13.1</junit.version>
|
||||
@@ -104,7 +104,7 @@
|
||||
<source>${project.compiler.level}</source>
|
||||
<target>${project.compiler.level}</target>
|
||||
<encoding>${project.build.sourceEncoding}</encoding>
|
||||
<compilerArgument>-proc:none</compilerArgument>
|
||||
<!-- <compilerArgument>-proc:none</compilerArgument>-->
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
|
||||
@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
|
||||
|
||||
:: 版本号信息(需要手动指定)
|
||||
:::: 旧版本名称
|
||||
SET version=0.1.1
|
||||
SET version=0.2.0
|
||||
:::: 新版本名称
|
||||
SET newVersion=0.1.2
|
||||
SET newVersion=0.2.1
|
||||
:::: 组织名称
|
||||
SET groupName=com.github.houbb
|
||||
:::: 项目名称
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
/**
|
||||
* 敏感词替换策略
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public interface ISensitiveWordReplace {
|
||||
|
||||
/**
|
||||
* 替换
|
||||
* @param context 上下文
|
||||
* @return 结果
|
||||
* @since 0.2.0
|
||||
*/
|
||||
String replace(ISensitiveWordReplaceContext context);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.github.houbb.sensitive.word.api;
|
||||
|
||||
/**
 * Context handed to a sensitive-word replacement strategy.
 *
 * @author binbin.hou
 * @since 0.2.0
 */
public interface ISensitiveWordReplaceContext {

    /**
     * The sensitive word being replaced.
     *
     * @return the sensitive word
     * @since 0.2.0
     */
    String sensitiveWord();

    /**
     * The length of the word.
     *
     * @return the word length
     * @since 0.2.0
     */
    int wordLength();

}
|
||||
@@ -59,12 +59,13 @@ public interface IWordMap extends ISensitiveCheck {
|
||||
* ps: 这里可以添加优化。
|
||||
*
|
||||
* @param target 目标字符串
|
||||
* @param replaceChar 替换为的 char
|
||||
* @param replace 替换策略
|
||||
* @param context 上下文
|
||||
* @return 替换后结果
|
||||
* @since 0.0.2
|
||||
*/
|
||||
String replace(final String target, final char replaceChar,
|
||||
String replace(final String target,
|
||||
final ISensitiveWordReplace replace,
|
||||
final IWordContext context);
|
||||
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ import com.github.houbb.sensitive.word.api.*;
|
||||
import com.github.houbb.sensitive.word.support.allow.WordAllows;
|
||||
import com.github.houbb.sensitive.word.support.deny.WordDenys;
|
||||
import com.github.houbb.sensitive.word.support.map.SensitiveWordMap;
|
||||
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceChar;
|
||||
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
|
||||
import com.github.houbb.sensitive.word.utils.InnerFormatUtils;
|
||||
|
||||
@@ -393,9 +394,23 @@ public class SensitiveWordBs {
|
||||
* @since 0.0.2
|
||||
*/
|
||||
public String replace(final String target, final char replaceChar) {
|
||||
ISensitiveWordReplace replace = new SensitiveWordReplaceChar(replaceChar);
|
||||
|
||||
return replace(target, replace);
|
||||
}
|
||||
|
||||
/**
|
||||
* 替换所有内容
|
||||
*
|
||||
* @param target 目标字符串
|
||||
* @param replace 替换策略
|
||||
* @return 替换后结果
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public String replace(final String target, final ISensitiveWordReplace replace) {
|
||||
statusCheck();
|
||||
|
||||
return sensitiveWordMap.replace(target, replaceChar, context);
|
||||
return sensitiveWordMap.replace(target, replace, context);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
package com.github.houbb.sensitive.word.core;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
|
||||
import com.github.houbb.sensitive.word.api.IWordResultHandler;
|
||||
import com.github.houbb.sensitive.word.bs.SensitiveWordBs;
|
||||
|
||||
@@ -59,6 +60,18 @@ public final class SensitiveWordHelper {
|
||||
return WORD_BS.findFirst(target);
|
||||
}
|
||||
|
||||
/**
|
||||
* 替换所有内容
|
||||
*
|
||||
* @param target 目标字符串
|
||||
* @param replace 替换策略
|
||||
* @return 替换后结果
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public static String replace(final String target, final ISensitiveWordReplace replace) {
|
||||
return WORD_BS.replace(target, replace);
|
||||
}
|
||||
|
||||
/**
|
||||
* 替换所有内容
|
||||
*
|
||||
|
||||
@@ -9,14 +9,13 @@ import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.heaven.util.util.MapUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordMap;
|
||||
import com.github.houbb.sensitive.word.api.IWordResult;
|
||||
import com.github.houbb.sensitive.word.api.*;
|
||||
import com.github.houbb.sensitive.word.constant.AppConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;
|
||||
import com.github.houbb.sensitive.word.support.check.SensitiveCheckResult;
|
||||
import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckChain;
|
||||
import com.github.houbb.sensitive.word.support.check.impl.SensitiveCheckUrl;
|
||||
import com.github.houbb.sensitive.word.support.replace.SensitiveWordReplaceContext;
|
||||
import com.github.houbb.sensitive.word.support.result.WordResult;
|
||||
|
||||
import java.util.Collection;
|
||||
@@ -156,12 +155,12 @@ public class SensitiveWordMap implements IWordMap {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String replace(String target, char replaceChar, final IWordContext context) {
|
||||
public String replace(String target, final ISensitiveWordReplace replace, final IWordContext context) {
|
||||
if(StringUtil.isEmpty(target)) {
|
||||
return target;
|
||||
}
|
||||
|
||||
return this.replaceSensitiveWord(target, replaceChar, context);
|
||||
return this.replaceSensitiveWord(target, replace, context);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -213,11 +212,13 @@ public class SensitiveWordMap implements IWordMap {
|
||||
/**
|
||||
* 直接替换敏感词,返回替换后的结果
|
||||
* @param target 文本信息
|
||||
* @param replace 替换策略
|
||||
* @param context 上下文
|
||||
* @return 脱敏后的字符串
|
||||
* @since 0.0.2
|
||||
*/
|
||||
private String replaceSensitiveWord(final String target,
|
||||
final char replaceChar,
|
||||
final ISensitiveWordReplace replace,
|
||||
final IWordContext context) {
|
||||
if(StringUtil.isEmpty(target)) {
|
||||
return target;
|
||||
@@ -241,7 +242,12 @@ public class SensitiveWordMap implements IWordMap {
|
||||
// 直接使用原始内容,避免 markdown 图片转换失败
|
||||
resultBuilder.append(string);
|
||||
} else {
|
||||
String replaceStr = CharUtil.repeat(replaceChar, wordLength);
|
||||
// 创建上下文
|
||||
ISensitiveWordReplaceContext replaceContext = SensitiveWordReplaceContext.newInstance()
|
||||
.sensitiveWord(string)
|
||||
.wordLength(wordLength);
|
||||
String replaceStr = replace.replace(replaceContext);
|
||||
|
||||
resultBuilder.append(replaceStr);
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
package com.github.houbb.sensitive.word.support.replace;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext;
|
||||
|
||||
/**
|
||||
* 指定字符的替换策略
|
||||
* @author binbin.hou
|
||||
* @since 0.2.0
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class SensitiveWordReplaceChar implements ISensitiveWordReplace {
|
||||
|
||||
private final char replaceChar;
|
||||
|
||||
public SensitiveWordReplaceChar(char replaceChar) {
|
||||
this.replaceChar = replaceChar;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String replace(ISensitiveWordReplaceContext context) {
|
||||
int wordLength = context.wordLength();
|
||||
|
||||
return CharUtil.repeat(replaceChar, wordLength);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
package com.github.houbb.sensitive.word.support.replace;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext;
|
||||
|
||||
/**
|
||||
* 敏感词替换上下文
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class SensitiveWordReplaceContext implements ISensitiveWordReplaceContext {
|
||||
|
||||
public static SensitiveWordReplaceContext newInstance() {
|
||||
return new SensitiveWordReplaceContext();
|
||||
}
|
||||
|
||||
/**
|
||||
* 敏感词
|
||||
* @since 0.2.0
|
||||
*/
|
||||
private String sensitiveWord;
|
||||
|
||||
/**
|
||||
* 单词长度
|
||||
* @since 0.2.0
|
||||
*/
|
||||
private int wordLength;
|
||||
|
||||
@Override
|
||||
public String sensitiveWord() {
|
||||
return sensitiveWord;
|
||||
}
|
||||
|
||||
public SensitiveWordReplaceContext sensitiveWord(String sensitiveWord) {
|
||||
this.sensitiveWord = sensitiveWord;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int wordLength() {
|
||||
return wordLength;
|
||||
}
|
||||
|
||||
public SensitiveWordReplaceContext wordLength(int wordLength) {
|
||||
this.wordLength = wordLength;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SensitiveWordReplaceContext{" +
|
||||
"sensitiveWord='" + sensitiveWord + '\'' +
|
||||
", wordLength=" + wordLength +
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,6 +1,8 @@
|
||||
package com.github.houbb.sensitive.word.core;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
|
||||
import com.github.houbb.sensitive.word.api.IWordResult;
|
||||
import com.github.houbb.sensitive.word.replace.MySensitiveWordReplace;
|
||||
import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
@@ -148,4 +150,18 @@ public class SensitiveWordHelperTest {
|
||||
Assert.assertEquals("fuck", word);
|
||||
}
|
||||
|
||||
/**
|
||||
* 自定替换策略
|
||||
* @since 0.2.0
|
||||
*/
|
||||
@Test
|
||||
public void defineReplaceTest() {
|
||||
final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
|
||||
|
||||
ISensitiveWordReplace replace = new MySensitiveWordReplace();
|
||||
String result = SensitiveWordHelper.replace(text, replace);
|
||||
|
||||
Assert.assertEquals("国家旗帜迎风飘扬,教员的画像屹立在***前。", result);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
package com.github.houbb.sensitive.word.replace;
|
||||
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplace;
|
||||
import com.github.houbb.sensitive.word.api.ISensitiveWordReplaceContext;
|
||||
|
||||
/**
|
||||
* 自定义敏感词替换策略
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.2.0
|
||||
*/
|
||||
public class MySensitiveWordReplace implements ISensitiveWordReplace {
|
||||
|
||||
@Override
|
||||
public String replace(ISensitiveWordReplaceContext context) {
|
||||
String sensitiveWord = context.sensitiveWord();
|
||||
// 自定义不同的敏感词替换策略,可以从数据库等地方读取
|
||||
if("五星红旗".equals(sensitiveWord)) {
|
||||
return "国家旗帜";
|
||||
}
|
||||
if("毛主席".equals(sensitiveWord)) {
|
||||
return "教员";
|
||||
}
|
||||
|
||||
// 其他默认使用 * 代替
|
||||
int wordLength = context.wordLength();
|
||||
return CharUtil.repeat('*', wordLength);
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user