diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md
index 467fc53..ed66493 100644
--- a/CHANGE_LOG.md
+++ b/CHANGE_LOG.md
@@ -217,4 +217,11 @@
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:-----|----------------------|:--------------------|:------|
-| 1 | A | 添加忽略字符接口,便于跳过一些干扰的字符 | 2023-12-08 23:51:58 | |
\ No newline at end of file
+| 1 | A | 添加忽略字符接口,便于跳过一些干扰的字符 | 2023-12-08 23:51:58 | |
+
+# release_0.12.0
+
+| 序号 | 变更类型 | 说明 | 时间 | 备注 |
+|:---|:-----|----------------------------------------------|:--------------------|:------|
+| 1 | A | 添加 wordTags 标签结果处理类 | 2023-12-18 23:51:58 | |
+| 2 | A | 添加 AbstractWordResultHandler 处理类,便于后续拓展和统一管理 | 2023-12-18 23:51:58 | |
\ No newline at end of file
diff --git a/README.md b/README.md
index 3c5f224..6e14a4b 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@
这两个资料阅读可在下方文章获取:
-> [v0.11.0-敏感词新特性](https://mp.weixin.qq.com/s/m40ZnR6YF6WgPrArUSZ_0g)
+> [v0.11.0-敏感词新特性及对应标签文件](https://mp.weixin.qq.com/s/m40ZnR6YF6WgPrArUSZ_0g)
# 快速开始
@@ -78,7 +78,7 @@
com.github.houbb
sensitive-word
- 0.11.0
+ 0.12.0
```
@@ -98,23 +98,7 @@
| findFirst(String, IWordResultHandler) | IWordResultHandler 结果处理类 | 字符串 | 返回字符串中第一个敏感词 |
| tags(String) | 获取敏感词的标签 | 敏感词字符串 | 返回敏感词的标签列表 |
-## IWordResultHandler 结果处理类
-IWordResultHandler 可以对敏感词的结果进行处理,允许用户自定义。
-
-内置实现见 `WordResultHandlers` 工具类:
-
-- WordResultHandlers.word()
-
-只保留敏感词单词本身。
-
-- WordResultHandlers.raw()
-
-保留敏感词相关信息,包含敏感词的开始和结束下标。
-
-## 使用实例
-
-所有测试案例参见 [SensitiveWordHelperTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java)
### 判断是否包含敏感词
@@ -243,6 +227,60 @@ public class MyWordReplace implements IWordReplace {
我们针对其中的部分词做固定映射处理,其他的默认转换为 `*`。
+## IWordResultHandler 结果处理类
+
+IWordResultHandler 可以对敏感词的结果进行处理,允许用户自定义。
+
+内置实现见 `WordResultHandlers` 工具类:
+
+- WordResultHandlers.word()
+
+只保留敏感词单词本身。
+
+- WordResultHandlers.raw()
+
+保留敏感词相关信息,包含敏感词的开始和结束下标。
+
+- WordResultHandlers.wordTags()
+
+同时保留单词,和对应的词标签信息。
+
+### 使用实例
+
+所有测试案例参见 [SensitiveWordHelperTest](https://github.com/houbb/sensitive-word/blob/master/src/test/java/com/github/houbb/sensitive/word/core/SensitiveWordHelperTest.java)
+
+1)基本例子
+
+```java
+final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
+
+List wordList = SensitiveWordHelper.findAll(text);
+Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
+List wordList2 = SensitiveWordHelper.findAll(text, WordResultHandlers.word());
+Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList2.toString());
+
+List wordList3 = SensitiveWordHelper.findAll(text, WordResultHandlers.raw());
+Assert.assertEquals("[WordResult{startIndex=0, endIndex=4}, WordResult{startIndex=9, endIndex=12}, WordResult{startIndex=18, endIndex=21}]", wordList3.toString());
+```
+
+2) wordTags 例子
+
+我们在 `dict_tag_test.txt` 文件中指定对应词的标签信息。
+
+```java
+final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
+
+// 默认敏感词标签为空
+List wordList1 = SensitiveWordHelper.findAll(text, WordResultHandlers.wordTags());
+Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[]}, WordTagsDto{word='毛主席', tags=[]}, WordTagsDto{word='天安门', tags=[]}]", wordList1.toString());
+
+List wordList2 = SensitiveWordBs.newInstance()
+ .wordTag(WordTags.file("dict_tag_test.txt"))
+ .init()
+ .findAll(text, WordResultHandlers.wordTags());
+Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[政治, 国家]}, WordTagsDto{word='毛主席', tags=[政治, 伟人, 国家]}, WordTagsDto{word='天安门', tags=[政治, 国家, 地址]}]", wordList2.toString());
+```
+
# 更多特性
后续的诸多特性,主要是针对各种针对各种情况的处理,尽可能的提升敏感词命中率。
@@ -793,9 +831,7 @@ remove、add、edit?
- [x] 敏感词标签接口支持
-- [ ] 敏感词处理时标签支持
-
-TODO: 比较耗时间。
+- [x] 敏感词处理时标签支持
- [x] wordData 的内存占用对比 + 优化
@@ -807,8 +843,6 @@ FormatCombine/CheckCombine/AllowDenyCombine 组合策略,允许用户自定义
- [ ] 添加 ThreadLocal 等性能优化
-
-
# 拓展阅读
[敏感词工具实现思路](https://houbb.github.io/2020/01/07/sensitive-word)
@@ -819,6 +853,8 @@ FormatCombine/CheckCombine/AllowDenyCombine 组合策略,允许用户自定义
[java 如何实现开箱即用的敏感词控台服务?](https://mp.weixin.qq.com/s/rQo75cfMU_OEbTJa0JGMGg)
+[v0.11.0-敏感词新特性及对应标签文件](https://mp.weixin.qq.com/s/m40ZnR6YF6WgPrArUSZ_0g)
+

# NLP 开源矩阵
diff --git a/release.bat b/release.bat
index 0460ad3..161d6e5 100644
--- a/release.bat
+++ b/release.bat
@@ -10,9 +10,9 @@ ECHO "============================= RELEASE START..."
:: 版本号信息(需要手动指定)
:::: 旧版本名称
-SET version=0.11.0
+SET version=0.12.0
:::: 新版本名称
-SET newVersion=0.12.0
+SET newVersion=0.13.0
:::: 组织名称
SET groupName=com.github.houbb
:::: 项目名称
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/AbstractWordResultHandler.java b/src/main/java/com/github/houbb/sensitive/word/support/result/AbstractWordResultHandler.java
new file mode 100644
index 0000000..345fb67
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/result/AbstractWordResultHandler.java
@@ -0,0 +1,26 @@
+package com.github.houbb.sensitive.word.support.result;
+
+import com.github.houbb.sensitive.word.api.IWordContext;
+import com.github.houbb.sensitive.word.api.IWordResult;
+import com.github.houbb.sensitive.word.api.IWordResultHandler;
+
+/**
+ * 抽象的处理结果
+ *
+ * @since 0.12.0
+ * @param 泛型
+ */
+public abstract class AbstractWordResultHandler implements IWordResultHandler {
+
+ protected abstract R doHandle(IWordResult wordResult, IWordContext wordContext, String originalText);
+
+ @Override
+ public R handle(IWordResult wordResult, IWordContext wordContext, String originalText) {
+ if(wordResult == null) {
+ return null;
+ }
+
+ return doHandle(wordResult, wordContext, originalText);
+ }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java
index a1ddb2d..1e373ab 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerRaw.java
@@ -3,7 +3,6 @@ package com.github.houbb.sensitive.word.support.result;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordResult;
-import com.github.houbb.sensitive.word.api.IWordResultHandler;
/**
* 不做任何处理
@@ -11,7 +10,7 @@ import com.github.houbb.sensitive.word.api.IWordResultHandler;
* @since 0.1.0
*/
@ThreadSafe
-public class WordResultHandlerRaw implements IWordResultHandler {
+public class WordResultHandlerRaw extends AbstractWordResultHandler {
/**
* @since 0.3.0
@@ -23,7 +22,7 @@ public class WordResultHandlerRaw implements IWordResultHandler {
}
@Override
- public IWordResult handle(IWordResult wordResult, IWordContext wordContext, String originalText) {
+ protected IWordResult doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) {
return wordResult;
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java
index 6748946..af79edb 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWord.java
@@ -3,7 +3,6 @@ package com.github.houbb.sensitive.word.support.result;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordResult;
-import com.github.houbb.sensitive.word.api.IWordResultHandler;
import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;
/**
@@ -13,7 +12,7 @@ import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;
* @since 0.1.0
*/
@ThreadSafe
-public class WordResultHandlerWord implements IWordResultHandler {
+public class WordResultHandlerWord extends AbstractWordResultHandler {
/**
* @since 0.3.0
@@ -25,11 +24,7 @@ public class WordResultHandlerWord implements IWordResultHandler {
}
@Override
- public String handle(IWordResult wordResult, IWordContext wordContext, String originalText) {
- if(wordResult == null) {
- return null;
- }
-
+ protected String doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) {
// 截取
return InnerWordCharUtils.getString(originalText.toCharArray(), wordResult);
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java
new file mode 100644
index 0000000..7d1f4f2
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlerWordTags.java
@@ -0,0 +1,31 @@
+package com.github.houbb.sensitive.word.support.result;
+
+import com.github.houbb.sensitive.word.api.IWordContext;
+import com.github.houbb.sensitive.word.api.IWordResult;
+import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;
+
+import java.util.Set;
+
+/**
+ * 单词+对应的标签信息
+ *
+ * @author binbin.hou
+ * @since 0.12.0
+ */
+public class WordResultHandlerWordTags extends AbstractWordResultHandler {
+
+ @Override
+ protected WordTagsDto doHandle(IWordResult wordResult, IWordContext wordContext, String originalText) {
+ // 截取
+ String word = InnerWordCharUtils.getString(originalText.toCharArray(), wordResult);
+ // 标签
+
+ WordTagsDto dto = new WordTagsDto();
+ dto.setWord(word);
+ // 获取 tags
+ Set wordTags = wordContext.wordTag().getTag(word);
+ dto.setTags(wordTags);
+ return dto;
+ }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java
index b77e6b7..07b1726 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordResultHandlers.java
@@ -30,4 +30,13 @@ public final class WordResultHandlers {
return WordResultHandlerWord.getInstance();
}
+ /**
+ * 单词+标签的处理结果
+ * @return 单词+标签的处理结果
+ * @since 0.12.0
+ */
+ public static IWordResultHandler wordTags() {
+ return new WordResultHandlerWordTags();
+ }
+
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/result/WordTagsDto.java b/src/main/java/com/github/houbb/sensitive/word/support/result/WordTagsDto.java
new file mode 100644
index 0000000..8031aa3
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/result/WordTagsDto.java
@@ -0,0 +1,39 @@
+package com.github.houbb.sensitive.word.support.result;
+
+import java.io.Serializable;
+import java.util.Set;
+
+/**
+ * @since 0.12.0
+ */
+public class WordTagsDto implements Serializable {
+
+ private String word;
+
+ private Set tags;
+
+ public String getWord() {
+ return word;
+ }
+
+ public void setWord(String word) {
+ this.word = word;
+ }
+
+ public Set getTags() {
+ return tags;
+ }
+
+ public void setTags(Set tags) {
+ this.tags = tags;
+ }
+
+ @Override
+ public String toString() {
+ return "WordTagsDto{" +
+ "word='" + word + '\'' +
+ ", tags=" + tags +
+ '}';
+ }
+
+}
diff --git a/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java b/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java
new file mode 100644
index 0000000..dd0784f
--- /dev/null
+++ b/src/test/java/com/github/houbb/sensitive/word/support/handler/WordResultHandlerTest.java
@@ -0,0 +1,49 @@
+package com.github.houbb.sensitive.word.support.handler;
+
+import com.github.houbb.sensitive.word.api.IWordResult;
+import com.github.houbb.sensitive.word.bs.SensitiveWordBs;
+import com.github.houbb.sensitive.word.core.SensitiveWordHelper;
+import com.github.houbb.sensitive.word.support.result.WordResultHandlers;
+import com.github.houbb.sensitive.word.support.result.WordTagsDto;
+import com.github.houbb.sensitive.word.support.tag.WordTags;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.util.List;
+
+/**
+ * @since 0.12.0
+ */
+public class WordResultHandlerTest {
+
+ @Test
+ public void findAllWordTest() {
+ final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
+
+ List wordList = SensitiveWordHelper.findAll(text);
+ Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
+ List wordList2 = SensitiveWordHelper.findAll(text, WordResultHandlers.word());
+ Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList2.toString());
+
+ List wordList3 = SensitiveWordHelper.findAll(text, WordResultHandlers.raw());
+ Assert.assertEquals("[WordResult{startIndex=0, endIndex=4}, WordResult{startIndex=9, endIndex=12}, WordResult{startIndex=18, endIndex=21}]", wordList3.toString());
+ }
+
+ @Test
+ @Ignore
+ public void wordTagsTest() {
+ final String text = "五星红旗迎风飘扬,毛主席的画像屹立在天安门前。";
+
+ // 默认敏感词标签为空
+ List wordList1 = SensitiveWordHelper.findAll(text, WordResultHandlers.wordTags());
+ Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[]}, WordTagsDto{word='毛主席', tags=[]}, WordTagsDto{word='天安门', tags=[]}]", wordList1.toString());
+
+ List wordList2 = SensitiveWordBs.newInstance()
+ .wordTag(WordTags.file("D:\\github\\sensitive-word\\src\\test\\resources\\dict_tag_test.txt"))
+ .init()
+ .findAll(text, WordResultHandlers.wordTags());
+ Assert.assertEquals("[WordTagsDto{word='五星红旗', tags=[政治, 国家]}, WordTagsDto{word='毛主席', tags=[政治, 伟人, 国家]}, WordTagsDto{word='天安门', tags=[政治, 国家, 地址]}]", wordList2.toString());
+ }
+
+}
diff --git a/src/test/java/com/github/houbb/sensitive/word/support/package-info.java b/src/test/java/com/github/houbb/sensitive/word/support/package-info.java
new file mode 100644
index 0000000..2f0e44d
--- /dev/null
+++ b/src/test/java/com/github/houbb/sensitive/word/support/package-info.java
@@ -0,0 +1 @@
+package com.github.houbb.sensitive.word.support;
\ No newline at end of file
diff --git a/src/test/resources/dict_tag_test.txt b/src/test/resources/dict_tag_test.txt
index 51d66ca..02889a7 100644
--- a/src/test/resources/dict_tag_test.txt
+++ b/src/test/resources/dict_tag_test.txt
@@ -1 +1,3 @@
-五星红旗 政治,国家
\ No newline at end of file
+五星红旗 政治,国家
+毛主席 政治,国家,伟人
+天安门 政治,国家,地址
\ No newline at end of file