project: sensitive-word-AppConst
+ *create on 2020/1/7 23:39
+ * + * @author Administrator + * @since 0.0.1 + */ +public final class AppConst { + + /** + * 是否为结束标识 + * ps: 某种角度而言,我不是很喜欢这种风格。 + * @since 0.0.1 + */ + public static final String IS_END = "isEnd"; + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java b/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java new file mode 100644 index 0000000..e90b2fc --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java @@ -0,0 +1,23 @@ +package com.github.houbb.sensitive.word.constant.enums; + +/** + *project: sensitive-word-ValidModeEnum
+ *create on 2020/1/7 22:46
+ * + * @author Administrator + * @since 1.0.0 + */ +public enum ValidModeEnum { + + /** + * 快速失败 + * @since 0.0.1 + */ + FAIL_FAST, + + /** + * 全部遍历 + * @since 0.0.1 + */ + FAIL_OVER +} diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/package-info.java b/src/main/java/com/github/houbb/sensitive/word/constant/package-info.java new file mode 100644 index 0000000..1b6d452 --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/constant/package-info.java @@ -0,0 +1,8 @@ +/** + *project: sensitive-word-package-info
+ *create on 2020/1/7 22:46
+ * + * @author Administrator + * @since 1.0.0 + */ +package com.github.houbb.sensitive.word.constant; \ No newline at end of file diff --git a/src/main/java/com/github/houbb/sensitive/word/model/WordMapEntry.java b/src/main/java/com/github/houbb/sensitive/word/model/WordMapEntry.java deleted file mode 100644 index a8e60bd..0000000 --- a/src/main/java/com/github/houbb/sensitive/word/model/WordMapEntry.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.github.houbb.sensitive.word.model; - -import java.util.List; - -/** - * 所有的敏感词,第一个字都是 key - * - * @author binbin.hou - * @since 0.0.1 - */ -public class WordMapEntry { - - /** - * 单个单词 - * @since 0.0.1 - */ - private String word; - - /** - * 是否为结束 - * @since 0.0.1 - */ - private boolean isEnd; - - /** - * 下一层的信息列表 - * @since 0.0.1 - */ - private List
- * 中 = {
- * isEnd = 0
- * 国 = {
- * isEnd = 1
- * 人 = {isEnd = 0
- * 民 = {isEnd = 1}
- * }
- * 男 = {
- * isEnd = 0
- * 人 = {
- * isEnd = 1
- * }
- * }
- * }
- * }
- *
- * 五 = {
- * isEnd = 0
- * 星 = {
- * isEnd = 0
- * 红 = {
- * isEnd = 0
- * 旗 = {
- * isEnd = 1
- * }
- * }
- * }
- * }
- *
- *
- * key: 对应的中文
- * value: 是否为结束。
- *
- * 日本人,日本鬼子为例
- *
- * 1、在hashMap中查询“日”看其是否在hashMap中存在,如果不存在,则证明已“日”开头的敏感词还不存在,则我们直接构建这样的一棵树。跳至3。
- * 2、如果在hashMap中查找到了,表明存在以“日”开头的敏感词,设置hashMap = hashMap.get("日"),跳至1,依次匹配“本”、“人”。
- * 3、判断该字是否为该词中的最后一个字。若是表示敏感词结束,设置标志位isEnd = 1,否则设置标志位isEnd = 0;
- *
* @param collection 敏感词库集合
* @since 0.0.1
- *
+ *
* 使用对象代码 map 的这种一直递归。
- *
+ * 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html
+ * https://blog.csdn.net/chenssy/article/details/26961957
*/
@Override
- public Map
+ * (1)如果未命中敏感词,直接返回 0
+ * (2)命中敏感词,则返回敏感词的长度。
+ *
+ * @param txt 文本信息
+ * @param beginIndex 开始下标
+ * @return 敏感词对应的长度
+ * @since 0.0.1
+ */
+    private int checkSensitiveWord(String txt, int beginIndex) {
+        Map nowMap = sensitiveWordMap;
+
+        boolean flag = false;
+        // Number of consecutive characters of txt, starting at beginIndex, matched in the nested map so far.
+        int sensitiveWordLength = 0;
+
+        for (int i = beginIndex; i < txt.length(); i++) {
+            char charKey = txt.charAt(i);
+            // Look up the current character among the entries of the current map level;
+            // nowMap is replaced by the nested map found there (or null) to descend one level on the next pass.
+            nowMap = (Map) nowMap.get(charKey);
+            if (ObjectUtil.isNotNull(nowMap)) {
+                sensitiveWordLength++;
+
+                // Read the end-of-word flag stored under AppConst.IS_END. NOTE(review): a missing entry would NPE on unboxing - confirm the map builder always sets it.
+                boolean isEnd = (boolean) nowMap.get(AppConst.IS_END);
+                if (isEnd) {
+                    flag = true;
+
+                    // Fail-fast by default: stop at the first complete word rather than searching for a longer one.
+                    break;
+                }
+            } else {
+                // No nested map for this character, so no word continues here: leave the loop.
+                break;
+            }
+        }
+
+        if (!flag) {
+            sensitiveWordLength = 0;
+        }
+        return sensitiveWordLength;
    }
}
diff --git a/src/main/resources/dict.txt b/src/main/resources/dict.txt
index 767eb1d..5a344e9 100644
--- a/src/main/resources/dict.txt
+++ b/src/main/resources/dict.txt
@@ -164386,7 +164386,6 @@ z以留吧以其以武
百花故事
百花盛放
百行教师贱
-的
的同修
的妹 子 都 很 急 约
的阿斗
diff --git a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java
new file mode 100644
index 0000000..d48d364
--- /dev/null
+++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java
@@ -0,0 +1,40 @@
+package com.github.houbb.sensitive.word.bs;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+
+/**
+ * project: sensitive-word-SensitiveWordBsTest create on 2020/1/7 23:43