release branch 0.0.1

2026-03-22 16:37:17 +08:00 · 2020-01-08 00:16:08 +08:00
parent 3108126817
commit 6b656b26d9
18 changed files with 721 additions and 197 deletions
--- a/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java
+++ b/src/main/java/com/github/houbb/sensitive/word/api/IWordMap.java
@@ -1,9 +1,10 @@
 package com.github.houbb.sensitive.word.api;

-import com.github.houbb.sensitive.word.model.WordMapEntry;
+import com.github.houbb.heaven.util.lang.StringUtil;
+import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;

 import java.util.Collection;
-import java.util.Map;
+import java.util.List;

 /**
 * 敏感词 map
@@ -12,12 +13,38 @@ import java.util.Map;
 */
 public interface IWordMap {

+
    /**
-     * 获取单词 map
-     * @param collection 集合
-     * @return 敏感词 map
+     * 初始化敏感词 map
+     * @param collection 集合信息
     * @since 0.0.1
     */
-    Map<String, WordMapEntry> getWordMap(final Collection<String> collection);
+    void initWordMap(Collection<String> collection);
+
+    /**
+     * 是否包含敏感词
+     * @param string 字符串
+     * @return 是否包含
+     * @since 0.0.1
+     * @see ValidModeEnum#FAIL_FAST 建议使用快速返回模式
+     */
+    boolean contains(final String string);
+
+    /**
+     * 返回所有对应的敏感词
+     * @param string 原始字符串
+     * @return 结果
+     * @since 0.0.1
+     * @see ValidModeEnum#FAIL_OVER 建议使用全部检测返回模式
+     */
+    List<String> findAll(final String string);
+
+    /**
+     * 返回第一个对应的敏感词
+     * @param string 原始字符串
+     * @return 结果
+     * @since 0.0.1
+     */
+    String findFirst(final String string);

 }
--- a/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
+++ b/src/main/java/com/github/houbb/sensitive/word/bs/SensitiveWordBs.java
@@ -5,7 +5,9 @@ import com.github.houbb.heaven.util.guava.Guavas;
 import com.github.houbb.heaven.util.lang.StringUtil;
 import com.github.houbb.heaven.util.util.CollectionUtil;
 import com.github.houbb.sensitive.word.api.IWordData;
+import com.github.houbb.sensitive.word.api.IWordMap;
 import com.github.houbb.sensitive.word.support.data.SensitiveWordData;
+import com.github.houbb.sensitive.word.support.map.SensitiveWordMap;

 import java.util.List;

@@ -22,14 +24,6 @@ public class SensitiveWordBs {
     */
    private SensitiveWordBs(){}

-    /**
-     * 待验证字符串信息
-     * ps: 可以添加多个辅助类 xxxStringProvider
-     * 如 FileXXX
-     * @since 0.0.1
-     */
-    private volatile String target;
-
    /**
     * 敏感数据信息
     * @since 0.0.1
@@ -37,82 +31,73 @@ public class SensitiveWordBs {
    private IWordData sensitiveWordData = Instances.singleton(SensitiveWordData.class);

    /**
-     * 新建验证实例
-     * @param string 字符串
-     * @return this
+     * 敏感词 map
     * @since 0.0.1
     */
-    public static SensitiveWordBs newInstance(final String string) {
-        SensitiveWordBs instance = new SensitiveWordBs();
-        instance.target = string;
-        return instance;
+    private IWordMap sensitiveWordMap = Instances.singleton(SensitiveWordMap.class);
+
+    /**
+     * 获取单例信息
+     * @since 0.0.1
+     */
+    private static final SensitiveWordBs INSTANCE;
+
+    static {
+        synchronized (SensitiveWordBs.class) {
+            INSTANCE = new SensitiveWordBs();
+            List<String> lines = INSTANCE.sensitiveWordData.getWordData();
+            INSTANCE.sensitiveWordMap.initWordMap(lines);
+        }
    }

    /**
-     * 指定目标字符串信息
-     * @param string 字符串
+     * 获取单例实例
     * @return this
     * @since 0.0.1
     */
-    public SensitiveWordBs target(final String string) {
-        this.target = string;
-        return this;
+    public static SensitiveWordBs getInstance() {
+        return INSTANCE;
    }

    /**
     * 是否合法
+     * @param target 目标字符串
     * @return 是否
     * @since 0.0.1
-     * @see #contains() 是否包含
     */
-    public boolean valid() {
-        return !contains();
+    public boolean valid(final String target) {
+        return !contains(target);
    }

    /**
     * 是否包含敏感词
+     * @param target 目标字符串
     * @return 是否
     * @since 0.0.1
-     * @see #findAll() 列表不为空即可
     */
-    public boolean contains() {
-        return CollectionUtil.isNotEmpty(findAll());
+    public boolean contains(final String target) {
+        return this.sensitiveWordMap.contains(target);
    }

    /**
     * 返回所有的敏感词
-     * 1. 这里是默认去重的。
+     * 1. 这里是默认去重的，且是有序的。
+     * 2. 如果不存在，返回空列表
     * @return 敏感词列表
     * @since 0.0.1
     */
-    public List<String> findAll() {
-        if(StringUtil.isEmpty(target)) {
-            return Guavas.newArrayList();
-        }
-
-        // 分词
-        return null;
+    public List<String> findAll(final String target) {
+        return this.sensitiveWordMap.findAll(target);
    }

    /**
-     * 执行过滤
-     * 1. 使用默认策略
-     * 2. 默认策略就是直接移除。
-     * @return 过滤后的结果
+     * 返回第一个敏感词
+     * （1）如果不存在，则返回 {@code null}
+     * @return 敏感词
     * @since 0.0.1
     */
-    private String filter() {
-        return filter(StringUtil.EMPTY);
-    }
-
-    /**
-     * 指定过滤的字符，执行过滤
-     * 1. filter 只是一种特殊的字符串替换策略。
-     * @return 过滤后的结果
-     * @since 0.0.1
-     */
-    private String filter(final String filter) {
-        return "";
+    public String findFirst(final String target) {
+        return this.sensitiveWordMap.findFirst(target);
    }

 }
--- a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java
+++ b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java
@@ -0,0 +1,19 @@
+package com.github.houbb.sensitive.word.constant;
+
+/**
+ * <p> project: sensitive-word-AppConst </p>
+ * <p> create on 2020/1/7 23:39 </p>
+ *
+ * @author Administrator
+ * @since 0.0.1
+ */
+public final class AppConst {
+
+    /**
+     * 是否为结束标识
+     * ps: 某种角度而言，我不是很喜欢这种风格。
+     * @since 0.0.1
+     */
+    public static final String IS_END = "isEnd";
+
+}
--- a/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java
+++ b/src/main/java/com/github/houbb/sensitive/word/constant/enums/ValidModeEnum.java
@@ -0,0 +1,23 @@
+package com.github.houbb.sensitive.word.constant.enums;
+
+/**
+ * <p> project: sensitive-word-ValidModeEnum </p>
+ * <p> create on 2020/1/7 22:46 </p>
+ *
+ * @author Administrator
+ * @since 0.0.1
+ */
+public enum ValidModeEnum {
+
+    /**
+     * 快速失败
+     * @since 0.0.1
+     */
+    FAIL_FAST,
+
+    /**
+     * 全部遍历
+     * @since 0.0.1
+     */
+    FAIL_OVER
+}
--- a/src/main/java/com/github/houbb/sensitive/word/constant/package-info.java
+++ b/src/main/java/com/github/houbb/sensitive/word/constant/package-info.java
@@ -0,0 +1,8 @@
+/**
+ * <p> project: sensitive-word-package-info </p>
+ * <p> create on 2020/1/7 22:46 </p>
+ *
+ * @author Administrator
+ * @since 0.0.1
+ */
+package com.github.houbb.sensitive.word.constant;
--- a/src/main/java/com/github/houbb/sensitive/word/model/WordMapEntry.java
+++ b/src/main/java/com/github/houbb/sensitive/word/model/WordMapEntry.java
@@ -1,58 +0,0 @@
-package com.github.houbb.sensitive.word.model;
-
-import java.util.List;
-
-/**
- * 所有的敏感词，第一个字都是 key
- *
- * @author binbin.hou
- * @since 0.0.1
- */
-public class WordMapEntry {
-
-    /**
-     * 单个单词
-     * @since 0.0.1
-     */
-    private String word;
-
-    /**
-     * 是否为结束
-     * @since 0.0.1
-     */
-    private boolean isEnd;
-
-    /**
-     * 下一层的信息列表
-     * @since 0.0.1
-     */
-    private List<WordMapEntry> nextEntryList;
-
-    public String word() {
-        return word;
-    }
-
-    public WordMapEntry word(String word) {
-        this.word = word;
-        return this;
-    }
-
-    public boolean end() {
-        return isEnd;
-    }
-
-    public WordMapEntry end(boolean end) {
-        isEnd = end;
-        return this;
-    }
-
-    public List<WordMapEntry> nextEntryList() {
-        return nextEntryList;
-    }
-
-    public WordMapEntry nextEntryList(List<WordMapEntry> nextEntryList) {
-        this.nextEntryList = nextEntryList;
-        return this;
-    }
-
-}
--- a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java
@@ -9,6 +9,7 @@ import java.util.List;

 /**
 * 数据加载使用单例的模式，只需要加载一次即可。
+ *
 * @author binbin.hou
 * @since 0.0.1
 */
@@ -17,18 +18,22 @@ public class SensitiveWordData implements IWordData {

    /**
     * 默认的内置行
+     *
     * @since 0.0.1
     */
    private static List<String> defaultLines;

    static {
-        long start  = System.currentTimeMillis();
-        defaultLines = new ArrayList<>(183837);
-        defaultLines = StreamUtils.readAllLines("/dict.txt");
-        long end  = System.currentTimeMillis();
-        System.out.println("Sensitive data loaded!, cost time: " + (end-start) + " ms");
+        synchronized (SensitiveWordData.class) {
+            long start = System.currentTimeMillis();
+            defaultLines = new ArrayList<>(183837);
+            defaultLines = StreamUtils.readAllLines("/dict.txt");
+            long end = System.currentTimeMillis();
+            System.out.println("Sensitive data loaded!, cost time: " + (end - start) + " ms");
+        }
    }

+
    @Override
    public List<String> getWordData() {
        return defaultLines;
--- a/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/map/SensitiveWordMap.java
@@ -1,12 +1,18 @@
 package com.github.houbb.sensitive.word.support.map;

 import com.github.houbb.heaven.annotation.ThreadSafe;
+import com.github.houbb.heaven.util.guava.Guavas;
 import com.github.houbb.heaven.util.lang.ObjectUtil;
+import com.github.houbb.heaven.util.lang.StringUtil;
+import com.github.houbb.heaven.util.util.CollectionUtil;
+import com.github.houbb.heaven.util.util.MapUtil;
 import com.github.houbb.sensitive.word.api.IWordMap;
-import com.github.houbb.sensitive.word.model.WordMapEntry;
+import com.github.houbb.sensitive.word.constant.AppConst;
+import com.github.houbb.sensitive.word.constant.enums.ValidModeEnum;

 import java.util.Collection;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;

 /**
@@ -18,89 +24,216 @@ import java.util.Map;
@ThreadSafe
 public class SensitiveWordMap implements IWordMap {

+    /**
+     * 脱敏单词 map
+     *
+     * @since 0.0.1
+     */
+    private static Map sensitiveWordMap;

    /**
     * 读取敏感词库，将敏感词放入HashSet中，构建一个DFA算法模型：
     *
-     * <pre>
-     * 中 = {
-     * isEnd = 0
-     * 国 = {
-     * isEnd = 1
-     * 人 = {isEnd = 0
-     * 民 = {isEnd = 1}
-     * }
-     * 男  = {
-     * isEnd = 0
-     * 人 = {
-     * isEnd = 1
-     * }
-     * }
-     * }
-     * }
-     *
-     * 五 = {
-     * isEnd = 0
-     * 星 = {
-     * isEnd = 0
-     * 红 = {
-     * isEnd = 0
-     * 旗 = {
-     * isEnd = 1
-     * }
-     * }
-     * }
-     * }
-     * </pre>
-     *
-     * key: 对应的中文
-     * value: 是否为结束。
-     *
-     * 日本人，日本鬼子为例
-     *
-     * 1、在hashMap中查询“日”看其是否在hashMap中存在，如果不存在，则证明已“日”开头的敏感词还不存在，则我们直接构建这样的一棵树。跳至3。
-     * 2、如果在hashMap中查找到了，表明存在以“日”开头的敏感词，设置hashMap = hashMap.get("日")，跳至1，依次匹配“本”、“人”。
-     * 3、判断该字是否为该词中的最后一个字。若是表示敏感词结束，设置标志位isEnd = 1，否则设置标志位isEnd = 0；
-     *
     * @param collection 敏感词库集合
     * @since 0.0.1
-     *
+     * <p>
     * 使用对象代码 map 的这种一直递归。
-     *
+     * 参考资料：https://www.cnblogs.com/AlanLee/p/5329555.html
+     * https://blog.csdn.net/chenssy/article/details/26961957
     */
    @Override
-    public Map<String, WordMapEntry> getWordMap(Collection<String> collection) {
-        Map<String, WordMapEntry> resultMap = new HashMap<>(collection.size());
+    @SuppressWarnings("unchecked")
+    public void initWordMap(Collection<String> collection) {
+        // 避免重复加载
+        if (MapUtil.isNotEmpty(sensitiveWordMap)) {
+            return;
+        }
+
+        long startTime = System.currentTimeMillis();
+        // 避免扩容带来的消耗
+        sensitiveWordMap = new HashMap(collection.size());

        for (String key : collection) {
+            if (StringUtil.isEmpty(key)) {
+                continue;
+            }
+
+            // 用来按照相应的格式保存敏感词库数据
            char[] chars = key.toCharArray();
            final int size = chars.length;

-            for (int i = 0; i < size; i++) {
-                String charStr = String.valueOf(chars[i]);
+            // 每一个新词的循环，直接将结果设置为当前 map，所有变化都会体现在结果的 map 中
+            Map currentMap = sensitiveWordMap;

-                // 直接获取对应的 map
-                WordMapEntry wordMapEntry = resultMap.get(charStr);
+            for (int i = 0; i < size; i++) {
+                // 截取敏感词当中的字，在敏感词库中字为HashMap对象的Key键值
+                char charKey = chars[i];
+                // 获取当前字对应的下一层 map
+                Object wordMap = currentMap.get(charKey);

                // 如果集合存在
-                if(ObjectUtil.isNotNull(wordMapEntry)) {
-
+                if (ObjectUtil.isNotNull(wordMap)) {
+                    // 直接将获取到的 map 作为当前 map 继续操作
+                    currentMap = (Map) wordMap;
                } else {
-//                    // 如果集合不存在，直接新建一个 map
-//                    wordMap = new HashMap<>(size);
-//                    // 判断是否为最后一个，如果是则设置为1
-//                    boolean isEnd = i == size - 1;
-//                    // 设置最后的结果
-//                    wordMap.put(charStr, isEnd);
+                    // 不存在，则构建一个新的 map，同时将 isEnd 设置为 false，因为它不是最后一个字
+                    Map<String, Boolean> newWordMap = new HashMap<>();
+                    newWordMap.put(AppConst.IS_END, false);
+
+                    // 将新的节点放入当前 map 中
+                    currentMap.put(charKey, newWordMap);
+
+                    // 将新节点设置为当前节点，方便下一次节点的循环。
+                    currentMap = newWordMap;
+                }
+
+                // 判断是否为最后一个，添加是否结束的标识。
+                if (i == size - 1) {
+                    currentMap.put(AppConst.IS_END, true);
                }
            }
-
        }
-        return resultMap;
+
+        long endTime = System.currentTimeMillis();
+        System.out.println("Init sensitive word map end! Cost time " + (endTime-startTime) + "ms");
    }

-    public static void main(String[] args) {
-        System.out.println("s".toCharArray()[0]+"");
+    /**
+     * 是否包含
+     * （1）直接遍历所有
+     * （2）如果遇到，则直接返回 true
+     *
+     * @param string 字符串
+     * @return 是否包含
+     * @since 0.0.1
+     */
+    @Override
+    public boolean contains(String string) {
+        if (StringUtil.isEmpty(string)) {
+            return false;
+        }
+
+        for (int i = 0; i < string.length(); i++) {
+            int checkResult = checkSensitiveWord(string, i);
+            // 快速返回
+            if (checkResult > 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * 返回所有对应的敏感词
+     * （1）结果是有序的
+     * （2）结果是默认去重的
+     *
+     * @param string 原始字符串
+     * @return 结果
+     * @since 0.0.1
+     */
+    @Override
+    public List<String> findAll(String string) {
+        return getSensitiveWords(string, ValidModeEnum.FAIL_OVER);
+    }
+
+    @Override
+    public String findFirst(String string) {
+        List<String> stringList = getSensitiveWords(string, ValidModeEnum.FAIL_FAST);
+
+        if (CollectionUtil.isEmpty(stringList)) {
+            return null;
+        }
+
+        return stringList.get(0);
+    }
+
+    /**
+     * 获取敏感词列表
+     *
+     * @param text     文本
+     * @param modeEnum 模式
+     * @return 结果列表
+     * @since 0.0.1
+     */
+    private List<String> getSensitiveWords(final String text, final ValidModeEnum modeEnum) {
+        //1. 文本为空则不存在敏感词，直接返回空列表
+        if (StringUtil.isEmpty(text)) {
+            return Guavas.newArrayList();
+        }
+
+        List<String> resultList = Guavas.newArrayList();
+        for (int i = 0; i < text.length(); i++) {
+            int wordLength = checkSensitiveWord(text, i);
+
+            // 命中
+            if (wordLength > 0) {
+                // 保存敏感词
+                String sensitiveWord = text.substring(i, i+wordLength);
+
+                // 添加去重
+                if(!resultList.contains(sensitiveWord)) {
+                    resultList.add(sensitiveWord);
+                }
+
+                // 快速返回
+                if (ValidModeEnum.FAIL_FAST.equals(modeEnum)) {
+                    break;
+                }
+
+                // 增加 i 的步长
+                // 为什么要-1，因为默认就会自增1
+                i += wordLength - 1;
+            }
+        }
+
+        return resultList;
+    }
+
+    /**
+     * 检查敏感词长度
+     * <p>
+     * （1）如果未命中敏感词，直接返回 0
+     * （2）命中敏感词，则返回敏感词的长度。
+     *
+     * @param txt        文本信息
+     * @param beginIndex 开始下标
+     * @return 敏感词对应的长度
+     * @since 0.0.1
+     */
+    private int checkSensitiveWord(String txt, int beginIndex) {
+        Map nowMap = sensitiveWordMap;
+
+        boolean flag = false;
+        // 记录敏感词的长度
+        int sensitiveWordLength = 0;
+
+        for (int i = beginIndex; i < txt.length(); i++) {
+            char charKey = txt.charAt(i);
+            // 判断该字是否存在于敏感词库中
+            // 并且将 nowMap 替换为新的 map，进入下一层的循环。
+            nowMap = (Map) nowMap.get(charKey);
+            if (ObjectUtil.isNotNull(nowMap)) {
+                sensitiveWordLength++;
+
+                // 判断是否是敏感词的结尾字，如果是结尾字则判断是否继续检测
+                boolean isEnd = (boolean) nowMap.get(AppConst.IS_END);
+                if (isEnd) {
+                    flag = true;
+
+                    // 这里直接默认 fail-fast 即可。
+                    break;
+                }
+            } else {
+                // 直接跳出循环
+                break;
+            }
+        }
+
+        if (!flag) {
+            sensitiveWordLength = 0;
+        }
+        return sensitiveWordLength;
    }

 }
--- a/src/main/resources/dict.txt
+++ b/src/main/resources/dict.txt
@@ -164386,7 +164386,6 @@ z以留吧以其以武
 百花故事
 百花盛放
 百行教师贱
-的
 的同修
 的妹 子 都 很 急 约
 的阿斗
--- a/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java
+++ b/src/test/java/com/github/houbb/sensitive/word/bs/SensitiveWordBsTest.java
@@ -0,0 +1,40 @@
+package com.github.houbb.sensitive.word.bs;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.List;
+
+/**
+ * <p> project: sensitive-word-SensitiveWordBsTest </p>
+ * <p> create on 2020/1/7 23:43 </p>
+ *
+ * @author Administrator
+ * @since 0.0.1
+ */
+public class SensitiveWordBsTest {
+
+    @Test
+    public void containsTest() {
+        final String text = "五星红旗迎风飘扬，毛主席的画像屹立在天安门前。。";
+
+        Assert.assertTrue(SensitiveWordBs.getInstance().contains(text));
+    }
+
+    @Test
+    public void findAllTest() {
+        final String text = "五星红旗迎风飘扬，毛主席的画像屹立在天安门前。";
+
+        List<String> wordList = SensitiveWordBs.getInstance().findAll(text);
+        Assert.assertEquals("[五星红旗, 毛主席, 天安门]", wordList.toString());
+    }
+
+    @Test
+    public void findFirstTest() {
+        final String text = "五星红旗迎风飘扬，毛主席的画像屹立在天安门前。";
+
+        String word = SensitiveWordBs.getInstance().findFirst(text);
+        Assert.assertEquals("五星红旗", word);
+    }
+
+}
--- a/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java
+++ b/src/test/java/com/github/houbb/sensitive/word/data/DataInitTest.java
@@ -102,4 +102,18 @@ public class DataInitTest {
        FileUtil.write(target, disctinct);
    }

+    @Test
+    @Ignore
+    public void oneWordTest() {
+        final String source = "D:\\_github\\sensitive-word\\src\\main\\resources\\dict.txt";
+
+        List<String> lines = FileUtil.readAllLines(source);
+        for(int i = 0; i < lines.size(); i++) {
+            String line = lines.get(i);
+            if(line.trim().length() == 1) {
+                System.out.println(i + " === " + line);
+            }
+        }
+    }
+
 }