diff --git a/doc/issues/v2-基本敏感词的简化.md b/doc/issues/v2-基本敏感词的简化.md index d444465..8eb55cf 100644 --- a/doc/issues/v2-基本敏感词的简化.md +++ b/doc/issues/v2-基本敏感词的简化.md @@ -20,253 +20,4 @@ 对应的任意写法。 -https://github.com/toolgood 思想值得借鉴。 - -## 单个字 - -48339 === Q -83586 === q -117538 === ━ -117539 === │ -117540 === ┃ -117541 === ┄ -117542 === ┅ -117554 === ┆ -117555 === ┇ -117556 === ┈ -117557 === ┉ -117558 === ┊ -117559 === ┋ -117560 === ┌ -117561 === ┍ -117562 === ┎ -117563 === ┏ -117564 === ┐ -117565 === ┑ -117566 === ┒ -117567 === ┓ -117568 === └ -117569 === ┕ -117570 === ┖ -117571 === ┗ -117572 === ┘ -117573 === ┙ -117574 === ┚ -117575 === ┛ -117576 === ├ -117577 === ┝ -117578 === ┞ -117579 === ┟ -117580 === ┠ -117581 === ┡ -117582 === ┢ -117583 === ┣ -117584 === ┤ -117585 === ┥ -117586 === ┦ -117587 === ┧ -117588 === ┨ -117589 === ┩ -117590 === ┪ -117591 === ┫ -117592 === ┬ -117593 === ┭ -117594 === ┮ -117595 === ┯ -117596 === ┰ -117597 === ┱ -117598 === ┲ -117599 === ┳ -117600 === ┴ -117601 === ┵ -117602 === ┶ -117603 === ┷ -117604 === ┸ -117605 === ┹ -117606 === ┺ -117607 === ┻ -117609 === ┼ -117610 === ┽ -117611 === ┾ -117612 === ┿ -117613 === ╀ -117614 === ╁ -117615 === ╂ -117616 === ╃ -117617 === ╄ -117618 === ╅ -117619 === ╆ -117620 === ╇ -117621 === ╈ -117622 === ╉ -117623 === ╊ -117624 === ╋ -117846 === ㄖ -121501 === 买 -121979 === 乳 -123013 === 仆 -133622 === 功 -133786 === 動 -133790 === 務 -134011 === 區 -134255 === 卐 -134287 === 卖 -134910 === 卵 -135512 === 口 -136392 === 吊 -136576 === 吨 -137367 === 喷 -137479 === 嘸 -139926 === 奸 -140085 === 妈 -140126 === 妓 -140373 === 姘 -140397 === 姦 -140409 === 姩 -140464 === 娘 -140498 === 娼 -140503 === 婊 -140519 === 婬 -140562 === 媽 -140585 === 嫖 -140668 === 孕 -141291 === 寇 -141668 === 射 -142550 === 尻 -142603 === 尿 -142620 === 屄 -142639 === 屌 -142650 === 屍 -142653 === 屎 -142665 === 屙 -143107 === 巯 -143346 === 干 -143535 === 幹 -143735 === 床 -144165 === 弓 -144386 === 弩 -144931 === 忍 -145146 === 性 -145905 === 慰 -145913 === 慾 -146837 === 
戳 -146919 === 房 -147574 === 扣 -149446 === 抠 -149774 === 抽 -150089 === 挂 -150244 === 捻 -150260 === 掛 -150296 === 掯 -151938 === 插 -152406 === 操 -153468 === 日 -154328 === 曰 -154902 === 本 -155789 === 枪 -156187 === 槍 -156578 === 歌 -156780 === 死 -158105 === 氟 -158172 === 氯 -158265 === 氰 -158565 === 汞 -159598 === 洱 -159944 === 淪 -159948 === 淫 -161116 === 滚 -161125 === 滛 -161669 === 灾 -161676 === 炮 -161774 === 烂 -161845 === 烯 -161856 === 烷 -162055 === 爛 -162196 === 爽 -162941 === 獨 -162985 === 獸 -163396 === 甙 -163934 === 畜 -165856 === 眯 -165880 === 睾 -165889 === 瞳 -166039 === 砒 -166049 === 砜 -166086 === 砷 -166097 === 础 -166234 === 硼 -166254 === 碡 -166265 === 碱 -166275 === 碼 -166290 === 磷 -166298 === 磺 -166876 === 穴 -167390 === 糞 -167499 === 統 -167536 === 綸 -167961 === 罂 -168722 === 羟 -168800 === 羰 -169070 === 耣 -169444 === 肏 -169474 === 肛 -169508 === 肝 -169679 === 肼 -169680 === 肾 -169725 === 胂 -169729 === 胍 -169883 === 胺 -169907 === 脬 -169939 === 腈 -170004 === 膦 -170283 === 臺 -170406 === 色 -171007 === 苄 -171216 === 茎 -171229 === 草 -171395 === 萋 -171473 === 葵 -171614 === 蔻 -172474 === 裸 -172599 === 褻 -172877 === 証 -174115 === 賤 -174531 === 贱 -174972 === 踢 -174984 === 蹣 -175044 === 躶 -175063 === 輪 -175475 === 轮 -175543 === 辦 -176368 === 逼 -176679 === 酐 -176733 === 酮 -176734 === 酯 -176735 === 酰 -176767 === 醚 -176768 === 醛 -177126 === 鈤 -177295 === 鎷 -177321 === 钒 -177332 === 钠 -177487 === 铀 -177569 === 铊 -179476 === 锇 -179520 === 镉 -179521 === 镍 -179803 === 阴 -180109 === 陰 -180173 === 隂 -180292 === 雞 -180594 === 靠 -181185 === 騒 -181190 === 騷 -181303 === 驽 -181352 === 骚 -182246 === 鯫 -182247 === 鰢 -182306 === 鸠 -182308 === 鸡 -182405 === 鸨 -183438 === B -183491 === b +https://github.com/toolgood 思想值得借鉴。 \ No newline at end of file diff --git a/doc/issues/关联框架.md b/doc/issues/关联框架.md index 3fa6900..c6246a7 100644 --- a/doc/issues/关联框架.md +++ b/doc/issues/关联框架.md @@ -1,4 +1,4 @@ -分词 +stop-word 拼音 @@ -6,10 +6,16 @@ 全角半角转换 -中文英文转换 +重复词 + +# 其他 + +中文英文转换(待定) 手写 Regex +分词 + ## 核心原理 
DFA 算法 diff --git a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java index 661aef4..7b9ff22 100644 --- a/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java +++ b/src/main/java/com/github/houbb/sensitive/word/constant/AppConst.java @@ -23,6 +23,12 @@ public final class AppConst { * 字典的大小 * @since 0.0.1 */ - public static final int DICT_SIZE = 66337; + public static final int DICT_SIZE = 65711; + + /** + * 英语词典的大小 + * @since 0.0.4 + */ + public static final int DICT_EN_SIZE = 12; } diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java index b35e1b7..b551ad9 100644 --- a/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java +++ b/src/main/java/com/github/houbb/sensitive/word/support/data/SensitiveWordData.java @@ -27,8 +27,9 @@ public class SensitiveWordData implements IWordData { static { synchronized (SensitiveWordData.class) { long start = System.currentTimeMillis(); - defaultLines = Guavas.newArrayList(AppConst.DICT_SIZE); + defaultLines = Guavas.newArrayList(AppConst.DICT_SIZE+AppConst.DICT_EN_SIZE); defaultLines = StreamUtil.readAllLines("/dict.txt"); + defaultLines.addAll(StreamUtil.readAllLines("/dict_en.txt")); long end = System.currentTimeMillis(); System.out.println("Sensitive data loaded!, cost time: " + (end - start) + " ms"); } diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/CharUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/CharUtils.java new file mode 100644 index 0000000..c3fcc3a --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/utils/CharUtils.java @@ -0,0 +1,68 @@ +package com.github.houbb.sensitive.word.utils; + +import com.github.houbb.heaven.util.guava.Guavas; +import com.github.houbb.heaven.util.lang.ObjectUtil; + +import java.util.Map; + 
+/** + *

project: sensitive-word-CharUtils

+ *

create on 2020/1/8 22:18

+ * + * @author Administrator + * @since 0.0.4 + */ +public final class CharUtils { + + private CharUtils() { + } + + /** + * 英文字母1 + * @since 0.0.4 + */ + private static final String LETTERS_ONE = + "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" + + "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" + + "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵"; + + /** + * 英文字母2 + * @since 0.0.4 + */ + private static final String LETTERS_TWO = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + + "abcdefghijklmnopqrstuvwxyz" + + "abcdefghijklmnopqrstuvwxyz"; + + + /** + * 英文字母 map + * @since 0.0.4 + */ + private static final Map LETTER_MAP = Guavas.newHashMap(LETTERS_ONE.length()); + + static { + final int size = LETTERS_ONE.length(); + + for(int i = 0; i < size; i++) { + LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i)); + } + } + + /** + * 映射后的 char + * @param character 待转换的 char + * @return 结果 + * @since 0.0.4 + */ + public static Character getMappingChar(final Character character) { + final Character mapChar = LETTER_MAP.get(character); + if(ObjectUtil.isNotNull(mapChar)) { + return mapChar; + } + + return character; + } + +} diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/NumUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/NumUtils.java new file mode 100644 index 0000000..daa525f --- /dev/null +++ b/src/main/java/com/github/houbb/sensitive/word/utils/NumUtils.java @@ -0,0 +1,107 @@ +package com.github.houbb.sensitive.word.utils; + +import com.github.houbb.heaven.util.guava.Guavas; +import com.github.houbb.heaven.util.lang.ObjectUtil; +import com.github.houbb.heaven.util.lang.StringUtil; + +import java.util.Map; + +/** + *

project: sensitive-word-NumUtils

+ *

create on 2020/1/8 22:18

+ * + * @author Administrator + * @since 0.0.4 + */ +public final class NumUtils { + + private NumUtils(){} + + private static final String NUM_ONE = "⓪0零º₀⓿○" + + "123456789" + + "一二三四五六七八九" + + "壹贰叁肆伍陆柒捌玖" + + "¹²³⁴⁵⁶⁷⁸⁹" + + "₁₂₃₄₅₆₇₈₉" + + "①②③④⑤⑥⑦⑧⑨" + + "⑴⑵⑶⑷⑸⑹⑺⑻⑼" + + "⒈⒉⒊⒋⒌⒍⒎⒏⒐" + + "❶❷❸❹❺❻❼❽❾" + + "➀➁➂➃➄➅➆➇➈" + + "➊➋➌➍➎➏➐➑➒" + + "㈠㈡㈢㈣㈤㈥㈦㈧㈨" + + "⓵⓶⓷⓸⓹⓺⓻⓼⓽" + + "㊀㊁㊂㊃㊄㊅㊆㊇㊈" + + "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" + + "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ"; + + private static final String NUM_TWO = "0000000"+ + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789" + + "123456789"; + + /** + * 英文字母 map + * @since 0.0.4 + */ + private static final Map NUMBER_MAP = Guavas.newHashMap(NUM_ONE.length()); + + static { + final int size = NUM_ONE.length(); + + for(int i = 0; i < size; i++) { + NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i)); + } + } + + /** + * 映射后的 char + * @param character 待转换的 char + * @return 结果 + * @since 0.0.4 + */ + public static Character getMappingChar(final Character character) { + final Character mapChar = NUMBER_MAP.get(character); + if(ObjectUtil.isNotNull(mapChar)) { + return mapChar; + } + + return character; + } + + public static String getMappingString(final String string) { + if(StringUtil.isEmpty(string)) { + return string; + } + + char[] chars = string.toCharArray(); + StringBuilder stringBuilder = new StringBuilder(chars.length); + for(char c : chars) { + char mapChar = getMappingChar(c); + + //TODO: stop word 的处理 + stringBuilder.append(mapChar); + } + + return stringBuilder.toString(); + } + +} diff --git a/src/main/resources/dict.txt b/src/main/resources/dict.txt index d4c0c09..0ab38fd 100644 Binary files a/src/main/resources/dict.txt and b/src/main/resources/dict.txt differ diff --git 
a/src/main/resources/dict_en.txt b/src/main/resources/dict_en.txt new file mode 100644 index 0000000..8b438c3 --- /dev/null +++ b/src/main/resources/dict_en.txt @@ -0,0 +1,12 @@ +fuck +duck +shit +chicken +fowl +sex +sexy +prostitute +whore +harlot +hooker +gender \ No newline at end of file diff --git a/src/test/java/com/github/houbb/sensitive/word/data/DictSlimTest.java b/src/test/java/com/github/houbb/sensitive/word/data/DictSlimTest.java index 793ed30..2c728bc 100644 --- a/src/test/java/com/github/houbb/sensitive/word/data/DictSlimTest.java +++ b/src/test/java/com/github/houbb/sensitive/word/data/DictSlimTest.java @@ -3,10 +3,12 @@ package com.github.houbb.sensitive.word.data; import com.github.houbb.heaven.support.filter.IFilter; import com.github.houbb.heaven.support.handler.IHandler; import com.github.houbb.heaven.util.io.FileUtil; +import com.github.houbb.heaven.util.lang.NumUtil; import com.github.houbb.heaven.util.lang.StringUtil; import com.github.houbb.heaven.util.util.CollectionUtil; import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap; import com.github.houbb.opencc4j.support.segment.impl.CharSegment; +import com.github.houbb.sensitive.word.utils.NumUtils; import org.junit.Ignore; import org.junit.Test; @@ -85,10 +87,64 @@ public class DictSlimTest { FileUtil.write(targetFile, resultList); } + /** + * 数字映射处理 + * @since 0.0.4 + */ + @Test + public void removeNumberMappingTest() { + final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\dict.txt"; + final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\dict.txt"; + + List words = FileUtil.readAllLines(sourceFile); + List formats = CollectionUtil.toList(words, new IHandler() { + @Override + public String handle(String s) { + return s.replaceAll(" ", ""); + } + }); + List filters = CollectionUtil.filterList(formats, new IFilter() { + @Override + public boolean filter(String string) { + return isNumber(string); + } + }); + + List resultList = 
DataUtil.disctinctAndSort(filters); + FileUtil.write(targetFile, resultList); + } + + /** + * 是否为存数字 + * (1)数字小于4的直接跳过。 + * @param string 原始字符串 + * @return 结果 + * @since 0.0.4 + */ + private static boolean isNumber(final String string) { + if(string.length() <= 4) { + return false; + } + + // 停顿词语 + String trim = string.replaceAll("加|否|与|和", ""); + String mapString = NumUtils.getMappingString(trim); + boolean result = StringUtil.isDigit(mapString); + if(result) { + System.out.println(string); + } + return result; + } + private static boolean isUrl(final String string) { return string.endsWith(".com") || string.endsWith(".cn") || string.endsWith(".org"); } + public static void main(String[] args) { + String trim = "1和2".replaceAll("加|否|与|和", ""); + System.out.println(trim); + } + }