[Feature] add for new

This commit is contained in:
houbb
2020-01-08 23:37:12 +08:00
parent 59c888ba93
commit 81d1399404
9 changed files with 261 additions and 254 deletions

View File

@@ -20,253 +20,4 @@
对应的任意写法。
https://github.com/toolgood 思想值得借鉴。
## 单个字
48339 === Q
83586 === q
117538 === ━
117539 === │
117540 === ┃
117541 === ┄
117542 === ┅
117554 === ┆
117555 === ┇
117556 === ┈
117557 === ┉
117558 === ┊
117559 === ┋
117560 === ┌
117561 === ┍
117562 === ┎
117563 === ┏
117564 === ┐
117565 === ┑
117566 === ┒
117567 === ┓
117568 === └
117569 === ┕
117570 === ┖
117571 === ┗
117572 === ┘
117573 === ┙
117574 === ┚
117575 === ┛
117576 === ├
117577 === ┝
117578 === ┞
117579 === ┟
117580 === ┠
117581 === ┡
117582 === ┢
117583 === ┣
117584 === ┤
117585 === ┥
117586 === ┦
117587 === ┧
117588 === ┨
117589 === ┩
117590 === ┪
117591 === ┫
117592 === ┬
117593 === ┭
117594 === ┮
117595 === ┯
117596 === ┰
117597 === ┱
117598 === ┲
117599 === ┳
117600 === ┴
117601 === ┵
117602 === ┶
117603 === ┷
117604 === ┸
117605 === ┹
117606 === ┺
117607 === ┻
117609 === ┼
117610 === ┽
117611 === ┾
117612 === ┿
117613 === ╀
117614 === ╁
117615 === ╂
117616 === ╃
117617 === ╄
117618 === ╅
117619 === ╆
117620 === ╇
117621 === ╈
117622 === ╉
117623 === ╊
117624 === ╋
117846 === ㄖ
121501 === 买
121979 === 乳
123013 === 仆
133622 === 功
133786 === 動
133790 === 務
134011 === 區
134255 === 卐
134287 === 卖
134910 === 卵
135512 === 口
136392 === 吊
136576 === 吨
137367 === 喷
137479 === 嘸
139926 === 奸
140085 === 妈
140126 === 妓
140373 === 姘
140397 === 姦
140409 === 姩
140464 === 娘
140498 === 娼
140503 === 婊
140519 === 婬
140562 === 媽
140585 === 嫖
140668 === 孕
141291 === 寇
141668 === 射
142550 === 尻
142603 === 尿
142620 === 屄
142639 === 屌
142650 === 屍
142653 === 屎
142665 === 屙
143107 === 巯
143346 === 干
143535 === 幹
143735 === 床
144165 === 弓
144386 === 弩
144931 === 忍
145146 === 性
145905 === 慰
145913 === 慾
146837 === 戳
146919 === 房
147574 === 扣
149446 === 抠
149774 === 抽
150089 === 挂
150244 === 捻
150260 === 掛
150296 === 掯
151938 === 插
152406 === 操
153468 === 日
154328 === 曰
154902 === 本
155789 === 枪
156187 === 槍
156578 === 歌
156780 === 死
158105 === 氟
158172 === 氯
158265 === 氰
158565 === 汞
159598 === 洱
159944 === 淪
159948 === 淫
161116 === 滚
161125 === 滛
161669 === 灾
161676 === 炮
161774 === 烂
161845 === 烯
161856 === 烷
162055 === 爛
162196 === 爽
162941 === 獨
162985 === 獸
163396 === 甙
163934 === 畜
165856 === 眯
165880 === 睾
165889 === 瞳
166039 === 砒
166049 === 砜
166086 === 砷
166097 === 础
166234 === 硼
166254 === 碡
166265 === 碱
166275 === 碼
166290 === 磷
166298 === 磺
166876 === 穴
167390 === 糞
167499 === 統
167536 === 綸
167961 === 罂
168722 === 羟
168800 === 羰
169070 === 耣
169444 === 肏
169474 === 肛
169508 === 肝
169679 === 肼
169680 === 肾
169725 === 胂
169729 === 胍
169883 === 胺
169907 === 脬
169939 === 腈
170004 === 膦
170283 === 臺
170406 === 色
171007 === 苄
171216 === 茎
171229 === 草
171395 === 萋
171473 === 葵
171614 === 蔻
172474 === 裸
172599 === 褻
172877 === 証
174115 === 賤
174531 === 贱
174972 === 踢
174984 === 蹣
175044 === 躶
175063 === 輪
175475 === 轮
175543 === 辦
176368 === 逼
176679 === 酐
176733 === 酮
176734 === 酯
176735 === 酰
176767 === 醚
176768 === 醛
177126 === 鈤
177295 === 鎷
177321 === 钒
177332 === 钠
177487 === 铀
177569 === 铊
179476 === 锇
179520 === 镉
179521 === 镍
179803 === 阴
180109 === 陰
180173 === 隂
180292 === 雞
180594 === 靠
181185 === 騒
181190 === 騷
181303 === 驽
181352 === 骚
182246 === 鯫
182247 === 鰢
182306 === 鸠
182308 === 鸡
182405 === 鸨
183438 ===
183491 ===
https://github.com/toolgood 思想值得借鉴。

View File

@@ -1,4 +1,4 @@
分词
stop-word
拼音
@@ -6,10 +6,16 @@
全角半角转换
中文英文转换
重复词
# 其他
中文英文转换(待定)
手写 Regex
分词
## 核心原理
DFA 算法

View File

@@ -23,6 +23,12 @@ public final class AppConst {
* 字典的大小
* @since 0.0.1
*/
public static final int DICT_SIZE = 66337;
public static final int DICT_SIZE = 65711;
/**
* Size of the English dictionary.
* @since 0.0.4
*/
public static final int DICT_EN_SIZE = 12;
}

View File

@@ -27,8 +27,9 @@ public class SensitiveWordData implements IWordData {
static {
synchronized (SensitiveWordData.class) {
long start = System.currentTimeMillis();
defaultLines = Guavas.newArrayList(AppConst.DICT_SIZE);
defaultLines = Guavas.newArrayList(AppConst.DICT_SIZE+AppConst.DICT_EN_SIZE);
defaultLines = StreamUtil.readAllLines("/dict.txt");
defaultLines.addAll(StreamUtil.readAllLines("/dict_en.txt"));
long end = System.currentTimeMillis();
System.out.println("Sensitive data loaded!, cost time: " + (end - start) + " ms");
}

View File

@@ -0,0 +1,68 @@
package com.github.houbb.sensitive.word.utils;
import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import java.util.Map;
/**
 * Character helpers that translate decorated Latin letters (circled capitals,
 * circled small letters and parenthesized small letters) into the plain
 * letters listed at the same position of {@code LETTERS_TWO}.
 *
 * <p> project: sensitive-word-CharUtils </p>
 * <p> create on 2020/1/8 22:18 </p>
 *
 * @author Administrator
 * @since 0.0.4
 */
public final class CharUtils {

    /**
     * Decorated letters; position {@code i} is translated to position
     * {@code i} of {@link #LETTERS_TWO}.
     * @since 0.0.4
     */
    private static final String LETTERS_ONE =
            "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
            "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
            "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";

    /**
     * Replacement letters, aligned index-by-index with {@link #LETTERS_ONE}.
     * @since 0.0.4
     */
    private static final String LETTERS_TWO =
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
            "abcdefghijklmnopqrstuvwxyz" +
            "abcdefghijklmnopqrstuvwxyz";

    /**
     * Lookup table from a decorated letter to its replacement.
     * @since 0.0.4
     */
    private static final Map<Character, Character> LETTER_MAP = buildLetterMap();

    private CharUtils() {
    }

    /**
     * Pairs every character of {@link #LETTERS_ONE} with the character at the
     * same index of {@link #LETTERS_TWO}.
     * @return the populated lookup table
     */
    private static Map<Character, Character> buildLetterMap() {
        final int total = LETTERS_ONE.length();
        final Map<Character, Character> table = Guavas.newHashMap(total);
        for (int index = 0; index < total; index++) {
            final char decorated = LETTERS_ONE.charAt(index);
            final char plain = LETTERS_TWO.charAt(index);
            table.put(decorated, plain);
        }
        return table;
    }

    /**
     * Looks up the replacement for a character.
     * @param character the character to translate
     * @return the mapped character when the lookup table has an entry for it,
     *         otherwise the argument itself
     * @since 0.0.4
     */
    public static Character getMappingChar(final Character character) {
        final Character replacement = LETTER_MAP.get(character);
        // Both operands are boxed, so no unboxing happens in this conditional.
        return ObjectUtil.isNotNull(replacement) ? replacement : character;
    }

}

View File

@@ -0,0 +1,107 @@
package com.github.houbb.sensitive.word.utils;
import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import java.util.Map;
/**
 * Number helpers that translate digit look-alikes (Chinese numerals,
 * superscripts, subscripts, enclosed numbers, Roman numerals, ...) into the
 * characters listed at the same position of {@code NUM_TWO}.
 *
 * <p> project: sensitive-word-NumUtils </p>
 * <p> create on 2020/1/8 22:18 </p>
 *
 * @author Administrator
 * @since 0.0.4
 */
public final class NumUtils {

    /**
     * Digit look-alikes; position {@code i} is translated to position
     * {@code i} of {@link #NUM_TWO}. The first group holds the zero variants,
     * every following group holds the variants of one to nine.
     */
    private static final String NUM_ONE = "⓪0零º₀⓿○" +
            "一二三四五六七八九" +
            "壹贰叁肆伍陆柒捌玖" +
            "¹²³⁴⁵⁶⁷⁸⁹" +
            "₁₂₃₄₅₆₇₈₉" +
            "①②③④⑤⑥⑦⑧⑨" +
            "⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
            "⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
            "❶❷❸❹❺❻❼❽❾" +
            "➀➁➂➃➄➅➆➇➈" +
            "➊➋➌➍➎➏➐➑➒" +
            "㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
            "⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
            "㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
            "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
            "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";

    /**
     * Replacement digits, aligned index-by-index with {@link #NUM_ONE}.
     * It is longer than {@link #NUM_ONE}; only the leading
     * {@code NUM_ONE.length()} characters are ever read.
     */
    private static final String NUM_TWO = "0000000" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789";

    /**
     * Lookup table from a digit look-alike to its replacement digit.
     * @since 0.0.4
     */
    private static final Map<Character, Character> NUMBER_MAP = buildNumberMap();

    private NumUtils() {
    }

    /**
     * Pairs every character of {@link #NUM_ONE} with the character at the
     * same index of {@link #NUM_TWO}.
     * @return the populated lookup table
     */
    private static Map<Character, Character> buildNumberMap() {
        final int total = NUM_ONE.length();
        final Map<Character, Character> table = Guavas.newHashMap(total);
        for (int index = 0; index < total; index++) {
            final char variant = NUM_ONE.charAt(index);
            final char digit = NUM_TWO.charAt(index);
            table.put(variant, digit);
        }
        return table;
    }

    /**
     * Looks up the replacement for a character.
     * @param character the character to translate
     * @return the mapped character when the lookup table has an entry for it,
     *         otherwise the argument itself
     * @since 0.0.4
     */
    public static Character getMappingChar(final Character character) {
        final Character replacement = NUMBER_MAP.get(character);
        // Both operands are boxed, so no unboxing happens in this conditional.
        return ObjectUtil.isNotNull(replacement) ? replacement : character;
    }

    /**
     * Applies {@link #getMappingChar(Character)} to every character of a string.
     * @param string the text to translate
     * @return the argument itself when {@code StringUtil.isEmpty} reports it as
     *         empty, otherwise a new string of the same length with each
     *         character replaced by its mapping
     */
    public static String getMappingString(final String string) {
        if (StringUtil.isEmpty(string)) {
            return string;
        }

        final int length = string.length();
        final StringBuilder converted = new StringBuilder(length);
        for (int index = 0; index < length; index++) {
            // Unbox into a char so the char overload of append is used.
            final char mapped = getMappingChar(string.charAt(index));
            //TODO: stop word handling
            converted.append(mapped);
        }
        return converted.toString();
    }

}

Binary file not shown.

View File

@@ -0,0 +1,12 @@
fuck
duck
shit
chicken
fowl
sex
sexy
prostitute
whore
harlot
hooker
gender

View File

@@ -3,10 +3,12 @@ package com.github.houbb.sensitive.word.data;
import com.github.houbb.heaven.support.filter.IFilter;
import com.github.houbb.heaven.support.handler.IHandler;
import com.github.houbb.heaven.util.io.FileUtil;
import com.github.houbb.heaven.util.lang.NumUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
import com.github.houbb.sensitive.word.utils.NumUtils;
import org.junit.Ignore;
import org.junit.Test;
@@ -85,10 +87,64 @@ public class DictSlimTest {
FileUtil.write(targetFile, resultList);
}
/**
 * One-off maintenance task that rewrites the dictionary file in place.
 *
 * <p>Steps: read every line of {@code dict.txt}, strip the space characters
 * from each line, pass the lines through {@code CollectionUtil.filterList}
 * with {@link #isNumber(String)} as the filter, de-duplicate and sort the
 * outcome with {@code DataUtil.disctinctAndSort}, then write it back to the
 * same file.</p>
 *
 * <p>NOTE(review): whether {@code filterList} keeps or drops the entries for
 * which the filter returns {@code true} is defined by the library, not here;
 * the method name suggests the numeric entries are removed - confirm.</p>
 *
 * <p>The task is ignored by default: it depends on an absolute path that only
 * exists on the author's machine and it overwrites a resource that ships with
 * the project, so it must never run as part of the regular test suite.
 * Remove the {@code @Ignore} temporarily to run it by hand.</p>
 *
 * @since 0.0.4
 */
@Test
@Ignore("Maintenance task: overwrites dict.txt at a machine-specific absolute path; run manually only")
public void removeNumberMappingTest() {
    // Source and target are deliberately the same file: the dictionary is rewritten in place.
    final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\dict.txt";
    final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\dict.txt";
    List<String> words = FileUtil.readAllLines(sourceFile);
    List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
        @Override
        public String handle(String s) {
            // Literal replacement; no regular expression is needed to drop spaces.
            return s.replace(" ", "");
        }
    });
    List<String> filters = CollectionUtil.filterList(formats, new IFilter<String>() {
        @Override
        public boolean filter(String string) {
            return isNumber(string);
        }
    });
    List<String> resultList = DataUtil.disctinctAndSort(filters);
    FileUtil.write(targetFile, resultList);
}
/**
 * Tells whether a dictionary entry consists only of digits once its
 * connective characters are removed and its digit look-alikes are normalized.
 *
 * <p>Entries of four characters or fewer are rejected without further checks.
 * Every accepted entry is also printed to standard output.</p>
 *
 * @param string the raw dictionary entry
 * @return {@code true} when the entry is longer than four characters and
 *         {@code StringUtil.isDigit} accepts its normalized form
 * @since 0.0.4
 */
private static boolean isNumber(final String string) {
    final int shortEntryLimit = 4;
    if (string.length() > shortEntryLimit) {
        // Remove the connective characters that may sit between the digits.
        final String withoutConnectives = string.replaceAll("加|否|与|和", "");
        final String normalized = NumUtils.getMappingString(withoutConnectives);
        if (StringUtil.isDigit(normalized)) {
            System.out.println(string);
            return true;
        }
    }
    return false;
}
/**
 * Tells whether a string ends with one of the suffixes
 * {@code .com}, {@code .cn} or {@code .org}.
 *
 * @param string the text to inspect
 * @return {@code true} when the text ends with one of the listed suffixes
 */
private static boolean isUrl(final String string) {
    final String[] suffixes = {".com", ".cn", ".org"};
    for (final String suffix : suffixes) {
        if (string.endsWith(suffix)) {
            return true;
        }
    }
    return false;
}
/**
 * Ad-hoc check of the connective-stripping pattern: prints a sample string
 * after the connective characters have been removed from it.
 *
 * @param args unused
 */
public static void main(String[] args) {
    System.out.println("1和2".replaceAll("加|否|与|和", ""));
}
}