mirror of
https://github.com/houbb/sensitive-word.git
synced 2026-03-22 08:27:36 +08:00
[Feature] add for new
This commit is contained in:
@@ -471,3 +471,9 @@
|
||||
|:---|:-----|-----------------------------|:------------------|:--------------------|
|
||||
| 1 | O | 改进 check、format 的 chains 方法 | 2025-9-5 16:22:24 | 优化性能 |
|
||||
| 2 | O | InnerWordFormatUtils#format | 2025-9-5 16:22:24 | 优化性能+内存 toCharArray |
|
||||
|
||||
# release_0.29.2
|
||||
|
||||
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|
||||
|:---|:-----|---------------------|:------------------|:--------------------|
|
||||
| 1 | O | 拆箱、装箱优化。优化数字。英文的格式化 | 2025-9-5 16:22:24 | 优化性能 |
|
||||
|
||||
28
pom.xml
28
pom.xml
@@ -6,7 +6,7 @@
|
||||
|
||||
<groupId>com.github.houbb</groupId>
|
||||
<artifactId>sensitive-word</artifactId>
|
||||
<version>0.29.1</version>
|
||||
<version>0.29.2</version>
|
||||
|
||||
<properties>
|
||||
<!--============================== All Plugins START ==============================-->
|
||||
@@ -115,7 +115,6 @@
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
@@ -182,6 +181,20 @@
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>${plugin.maven-javadoc-plugin.version}</version>
|
||||
<configuration>
|
||||
<!-- 指定源码编码 -->
|
||||
<encoding>UTF-8</encoding>
|
||||
<!-- 指定文档编码 -->
|
||||
<docencoding>UTF-8</docencoding>
|
||||
<!-- 输出 HTML 的 charset -->
|
||||
<charset>UTF-8</charset>
|
||||
<!-- 强制生成,不因警告/错误中断 -->
|
||||
<failOnError>false</failOnError>
|
||||
<!-- 可以跳过 doclint -->
|
||||
<additionalJOptions>
|
||||
<additionalJOption>-Xdoclint:none</additionalJOption>
|
||||
</additionalJOptions>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
</plugins>
|
||||
@@ -252,6 +265,17 @@
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-javadoc-plugin</artifactId>
|
||||
<version>${plugin.maven-javadoc-plugin.version}</version>
|
||||
<configuration>
|
||||
<encoding>UTF-8</encoding>
|
||||
<charset>UTF-8</charset>
|
||||
<docencoding>UTF-8</docencoding>
|
||||
<additionalJOptions>
|
||||
<additionalJOption>-Xdoclint:none</additionalJOption>
|
||||
<additionalJOption>-charset UTF-8</additionalJOption>
|
||||
<additionalJOption>-encoding UTF-8</additionalJOption>
|
||||
<additionalJOption>-docencoding UTF-8</additionalJOption>
|
||||
</additionalJOptions>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>package</phase>
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
package com.github.houbb.sensitive.word.collection;
|
||||
|
||||
/**
 * Open-addressing hash map from {@code char} to {@code char} backed by two
 * primitive arrays, so neither keys nor values are ever boxed.
 *
 * <p>Collisions are resolved with linear probing. The key {@code '\0'} is used
 * internally to mark a free slot and therefore can never be stored; see
 * {@link #put(char, char)}. Entries cannot be removed.
 *
 * <p>This class performs no synchronization of its own.
 *
 * @since 0.29.2
 */
public final class Char2CharMap {

    /** Sentinel stored in {@code keys} for a free slot; never a legal key. */
    private static final char EMPTY_KEY = '\0';

    /** Fraction of the table that may be occupied before it is doubled. */
    private static final float LOAD_FACTOR = 0.5f;

    /** Upper bound on distinct keys: every {@code char} value except {@link #EMPTY_KEY}. */
    private static final int MAX_ENTRIES = Character.MAX_VALUE;

    private char[] keys;
    private char[] values;
    private int size;
    /** {@code capacity - 1}; the capacity is a power of two, so {@code & mask} replaces modulo. */
    private int mask;
    /** Entry count at which the table is doubled. */
    private int maxSize;

    /**
     * Creates a map sized for roughly {@code expectedSize} entries.
     *
     * <p>The hint is clamped to {@code [0, 65535]}: a negative hint is treated as
     * zero, and a hint above the number of storable keys no longer overflows the
     * capacity computation into a request for gigabyte-sized arrays.
     *
     * @param expectedSize estimated number of entries; only a sizing hint
     */
    public Char2CharMap(int expectedSize) {
        final int bounded = Math.max(0, Math.min(expectedSize, MAX_ENTRIES));
        final int capacity = tableSizeFor((int) (bounded / LOAD_FACTOR) + 1);
        this.keys = new char[capacity];
        this.values = new char[capacity];
        this.mask = capacity - 1;
        this.maxSize = (int) (capacity * LOAD_FACTOR);
        this.size = 0;
    }

    /**
     * Rounds {@code cap} up to a power of two, with a minimum of 2 and a maximum of 2^30.
     *
     * @param cap requested capacity
     * @return the table capacity to allocate
     */
    private static int tableSizeFor(int cap) {
        int n = cap - 1;
        // Smear the highest set bit into every lower position, then add one.
        n |= n >>> 1;
        n |= n >>> 2;
        n |= n >>> 4;
        n |= n >>> 8;
        n |= n >>> 16;
        return (n < 2) ? 2 : (n >= (1 << 30) ? (1 << 30) : n + 1);
    }

    /**
     * Maps a key to its home slot: the low bits of the key multiplied by an odd constant.
     *
     * @param k key to hash
     * @return index in {@code [0, mask]}
     */
    private int hash(char k) {
        return (k * 0x9E3779B9) & mask;
    }

    /**
     * Inserts a mapping, or overwrites the value if {@code key} is already present.
     *
     * @param key   key to store; must not be {@code '\0'}
     * @param value value to associate with {@code key}
     * @throws IllegalArgumentException if {@code key} is {@code '\0'}
     */
    public void put(char key, char value) {
        if (key == EMPTY_KEY) {
            // The backslash is escaped so the message shows the text \0, not a raw NUL character.
            throw new IllegalArgumentException("Key '\\0' is reserved as EMPTY_KEY.");
        }
        int idx = hash(key);
        while (true) {
            if (keys[idx] == EMPTY_KEY) {
                keys[idx] = key;
                values[idx] = value;
                if (++size >= maxSize) {
                    resize();
                }
                return;
            } else if (keys[idx] == key) {
                values[idx] = value;
                return;
            }
            // Linear probing; the load factor guarantees a free slot exists.
            idx = (idx + 1) & mask;
        }
    }

    /**
     * Looks up {@code key}.
     *
     * @param key          key to look up
     * @param defaultValue value returned when {@code key} is absent
     * @return the mapped value, or {@code defaultValue} if there is no mapping
     */
    public char get(char key, char defaultValue) {
        if (key == EMPTY_KEY) {
            return defaultValue;
        }
        int idx = hash(key);
        while (true) {
            char k = keys[idx];
            if (k == EMPTY_KEY) {
                return defaultValue;
            }
            if (k == key) {
                return values[idx];
            }
            idx = (idx + 1) & mask;
        }
    }

    /**
     * Looks up {@code key}, returning {@code '\0'} when it is absent.
     *
     * <p>A stored value of {@code '\0'} is indistinguishable from a missing key
     * here; use {@link #get(char, char)} when that matters.
     *
     * @param key key to look up
     * @return the mapped value, or {@code '\0'} if there is no mapping
     */
    public char get(char key) {
        char defaultVal = 0;
        return get(key, defaultVal);
    }

    /**
     * Doubles the table and re-inserts every entry.
     *
     * <p>At most 65535 distinct keys exist, so the table stops growing at
     * 131072 slots and the doubling cannot overflow.
     */
    private void resize() {
        int newCap = keys.length << 1;
        char[] oldKeys = keys;
        char[] oldVals = values;

        keys = new char[newCap];
        values = new char[newCap];
        mask = newCap - 1;
        maxSize = (int) (newCap * LOAD_FACTOR);
        // put() counts the entries again as it re-inserts them.
        size = 0;

        for (int i = 0; i < oldKeys.length; i++) {
            char k = oldKeys[i];
            if (k != EMPTY_KEY) {
                put(k, oldVals[i]);
            }
        }
    }

    /**
     * Returns the number of stored mappings.
     *
     * @return the number of stored mappings
     */
    public int size() {
        return size;
    }
}
|
||||
|
||||
|
||||
@@ -1,203 +0,0 @@
|
||||
package com.github.houbb.sensitive.word.support.data;
|
||||
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.WordConst;
|
||||
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 敏感词 map
|
||||
*
|
||||
* 不再维护,降低维护成本
|
||||
*
|
||||
* @author binbin.hou
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@Deprecated
|
||||
public class WordDataHashMap extends AbstractWordData {
|
||||
|
||||
/**
|
||||
* 脱敏单词 map
|
||||
*
|
||||
* @since 0.0.1
|
||||
*/
|
||||
private Map innerWordMap;
|
||||
|
||||
/**
|
||||
* 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
|
||||
*
|
||||
* @param collection 敏感词库集合
|
||||
* @since 0.0.1
|
||||
* <p>
|
||||
* 使用对象代码 map 的这种一直递归。
|
||||
* 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html
|
||||
* https://blog.csdn.net/chenssy/article/details/26961957
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public synchronized void doInitWordData(Collection<String> collection) {
|
||||
// 避免扩容带来的消耗
|
||||
Map newInnerWordMap = new HashMap(collection.size());
|
||||
|
||||
for (String key : collection) {
|
||||
if (StringUtil.isEmpty(key)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 用来按照相应的格式保存敏感词库数据
|
||||
final int size = key.length();
|
||||
|
||||
// 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中
|
||||
Map currentMap = newInnerWordMap;
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
// 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值
|
||||
char charKey = key.charAt(i);
|
||||
// 如果集合存在
|
||||
Object wordMap = currentMap.get(charKey);
|
||||
|
||||
// 如果集合存在
|
||||
if (ObjectUtil.isNotNull(wordMap)) {
|
||||
// 直接将获取到的 map 当前当前 map 进行继续的操作
|
||||
currentMap = (Map) wordMap;
|
||||
} else {
|
||||
//不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一
|
||||
Map<String, Boolean> newWordMap = new HashMap<>(8);
|
||||
newWordMap.put(WordConst.IS_END, false);
|
||||
|
||||
// 将新的节点放入当前 map 中
|
||||
currentMap.put(charKey, newWordMap);
|
||||
|
||||
// 将新节点设置为当前节点,方便下一次节点的循环。
|
||||
currentMap = newWordMap;
|
||||
}
|
||||
}
|
||||
|
||||
// 判断是否为最后一个,添加是否结束的标识。
|
||||
currentMap.put(WordConst.IS_END, true);
|
||||
}
|
||||
|
||||
// 最后更新为新的 map,保证更新过程中旧的数据可用
|
||||
this.innerWordMap = newInnerWordMap;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doRemoveWord(Collection<String> collection) {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doAddWord(Collection<String> collection) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否包含
|
||||
* (1)直接遍历所有
|
||||
* (2)如果遇到,则直接返回 true
|
||||
*
|
||||
* @param stringBuilder 字符串
|
||||
* @param innerContext 内部上下文
|
||||
* @return 是否包含
|
||||
* @since 0.0.1
|
||||
*/
|
||||
@Override
|
||||
public WordContainsTypeEnum doContains(final StringBuilder stringBuilder,
|
||||
final InnerSensitiveWordContext innerContext) {
|
||||
return innerContainsSensitive(stringBuilder, innerContext);
|
||||
}
|
||||
|
||||
private WordContainsTypeEnum innerContainsSensitive(StringBuilder stringBuilder,
|
||||
final InnerSensitiveWordContext innerContext) {
|
||||
// 初始化为当前的 map
|
||||
Map nowMap = this.innerWordMap;
|
||||
|
||||
// 记录敏感词的长度
|
||||
final int len = stringBuilder.length();
|
||||
for (int i = 0; i < len; i++) {
|
||||
// 获取当前的 map 信息
|
||||
nowMap = getNowMap(nowMap, i, stringBuilder, innerContext);
|
||||
|
||||
// 如果不为空,则判断是否为结尾。
|
||||
if (ObjectUtil.isNull(nowMap)) {
|
||||
return WordContainsTypeEnum.NOT_FOUND;
|
||||
}
|
||||
}
|
||||
|
||||
// 是否为结尾,便于快速失败
|
||||
boolean isEnd = isEnd(nowMap);
|
||||
if(isEnd) {
|
||||
return WordContainsTypeEnum.CONTAINS_END;
|
||||
}
|
||||
|
||||
return WordContainsTypeEnum.CONTAINS_PREFIX;
|
||||
}
|
||||
|
||||
/**
|
||||
* 判断是否结束
|
||||
* BUG-FIX: 避免出现敏感词库中没有的文字。
|
||||
* @param map map 信息
|
||||
* @return 是否结束
|
||||
* @since 0.0.9
|
||||
*/
|
||||
private static boolean isEnd(final Map map) {
|
||||
if(ObjectUtil.isNull(map)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Object value = map.get(WordConst.IS_END);
|
||||
if(ObjectUtil.isNull(value)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (boolean)value;
|
||||
}
|
||||
/**
|
||||
* 获取当前的 Map
|
||||
* @param nowMap 原始的当前 map
|
||||
* @param index 下标
|
||||
* @param stringBuilder 文本缓存
|
||||
* @param sensitiveContext 上下文
|
||||
* @return 实际的当前 map
|
||||
* @since 0.0.7
|
||||
*/
|
||||
private Map getNowMap(Map nowMap,
|
||||
final int index,
|
||||
final StringBuilder stringBuilder,
|
||||
final InnerSensitiveWordContext sensitiveContext) {
|
||||
final IWordContext context = sensitiveContext.wordContext();
|
||||
|
||||
// 这里的 char 已经是统一格式化之后的,所以可以不用再次格式化。
|
||||
char mappingChar = stringBuilder.charAt(index);
|
||||
|
||||
// 这里做一次重复词的处理
|
||||
//TODO: 这里可以优化,是否获取一次。
|
||||
Map currentMap = (Map) nowMap.get(mappingChar);
|
||||
// 启用忽略重复&当前下标不是第一个
|
||||
if(context.ignoreRepeat()
|
||||
&& index > 0) {
|
||||
char preMappingChar = stringBuilder.charAt(index-1);
|
||||
|
||||
// 直接赋值为上一个 map
|
||||
if(preMappingChar == mappingChar) {
|
||||
currentMap = nowMap;
|
||||
}
|
||||
}
|
||||
|
||||
return currentMap;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void destroy() {
|
||||
if(innerWordMap != null) {
|
||||
innerWordMap.clear();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,15 +1,18 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 忽略英文的各种格式
|
||||
* @author binbin.hou
|
||||
* @since 0.0.6
|
||||
*/
|
||||
@Deprecated
|
||||
@ThreadSafe
|
||||
public class WordFormatIgnoreEnglishStyle implements IWordFormat {
|
||||
|
||||
@@ -19,9 +22,52 @@ public class WordFormatIgnoreEnglishStyle implements IWordFormat {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
/**
|
||||
* 英文字母1
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final String LETTERS_ONE =
|
||||
"ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
|
||||
"ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
|
||||
"⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";
|
||||
|
||||
/**
|
||||
* 英文字母2
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final String LETTERS_TWO =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
|
||||
"abcdefghijklmnopqrstuvwxyz" +
|
||||
"abcdefghijklmnopqrstuvwxyz";
|
||||
|
||||
|
||||
/**
|
||||
* 字母映射表
|
||||
*/
|
||||
private static final Map<Character,Character> LETTER_MAP = new HashMap<>(LETTERS_ONE.length());
|
||||
|
||||
static {
|
||||
final int size = LETTERS_ONE.length();
|
||||
for(int i = 0; i < size; i++) {
|
||||
LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 映射后的 char
|
||||
* @param c 待转换的 char
|
||||
* @return 转换结果
|
||||
* @since 0.29.x
|
||||
*/
|
||||
private char getMappingChar(final char c) {
|
||||
Character mapChar = LETTER_MAP.get(c);
|
||||
return mapChar == null ? c : mapChar;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
return InnerWordCharUtils.getMappingChar(original);
|
||||
return getMappingChar(original);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
import com.github.houbb.sensitive.word.collection.Char2CharMap;
|
||||
|
||||
/**
|
||||
* 忽略英文的各种格式
|
||||
* @author binbin.hou
|
||||
* @since 0.0.6
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class WordFormatIgnoreEnglishStyleC2C implements IWordFormat {
|
||||
|
||||
private static final IWordFormat INSTANCE = new WordFormatIgnoreEnglishStyleC2C();
|
||||
|
||||
public static IWordFormat getInstance() {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
/**
|
||||
* 英文字母1
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final String LETTERS_ONE =
|
||||
"ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
|
||||
"ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
|
||||
"⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";
|
||||
|
||||
/**
|
||||
* 英文字母2
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final String LETTERS_TWO =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
|
||||
"abcdefghijklmnopqrstuvwxyz" +
|
||||
"abcdefghijklmnopqrstuvwxyz";
|
||||
|
||||
|
||||
/**
|
||||
* 字母映射表
|
||||
*/
|
||||
private static final Char2CharMap LETTER_MAP = new Char2CharMap(LETTERS_ONE.length());
|
||||
|
||||
static {
|
||||
final int size = LETTERS_ONE.length();
|
||||
for(int i = 0; i < size; i++) {
|
||||
LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 映射后的 char
|
||||
* @param c 待转换的 char
|
||||
* @return 转换结果
|
||||
* @since 0.29.x
|
||||
*/
|
||||
private char getMappingChar(final char c) {
|
||||
char mc = LETTER_MAP.get(c);
|
||||
return mc == 0 ? c : mc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
return getMappingChar(original);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -3,13 +3,16 @@ package com.github.houbb.sensitive.word.support.format;
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 忽略数字的样式
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@Deprecated
|
||||
@ThreadSafe
|
||||
public class WordFormatIgnoreNumStyle implements IWordFormat {
|
||||
|
||||
@@ -19,9 +22,65 @@ public class WordFormatIgnoreNumStyle implements IWordFormat {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
private static final String NUM_ONE = "⓪0零º₀⓿○" +
|
||||
"123456789" +
|
||||
"一二三四五六七八九" +
|
||||
"壹贰叁肆伍陆柒捌玖" +
|
||||
"¹²³⁴⁵⁶⁷⁸⁹" +
|
||||
"₁₂₃₄₅₆₇₈₉" +
|
||||
"①②③④⑤⑥⑦⑧⑨" +
|
||||
"⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
|
||||
"⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
|
||||
"❶❷❸❹❺❻❼❽❾" +
|
||||
"➀➁➂➃➄➅➆➇➈" +
|
||||
"➊➋➌➍➎➏➐➑➒" +
|
||||
"㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
|
||||
"⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
|
||||
"㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
|
||||
"ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
|
||||
"ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";
|
||||
|
||||
private static final String NUM_TWO = "0000000"+
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789";
|
||||
|
||||
private static final Map<Character,Character> NUMBER_MAP = new HashMap<>(NUM_ONE.length());
|
||||
|
||||
static {
|
||||
final int size = NUM_ONE.length();
|
||||
for(int i = 0; i < size; i++) {
|
||||
NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 映射后的 char
|
||||
* @param c 待转换的 char
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private char getMappingChar(final char c) {
|
||||
Character mapChar = NUMBER_MAP.get(c);
|
||||
return mapChar == null ? c : mapChar;
|
||||
}
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
return InnerWordNumUtils.getMappingChar(original);
|
||||
return getMappingChar(original);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,86 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
import com.github.houbb.sensitive.word.collection.Char2CharMap;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* 忽略数字的样式
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
public class WordFormatIgnoreNumStyleC2C implements IWordFormat {
|
||||
|
||||
private static final IWordFormat INSTANCE = new WordFormatIgnoreNumStyleC2C();
|
||||
|
||||
public static IWordFormat getInstance() {
|
||||
return INSTANCE;
|
||||
}
|
||||
|
||||
private static final String NUM_ONE = "⓪0零º₀⓿○" +
|
||||
"123456789" +
|
||||
"一二三四五六七八九" +
|
||||
"壹贰叁肆伍陆柒捌玖" +
|
||||
"¹²³⁴⁵⁶⁷⁸⁹" +
|
||||
"₁₂₃₄₅₆₇₈₉" +
|
||||
"①②③④⑤⑥⑦⑧⑨" +
|
||||
"⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
|
||||
"⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
|
||||
"❶❷❸❹❺❻❼❽❾" +
|
||||
"➀➁➂➃➄➅➆➇➈" +
|
||||
"➊➋➌➍➎➏➐➑➒" +
|
||||
"㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
|
||||
"⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
|
||||
"㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
|
||||
"ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
|
||||
"ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";
|
||||
|
||||
private static final String NUM_TWO = "0000000"+
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789";
|
||||
|
||||
private static final Char2CharMap NUMBER_MAP = new Char2CharMap(NUM_ONE.length());
|
||||
|
||||
static {
|
||||
final int size = NUM_ONE.length();
|
||||
for(int i = 0; i < size; i++) {
|
||||
NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 映射后的 char
|
||||
* @param c 待转换的 char
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private char getMappingChar(final char c) {
|
||||
char mc = NUMBER_MAP.get(c);
|
||||
return mc == 0 ? c : mc;
|
||||
}
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
return getMappingChar(original);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,9 +1,9 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
import com.github.houbb.sensitive.word.utils.InnerCharUtils;
|
||||
|
||||
/**
|
||||
* 格式化字宽度
|
||||
@@ -21,7 +21,7 @@ public class WordFormatIgnoreWidth implements IWordFormat {
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
return CharUtil.toHalfWidth(original);
|
||||
return InnerCharUtils.toHalfWidth(original);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.heaven.annotation.ThreadSafe;
|
||||
import com.github.houbb.heaven.support.pipeline.Pipeline;
|
||||
import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 格式化责任链
|
||||
* @author binbin.hou
|
||||
* @since 0.0.5
|
||||
*/
|
||||
@ThreadSafe
|
||||
@Deprecated
|
||||
public abstract class WordFormatInit implements IWordFormat {
|
||||
|
||||
/**
|
||||
* 初始化列表
|
||||
*
|
||||
* @param pipeline 当前列表泳道
|
||||
* @since 0.0.13
|
||||
*/
|
||||
protected abstract void init(final Pipeline<IWordFormat> pipeline);
|
||||
|
||||
@Override
|
||||
public char format(char original, IWordContext context) {
|
||||
Pipeline<IWordFormat> pipeline = new DefaultPipeline<>();
|
||||
init(pipeline);
|
||||
|
||||
char result = original;
|
||||
|
||||
// 循环执行
|
||||
List<IWordFormat> charFormats = pipeline.list();
|
||||
for(IWordFormat charFormat : charFormats) {
|
||||
result = charFormat.format(result, context);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -51,7 +51,7 @@ public final class WordFormats {
|
||||
}
|
||||
|
||||
public static IWordFormat ignoreEnglishStyle() {
|
||||
return WordFormatIgnoreEnglishStyle.getInstance();
|
||||
return WordFormatIgnoreEnglishStyleC2C.getInstance();
|
||||
}
|
||||
|
||||
public static IWordFormat ignoreChineseStyle() {
|
||||
@@ -59,7 +59,7 @@ public final class WordFormats {
|
||||
}
|
||||
|
||||
public static IWordFormat ignoreNumStyle() {
|
||||
return WordFormatIgnoreNumStyle.getInstance();
|
||||
return WordFormatIgnoreNumStyleC2C.getInstance();
|
||||
}
|
||||
|
||||
public static IWordFormat ignoreWidth() {
|
||||
|
||||
@@ -2,7 +2,6 @@ package com.github.houbb.sensitive.word.support.format.mapping;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
import com.github.houbb.sensitive.word.support.check.WordCheckNone;
|
||||
import com.github.houbb.sensitive.word.support.format.WordFormatNone;
|
||||
|
||||
import java.util.Collections;
|
||||
@@ -26,6 +25,7 @@ public class WordFormatTextDefault extends AbstractWordFormatText {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
|
||||
//v0.29.2
|
||||
Map<Character, Character> map = new HashMap<>();
|
||||
for(int i = 0; i < text.length(); i++) {
|
||||
char c = text.charAt(i);
|
||||
|
||||
@@ -5,6 +5,24 @@ package com.github.houbb.sensitive.word.utils;
|
||||
*/
|
||||
public class InnerCharUtils {
|
||||
|
||||
/**
|
||||
* 转换为半角
|
||||
* @param original 原始
|
||||
* @return 半角
|
||||
* @since 0.29.2
|
||||
*/
|
||||
public static char toHalfWidth(char original) {
|
||||
// 全角空格
|
||||
if (original == '\u3000') return ' ';
|
||||
// 其他可转换全角字符
|
||||
if (original >= '\uFF01' && original <= '\uFF5E') {
|
||||
return (char) (original - 0xFEE0);
|
||||
}
|
||||
// 其他字符保持不变
|
||||
return original;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* 转换为整数
|
||||
* @param text 文本
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
package com.github.houbb.sensitive.word.utils;
|
||||
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordResult;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* <p> project: sensitive-word-NumUtils </p>
|
||||
* <p> create on 2020/1/8 22:18 </p>
|
||||
@@ -18,84 +14,6 @@ public final class InnerWordCharUtils {
|
||||
private InnerWordCharUtils() {
|
||||
}
|
||||
|
||||
/**
|
||||
* 英文字母1
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final String LETTERS_ONE =
|
||||
"ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
|
||||
"ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
|
||||
"⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";
|
||||
|
||||
/**
|
||||
* 英文字母2
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final String LETTERS_TWO =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
|
||||
"abcdefghijklmnopqrstuvwxyz" +
|
||||
"abcdefghijklmnopqrstuvwxyz";
|
||||
|
||||
|
||||
/**
|
||||
* 英文字母 map
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final Map<Character, Character> LETTER_MAP = Guavas.newHashMap(LETTERS_ONE.length());
|
||||
|
||||
static {
|
||||
final int size = LETTERS_ONE.length();
|
||||
|
||||
for(int i = 0; i < size; i++) {
|
||||
LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 映射后的 char
|
||||
* @param character 待转换的 char
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
public static Character getMappingChar(final Character character) {
|
||||
final Character mapChar = LETTER_MAP.get(character);
|
||||
if(ObjectUtil.isNotNull(mapChar)) {
|
||||
return mapChar;
|
||||
}
|
||||
|
||||
return character;
|
||||
}
|
||||
|
||||
/**
|
||||
* 构建字符串
|
||||
* @param chars 字符数组
|
||||
* @param startIndex 开始位置
|
||||
* @param endIndex 结束位置
|
||||
* @return 结果
|
||||
* @since 0.5.0
|
||||
*/
|
||||
// @Deprecated
|
||||
// public static String getString(final char[] chars,
|
||||
// final int startIndex,
|
||||
// final int endIndex) {
|
||||
// // 截取
|
||||
// int len = endIndex - startIndex;
|
||||
// return new String(chars, startIndex, len);
|
||||
// }
|
||||
|
||||
/**
|
||||
* 构建字符串
|
||||
* @param chars 字符数组
|
||||
* @param wordResult 结果
|
||||
* @return 结果
|
||||
* @since 0.5.0
|
||||
*/
|
||||
// @Deprecated
|
||||
// public static String getString(final char[] chars,
|
||||
// final IWordResult wordResult) {
|
||||
// return getString(chars, wordResult.startIndex(), wordResult.endIndex());
|
||||
// }
|
||||
|
||||
/**
|
||||
* 构建字符串
|
||||
* @param text 字符串
|
||||
|
||||
@@ -47,17 +47,20 @@ public final class InnerWordFormatUtils {
|
||||
|
||||
/**
|
||||
* 字符串统一的格式化处理
|
||||
*
|
||||
* 注意:这个需要 map 的实现是 {@link it.unimi.dsi.fastutil.chars.Char2CharOpenHashMap}
|
||||
* @param map 映射集合
|
||||
* @param c 原始
|
||||
* @return 结果
|
||||
* @since 0.28.0
|
||||
*/
|
||||
public static char getMappingChar(final Map<Character, Character> map, char c) {
|
||||
Character mc = map.get(c);
|
||||
if(mc != null) {
|
||||
return mc;
|
||||
//Char2CharOpenHashMap 不存在映射也是返回 null
|
||||
Object mc = map.get(c);
|
||||
if(mc == null) {
|
||||
return c;
|
||||
}
|
||||
return c;
|
||||
return (char) mc;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -1,128 +0,0 @@
|
||||
package com.github.houbb.sensitive.word.utils;
|
||||
|
||||
import com.github.houbb.heaven.util.guava.Guavas;
|
||||
import com.github.houbb.heaven.util.lang.ObjectUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* <p> project: sensitive-word-NumUtils </p>
|
||||
* <p> create on 2020/1/8 22:18 </p>
|
||||
*
|
||||
* @author Administrator
|
||||
* @since 0.0.4
|
||||
*/
|
||||
public final class InnerWordNumUtils {
|
||||
|
||||
private InnerWordNumUtils(){}
|
||||
|
||||
private static final String NUM_ONE = "⓪0零º₀⓿○" +
|
||||
"123456789" +
|
||||
"一二三四五六七八九" +
|
||||
"壹贰叁肆伍陆柒捌玖" +
|
||||
"¹²³⁴⁵⁶⁷⁸⁹" +
|
||||
"₁₂₃₄₅₆₇₈₉" +
|
||||
"①②③④⑤⑥⑦⑧⑨" +
|
||||
"⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
|
||||
"⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
|
||||
"❶❷❸❹❺❻❼❽❾" +
|
||||
"➀➁➂➃➄➅➆➇➈" +
|
||||
"➊➋➌➍➎➏➐➑➒" +
|
||||
"㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
|
||||
"⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
|
||||
"㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
|
||||
"ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
|
||||
"ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";
|
||||
|
||||
private static final String NUM_TWO = "0000000"+
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789" +
|
||||
"123456789";
|
||||
|
||||
/**
|
||||
* 英文字母 map
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static final Map<Character, Character> NUMBER_MAP = Guavas.newHashMap(NUM_ONE.length());
|
||||
|
||||
static {
|
||||
final int size = NUM_ONE.length();
|
||||
|
||||
for(int i = 0; i < size; i++) {
|
||||
NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 映射后的 char
|
||||
* @param character 待转换的 char
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
public static Character getMappingChar(final Character character) {
|
||||
final Character mapChar = NUMBER_MAP.get(character);
|
||||
if(ObjectUtil.isNotNull(mapChar)) {
|
||||
return mapChar;
|
||||
}
|
||||
|
||||
return character;
|
||||
}
|
||||
|
||||
public static String getMappingString(final String string) {
|
||||
if(StringUtil.isEmpty(string)) {
|
||||
return string;
|
||||
}
|
||||
|
||||
int length = string.length();
|
||||
StringBuilder stringBuilder = new StringBuilder(length);
|
||||
for(int i = 0; i < length; i++) {
|
||||
char mapChar = getMappingChar(string.charAt(i));
|
||||
|
||||
//TODO: stop word 的处理
|
||||
stringBuilder.append(mapChar);
|
||||
}
|
||||
|
||||
return stringBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* 检查敏感词数量
|
||||
* <p>
|
||||
* (1)如果未命中敏感词,直接返回 0
|
||||
* (2)命中敏感词,则返回敏感词的长度。
|
||||
*
|
||||
* ps: 这里结果进行优化,
|
||||
* 1. 是否包含敏感词。
|
||||
* 2. 敏感词的长度
|
||||
* 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复)
|
||||
*
|
||||
* @param txt 文本信息
|
||||
* @param beginIndex 开始下标
|
||||
* @param wordValidModeEnum 验证模式
|
||||
* @param context 执行上下文
|
||||
* @return 敏感数字对应的长度
|
||||
* @since 0.0.5
|
||||
*/
|
||||
private int getSensitiveNumber(final String txt, final int beginIndex,
|
||||
final WordValidModeEnum wordValidModeEnum,
|
||||
final IWordContext context) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
package com.github.houbb.sensitive.word.benchmark;
|
||||
|
||||
import com.github.houbb.heaven.util.lang.CharUtil;
|
||||
import com.github.houbb.sensitive.word.utils.InnerCharUtils;
|
||||
|
||||
public class CharUtilPerfTest {
|
||||
|
||||
|
||||
private static final int COUNT = 10_00_000;
|
||||
|
||||
public static void main(String[] args) {
|
||||
char[] testData = new char[COUNT];
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
testData[i] = (char) ('A' + (i % 52)); // A-Z a-z
|
||||
}
|
||||
|
||||
// 测试新小写
|
||||
// 测试原始半角
|
||||
char[] fullWidthData = new char[COUNT];
|
||||
for (int i = 0; i < COUNT; i++) {
|
||||
fullWidthData[i] = (char) ('\uFF01' + (i % 94)); // 常见全角字符
|
||||
}
|
||||
|
||||
long t5 = System.currentTimeMillis();
|
||||
char sum3 = 0;
|
||||
for (char c : fullWidthData) {
|
||||
sum3 += CharUtil.toHalfWidth(c);
|
||||
}
|
||||
long t6 = System.currentTimeMillis();
|
||||
System.out.println("原始 toHalfWidth 耗时: " + (t6 - t5) + "ms, sum=" + sum3);
|
||||
|
||||
// 测试新半角
|
||||
long t7 = System.currentTimeMillis();
|
||||
char sum4 = 0;
|
||||
for (char c : fullWidthData) {
|
||||
sum4 += InnerCharUtils.toHalfWidth(c);
|
||||
}
|
||||
long t8 = System.currentTimeMillis();
|
||||
System.out.println("优化 toHalfWidth 耗时: " + (t8 - t7) + "ms, sum=" + sum4);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@@ -1,149 +1,149 @@
|
||||
package com.github.houbb.sensitive.word.data;
|
||||
|
||||
import com.github.houbb.heaven.support.filter.IFilter;
|
||||
import com.github.houbb.heaven.support.handler.IHandler;
|
||||
import com.github.houbb.heaven.util.io.FileUtil;
|
||||
import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
|
||||
import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
|
||||
import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 数据初始化
|
||||
* @author binbin.hou
|
||||
* @since 0.0.3
|
||||
*/
|
||||
@Ignore
|
||||
public class DictSlimTest {
|
||||
|
||||
/**
|
||||
* 统一格式
|
||||
*
|
||||
* 1. 将所有的大写字母统一转换为小写
|
||||
* 2. 将所有的全角转换为半角
|
||||
* 3. 移除所有【空格】【符号】(这个就是各种符号的过滤了)
|
||||
* 4. 繁体字统一转换为简体字
|
||||
* @since 0.0.3
|
||||
*/
|
||||
@Test
|
||||
@Ignore
|
||||
public void formatTest() {
|
||||
final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
|
||||
List<String> words = FileUtil.readAllLines(sourceFile);
|
||||
|
||||
List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
|
||||
@Override
|
||||
public String handle(String string) {
|
||||
String lower = string.toLowerCase();
|
||||
String half = StringUtil.toHalfWidth(lower);
|
||||
String trim = StringUtil.trimAnyBlank(half);
|
||||
String punc = StringUtil.trimAnyPunctionAndSymbol(trim);
|
||||
return ZhConvertBootstrap.newInstance(new CharSegment()).toSimple(punc);
|
||||
}
|
||||
});
|
||||
|
||||
List<String> resultList = DataUtil.disctinctAndSort(formats);
|
||||
FileUtil.write(targetFile, resultList);
|
||||
}
|
||||
|
||||
/**
|
||||
* 移除测试
|
||||
*
|
||||
* 1. 移除 QQ 号的类似数字
|
||||
* 2. 移除所有网址(.com、cn、.org)
|
||||
* 3. 移除纯英文
|
||||
* 4. 移除乱码 `<60>`
|
||||
* 5. 移除英文+数字的
|
||||
*
|
||||
* @since 0.0.3
|
||||
*/
|
||||
@Test
|
||||
@Ignore
|
||||
public void removeTest() {
|
||||
final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
|
||||
List<String> words = FileUtil.readAllLines(sourceFile);
|
||||
|
||||
List<String> formats = CollectionUtil.filterList(words, new IFilter<String>() {
|
||||
@Override
|
||||
public boolean filter(String string) {
|
||||
return StringUtil.isDigitOrLetter(string)
|
||||
|| string.contains("<EFBFBD>")
|
||||
|| string.contains("删掉")
|
||||
|| isUrl(string);
|
||||
}
|
||||
});
|
||||
|
||||
List<String> resultList = DataUtil.disctinctAndSort(formats);
|
||||
FileUtil.write(targetFile, resultList);
|
||||
}
|
||||
|
||||
/**
|
||||
* 数字映射处理
|
||||
* @since 0.0.4
|
||||
*/
|
||||
@Test
|
||||
public void removeNumberMappingTest() {
|
||||
final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
|
||||
List<String> words = FileUtil.readAllLines(sourceFile);
|
||||
List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
|
||||
@Override
|
||||
public String handle(String s) {
|
||||
return s.replaceAll(" ", "");
|
||||
}
|
||||
});
|
||||
List<String> filters = CollectionUtil.filterList(formats, new IFilter<String>() {
|
||||
@Override
|
||||
public boolean filter(String string) {
|
||||
return isNumber(string);
|
||||
}
|
||||
});
|
||||
|
||||
List<String> resultList = DataUtil.disctinctAndSort(filters);
|
||||
FileUtil.write(targetFile, resultList);
|
||||
}
|
||||
|
||||
/**
|
||||
* 是否为存数字
|
||||
* (1)数字小于4的直接跳过。
|
||||
* @param string 原始字符串
|
||||
* @return 结果
|
||||
* @since 0.0.4
|
||||
*/
|
||||
private static boolean isNumber(final String string) {
|
||||
if(string.length() <= 4) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// 停顿词语
|
||||
String trim = string.replaceAll("加|否|与|和", "");
|
||||
String mapString = InnerWordNumUtils.getMappingString(trim);
|
||||
boolean result = StringUtil.isDigit(mapString);
|
||||
if(result) {
|
||||
System.out.println(string);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private static boolean isUrl(final String string) {
|
||||
return string.endsWith(".com")
|
||||
|| string.endsWith(".cn")
|
||||
|| string.endsWith(".org");
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
String trim = "1和2".replaceAll("加|否|与|和", "");
|
||||
System.out.println(trim);
|
||||
}
|
||||
|
||||
}
|
||||
//package com.github.houbb.sensitive.word.data;
|
||||
//
|
||||
//import com.github.houbb.heaven.support.filter.IFilter;
|
||||
//import com.github.houbb.heaven.support.handler.IHandler;
|
||||
//import com.github.houbb.heaven.util.io.FileUtil;
|
||||
//import com.github.houbb.heaven.util.lang.StringUtil;
|
||||
//import com.github.houbb.heaven.util.util.CollectionUtil;
|
||||
//import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
|
||||
//import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
|
||||
//import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
|
||||
//import org.junit.Ignore;
|
||||
//import org.junit.Test;
|
||||
//
|
||||
//import java.util.List;
|
||||
//
|
||||
///**
|
||||
// * 数据初始化
|
||||
// * @author binbin.hou
|
||||
// * @since 0.0.3
|
||||
// */
|
||||
//@Ignore
|
||||
//public class DictSlimTest {
|
||||
//
|
||||
// /**
|
||||
// * 统一格式
|
||||
// *
|
||||
// * 1. 将所有的大写字母统一转换为小写
|
||||
// * 2. 将所有的全角转换为半角
|
||||
// * 3. 移除所有【空格】【符号】(这个就是各种符号的过滤了)
|
||||
// * 4. 繁体字统一转换为简体字
|
||||
// * @since 0.0.3
|
||||
// */
|
||||
// @Test
|
||||
// @Ignore
|
||||
// public void formatTest() {
|
||||
// final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
// final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
//
|
||||
// List<String> words = FileUtil.readAllLines(sourceFile);
|
||||
//
|
||||
// List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
|
||||
// @Override
|
||||
// public String handle(String string) {
|
||||
// String lower = string.toLowerCase();
|
||||
// String half = StringUtil.toHalfWidth(lower);
|
||||
// String trim = StringUtil.trimAnyBlank(half);
|
||||
// String punc = StringUtil.trimAnyPunctionAndSymbol(trim);
|
||||
// return ZhConvertBootstrap.newInstance(new CharSegment()).toSimple(punc);
|
||||
// }
|
||||
// });
|
||||
//
|
||||
// List<String> resultList = DataUtil.disctinctAndSort(formats);
|
||||
// FileUtil.write(targetFile, resultList);
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 移除测试
|
||||
// *
|
||||
// * 1. 移除 QQ 号的类似数字
|
||||
// * 2. 移除所有网址(.com、cn、.org)
|
||||
// * 3. 移除纯英文
|
||||
// * 4. 移除乱码 `<60>`
|
||||
// * 5. 移除英文+数字的
|
||||
// *
|
||||
// * @since 0.0.3
|
||||
// */
|
||||
// @Test
|
||||
// @Ignore
|
||||
// public void removeTest() {
|
||||
// final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
// final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
//
|
||||
// List<String> words = FileUtil.readAllLines(sourceFile);
|
||||
//
|
||||
// List<String> formats = CollectionUtil.filterList(words, new IFilter<String>() {
|
||||
// @Override
|
||||
// public boolean filter(String string) {
|
||||
// return StringUtil.isDigitOrLetter(string)
|
||||
// || string.contains("<22>")
|
||||
// || string.contains("删掉")
|
||||
// || isUrl(string);
|
||||
// }
|
||||
// });
|
||||
//
|
||||
// List<String> resultList = DataUtil.disctinctAndSort(formats);
|
||||
// FileUtil.write(targetFile, resultList);
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 数字映射处理
|
||||
// * @since 0.0.4
|
||||
// */
|
||||
// @Test
|
||||
// public void removeNumberMappingTest() {
|
||||
// final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
// final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
|
||||
//
|
||||
// List<String> words = FileUtil.readAllLines(sourceFile);
|
||||
// List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
|
||||
// @Override
|
||||
// public String handle(String s) {
|
||||
// return s.replaceAll(" ", "");
|
||||
// }
|
||||
// });
|
||||
// List<String> filters = CollectionUtil.filterList(formats, new IFilter<String>() {
|
||||
// @Override
|
||||
// public boolean filter(String string) {
|
||||
// return isNumber(string);
|
||||
// }
|
||||
// });
|
||||
//
|
||||
// List<String> resultList = DataUtil.disctinctAndSort(filters);
|
||||
// FileUtil.write(targetFile, resultList);
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * 是否为存数字
|
||||
// * (1)数字小于4的直接跳过。
|
||||
// * @param string 原始字符串
|
||||
// * @return 结果
|
||||
// * @since 0.0.4
|
||||
// */
|
||||
// private static boolean isNumber(final String string) {
|
||||
// if(string.length() <= 4) {
|
||||
// return false;
|
||||
// }
|
||||
//
|
||||
// // 停顿词语
|
||||
// String trim = string.replaceAll("加|否|与|和", "");
|
||||
//// String mapString = InnerWordNumUtils.getMappingString(trim);
|
||||
//// boolean result = StringUtil.isDigit(mapString);
|
||||
//// if(result) {
|
||||
//// System.out.println(string);
|
||||
//// }
|
||||
//// return result;
|
||||
// }
|
||||
//
|
||||
// private static boolean isUrl(final String string) {
|
||||
// return string.endsWith(".com")
|
||||
// || string.endsWith(".cn")
|
||||
// || string.endsWith(".org");
|
||||
// }
|
||||
//
|
||||
// public static void main(String[] args) {
|
||||
// String trim = "1和2".replaceAll("加|否|与|和", "");
|
||||
// System.out.println(trim);
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
|
||||
import com.github.houbb.sensitive.word.api.IWordContext;
|
||||
import com.github.houbb.sensitive.word.api.IWordFormat;
|
||||
|
||||
public class EnglishStylePerfTest {
|
||||
|
||||
public static void main(String[] args) {
|
||||
final int times = 200000;
|
||||
|
||||
|
||||
// 不涉及
|
||||
IWordContext context = null;
|
||||
|
||||
// 每次随机选择?
|
||||
String demo1 = "产品尺寸参数§60mn§50mm§210枚/包§160枚/包§名称A4银色不干胶§规格60mm*40mm 送配套模板§规格70mm*50mm 送配套模板§数量每大张21枚一包10张总计210枚§数量每大张16枚一包10张总计160枚§适用激光打印机打印油性笔书写§95mm§100mn§55mm§100枚/包§80枚/包§名称 A4银色不干胶§规格95mm*55mm 送配套模板§规格100mm*70mm 送配套模板§数量每大张10枚一包10张总计100枚§数量 每大张8枚一包10张 总计80枚§100mm§120枚/包§140枚/包§规格80mm*50mm 送配套模板§规格100mm*40mm 送配套模板§数量每大张12枚一包10张总计120枚§数量§每大张14枚包10张总计140枚§适用 激光打印机打印油性笔书写§40mm§65mm§70mm§35mm§200枚/包§240枚/包§规格70mm*40mm送配套模板§规格§65mm*35mm 送配套模板§数量 每大张20枚一包10张总计200枚§每大张24枚包10张总计240枚§适 激光打印机打印油性笔书写§适用§激光打印机打印油性笔书写§40mn§280枚/包§360枚/包§规格50mm*40mm 送配套模板§规格40mm*30mm 送配套模板§数量每大张28枚一包10张总计280枚§数量每大张36枚一包10张总计360枚§45.7mm§38.1mm§400枚/包§650枚/包§45.7mm*25.4mm送配套模板§38.1mm*21.2mm 送配套模板§每大张40枚一包10张总计400枚§数量每大张65枚一包10张总计650枚§30mm§25mr§20mm§840枚/包§1260枚/包§规格 30mm*20mm 送配套模板§规格25mm*13mm 送配套模板§数量每张84枚包10张总计840枚§数量每大张126枚一包10张总计1260枚§46mm§意制§任§1000枚/包§定§名称定制A4内割银不胶§规格46mm*11.1mm送配套模板§任意规格定制§每大张100枚包10张总计1000枚§包10张满5包送专属模板§适激光打印机打印油性笔书写§产品实拍§8格打印实拍展示(100mm*70mm)§上海荠骞文化用品固定资产标识卡§资产编号:§规格型号:§资产名称:§使用状态:§资产类别:§资产原值§存放地点§生产厂家:§使用人§备§注:§*请爱护公司财产,不要随意撕毁此标签§16格全内容打印实拍展示§固定资产标识卡§资产名称§四层货架(平板)§资产编号§3F跑菜区§规格型号§1800×500×1500§使用部门§财务部§使用时间§2019-04-26§李强§21格手写款打印展示 (60mm*40mm)§固定资标识卡§36格打印实拍展示(40mm*30mm)§固定资产标签§名称:§编号:§部门:§40格打印实拍展示(45.7mm*25.4mm)§固定资§名称:电脑§编号:20210§部门:财务部§20210201§使用人:我最强§八:找最强§编号:20210201§65格打印实拍展示(38mm*21mm)§名称:§编号:§数量:§数量:§100格打印实拍展示(46mm*11.1mm)§客服电话:159 9569 3815§: 159 9569 3815§.§客服电话:159 9569§客服电话:1599§客服电话§服电话:159 9569 3815§话:159 9569 3815§客服电话:1599569 3815§电话:159 9569 3815§9569 3815§159 9569 3815§客服电话:§低值易耗品标识牌(70mm*50mm)§购买日期§保管部门§责任人§生产厂家§不要随意撕毁此标牌*§*请爱护公司财产,不要随意撕导§品标识牌§低值易耗品标识牌§随意撕毁此标牌*§*请爱护公司财产,不要随意撕毁此标牌*§三人沙发§行政酒廊§2200*860*900§2018-07-23§应用范围§多用于产品信息固有资产登记航空仓库管理 医疗政府机构等§Mainly used for product information inherent assets registration, aviation warehouse management, medi§cal government institutions, etc§政府单位§企业办公§仓储行业§医疗器械§教育单位§耐用品§电子产品包装§商城卖场";
|
||||
// hash
|
||||
cost1(demo1, times, context);
|
||||
cost2(demo1, times, context);
|
||||
}
|
||||
|
||||
private static void cost1(String text, int times, IWordContext context) {
|
||||
IWordFormat hashMap = new WordFormatIgnoreEnglishStyle();
|
||||
|
||||
long s1 = System.currentTimeMillis();
|
||||
for(int i = 0; i < times; i++) {
|
||||
char c = text.charAt(i % text.length());
|
||||
hashMap.format(c, context);
|
||||
}
|
||||
long cost = System.currentTimeMillis() - s1;
|
||||
System.out.println(cost);
|
||||
}
|
||||
|
||||
private static void cost2(String text, int times, IWordContext context) {
|
||||
IWordFormat hashMap = new WordFormatIgnoreEnglishStyleC2C();
|
||||
|
||||
long s1 = System.currentTimeMillis();
|
||||
for(int i = 0; i < times; i++) {
|
||||
char c = text.charAt(i % text.length());
|
||||
hashMap.format(c, context);
|
||||
}
|
||||
long cost = System.currentTimeMillis() - s1;
|
||||
System.out.println(cost);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1 @@
|
||||
package com.github.houbb.sensitive.word.support.format;
|
||||
Reference in New Issue
Block a user