[Feature] add for new

This commit is contained in:
binbin.hou
2025-09-05 17:16:46 +08:00
parent a46f43024d
commit 8378e202bb
20 changed files with 672 additions and 622 deletions

View File

@@ -471,3 +471,9 @@
|:---|:-----|-----------------------------|:------------------|:--------------------|
| 1 | O | 改进 check、format 的 chains 方法 | 2025-9-5 16:22:24 | 优化性能 |
| 2 | O | InnerWordFormatUtils#format | 2025-9-5 16:22:24 | 优化性能+内存 toCharArray |
# release_0.29.2
| 序号 | 变更类型 | 说明 | 时间 | 备注 |
|:---|:-----|---------------------|:------------------|:--------------------|
| 1 | O | 拆箱、装箱优化:优化数字、英文的格式化 | 2025-9-5 16:22:24 | 优化性能 |

28
pom.xml
View File

@@ -6,7 +6,7 @@
<groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId>
<version>0.29.1</version>
<version>0.29.2</version>
<properties>
<!--============================== All Plugins START ==============================-->
@@ -115,7 +115,6 @@
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</dependency>
</dependencies>
<build>
@@ -182,6 +181,20 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>${plugin.maven-javadoc-plugin.version}</version>
<configuration>
<!-- 指定源码编码 -->
<encoding>UTF-8</encoding>
<!-- 指定文档编码 -->
<docencoding>UTF-8</docencoding>
<!-- 输出 HTML 的 charset -->
<charset>UTF-8</charset>
<!-- 强制生成,不因警告/错误中断 -->
<failOnError>false</failOnError>
<!-- 可以跳过 doclint -->
<additionalJOptions>
<additionalJOption>-Xdoclint:none</additionalJOption>
</additionalJOptions>
</configuration>
</plugin>
</plugins>
@@ -252,6 +265,17 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>${plugin.maven-javadoc-plugin.version}</version>
<configuration>
    <!-- Source encoding, HTML charset and output encoding are set through the plugin's own parameters. -->
    <encoding>UTF-8</encoding>
    <charset>UTF-8</charset>
    <docencoding>UTF-8</docencoding>
    <additionalJOptions>
        <!-- Disable doclint checks. -->
        <additionalJOption>-Xdoclint:none</additionalJOption>
        <!--
          The former "-charset UTF-8", "-encoding UTF-8" and "-docencoding UTF-8" entries were removed:
          each entry is handed to javadoc as one argument (option and value fused by a space),
          and the three parameters above already configure the same settings.
        -->
    </additionalJOptions>
</configuration>
<executions>
<execution>
<phase>package</phase>

View File

@@ -0,0 +1,106 @@
package com.github.houbb.sensitive.word.collection;
/**
 * Hash map from {@code char} to {@code char} stored in two parallel primitive arrays,
 * so lookups and insertions involve no boxing or unboxing.
 *
 * <p>Collisions are resolved by linear probing. The key {@code '\0'} marks an empty slot
 * and therefore cannot be stored. Entries cannot be removed. The class performs no
 * synchronization of its own.</p>
 *
 * @since 0.29.2
 */
public final class Char2CharMap {

    /** Sentinel stored in {@link #keys} for a slot that holds no entry. */
    private static final char EMPTY_KEY = '\0';

    /** Fraction of the table that may be occupied before it is doubled. */
    private static final float LOAD_FACTOR = 0.5f;

    private char[] keys;
    private char[] values;
    private int size;
    /** {@code capacity - 1}; capacity is a power of two, so {@code & mask} replaces a modulo. */
    private int mask;
    /** Entry count at which the table is grown. */
    private int maxSize;

    /**
     * Creates a map sized so that {@code expectedSize} entries fit without a resize.
     *
     * @param expectedSize number of entries the caller plans to insert
     */
    public Char2CharMap(int expectedSize) {
        int capacity = tableSizeFor((int) (expectedSize / LOAD_FACTOR) + 1);
        this.keys = new char[capacity];
        this.values = new char[capacity];
        this.mask = capacity - 1;
        this.maxSize = (int) (capacity * LOAD_FACTOR);
        this.size = 0;
    }

    /**
     * Rounds {@code cap} up to a power of two, clamped to the range [2, 2^30].
     *
     * @param cap requested capacity
     * @return power-of-two capacity
     */
    private static int tableSizeFor(int cap) {
        int n = cap - 1;
        // Smear the highest set bit into every lower position.
        n |= n >>> 1;
        n |= n >>> 2;
        n |= n >>> 4;
        n |= n >>> 8;
        n |= n >>> 16;
        return (n < 2) ? 2 : (n >= (1 << 30) ? (1 << 30) : n + 1);
    }

    /**
     * Multiplicative hash reduced to a slot index with the current mask.
     *
     * @param k key to hash
     * @return index of the first slot to probe
     */
    private int hash(char k) {
        return (k * 0x9E3779B9) & mask;
    }

    /**
     * Inserts a mapping, or overwrites the value when the key is already present.
     *
     * @param key   key to store; must not be {@code '\0'}
     * @param value value to associate with the key
     * @throws IllegalArgumentException if {@code key} is the reserved empty-slot marker
     */
    public void put(char key, char value) {
        if (key == EMPTY_KEY) {
            // The backslash is escaped so the message shows the two characters "\0"
            // rather than embedding a raw NUL character.
            throw new IllegalArgumentException("Key '\\0' is reserved as EMPTY_KEY.");
        }
        int idx = hash(key);
        while (true) {
            if (keys[idx] == EMPTY_KEY) {
                keys[idx] = key;
                values[idx] = value;
                if (++size >= maxSize) {
                    resize();
                }
                return;
            } else if (keys[idx] == key) {
                values[idx] = value;
                return;
            }
            // Linear probing: move to the next slot, wrapping at the end of the table.
            idx = (idx + 1) & mask;
        }
    }

    /**
     * Looks up a key.
     *
     * @param key          key to look up
     * @param defaultValue value returned when the key is absent (or is {@code '\0'})
     * @return the mapped value, or {@code defaultValue}
     */
    public char get(char key, char defaultValue) {
        if (key == EMPTY_KEY) {
            return defaultValue;
        }
        int idx = hash(key);
        while (true) {
            char k = keys[idx];
            if (k == EMPTY_KEY) {
                return defaultValue;
            }
            if (k == key) {
                return values[idx];
            }
            idx = (idx + 1) & mask;
        }
    }

    /**
     * Looks up a key, using {@code '\0'} as the "absent" result.
     * A stored value of {@code '\0'} is indistinguishable from a missing key here.
     *
     * @param key key to look up
     * @return the mapped value, or {@code '\0'} when absent
     */
    public char get(char key) {
        char defaultVal = 0;
        return get(key, defaultVal);
    }

    /** Doubles the table and re-inserts every existing entry. */
    private void resize() {
        int newCap = keys.length << 1;
        char[] oldKeys = keys;
        char[] oldVals = values;
        keys = new char[newCap];
        values = new char[newCap];
        mask = newCap - 1;
        maxSize = (int) (newCap * LOAD_FACTOR);
        // put() recounts the entries as it re-inserts them.
        size = 0;
        for (int i = 0; i < oldKeys.length; i++) {
            char k = oldKeys[i];
            if (k != EMPTY_KEY) {
                put(k, oldVals[i]);
            }
        }
    }

    /**
     * @return number of entries currently stored
     */
    public int size() {
        return size;
    }
}

View File

@@ -1,203 +0,0 @@
package com.github.houbb.sensitive.word.support.data;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
import com.github.houbb.sensitive.word.constant.WordConst;
import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
/**
* 敏感词 map
*
* 不再维护,降低维护成本
*
* @author binbin.hou
* @since 0.0.1
*/
@Deprecated
public class WordDataHashMap extends AbstractWordData {
/**
* 脱敏单词 map
*
* @since 0.0.1
*/
private Map innerWordMap;
/**
* 读取敏感词库将敏感词放入HashSet中构建一个DFA算法模型
*
* @param collection 敏感词库集合
* @since 0.0.1
* <p>
* 使用对象代码 map 的这种一直递归。
* 参考资料https://www.cnblogs.com/AlanLee/p/5329555.html
* https://blog.csdn.net/chenssy/article/details/26961957
*/
@Override
@SuppressWarnings("unchecked")
public synchronized void doInitWordData(Collection<String> collection) {
// 避免扩容带来的消耗
Map newInnerWordMap = new HashMap(collection.size());
for (String key : collection) {
if (StringUtil.isEmpty(key)) {
continue;
}
// 用来按照相应的格式保存敏感词库数据
final int size = key.length();
// 每一个新词的循环,直接将结果设置为当前 map所有变化都会体现在结果的 map 中
Map currentMap = newInnerWordMap;
for (int i = 0; i < size; i++) {
// 截取敏感词当中的字在敏感词库中字为HashMap对象的Key键值
char charKey = key.charAt(i);
// 如果集合存在
Object wordMap = currentMap.get(charKey);
// 如果集合存在
if (ObjectUtil.isNotNull(wordMap)) {
// 直接将获取到的 map 当前当前 map 进行继续的操作
currentMap = (Map) wordMap;
} else {
//不存在则则构建一个新的map同时将isEnd设置为0因为他不是最后一
Map<String, Boolean> newWordMap = new HashMap<>(8);
newWordMap.put(WordConst.IS_END, false);
// 将新的节点放入当前 map 中
currentMap.put(charKey, newWordMap);
// 将新节点设置为当前节点,方便下一次节点的循环。
currentMap = newWordMap;
}
}
// 判断是否为最后一个,添加是否结束的标识。
currentMap.put(WordConst.IS_END, true);
}
// 最后更新为新的 map保证更新过程中旧的数据可用
this.innerWordMap = newInnerWordMap;
}
@Override
protected void doRemoveWord(Collection<String> collection) {
}
@Override
protected void doAddWord(Collection<String> collection) {
}
/**
* 是否包含
* 1直接遍历所有
* 2如果遇到则直接返回 true
*
* @param stringBuilder 字符串
* @param innerContext 内部上下文
* @return 是否包含
* @since 0.0.1
*/
@Override
public WordContainsTypeEnum doContains(final StringBuilder stringBuilder,
final InnerSensitiveWordContext innerContext) {
return innerContainsSensitive(stringBuilder, innerContext);
}
private WordContainsTypeEnum innerContainsSensitive(StringBuilder stringBuilder,
final InnerSensitiveWordContext innerContext) {
// 初始化为当前的 map
Map nowMap = this.innerWordMap;
// 记录敏感词的长度
final int len = stringBuilder.length();
for (int i = 0; i < len; i++) {
// 获取当前的 map 信息
nowMap = getNowMap(nowMap, i, stringBuilder, innerContext);
// 如果不为空,则判断是否为结尾。
if (ObjectUtil.isNull(nowMap)) {
return WordContainsTypeEnum.NOT_FOUND;
}
}
// 是否为结尾,便于快速失败
boolean isEnd = isEnd(nowMap);
if(isEnd) {
return WordContainsTypeEnum.CONTAINS_END;
}
return WordContainsTypeEnum.CONTAINS_PREFIX;
}
/**
* 判断是否结束
* BUG-FIX: 避免出现敏感词库中没有的文字。
* @param map map 信息
* @return 是否结束
* @since 0.0.9
*/
private static boolean isEnd(final Map map) {
if(ObjectUtil.isNull(map)) {
return false;
}
Object value = map.get(WordConst.IS_END);
if(ObjectUtil.isNull(value)) {
return false;
}
return (boolean)value;
}
/**
* 获取当前的 Map
* @param nowMap 原始的当前 map
* @param index 下标
* @param stringBuilder 文本缓存
* @param sensitiveContext 上下文
* @return 实际的当前 map
* @since 0.0.7
*/
private Map getNowMap(Map nowMap,
final int index,
final StringBuilder stringBuilder,
final InnerSensitiveWordContext sensitiveContext) {
final IWordContext context = sensitiveContext.wordContext();
// 这里的 char 已经是统一格式化之后的,所以可以不用再次格式化。
char mappingChar = stringBuilder.charAt(index);
// 这里做一次重复词的处理
//TODO: 这里可以优化,是否获取一次。
Map currentMap = (Map) nowMap.get(mappingChar);
// 启用忽略重复&当前下标不是第一个
if(context.ignoreRepeat()
&& index > 0) {
char preMappingChar = stringBuilder.charAt(index-1);
// 直接赋值为上一个 map
if(preMappingChar == mappingChar) {
currentMap = nowMap;
}
}
return currentMap;
}
@Override
public synchronized void destroy() {
if(innerWordMap != null) {
innerWordMap.clear();
}
}
}

View File

@@ -1,15 +1,18 @@
package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordFormat;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;
import com.github.houbb.sensitive.word.api.IWordFormat;
import java.util.HashMap;
import java.util.Map;
/**
* 忽略英文的各种格式
* @author binbin.hou
* @since 0.0.6
*/
@Deprecated
@ThreadSafe
public class WordFormatIgnoreEnglishStyle implements IWordFormat {
@@ -19,9 +22,52 @@ public class WordFormatIgnoreEnglishStyle implements IWordFormat {
return INSTANCE;
}
/**
* 英文字母1
* @since 0.0.4
*/
private static final String LETTERS_ONE =
"ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
"ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
"⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";
/**
* 英文字母2
* @since 0.0.4
*/
private static final String LETTERS_TWO =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
"abcdefghijklmnopqrstuvwxyz" +
"abcdefghijklmnopqrstuvwxyz";
/**
* 字母映射表
*/
private static final Map<Character,Character> LETTER_MAP = new HashMap<>(LETTERS_ONE.length());
static {
final int size = LETTERS_ONE.length();
for(int i = 0; i < size; i++) {
LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i));
}
}
/**
 * Resolves a character through the letter mapping table.
 *
 * @param c character to convert
 * @return the mapped character, or {@code c} itself when the table has no entry for it
 * @since 0.29.x
 */
private char getMappingChar(final char c) {
    final Character mapped = LETTER_MAP.get(c);
    if (mapped != null) {
        return mapped;
    }
    return c;
}
@Override
public char format(char original, IWordContext context) {
return InnerWordCharUtils.getMappingChar(original);
return getMappingChar(original);
}
}

View File

@@ -0,0 +1,69 @@
package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
import com.github.houbb.sensitive.word.collection.Char2CharMap;
/**
 * Word format that replaces stylised English letters with their plain counterparts,
 * using a primitive {@link Char2CharMap} as the lookup table.
 *
 * @author binbin.hou
 * @since 0.0.6
 */
@ThreadSafe
public class WordFormatIgnoreEnglishStyleC2C implements IWordFormat {

    /**
     * Stylised letters: circled capitals, circled small letters, parenthesised small letters.
     * @since 0.0.4
     */
    private static final String LETTERS_ONE =
            "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
            "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
            "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";

    /**
     * Replacement letters, index-aligned with {@link #LETTERS_ONE}.
     * @since 0.0.4
     */
    private static final String LETTERS_TWO =
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
            "abcdefghijklmnopqrstuvwxyz" +
            "abcdefghijklmnopqrstuvwxyz";

    /**
     * Letter lookup table, filled once during class initialisation.
     */
    private static final Char2CharMap LETTER_MAP = buildLetterMap();

    private static final IWordFormat INSTANCE = new WordFormatIgnoreEnglishStyleC2C();

    public static IWordFormat getInstance() {
        return INSTANCE;
    }

    /**
     * Pairs each character of {@link #LETTERS_ONE} with the character at the same
     * index of {@link #LETTERS_TWO}.
     *
     * @return the populated lookup table
     */
    private static Char2CharMap buildLetterMap() {
        final int total = LETTERS_ONE.length();
        final Char2CharMap table = new Char2CharMap(total);
        int idx = 0;
        while (idx < total) {
            table.put(LETTERS_ONE.charAt(idx), LETTERS_TWO.charAt(idx));
            idx++;
        }
        return table;
    }

    /**
     * Resolves a character through the lookup table.
     *
     * @param c character to convert
     * @return the mapped character, or {@code c} when the lookup yields 0
     * @since 0.29.x
     */
    private char getMappingChar(final char c) {
        final char mapped = LETTER_MAP.get(c);
        if (mapped == 0) {
            return c;
        }
        return mapped;
    }

    @Override
    public char format(char original, IWordContext context) {
        return getMappingChar(original);
    }

}

View File

@@ -3,13 +3,16 @@ package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
import java.util.HashMap;
import java.util.Map;
/**
* 忽略数字的样式
* @author binbin.hou
* @since 0.0.5
*/
@Deprecated
@ThreadSafe
public class WordFormatIgnoreNumStyle implements IWordFormat {
@@ -19,9 +22,65 @@ public class WordFormatIgnoreNumStyle implements IWordFormat {
return INSTANCE;
}
private static final String NUM_ONE = "⓪0零º₀⓿○" +
"" +
"一二三四五六七八九" +
"壹贰叁肆伍陆柒捌玖" +
"¹²³⁴⁵⁶⁷⁸⁹" +
"₁₂₃₄₅₆₇₈₉" +
"①②③④⑤⑥⑦⑧⑨" +
"⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
"⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
"❶❷❸❹❺❻❼❽❾" +
"➀➁➂➃➄➅➆➇➈" +
"➊➋➌➍➎➏➐➑➒" +
"㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
"⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
"㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
"ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
"ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";
private static final String NUM_TWO = "0000000"+
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789";
private static final Map<Character,Character> NUMBER_MAP = new HashMap<>(NUM_ONE.length());
static {
final int size = NUM_ONE.length();
for(int i = 0; i < size; i++) {
NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i));
}
}
/**
 * Resolves a character through the number mapping table.
 *
 * @param c character to convert
 * @return the mapped character, or {@code c} itself when the table has no entry for it
 * @since 0.0.4
 */
private char getMappingChar(final char c) {
    final Character mapped = NUMBER_MAP.get(c);
    if (mapped != null) {
        return mapped;
    }
    return c;
}
@Override
public char format(char original, IWordContext context) {
return InnerWordNumUtils.getMappingChar(original);
return getMappingChar(original);
}
}

View File

@@ -0,0 +1,86 @@
package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
import com.github.houbb.sensitive.word.collection.Char2CharMap;
import java.util.HashMap;
import java.util.Map;
/**
 * Word format that replaces stylised number characters with plain ASCII digits,
 * using a primitive {@link Char2CharMap} as the lookup table.
 *
 * @author binbin.hou
 * @since 0.0.5
 */
@ThreadSafe
public class WordFormatIgnoreNumStyleC2C implements IWordFormat {

    /** Source characters: variants of zero first, then rows of nine characters for 1 through 9. */
    private static final String NUM_ONE = "⓪0零º₀⓿○" +
            "" +
            "一二三四五六七八九" +
            "壹贰叁肆伍陆柒捌玖" +
            "¹²³⁴⁵⁶⁷⁸⁹" +
            "₁₂₃₄₅₆₇₈₉" +
            "①②③④⑤⑥⑦⑧⑨" +
            "⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
            "⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
            "❶❷❸❹❺❻❼❽❾" +
            "➀➁➂➃➄➅➆➇➈" +
            "➊➋➌➍➎➏➐➑➒" +
            "㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
            "⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
            "㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
            "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
            "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";

    /**
     * Replacement digits, read by index in step with {@link #NUM_ONE};
     * only the first {@code NUM_ONE.length()} characters are consulted.
     */
    private static final String NUM_TWO = "0000000"+
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789" +
            "123456789";

    /** Number lookup table, filled once during class initialisation. */
    private static final Char2CharMap NUMBER_MAP = buildNumberMap();

    private static final IWordFormat INSTANCE = new WordFormatIgnoreNumStyleC2C();

    public static IWordFormat getInstance() {
        return INSTANCE;
    }

    /**
     * Pairs each character of {@link #NUM_ONE} with the character at the same
     * index of {@link #NUM_TWO}.
     *
     * @return the populated lookup table
     */
    private static Char2CharMap buildNumberMap() {
        final int total = NUM_ONE.length();
        final Char2CharMap table = new Char2CharMap(total);
        int idx = 0;
        while (idx < total) {
            table.put(NUM_ONE.charAt(idx), NUM_TWO.charAt(idx));
            idx++;
        }
        return table;
    }

    /**
     * Resolves a character through the lookup table.
     *
     * @param c character to convert
     * @return the mapped character, or {@code c} when the lookup yields 0
     * @since 0.0.4
     */
    private char getMappingChar(final char c) {
        final char mapped = NUMBER_MAP.get(c);
        if (mapped == 0) {
            return c;
        }
        return mapped;
    }

    @Override
    public char format(char original, IWordContext context) {
        return getMappingChar(original);
    }

}

View File

@@ -1,9 +1,9 @@
package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.util.lang.CharUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
import com.github.houbb.sensitive.word.utils.InnerCharUtils;
/**
* 格式化字宽度
@@ -21,7 +21,7 @@ public class WordFormatIgnoreWidth implements IWordFormat {
@Override
public char format(char original, IWordContext context) {
return CharUtil.toHalfWidth(original);
return InnerCharUtils.toHalfWidth(original);
}
}

View File

@@ -1,44 +0,0 @@
package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.heaven.support.pipeline.Pipeline;
import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline;
import com.github.houbb.sensitive.word.api.IWordFormat;
import com.github.houbb.sensitive.word.api.IWordContext;
import java.util.List;
/**
* 格式化责任链
* @author binbin.hou
* @since 0.0.5
*/
@ThreadSafe
@Deprecated
public abstract class WordFormatInit implements IWordFormat {
/**
* 初始化列表
*
* @param pipeline 当前列表泳道
* @since 0.0.13
*/
protected abstract void init(final Pipeline<IWordFormat> pipeline);
@Override
public char format(char original, IWordContext context) {
Pipeline<IWordFormat> pipeline = new DefaultPipeline<>();
init(pipeline);
char result = original;
// 循环执行
List<IWordFormat> charFormats = pipeline.list();
for(IWordFormat charFormat : charFormats) {
result = charFormat.format(result, context);
}
return result;
}
}

View File

@@ -51,7 +51,7 @@ public final class WordFormats {
}
public static IWordFormat ignoreEnglishStyle() {
return WordFormatIgnoreEnglishStyle.getInstance();
return WordFormatIgnoreEnglishStyleC2C.getInstance();
}
public static IWordFormat ignoreChineseStyle() {
@@ -59,7 +59,7 @@ public final class WordFormats {
}
public static IWordFormat ignoreNumStyle() {
return WordFormatIgnoreNumStyle.getInstance();
return WordFormatIgnoreNumStyleC2C.getInstance();
}
public static IWordFormat ignoreWidth() {

View File

@@ -2,7 +2,6 @@ package com.github.houbb.sensitive.word.support.format.mapping;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
import com.github.houbb.sensitive.word.support.check.WordCheckNone;
import com.github.houbb.sensitive.word.support.format.WordFormatNone;
import java.util.Collections;
@@ -26,6 +25,7 @@ public class WordFormatTextDefault extends AbstractWordFormatText {
return Collections.emptyMap();
}
//v0.29.2
Map<Character, Character> map = new HashMap<>();
for(int i = 0; i < text.length(); i++) {
char c = text.charAt(i);

View File

@@ -5,6 +5,24 @@ package com.github.houbb.sensitive.word.utils;
*/
public class InnerCharUtils {
/**
 * Converts a full-width character to its half-width form.
 *
 * <p>Characters in the range U+FF01 to U+FF5E are shifted down by 0xFEE0, the
 * ideographic space U+3000 becomes an ASCII space, and every other character
 * is returned unchanged.</p>
 *
 * @param original character to convert
 * @return the half-width character
 * @since 0.29.2
 */
public static char toHalfWidth(char original) {
    final boolean inFullWidthRange = original >= '\uFF01' && original <= '\uFF5E';
    if (inFullWidthRange) {
        return (char) (original - 0xFEE0);
    }
    return original == '\u3000' ? ' ' : original;
}
/**
* 转换为整数
* @param text 文本

View File

@@ -1,11 +1,7 @@
package com.github.houbb.sensitive.word.utils;
import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.sensitive.word.api.IWordResult;
import java.util.Map;
/**
* <p> project: sensitive-word-NumUtils </p>
* <p> create on 2020/1/8 22:18 </p>
@@ -18,84 +14,6 @@ public final class InnerWordCharUtils {
private InnerWordCharUtils() {
}
/**
* 英文字母1
* @since 0.0.4
*/
private static final String LETTERS_ONE =
"ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
"ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
"⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";
/**
* 英文字母2
* @since 0.0.4
*/
private static final String LETTERS_TWO =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
"abcdefghijklmnopqrstuvwxyz" +
"abcdefghijklmnopqrstuvwxyz";
/**
* 英文字母 map
* @since 0.0.4
*/
private static final Map<Character, Character> LETTER_MAP = Guavas.newHashMap(LETTERS_ONE.length());
static {
final int size = LETTERS_ONE.length();
for(int i = 0; i < size; i++) {
LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i));
}
}
/**
* 映射后的 char
* @param character 待转换的 char
* @return 结果
* @since 0.0.4
*/
public static Character getMappingChar(final Character character) {
final Character mapChar = LETTER_MAP.get(character);
if(ObjectUtil.isNotNull(mapChar)) {
return mapChar;
}
return character;
}
/**
* 构建字符串
* @param chars 字符数组
* @param startIndex 开始位置
* @param endIndex 结束位置
* @return 结果
* @since 0.5.0
*/
// @Deprecated
// public static String getString(final char[] chars,
// final int startIndex,
// final int endIndex) {
// // 截取
// int len = endIndex - startIndex;
// return new String(chars, startIndex, len);
// }
/**
* 构建字符串
* @param chars 字符数组
* @param wordResult 结果
* @return 结果
* @since 0.5.0
*/
// @Deprecated
// public static String getString(final char[] chars,
// final IWordResult wordResult) {
// return getString(chars, wordResult.startIndex(), wordResult.endIndex());
// }
/**
* 构建字符串
* @param text 字符串

View File

@@ -47,17 +47,20 @@ public final class InnerWordFormatUtils {
/**
* 字符串统一的格式化处理
*
* 注意:这个需要 map 的实现是 {@link it.unimi.dsi.fastutil.chars.Char2CharOpenHashMap}
* @param map 映射集合
* @param c 原始
* @return 结果
* @since 0.28.0
*/
public static char getMappingChar(final Map<Character, Character> map, char c) {
Character mc = map.get(c);
if(mc != null) {
return mc;
//Char2CharOpenHashMap 不存在映射也是返回 null
Object mc = map.get(c);
if(mc == null) {
return c;
}
return c;
return (char) mc;
}
/**

View File

@@ -1,128 +0,0 @@
package com.github.houbb.sensitive.word.utils;
import com.github.houbb.heaven.util.guava.Guavas;
import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum;
import java.util.Map;
/**
* <p> project: sensitive-word-NumUtils </p>
* <p> create on 2020/1/8 22:18 </p>
*
* @author Administrator
* @since 0.0.4
*/
public final class InnerWordNumUtils {
private InnerWordNumUtils(){}
private static final String NUM_ONE = "⓪0零º₀⓿○" +
"" +
"一二三四五六七八九" +
"壹贰叁肆伍陆柒捌玖" +
"¹²³⁴⁵⁶⁷⁸⁹" +
"₁₂₃₄₅₆₇₈₉" +
"①②③④⑤⑥⑦⑧⑨" +
"⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
"⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
"❶❷❸❹❺❻❼❽❾" +
"➀➁➂➃➄➅➆➇➈" +
"➊➋➌➍➎➏➐➑➒" +
"㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
"⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
"㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
"ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
"ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";
private static final String NUM_TWO = "0000000"+
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789" +
"123456789";
/**
* 英文字母 map
* @since 0.0.4
*/
private static final Map<Character, Character> NUMBER_MAP = Guavas.newHashMap(NUM_ONE.length());
static {
final int size = NUM_ONE.length();
for(int i = 0; i < size; i++) {
NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i));
}
}
/**
* 映射后的 char
* @param character 待转换的 char
* @return 结果
* @since 0.0.4
*/
public static Character getMappingChar(final Character character) {
final Character mapChar = NUMBER_MAP.get(character);
if(ObjectUtil.isNotNull(mapChar)) {
return mapChar;
}
return character;
}
public static String getMappingString(final String string) {
if(StringUtil.isEmpty(string)) {
return string;
}
int length = string.length();
StringBuilder stringBuilder = new StringBuilder(length);
for(int i = 0; i < length; i++) {
char mapChar = getMappingChar(string.charAt(i));
//TODO: stop word 的处理
stringBuilder.append(mapChar);
}
return stringBuilder.toString();
}
/**
* 检查敏感词数量
* <p>
* 1如果未命中敏感词直接返回 0
* 2命中敏感词则返回敏感词的长度。
*
* ps: 这里结果进行优化,
* 1. 是否包含敏感词。
* 2. 敏感词的长度
* 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复)
*
* @param txt 文本信息
* @param beginIndex 开始下标
* @param wordValidModeEnum 验证模式
* @param context 执行上下文
* @return 敏感数字对应的长度
* @since 0.0.5
*/
private int getSensitiveNumber(final String txt, final int beginIndex,
final WordValidModeEnum wordValidModeEnum,
final IWordContext context) {
return 0;
}
}

View File

@@ -0,0 +1,43 @@
package com.github.houbb.sensitive.word.benchmark;
import com.github.houbb.heaven.util.lang.CharUtil;
import com.github.houbb.sensitive.word.utils.InnerCharUtils;
/**
 * Ad-hoc timing comparison between {@code CharUtil.toHalfWidth} and
 * {@code InnerCharUtils.toHalfWidth} over the same input array.
 *
 * <p>Each implementation is run once over the data and timed with
 * {@link System#currentTimeMillis()}; there is no warm-up phase, so the
 * figures are only a rough indication.</p>
 */
public class CharUtilPerfTest {

    /** Number of characters fed to each implementation. */
    private static final int COUNT = 1_000_000;

    public static void main(String[] args) {
        // Input: cycles through 94 consecutive code points starting at U+FF01.
        char[] fullWidthData = new char[COUNT];
        for (int i = 0; i < COUNT; i++) {
            fullWidthData[i] = (char) ('\uFF01' + (i % 94));
        }

        // Original implementation. The results are accumulated and printed so the
        // calls have an observable effect.
        long t5 = System.currentTimeMillis();
        char sum3 = 0;
        for (char c : fullWidthData) {
            sum3 += CharUtil.toHalfWidth(c);
        }
        long t6 = System.currentTimeMillis();
        System.out.println("原始 toHalfWidth 耗时: " + (t6 - t5) + "ms, sum=" + sum3);

        // New implementation, same input.
        long t7 = System.currentTimeMillis();
        char sum4 = 0;
        for (char c : fullWidthData) {
            sum4 += InnerCharUtils.toHalfWidth(c);
        }
        long t8 = System.currentTimeMillis();
        System.out.println("优化 toHalfWidth 耗时: " + (t8 - t7) + "ms, sum=" + sum4);
    }

}

View File

@@ -1,149 +1,149 @@
package com.github.houbb.sensitive.word.data;
import com.github.houbb.heaven.support.filter.IFilter;
import com.github.houbb.heaven.support.handler.IHandler;
import com.github.houbb.heaven.util.io.FileUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
import org.junit.Ignore;
import org.junit.Test;
import java.util.List;
/**
* 数据初始化
* @author binbin.hou
* @since 0.0.3
*/
@Ignore
public class DictSlimTest {
/**
* 统一格式
*
* 1. 将所有的大写字母统一转换为小写
* 2. 将所有的全角转换为半角
* 3. 移除所有【空格】【符号】(这个就是各种符号的过滤了)
* 4. 繁体字统一转换为简体字
* @since 0.0.3
*/
@Test
@Ignore
public void formatTest() {
final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
List<String> words = FileUtil.readAllLines(sourceFile);
List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
@Override
public String handle(String string) {
String lower = string.toLowerCase();
String half = StringUtil.toHalfWidth(lower);
String trim = StringUtil.trimAnyBlank(half);
String punc = StringUtil.trimAnyPunctionAndSymbol(trim);
return ZhConvertBootstrap.newInstance(new CharSegment()).toSimple(punc);
}
});
List<String> resultList = DataUtil.disctinctAndSort(formats);
FileUtil.write(targetFile, resultList);
}
/**
* 移除测试
*
* 1. 移除 QQ 号的类似数字
* 2. 移除所有网址(.com、cn、.org
* 3. 移除纯英文
* 4. 移除乱码 `<60>`
* 5. 移除英文+数字的
*
* @since 0.0.3
*/
@Test
@Ignore
public void removeTest() {
final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
List<String> words = FileUtil.readAllLines(sourceFile);
List<String> formats = CollectionUtil.filterList(words, new IFilter<String>() {
@Override
public boolean filter(String string) {
return StringUtil.isDigitOrLetter(string)
|| string.contains("<EFBFBD>")
|| string.contains("删掉")
|| isUrl(string);
}
});
List<String> resultList = DataUtil.disctinctAndSort(formats);
FileUtil.write(targetFile, resultList);
}
/**
* 数字映射处理
* @since 0.0.4
*/
@Test
public void removeNumberMappingTest() {
final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
List<String> words = FileUtil.readAllLines(sourceFile);
List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
@Override
public String handle(String s) {
return s.replaceAll(" ", "");
}
});
List<String> filters = CollectionUtil.filterList(formats, new IFilter<String>() {
@Override
public boolean filter(String string) {
return isNumber(string);
}
});
List<String> resultList = DataUtil.disctinctAndSort(filters);
FileUtil.write(targetFile, resultList);
}
/**
* 是否为存数字
* 1数字小于4的直接跳过。
* @param string 原始字符串
* @return 结果
* @since 0.0.4
*/
private static boolean isNumber(final String string) {
if(string.length() <= 4) {
return false;
}
// 停顿词语
String trim = string.replaceAll("加|否|与|和", "");
String mapString = InnerWordNumUtils.getMappingString(trim);
boolean result = StringUtil.isDigit(mapString);
if(result) {
System.out.println(string);
}
return result;
}
private static boolean isUrl(final String string) {
return string.endsWith(".com")
|| string.endsWith(".cn")
|| string.endsWith(".org");
}
public static void main(String[] args) {
String trim = "1和2".replaceAll("加|否|与|和", "");
System.out.println(trim);
}
}
//package com.github.houbb.sensitive.word.data;
//
//import com.github.houbb.heaven.support.filter.IFilter;
//import com.github.houbb.heaven.support.handler.IHandler;
//import com.github.houbb.heaven.util.io.FileUtil;
//import com.github.houbb.heaven.util.lang.StringUtil;
//import com.github.houbb.heaven.util.util.CollectionUtil;
//import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
//import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
//import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
//import org.junit.Ignore;
//import org.junit.Test;
//
//import java.util.List;
//
///**
// * 数据初始化
// * @author binbin.hou
// * @since 0.0.3
// */
//@Ignore
//public class DictSlimTest {
//
// /**
// * 统一格式
// *
// * 1. 将所有的大写字母统一转换为小写
// * 2. 将所有的全角转换为半角
// * 3. 移除所有【空格】【符号】(这个就是各种符号的过滤了)
// * 4. 繁体字统一转换为简体字
// * @since 0.0.3
// */
// @Test
// @Ignore
// public void formatTest() {
// final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
// final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
//
// List<String> words = FileUtil.readAllLines(sourceFile);
//
// List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
// @Override
// public String handle(String string) {
// String lower = string.toLowerCase();
// String half = StringUtil.toHalfWidth(lower);
// String trim = StringUtil.trimAnyBlank(half);
// String punc = StringUtil.trimAnyPunctionAndSymbol(trim);
// return ZhConvertBootstrap.newInstance(new CharSegment()).toSimple(punc);
// }
// });
//
// List<String> resultList = DataUtil.disctinctAndSort(formats);
// FileUtil.write(targetFile, resultList);
// }
//
// /**
// * 移除测试
// *
// * 1. 移除 QQ 号的类似数字
// * 2. 移除所有网址(.com、cn、.org
// * 3. 移除纯英文
// * 4. 移除乱码 `<60>`
// * 5. 移除英文+数字的
// *
// * @since 0.0.3
// */
// @Test
// @Ignore
// public void removeTest() {
// final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
// final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
//
// List<String> words = FileUtil.readAllLines(sourceFile);
//
// List<String> formats = CollectionUtil.filterList(words, new IFilter<String>() {
// @Override
// public boolean filter(String string) {
// return StringUtil.isDigitOrLetter(string)
// || string.contains("<22>")
// || string.contains("删掉")
// || isUrl(string);
// }
// });
//
// List<String> resultList = DataUtil.disctinctAndSort(formats);
// FileUtil.write(targetFile, resultList);
// }
//
// /**
// * 数字映射处理
// * @since 0.0.4
// */
// @Test
// public void removeNumberMappingTest() {
// final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
// final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
//
// List<String> words = FileUtil.readAllLines(sourceFile);
// List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
// @Override
// public String handle(String s) {
// return s.replaceAll(" ", "");
// }
// });
// List<String> filters = CollectionUtil.filterList(formats, new IFilter<String>() {
// @Override
// public boolean filter(String string) {
// return isNumber(string);
// }
// });
//
// List<String> resultList = DataUtil.disctinctAndSort(filters);
// FileUtil.write(targetFile, resultList);
// }
//
// /**
// * 是否为存数字
// * 1数字小于4的直接跳过。
// * @param string 原始字符串
// * @return 结果
// * @since 0.0.4
// */
// private static boolean isNumber(final String string) {
// if(string.length() <= 4) {
// return false;
// }
//
// // 停顿词语
// String trim = string.replaceAll("加|否|与|和", "");
//// String mapString = InnerWordNumUtils.getMappingString(trim);
//// boolean result = StringUtil.isDigit(mapString);
//// if(result) {
//// System.out.println(string);
//// }
//// return result;
// }
//
// private static boolean isUrl(final String string) {
// return string.endsWith(".com")
// || string.endsWith(".cn")
// || string.endsWith(".org");
// }
//
// public static void main(String[] args) {
// String trim = "1和2".replaceAll("加|否|与|和", "");
// System.out.println(trim);
// }
//
//}

View File

@@ -0,0 +1,46 @@
package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
public class EnglishStylePerfTest {
public static void main(String[] args) {
final int times = 200000;
// 不涉及
IWordContext context = null;
// 每次随机选择?
String demo1 = "产品尺寸参数§60mn§50mm§210枚/包§160枚/包§名称A4银色不干胶§规格60mm*40mm 送配套模板§规格70mm*50mm 送配套模板§数量每大张21枚一包10张总计210枚§数量每大张16枚一包10张总计160枚§适用激光打印机打印油性笔书写§95mm§100mn§55mm§100枚/包§80枚/包§名称 A4银色不干胶§规格95mm*55mm 送配套模板§规格100mm*70mm 送配套模板§数量每大张10枚一包10张总计100枚§数量 每大张8枚一包10张 总计80枚§100mm§120枚/包§140枚/包§规格80mm*50mm 送配套模板§规格100mm*40mm 送配套模板§数量每大张12枚一包10张总计120枚§数量§每大张14枚包10张总计140枚§适用 激光打印机打印油性笔书写§40mm§65mm§70mm§35mm§200枚/包§240枚/包§规格70mm*40mm送配套模板§规格§65mm*35mm 送配套模板§数量 每大张20枚一包10张总计200枚§每大张24枚包10张总计240枚§适 激光打印机打印油性笔书写§适用§激光打印机打印油性笔书写§40mn§280枚/包§360枚/包§规格50mm*40mm 送配套模板§规格40mm*30mm 送配套模板§数量每大张28枚一包10张总计280枚§数量每大张36枚一包10张总计360枚§45.7mm§38.1mm§400枚/包§650枚/包§45.7mm*25.4mm送配套模板§38.1mm*21.2mm 送配套模板§每大张40枚一包10张总计400枚§数量每大张65枚一包10张总计650枚§30mm§25mr§20mm§840枚/包§1260枚/包§规格 30mm*20mm 送配套模板§规格25mm*13mm 送配套模板§数量每张84枚包10张总计840枚§数量每大张126枚一包10张总计1260枚§46mm§意制§任§1000枚/包§定§名称定制A4内割银不胶§规格46mm*11.1mm送配套模板§任意规格定制§每大张100枚包10张总计1000枚§包10张满5包送专属模板§适激光打印机打印油性笔书写§产品实拍§8格打印实拍展示(100mm*70mm)§上海荠骞文化用品固定资产标识卡§资产编号:§规格型号:§资产名称:§使用状态:§资产类别:§资产原值§存放地点§生产厂家:§使用人§备§注:§*请爱护公司财产不要随意撕毁此标签§16格全内容打印实拍展示§固定资产标识卡§资产名称§四层货架平板§资产编号§3F跑菜区§规格型号§1800×500×1500§使用部门§财务部§使用时间§2019-04-26§李强§21格手写款打印展示 (60mm*40mm)§固定资标识卡§36格打印实拍展示(40mm*30mm)§固定资产标签§名称:§编号:§部门:§40格打印实拍展示(45.7mm*25.4mm)§固定资§名称电脑§编号20210§部门财务部§20210201§使用人我最强§八找最强§编号20210201§65格打印实拍展示(38mm*21mm)§名称:§编号:§数量:§数量:§100格打印实拍展示(46mm*11.1mm)§客服电话159 9569 3815§: 159 9569 3815§.§客服电话159 9569§客服电话1599§客服电话§服电话159 9569 3815§话159 9569 3815§客服电话1599569 3815§电话159 9569 3815§9569 3815§159 9569 3815§客服电话§低值易耗品标识牌(70mm*50mm)§购买日期§保管部门§责任人§生产厂家§不要随意撕毁此标牌*§*请爱护公司财产,不要随意撕导§品标识牌§低值易耗品标识牌§随意撕毁此标牌*§*请爱护公司财产,不要随意撕毁此标牌*§三人沙发§行政酒廊§2200*860*900§2018-07-23§应用范围§多用于产品信息固有资产登记航空仓库管理 医疗政府机构等§Mainly used for product information inherent assets registration, aviation warehouse management, medi§cal government institutions, etc§政府单位§企业办公§仓储行业§医疗器械§教育单位§耐用品§电子产品包装§商城卖场";
// hash
cost1(demo1, times, context);
cost2(demo1, times, context);
}
/**
 * Times {@code times} calls of {@link WordFormatIgnoreEnglishStyle#format} over the
 * characters of {@code text} (cycled by index) and prints the elapsed milliseconds.
 *
 * @param text    characters to format
 * @param times   number of format calls
 * @param context context handed to every call
 */
private static void cost1(String text, int times, IWordContext context) {
    final IWordFormat formatter = new WordFormatIgnoreEnglishStyle();

    final long startMillis = System.currentTimeMillis();
    int round = 0;
    while (round < times) {
        formatter.format(text.charAt(round % text.length()), context);
        round++;
    }
    final long elapsed = System.currentTimeMillis() - startMillis;

    System.out.println(elapsed);
}
/**
 * Times {@code times} calls of {@link WordFormatIgnoreEnglishStyleC2C#format} over the
 * characters of {@code text} (cycled by index) and prints the elapsed milliseconds.
 *
 * @param text    characters to format
 * @param times   number of format calls
 * @param context context handed to every call
 */
private static void cost2(String text, int times, IWordContext context) {
    final IWordFormat formatter = new WordFormatIgnoreEnglishStyleC2C();

    final long startMillis = System.currentTimeMillis();
    int round = 0;
    while (round < times) {
        formatter.format(text.charAt(round % text.length()), context);
        round++;
    }
    final long elapsed = System.currentTimeMillis() - startMillis;

    System.out.println(elapsed);
}
}

View File

@@ -0,0 +1 @@
package com.github.houbb.sensitive.word.support.format;