package
diff --git a/src/main/java/com/github/houbb/sensitive/word/collection/Char2CharMap.java b/src/main/java/com/github/houbb/sensitive/word/collection/Char2CharMap.java
new file mode 100644
index 0000000..cc326f8
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/collection/Char2CharMap.java
@@ -0,0 +1,149 @@
+package com.github.houbb.sensitive.word.collection;
+
+/**
+ * Open-addressing hash map from {@code char} to {@code char}, stored in two
+ * parallel primitive arrays so that no boxing or unboxing takes place.
+ *
+ * <p>Collisions are resolved by linear probing. The key {@code '\0'} marks an
+ * empty slot and therefore cannot be stored; there is no remove operation.</p>
+ *
+ * @since 0.29.2
+ */
+public final class Char2CharMap {
+
+    /** Sentinel key that marks a free slot. */
+    private static final char EMPTY_KEY = '\0';
+    private static final float LOAD_FACTOR = 0.5f;
+
+    private char[] keys;
+    private char[] values;
+    private int size;
+    /** capacity - 1; the capacity is a power of two, so {@code & mask} wraps an index. */
+    private int mask;
+    /** Entry count at which the table is doubled. */
+    private int maxSize;
+
+    /**
+     * Creates an empty map whose initial capacity is derived from the expected
+     * number of entries and the load factor.
+     *
+     * @param expectedSize number of entries the caller expects to insert
+     */
+    public Char2CharMap(int expectedSize) {
+        int capacity = tableSizeFor((int) (expectedSize / LOAD_FACTOR) + 1);
+        this.keys = new char[capacity];
+        this.values = new char[capacity];
+        this.mask = capacity - 1;
+        this.maxSize = (int) (capacity * LOAD_FACTOR);
+        this.size = 0;
+    }
+
+    /** Rounds {@code cap} up to a power of two, clamped to the range [2, 2^30]. */
+    private static int tableSizeFor(int cap) {
+        int n = cap - 1;
+        n |= n >>> 1;
+        n |= n >>> 2;
+        n |= n >>> 4;
+        n |= n >>> 8;
+        n |= n >>> 16;
+        return (n < 2) ? 2 : (n >= (1 << 30) ? (1 << 30) : n + 1);
+    }
+
+    /** Multiplicative hash of the key, reduced to a slot index with the mask. */
+    private int hash(char k) {
+        return (k * 0x9E3779B9) & mask;
+    }
+
+    /**
+     * Inserts the mapping, or overwrites the value if the key is already present.
+     *
+     * @param key   key to store; must not be {@code '\0'}
+     * @param value value to associate with the key
+     * @throws IllegalArgumentException if {@code key} is {@code '\0'}
+     */
+    public void put(char key, char value) {
+        if (key == EMPTY_KEY) {
+            // The backslash is escaped so the message shows the two characters \0
+            // instead of embedding a raw NUL character.
+            throw new IllegalArgumentException("Key '\\0' is reserved as EMPTY_KEY.");
+        }
+        int idx = hash(key);
+        while (true) {
+            if (keys[idx] == EMPTY_KEY) {
+                keys[idx] = key;
+                values[idx] = value;
+                if (++size >= maxSize) {
+                    resize();
+                }
+                return;
+            } else if (keys[idx] == key) {
+                values[idx] = value;
+                return;
+            }
+            // Linear probing: try the next slot, wrapping at the end of the table.
+            idx = (idx + 1) & mask;
+        }
+    }
+
+    /**
+     * Looks up a key.
+     *
+     * @param key          key to look up
+     * @param defaultValue value returned when the key is absent
+     * @return the mapped value, or {@code defaultValue} if there is no mapping
+     */
+    public char get(char key, char defaultValue) {
+        if (key == EMPTY_KEY) {
+            return defaultValue;
+        }
+        int idx = hash(key);
+        while (true) {
+            char k = keys[idx];
+            if (k == EMPTY_KEY) {
+                return defaultValue;
+            }
+            if (k == key) {
+                return values[idx];
+            }
+            idx = (idx + 1) & mask;
+        }
+    }
+
+    /**
+     * Looks up a key, using {@code '\0'} as the "absent" result.
+     *
+     * @param key key to look up
+     * @return the mapped value, or {@code '\0'} if there is no mapping
+     */
+    public char get(char key) {
+        return get(key, '\0');
+    }
+
+    /** Doubles the capacity and re-inserts every stored entry. */
+    private void resize() {
+        int newCap = keys.length << 1;
+        char[] oldKeys = keys;
+        char[] oldVals = values;
+
+        keys = new char[newCap];
+        values = new char[newCap];
+        mask = newCap - 1;
+        maxSize = (int) (newCap * LOAD_FACTOR);
+        // put() increments size again for each re-inserted entry.
+        size = 0;
+
+        for (int i = 0; i < oldKeys.length; i++) {
+            char k = oldKeys[i];
+            if (k != EMPTY_KEY) {
+                put(k, oldVals[i]);
+            }
+        }
+    }
+
+    /** Returns the number of stored entries. */
+    public int size() {
+        return size;
+    }
+}
+
+
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataHashMap.java b/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataHashMap.java
deleted file mode 100644
index a4d4c6d..0000000
--- a/src/main/java/com/github/houbb/sensitive/word/support/data/WordDataHashMap.java
+++ /dev/null
@@ -1,203 +0,0 @@
-package com.github.houbb.sensitive.word.support.data;
-
-import com.github.houbb.heaven.util.lang.ObjectUtil;
-import com.github.houbb.heaven.util.lang.StringUtil;
-import com.github.houbb.sensitive.word.api.IWordContext;
-import com.github.houbb.sensitive.word.api.context.InnerSensitiveWordContext;
-import com.github.houbb.sensitive.word.constant.WordConst;
-import com.github.houbb.sensitive.word.constant.enums.WordContainsTypeEnum;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * 敏感词 map
- *
- * 不再维护,降低维护成本
- *
- * @author binbin.hou
- * @since 0.0.1
- */
-@Deprecated
-public class WordDataHashMap extends AbstractWordData {
-
- /**
- * 脱敏单词 map
- *
- * @since 0.0.1
- */
- private Map innerWordMap;
-
- /**
- * 读取敏感词库,将敏感词放入HashSet中,构建一个DFA算法模型:
- *
- * @param collection 敏感词库集合
- * @since 0.0.1
- *
- * 使用对象代码 map 的这种一直递归。
- * 参考资料:https://www.cnblogs.com/AlanLee/p/5329555.html
- * https://blog.csdn.net/chenssy/article/details/26961957
- */
- @Override
- @SuppressWarnings("unchecked")
- public synchronized void doInitWordData(Collection collection) {
- // 避免扩容带来的消耗
- Map newInnerWordMap = new HashMap(collection.size());
-
- for (String key : collection) {
- if (StringUtil.isEmpty(key)) {
- continue;
- }
-
- // 用来按照相应的格式保存敏感词库数据
- final int size = key.length();
-
- // 每一个新词的循环,直接将结果设置为当前 map,所有变化都会体现在结果的 map 中
- Map currentMap = newInnerWordMap;
-
- for (int i = 0; i < size; i++) {
- // 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值
- char charKey = key.charAt(i);
- // 如果集合存在
- Object wordMap = currentMap.get(charKey);
-
- // 如果集合存在
- if (ObjectUtil.isNotNull(wordMap)) {
- // 直接将获取到的 map 当前当前 map 进行继续的操作
- currentMap = (Map) wordMap;
- } else {
- //不存在则,则构建一个新的map,同时将isEnd设置为0,因为他不是最后一
- Map newWordMap = new HashMap<>(8);
- newWordMap.put(WordConst.IS_END, false);
-
- // 将新的节点放入当前 map 中
- currentMap.put(charKey, newWordMap);
-
- // 将新节点设置为当前节点,方便下一次节点的循环。
- currentMap = newWordMap;
- }
- }
-
- // 判断是否为最后一个,添加是否结束的标识。
- currentMap.put(WordConst.IS_END, true);
- }
-
- // 最后更新为新的 map,保证更新过程中旧的数据可用
- this.innerWordMap = newInnerWordMap;
- }
-
- @Override
- protected void doRemoveWord(Collection collection) {
-
- }
-
- @Override
- protected void doAddWord(Collection collection) {
-
- }
-
- /**
- * 是否包含
- * (1)直接遍历所有
- * (2)如果遇到,则直接返回 true
- *
- * @param stringBuilder 字符串
- * @param innerContext 内部上下文
- * @return 是否包含
- * @since 0.0.1
- */
- @Override
- public WordContainsTypeEnum doContains(final StringBuilder stringBuilder,
- final InnerSensitiveWordContext innerContext) {
- return innerContainsSensitive(stringBuilder, innerContext);
- }
-
- private WordContainsTypeEnum innerContainsSensitive(StringBuilder stringBuilder,
- final InnerSensitiveWordContext innerContext) {
- // 初始化为当前的 map
- Map nowMap = this.innerWordMap;
-
- // 记录敏感词的长度
- final int len = stringBuilder.length();
- for (int i = 0; i < len; i++) {
- // 获取当前的 map 信息
- nowMap = getNowMap(nowMap, i, stringBuilder, innerContext);
-
- // 如果不为空,则判断是否为结尾。
- if (ObjectUtil.isNull(nowMap)) {
- return WordContainsTypeEnum.NOT_FOUND;
- }
- }
-
- // 是否为结尾,便于快速失败
- boolean isEnd = isEnd(nowMap);
- if(isEnd) {
- return WordContainsTypeEnum.CONTAINS_END;
- }
-
- return WordContainsTypeEnum.CONTAINS_PREFIX;
- }
-
- /**
- * 判断是否结束
- * BUG-FIX: 避免出现敏感词库中没有的文字。
- * @param map map 信息
- * @return 是否结束
- * @since 0.0.9
- */
- private static boolean isEnd(final Map map) {
- if(ObjectUtil.isNull(map)) {
- return false;
- }
-
- Object value = map.get(WordConst.IS_END);
- if(ObjectUtil.isNull(value)) {
- return false;
- }
-
- return (boolean)value;
- }
- /**
- * 获取当前的 Map
- * @param nowMap 原始的当前 map
- * @param index 下标
- * @param stringBuilder 文本缓存
- * @param sensitiveContext 上下文
- * @return 实际的当前 map
- * @since 0.0.7
- */
- private Map getNowMap(Map nowMap,
- final int index,
- final StringBuilder stringBuilder,
- final InnerSensitiveWordContext sensitiveContext) {
- final IWordContext context = sensitiveContext.wordContext();
-
- // 这里的 char 已经是统一格式化之后的,所以可以不用再次格式化。
- char mappingChar = stringBuilder.charAt(index);
-
- // 这里做一次重复词的处理
- //TODO: 这里可以优化,是否获取一次。
- Map currentMap = (Map) nowMap.get(mappingChar);
- // 启用忽略重复&当前下标不是第一个
- if(context.ignoreRepeat()
- && index > 0) {
- char preMappingChar = stringBuilder.charAt(index-1);
-
- // 直接赋值为上一个 map
- if(preMappingChar == mappingChar) {
- currentMap = nowMap;
- }
- }
-
- return currentMap;
- }
-
- @Override
- public synchronized void destroy() {
- if(innerWordMap != null) {
- innerWordMap.clear();
- }
- }
-
-}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyle.java b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyle.java
index eb81f50..a3d5745 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyle.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyle.java
@@ -1,15 +1,18 @@
package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
-import com.github.houbb.sensitive.word.api.IWordFormat;
import com.github.houbb.sensitive.word.api.IWordContext;
-import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;
+import com.github.houbb.sensitive.word.api.IWordFormat;
+
+import java.util.HashMap;
+import java.util.Map;
/**
* 忽略英文的各种格式
* @author binbin.hou
* @since 0.0.6
*/
+@Deprecated
@ThreadSafe
public class WordFormatIgnoreEnglishStyle implements IWordFormat {
@@ -19,9 +22,52 @@ public class WordFormatIgnoreEnglishStyle implements IWordFormat {
return INSTANCE;
}
+ /**
+ * Stylized English letters (circled upper-case, circled lower-case, parenthesized lower-case); the character at index i is mapped to LETTERS_TWO at index i.
+ * @since 0.0.4
+ */
+ private static final String LETTERS_ONE =
+ "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
+ "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
+ "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";
+
+ /**
+ * Replacement letters, aligned index-by-index with LETTERS_ONE.
+ * @since 0.0.4
+ */
+ private static final String LETTERS_TWO =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
+ "abcdefghijklmnopqrstuvwxyz" +
+ "abcdefghijklmnopqrstuvwxyz";
+
+
+    /**
+     * Lookup table from each character of LETTERS_ONE to the character at the
+     * same index of LETTERS_TWO.
+     */
+    private static final Map<Character, Character> LETTER_MAP = new HashMap<>(LETTERS_ONE.length());
+
+    static {
+        final int size = LETTERS_ONE.length();
+        for(int i = 0; i < size; i++) {
+            LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i));
+        }
+    }
+
+    /**
+     * Returns the mapped form of a character.
+     * @param c character to convert
+     * @return the mapped letter, or {@code c} itself when it has no mapping
+     * @since 0.29.x
+     */
+    private char getMappingChar(final char c) {
+        // The map must be typed: a raw Map would make get() return Object.
+        Character mapChar = LETTER_MAP.get(c);
+        return mapChar == null ? c : mapChar;
+    }
+
+
@Override
public char format(char original, IWordContext context) {
- return InnerWordCharUtils.getMappingChar(original);
+ return getMappingChar(original);
}
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyleC2C.java b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyleC2C.java
new file mode 100644
index 0000000..4383dd0
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreEnglishStyleC2C.java
@@ -0,0 +1,72 @@
+package com.github.houbb.sensitive.word.support.format;
+
+import com.github.houbb.heaven.annotation.ThreadSafe;
+import com.github.houbb.sensitive.word.api.IWordContext;
+import com.github.houbb.sensitive.word.api.IWordFormat;
+import com.github.houbb.sensitive.word.collection.Char2CharMap;
+
+/**
+ * Format step that replaces stylized English letters with the letters they
+ * stand for, looked up in a primitive {@link Char2CharMap}.
+ * @author binbin.hou
+ * @since 0.0.6
+ */
+@ThreadSafe
+public class WordFormatIgnoreEnglishStyleC2C implements IWordFormat {
+
+    private static final IWordFormat INSTANCE = new WordFormatIgnoreEnglishStyleC2C();
+
+    public static IWordFormat getInstance() {
+        return INSTANCE;
+    }
+
+    /**
+     * Stylized letters: circled upper-case, circled lower-case and
+     * parenthesized lower-case forms.
+     * @since 0.0.4
+     */
+    private static final String LETTERS_ONE =
+            "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
+            "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
+            "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";
+
+    /**
+     * Replacement letters, aligned index-by-index with {@link #LETTERS_ONE}.
+     * @since 0.0.4
+     */
+    private static final String LETTERS_TWO =
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
+            "abcdefghijklmnopqrstuvwxyz" +
+            "abcdefghijklmnopqrstuvwxyz";
+
+    /**
+     * Lookup table from each stylized letter to its replacement.
+     */
+    private static final Char2CharMap LETTER_MAP = buildLetterMap();
+
+    /**
+     * Builds the lookup table with one entry per character of LETTERS_ONE.
+     * @return the populated map
+     */
+    private static Char2CharMap buildLetterMap() {
+        final Char2CharMap table = new Char2CharMap(LETTERS_ONE.length());
+        int pos = 0;
+        while (pos < LETTERS_ONE.length()) {
+            table.put(LETTERS_ONE.charAt(pos), LETTERS_TWO.charAt(pos));
+            pos++;
+        }
+        return table;
+    }
+
+    /**
+     * Maps one character.
+     * @param original character to convert
+     * @param context word context (not used by this format)
+     * @return the mapped letter, or {@code original} when it has no entry
+     */
+    @Override
+    public char format(char original, IWordContext context) {
+        return LETTER_MAP.get(original, original);
+    }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyle.java b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyle.java
index 89c85a2..0d65e46 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyle.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyle.java
@@ -3,13 +3,16 @@ package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
-import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
+
+import java.util.HashMap;
+import java.util.Map;
/**
* 忽略数字的样式
* @author binbin.hou
* @since 0.0.5
*/
+@Deprecated
@ThreadSafe
public class WordFormatIgnoreNumStyle implements IWordFormat {
@@ -19,9 +22,65 @@ public class WordFormatIgnoreNumStyle implements IWordFormat {
return INSTANCE;
}
+ private static final String NUM_ONE = "⓪0零º₀⓿○" + // seven look-alikes of zero
+ "123456789" + // every following row holds nine look-alikes of the digits 1-9
+ "一二三四五六七八九" +
+ "壹贰叁肆伍陆柒捌玖" +
+ "¹²³⁴⁵⁶⁷⁸⁹" +
+ "₁₂₃₄₅₆₇₈₉" +
+ "①②③④⑤⑥⑦⑧⑨" +
+ "⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
+ "⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
+ "❶❷❸❹❺❻❼❽❾" +
+ "➀➁➂➃➄➅➆➇➈" +
+ "➊➋➌➍➎➏➐➑➒" +
+ "㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
+ "⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
+ "㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
+ "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
+ "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";
+
+ private static final String NUM_TWO = "0000000"+ // replacement digits, aligned index-by-index with NUM_ONE
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789" +
+ "123456789";
+
+    /**
+     * Lookup table from each character of NUM_ONE to the character at the
+     * same index of NUM_TWO.
+     */
+    private static final Map<Character, Character> NUMBER_MAP = new HashMap<>(NUM_ONE.length());
+
+    static {
+        final int size = NUM_ONE.length();
+        for(int i = 0; i < size; i++) {
+            NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i));
+        }
+    }
+
+    /**
+     * Returns the mapped form of a character.
+     * @param c character to convert
+     * @return the mapped digit, or {@code c} itself when it has no mapping
+     * @since 0.0.4
+     */
+    private char getMappingChar(final char c) {
+        // The map must be typed: a raw Map would make get() return Object.
+        Character mapChar = NUMBER_MAP.get(c);
+        return mapChar == null ? c : mapChar;
+    }
+
@Override
public char format(char original, IWordContext context) {
- return InnerWordNumUtils.getMappingChar(original);
+ return getMappingChar(original);
}
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyleC2C.java b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyleC2C.java
new file mode 100644
index 0000000..a92efe9
--- /dev/null
+++ b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreNumStyleC2C.java
@@ -0,0 +1,88 @@
+package com.github.houbb.sensitive.word.support.format;
+
+import com.github.houbb.heaven.annotation.ThreadSafe;
+import com.github.houbb.sensitive.word.api.IWordContext;
+import com.github.houbb.sensitive.word.api.IWordFormat;
+import com.github.houbb.sensitive.word.collection.Char2CharMap;
+
+/**
+ * Format step that replaces digit look-alikes with the digits they stand for,
+ * looked up in a primitive {@link Char2CharMap}.
+ * @author binbin.hou
+ * @since 0.0.5
+ */
+@ThreadSafe
+public class WordFormatIgnoreNumStyleC2C implements IWordFormat {
+
+    private static final IWordFormat INSTANCE = new WordFormatIgnoreNumStyleC2C();
+
+    public static IWordFormat getInstance() {
+        return INSTANCE;
+    }
+
+    /** Digit look-alikes: seven variants of zero, then rows of nine characters for 1-9. */
+    private static final String NUM_ONE = "⓪0零º₀⓿○" +
+            "123456789" +
+            "一二三四五六七八九" +
+            "壹贰叁肆伍陆柒捌玖" +
+            "¹²³⁴⁵⁶⁷⁸⁹" +
+            "₁₂₃₄₅₆₇₈₉" +
+            "①②③④⑤⑥⑦⑧⑨" +
+            "⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
+            "⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
+            "❶❷❸❹❺❻❼❽❾" +
+            "➀➁➂➃➄➅➆➇➈" +
+            "➊➋➌➍➎➏➐➑➒" +
+            "㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
+            "⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
+            "㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
+            "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
+            "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";
+
+    /** Replacement digits, aligned index-by-index with {@link #NUM_ONE}. */
+    private static final String NUM_TWO = "0000000"+
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789" +
+            "123456789";
+
+    /** Lookup table from each look-alike to its replacement digit. */
+    private static final Char2CharMap NUMBER_MAP = new Char2CharMap(NUM_ONE.length());
+
+    static {
+        final int size = NUM_ONE.length();
+        for(int i = 0; i < size; i++) {
+            NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i));
+        }
+    }
+
+    /**
+     * Returns the mapped form of a character.
+     * @param c character to convert
+     * @return the mapped digit, or {@code c} itself when it has no mapping
+     * @since 0.0.4
+     */
+    private char getMappingChar(final char c) {
+        // Char2CharMap#get(char) yields '\0' for a missing key.
+        char mc = NUMBER_MAP.get(c);
+        return mc == 0 ? c : mc;
+    }
+
+    @Override
+    public char format(char original, IWordContext context) {
+        return getMappingChar(original);
+    }
+
+}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreWidth.java b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreWidth.java
index 4b700bd..2191310 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreWidth.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatIgnoreWidth.java
@@ -1,9 +1,9 @@
package com.github.houbb.sensitive.word.support.format;
import com.github.houbb.heaven.annotation.ThreadSafe;
-import com.github.houbb.heaven.util.lang.CharUtil;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
+import com.github.houbb.sensitive.word.utils.InnerCharUtils;
/**
* 格式化字宽度
@@ -21,7 +21,7 @@ public class WordFormatIgnoreWidth implements IWordFormat {
@Override
public char format(char original, IWordContext context) {
- return CharUtil.toHalfWidth(original);
+ return InnerCharUtils.toHalfWidth(original);
}
}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatInit.java b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatInit.java
deleted file mode 100644
index c047d59..0000000
--- a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormatInit.java
+++ /dev/null
@@ -1,44 +0,0 @@
-package com.github.houbb.sensitive.word.support.format;
-
-import com.github.houbb.heaven.annotation.ThreadSafe;
-import com.github.houbb.heaven.support.pipeline.Pipeline;
-import com.github.houbb.heaven.support.pipeline.impl.DefaultPipeline;
-import com.github.houbb.sensitive.word.api.IWordFormat;
-import com.github.houbb.sensitive.word.api.IWordContext;
-
-import java.util.List;
-
-/**
- * 格式化责任链
- * @author binbin.hou
- * @since 0.0.5
- */
-@ThreadSafe
-@Deprecated
-public abstract class WordFormatInit implements IWordFormat {
-
- /**
- * 初始化列表
- *
- * @param pipeline 当前列表泳道
- * @since 0.0.13
- */
- protected abstract void init(final Pipeline pipeline);
-
- @Override
- public char format(char original, IWordContext context) {
- Pipeline pipeline = new DefaultPipeline<>();
- init(pipeline);
-
- char result = original;
-
- // 循环执行
- List charFormats = pipeline.list();
- for(IWordFormat charFormat : charFormats) {
- result = charFormat.format(result, context);
- }
-
- return result;
- }
-
-}
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormats.java b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormats.java
index 505ea0d..1d67652 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormats.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/format/WordFormats.java
@@ -51,7 +51,7 @@ public final class WordFormats {
}
public static IWordFormat ignoreEnglishStyle() {
- return WordFormatIgnoreEnglishStyle.getInstance();
+ return WordFormatIgnoreEnglishStyleC2C.getInstance();
}
public static IWordFormat ignoreChineseStyle() {
@@ -59,7 +59,7 @@ public final class WordFormats {
}
public static IWordFormat ignoreNumStyle() {
- return WordFormatIgnoreNumStyle.getInstance();
+ return WordFormatIgnoreNumStyleC2C.getInstance();
}
public static IWordFormat ignoreWidth() {
diff --git a/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTextDefault.java b/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTextDefault.java
index b41b20d..b36fd9f 100644
--- a/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTextDefault.java
+++ b/src/main/java/com/github/houbb/sensitive/word/support/format/mapping/WordFormatTextDefault.java
@@ -2,7 +2,6 @@ package com.github.houbb.sensitive.word.support.format.mapping;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordFormat;
-import com.github.houbb.sensitive.word.support.check.WordCheckNone;
import com.github.houbb.sensitive.word.support.format.WordFormatNone;
import java.util.Collections;
@@ -26,6 +25,7 @@ public class WordFormatTextDefault extends AbstractWordFormatText {
return Collections.emptyMap();
}
+ //v0.29.2
Map map = new HashMap<>();
for(int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/InnerCharUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/InnerCharUtils.java
index 55b1c2a..0a2ea9f 100644
--- a/src/main/java/com/github/houbb/sensitive/word/utils/InnerCharUtils.java
+++ b/src/main/java/com/github/houbb/sensitive/word/utils/InnerCharUtils.java
@@ -5,6 +5,24 @@ package com.github.houbb.sensitive.word.utils;
*/
public class InnerCharUtils {
+    /**
+     * Converts a full-width character to its half-width counterpart.
+     * @param original character to convert
+     * @return the half-width character, or {@code original} when no conversion applies
+     * @since 0.29.2
+     */
+    public static char toHalfWidth(char original) {
+        // U+FF01..U+FF5E sit exactly 0xFEE0 above '!'..'~'.
+        final boolean inFullWidthBlock = original >= '\uFF01' && original <= '\uFF5E';
+        if (inFullWidthBlock) {
+            return (char) (original - 0xFEE0);
+        }
+        // The full-width space U+3000 lies outside that block; everything else is kept.
+        return original == '\u3000' ? ' ' : original;
+    }
+
+
/**
* 转换为整数
* @param text 文本
diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordCharUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordCharUtils.java
index ca61060..a15b62c 100644
--- a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordCharUtils.java
+++ b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordCharUtils.java
@@ -1,11 +1,7 @@
package com.github.houbb.sensitive.word.utils;
-import com.github.houbb.heaven.util.guava.Guavas;
-import com.github.houbb.heaven.util.lang.ObjectUtil;
import com.github.houbb.sensitive.word.api.IWordResult;
-import java.util.Map;
-
/**
* project: sensitive-word-NumUtils
* create on 2020/1/8 22:18
@@ -18,84 +14,6 @@ public final class InnerWordCharUtils {
private InnerWordCharUtils() {
}
- /**
- * 英文字母1
- * @since 0.0.4
- */
- private static final String LETTERS_ONE =
- "ⒶⒷⒸⒹⒺⒻⒼⒽⒾⒿⓀⓁⓂⓃⓄⓅⓆⓇⓈⓉⓊⓋⓌⓍⓎⓏ" +
- "ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ" +
- "⒜⒝⒞⒟⒠⒡⒢⒣⒤⒥⒦⒧⒨⒩⒪⒫⒬⒭⒮⒯⒰⒱⒲⒳⒴⒵";
-
- /**
- * 英文字母2
- * @since 0.0.4
- */
- private static final String LETTERS_TWO =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
- "abcdefghijklmnopqrstuvwxyz" +
- "abcdefghijklmnopqrstuvwxyz";
-
-
- /**
- * 英文字母 map
- * @since 0.0.4
- */
- private static final Map LETTER_MAP = Guavas.newHashMap(LETTERS_ONE.length());
-
- static {
- final int size = LETTERS_ONE.length();
-
- for(int i = 0; i < size; i++) {
- LETTER_MAP.put(LETTERS_ONE.charAt(i), LETTERS_TWO.charAt(i));
- }
- }
-
- /**
- * 映射后的 char
- * @param character 待转换的 char
- * @return 结果
- * @since 0.0.4
- */
- public static Character getMappingChar(final Character character) {
- final Character mapChar = LETTER_MAP.get(character);
- if(ObjectUtil.isNotNull(mapChar)) {
- return mapChar;
- }
-
- return character;
- }
-
- /**
- * 构建字符串
- * @param chars 字符数组
- * @param startIndex 开始位置
- * @param endIndex 结束位置
- * @return 结果
- * @since 0.5.0
- */
-// @Deprecated
-// public static String getString(final char[] chars,
-// final int startIndex,
-// final int endIndex) {
-// // 截取
-// int len = endIndex - startIndex;
-// return new String(chars, startIndex, len);
-// }
-
- /**
- * 构建字符串
- * @param chars 字符数组
- * @param wordResult 结果
- * @return 结果
- * @since 0.5.0
- */
-// @Deprecated
-// public static String getString(final char[] chars,
-// final IWordResult wordResult) {
-// return getString(chars, wordResult.startIndex(), wordResult.endIndex());
-// }
-
/**
* 构建字符串
* @param text 字符串
diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java
index a03775f..e0ea24b 100644
--- a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java
+++ b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordFormatUtils.java
@@ -47,17 +47,20 @@ public final class InnerWordFormatUtils {
/**
* 字符串统一的格式化处理
+ *
+ * 注意:这个需要 map 的实现是 {@link it.unimi.dsi.fastutil.chars.Char2CharOpenHashMap}
* @param map 映射集合
* @param c 原始
* @return 结果
* @since 0.28.0
*/
public static char getMappingChar(final Map map, char c) {
- Character mc = map.get(c);
- if(mc != null) {
- return mc;
+ //Char2CharOpenHashMap 不存在映射也是返回 null
+ Object mc = map.get(c);
+ if(mc == null) {
+ return c;
}
- return c;
+ return (char) mc;
}
/**
diff --git a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordNumUtils.java b/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordNumUtils.java
deleted file mode 100644
index ce9d8a3..0000000
--- a/src/main/java/com/github/houbb/sensitive/word/utils/InnerWordNumUtils.java
+++ /dev/null
@@ -1,128 +0,0 @@
-package com.github.houbb.sensitive.word.utils;
-
-import com.github.houbb.heaven.util.guava.Guavas;
-import com.github.houbb.heaven.util.lang.ObjectUtil;
-import com.github.houbb.heaven.util.lang.StringUtil;
-import com.github.houbb.sensitive.word.api.IWordContext;
-import com.github.houbb.sensitive.word.constant.enums.WordValidModeEnum;
-
-import java.util.Map;
-
-/**
- * project: sensitive-word-NumUtils
- * create on 2020/1/8 22:18
- *
- * @author Administrator
- * @since 0.0.4
- */
-public final class InnerWordNumUtils {
-
- private InnerWordNumUtils(){}
-
- private static final String NUM_ONE = "⓪0零º₀⓿○" +
- "123456789" +
- "一二三四五六七八九" +
- "壹贰叁肆伍陆柒捌玖" +
- "¹²³⁴⁵⁶⁷⁸⁹" +
- "₁₂₃₄₅₆₇₈₉" +
- "①②③④⑤⑥⑦⑧⑨" +
- "⑴⑵⑶⑷⑸⑹⑺⑻⑼" +
- "⒈⒉⒊⒋⒌⒍⒎⒏⒐" +
- "❶❷❸❹❺❻❼❽❾" +
- "➀➁➂➃➄➅➆➇➈" +
- "➊➋➌➍➎➏➐➑➒" +
- "㈠㈡㈢㈣㈤㈥㈦㈧㈨" +
- "⓵⓶⓷⓸⓹⓺⓻⓼⓽" +
- "㊀㊁㊂㊃㊄㊅㊆㊇㊈" +
- "ⅰⅱⅲⅳⅴⅵⅶⅷⅸ" +
- "ⅠⅡⅢⅣⅤⅥⅦⅧⅨ";
-
- private static final String NUM_TWO = "0000000"+
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789" +
- "123456789";
-
- /**
- * 英文字母 map
- * @since 0.0.4
- */
- private static final Map NUMBER_MAP = Guavas.newHashMap(NUM_ONE.length());
-
- static {
- final int size = NUM_ONE.length();
-
- for(int i = 0; i < size; i++) {
- NUMBER_MAP.put(NUM_ONE.charAt(i), NUM_TWO.charAt(i));
- }
- }
-
- /**
- * 映射后的 char
- * @param character 待转换的 char
- * @return 结果
- * @since 0.0.4
- */
- public static Character getMappingChar(final Character character) {
- final Character mapChar = NUMBER_MAP.get(character);
- if(ObjectUtil.isNotNull(mapChar)) {
- return mapChar;
- }
-
- return character;
- }
-
- public static String getMappingString(final String string) {
- if(StringUtil.isEmpty(string)) {
- return string;
- }
-
- int length = string.length();
- StringBuilder stringBuilder = new StringBuilder(length);
- for(int i = 0; i < length; i++) {
- char mapChar = getMappingChar(string.charAt(i));
-
- //TODO: stop word 的处理
- stringBuilder.append(mapChar);
- }
-
- return stringBuilder.toString();
- }
-
- /**
- * 检查敏感词数量
- *
- * (1)如果未命中敏感词,直接返回 0
- * (2)命中敏感词,则返回敏感词的长度。
- *
- * ps: 这里结果进行优化,
- * 1. 是否包含敏感词。
- * 2. 敏感词的长度
- * 3. 正常走过字段的长度(便于后期替换优化,避免不必要的循环重复)
- *
- * @param txt 文本信息
- * @param beginIndex 开始下标
- * @param wordValidModeEnum 验证模式
- * @param context 执行上下文
- * @return 敏感数字对应的长度
- * @since 0.0.5
- */
- private int getSensitiveNumber(final String txt, final int beginIndex,
- final WordValidModeEnum wordValidModeEnum,
- final IWordContext context) {
- return 0;
- }
-
-}
diff --git a/src/test/java/com/github/houbb/sensitive/word/benchmark/CharUtilPerfTest.java b/src/test/java/com/github/houbb/sensitive/word/benchmark/CharUtilPerfTest.java
new file mode 100644
index 0000000..000d3ee
--- /dev/null
+++ b/src/test/java/com/github/houbb/sensitive/word/benchmark/CharUtilPerfTest.java
@@ -0,0 +1,42 @@
+package com.github.houbb.sensitive.word.benchmark;
+
+import com.github.houbb.heaven.util.lang.CharUtil;
+import com.github.houbb.sensitive.word.utils.InnerCharUtils;
+
+/**
+ * Rough timing comparison between {@code CharUtil.toHalfWidth} and
+ * {@code InnerCharUtils.toHalfWidth} over the same full-width input.
+ */
+public class CharUtilPerfTest {
+
+    /** Number of characters converted per measurement. */
+    private static final int COUNT = 1_000_000;
+
+    public static void main(String[] args) {
+        // Input cycles through the 94 full-width characters U+FF01..U+FF5E.
+        char[] fullWidthData = new char[COUNT];
+        for (int i = 0; i < COUNT; i++) {
+            fullWidthData[i] = (char) ('\uFF01' + (i % 94));
+        }
+
+        // Original implementation. The checksum is an int so that it is printed
+        // as a number rather than as a single character.
+        long originalStart = System.currentTimeMillis();
+        int originalSum = 0;
+        for (char c : fullWidthData) {
+            originalSum += CharUtil.toHalfWidth(c);
+        }
+        long originalEnd = System.currentTimeMillis();
+        System.out.println("原始 toHalfWidth 耗时: " + (originalEnd - originalStart) + "ms, sum=" + originalSum);
+
+        // New implementation, measured over the same input.
+        long optimizedStart = System.currentTimeMillis();
+        int optimizedSum = 0;
+        for (char c : fullWidthData) {
+            optimizedSum += InnerCharUtils.toHalfWidth(c);
+        }
+        long optimizedEnd = System.currentTimeMillis();
+        System.out.println("优化 toHalfWidth 耗时: " + (optimizedEnd - optimizedStart) + "ms, sum=" + optimizedSum);
+    }
+
+}
diff --git a/src/test/java/com/github/houbb/sensitive/word/data/DictSlimTest.java b/src/test/java/com/github/houbb/sensitive/word/data/DictSlimTest.java
index 535f121..afc86eb 100644
--- a/src/test/java/com/github/houbb/sensitive/word/data/DictSlimTest.java
+++ b/src/test/java/com/github/houbb/sensitive/word/data/DictSlimTest.java
@@ -1,149 +1,149 @@
-package com.github.houbb.sensitive.word.data;
-
-import com.github.houbb.heaven.support.filter.IFilter;
-import com.github.houbb.heaven.support.handler.IHandler;
-import com.github.houbb.heaven.util.io.FileUtil;
-import com.github.houbb.heaven.util.lang.StringUtil;
-import com.github.houbb.heaven.util.util.CollectionUtil;
-import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
-import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
-import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
-import org.junit.Ignore;
-import org.junit.Test;
-
-import java.util.List;
-
-/**
- * 数据初始化
- * @author binbin.hou
- * @since 0.0.3
- */
-@Ignore
-public class DictSlimTest {
-
- /**
- * 统一格式
- *
- * 1. 将所有的大写字母统一转换为小写
- * 2. 将所有的全角转换为半角
- * 3. 移除所有【空格】【符号】(这个就是各种符号的过滤了)
- * 4. 繁体字统一转换为简体字
- * @since 0.0.3
- */
- @Test
- @Ignore
- public void formatTest() {
- final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
- final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
-
- List words = FileUtil.readAllLines(sourceFile);
-
- List formats = CollectionUtil.toList(words, new IHandler() {
- @Override
- public String handle(String string) {
- String lower = string.toLowerCase();
- String half = StringUtil.toHalfWidth(lower);
- String trim = StringUtil.trimAnyBlank(half);
- String punc = StringUtil.trimAnyPunctionAndSymbol(trim);
- return ZhConvertBootstrap.newInstance(new CharSegment()).toSimple(punc);
- }
- });
-
- List resultList = DataUtil.disctinctAndSort(formats);
- FileUtil.write(targetFile, resultList);
- }
-
- /**
- * 移除测试
- *
- * 1. 移除 QQ 号的类似数字
- * 2. 移除所有网址(.com、cn、.org)
- * 3. 移除纯英文
- * 4. 移除乱码 `�`
- * 5. 移除英文+数字的
- *
- * @since 0.0.3
- */
- @Test
- @Ignore
- public void removeTest() {
- final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
- final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
-
- List words = FileUtil.readAllLines(sourceFile);
-
- List formats = CollectionUtil.filterList(words, new IFilter() {
- @Override
- public boolean filter(String string) {
- return StringUtil.isDigitOrLetter(string)
- || string.contains("�")
- || string.contains("删掉")
- || isUrl(string);
- }
- });
-
- List resultList = DataUtil.disctinctAndSort(formats);
- FileUtil.write(targetFile, resultList);
- }
-
- /**
- * 数字映射处理
- * @since 0.0.4
- */
- @Test
- public void removeNumberMappingTest() {
- final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
- final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
-
- List words = FileUtil.readAllLines(sourceFile);
- List formats = CollectionUtil.toList(words, new IHandler() {
- @Override
- public String handle(String s) {
- return s.replaceAll(" ", "");
- }
- });
- List filters = CollectionUtil.filterList(formats, new IFilter() {
- @Override
- public boolean filter(String string) {
- return isNumber(string);
- }
- });
-
- List resultList = DataUtil.disctinctAndSort(filters);
- FileUtil.write(targetFile, resultList);
- }
-
- /**
- * 是否为存数字
- * (1)数字小于4的直接跳过。
- * @param string 原始字符串
- * @return 结果
- * @since 0.0.4
- */
- private static boolean isNumber(final String string) {
- if(string.length() <= 4) {
- return false;
- }
-
- // 停顿词语
- String trim = string.replaceAll("加|否|与|和", "");
- String mapString = InnerWordNumUtils.getMappingString(trim);
- boolean result = StringUtil.isDigit(mapString);
- if(result) {
- System.out.println(string);
- }
- return result;
- }
-
- private static boolean isUrl(final String string) {
- return string.endsWith(".com")
- || string.endsWith(".cn")
- || string.endsWith(".org");
- }
-
- public static void main(String[] args) {
- String trim = "1和2".replaceAll("加|否|与|和", "");
- System.out.println(trim);
- }
-
-}
+//package com.github.houbb.sensitive.word.data;
+//
+//import com.github.houbb.heaven.support.filter.IFilter;
+//import com.github.houbb.heaven.support.handler.IHandler;
+//import com.github.houbb.heaven.util.io.FileUtil;
+//import com.github.houbb.heaven.util.lang.StringUtil;
+//import com.github.houbb.heaven.util.util.CollectionUtil;
+//import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
+//import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
+//import com.github.houbb.sensitive.word.utils.InnerWordNumUtils;
+//import org.junit.Ignore;
+//import org.junit.Test;
+//
+//import java.util.List;
+//
+///**
+// * 数据初始化
+// * @author binbin.hou
+// * @since 0.0.3
+// */
+//@Ignore
+//public class DictSlimTest {
+//
+// /**
+// * 统一格式
+// *
+// * 1. 将所有的大写字母统一转换为小写
+// * 2. 将所有的全角转换为半角
+// * 3. 移除所有【空格】【符号】(这个就是各种符号的过滤了)
+// * 4. 繁体字统一转换为简体字
+// * @since 0.0.3
+// */
+// @Test
+// @Ignore
+// public void formatTest() {
+// final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
+// final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
+//
+// List words = FileUtil.readAllLines(sourceFile);
+//
+// List formats = CollectionUtil.toList(words, new IHandler() {
+// @Override
+// public String handle(String string) {
+// String lower = string.toLowerCase();
+// String half = StringUtil.toHalfWidth(lower);
+// String trim = StringUtil.trimAnyBlank(half);
+// String punc = StringUtil.trimAnyPunctionAndSymbol(trim);
+// return ZhConvertBootstrap.newInstance(new CharSegment()).toSimple(punc);
+// }
+// });
+//
+// List resultList = DataUtil.disctinctAndSort(formats);
+// FileUtil.write(targetFile, resultList);
+// }
+//
+// /**
+// * 移除测试
+// *
+// * 1. 移除 QQ 号的类似数字
+// * 2. 移除所有网址(.com、cn、.org)
+// * 3. 移除纯英文
+// * 4. 移除乱码 `�`
+// * 5. 移除英文+数字的
+// *
+// * @since 0.0.3
+// */
+// @Test
+// @Ignore
+// public void removeTest() {
+// final String sourceFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
+// final String targetFile = "D:\\github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
+//
+// List words = FileUtil.readAllLines(sourceFile);
+//
+// List formats = CollectionUtil.filterList(words, new IFilter() {
+// @Override
+// public boolean filter(String string) {
+// return StringUtil.isDigitOrLetter(string)
+// || string.contains("�")
+// || string.contains("删掉")
+// || isUrl(string);
+// }
+// });
+//
+// List resultList = DataUtil.disctinctAndSort(formats);
+// FileUtil.write(targetFile, resultList);
+// }
+//
+// /**
+// * 数字映射处理
+// * @since 0.0.4
+// */
+// @Test
+// public void removeNumberMappingTest() {
+// final String sourceFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
+// final String targetFile = "D:\\_github\\sensitive-word\\src\\main\\resources\\sensitive_word_dict.txt";
+//
+// List words = FileUtil.readAllLines(sourceFile);
+// List formats = CollectionUtil.toList(words, new IHandler() {
+// @Override
+// public String handle(String s) {
+// return s.replaceAll(" ", "");
+// }
+// });
+// List filters = CollectionUtil.filterList(formats, new IFilter() {
+// @Override
+// public boolean filter(String string) {
+// return isNumber(string);
+// }
+// });
+//
+// List resultList = DataUtil.disctinctAndSort(filters);
+// FileUtil.write(targetFile, resultList);
+// }
+//
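+//    /**
+//     * Whether the string is purely numeric.
+//     * (1) Strings of 4 characters or fewer are skipped (return false).
+//     * @param string the original string
+//     * @return the result
+//     * @since 0.0.4
+//     */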
+// private static boolean isNumber(final String string) {
+// if(string.length() <= 4) {
+// return false;
+// }
+//
+// // 停顿词语
+// String trim = string.replaceAll("加|否|与|和", "");
+//// String mapString = InnerWordNumUtils.getMappingString(trim);
+//// boolean result = StringUtil.isDigit(mapString);
+//// if(result) {
+//// System.out.println(string);
+//// }
+//// return result;
+// }
+//
+// private static boolean isUrl(final String string) {
+// return string.endsWith(".com")
+// || string.endsWith(".cn")
+// || string.endsWith(".org");
+// }
+//
+// public static void main(String[] args) {
+// String trim = "1和2".replaceAll("加|否|与|和", "");
+// System.out.println(trim);
+// }
+//
+//}
diff --git a/src/test/java/com/github/houbb/sensitive/word/support/format/EnglishStylePerfTest.java b/src/test/java/com/github/houbb/sensitive/word/support/format/EnglishStylePerfTest.java
new file mode 100644
index 0000000..3ec8580
--- /dev/null
+++ b/src/test/java/com/github/houbb/sensitive/word/support/format/EnglishStylePerfTest.java
@@ -0,0 +1,46 @@
+package com.github.houbb.sensitive.word.support.format;
+
+import com.github.houbb.sensitive.word.api.IWordContext;
+import com.github.houbb.sensitive.word.api.IWordFormat;
+
+/** Rough timing of {@code WordFormatIgnoreEnglishStyle} vs. {@code WordFormatIgnoreEnglishStyleC2C}. */
+public class EnglishStylePerfTest {
+
+    public static void main(String[] args) {
+        final int times = 200000;
+        // The context is not involved in this comparison, so none is supplied.
+        IWordContext context = null;
+        // Sample text; its characters are fed to each formatter one at a time, wrapping around.
+        String demo1 = "产品尺寸参数§60mn§50mm§210枚/包§160枚/包§名称A4银色不干胶§规格60mm*40mm 送配套模板§规格70mm*50mm 送配套模板§数量每大张21枚一包10张总计210枚§数量每大张16枚一包10张总计160枚§适用激光打印机打印油性笔书写§95mm§100mn§55mm§100枚/包§80枚/包§名称 A4银色不干胶§规格95mm*55mm 送配套模板§规格100mm*70mm 送配套模板§数量每大张10枚一包10张总计100枚§数量 每大张8枚一包10张 总计80枚§100mm§120枚/包§140枚/包§规格80mm*50mm 送配套模板§规格100mm*40mm 送配套模板§数量每大张12枚一包10张总计120枚§数量§每大张14枚包10张总计140枚§适用 激光打印机打印油性笔书写§40mm§65mm§70mm§35mm§200枚/包§240枚/包§规格70mm*40mm送配套模板§规格§65mm*35mm 送配套模板§数量 每大张20枚一包10张总计200枚§每大张24枚包10张总计240枚§适 激光打印机打印油性笔书写§适用§激光打印机打印油性笔书写§40mn§280枚/包§360枚/包§规格50mm*40mm 送配套模板§规格40mm*30mm 送配套模板§数量每大张28枚一包10张总计280枚§数量每大张36枚一包10张总计360枚§45.7mm§38.1mm§400枚/包§650枚/包§45.7mm*25.4mm送配套模板§38.1mm*21.2mm 送配套模板§每大张40枚一包10张总计400枚§数量每大张65枚一包10张总计650枚§30mm§25mr§20mm§840枚/包§1260枚/包§规格 30mm*20mm 送配套模板§规格25mm*13mm 送配套模板§数量每张84枚包10张总计840枚§数量每大张126枚一包10张总计1260枚§46mm§意制§任§1000枚/包§定§名称定制A4内割银不胶§规格46mm*11.1mm送配套模板§任意规格定制§每大张100枚包10张总计1000枚§包10张满5包送专属模板§适激光打印机打印油性笔书写§产品实拍§8格打印实拍展示(100mm*70mm)§上海荠骞文化用品固定资产标识卡§资产编号:§规格型号:§资产名称:§使用状态:§资产类别:§资产原值§存放地点§生产厂家:§使用人§备§注:§*请爱护公司财产,不要随意撕毁此标签§16格全内容打印实拍展示§固定资产标识卡§资产名称§四层货架(平板)§资产编号§3F跑菜区§规格型号§1800×500×1500§使用部门§财务部§使用时间§2019-04-26§李强§21格手写款打印展示 (60mm*40mm)§固定资标识卡§36格打印实拍展示(40mm*30mm)§固定资产标签§名称:§编号:§部门:§40格打印实拍展示(45.7mm*25.4mm)§固定资§名称:电脑§编号:20210§部门:财务部§20210201§使用人:我最强§八:找最强§编号:20210201§65格打印实拍展示(38mm*21mm)§名称:§编号:§数量:§数量:§100格打印实拍展示(46mm*11.1mm)§客服电话:159 9569 3815§: 159 9569 3815§.§客服电话:159 9569§客服电话:1599§客服电话§服电话:159 9569 3815§话:159 9569 3815§客服电话:1599569 3815§电话:159 9569 3815§9569 3815§159 9569 3815§客服电话:§低值易耗品标识牌(70mm*50mm)§购买日期§保管部门§责任人§生产厂家§不要随意撕毁此标牌*§*请爱护公司财产,不要随意撕导§品标识牌§低值易耗品标识牌§随意撕毁此标牌*§*请爱护公司财产,不要随意撕毁此标牌*§三人沙发§行政酒廊§2200*860*900§2018-07-23§应用范围§多用于产品信息固有资产登记航空仓库管理 医疗政府机构等§Mainly used for product information inherent assets registration, aviation warehouse management, medi§cal government institutions, etc§政府单位§企业办公§仓储行业§医疗器械§教育单位§耐用品§电子产品包装§商城卖场";
+        cost1(demo1, times, context);
+        cost2(demo1, times, context);
+    }
+
+    /** Times a fresh {@code WordFormatIgnoreEnglishStyle} instance. */
+    private static void cost1(String text, int times, IWordContext context) {
+        cost(new WordFormatIgnoreEnglishStyle(), text, times, context);
+    }
+
+    /** Times a fresh {@code WordFormatIgnoreEnglishStyleC2C} instance. */
+    private static void cost2(String text, int times, IWordContext context) {
+        cost(new WordFormatIgnoreEnglishStyleC2C(), text, times, context);
+    }
+
+    /**
+     * Calls {@code format.format} on {@code times} characters of {@code text} (wrapping around)
+     * and prints the implementation's simple class name with the elapsed milliseconds.
+     * @param format  formatter being timed
+     * @param text    sample text supplying the characters
+     * @param times   number of {@code format} calls to make
+     * @param context context passed through unchanged to every call
+     */
+    private static void cost(IWordFormat format, String text, int times, IWordContext context) {
+        final int length = text.length();
+        long start = System.nanoTime(); // monotonic clock, unaffected by wall-clock adjustments
+        for (int i = 0; i < times; i++) {
+            format.format(text.charAt(i % length), context);
+        }
+        long elapsedMs = (System.nanoTime() - start) / 1_000_000;
+        System.out.println(format.getClass().getSimpleName() + ": " + elapsedMs + "ms");
+    }
+}
diff --git a/src/test/java/com/github/houbb/sensitive/word/support/format/package-info.java b/src/test/java/com/github/houbb/sensitive/word/support/format/package-info.java
new file mode 100644
index 0000000..4e67d83
--- /dev/null
+++ b/src/test/java/com/github/houbb/sensitive/word/support/format/package-info.java
@@ -0,0 +1 @@
+package com.github.houbb.sensitive.word.support.format;
\ No newline at end of file