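[Feature] Add dictionary slimming test (DictSlimTest), DataUtil.disctinctAndSort helper, and optional opencc4j dependency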

This commit is contained in:
binbin.hou
2020-01-08 20:57:00 +08:00
parent 98374a1da2
commit f673cc32cf
6 changed files with 120 additions and 3651 deletions

19
pom.xml
View File

@@ -25,7 +25,7 @@
<project.compiler.level>1.7</project.compiler.level>
<!--============================== INTER ==============================-->
<heaven.version>0.1.67</heaven.version>
<heaven.version>0.1.68-SNAPSHOT</heaven.version>
<!--============================== OTHER ==============================-->
<junit.version>4.12</junit.version>
</properties>
@@ -41,6 +41,18 @@
<version>${heaven.version}</version>
</dependency>
<!-- opencc4j 1.2.0 is declared optional and its own heaven dependency is excluded. NOTE(review): presumably so that the heaven version managed by this pom is the one resolved; confirm. -->
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>opencc4j</artifactId>
<version>1.2.0</version>
<optional>true</optional>
<exclusions>
<exclusion>
<groupId>com.github.houbb</groupId>
<artifactId>heaven</artifactId>
</exclusion>
</exclusions>
</dependency>
<!--============================== OTHER ==============================-->
<dependency>
@@ -63,6 +75,11 @@
<artifactId>heaven</artifactId>
</dependency>
<!-- No version is given here. NOTE(review): presumably the version, optional flag and exclusions come from a managed declaration elsewhere in this pom; confirm. -->
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>opencc4j</artifactId>
</dependency>
<!--============================== OTHER ==============================-->
<dependency>
<groupId>junit</groupId>

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -3,6 +3,8 @@ package com.github.houbb.sensitive.word.data;
import com.github.houbb.heaven.util.io.FileUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
/**
@@ -24,4 +26,11 @@ public class DataUtil {
return CollectionUtil.distinct(lines);
}
/**
 * Builds a list from the given strings via {@code CollectionUtil.distinct} and then
 * sorts that list in place using the natural {@link String} ordering.
 *
 * @param collection the strings to process
 * @return the list produced by {@code CollectionUtil.distinct}, sorted ascending
 */
public static List<String> disctinctAndSort(final Collection<String> collection) {
    // De-duplication is delegated to the heaven helper; the returned list is reused as the result.
    final List<String> uniqueValues = CollectionUtil.distinct(collection);

    // Collections.sort mutates the list it is given, so no copy is made here.
    Collections.sort(uniqueValues);

    return uniqueValues;
}
}

View File

@@ -0,0 +1,93 @@
package com.github.houbb.sensitive.word.data;
import com.github.houbb.heaven.support.filter.IFilter;
import com.github.houbb.heaven.support.handler.IHandler;
import com.github.houbb.heaven.util.io.FileUtil;
import com.github.houbb.heaven.util.lang.StringUtil;
import com.github.houbb.heaven.util.util.CollectionUtil;
import com.github.houbb.opencc4j.core.impl.ZhConvertBootstrap;
import com.github.houbb.opencc4j.support.segment.impl.CharSegment;
import org.junit.Ignore;
import org.junit.Test;
import java.util.List;
import java.util.Locale;
/**
 * Manually-run maintenance tasks that normalise and slim down the word dictionary.
 *
 * <p>Both tasks are marked {@code @Ignore} and read/write hard-coded local paths, so they
 * only do work when a developer enables them on a machine where those paths exist.</p>
 *
 * @author binbin.hou
 * @since 0.0.3
 */
public class DictSlimTest {

    /** Main dictionary file: read by {@link #formatTest()}, overwritten by {@link #removeTest()}. */
    private static final String DICT_FILE =
            "D:\\github\\sensitive-word\\src\\main\\resources\\dict.txt";

    /** Intermediate file: written by {@link #formatTest()}, read by {@link #removeTest()}. */
    private static final String DICT_FORMAT_FILE =
            "D:\\github\\sensitive-word\\src\\main\\resources\\dict_format.txt";

    /**
     * Rewrites every dictionary entry into a unified format.
     *
     * <ol>
     *   <li>Convert upper-case letters to lower case.</li>
     *   <li>Convert full-width characters to half-width.</li>
     *   <li>Remove blanks, punctuation and symbols.</li>
     *   <li>Convert traditional Chinese to simplified Chinese.</li>
     * </ol>
     *
     * <p>The formatted entries are passed through {@code DataUtil.disctinctAndSort} and
     * written to the intermediate file.</p>
     *
     * @since 0.0.3
     */
    @Test
    @Ignore
    public void formatTest() {
        // Built once and reused for every entry instead of being re-created per word.
        final ZhConvertBootstrap converter = ZhConvertBootstrap.newInstance(new CharSegment());

        List<String> words = FileUtil.readAllLines(DICT_FILE);
        List<String> formats = CollectionUtil.toList(words, new IHandler<String, String>() {
            @Override
            public String handle(String string) {
                // Locale.ROOT keeps the result independent of the JVM default locale
                // (e.g. a Turkish default locale lower-cases 'I' to a dotless 'i').
                String lower = string.toLowerCase(Locale.ROOT);
                String half = StringUtil.toHalfWidth(lower);
                String trim = StringUtil.trimAnyBlank(half);
                String punc = StringUtil.trimAnyPunctionAndSymbol(trim);
                return converter.toSimple(punc);
            }
        });

        List<String> resultList = DataUtil.disctinctAndSort(formats);
        FileUtil.write(DICT_FORMAT_FILE, resultList);
    }

    /**
     * Drops unwanted entries from the formatted dictionary.
     *
     * <ol>
     *   <li>Number-only entries such as QQ numbers.</li>
     *   <li>URLs (entries ending in {@code .com}, {@code .cn} or {@code .org}).</li>
     *   <li>Purely English entries.</li>
     *   <li>Garbled entries (those containing the mis-decoded character marker matched below).</li>
     *   <li>Entries made of English letters plus digits.</li>
     *   <li>Entries containing the marker text matched below (Chinese for "delete").</li>
     * </ol>
     *
     * <p>NOTE(review): this assumes {@code IFilter#filter} returning {@code true} means the
     * entry is removed by {@code CollectionUtil.filterList}; confirm against the heaven library.</p>
     *
     * @since 0.0.3
     */
    @Test
    @Ignore
    public void removeTest() {
        List<String> words = FileUtil.readAllLines(DICT_FORMAT_FILE);
        List<String> formats = CollectionUtil.filterList(words, new IFilter<String>() {
            @Override
            public boolean filter(String string) {
                // Items 1, 3 and 5 above are presumably all covered by isDigitOrLetter; verify.
                // NOTE(review): the "<EFBFBD>" literal looks like a rendered form of U+FFFD
                // (UTF-8 bytes EF BF BD); confirm the file on disk holds the intended character.
                return StringUtil.isDigitOrLetter(string)
                        || string.contains("<EFBFBD>")
                        || string.contains("删掉")
                        || isUrl(string);
            }
        });

        List<String> resultList = DataUtil.disctinctAndSort(formats);
        FileUtil.write(DICT_FILE, resultList);
    }

    /**
     * Tells whether an entry looks like a web address, judged only by its suffix.
     *
     * @param string the dictionary entry to check
     * @return {@code true} if the entry ends with {@code .com}, {@code .cn} or {@code .org}
     */
    private static boolean isUrl(final String string) {
        return string.endsWith(".com")
                || string.endsWith(".cn")
                || string.endsWith(".org");
    }
}