定义树结构
/*** 树的节点* 保存了:当前字符c和子树next*/public class Word implements Comparable<Word> { public char c; public List next = null; public Word(char c) { this.c = c; } @Override public int compareTo(Word word) { return c - word.c; } public String toString() { return c + "(" + (next == null ? null : next.size()) + ")"; }}
多叉树某一层所有树节点的集合
/** * 多叉树某一层所有树节点的集合 */public class List extends ArrayList<Word> { public Word get(char c) { for (Word w : this) { if (w.c == c) return w; } return null; } /** * 二分查找,必须先升序排序 * * @param c 需要查找的字符 * @return Word对象:如果找到 null:如果没找到 */ public Word binaryGet(char c) { int left, right, key; Word word; left = 0; right = this.size() - 1; while (left <= right) { key = (left + right) / 2; word = get(key); if (word.c == c) { return word; } else if (word.c > c) { right = key - 1; } else { left = key + 1; } } return null; } public Word add(char c) { Word word = new Word(c); super.add(word); return word; }}
DFA算法核心
import java.io.*;import java.util.ArrayList;import java.util.Collections;/** * DFA算法核心 */public final class SensitiveWordFilter { //加载完后的敏感词多叉树 public static List wordList; private final static char replace = '*'; // 替代字符 private final static char[] skip = new char[]{ // 遇到这些字符就会跳过,例如,如果"AB"是敏感词,那么"A B","A=B"也会被屏蔽 '!', '*', '-', '+', '_', '=', ',', '.', '@' }; /** * 敏感词替换 * * @param text 待替换文本 * @return 替换后的文本 */ public static String Filter(String text) { if (wordList == null || wordList.size() == 0) return text; char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历 int i, j; Word word; boolean flag; // 是否需要替换 for (i = 0; i < __char__.length; i++) { // 遍历所有字符 char c = __char__[i]; word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率 if (word != null) { // word != null说明找到了 flag = false; j = i + 1; while (j < __char__.length) { // 开始逐个比较后面的字符 if (skip(__char__[j])) { // 跳过空格之类的无关字符 j++; continue; } if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词 /* 以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少 例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个 假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找 */ word = word.next.get(__char__[j]); if (word == null) { break; } j++; } else { // 字符串已结束,存在敏感词汇 flag = true; break; } } if (word != null && word.next == null) { flag = true; } if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换 while (i < j) { if (skip(__char__[i])) { // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句 i++; continue; } __char__[i] = replace; i++; } i--; } } } return new String(__char__); } /** * 加载敏感词列表 * * @param words 敏感词数组 */ public static void loadWord(ArrayList<String> words) { if (words == null) return; char[] chars; List now; Word word; wordList = new List(); for (String __word__ : words) { if (__word__ == null) { continue; } chars = __word__.toCharArray(); now = wordList; word = null; for (char c : chars) { if (word != null) { if (word.next == null) { word.next = new List(); } now = word.next; } word = now.get(c); if (word == null) { word = now.add(c); } } } sort(wordList); } /** * 加载敏感词txt文件,每个敏感词独占一行,不可出现空格,空行,逗号等非文字内容,必须使用UTF-8编码 * * @param path txt文件的绝对地址 */ public static void loadWordFromFile(String path) { String encoding = "UTF-8"; File file = new File(path); try { if (file.isFile() && file.exists()) { InputStreamReader inputStreamReader = new InputStreamReader( new FileInputStream(file), encoding ); BufferedReader bufferedReader = new BufferedReader(inputStreamReader); String line; ArrayList<String> list = new ArrayList<>(); while ((line = bufferedReader.readLine()) != null) { list.add(line); } bufferedReader.close(); inputStreamReader.close(); loadWord(list); } } catch (IOException e) { e.printStackTrace(); } } /** * 对敏感词多叉树递增排序 * * @param list 待排序List */ private static void sort(List list) { if (list == null) return; Collections.sort(list); // 递增排序 for (Word word : list) { sort(word.next); } } /** * 判断是否跳过当前字符 * * @param c 待检测字符 * @return true:需要跳过 false:不需要跳过 */ private static boolean skip(char c) { for (char c1 : skip) { if (c1 == c) return true; } return false; }}
测试代码
public class Main { public static void main(String[] args) throws Exception { SensitiveWordFilter.loadWordFromFile("C:\\code\\test\\SensitiveWord\\SensitiveWordList.txt"); Scanner sc = new Scanner(System.in); while (sc.hasNextLine()){ String line = sc.nextLine(); if (line.equals("bye")){ break; } String filter = SensitiveWordFilter.Filter(line); System.out.println("结果:"+filter); } }}
测试结果示例
敏感词文件示例
机吧TMD
参考
原代码地址,本文只做总结搬运