方案一:
#!/usr/bin python3# -*- encoding: utf-8 -*-'''@File : charcorrect.py@Time : 2020/07/28 11:27:03@Author : 陈培杞@Version : 1.0@docs :原始数据集样本格式* wp_2gram.txt:1 the git2 in the* eml.txtHéllo I'm a programmér who crackéd your émail account and dévicé about过程:1. 将原始数据结构化为dict —— 按字符串长度进行分桶处理。{2:['aa','bb'], 8:['isthethe','aboutyou']}2. 将字符转为ASCII码3. 处理成 faiss 模块的输入形式,分桶搜索'''from pathlib import Pathproject_path = Path(__file__).parent.resolve()import reimport astimport jsonimport numpy as npfrom collections import defaultdictclass TrieTree(object):"""用于模糊搜索"""def __init__(self):self.root={}self.word_end = -1def add(self,word):nownode=self.rootfor char in word:if char not in nownode:nownode[char] = {}nownode = nownode[char]nownode[self.word_end] = Truedef search(self, word):nownode = self.rootfor char in word:if char not in nownode:return Falsenownode = nownode[char]if self.word_end not in nownode:return Falsereturn Truedef fuzzyMatch(self, tree, word):nowNode = treematchWord = ''matchWordList=[]for i,char in enumerate(word):if char in nowNode:matchWord += charnowNode = nowNode[char]else:if ord(char) <= 122 : # 既不在树中,也不是通配符,返回空串return ['']else: # 通配符处理for key in nowNode.keys():nextNodes = nowNode[key]if key == self.word_end:return ['']NextMatchWordList = self.fuzzyMatch(nextNodes, word[i+1:])matchWordList += [ matchWord + key + chars for chars in NextMatchWordList]if matchWordList :return matchWordListelse:return [matchWord]def getTree(self):return self.rootdef calculateSimilarity( searchDict:dict, charlength=5):key = charlengthdict_eml = getProcessedData(f"{EmlSaveFile}_{charlength}")dict_wiki = getProcessedData(f"{WikiSaveFile}_{charlength}")# trie tree 检索mytree = TrieTree()for word in dict_wiki[key]:mytree.add(word)for word in dict_eml[key]:matchResult = mytree.fuzzyMatch(mytree.getTree(), word)matchResult = [ i for i in matchResult if len(i)==len(word) ]searchDict[word] = list(set(matchResult))if __name__=='__main__':searchDict = dict()calculateSimilarity(searchDict)
方案二:
import collectionsclass TrieNode:# Initialize your data structure here.def __init__(self):self.children = collections.defaultdict(TrieNode)self.is_word = Falseclass Trie:def __init__(self):self.root = TrieNode()def insert(self, word):current = self.rootfor letter in word:current = current.children[letter]current.is_word = Truedef search(self, word):current = self.rootfor letter in word:current = current.children.get(letter)if current is None:return Falsereturn current.is_worddef startsWith(self, prefix):current = self.rootfor letter in prefix:current = current.children.get(letter)if current is None:return Falsereturn Truedef enumerateMatch(self, word, space="_", backward=False):matched = []## while len(word) > 1 does not keep character itself, while word keed character itselfwhile len(word) > 1:if self.search(word):matched.append(space.join(word[:]))del word[-1]return matched
