数据格式
0 涬 芸 飛 铤新规画带洄䒸不凭感觉不靠運氣数据支撑全天⑤⑥⑦⑧码精準人工計划 导師Q企鹅4805042881 项目名称《设计项目》,项目编号 务复评提醒ii,您好! 购买标书,并至少提前4周申请投标保函开立“,自评已完成,请及时复评,谢谢! 登录地址:http:// 温馨提示:本邮件由系统自动生成,请勿直接回复。2 项目名称。。。。。3 项目名称。。。。。4 项目名称。。。。。5 项目名称。。。。。
第一列为label,第二列为空格,剩下的为文本。
6个类别影响了fasttext的label和textcnn的输入输出神经元数量
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
'''
@File : model.py
@Time : 2020/09/23 17:42:25
@Author : 陈培杞
@Version : 1.0

Train and evaluate several 6-class short-text classifiers (fastText,
TextCNN, SVM, LightGBM, XGBoost).  Raw input lines look like
"<label><text...>" where <label> is a single digit 0-5 (see the data
format notes at the top of this file).
'''
import argparse
import functools
import re
import subprocess
import time

import jieba
import joblib
import numpy as np
import pandas as pd
from memory_profiler import profile
import fasttext
import xgboost as xgb
import lightgbm as lgbm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras.models import Model as Model_
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (Conv1D, MaxPooling1D, Embedding, concatenate,
                                     Flatten, Dropout, Input, Dense)

parser = argparse.ArgumentParser()
# BUG FIX: the original declared `--dataprocess` with `default=False` as a
# value-taking option, so any supplied value (even the string "False") was
# truthy and forced reprocessing.  A store_true flag matches the boolean the
# rest of the code expects.
parser.add_argument('--dataprocess', action='store_true', help='数据重新处理')
args = parser.parse_args()

# Data-file and model-file locations.
CONTENTTRAIN = './data/content.train'
CONTENTTEST = './data/content.test'
CACTRANSTRAIN = './data/cactrans.train'
CACTRANSTEST = './data/cactrans.test'
TRAINDATA = './data/train.data'
TRAINLABEL = './data/train.label'
TESTDATA = './data/test.data'
TESTLABEL = './data/test.label'
FTTRAINDATA = './data/fasttext.train'
FTTESTDATA = './data/fasttext.test'
TFIDFMODEL = './model/tfidf.pkl'
SVMMODEL = './model/svm.pkl'
FTMODEL = './model/fasttext.ftz'
XGBOOSTMODEL = './model/xgboost.pkl'
LIGHTGBMMODEL = './model/lightgbm.pkl'


def timeCost(func):
    """Decorator: print the wall-clock time a call took."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        time_start = time.time()
        func(*args, **kwargs)
        print('\033[1;35m time cost \033[0m',
              time.time() - time_start, 's', end='\n\n')
    return wrapper


class Model(object):
    """Base wrapper around an sklearn-style estimator with persistence.

    Subclasses set ``self.model`` (the estimator) and ``self.modelPath``
    (where ``save``/``load`` pickle it with joblib).
    """

    def __init__(self):
        self.model = None           # underlying estimator, set by subclasses
        self.modelPath = './model.pkl'

    def save(self):
        """Persist the fitted estimator (no-op while self.model is None)."""
        if self.model:
            joblib.dump(self.model, self.modelPath)

    def load(self):
        """Restore a previously saved estimator from self.modelPath."""
        self.model = joblib.load(self.modelPath)

    @timeCost
    def fit_transform(self, X_train, X_test, y_train, y_test):
        """Fit on the training split, persist, then report on the test split."""
        self.model.fit(X_train, y_train)
        self.save()
        y_pre = self.model.predict(X_test)
        self.report(y_test, y_pre)

    def report(self, y_true, y_pre):
        """Print precision/recall/F1 per class."""
        print(classification_report(y_true, y_pre))


class SvmModel(Model):
    """Support-vector classifier on TF-IDF features."""

    def __init__(self):
        super().__init__()
        self.model = SVC()
        self.modelPath = SVMMODEL


class FasttextModel(Model):
    """Evaluate an already-trained fastText model; no training happens here."""

    def __init__(self):
        super().__init__()
        self.model = fasttext.load_model(FTMODEL)

    @timeCost
    def fit_transform(self):
        """Predict every line of FTTESTDATA and print a classification report.

        Each line is ``__label__<d> <tokenized text>`` where <d> is a digit,
        so line[9] is the label digit and line[11:] the text.
        NOTE(review): the mapping below assumes the pretrained .ftz model
        emits labels __label__a..__label__f — confirm against the model.
        """
        y_test = []
        y_pre = []
        labels = {
            '__label__a': 0,
            '__label__b': 1,
            '__label__c': 2,
            '__label__d': 3,
            '__label__e': 4,
            '__label__f': 5,
        }
        with open(FTTESTDATA, 'r') as f:
            for line in f:
                y_test.append(int(line[9]))
                content = line[11:].replace('\n', '')
                y_pre.append(labels[self.model.predict(content)[0][0]])
        self.report(y_test, y_pre)


class TextCNNModel(Model):
    """TextCNN (3 parallel conv widths) trained from scratch with Keras."""

    def __init__(self):
        # No persisted estimator: the Keras model is built in fit_transform.
        pass

    def TextCNN_model_1(self, vocab, x_train_padded_seqs, y_train,
                        x_test_padded_seqs, y_test):
        """Build, train and evaluate the CNN; prints a classification report."""
        main_input = Input(shape=(50,), dtype='float64')
        # BUG FIX: the original passed trainable=False here although the
        # embedding is randomly initialized (no pretrained weights), which
        # froze random vectors and prevented the layer from learning.
        embedder = Embedding(len(vocab) + 1, 300, input_length=50, trainable=True)
        embed = embedder(main_input)
        # Three parallel convolutions with kernel widths 3/4/5, each followed
        # by a max-pool that reduces the sequence axis to length 1.
        cnn1 = Conv1D(256, 3, padding='same', strides=1, activation='relu')(embed)
        cnn1 = MaxPooling1D(pool_size=48)(cnn1)
        cnn2 = Conv1D(256, 4, padding='same', strides=1, activation='relu')(embed)
        cnn2 = MaxPooling1D(pool_size=47)(cnn2)
        cnn3 = Conv1D(256, 5, padding='same', strides=1, activation='relu')(embed)
        cnn3 = MaxPooling1D(pool_size=46)(cnn3)
        cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
        flat = Flatten()(cnn)
        drop = Dropout(0.2)(flat)
        # 6 output neurons — one per class (see data-format note above).
        main_output = Dense(6, activation='softmax')(drop)
        model = Model_(inputs=main_input, outputs=main_output)
        model.compile(loss='categorical_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        one_hot_labels = keras.utils.to_categorical(y_train, num_classes=6)
        model.fit(x_train_padded_seqs, one_hot_labels,
                  batch_size=800, epochs=20, verbose=1)
        result = model.predict(x_test_padded_seqs)
        result_labels = np.argmax(result, axis=1)
        # y_test holds string labels, so stringify predictions before reporting.
        y_predict = list(map(str, result_labels))
        self.report(y_test, y_predict)

    @timeCost
    def fit_transform(self, X_train, X_test, y_train, y_test):
        """Tokenize, pad to length 50, then train/evaluate the CNN."""
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)
        vocab = tokenizer.word_index
        x_train_word_ids = tokenizer.texts_to_sequences(X_train)
        x_test_word_ids = tokenizer.texts_to_sequences(X_test)
        x_train_padded_seqs = pad_sequences(x_train_word_ids, maxlen=50)
        x_test_padded_seqs = pad_sequences(x_test_word_ids, maxlen=50)
        self.TextCNN_model_1(vocab, x_train_padded_seqs, y_train,
                             x_test_padded_seqs, y_test)


class LightgbmModel(Model):
    """LightGBM multiclass classifier on TF-IDF features."""

    def __init__(self):
        super().__init__()
        self.model = lgbm.LGBMClassifier(objective='multiclass', verbose=-1,
                                         learning_rate=0.5, max_depth=20,
                                         num_leaves=50, n_estimators=120,
                                         max_bin=2000,)
        self.modelPath = LIGHTGBMMODEL


class XgboostModel(Model):
    """XGBoost classifier on TF-IDF features."""

    def __init__(self):
        super().__init__()
        self.model = xgb.XGBClassifier(max_depth=7, n_estimators=200,
                                       colsample_bytree=0.8, subsample=0.8,
                                       nthread=10, learning_rate=0.1)
        self.modelPath = XGBOOSTMODEL


def TFIDF(corpus):
    """Fit a TF-IDF vectorizer on *corpus* and persist it to TFIDFMODEL."""
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_df=0.5)
    tfidfModel = vectorizer.fit(corpus)
    joblib.dump(tfidfModel, TFIDFMODEL)


def dataProcess(reProcess=True):
    """Produce (X_train, X_test, y_train, y_test) token-string datasets.

    With reProcess falsy, previously written files are simply re-read.
    Otherwise the raw content files are cleaned, tokenized with jieba, and
    written out in both fastText format and parallel data/label files; the
    TF-IDF vectorizer is refit on the training corpus.
    """
    def process(contentHandle, cactransHandle, XHandle, yHandle, ftHandle):
        """Clean one raw content file; returns (corpus, labels)."""
        # Keep only the part of each line before the literal "[TID]" marker.
        # NOTE(review): shell=True with interpolated paths is injection-prone;
        # paths are module constants here, but avoid user-supplied values.
        subprocess.call("cat %s | awk -F'\\[TID\\]' '{print $1}' > %s"
                        % (contentHandle, cactransHandle), shell=True)
        with open(cactransHandle, 'r') as f:
            labels = []
            corpus = []
            for line in f:
                label = line[0]          # first column is the digit label
                # Collapse non-word runs to single spaces, drop underscores.
                content = re.sub(r'\W+', ' ', line[1:]).replace("_", '')
                tokenList = list(jieba.cut(content))
                tokenList = [token for token in tokenList if token != ' ']
                tmpcorpus = " ".join(tokenList)
                # fasttext / textcnn format
                print(f"__label__{label} {tmpcorpus}", file=ftHandle)
                # svm / lightgbm / xgboost format
                print(f'{label}', file=yHandle)
                print(f'{tmpcorpus}', file=XHandle)
                labels.append(label)
                corpus.append(tmpcorpus)
        return corpus, labels

    if not reProcess:
        # BUG FIX: the original leaked these four file handles.
        with open(TRAINDATA, 'r') as f:
            X_train = f.readlines()
        with open(TESTDATA, 'r') as f:
            X_test = f.readlines()
        with open(TRAINLABEL, 'r') as f:
            y_train = f.read().split('\n')[:-1]
        with open(TESTLABEL, 'r') as f:
            y_test = f.read().split('\n')[:-1]
    else:
        with open(FTTRAINDATA, 'w') as ft_trainHandle, \
                open(TRAINDATA, 'w') as X_trainHandle, \
                open(TRAINLABEL, 'w') as y_trainHandle:
            X_train, y_train = process(CONTENTTRAIN, CACTRANSTRAIN,
                                       X_trainHandle, y_trainHandle,
                                       ft_trainHandle)
        # Refit TF-IDF on the (new) training corpus only.
        TFIDF(X_train)
        with open(FTTESTDATA, 'w') as ft_testHandle, \
                open(TESTDATA, 'w') as X_testHandle, \
                open(TESTLABEL, 'w') as y_testHandle:
            X_test, y_test = process(CONTENTTEST, CACTRANSTEST,
                                     X_testHandle, y_testHandle,
                                     ft_testHandle)
    return X_train, X_test, y_train, y_test


@profile
def main(X_train, X_test, y_train, y_test):
    """Run every model; raw text first, then the TF-IDF-vectorized models.

    Relies on the module-global ``tfidfModel`` loaded in the __main__ block.
    """
    FasttextModel().fit_transform()
    TextCNNModel().fit_transform(X_train, X_test, y_train, y_test)
    X_train = tfidfModel.transform(X_train)
    X_test = tfidfModel.transform(X_test)
    LightgbmModel().fit_transform(X_train, X_test, y_train, y_test)
    SvmModel().fit_transform(X_train, X_test, y_train, y_test)
    XgboostModel().fit_transform(X_train, X_test, y_train, y_test)


if __name__ == '__main__':
    X_train, X_test, y_train, y_test = dataProcess(args.dataprocess)
    tfidfModel = joblib.load(TFIDFMODEL)
    main(X_train, X_test, y_train, y_test)
