import jieba.posseg as pseg
import codecs
import pickle
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora, models, similarities
# from gensim.summarization import bm25
import os
from django.conf import settings

basedir = settings.BASE_DIR + '/duibiao/'
# basedir = 'F:/project/safeyun/duibiao/'

# Initialise shared data
# Stop word list
stop_words = basedir + 'chineseStopWords.txt'
# stopwords = codecs.open(stop_words, 'r').readlines()
stopwords = []
# jieba POS flags to filter out (punctuation/others, conjunctions, auxiliaries,
# adverbs, prepositions, time words, particles, numerals, locality words, pronouns)
stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']

# Directory holding the standard (reference) documents
dirname = basedir + 'biao/'
filename = []  # list of standard file names
corpus = []    # list of tokenised documents (bag-of-words input)


def tokenization(filename):
    """Tokenise one standard document, dropping stop words and filtered POS flags."""
    result = []
    with open(dirname + filename, 'r', encoding='utf-8') as f:
        text = f.read()
        words = pseg.cut(text)
        for word, flag in words:
            if flag not in stop_flag and word not in stopwords:
                result.append(word)
    return result


def tokenizationFromStr(text):
    """Tokenise a raw text string, dropping stop words and filtered POS flags."""
    result = []
    words = pseg.cut(text)
    for word, flag in words:
        if flag not in stop_flag and word not in stopwords:
            result.append(word)
    return result


def genmodel():
    # Tokenise every standard document under dirname
    for root, dirs, files in os.walk(dirname):
        for f in files:
            if f.endswith(".txt"):
                corpus.append(tokenization(f))
                filename.append(f)

    # Build the dictionary (vocabulary) and the bag-of-words corpus
    dictionary = corpora.Dictionary(corpus)
    doc_vectors = [dictionary.doc2bow(text) for text in corpus]

    # Train a TF-IDF model on the corpus
    tfidf = models.TfidfModel(doc_vectors)
    tfidf_vectors = tfidf[doc_vectors]

    # Serialise the dictionary, model, vectors and file list
    pickle.dump(dictionary, open(basedir + 'biaom/dictionary.pkl', 'wb'))
    pickle.dump(tfidf, open(basedir + 'biaom/tfidf.m', 'wb'))
    pickle.dump(tfidf_vectors, open(basedir + 'biaom/tfidfv.m', 'wb'))
    pickle.dump(filename, open(basedir + 'biaom/filename', 'wb'))


def calsim(txtpath):
    # Load the serialised models
    dictionary = pickle.load(open(basedir + 'biaom/dictionary.pkl', 'rb'))
    tfidf = pickle.load(open(basedir + 'biaom/tfidf.m', 'rb'))
    tfidf_vectors = pickle.load(open(basedir + 'biaom/tfidfv.m', 'rb'))
    filename = pickle.load(open(basedir + 'biaom/filename', 'rb'))

    # Tokenise the query document and compute its TF-IDF vector
    query = tokenization(txtpath)
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]

    # Compute similarities against every standard and sort in descending order
    index = similarities.MatrixSimilarity(tfidf_vectors)
    sims = index[query_tfidf]
    paixu = sorted(enumerate(sims), key=lambda item: -item[1])

    # Concatenate the most similar standards into one string
    thestr = '最相似的安全标准是:' + '\n'  # "The most similar safety standards are:"
    # Guard against corpora with fewer than five documents
    for i in range(min(5, len(paixu))):
        if paixu[i][1] > 0:
            filepath = dirname + filename[paixu[i][0]]
            with open(filepath, 'r', encoding='utf-8') as f:
                thestr = thestr + f.read() + '\n------------\n'
    return thestr


def calsimTojson(text):
    # Load the serialised models
    dictionary = pickle.load(open(basedir + 'biaom/dictionary.pkl', 'rb'))
    tfidf = pickle.load(open(basedir + 'biaom/tfidf.m', 'rb'))
    tfidf_vectors = pickle.load(open(basedir + 'biaom/tfidfv.m', 'rb'))
    filename = pickle.load(open(basedir + 'biaom/filename', 'rb'))

    # Tokenise the query text and compute its TF-IDF vector
    query = tokenizationFromStr(text)
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]

    # Compute similarities against every standard and sort in descending order
    index = similarities.MatrixSimilarity(tfidf_vectors)
    sims = index[query_tfidf]
    paixu = sorted(enumerate(sims), key=lambda item: -item[1])

    # Return the (up to) five most similar standards as a list
    thelist = []
    for i in range(min(5, len(paixu))):
        if paixu[i][1] > 0:
            filepath = dirname + filename[paixu[i][0]]
            with open(filepath, 'r', encoding='utf-8') as f:
                thelist.append(f.read())
    return thelist


if __name__ == '__main__':
    genmodel()
    # thestr = calsim(basedir + 'onetrouble.txt')
    # print(thestr)
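

# --- Hypothetical usage sketch (assumption, not part of the original project) ---
# Shows one way calsimTojson() might be exposed through a Django view. The view
# name `similar_standards_view` and the 'text' request parameter are assumptions.
def similar_standards_view(request):
    from django.http import JsonResponse  # local import keeps the sketch self-contained
    text = request.GET.get('text', '')
    # Return the (up to) five most similar standards as JSON
    results = calsimTojson(text) if text else []
    return JsonResponse({'results': results})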