import jieba.posseg as pseg
import codecs
import pickle
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora, models, similarities
#from gensim.summarization import bm25  #unused here; gensim.summarization was removed in gensim 4.x
import os
from django.conf import settings


basedir = settings.BASE_DIR + '/duibiao/'
#basedir = 'F:/project/safeyun/duibiao/'
#Initialize module-level data
#Stop-word list (assuming the file is UTF-8 encoded)
stop_words = basedir + 'chineseStopWords.txt'
with codecs.open(stop_words, 'r', encoding='utf-8') as f:
    stopwords = [w.strip() for w in f.readlines()]
#POS tags to discard: punctuation/other (x), conjunctions (c), auxiliaries (u, uj),
#adverbs (d), prepositions (p), time words (t), numerals (m), locatives (f), pronouns (r)
stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']

#Standard (reference) document list
dirname = basedir + 'biao/'
filename = [] #list of standard file names
#Tokenized corpus: one token list per standard
corpus = []

#Tokenize a standard document, dropping stop words and filtered POS tags
def tokenization(filename):
    result = []
    with open(dirname + filename, 'r', encoding='utf-8') as f:
        text = f.read()
    words = pseg.cut(text)
    for word, flag in words:
        if flag not in stop_flag and word not in stopwords:
            result.append(word)
    return result

#Tokenize a raw text string with the same filtering
def tokenizationFromStr(text):
    result = []
    words = pseg.cut(text)
    for word, flag in words:
        if flag not in stop_flag and word not in stopwords:
            result.append(word)
    return result

def genmodel():
    #Tokenize every standard document under dirname
    for root, dirs, files in os.walk(dirname):
        for f in files:
            if f.endswith(".txt"):
                #f = dirname + '/' + f
                corpus.append(tokenization(f))
                filename.append(f)

    #Build the bag-of-words dictionary and vectorize the corpus
    dictionary = corpora.Dictionary(corpus)
    doc_vectors = [dictionary.doc2bow(text) for text in corpus]

    #Train a TF-IDF model on the corpus
    tfidf = models.TfidfModel(doc_vectors)
    tfidf_vectors = tfidf[doc_vectors]

    #Serialize the dictionary, model, vectors and file-name list
    with open(basedir + 'biaom/dictionary.pkl', 'wb') as f:
        pickle.dump(dictionary, f)
    with open(basedir + 'biaom/tfidf.m', 'wb') as f:
        pickle.dump(tfidf, f)
    with open(basedir + 'biaom/tfidfv.m', 'wb') as f:
        pickle.dump(tfidf_vectors, f)
    with open(basedir + 'biaom/filename', 'wb') as f:
        pickle.dump(filename, f)


def calsim(txtpath):
    #Load the serialized models
    dictionary = pickle.load(open(basedir + 'biaom/dictionary.pkl', 'rb'))
    tfidf = pickle.load(open(basedir + 'biaom/tfidf.m', 'rb'))
    tfidf_vectors = pickle.load(open(basedir + 'biaom/tfidfv.m', 'rb'))
    filename = pickle.load(open(basedir + 'biaom/filename', 'rb'))

    #Tokenize the query document and compute its TF-IDF vector
    query = tokenization(txtpath)
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]

    #Compute similarities against the corpus and sort in descending order
    index = similarities.MatrixSimilarity(tfidf_vectors)
    sims = index[query_tfidf]
    paixu = sorted(enumerate(sims), key=lambda item: -item[1])

    #Build an HTML string with the most similar standards (at most five)
    thestr = '最相似的安全标准是:' + '<br/>'  #"The most similar safety standards are:"
    for i in range(min(5, len(paixu))):
        if paixu[i][1] > 0:
            filepath = dirname + filename[paixu[i][0]]
            with open(filepath, 'r', encoding='utf-8') as f:
                thestr = thestr + f.read() + '<br/>------------<br/>'

    return thestr


def calsimTojson(text):
    #Load the serialized models
    dictionary = pickle.load(open(basedir + 'biaom/dictionary.pkl', 'rb'))
    tfidf = pickle.load(open(basedir + 'biaom/tfidf.m', 'rb'))
    tfidf_vectors = pickle.load(open(basedir + 'biaom/tfidfv.m', 'rb'))
    filename = pickle.load(open(basedir + 'biaom/filename', 'rb'))

    #Tokenize the query text and compute its TF-IDF vector
    query = tokenizationFromStr(text)
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]

    #Compute similarities against the corpus and sort in descending order
    index = similarities.MatrixSimilarity(tfidf_vectors)
    sims = index[query_tfidf]
    paixu = sorted(enumerate(sims), key=lambda item: -item[1])

    #Return the texts of the most similar standards (at most five)
    thelist = []
    for i in range(min(5, len(paixu))):
        if paixu[i][1] > 0:
            filepath = dirname + filename[paixu[i][0]]
            with open(filepath, 'r', encoding='utf-8') as f:
                thelist.append(f.read())

    return thelist

if __name__ == '__main__':
    genmodel()
    #thestr = calsim(basedir + 'onetrouble.txt')
    #print(thestr)
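    #Illustrative usage sketch (an assumption, not part of the original module):
    #calsimTojson takes a raw hazard-description string and returns a list of the
    #most similar standard texts; the sample query below is invented.
    #print(calsimTojson('作业人员未佩戴安全帽进入施工现场'))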