import jieba.posseg as pseg
import codecs
import pickle
import warnings

warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora, models, similarities
# from gensim.summarization import bm25
import os
from django.conf import settings

basedir = settings.BASE_DIR + '/duibiao/'
#basedir = 'F:/project/safeyun/duibiao/'

# Initialize module-level data

# Stop-word settings
stop_words = basedir + 'chineseStopWords.txt'
#stopwords = codecs.open(stop_words, 'r').readlines()
stopwords = []  # the stop-word file is currently unused, so the list stays empty
stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']
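# The flags above are jieba part-of-speech tags; roughly, the filter drops
# punctuation/non-words (x), conjunctions (c), auxiliaries (u, uj), adverbs (d),
# prepositions (p), time words (t), numerals (m), locatives (f) and pronouns (r),
# keeping mostly content words such as nouns and verbs.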

# List of reference standard documents
dirname = basedir + 'biao/'
filename = []  # file names of the standards
# bag-of-words corpus: one token list per document
corpus = []

# Tokenization: read a standard document (a file name under dirname) and
# tokenize its contents
def tokenization(filename):
    with open(dirname + filename, 'r', encoding='utf-8') as f:
        return tokenizationFromStr(f.read())


# Tokenization: POS-tag a string with jieba and keep words that are neither
# stop words nor tagged with a filtered POS flag
def tokenizationFromStr(text):
    result = []
    words = pseg.cut(text)
    for word, flag in words:
        if flag not in stop_flag and word not in stopwords:
            result.append(word)
    return result
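
# For reference, pseg.cut yields (word, flag) pairs that unpack like the loop
# above, e.g. (output may vary with the jieba version and dictionary):
#   >>> for word, flag in pseg.cut('我爱北京'):
#   ...     print(word, flag)
#   我 r
#   爱 v
#   北京 ns
# The pronoun 我 ('r') would then be dropped by the stop_flag filter.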


def genmodel():
    # Tokenize all the reference standard documents
    for root, dirs, files in os.walk(dirname):
        for f in files:
            if f.endswith(".txt"):
                #f = dirname + '/' + f
                corpus.append(tokenization(f))
                filename.append(f)

    # Build the bag-of-words dictionary and turn the corpus into vectors
    dictionary = corpora.Dictionary(corpus)
    doc_vectors = [dictionary.doc2bow(text) for text in corpus]

    # Train a TF-IDF model on the corpus vectors
    tfidf = models.TfidfModel(doc_vectors)
    tfidf_vectors = tfidf[doc_vectors]

    # Serialize the dictionary, model, TF-IDF vectors and file-name list
    with open(basedir + 'biaom/dictionary.pkl', 'wb') as f:
        pickle.dump(dictionary, f)
    with open(basedir + 'biaom/tfidf.m', 'wb') as f:
        pickle.dump(tfidf, f)
    with open(basedir + 'biaom/tfidfv.m', 'wb') as f:
        pickle.dump(tfidf_vectors, f)
    with open(basedir + 'biaom/filename', 'wb') as f:
        pickle.dump(filename, f)
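
# Note: the query functions below rebuild the similarity index on every call.
# gensim similarity indexes support save()/load(), so a MatrixSimilarity could
# instead be built once in genmodel() and persisted with the other artifacts.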


def calsim(txtpath):
    # Load the serialized dictionary, model and vectors
    with open(basedir + 'biaom/dictionary.pkl', 'rb') as f:
        dictionary = pickle.load(f)
    with open(basedir + 'biaom/tfidf.m', 'rb') as f:
        tfidf = pickle.load(f)
    with open(basedir + 'biaom/tfidfv.m', 'rb') as f:
        tfidf_vectors = pickle.load(f)
    with open(basedir + 'biaom/filename', 'rb') as f:
        filename = pickle.load(f)

    # Read the query document (txtpath is a full path) and compute its
    # TF-IDF vector
    with open(txtpath, 'r', encoding='utf-8') as f:
        query = tokenizationFromStr(f.read())
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]

    # Compute similarities against the corpus and sort, most similar first
    index = similarities.MatrixSimilarity(tfidf_vectors)
    sims = index[query_tfidf]
    paixu = sorted(enumerate(sims), key=lambda item: -item[1])

    # Concatenate the most similar standards (at most five) into an HTML string
    thestr = 'The most similar safety standards are:' + '<br/>'
    for i in range(min(5, len(paixu))):
        if paixu[i][1] > 0:
            filepath = dirname + filename[paixu[i][0]]
            with open(filepath, 'r', encoding='utf-8') as f:
                thestr = thestr + f.read() + '<br/>------------<br/>'

    return thestr
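
# calsim() returns a ready-made HTML fragment; calsimTojson() returns a plain
# list of matching standards so the caller can serialize it to JSON.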


def calsimTojson(text):
    # Load the serialized dictionary, model and vectors
    with open(basedir + 'biaom/dictionary.pkl', 'rb') as f:
        dictionary = pickle.load(f)
    with open(basedir + 'biaom/tfidf.m', 'rb') as f:
        tfidf = pickle.load(f)
    with open(basedir + 'biaom/tfidfv.m', 'rb') as f:
        tfidf_vectors = pickle.load(f)
    with open(basedir + 'biaom/filename', 'rb') as f:
        filename = pickle.load(f)

    # Tokenize the query string and compute its TF-IDF vector
    query = tokenizationFromStr(text)
    query_bow = dictionary.doc2bow(query)
    query_tfidf = tfidf[query_bow]

    # Compute similarities against the corpus and sort, most similar first
    index = similarities.MatrixSimilarity(tfidf_vectors)
    sims = index[query_tfidf]
    paixu = sorted(enumerate(sims), key=lambda item: -item[1])

    # Return the most similar standards (at most five)
    thelist = []
    for i in range(min(5, len(paixu))):
        if paixu[i][1] > 0:
            filepath = dirname + filename[paixu[i][0]]
            with open(filepath, 'r', encoding='utf-8') as f:
                thelist.append(f.read())

    return thelist


if __name__ == '__main__':
    genmodel()
    #thestr = calsim(basedir + 'onetrouble.txt')
    #print(thestr)
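
# A minimal usage sketch (assumes the .txt standards exist under
# basedir + 'biao/' and genmodel() has already created the files under
# basedir + 'biaom/'; the query string is made up for illustration):
#
#   hits = calsimTojson('配电箱未接地,存在触电隐患')  # "distribution box not grounded, shock hazard"
#   for text in hits:
#       print(text)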