# safesite/duibiao/calsim.py

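"""Similarity matching between a query text and a corpus of safety standards.

genmodel() tokenizes every .txt file under duibiao/biao/ with jieba, fits a
gensim TF-IDF model, and pickles the artifacts to duibiao/biaom/; calsim()
and calsimTojson() load those artifacts and return the standards most
similar to a query.
"""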

import jieba.posseg as pseg
import codecs
import pickle
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
from gensim import corpora, models, similarities
import os
from django.conf import settings
# str() in case BASE_DIR is a pathlib.Path (the default since Django 3.1)
basedir = str(settings.BASE_DIR) + '/duibiao/'
#basedir = 'F:/project/safeyun/duibiao/'
# Initialisation: load the stop-word list (assumed UTF-8) and define the
# jieba POS tags to filter out.
stop_words = basedir + 'chineseStopWords.txt'
with codecs.open(stop_words, 'r', encoding='utf-8') as f:
    stopwords = [w.strip() for w in f]
# Filtered tags: x symbol, c conjunction, u/uj auxiliary, d adverb,
# p preposition, t time word, m numeral, f locality word, r pronoun
stop_flag = ['x', 'c', 'u', 'd', 'p', 't', 'uj', 'm', 'f', 'r']
# Directory holding the standards corpus (one .txt file per standard)
dirname = basedir + 'biao/'
# Tokenization helpers
def tokenizationFromStr(text):
    """POS-tag `text` with jieba and keep words that are neither stop words
    nor carry a filtered POS tag."""
    result = []
    for word, flag in pseg.cut(text):
        if flag not in stop_flag and word not in stopwords:
            result.append(word)
    return result

def tokenization(filename):
    """Tokenize one standard file from the corpus directory."""
    with open(dirname + filename, 'r', encoding='utf-8') as f:
        return tokenizationFromStr(f.read())
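# Indicative example (the exact output depends on the installed jieba
# dictionary and the stop-word list):
#   tokenizationFromStr('现场作业人员未佩戴安全帽')
#   -> ['现场', '作业', '人员', '佩戴', '安全帽']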
def genmodel():
    """Tokenize the whole standards corpus, fit a TF-IDF model and pickle
    the resulting artifacts."""
    filename = []  # file name per document, index-aligned with the vectors
    corpus = []    # tokenized documents
    # Tokenize every standard file
    for root, dirs, files in os.walk(dirname):
        for f in files:
            if f.endswith('.txt'):
                corpus.append(tokenization(f))
                filename.append(f)
    # Build the dictionary (token -> id) and the bag-of-words corpus
    dictionary = corpora.Dictionary(corpus)
    doc_vectors = [dictionary.doc2bow(text) for text in corpus]
    # Fit the TF-IDF model on the corpus and transform it
    tfidf = models.TfidfModel(doc_vectors)
    tfidf_vectors = tfidf[doc_vectors]
    # Serialize dictionary, model, vectors and file list
    os.makedirs(basedir + 'biaom', exist_ok=True)
    with open(basedir + 'biaom/dictionary.pkl', 'wb') as f:
        pickle.dump(dictionary, f)
    with open(basedir + 'biaom/tfidf.m', 'wb') as f:
        pickle.dump(tfidf, f)
    with open(basedir + 'biaom/tfidfv.m', 'wb') as f:
        pickle.dump(tfidf_vectors, f)
    with open(basedir + 'biaom/filename', 'wb') as f:
        pickle.dump(filename, f)
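# Artifacts written by genmodel() and read back by loadmodel():
#   biaom/dictionary.pkl  gensim Dictionary (token -> integer id)
#   biaom/tfidf.m         fitted TfidfModel
#   biaom/tfidfv.m        TF-IDF vectors of the standards corpus
#   biaom/filename        list of .txt names, index-aligned with the vectors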
def loadmodel():
    """Load the pickled dictionary, TF-IDF model, corpus vectors and file
    list produced by genmodel()."""
    with open(basedir + 'biaom/dictionary.pkl', 'rb') as f:
        dictionary = pickle.load(f)
    with open(basedir + 'biaom/tfidf.m', 'rb') as f:
        tfidf = pickle.load(f)
    with open(basedir + 'biaom/tfidfv.m', 'rb') as f:
        tfidf_vectors = pickle.load(f)
    with open(basedir + 'biaom/filename', 'rb') as f:
        filename = pickle.load(f)
    return dictionary, tfidf, tfidf_vectors, filename

def calsim(txtpath):
    """Return an HTML fragment listing the standards most similar to the
    text in the file at `txtpath`."""
    dictionary, tfidf, tfidf_vectors, filename = loadmodel()
    # Read and tokenize the query text (txtpath is a full path, matching
    # the example in __main__), then compute its TF-IDF vector
    with open(txtpath, 'r', encoding='utf-8') as f:
        query = tokenizationFromStr(f.read())
    query_tfidf = tfidf[dictionary.doc2bow(query)]
    # Cosine similarity against every standard, sorted descending
    index = similarities.MatrixSimilarity(tfidf_vectors, num_features=len(dictionary))
    sims = index[query_tfidf]
    paixu = sorted(enumerate(sims), key=lambda item: -item[1])
    # Concatenate the top five matches
    thestr = '最相似的安全标准是:' + '<br/>'
    for i in range(min(5, len(paixu))):
        if paixu[i][1] > 0:
            filepath = dirname + filename[paixu[i][0]]
            with open(filepath, 'r', encoding='utf-8') as f:
                thestr = thestr + f.read() + '<br/>------------<br/>'
    return thestr
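# Design note: the similarity index is rebuilt on each call. For heavier
# query volume it could instead be built once in genmodel() and persisted
# alongside the other artifacts (gensim similarity indexes provide
# save()/load()).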
def calsimTojson(text):
    """Return a list of the standards most similar to the query string."""
    dictionary, tfidf, tfidf_vectors, filename = loadmodel()
    # Tokenize the query string and compute its TF-IDF vector
    query = tokenizationFromStr(text)
    query_tfidf = tfidf[dictionary.doc2bow(query)]
    # Cosine similarity against every standard, sorted descending
    index = similarities.MatrixSimilarity(tfidf_vectors, num_features=len(dictionary))
    sims = index[query_tfidf]
    paixu = sorted(enumerate(sims), key=lambda item: -item[1])
    # Collect the top five matches
    thelist = []
    for i in range(min(5, len(paixu))):
        if paixu[i][1] > 0:
            filepath = dirname + filename[paixu[i][0]]
            with open(filepath, 'r', encoding='utf-8') as f:
                thelist.append(f.read())
    return thelist
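# A minimal sketch of how a Django view could expose calsimTojson(); the
# view name and the POST field 'text' are illustrative assumptions, not
# part of this project's URL configuration.
from django.http import JsonResponse

def calsim_view(request):
    # Look up the standards most similar to the submitted text.
    text = request.POST.get('text', '')
    return JsonResponse({'standards': calsimTojson(text)})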
if __name__ == '__main__':
    genmodel()
    #thestr = calsim(basedir + 'onetrouble.txt')
    #print(thestr)