# Build the gensim Dictionary from `texts` (defined earlier in the file;
# presumably an iterable of tokenised documents -- confirm against the caller).
dictionary = corpora.Dictionary(texts)
# Run every document through the dictionary's doc2bow() to obtain the
# bag-of-words corpus consumed by the LDA training below.
corpus = list(map(dictionary.doc2bow, texts))
def lda_model_values(num_topics, corpus, dictionary):
    """Train one LDA model per candidate topic count and score each model.

    For every topic count from 1 up to and including ``num_topics`` an
    ``LdaModel`` is fitted on ``corpus`` and then evaluated with two metrics:
    its perplexity on ``corpus`` and its ``c_v`` coherence.

    Args:
        num_topics: Largest topic count to try. Counts ``1..num_topics`` are
            evaluated; a value of 0 or less trains nothing.
        corpus: Corpus handed to ``LdaModel`` for training and to
            ``log_perplexity`` for evaluation.
        dictionary: Passed as ``id2word`` to ``LdaModel`` and as
            ``dictionary`` to ``CoherenceModel``.

    Returns:
        A tuple ``(model_list, x, perplexity_values, coherence_values)`` of
        four parallel lists: the trained models, the topic count used for
        each, the perplexity of each (lower is better) and the coherence
        score of each.

    Note:
        The coherence computation reads the module-level ``texts`` variable;
        it is not a parameter of this function and must exist before the
        call.
    """
    x = []
    perplexity_values = []
    coherence_values = []
    model_list = []

    for topic_count in range(1, num_topics + 1):
        print("主题数量:", topic_count)
        lda_model = models.LdaModel(
            corpus=corpus,
            num_topics=topic_count,
            id2word=dictionary,
            chunksize=2000,
            passes=20,
            iterations=400,
        )
        model_list.append(lda_model)
        x.append(topic_count)

        # log_perplexity() returns the per-word likelihood bound, not the
        # perplexity itself; gensim defines perplexity as 2 ** (-bound).
        # Storing the raw bound would invert the "lower is better" reading.
        per_word_bound = lda_model.log_perplexity(corpus)
        perplexity_values.append(2.0 ** (-per_word_bound))

        coherencemodel = models.CoherenceModel(
            model=lda_model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_values.append(coherencemodel.get_coherence())
        print("该主题评价完成\n")

    return model_list, x, perplexity_values, coherence_values
import matplotlib.pyplot as plt
import matplotlib
from pylab import xticks,yticks,np
# Evaluate every topic count from 1 to num_topics, then chart both metrics
# side by side. `num_topics` is defined outside this section of the file.
model_list, x, perplexity_values, coherence_values = lda_model_values(
    num_topics, corpus, dictionary
)

fig = plt.figure(figsize=(15, 5))
# Use the SimHei font for the Chinese labels and disable the Unicode minus
# glyph for negative tick labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

# One tick per candidate topic count: 1, 2, ..., num_topics.
topic_ticks = np.linspace(1, num_topics, num_topics, endpoint=True)

# Left panel: perplexity against number of topics.
ax1 = fig.add_subplot(1, 2, 1)
ax1.plot(x, perplexity_values, marker="o")
ax1.set_title("主题建模-困惑度")
ax1.set_xlabel('主题数目')
ax1.set_ylabel('困惑度大小')
ax1.set_xticks(topic_ticks)

# Right panel: coherence against number of topics.
ax2 = fig.add_subplot(1, 2, 2)
ax2.plot(x, coherence_values, marker="o")
ax2.set_title("主题建模-一致性")
ax2.set_xlabel("主题数目")
ax2.set_ylabel("一致性大小")
ax2.set_xticks(topic_ticks)

plt.show()
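# Choose the number of topics from the perplexity and coherence curves.
# Rule of thumb: the lower the perplexity the better, and the higher the coherence the better.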
- 923
-
CeshirenTester
Python