其中的一些LDA参数需要结合自己的实际情况进行设定。
直接计算出的log_perplexity是负值：它是困惑度取以2为底的对数再取相反数得到的，因此困惑度 = 2^(-log_perplexity)。
import
csv
import
datetime
import
re
import
pandas
as
pd
import
numpy
as
np
import
jieba
import
matplotlib
.
pyplot
as
plt
import
jieba
.
posseg
as
jp
,
jieba
import
gensim
from
snownlp
import
seg
from
snownlp
import
SnowNLP
from
snownlp
import
sentiment
from
gensim
import
corpora
,
models
from
gensim
.
models
import
CoherenceModel
from
sklearn
.
model_selection
import
train_test_split
from
sklearn
.
model_selection
import
KFold
from
sklearn
.
feature_extraction
.
text
import
TfidfVectorizer
,
CountVectorizer
from
sklearn
.
decomposition
import
NMF
,
LatentDirichletAllocation
import
warnings
# Silence library warnings (gensim / sklearn emit many deprecation notices).
warnings.filterwarnings("ignore")

# Load the pre-segmented review file and keep only rows whose segmented
# text is longer than 100 characters; very short reviews carry little signal.
comment = pd.read_csv(r"good_1", header=0, index_col=False, engine='python', encoding='utf-8')
long_enough = [len(str(text)) > 100 for text in comment['segment']]
csv_data = comment[long_enough]
print(csv_data.shape)
# --- Build the corpus ---
# One document per filtered row: the whitespace-separated token list taken
# from column index 7 (presumably the pre-segmented text column — TODO confirm
# against the CSV schema).
# NOTE: the original loop rebound the name `comment` (the loaded DataFrame)
# to each row's token list; the comprehension avoids that accidental shadowing.
train = [csv_data.iloc[i, 7].split() for i in range(csv_data.shape[0])]

# Map every token to an integer id, then convert each document to a
# bag-of-words representation (list of (token_id, count) pairs) for gensim.
id2word = corpora.Dictionary(train)
corpus = [id2word.doc2bow(doc) for doc in train]
# --- Perplexity and coherence for topic counts 1..15 ---
# Train one LDA model per candidate topic count and record both metrics so
# the best number of topics can be chosen afterwards.
coherence_values = []
perplexity_values = []
model_list = []
for num_topics in range(1, 16):
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        num_topics=num_topics,
        id2word=id2word,
        random_state=100,   # fixed seed for reproducible runs
        chunksize=100,
        passes=10,
        per_word_topics=True,
    )
    # log_perplexity returns a negative per-word bound; converting with
    # 2 ** (-bound) yields the (positive) perplexity.
    perplexity = pow(2, -lda_model.log_perplexity(corpus))
    print(perplexity, end=' ')
    perplexity_values.append(round(perplexity, 3))
    model_list.append(lda_model)

    # c_v coherence needs the tokenized texts, not just the BoW corpus.
    coherencemodel = CoherenceModel(model=lda_model, texts=train, dictionary=id2word, coherence='c_v')
    coherence_values.append(round(coherencemodel.get_coherence(), 3))
下面展示一种一致性（coherence）可视化的方法。