Python Text Analysis: Perplexity Calculation and Coherence Testing

One of the trickier problems when building an LDA model is deciding on the number of topics. Below are implementations of two ways to guide that choice: perplexity and coherence.

Some of the LDA parameters used here need to be adjusted to your own data and use case.

The log_perplexity value that gensim returns is negative: it is the log of the perplexity with the sign flipped, so the actual perplexity is recovered as 2 ** (-log_perplexity) in the code below.
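As a quick illustration of that conversion (assuming an already trained lda_model and a bag-of-words corpus like the ones built below; the numbers are made up):

```python
# Hypothetical values, for illustration only.
log_perp = lda_model.log_perplexity(corpus)  # negative per-word bound, e.g. -8.2
perplexity = 2 ** (-log_perp)                # flip the sign, exponentiate: about 294
```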

```python
import csv
import datetime
import re
import pandas as pd
import numpy as np
import jieba
import matplotlib.pyplot as plt
import jieba.posseg as jp, jieba
import gensim
from snownlp import seg
from snownlp import SnowNLP
from snownlp import sentiment
from gensim import corpora, models
from gensim.models import CoherenceModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import warnings

warnings.filterwarnings("ignore")

# Keep only rows whose 'segment' text is longer than 100 characters.
comment = pd.read_csv(r"good_1", header=0, index_col=False, engine='python', encoding='utf-8')
csv_data = comment[[(len(str(x)) > 100) for x in comment['segment']]]
print(csv_data.shape)

# Build the corpus: each document is the whitespace-separated token list
# from column 7 (the segmented text).
train = []
for i in range(csv_data.shape[0]):
    comment = csv_data.iloc[i, 7].split()
    train.append(comment)

id2word = corpora.Dictionary(train)
corpus = [id2word.doc2bow(sentence) for sentence in train]

# Coherence and perplexity for topic counts 1 to 15.
coherence_values = []
perplexity_values = []
model_list = []
for topic in range(15):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           num_topics=topic + 1,
                                           id2word=id2word,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           per_word_topics=True)
    perplexity = pow(2, -lda_model.log_perplexity(corpus))
    print(perplexity, end='   ')
    perplexity_values.append(round(perplexity, 3))
    model_list.append(lda_model)
    coherencemodel = CoherenceModel(model=lda_model, texts=train, dictionary=id2word,
                                    coherence='c_v')
    coherence_values.append(round(coherencemodel.get_coherence(), 3))
```
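Once the loop finishes, perplexity_values and coherence_values hold one score per candidate topic number (1 through 15 here). One simple way to use them, sketched below with illustrative variable names, is to keep the model with the highest c_v coherence and treat perplexity as a secondary check:

```python
# Sketch: pick the candidate topic count with the highest c_v coherence.
best_index = int(np.argmax(coherence_values))
best_num_topics = best_index + 1            # the loop above used num_topics = topic + 1
best_model = model_list[best_index]
print(f"best num_topics = {best_num_topics}, "
      f"coherence = {coherence_values[best_index]}, "
      f"perplexity = {perplexity_values[best_index]}")
```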

Below is one way to visualize the coherence results.
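A simple option is to plot coherence (and perplexity) against the number of topics with matplotlib; the sketch below assumes the coherence_values and perplexity_values lists filled in by the loop above.

```python
# Sketch: coherence and perplexity versus the number of topics.
x = range(1, len(coherence_values) + 1)

fig, ax1 = plt.subplots()
ax1.plot(x, coherence_values, marker='o', color='tab:blue')
ax1.set_xlabel('number of topics')
ax1.set_ylabel('coherence (c_v)', color='tab:blue')

ax2 = ax1.twinx()                       # second y-axis for perplexity
ax2.plot(x, perplexity_values, marker='s', color='tab:red')
ax2.set_ylabel('perplexity', color='tab:red')

plt.title('Coherence and perplexity vs. number of topics')
plt.tight_layout()
plt.show()
```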