您當前的位置:首頁 > 體育

工具篇(一)gensim安裝及使用

作者:由 uncle livin 發表于 體育時間:2022-08-12

1 gensim安裝

(venv) F:\WenNLTK\venv\Scripts>pip install gensim

2 gensim使用

2.1 構建詞頻矩陣

from gensim import corpora, models, similarities
import jieba.posseg as posseg
import jieba

# —————— 文字預處理

texts = [
    '花唄的安全沒有驗證成功',
    '借唄還款了,額度未恢復',
    '閒魚可以使用花唄嗎',
    '借唄每月還款時間',
    '花唄付款成功為何美團顯示支付超時',
    '國外賬戶可以開通借唄嗎',
]

flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')  # 詞性

stopwords = [i.strip() for i in open('stopwords.txt', 'r', encoding='utf-8')]

words_list = []
for text in texts:
    words = [word.word for word in posseg.cut(text) if word.flag in flags and word.word not in stopwords]
    words_list.append(words)

# —————— 構建詞頻矩陣

dictionary = corpora.Dictionary(words_list)
# Dictionary(15 unique tokens: ['花唄', '驗證', '借唄', '恢復', '未']...)

corpus = [dictionary.doc2bow(words) for words in words_list]
print(corpus)

# [[(0, 1), (1, 1)], [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(0, 1), (7, 1)], [(2, 1), (5, 1), (8, 1)], [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(2, 1), (13, 1), (14, 1)]]

2.2 LDA主題提取

# —————— 訓練LDA模型

lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)

for topic in lda.print_topics(num_words=8):
    print(topic)

# (0, '0.231*"花唄" + 0.096*"顯示" + 0.095*"付款" + 0.095*"支付" + 0.095*"美團" + 0.092*"閒魚" + 0.086*"驗證" + 0.028*"借唄"')

# (1, '0.116*"還款" + 0.103*"未" + 0.103*"恢復" + 0.102*"借唄" + 0.086*"額度" + 0.057*"花唄" + 0.055*"驗證" + 0.050*"閒魚"')

# (2, '0.207*"借唄" + 0.127*"還款" + 0.093*"賬戶" + 0.093*"時間" + 0.092*"開通" + 0.073*"額度" + 0.064*"恢復" + 0.064*"未"')

# —————— 主題推斷

text = '花唄怎樣邀請好友'

bow = dictionary.doc2bow([word.word for word in posseg.cut(text) if word.flag in flags and word.word not in stopwords])

ndarray = lda.inference([bow])[0]

for e, value in enumerate(ndarray[0]):
    print('主題%d推斷值%.2f' % (e, value))

# 主題0推斷值0.35

# 主題1推斷值0.34

# 主題2推斷值1.31

2.3 文字相似度

2.3.1 TF-IDF

from gensim.models import TfidfModel

# —————— 模型訓練

tfidf = models.TfidfModel(corpus, normalize=False)

corpus_tfidf = tfidf[corpus]

# [[(0, 1.0), (1, 2.584962500721156)], [(2, 1.0), (3, 2.584962500721156), (4, 2.584962500721156), (5, 1.5849625007211563), (6, 2.584962500721156)], [(0, 1.0), (7, 2.584962500721156)], [(2, 1.0), (5, 1.5849625007211563), (8, 2.584962500721156)], [(0, 1.0), (9, 2.584962500721156), (10, 2.584962500721156), (11, 2.584962500721156), (12, 2.584962500721156)], [(2, 1.0), (13, 2.584962500721156), (14, 2.584962500721156)]]

# —————— 模板轉換

text = '花唄怎樣邀請好友'

vec_bow = dictionary.doc2bow([word.word for word in posseg.cut(text) if word.flag in flags and word.word not in stopwords])

vec_tfidf = tfidf[vec_bow]

# —————— 相似度計算

index = similarities.MatrixSimilarity(corpus_tfidf)

sims = index[vec_tfidf]

print(list(sims))  # [0.3607962, 0.0, 0.3607962, 0.0, 0.18990646, 0.0]

2.3.2 LSI

from gensim.models import TfidfModel

# —————— 模型訓練

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

corpus_lsi = lsi[corpus]

# [[(1, 0.7782431706752038)], [(0, 2.0407766379694)], [(1, 0.7782431706752035)], [(0, 1.3303019916097427)], [(1, 2.126199882928153)], [(0, 0.9001963134335645)]]

# —————— 模板轉換

text = '花唄怎樣邀請好友'

vec_bow = dictionary.doc2bow([word.word for word in posseg.cut(text) if word.flag in flags and word.word not in stopwords])

vec_lsi = lsi[vec_bow]

print(vec_lsi)  # [(1, 0.6424727114099833)]

# —————— 相似度計算

index = similarities.MatrixSimilarity(corpus_lsi)

sims = index[vec_lsi]

print(list(sims))  # [1.0, 0.0, 1.0, 0.0, 1.0, 0.0]

# —————— index儲存和載入

index.save("lsi_index.index")

index = similarities.MatrixSimilarity.load("lsi_index.index")

2.4 word2vec詞向量

from gensim.models import Word2Vec, word2vec

sentences = [['first', 'sentence'], ['second', 'sentence']]

if isinstance(sentences, list):
    sentences = sentences
else:
    sentences = word2vec.Text8Corpus(r"seg.txt")

# —————— 模型訓練

# 注意:gensim 4.0 起 size 參數已改名為 vector_size,新版本請改用 vector_size=100
model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=1, negative=3, sample=0.001, hs=1, workers=40)

# —————— 模型儲存

model.save("word2vec.model")

model.wv.save_word2vec_format("vec.txt")

標簽: word  corpus  584962500721156  index  vec