工具篇(一)gensim安裝及使用
1 gensim安裝
(venv) F:\WenNLTK\venv\Scripts>pip install gensim
2 gensim使用
2.1 構建詞頻矩陣
# Third-party dependencies: gensim for topic modelling / similarity,
# jieba for Chinese word segmentation and POS tagging.
from gensim import corpora, models, similarities

import jieba
import jieba.posseg as posseg
# —————— Text preprocessing: a toy corpus of six customer-service queries
# about Ant Financial's Huabei (花唄) and Jiebei (借唄) products.
texts = [
    '花唄的安全沒有驗證成功',
    '借唄還款了,額度未恢復',
    '閒魚可以使用花唄嗎',
    '借唄每月還款時間',
    '花唄付款成功為何美團顯示支付超時',
    '國外賬戶可以開通借唄嗎',
]
# POS tags worth keeping: nouns (n / nr person / ns place / nt organisation),
# English tokens (eng), verbs (v) and adverbs (d).
flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')
# Load the stop-word list, one word per line.  A context manager closes the
# file deterministically (the original left the handle open — a resource leak).
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f]
# Segment every text with jieba's POS tagger, keeping only tokens whose POS
# tag is in `flags` and that are not stop words.  The result is one token
# list per document.
words_list = []
for text in texts:
    words = [
        word.word
        for word in posseg.cut(text)
        if word.flag in flags and word.word not in stopwords
    ]
    words_list.append(words)
# —————— Build the term-frequency (bag-of-words) corpus.
dictionary = corpora.Dictionary(words_list)
# e.g. Dictionary(15 unique tokens: ['花唄', '驗證', '借唄', '恢復', '未'] ...)
corpus = [dictionary.doc2bow(words) for words in words_list]
# Fixed: the original `(corpus)` merely evaluated the list without printing it.
print(corpus)
# [[(0, 1), (1, 1)], [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(0, 1), (7, 1)],
#  [(2, 1), (5, 1), (8, 1)], [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
#  [(2, 1), (13, 1), (14, 1)]]
2.2 LDA主題提取
# —————— Train an LDA model with 3 latent topics over the BoW corpus.
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)
for topic in lda.print_topics(num_words=8):
    print(topic)

# —————— Infer the topic distribution of an unseen query.
text = '花唄怎樣邀請好友'
# Fixed: the original called the undefined name `seg.cut`; `posseg.cut` is
# the segmenter imported at the top of the file.
bow = dictionary.doc2bow(
    [word.word for word in posseg.cut(text)
     if word.flag in flags and word.word not in stopwords]
)
# inference() returns (gamma, sstats); gamma[0] holds the per-topic weights.
ndarray = lda.inference([bow])[0]
for e, value in enumerate(ndarray[0]):
    # Fixed: the format spec was garbled as `%。2f`.
    print('主題%d推斷值%.2f' % (e, value))
2.3 文字相似度
2.3.1 TF-IDF
# —————— TF-IDF weighting of the corpus.
from gensim.models import TfidfModel

# normalize=False keeps raw tf * idf weights instead of unit-length vectors.
tfidf = models.TfidfModel(corpus, normalize=False)
corpus_tfidf = tfidf[corpus]

# Transform a new query into the same TF-IDF space.
text = '花唄怎樣邀請好友'
# Fixed: the original called the undefined name `seg.cut` (should be posseg).
vec_bow = dictionary.doc2bow(
    [word.word for word in posseg.cut(text)
     if word.flag in flags and word.word not in stopwords]
)
vec_tfidf = tfidf[vec_bow]

# Cosine similarity of the query against every document in the corpus.
index = similarities.MatrixSimilarity(corpus_tfidf)
sims = index[vec_tfidf]
print(list(sims))  # e.g. [0.3607962, 0.0, 0.3607962, 0.0, 0.18990646, 0.0]
2.3.2 LSI
# —————— LSI (latent semantic indexing) with 2 topics.
# Fixed: the original re-imported TfidfModel here by mistake.
from gensim.models import LsiModel

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus]

# Project a new query into the 2-dimensional LSI space.
text = '花唄怎樣邀請好友'
# Fixed: the original called the undefined name `seg.cut` (should be posseg).
vec_bow = dictionary.doc2bow(
    [word.word for word in posseg.cut(text)
     if word.flag in flags and word.word not in stopwords]
)
vec_lsi = lsi[vec_bow]
print(vec_lsi)  # e.g. [(1, 0.6424727114099833)]

# Cosine similarity in LSI space.
index = similarities.MatrixSimilarity(corpus_lsi)
sims = index[vec_lsi]
print(list(sims))  # e.g. [1.0, 0.0, 1.0, 0.0, 1.0, 0.0]

# —————— Persist and reload the similarity index.
index.save("lsi_index.index")
index = similarities.MatrixSimilarity.load("lsi_index.index")
2.4 word2vec詞向量
# —————— Train word2vec embeddings on a tiny in-memory corpus.
# Fixed: the original used `word2vec.Text8Corpus` without importing the
# `word2vec` module (only the Word2Vec class was imported).
from gensim.models import Word2Vec, word2vec

sentences = [['first', 'sentence'], ['second', 'sentence']]
if not isinstance(sentences, list):
    # Fall back to streaming a pre-segmented text file from disk.
    sentences = word2vec.Text8Corpus(r"seg.txt")

# sg=1 -> skip-gram; hs=1 -> hierarchical softmax in addition to 3 negative
# samples; sample=0.001 down-samples very frequent words.
# NOTE(review): `size=` is the gensim<4 parameter name; gensim 4.x renamed
# it to `vector_size` — confirm the installed version before upgrading.
model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=1,
                 negative=3, sample=0.001, hs=1, workers=40)

# —————— Persist the full model and the plain-text vector table.
model.save("word2vec.model")
model.wv.save_word2vec_format("vec.txt")
上一篇:食品行業從業幾年的理解和感悟
下一篇:錐形海洋塑膠浮標引數