工具篇(一)gensim安裝及使用
1 gensim安裝
(venv) F:\WenNLTK\venv\Scripts>pip install gensim
2 gensim使用
2.1 構建詞頻矩陣
# Third-party dependencies: gensim for topic modelling / similarity,
# jieba for Chinese word segmentation and POS tagging.
from gensim import corpora, models, similarities

import jieba
import jieba.posseg as posseg
# —————— Text preprocessing: a toy corpus of six customer-service queries
# about Ant Financial's Huabei (花唄) and Jiebei (借唄) products.
texts = [
    '花唄的安全沒有驗證成功',
    '借唄還款了,額度未恢復',
    '閒魚可以使用花唄嗎',
    '借唄每月還款時間',
    '花唄付款成功為何美團顯示支付超時',
    '國外賬戶可以開通借唄嗎',
]
# POS tags worth keeping: nouns (n / nr person / ns place / nt organisation),
# English tokens (eng), verbs (v) and adverbs (d).
flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')
# Load the stop-word list, one word per line.  A context manager closes the
# file deterministically (the original left the handle open — a resource leak).
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f]
# Segment every text with jieba's POS tagger, keeping only tokens whose POS
# tag is in `flags` and that are not stop words.  The result is one token
# list per document.
words_list = []
for text in texts:
    words = [
        word.word
        for word in posseg.cut(text)
        if word.flag in flags and word.word not in stopwords
    ]
    words_list.append(words)
# —————— Build the term-frequency (bag-of-words) corpus.
dictionary = corpora.Dictionary(words_list)
# e.g. Dictionary(15 unique tokens: ['花唄', '驗證', '借唄', '恢復', '未'] ...)
corpus = [dictionary.doc2bow(words) for words in words_list]
# Fixed: the original `(corpus)` merely evaluated the list without printing it.
print(corpus)
# [[(0, 1), (1, 1)], [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(0, 1), (7, 1)],
#  [(2, 1), (5, 1), (8, 1)], [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
#  [(2, 1), (13, 1), (14, 1)]]
2.2 LDA主題提取
# —————— Train an LDA model with 3 latent topics over the BoW corpus.
lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)
for topic in lda.print_topics(num_words=8):
    print(topic)

# —————— Infer the topic distribution of an unseen query.
text = '花唄怎樣邀請好友'
# Fixed: the original called the undefined name `seg.cut`; `posseg.cut` is
# the segmenter imported at the top of the file.
bow = dictionary.doc2bow(
    [word.word for word in posseg.cut(text)
     if word.flag in flags and word.word not in stopwords]
)
# inference() returns (gamma, sstats); gamma[0] holds the per-topic weights.
ndarray = lda.inference([bow])[0]
for e, value in enumerate(ndarray[0]):
    # Fixed: the format spec was garbled as `%。2f`.
    print('主題%d推斷值%.2f' % (e, value))
2.3 文字相似度
2.3.1 TF-IDF
# —————— TF-IDF weighting of the corpus.
from gensim.models import TfidfModel

# normalize=False keeps raw tf * idf weights instead of unit-length vectors.
tfidf = models.TfidfModel(corpus, normalize=False)
corpus_tfidf = tfidf[corpus]

# Transform a new query into the same TF-IDF space.
text = '花唄怎樣邀請好友'
# Fixed: the original called the undefined name `seg.cut` (should be posseg).
vec_bow = dictionary.doc2bow(
    [word.word for word in posseg.cut(text)
     if word.flag in flags and word.word not in stopwords]
)
vec_tfidf = tfidf[vec_bow]

# Cosine similarity of the query against every document in the corpus.
index = similarities.MatrixSimilarity(corpus_tfidf)
sims = index[vec_tfidf]
print(list(sims))  # e.g. [0.3607962, 0.0, 0.3607962, 0.0, 0.18990646, 0.0]
2.3.2 LSI
# —————— LSI (latent semantic indexing) with 2 topics.
# Fixed: the original re-imported TfidfModel here by mistake.
from gensim.models import LsiModel

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus]

# Project a new query into the 2-dimensional LSI space.
text = '花唄怎樣邀請好友'
# Fixed: the original called the undefined name `seg.cut` (should be posseg).
vec_bow = dictionary.doc2bow(
    [word.word for word in posseg.cut(text)
     if word.flag in flags and word.word not in stopwords]
)
vec_lsi = lsi[vec_bow]
print(vec_lsi)  # e.g. [(1, 0.6424727114099833)]

# Cosine similarity in LSI space.
index = similarities.MatrixSimilarity(corpus_lsi)
sims = index[vec_lsi]
print(list(sims))  # e.g. [1.0, 0.0, 1.0, 0.0, 1.0, 0.0]

# —————— Persist and reload the similarity index.
index.save("lsi_index.index")
index = similarities.MatrixSimilarity.load("lsi_index.index")
2.4 word2vec詞向量
# —————— Train word2vec embeddings on a tiny in-memory corpus.
# Fixed: the original used `word2vec.Text8Corpus` without importing the
# `word2vec` module (only the Word2Vec class was imported).
from gensim.models import Word2Vec, word2vec

sentences = [['first', 'sentence'], ['second', 'sentence']]
if not isinstance(sentences, list):
    # Fall back to streaming a pre-segmented text file from disk.
    sentences = word2vec.Text8Corpus(r"seg.txt")

# sg=1 -> skip-gram; hs=1 -> hierarchical softmax in addition to 3 negative
# samples; sample=0.001 down-samples very frequent words.
# NOTE(review): `size=` is the gensim<4 parameter name; gensim 4.x renamed
# it to `vector_size` — confirm the installed version before upgrading.
model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=1,
                 negative=3, sample=0.001, hs=1, workers=40)

# —————— Persist the full model and the plain-text vector table.
model.save("word2vec.model")
model.wv.save_word2vec_format("vec.txt")
上一篇:食品行業從業幾年的理解和感悟
下一篇:錐形海洋塑膠浮標引數