import jieba
import gensim
# Metadata descriptions (database/table/column names) that serve as the tiny
# training corpus for phrase detection.
mddesc = ['测试数据库','用户支付表','支付金额','支付用户']

# Build the training corpus: one list of jieba tokens per description.
# NOTE(review): the loop body had lost its indentation; both appends are
# assumed to belong inside the loop -- confirm against the original script.
train_corpus = []
for desc in mddesc:
    # Materialise the generator directly instead of joining on "/" and
    # splitting again, which would break tokens that contain "/".
    tokens = list(jieba.cut(desc))
    # Each sentence is added twice on purpose: with min_count=1 the default
    # gensim scorer subtracts min_count from the bigram count, so a bigram
    # seen only once would score 0 and never pass the threshold.
    train_corpus.append(tokens)
    train_corpus.append(list(tokens))

# Choose min_count and threshold carefully when the corpus is this small;
# the values below are tuned for the doubled four-sentence corpus above.
phrases = gensim.models.phrases.Phrases(train_corpus, min_count=1, threshold=0.1)
bigram = gensim.models.phrases.Phraser(phrases)

# Query to segment; named input_text so the builtin input() is not shadowed.
input_text = "从用户支付表中选择支付金额大于5的用户。"
inputarr = list(jieba.cut(input_text))

# Detected phrases come back joined by the phrase delimiter ("_" by default);
# remove it so each merged phrase reads as one continuous Chinese term.
repl = [s.replace("_", "") for s in bigram[inputarr]]
print(repl)
# References:
# 1. https://radimrehurek.com/gensim/models/phrases.html
# 2. http://www.nltk.org/howto/collocations.html