@zhengyuhong 2015-06-01T02:21:07.000000Z 字数 2551 阅读 1451

gensim笔记

Python tool gensim

Corpora and Vector Spaces

From string to vector

from gensim import corpora, models, similarities
from collections import Counter

# Nine short document titles used as the toy corpus for the rest of the notes.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Remove common words and tokenize: lowercase, split on whitespace, drop
# anything in the stoplist.
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# Remove words that appear only once across all documents.
# Flattening with a comprehension and counting with Counter is a single pass;
# sum(texts, []) re-copies the growing list and list.count() rescans it for
# every distinct word.
all_tokens = [word for text in texts for word in text]
token_counts = Counter(all_tokens)
tokens_once = {word for word, count in token_counts.items() if count == 1}
texts = [[word for word in text if word not in tokens_once] for text in texts]
# Build the gensim Dictionary from the filtered texts and store it on disk
# for future reference.
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')

# Convert a new, unseen document with doc2bow, using the same
# lowercase + whitespace tokenization as the training texts.
new_doc = "Human computer interaction"
new_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(new_tokens)

# Convert every training text the same way to obtain the corpus.
corpus = list(map(dictionary.doc2bow, texts))
class MyCorpus(object):
    """Corpus that reads ``mycorpus.txt`` lazily, one document at a time.

    The file is opened afresh on every iteration, so the whole corpus is
    never held in memory.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow(...)`` for each line of ``mycorpus.txt``.

        Assumes there is one document per line, with tokens separated by
        whitespace. Relies on the module-level ``dictionary``.
        """
        # The with-block closes the file when iteration ends or the generator
        # is discarded; a bare open() in the for-header left that to the
        # garbage collector.
        with open('mycorpus.txt') as corpus_file:
            for line in corpus_file:
                yield dictionary.doc2bow(line.lower().split())


# Creating the object does not load the corpus into memory; lines are read
# only while iterating.
corpus_memory_friendly = MyCorpus()
for vector in corpus_memory_friendly:
    print(vector)

Corpus Formats

from gensim import corpora
# Toy corpus of 2 documents as a plain Python list; the second document is
# left empty on purpose.
# NOTE(review): this rebinds `corpus`, replacing the list built from `texts`
# earlier in the file.
corpus = [[(1, 0.5)], []]

# Serialize the same corpus through each of four corpus classes.
for corpus_format, target_path in (
    (corpora.MmCorpus, '/tmp/corpus.mm'),
    (corpora.SvmLightCorpus, '/tmp/corpus.svmlight'),
    (corpora.BleiCorpus, '/tmp/corpus.lda-c'),
    (corpora.LowCorpus, '/tmp/corpus.low'),
):
    corpus_format.serialize(target_path, corpus)

# Conversely, load a corpus iterator back from the Matrix Market file.
corpus = corpora.MmCorpus('/tmp/corpus.mm')

gensim.corpora.dictionary.Dictionary

TF-IDF model

# Initialize the TF-IDF model from the current corpus.
tfidf = models.TfidfModel(corpus)

# Transform a single bag-of-words document by indexing the model with it.
doc_bow = [(0, 1), (1, 1)]
doc_tfidf = tfidf[doc_bow]
print(doc_tfidf)

# Indexing the model with the whole corpus transforms every document.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)

LSI model

# Train a 2-topic LSI model on the TF-IDF corpus, then index the model with
# that corpus to transform it.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]

# Save the model to disk and load it back.
lsi.save('/tmp/model.lsi')
lsi = models.LsiModel.load('/tmp/model.lsi')

# Further constructor examples. They go through the imported `models`
# package and the `corpus` / `corpus_tfidf` names this script binds, so they
# run without extra imports or undefined variables.
model = models.TfidfModel(corpus, normalize=True)
model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)

LDA model

# LdaModel is reached through the imported `models` package; the bare name
# `LdaModel` is never imported in this file.
lda = models.LdaModel(corpus, num_topics=10)
doc_lda = lda[doc_bow]

# The model can be updated (trained) with new documents.
# NOTE(review): `other_corpus` is not defined anywhere in this file; bind it
# to the additional documents before running this line.
lda.update(other_corpus)

# Train asymmetric alpha from data.
lda = models.LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)

gensim笔记

Corpora and Vector Spaces

From string to vector

Corpus Formats

TF-IDF model

LSI model

LDA model

内容目录

选择主题