@zhengyuhong
2015-06-01T02:21:07.000000Z
字数 2551
阅读 1376
Python tool gensim
from collections import Counter

from gensim import corpora, models, similarities

# Nine short example documents used as the toy corpus.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# remove words that appear only once
# (a single Counter pass replaces the quadratic sum(texts, []) + list.count())
all_tokens = [token for text in texts for token in text]
token_counts = Counter(all_tokens)
tokens_once = {word for word, count in token_counts.items() if count == 1}
texts = [[word for word in text if word not in tokens_once] for text in texts]

dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference

# Convert a new, unseen document with the same dictionary.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())

# The whole toy corpus, one doc2bow() result per document, held in memory.
corpus = [dictionary.doc2bow(text) for text in texts]


class MyCorpus(object):
    """Iterate over 'mycorpus.txt', yielding one doc2bow() result per line.

    Uses the module-level ``dictionary``; the file is read lazily, one line
    at a time, on every iteration.
    """

    def __iter__(self):
        # The context manager closes the file once iteration finishes or
        # the generator is discarded.
        with open('mycorpus.txt') as lines:
            for line in lines:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())


corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
for vector in corpus_memory_friendly:
    print(vector)
from gensim import corpora

# create a toy corpus of 2 documents, as a plain Python list
corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it

# Serialize the same corpus with each of four corpus classes.
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)

# Conversely, to load a corpus iterator from a Matrix Market file:
corpus = corpora.MmCorpus('/tmp/corpus.mm')
gensim.corpora.dictionary.Dictionary
# NOTE(review): relies on `models` (from gensim) and `corpus` already being
# in scope when this snippet runs -- confirm against the surrounding file.
tfidf = models.TfidfModel(corpus)

# Apply the model to a single bag-of-words vector.
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])

# Apply the model to the whole corpus and print each transformed document.
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)
# NOTE(review): relies on `models` (from gensim), `corpus_tfidf` and
# `dictionary` already being in scope -- confirm against the surrounding file.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi[corpus_tfidf]

# Save the model to disk and load it back.
lsi.save('/tmp/model.lsi')
lsi = models.LsiModel.load('/tmp/model.lsi')

# Generic constructor examples. The classes are reached through `models`,
# like the calls above, because no `tfidfmodel` / `lsimodel` name is bound
# in the visible source.
# NOTE(review): `bow_corpus` and `tfidf_corpus` are not defined anywhere in
# the visible source; they look like placeholders for a bag-of-words corpus
# and its TF-IDF transform -- bind them before running these two lines.
model = models.TfidfModel(bow_corpus, normalize=True)
model = models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
# Imported here because no import of LdaModel is visible elsewhere in the file.
from gensim.models import LdaModel

# NOTE(review): relies on `corpus` and `doc_bow` already being in scope --
# confirm against the surrounding file.
lda = LdaModel(corpus, num_topics=10)
doc_lda = lda[doc_bow]

# The model can be updated (trained) with new documents
# NOTE(review): `other_corpus` is not defined anywhere in the visible source;
# it looks like a placeholder for a second corpus -- bind it before running.
lda.update(other_corpus)

# train asymmetric alpha from data
lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)
