[关闭]
@zhengyuhong 2015-06-01T02:21:07.000000Z 字数 2551 阅读 1376

gensim笔记

Python tool gensim


Corpora and Vector Spaces

From string to vector

  # Turn nine short documents into a gensim Dictionary and a bag-of-words corpus.
  from collections import Counter

  from gensim import corpora, models, similarities

  documents = [
      "Human machine interface for lab abc computer applications",
      "A survey of user opinion of computer system response time",
      "The EPS user interface management system",
      "System and human system engineering testing of EPS",
      "Relation of user perceived response time to error measurement",
      "The generation of random binary unordered trees",
      "The intersection graph of paths in trees",
      "Graph minors IV Widths of trees and well quasi ordering",
      "Graph minors A survey",
  ]

  # Remove common words and tokenize (lowercase, split on whitespace).
  stoplist = set('for a of the and to in'.split())
  texts = [
      [word for word in document.lower().split() if word not in stoplist]
      for document in documents
  ]

  # Remove words that appear only once across all documents. One Counter pass
  # replaces flattening with sum(texts, []) and calling list.count() per word,
  # both of which are quadratic in the number of tokens.
  token_counts = Counter(word for text in texts for word in text)
  tokens_once = {word for word, count in token_counts.items() if count == 1}
  texts = [[word for word in text if word not in tokens_once] for text in texts]

  # Build the dictionary from the filtered texts and store it, for future
  # reference.
  dictionary = corpora.Dictionary(texts)
  dictionary.save('/tmp/deerwester.dict')

  # Vectorize a new, unseen document with the same tokenization.
  new_doc = "Human computer interaction"
  new_vec = dictionary.doc2bow(new_doc.lower().split())

  # Vectorize every training document; this list is held fully in memory.
  corpus = [dictionary.doc2bow(text) for text in texts]
  class MyCorpus(object):
      """Iterate over ``mycorpus.txt``, yielding one vectorized document per line.

      Each iteration re-opens the file, so the corpus can be traversed more
      than once. Lines are read one at a time rather than loaded up front.
      Relies on a module-level ``dictionary`` object providing ``doc2bow``.
      """

      def __iter__(self):
          """Yield ``dictionary.doc2bow(tokens)`` for every line of the file."""
          # The with-block closes the file once iteration finishes; the
          # bare open() used before never closed the handle explicitly.
          with open('mycorpus.txt') as corpus_file:
              for line in corpus_file:
                  # Assume there's one document per line, tokens separated
                  # by whitespace.
                  yield dictionary.doc2bow(line.lower().split())
  # Creating the object reads nothing; documents are produced only while the
  # loop below iterates over it.
  corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
  for vector in corpus_memory_friendly:
      # Print one document vector per iteration.
      print(vector)

Corpus Formats

  from gensim import corpora

  # Create a toy corpus of 2 documents, as a plain Python list.
  corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it

  # Write the same corpus to disk through four different corpus classes.
  corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
  corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
  corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
  corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)

  # Conversely, to load a corpus iterator from a Matrix Market file:
  corpus = corpora.MmCorpus('/tmp/corpus.mm')

gensim.corpora.dictionary.Dictionary

TF-IDF model

  # NOTE(review): assumes `models` and `corpus` come from the
  # "From string to vector" listing -- confirm before running standalone.
  tfidf = models.TfidfModel(corpus)

  # Apply the fitted model to a single bag-of-words vector.
  doc_bow = [(0, 1), (1, 1)]
  print(tfidf[doc_bow])

  # Apply the model to the whole corpus and print each transformed document.
  corpus_tfidf = tfidf[corpus]
  for doc in corpus_tfidf:
      print(doc)

LSI model

  # NOTE(review): assumes `models`, `dictionary`, `corpus` and `corpus_tfidf`
  # come from the earlier listings -- confirm before running standalone.
  # Train a 2-topic LSI model on the TF-IDF corpus and project the corpus.
  lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
  corpus_lsi = lsi[corpus_tfidf]

  # Save the model to disk and load it back.
  lsi.save('/tmp/model.lsi')
  lsi = models.LsiModel.load('/tmp/model.lsi')

  # Constructor variants. These previously referenced `tfidfmodel`, `lsimodel`,
  # `bow_corpus` and `tfidf_corpus`, none of which are imported or defined in
  # these notes; they now use the names the other listings define.
  model = models.TfidfModel(corpus, normalize=True)
  model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)

LDA model

  # NOTE(review): assumes `models`, `corpus` and `doc_bow` come from the
  # earlier listings -- confirm before running standalone.
  # `LdaModel` was never imported on its own, so it is reached via `models`,
  # the same way the TF-IDF and LSI listings reach their classes.
  lda = models.LdaModel(corpus, num_topics=10)
  doc_lda = lda[doc_bow]

  # The model can be updated (trained) with new documents.
  # NOTE(review): `other_corpus` is not defined anywhere in these notes;
  # supply a corpus of additional documents here.
  lda.update(other_corpus)

  # Train asymmetric alpha from data.
  lda = models.LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注