# -*- coding: utf-8 -*-
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

import gensim
from gensim import corpora
from gensim.similarities import MatrixSimilarity
from nltk.tokenize import RegexpTokenizer

# Tokenize on word characters. The corpus has already been split, stop-word
# filtered, and stemmed (the "AfterSplitStopStem" file name), so no further
# stemming is applied here.
tokenizer = RegexpTokenizer(r'\w+')

# Read the method-level corpus: one document (method) per line.
# Raw strings keep the Windows backslashes from being treated as escapes.
corpus_path = r'D:\Implementations\Experiments\JabRef2.6\Source\CorpusRaw-AfterSplitStopStem.corpusRawMethodLevelGranularity'
texts = []
with open(corpus_path) as f:
    for line in f:
        texts.append(tokenizer.tokenize(line))

# Turn the tokenized documents into an id <-> term dictionary.
dictionary = corpora.Dictionary(texts)

# Convert the tokenized documents into a bag-of-words corpus.
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model. K = num_topics = 500; gensim's symmetric defaults give
# alpha = eta = 1/num_topics = 1/500. For online (historical) training,
# offset (tau_0) = 1024 and decay (kappa) = 0.9 could be passed as well.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary,
                                           num_topics=500, iterations=1000,
                                           passes=5)

# Index every document's topic distribution for cosine-similarity queries.
# num_features is the topic count, since the indexed vectors live in topic space.
index = MatrixSimilarity(ldamodel[corpus], num_features=ldamodel.num_topics)

# For each query, infer its topic distribution and rank all corpus documents
# by similarity, most similar first.
queries_path = r'D:\Implementations\Experiments\JabRef2.6\Source\queries-AfterSplitStopStem.txt'
with open(queries_path) as f:
    for line in f:
        new_vec = dictionary.doc2bow(tokenizer.tokenize(line))
        doc_lda = ldamodel[new_vec]
        sims = index[doc_lda]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        print(sims)
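
# ---------------------------------------------------------------------------
# A minimal optional sketch (not part of the original pipeline): persist the
# trained artifacts with gensim's save/load API so the expensive LDA training
# (500 topics, 1000 iterations, 5 passes) need not be repeated on every run.
# The file names ('jabref.lda', 'jabref.dict', 'jabref.index') are
# hypothetical placeholders.
# ---------------------------------------------------------------------------
ldamodel.save('jabref.lda')
dictionary.save('jabref.dict')
index.save('jabref.index')

# A later run could then reload them instead of retraining:
#   ldamodel = gensim.models.ldamodel.LdaModel.load('jabref.lda')
#   dictionary = corpora.Dictionary.load('jabref.dict')
#   index = MatrixSimilarity.load('jabref.index')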