org.apache.mahout.clustering.lda.cvb
Class TopicModel

java.lang.Object
  extended by org.apache.mahout.clustering.lda.cvb.TopicModel
All Implemented Interfaces:
Iterable<MatrixSlice>, org.apache.hadoop.conf.Configurable

public class TopicModel
extends Object
implements org.apache.hadoop.conf.Configurable, Iterable<MatrixSlice>

Thin wrapper around a Matrix of counts of occurrences of (topic, term) pairs. Dividing {@code topicTermCount.viewRow(topic).get(term)} by the sum over the values for all terms in that row yields p(term | topic). Dividing it instead by the sum of the values in that term's column across all topics yields p(topic | term). Multithreading is enabled for the update(Matrix) method: this method is asynchronous, and merely submits the matrix to a work queue. When all work has been submitted, awaitTermination() should be called, which will block until all updates have been accumulated.


Constructor Summary
TopicModel(org.apache.hadoop.conf.Configuration conf, double eta, double alpha, String[] dictionary, int numThreads, double modelWeight, org.apache.hadoop.fs.Path... modelpath)
           
TopicModel(int numTopics, int numTerms, double eta, double alpha, Random random, String[] dictionary, int numThreads, double modelWeight)
           
TopicModel(int numTopics, int numTerms, double eta, double alpha, String[] dictionary, double modelWeight)
           
TopicModel(int numTopics, int numTerms, double eta, double alpha, String[] dictionary, int numThreads, double modelWeight)
           
TopicModel(Matrix topicTermCounts, double eta, double alpha, String[] dictionary, int numThreads, double modelWeight)
           
TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha, String[] dictionary, double modelWeight)
           
TopicModel(Matrix topicTermCounts, Vector topicSums, double eta, double alpha, String[] dictionary, int numThreads, double modelWeight)
           
 
Method Summary
 org.apache.hadoop.conf.Configuration getConf()
           
 int getNumTerms()
           
 int getNumTopics()
           
 Vector infer(Vector original, Vector docTopics)
           
 Iterator<MatrixSlice> iterator()
           
static Pair<Matrix,Vector> loadModel(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path... modelPaths)
           
 double perplexity(Vector document, Vector docTopics)
          \(\sum_x \sum_a \left( c_{ai} \cdot \log\left( p(x \mid i) \cdot p(a \mid x) \right) \right)\)
 void persist(org.apache.hadoop.fs.Path outputDir, boolean overwrite)
           
 void renormalize()
           
 void reset()
           
 int sampleTerm(int topic)
           
 int sampleTerm(Vector topicDistribution)
           
 void setConf(org.apache.hadoop.conf.Configuration configuration)
           
 void stop()
           
 Vector topicSums()
           
 String toString()
           
 void trainDocTopicModel(Vector original, Vector topics, Matrix docTopicModel)
           
 void update(int termId, Vector topicCounts)
           
 void update(Matrix docTopicCounts)
           
 void updateTopic(int topic, Vector docTopicCounts)
           
static String vectorToSortedString(Vector vector, String[] dictionary)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
 

Constructor Detail

TopicModel

public TopicModel(int numTopics,
                  int numTerms,
                  double eta,
                  double alpha,
                  String[] dictionary,
                  double modelWeight)

TopicModel

public TopicModel(org.apache.hadoop.conf.Configuration conf,
                  double eta,
                  double alpha,
                  String[] dictionary,
                  int numThreads,
                  double modelWeight,
                  org.apache.hadoop.fs.Path... modelpath)
           throws IOException
Throws:
IOException

TopicModel

public TopicModel(int numTopics,
                  int numTerms,
                  double eta,
                  double alpha,
                  String[] dictionary,
                  int numThreads,
                  double modelWeight)

TopicModel

public TopicModel(int numTopics,
                  int numTerms,
                  double eta,
                  double alpha,
                  Random random,
                  String[] dictionary,
                  int numThreads,
                  double modelWeight)

TopicModel

public TopicModel(Matrix topicTermCounts,
                  Vector topicSums,
                  double eta,
                  double alpha,
                  String[] dictionary,
                  double modelWeight)

TopicModel

public TopicModel(Matrix topicTermCounts,
                  double eta,
                  double alpha,
                  String[] dictionary,
                  int numThreads,
                  double modelWeight)

TopicModel

public TopicModel(Matrix topicTermCounts,
                  Vector topicSums,
                  double eta,
                  double alpha,
                  String[] dictionary,
                  int numThreads,
                  double modelWeight)
Method Detail

getNumTerms

public int getNumTerms()

getNumTopics

public int getNumTopics()

iterator

public Iterator<MatrixSlice> iterator()
Specified by:
iterator in interface Iterable<MatrixSlice>

topicSums

public Vector topicSums()

loadModel

public static Pair<Matrix,Vector> loadModel(org.apache.hadoop.conf.Configuration conf,
                                            org.apache.hadoop.fs.Path... modelPaths)
                                     throws IOException
Throws:
IOException

toString

public String toString()
Overrides:
toString in class Object

sampleTerm

public int sampleTerm(Vector topicDistribution)

sampleTerm

public int sampleTerm(int topic)

reset

public void reset()

stop

public void stop()

renormalize

public void renormalize()

trainDocTopicModel

public void trainDocTopicModel(Vector original,
                               Vector topics,
                               Matrix docTopicModel)

infer

public Vector infer(Vector original,
                    Vector docTopics)

update

public void update(Matrix docTopicCounts)

updateTopic

public void updateTopic(int topic,
                        Vector docTopicCounts)

update

public void update(int termId,
                   Vector topicCounts)

persist

public void persist(org.apache.hadoop.fs.Path outputDir,
                    boolean overwrite)
             throws IOException
Throws:
IOException

perplexity

public double perplexity(Vector document,
                         Vector docTopics)
\(\sum_x \sum_a \left( c_{ai} \cdot \log\left( p(x \mid i) \cdot p(a \mid x) \right) \right)\)


vectorToSortedString

public static String vectorToSortedString(Vector vector,
                                          String[] dictionary)

setConf

public void setConf(org.apache.hadoop.conf.Configuration configuration)
Specified by:
setConf in interface org.apache.hadoop.conf.Configurable

getConf

public org.apache.hadoop.conf.Configuration getConf()
Specified by:
getConf in interface org.apache.hadoop.conf.Configurable


Copyright © 2008–2014 The Apache Software Foundation. All rights reserved.