@article{Estimations:2282,
      recid = {2282},
      author = {Wang, Minzhe},
      title = {Optimal Estimations in Topic Modeling},
      publisher = {University of Chicago},
      school = {Ph.D.},
      address = {2020-06},
      pages = {257},
      abstract = {Topic modeling is a useful tool in computational social  science, digital humanities, biology, and chemistry. A  popular topic model is the probabilistic Latent Semantic  Indexing (pLSI) model. It assumes that the word-document  matrix factorizes into the product of a low-rank word-topic  matrix A, and a low-rank topic-document matrixW. The goal  is to estimate these matrices.

While many algorithms are  available for topic modeling, there is relatively little  statistical understanding. The first contribution of this  thesis is providing rigorous statistical theory for both  problems, including the optimal rate of convergence for  estimating A, the optimal rate of convergence for  estimating W, and an unconventional theory for including  "sparsity" in topic modeling. The second contribution of  this thesis is proposing an assortment of new methods,  including a spectral approach to estimating A, a spectral  approach to estimating W, and a word-screening method. All  these methods are computationally efficient and  statistically optimal for a wide range of settings. 

The  thesis is composed of three parts. In the first part we  propose a new algorithm for estimating the word-topic  matrix using the entry-wise ratios of the left singular  vectors of the normalized word-document matrix, which is  shown to possess the minimax optimal row-wise error rate  using an entry-wise bounds for singular vectors. The second  part we study topic-document matrix estimation problem,  where we introduce a new notion of sparsity, the  non-informativeness, and propose to use a new  non-informative words screening method, before conducting  topic-document matrix estimation based on the resulting  right singular vectors of the normalized word-document  matrix. We show the algorithm enjoys minimax convergence  rate under the existence of the non-informative words. Both  algorithms are simple, but surprisingly enjoy various deep  algebraic insights underneath the pLSI model. In the last  part we study a different but topic-model-related problem,  information retrieval, where we propose a new model-based  algorithm which explicitly takes into account the  heterogeneity between documents and queries generation. In  each part we also provide various simulations and real data  applications to support the competitiveness of our proposed  models and algorithms.},
      url = {http://knowledge.uchicago.edu/record/2282},
      doi = {https://doi.org/10.6082/uchicago.2282},
}