@phdthesis{THESIS,
      recid = {14559},
      author = {Cheng, Yihua},
      title = {A Scalable Approach to Distributed Large Language Model Inference},
      school = {University of Chicago},
      type = {Ph.D. thesis},
      year = {2025},
      month = mar,
      abstract = {As the use of large language models (LLMs) expands rapidly, so do the intensity and scale of the workloads required to query them. The requirements for serving LLMs therefore evolve beyond single-instance deployment to large-scale distributed deployment. Because most of today's LLM serving system optimizations focus only on speeding up a single serving instance, key techniques for distributed LLM deployments are still missing. The key contribution of this dissertation is the design and implementation of an efficient system for distributed LLM serving engine deployment. Our thesis is that by decoupling the inference states (KV caches) from the LLM serving engine, the performance of distributed LLM inference can be substantially improved. Unlike prior work, which treats the LLM serving engine as a black box and builds global orchestrators around serving engines, our approach uses a separate module to transfer, store, and share KV caches across different serving engines. To prove this thesis, this dissertation provides a suite of techniques to address the following fundamental challenges. First, we need to offload KV caches from serving engines and insert them back without impacting the LLM's runtime performance. Second, we need an efficient way to store, transmit, and compose KV caches across the whole distributed cluster. Third, we need a scalable way to manage the serving engines and schedule requests to maximize the utilization of computing resources. Our key insight is that many KV caches can be reused across serving engines in a large-scale distributed setup. With a reused KV cache, the LLM can skip computationally intensive prefills and start generating outputs immediately. Based on this insight, we develop LMStack, an end-to-end software stack for distributed LLM deployment that fully realizes the potential of KV cache reuse. We have shown that our solution substantially reduces computational cost on real-world workloads and provides stronger production-readiness guarantees, including easy deployment, automatic scaling, and fault tolerance. We have fully open-sourced LMStack at https://github.com/LMCache/LMCache and https://github.com/vllm-project/production-stack, where the former implements the KV cache offloading, storage, and transmission component and the latter includes the distributed cluster management.},
      url = {http://knowledge.uchicago.edu/record/14559},
      doi = {10.6082/uchicago.14559},
}