% Conference paper (proceedings contribution); source record: knowledge.uchicago.edu record 11726.
% The citation key TEXTUAL is an exporter placeholder. It is kept unchanged so that
% existing \cite commands keep resolving.
% NOTE(review): year/month are taken from the date 2024-05-07, which the export had
% misfiled in the address field; presumably the conference/publication date -- confirm.
% The exporter's placeholder issue number (identical to the key) was dropped.
@inproceedings{TEXTUAL,
  author    = {Ma, Xiaolong and Yan, Feng and Yang, Lei and Foster, Ian and
               Papka, Michael E. and Liu, Zhengchun and Kettimuthu, Rajkumar},
  title     = {{MalleTrain}: Deep Neural Networks Training on Unfillable
               Supercomputer Nodes},
  booktitle = {Proceedings of the 15th {ACM/SPEC} International Conference on
               Performance Engineering},
  year      = {2024},
  month     = may,
  url       = {http://knowledge.uchicago.edu/record/11726},
  recid     = {11726},
  abstract  = {First-come first-serve scheduling can result in substantial (up
               to 10\%) of transiently idle nodes on supercomputers.
               Recognizing that such unfilled nodes are well-suited for deep
               neural network (DNN) training, due to the flexible nature of
               DNN training tasks, Liu et al. proposed that the re-scaling DNN
               training tasks to fit gaps in schedules be formulated as a
               mixed-integer linear programming (MILP) problem, and
               demonstrated via simulation the potential benefits of the
               approach. Here, we introduce MalleTrain, a system that provides
               the first practical implementation of this approach and that
               furthermore generalizes it by allowing it to be used even for
               DNN training applications for which model information is
               unknown before runtime. Key to this latter innovation is the
               use of a lightweight online job profiling advisor (JPA) to
               collect critical scalability information for DNN
               jobs---information that it then employs to optimize resource
               allocations dynamically, in real time. We describe the
               MalleTrain architecture and present the results of a detailed
               experimental evaluation on a supercomputer GPU cluster and
               several representative DNN training workloads, including neural
               architecture search and hyperparameter optimization. Our
               results not only confirm the practical feasibility of
               leveraging idle supercomputer nodes for DNN training but
               improve significantly on prior results, improving training
               throughput by up to 22.3\% without requiring users to provide
               job scalability information.},
}