@phdthesis{THESIS,
      recid = {14573},
      author = {Wang, Chaoqi},
      title = {Towards Robust Alignment of Language Models with Human Preferences},
      school = {University of Chicago},
      type = {Ph.D. thesis},
      year = {2025},
      month = {3},
      abstract = {The rapid advancement of large language models (LLMs) offers
        transformative potential across a wide range of applications, but
        concurrently raises critical safety concerns, underscoring the importance
        of aligning AI systems with human values. This thesis investigates a
        series of methodologies designed to enhance AI alignment through novel
        optimization strategies that improve performance, robustness, and
        trustworthiness. By addressing limitations in traditional reinforcement
        learning from human feedback (RLHF) pipelines, our work introduces
        streamlined alternatives and complementary techniques to optimize
        alignment while maintaining computational efficiency and effectiveness.
        In Chapter 2, we propose f-DPO (generalized Direct Preference
        Optimization), which extends the DPO framework by incorporating diverse
        divergence constraints, such as Jensen-Shannon and forward KL
        divergences. Through an analytical exploration of the Karush-Kuhn-Tucker
        conditions, we derive simplified relationships between reward functions
        and optimal policies under these divergences. Empirical evaluations
        demonstrate that f-DPO balances alignment performance and generative
        diversity more effectively than RLHF-based Proximal Policy Optimization
        (PPO). It also achieves lower expected calibration error (ECE) while
        providing practical benefits, such as improved divergence efficiency.
        Chapter 3 extends alignment optimization by addressing the limitations
        of single-sample preference comparison. We introduce Multi-sample Direct
        Preference Optimization (mDPO) and Multi-sample Identity Preference
        Optimization (mIPO), frameworks that optimize group-wise characteristics
        to improve distributional properties such as diversity and bias
        reduction. These approaches significantly enhance generative diversity
        in LLMs and mitigate demographic biases in diffusion models. Moreover,
        multi-sample methods exhibit robustness to noisy human-labeled
        preference data, making them particularly effective for fine-tuning in
        real-world scenarios where label quality may be imperfect. More
        importantly, they offer greater controllability than f-DPO for improving
        the alignment of generative models. In Chapter 4, we move beyond the
        traditional supervised learning of Chapters 2 and 3 and address a
        critical challenge in reward modeling for online RLHF: spurious
        correlations that can distort alignment objectives. We introduce a
        causal reward modeling framework that integrates causal inference
        techniques to mitigate these biases. By enforcing counterfactual
        invariance, this approach ensures reward predictions remain unaffected
        by irrelevant or confounding variables. Experiments on synthetic and
        real-world datasets demonstrate significant improvements in addressing
        biases such as length preference, sycophancy, and concept biases,
        ultimately enhancing the fairness and reliability of alignment.
        Together, these contributions advance the theoretical and practical
        foundations of AI alignment. By offering scalable and robust
        methodologies, this thesis bridges the gap between current capabilities
        and the long-term goal of developing safe and trustworthy AI systems.
        The proposed approaches not only improve alignment workflows but also
        address critical shortcomings in existing pipelines, paving the way for
        more reliable, diverse, and human-aligned AI systems.},
      url = {http://knowledge.uchicago.edu/record/14573},
      doi = {10.6082/uchicago.14573},
}