Ongoing work to reproduce Direct Preference Optimization (DPO).
If you use this work, please cite:
% Conference paper: the proceedings series name belongs in booktitle, not journal.
@inproceedings{rafailov2024direct,
  author    = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Manning, Christopher D. and Ermon, Stefano and Finn, Chelsea},
  title     = {Direct Preference Optimization: Your Language Model Is Secretly a Reward Model},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {36},
  year      = {2024},
}
% arXiv preprint: identifier kept in eprint/archiveprefix (plus url for styles without eprint support).
@misc{wang2023beyond,
  author        = {Wang, Chaoqi and Jiang, Yibo and Yang, Chenghao and Liu, Han and Chen, Yuxin},
  title         = {Beyond Reverse {KL}: Generalizing Direct Preference Optimization with Diverse Divergence Constraints},
  year          = {2023},
  eprint        = {2309.16240},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2309.16240},
}