{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,27]],"date-time":"2026-05-27T13:04:39Z","timestamp":1779887079960,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T00:00:00Z","timestamp":1724457600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"funder":[{"name":"Institute for Guo Qiang, Tsinghua University","award":["2019GQB0003"],"award-info":[{"award-number":["2019GQB0003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671564","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:55:12Z","timestamp":1724561712000},"page":"5813-5824","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["R-Eval: A Unified Toolkit for Evaluating Domain Knowledge of Retrieval Augmented Large Language Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0640-3413","authenticated-orcid":false,"given":"Shangqing","family":"Tu","sequence":"first","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3708-1381","authenticated-orcid":false,"given":"Yuanchun","family":"Wang","sequence":"additional","affiliation":[{"name":"SoI, Renmin University of China, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3430-4048","authenticated-orcid":false,"given":"Jifan","family":"Yu","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4468-3805","authenticated-orcid":false,"given":"Yuyang","family":"Xie","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8751-3316","authenticated-orcid":false,"given":"Yaran","family":"Shi","sequence":"additional","affiliation":[{"name":"SIOE, Beihang University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5727-143X","authenticated-orcid":false,"given":"Xiaozhi","family":"Wang","sequence":"additional","affiliation":[{"name":"DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3633-485X","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"additional","affiliation":[{"name":"SoI, Renmin University of China, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8907-3526","authenticated-orcid":false,"given":"Lei","family":"Hou","sequence":"additional","affiliation":[{"name":"BNRist, DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6244-0664","authenticated-orcid":false,"given":"Juanzi","family":"Li","sequence":"additional","affiliation":[{"name":"BNRist, DCST, Tsinghua University, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/221270.221321"},{"key":"e_1_3_2_2_2_1","volume-title":"Yuanzhi Li, Scott Lundberg, et al.","author":"Bubeck S\u00e9bastien","year":"2023","unstructured":"S\u00e9bastien Bubeck, Varun Chandrasekaran, Ronen Eldan, Johannes Gehrke, Eric Horvitz, Ece Kamar, Peter Lee, Yin Tat Lee, Yuanzhi Li, Scott Lundberg, et al. 2023. Sparks of artificial general intelligence: Early experiments with gpt-4. arXiv preprint arXiv:2303.12712 (2023). https:\/\/arxiv.org\/abs\/2303.12712"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.422"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1171"},{"key":"e_1_3_2_2_5_1","volume-title":"INSTRUCTEVAL: Towards Holistic Evaluation of Instruction-Tuned Large Language Models. arXiv preprint arXiv:2306.04757","author":"Chia Yew Ken","year":"2023","unstructured":"Yew Ken Chia, Pengfei Hong, Lidong Bing, and Soujanya Poria. 2023. INSTRUCTEVAL: Towards Holistic Evaluation of Instruction-Tuned Large Language Models. arXiv preprint arXiv:2306.04757 (2023). https:\/\/arxiv.org\/abs\/2306.04757"},{"key":"e_1_3_2_2_6_1","volume-title":"Ragas: Automated evaluation of retrieval augmented generation. arXiv preprint arXiv:2309.15217","author":"Es Shahul","year":"2023","unstructured":"Shahul Es, Jithin James, Luis Espinosa-Anke, and Steven Schockaert. 2023. Ragas: Automated evaluation of retrieval augmented generation. arXiv preprint arXiv:2309.15217 (2023)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-44699-3_7"},{"key":"e_1_3_2_2_8_1","volume-title":"LawBench: Benchmarking Legal Knowledge of Large Language Models. arXiv preprint arXiv:2309.16289","author":"Fei Zhiwei","year":"2023","unstructured":"Zhiwei Fei, Xiaoyu Shen, Dawei Zhu, Fengzhe Zhou, Zhuo Han, Songyang Zhang, Kai Chen, Zongwen Shen, and Jidong Ge. 2023. LawBench: Benchmarking Legal Knowledge of Large Language Models. arXiv preprint arXiv:2309.16289 (2023)."},{"key":"e_1_3_2_2_9_1","volume-title":"Alexis Chevalier, and Julius Berner.","author":"Frieder Simon","year":"2023","unstructured":"Simon Frieder, Luca Pinchetti, Ryan-Rhys Griffiths, Tommaso Salvatori, Thomas Lukasiewicz, Philipp Christian Petersen, Alexis Chevalier, and Julius Berner. 2023. Mathematical capabilities of chatgpt. arXiv preprint arXiv:2301.13867 (2023). https:\/\/arxiv.org\/abs\/2301.13867"},{"key":"e_1_3_2_2_10_1","volume-title":"International Conference on Machine Learning. PMLR, 10764--10799","author":"Gao Luyu","year":"2023","unstructured":"Luyu Gao, Aman Madaan, Shuyan Zhou, Uri Alon, Pengfei Liu, Yiming Yang, Jamie Callan, and Graham Neubig. 2023. Pal: Program-aided language models. In International Conference on Machine Learning. PMLR, 10764--10799."},{"key":"e_1_3_2_2_11_1","volume-title":"Brandon Waldon, Daniel N Rockmore, et al.","author":"Guha Neel","year":"2023","unstructured":"Neel Guha, Julian Nyarko, Daniel E Ho, Christopher R\u00e9, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N Rockmore, et al. 2023. Legalbench: A collaboratively built benchmark for measuring legal reasoning in large language models. arXiv preprint arXiv:2308.11462 (2023)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.495"},{"key":"e_1_3_2_2_13_1","volume-title":"Genegpt: Augmenting large language models with domain tools for improved access to biomedical information. ArXiv","author":"Jin Qiao","year":"2023","unstructured":"Qiao Jin, Yifan Yang, Qingyu Chen, and Zhiyong Lu. 2023. Genegpt: Augmenting large language models with domain tools for improved access to biomedical information. ArXiv (2023)."},{"key":"e_1_3_2_2_14_1","volume-title":"Decomposed prompting: A modular approach for solving complex tasks. arXiv preprint arXiv:2210.02406","author":"Khot Tushar","year":"2022","unstructured":"Tushar Khot, Harsh Trivedi, Matthew Finlayson, Yao Fu, Kyle Richardson, Peter Clark, and Ashish Sabharwal. 2022. Decomposed prompting: A modular approach for solving complex tasks. arXiv preprint arXiv:2210.02406 (2022)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"crossref","unstructured":"Jan Koco'n Igor Cichecki Oliwier Kaszyca Mateusz Kochanek Dominika Szyd\u0142o Joanna Baran Julita Bielaniewicz Marcin Gruza Arkadiusz Janz Kamil Kanclerz et al. 2023. ChatGPT: Jack of all trades master of none. arXiv preprint arXiv:2302.10724 (2023). https:\/\/arxiv.org\/abs\/2302.10724","DOI":"10.2139\/ssrn.4372889"},{"key":"e_1_3_2_2_16_1","volume-title":"A revision of Bloom's taxonomy: An overview. Theory into practice","author":"Krathwohl David R","year":"2002","unstructured":"David R Krathwohl. 2002. A revision of Bloom's taxonomy: An overview. Theory into practice, Vol. 41, 4 (2002), 212--218."},{"key":"e_1_3_2_2_17_1","volume-title":"Internet-augmented language models through few-shot prompting for open-domain question answering. arXiv preprint arXiv:2203.05115","author":"Lazaridou Angeliki","year":"2022","unstructured":"Angeliki Lazaridou, Elena Gribovskaya, Wojciech Stokowiec, and Nikolai Grigorev. 2022. Internet-augmented language models through few-shot prompting for open-domain question answering. arXiv preprint arXiv:2203.05115 (2022)."},{"key":"e_1_3_2_2_18_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive nlp tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\u00e4schel, et al. 2020. Retrieval-augmented generation for knowledge-intensive nlp tasks. Advances in Neural Information Processing Systems, Vol. 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_19_1","volume-title":"Jian-Yun Nie, and Ji-Rong Wen.","author":"Li Junyi","year":"2023","unstructured":"Junyi Li, Xiaoxue Cheng, Wayne Xin Zhao, Jian-Yun Nie, and Ji-Rong Wen. 2023. HaluEval: A Large-Scale Hallucination Evaluation Benchmark for Large Language Models. https:\/\/arxiv.org\/abs\/2305.11747"},{"key":"e_1_3_2_2_20_1","volume-title":"a Large-scale Chinese Medical QA Dataset. arXiv preprint arXiv:2305.01526","author":"Li Jianquan","year":"2023","unstructured":"Jianquan Li, Xidong Wang, Xiangbo Wu, Zhiyi Zhang, Xiaolong Xu, Jie Fu, Prayag Tiwari, Xiang Wan, and Benyou Wang. 2023. Huatuo-26M, a Large-scale Chinese Medical QA Dataset. arXiv preprint arXiv:2305.01526 (2023)."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.187"},{"key":"e_1_3_2_2_22_1","volume-title":"EcomGPT: Instruction-tuning Large Language Model with Chain-of-Task Tasks for E-commerce. arXiv preprint arXiv:2308.06966","author":"Li Yangning","year":"2023","unstructured":"Yangning Li, Shirong Ma, Xiaobin Wang, Shen Huang, Chengyue Jiang, Hai-Tao Zheng, Pengjun Xie, Fei Huang, and Yong Jiang. 2023. EcomGPT: Instruction-tuning Large Language Model with Chain-of-Task Tasks for E-commerce. arXiv preprint arXiv:2308.06966 (2023)."},{"key":"e_1_3_2_2_23_1","first-page":"21698","article-title":"Decoupled context processing for context augmented language modeling","volume":"35","author":"Li Zonglin","year":"2022","unstructured":"Zonglin Li, Ruiqi Guo, and Sanjiv Kumar. 2022. Decoupled context processing for context augmented language modeling. Advances in Neural Information Processing Systems, Vol. 35 (2022), 21698--21710.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_24_1","volume-title":"RETA-LLM: A Retrieval-Augmented Large Language Model Toolkit. arXiv preprint arXiv:2306.05212","author":"Liu Jiongnan","year":"2023","unstructured":"Jiongnan Liu, Jiajie Jin, Zihan Wang, Jiehan Cheng, Zhicheng Dou, and Ji-Rong Wen. 2023. RETA-LLM: A Retrieval-Augmented Large Language Model Toolkit. arXiv preprint arXiv:2306.05212 (2023)."},{"key":"e_1_3_2_2_25_1","volume-title":"Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172","author":"Liu Nelson F","year":"2023","unstructured":"Nelson F Liu, Kevin Lin, John Hewitt, Ashwin Paranjape, Michele Bevilacqua, Fabio Petroni, and Percy Liang. 2023. Lost in the middle: How language models use long contexts. arXiv preprint arXiv:2307.03172 (2023)."},{"key":"e_1_3_2_2_26_1","volume-title":"2023 d. ArguGPT: evaluating, understanding and identifying argumentative essays generated by GPT models. arXiv preprint arXiv:2304.07666","author":"Liu Yikang","year":"2023","unstructured":"Yikang Liu, Ziyin Zhang, Wanyang Zhang, Shisen Yue, Xiaojing Zhao, Xinyuan Cheng, Yiwen Zhang, and Hai Hu. 2023 d. ArguGPT: evaluating, understanding and identifying argumentative essays generated by GPT models. arXiv preprint arXiv:2304.07666 (2023). https:\/\/arxiv.org\/abs\/2304.07666"},{"key":"e_1_3_2_2_27_1","volume-title":"Devansh Arpit, et al.","author":"Liu Zhiwei","year":"2023","unstructured":"Zhiwei Liu, Weiran Yao, Jianguo Zhang, Le Xue, Shelby Heinecke, Rithesh Murthy, Yihao Feng, Zeyuan Chen, Juan Carlos Niebles, Devansh Arpit, et al. 2023. Bolaa: Benchmarking and orchestrating llm-augmented autonomous agents. arXiv preprint arXiv:2308.05960 (2023)."},{"key":"e_1_3_2_2_28_1","volume-title":"The knowledge level. Artificial intelligence","author":"Newell Allen","year":"1982","unstructured":"Allen Newell. 1982. The knowledge level. Artificial intelligence, Vol. 18, 1 (1982), 87--127."},{"key":"e_1_3_2_2_29_1","volume-title":"GPT-4 technical report. arXiv preprint arxiv:2303.08774","author":"AI.","year":"2023","unstructured":"OpenAI. 2023. GPT-4 technical report. arXiv preprint arxiv:2303.08774 (2023). https:\/\/arxiv.org\/pdf\/2303.08774.pdf"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.335"},{"key":"e_1_3_2_2_31_1","unstructured":"Yujia Qin Shihao Liang Yining Ye Kunlun Zhu Lan Yan Yaxi Lu Yankai Lin Xin Cong Xiangru Tang Bill Qian et al. 2023. ToolLLM: Facilitating Large Language Models to Master 16000 Real-world APIs. arXiv preprint arXiv:2307.16789 (2023)."},{"key":"e_1_3_2_2_32_1","volume-title":"Toolllm: Facilitating large language models to master 16000 real-world apis. arXiv preprint arXiv:2307.16789","author":"Qin Yujia","year":"2023","unstructured":"Yujia Qin, Shihao Liang, Yining Ye, Kunlun Zhu, Lan Yan, Yaxi Lu, Yankai Lin, Xin Cong, Xiangru Tang, Bill Qian, et al. 2023. Toolllm: Facilitating large language models to master 16000 real-world apis. arXiv preprint arXiv:2307.16789 (2023)."},{"key":"e_1_3_2_2_33_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog Vol. 1 8 (2019) 9."},{"key":"e_1_3_2_2_34_1","volume-title":"Yossi Adi, Jingyu Liu, Tal Remez, J\u00e9r\u00e9my Rapin, et al.","author":"Roziere Baptiste","year":"2023","unstructured":"Baptiste Roziere, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, J\u00e9r\u00e9my Rapin, et al. 2023. Code llama: Open foundation models for code. arXiv preprint arXiv:2308.12950 (2023)."},{"key":"e_1_3_2_2_35_1","volume-title":"Ares: An automated evaluation framework for retrieval-augmented generation systems. arXiv preprint arXiv:2311.09476","author":"Saad-Falcon Jon","year":"2023","unstructured":"Jon Saad-Falcon, Omar Khattab, Christopher Potts, and Matei Zaharia. 2023. Ares: An automated evaluation framework for retrieval-augmented generation systems. arXiv preprint arXiv:2311.09476 (2023)."},{"key":"e_1_3_2_2_36_1","volume-title":"Replug: Retrieval-augmented black-box language models. arXiv preprint arXiv:2301.12652","author":"Shi Weijia","year":"2023","unstructured":"Weijia Shi, Sewon Min, Michihiro Yasunaga, Minjoon Seo, Rich James, Mike Lewis, Luke Zettlemoyer, and Wen-tau Yih. 2023. Replug: Retrieval-augmented black-box language models. arXiv preprint arXiv:2301.12652 (2023)."},{"key":"e_1_3_2_2_37_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems.","author":"Shinn Noah","year":"2023","unstructured":"Noah Shinn, Federico Cassano, Ashwin Gopinath, Karthik R Narasimhan, and Shunyu Yao. 2023. Reflexion: Language agents with verbal reinforcement learning. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/1401890.1402008"},{"key":"e_1_3_2_2_39_1","volume-title":"A Comprehensive Survey of Hallucination Mitigation Techniques in Large Language Models. arXiv preprint arXiv:2401.01313","author":"Tonmoy SM","year":"2024","unstructured":"SM Tonmoy, SM Zaman, Vinija Jain, Anku Rani, Vipula Rawte, Aman Chadha, and Amitava Das. 2024. A Comprehensive Survey of Hallucination Mitigation Techniques in Large Language Models. arXiv preprint arXiv:2401.01313 (2024)."},{"key":"e_1_3_2_2_40_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arxiv:2302.13971","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arxiv:2302.13971 (2023). https:\/\/arxiv.org\/pdf\/2302.13971.pdf"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00475"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.557"},{"key":"e_1_3_2_2_43_1","volume-title":"A stitch in time saves nine: Detecting and mitigating hallucinations of llms by validating low-confidence generation. arXiv preprint arXiv:2307.03987","author":"Varshney Neeraj","year":"2023","unstructured":"Neeraj Varshney, Wenlin Yao, Hongming Zhang, Jianshu Chen, and Dong Yu. 2023. A stitch in time saves nine: Detecting and mitigating hallucinations of llms by validating low-confidence generation. arXiv preprint arXiv:2307.03987 (2023)."},{"key":"e_1_3_2_2_44_1","unstructured":"Cunxiang Wang Xiaoze Liu Yuanhao Yue Xiangru Tang Tianhang Zhang Cheng Jiayang Yunzhi Yao Wenyang Gao Xuming Hu Zehan Qi et al. 2023. Survey on factuality in large language models: Knowledge retrieval and domain-specificity. arXiv preprint arXiv:2310.07521 (2023)."},{"key":"e_1_3_2_2_45_1","volume-title":"Dingjie Song, Zhiyi Zhang, Zhihong Chen, Qingying Xiao, Feng Jiang, Jianquan Li, Xiang Wan, Benyou Wang, et al.","author":"Wang Xidong","year":"2023","unstructured":"Xidong Wang, Guiming Hardy Chen, Dingjie Song, Zhiyi Zhang, Zhihong Chen, Qingying Xiao, Feng Jiang, Jianquan Li, Xiang Wan, Benyou Wang, et al. 2023. Cmb: A comprehensive medical benchmark in chinese. arXiv preprint arXiv:2308.08833 (2023)."},{"key":"e_1_3_2_2_46_1","volume-title":"David Wadden","author":"Wang Yizhong","year":"2023","unstructured":"Yizhong Wang, Hamish Ivison, Pradeep Dasigi, Jack Hessel, Tushar Khot, Khyathi Raghavi Chandu, David Wadden, Kelsey MacMillan, Noah A Smith, Iz Beltagy, et al. 2023. How Far Can Camels Go? Exploring the State of Instruction Tuning on Open Resources. arXiv preprint arXiv:2306.04751 (2023)."},{"key":"e_1_3_2_2_47_1","unstructured":"Yuanchun Wang Jifan Yu Zijun Yao Jing Zhang Yuyang Xie Shangqing Tu Yiyang Fu Youhe Feng Jinkai Zhang Jingyao Zhang Bowen Huang Yuanyao Li Huihui Yuan Lei Hou Juanzi Li and Jie Tang. 2024. A Solution-based LLM API-using Methodology for Academic Information Seeking. arxiv: 2405.15165 [cs.CL]"},{"key":"e_1_3_2_2_48_1","volume-title":"RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models. arXiv preprint arXiv:2401.00396","author":"Wu Yuanhao","year":"2023","unstructured":"Yuanhao Wu, Juno Zhu, Siliang Xu, Kashun Shum, Cheng Niu, Randy Zhong, Juntong Song, and Tong Zhang. 2023. RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models. arXiv preprint arXiv:2401.00396 (2023)."},{"key":"e_1_3_2_2_49_1","volume-title":"PIXIU: A Large Language Model, Instruction Data and Evaluation Benchmark for Finance. arXiv preprint arXiv:2306.05443","author":"Xie Qianqian","year":"2023","unstructured":"Qianqian Xie, Weiguang Han, Xiao Zhang, Yanzhao Lai, Min Peng, Alejandro Lopez-Lira, and Jimin Huang. 2023. PIXIU: A Large Language Model, Instruction Data and Evaluation Benchmark for Finance. arXiv preprint arXiv:2306.05443 (2023)."},{"key":"e_1_3_2_2_50_1","volume-title":"Harnessing the Power of LLMs in Practice: A Survey on ChatGPT and Beyond. arXiv preprint arXiv:2304.13712","author":"Yang Jingfeng","year":"2023","unstructured":"Jingfeng Yang, Hongye Jin, Ruixiang Tang, Xiaotian Han, Qizhang Feng, Haoming Jiang, Bing Yin, and Xia Hu. 2023. Harnessing the Power of LLMs in Practice: A Survey on ChatGPT and Beyond. arXiv preprint arXiv:2304.13712 (2023). https:\/\/arxiv.org\/abs\/2304.13712"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1237"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_2_53_1","volume-title":"React: Synergizing reasoning and acting in language models. arXiv preprint arXiv:2210.03629","author":"Yao Shunyu","year":"2022","unstructured":"Shunyu Yao, Jeffrey Zhao, Dian Yu, Nan Du, Izhak Shafran, Karthik Narasimhan, and Yuan Cao. 2022. React: Synergizing reasoning and acting in language models. arXiv preprint arXiv:2210.03629 (2022)."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1074"},{"key":"e_1_3_2_2_55_1","unstructured":"Jifan Yu Xiaozhi Wang Shangqing Tu Shulin Cao Daniel Zhang-Li Xin Lv Hao Peng Zijun Yao Xiaohan Zhang Hanming Li et al. 2023. KoLA: Carefully Benchmarking World Knowledge of Large Language Models. arXiv preprint arXiv:2306.09296 (2023)."},{"key":"e_1_3_2_2_56_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric Xing et al. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arXiv preprint arXiv:2306.05685 (2023)."},{"key":"e_1_3_2_2_57_1","volume-title":"ToolQA: A Dataset for LLM Question Answering with External Tools. arXiv preprint arXiv:2306.13304","author":"Zhuang Yuchen","year":"2023","unstructured":"Yuchen Zhuang, Yue Yu, Kuan Wang, Haotian Sun, and Chao Zhang. 2023. ToolQA: A Dataset for LLM Question Answering with External Tools. arXiv preprint arXiv:2306.13304 (2023)."}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Barcelona Spain","acronym":"KDD '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671564","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671564","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:04:19Z","timestamp":1750291459000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671564"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":57,"alternative-id":["10.1145\/3637528.3671564","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671564","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}