{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T16:46:07Z","timestamp":1755794767233,"version":"3.44.0"},"reference-count":94,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icdew67478.2025.00032","type":"proceedings-article","created":{"date-parts":[[2025,8,15]],"date-time":"2025-08-15T18:11:29Z","timestamp":1755281489000},"page":"218-229","source":"Crossref","is-referenced-by-count":0,"title":["GReaTER: Generate Realistic Tabular data after data Enhancement and Reduction"],"prefix":"10.1109","author":[{"given":"Tung Sum Thomas","family":"Kwok","sequence":"first","affiliation":[{"name":"University of California, Los Angeles,Department of Statistics and Data Science,Los Angeles,CA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chi-Hua","family":"Wang","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles,Department of Statistics and Data Science,Los Angeles,CA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guang","family":"Cheng","sequence":"additional","affiliation":[{"name":"University of California, Los Angeles,Department of Statistics and Data Science,Los Angeles,CA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cose.2017.09.002"},{"issue":"5","key":"ref2","doi-asserted-by":"crossref","first-page":"831","DOI":"10.1007\/s11633-022-1411-7","article-title":"A survey of synthetic data augmentation methods in machine vision","volume":"21","author":"Mumuni","year":"2024","journal-title":"Machine Intelligence Research"},{"key":"ref3","article-title":"Synthetic data generation for scarce road scene detection scenarios","volume-title":"NeurIPS Workshop on Synthetic Data Generation with GenerativeAI","author":"Khullar","year":"2023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1038\/s41746-023-00927-3"},{"key":"ref5","article-title":"Synthetic data applications in finance","volume-title":"Forbes Coun-cils","author":"Ribeiro","year":"2024"},{"key":"ref6","article-title":"Language models are realistic tabular data generators","volume-title":"The Eleventh International Conference on Learning Representations","author":"Borisov","year":"2023"},{"key":"ref7","article-title":"Realtabformer: Generating real-istic relational and tabular data using transformers","author":"Solatorio","year":"2023","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"DEREC-SIMPRO: Unlock Language Model Benefits to Advance Synthesis in Data Clean Room","volume-title":"ACM ICAIF Workshop","author":"Kwok","year":"2024"},{"key":"ref9","article-title":"Tabddpm: modelling tabular data with diffusion models","volume-title":"Proceedings of the 40th International Conference on Machine Learning, ser. ICML\u201923. JMLR.org","author":"Kotelnikov","year":"2023"},{"key":"ref10","article-title":"Modeling tabular data using conditional gan","author":"Xu","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/DSAA.2016.49"},{"key":"ref12","article-title":"Adapting differentially private synthetic data to relational databases","volume-title":"arXiv preprint","author":"Alimohammadi","year":"2024"},{"key":"ref13","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.naacl-long.395","article-title":"Bridging the gap between different vocabularies for llm ensemble","volume-title":"arXiv preprint","author":"Xu","year":"2024"},{"key":"ref14","article-title":"Tabmeta: Table metadata generation with LLM-curated dataset and LLM-judges","volume-title":"Submitted to ACL Rolling Review","year":"2024"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3321386"},{"issue":"2065","key":"ref16","first-page":"20150202","article-title":"Principal component analysis: A review and recent developments","volume":"374","author":"Jolliffe","year":"2016","journal-title":"Philosophical Transactions. Series A, Mathematical, Physical, and Engineering Sciences"},{"key":"ref17","article-title":"Ctr prediction - 2022 digix global ai challenge","year":"2022","journal-title":"Xiaojiu1414"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.2307\/2280095"},{"key":"ref19","article-title":"Wasserstein distance in deep learning","author":"Leo","year":"2023","journal-title":"SSRN Electron. J."},{"key":"ref20","author":"Hunner","year":"2016","journal-title":"names: A python library for generating random names"},{"key":"ref21","author":"Achiam","year":"2024","journal-title":"Gpt-4 technical report"},{"key":"ref22","year":"2024","journal-title":"Introducing llama 3.2"},{"issue":"1","key":"ref23","doi-asserted-by":"crossref","first-page":"36","DOI":"10.1145\/1188913.1188915","article-title":"The patent holder\u2019s dilemma: Buy, sell, or troll?","volume":"50","author":"Abril","year":"2007","journal-title":"Communications of the ACM"},{"issue":"2","key":"ref24","doi-asserted-by":"crossref","DOI":"10.1145\/1219092.1219093","article-title":"Deciding equivalances among conjunctive aggregate queries","volume":"54","author":"Cohen","year":"2007","journal-title":"J. ACM"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-56939-1_77"},{"key":"ref26","article-title":"Downstream task-oriented generative model selections on synthetic data training for fraud detection models","author":"Cheng","year":"2024","journal-title":"arXiv preprint"},{"key":"ref27","article-title":"On the utility recovery incapability of neural net-based differential private tabular training data synthesizer under privacy deregulation","author":"Liu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref28","article-title":"Improve fidelity and utility of synthetic credit card transaction time series from data-centric perspective","author":"Hsieh","year":"2024","journal-title":"arXiv preprint"},{"issue":"1","key":"ref29","article-title":"Modular regression: improving linear models by incorporating auxiliary data","volume":"24","author":"Jin","year":"2023","journal-title":"J. Mach. Learn. Res."},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.4103\/2395-5414.157577"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2017.2695801"},{"issue":"6","key":"ref32","doi-asserted-by":"crossref","first-page":"405","DOI":"10.1146\/annurev-statistics-030718-104938","article-title":"Statistical aspects of wasserstein distances","volume":"6","author":"Panaretos","year":"2019","journal-title":"Annual Review of Statistics and Its Application"},{"key":"ref33","article-title":"version 0.12.0","year":"2023","journal-title":"Synthetic Data Metrics, DataCebo, Inc."},{"key":"ref34","doi-asserted-by":"crossref","DOI":"10.1613\/jair.1.12125","article-title":"Confident learning: Estimating uncertainty in dataset labels","author":"Northcutt","year":"2021","journal-title":"Journal of Artificial Intelligence Research(JAIR)"},{"key":"ref35","article-title":"Gen-erating multi -label discrete patient records using generative adversarial networks","volume-title":"Machine Learning for Healthcare Conference (PMLR)","author":"Choi","year":"2017"},{"key":"ref36","article-title":"Veegan: Reducing mode collapse in gans using implicit variational learning","author":"Srivastava","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.14778\/3231751.3231757"},{"key":"ref38","article-title":"A survey and comparison of relational and non-relational database","author":"Jatana","year":"2012","journal-title":"International Journal of Engineering Research & Technology"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1002\/widm.1211"},{"key":"ref40","article-title":"A nonparametric test to detect data-copying in generative models","volume-title":"International Conference on Artificial Intelligence and Statistics","author":"Meehan","year":"2020"},{"key":"ref41","article-title":"Extracting training data from diffusion models","volume-title":"Proceedings of the 32nd USENIX Conference on Security Symposium, ser. SEC \u201923","author":"Carlini"},{"key":"ref42","article-title":"Ctab-gan: Effective table data synthesizing","volume-title":"Asian Conference on Machine Learning (PMLR)","author":"Zhao","year":"2021"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1613\/jair.953"},{"key":"ref44","article-title":"A neural proba-bilistic language model","volume":"3","author":"Bengio","year":"2002","journal-title":"Journal of machine learning research"},{"key":"ref45","author":"Radford","year":"2019","journal-title":"Language models are unsupervised multi task learners"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414142"},{"key":"ref47","first-page":"121","volume-title":"The Importance of Counting for Qualitative Research","author":"Fife","year":"2020"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1177\/1056492610375988"},{"issue":"4","key":"ref49","doi-asserted-by":"crossref","first-page":"421","DOI":"10.4103\/0970-9185.194772","article-title":"The american statistical association statement on p-values explained","volume":"32","author":"Yaddanapudi","year":"2016","journal-title":"Journal of Anaesthesiology, Clinical Pharmacology"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.2298\/FIL2006889P"},{"key":"ref51","article-title":"How data collaboration platforms can help companies build better ai","author":"Parra-Moyano","year":"2024","journal-title":"Harvard Business Review"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3490354.3494381"},{"key":"ref53","doi-asserted-by":"crossref","DOI":"10.3905\/jfds.2021.3.4.130","article-title":"On robustness of mutual funds categorization and distance metric learning","author":"Desai","year":"2021","journal-title":"The Journal of Financial Data Science"},{"issue":"2","key":"ref54","doi-asserted-by":"crossref","DOI":"10.3390\/e19020047","article-title":"On wasserstein two-sample testing and related families of nonparametric tests","volume":"19","author":"Ramdas","year":"2017","journal-title":"Entropy"},{"key":"ref55","article-title":"Fidelity and privacy of synthetic medical data","volume":"abs\/2101.08658","author":"Mendelevitch","year":"2021","journal-title":"ArXiv"},{"key":"ref56","article-title":"A non-parametric test to detect data-copying in generative models","volume":"abs\/2004.05675","author":"Meehan","year":"2020","journal-title":"ArXiv"},{"key":"ref57","article-title":"Diffusion models beat GANs on image synthesis","author":"Dhariwal","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref58","article-title":"A study on sample diversity in generative models: GANs vs. diffusion models","author":"Bayat","year":"2023","journal-title":"ICLR 2023 Tiny Papers"},{"issue":"1","key":"ref59","doi-asserted-by":"crossref","first-page":"289","DOI":"10.1111\/j.2517-6161.1995.tb02031.x","article-title":"Controlling the false discovery rate: A practical and powerful approach to multiple testing","volume":"57","author":"Benjamini","year":"1995","journal-title":"Journal of the Royal Statistical Society. Series B (Methodological)"},{"key":"ref60","doi-asserted-by":"crossref","first-page":"108","DOI":"10.1016\/j.neucom.2014.10.085","article-title":"Effect of label noise in the complexity of classification problems","volume":"160","author":"Garcia","year":"2015","journal-title":"Neurocomputing"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE60146.2024.00016"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.9785\/cri-2022-230404"},{"key":"ref63","article-title":"Llama: Open and efficient foun-dation language models","volume":"abs\/2302.13971","author":"Touvron","year":"2023","journal-title":"ArXiv"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1137\/1.9780898717655"},{"key":"ref65","first-page":"n107","article-title":"Cramer\u2019s v","volume-title":"Sage Encyclopedia of Communication Research Methods","author":"Kearney","year":"2017"},{"key":"ref66","first-page":"1090","volume-title":"Pearson\u2019s Correlation Coefficient","author":"Kirch","year":"2008"},{"key":"ref67","article-title":"Modern hierarchical, agglomerative clustering algorithms","volume":"absIl109.2378","author":"Mullner","year":"2011","journal-title":"ArXiv"},{"key":"ref68","article-title":"Differential privacy and machine learning: a survey and review","volume":"abs\/1412.7584","author":"Ji","year":"2014","journal-title":"ArXiv"},{"key":"ref69","article-title":"A multi-faceted evaluation framework for assessing synthetic data generated by large language models","author":"Yuan","year":"2024","journal-title":"arXiv preprint"},{"key":"ref70","article-title":"A comprehensive capability analysis of gpt-3 and gpt-3.5 series models","volume":"abs\/2303.10420","author":"Ye","year":"2023","journal-title":"ArXiv"},{"key":"ref71","author":"Samuels","year":"2024","journal-title":"One-hot encoding and two-hot encoding: An introduction"},{"key":"ref72","article-title":"Use case: Synthetic data generation","year":"2024","journal-title":"NVIDIA"},{"key":"ref73","article-title":"Utilizing imperfect synthetic data to improve speech recognition","author":"Hu","year":"2022","journal-title":"ICASSP"},{"key":"ref74","article-title":"Synthesized uses gen ai for compliant bigquery dataset snapshots google - cloud blog","author":"Baldin","year":"2024","journal-title":"Google Data Analytics"},{"key":"ref75","article-title":"Rethinking interpretability in the era of large language models","volume":"abs\/2402.01761","author":"Singh","year":"2024","journal-title":"ArXiv"},{"key":"ref76","doi-asserted-by":"crossref","DOI":"10.21203\/rs.3.rs-3006112\/v1","article-title":"Post hoc explanations of language models can improve language models","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems, ser. NIPS \u201923","author":"Krishna"},{"key":"ref77","article-title":"How data collaboration platforms can help companies build better ai","author":"Parra-Moyano","year":"2024","journal-title":"Harvard Business Review"},{"key":"ref78","article-title":"Introducing gpt-4o and more tools to chatgpt free users","year":"2024","journal-title":"OpenAI Blog"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/TrustCom50675.2020.00115"},{"key":"ref80","article-title":"Collaborating for an improved civil registration system to advance health and population data system in nepal","year":"2024","journal-title":"WHO Newsroom"},{"key":"ref81","article-title":"Introducing the next generation of claude","year":"2024","journal-title":"Anthropic"},{"key":"ref82","article-title":"Gpt-3.5 turbo","year":"2024","journal-title":"OpenAI"},{"key":"ref83","article-title":"Llama 2: Open foundation and fine-tuned chat models","volume":"abs\/2307.09288","author":"Touvron","year":"2023","journal-title":"ArXiv"},{"key":"ref84","first-page":"678","article-title":"Tokenization is more than compression","volume-title":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","author":"Schmidt"},{"key":"ref85","article-title":"Utility theory of synthetic data generation","author":"Xu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref86","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown","year":"2020"},{"key":"ref87","article-title":"Synthesizing tabular data using generative adversarial networks","volume":"abs\/1811.11264","author":"Xu","year":"2018","journal-title":"ArXiv"},{"key":"ref88","article-title":"Advancing retail data science: Comprehensive evaluation of synthetic data","author":"Xia","year":"2024","journal-title":"arXiv preprint"},{"key":"ref89","article-title":"Data plagiarism index: Characterizing the privacy risk of data-copying in tabular generative models","author":"Ward","year":"2024","journal-title":"arXiv preprint"},{"key":"ref90","article-title":"Data deletion for linear regression with noisy sgd","author":"Xia","year":"2024","journal-title":"arXiv preprint"},{"key":"ref91","article-title":"Federated high-dimensional online decision making","author":"Wang","journal-title":"Transactions on Machine Learning Research"},{"key":"ref92","article-title":"Badgd: A unified data-centric framework to identify gradient descent vulnerabilities","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref93","article-title":"Discriminative estimation of total variation distance: A fidelity auditor for generative data","author":"Tao","year":"2024","journal-title":"arXiv preprint"},{"key":"ref94","first-page":"217","article-title":"Online forgetting process for linear regression models","volume-title":"International Conference on Artificial Intelligence and Statistics. PMLR","author":"Li","year":"2021"}],"event":{"name":"2025 IEEE 41st International Conference on Data Engineering Workshops (ICDEW)","location":"Hong Kong, Hong Kong","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE 41st International Conference on Data Engineering Workshops (ICDEW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11108124\/11107458\/11108145.pdf?arnumber=11108145","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,16]],"date-time":"2025-08-16T05:57:56Z","timestamp":1755323876000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11108145\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":94,"URL":"https:\/\/doi.org\/10.1109\/icdew67478.2025.00032","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}