Can Synthetic Data Improve Symbolic Regression Extrapolation Performance?
Created by W.Langdon from
gp-bibliography.bib Revision:1.8506
- @InProceedings{ramlan:2025:GECCOcomp,
-
author = "Fitria Wulandari Ramlan and Colm O'Riordan and
Gabriel Kronberger and James McDermott",
-
title = "Can Synthetic Data Improve Symbolic Regression
Extrapolation Performance?",
-
booktitle = "Symbolic Regression",
-
year = "2025",
-
editor = "Gabriel Kronberger and
Fabricio {Olivetti de Franca} and William {La Cava} and Steven Gustafson",
-
pages = "2548--2555",
-
address = "Malaga, Spain",
-
series = "GECCO '25 Companion",
-
month = "14-18 " # jul,
-
organisation = "SIGEVO",
-
publisher = "Association for Computing Machinery",
-
publisher_address = "New York, NY, USA",
-
keywords = "genetic algorithms, genetic programming, symbolic
regression, extrapolation, synthetic data, data
augmentation, heterogeneous errors",
-
isbn13 = "979-8-4007-1464-1",
-
URL = "https://doi.org/10.1145/3712255.3734356",
-
DOI = "doi:10.1145/3712255.3734356",
-
size = "8 pages",
-
abstract = "Many machine learning models perform well when making
predictions within the training data range, but often
struggle when required to extrapolate beyond it.
Symbolic regression (SR) using genetic programming (GP)
can generate flexible models but is prone to unreliable
behaviour in extrapolation. This paper investigates
whether adding synthetic data can help improve
performance in such cases. We apply Kernel Density
Estimation (KDE) to identify regions in the input space
where the training data is sparse. Synthetic data is
then generated in those regions using a knowledge
distillation approach: a teacher model generates
predictions on new input points, which are then used to
train a student model. We evaluate this method across
six benchmark datasets, using neural networks (NN),
random forests (RF), and GP both as teacher models (to
generate synthetic data) and as student models (trained
on the augmented data). Results show that GP models
benefit most when trained with synthetic data from NN
and RF. The most significant improvements are observed
in extrapolation regions, while changes in
interpolation areas show only slight changes. We also
observe heterogeneous errors, where model performance
varies across different regions of the input space.
Overall, this approach offers a practical solution for
better extrapolation.",
-
notes = "GECCO-2025 SymReg workshop. A Recombination of the 34th
International Conference on Genetic Algorithms (ICGA)
and the 30th Annual Genetic Programming Conference
(GP)",
- }
Genetic Programming entries for
Fitria Wulandari Ramlan
Colm O'Riordan
Gabriel Kronberger
James McDermott
Citations