Learning to deduplicate
Created by W.Langdon from
gp-bibliography.bib Revision:1.7970
@comment{
  Conference paper on learning record-deduplication similarity functions
  with genetic programming (JCDL 2006).
  NOTE(review): publisher is recorded as IEEE, but the DOI prefix 10.1145
  and the delivery.acm.org URL host suggest ACM -- confirm before changing.
  NOTE(review): the URL carries session tokens (CFID/CFTOKEN) and is
  probably stale; the DOI is the stable identifier.
}
@InProceedings{deCarvalho:2006:JCDL,
  author =       "{de Carvalho}, Moises G. and Goncalves, Marcos Andre and
                 Laender, Alberto H. F. and {da Silva}, Altigran S.",
  title =        "Learning to deduplicate",
  booktitle =    "Proceedings of the 6th {ACM/IEEE-CS} Joint Conference on
                 Digital Libraries, {JCDL} '06",
  year =         "2006",
  pages =        "41--50",
  address =      "Chapel Hill, NC, USA",
  month =        jun,
  publisher =    "IEEE",
  keywords =     "genetic algorithms, genetic programming,
                 Deduplication, Digital Libraries",
  ISBN =         "1-59593-354-9",
  URL =          "http://delivery.acm.org/10.1145/1150000/1141760/p41-decarvalho.pdf?key1=1141760&key2=6906456911&coll=GUIDE&dl=GUIDE&CFID=45325455&CFTOKEN=75817203",
  DOI =          "10.1145/1141753.1141760",
  size =         "10 pages",
  abstract =     "Identifying record replicas in digital libraries and
                 other types of digital repositories is fundamental to
                 improve the quality of their content and services as
                 well as to yield eventual sharing efforts. Several
                 deduplication strategies are available, but most of
                 them rely on manually chosen settings to combine
                 evidence used to identify records as being replicas. In
                 this paper, we present the results of experiments we
                 have carried out with a novel machine learning approach
                 we have proposed for the deduplication problem. This
                 approach, based on genetic programming (GP), is able to
                 automatically generate similarity functions to identify
                 record replicas in a given repository. The generated
                 similarity functions properly combine and weight the
                 best evidence available among the record fields in
                 order to tell when two distinct records represent the
                 same real-world entity. The results of the experiments
                 show that our approach outperforms the baseline method
                 by Fellegi and Sunter by more than 12\% when
                 identifying replicas in a data set containing
                 researcher's personal data, and by more than 7\%,
                 in a data set with article citation data.",
  notes =        "Comput. Sci. Dept., Fed. Univ. of Minas Gerais, Belo
                 Horizonte",
}
Genetic Programming entries for
Moises G de Carvalho
Marcos Andre Goncalves
Alberto H F Laender
Altigran S da Silva
Citations