% Conference paper published in the WISE 2020 proceedings.
% The citation key is kept unchanged so that existing citations keep resolving.
% Names use the "von Last, First" form; non-ASCII punctuation is written with
% LaTeX ligatures so the entry stays safe under classic 8-bit BibTeX.
% NOTE(review): volume = 2 next to an LNCS series looks like the proceedings
%   part number rather than the series volume number -- confirm against the
%   publisher record.
% NOTE(review): address holds a country; the field conventionally carries the
%   publisher's city -- confirm and replace.
@inproceedings{d0b1973faf014ff3a70e748785181a4e,
  author    = {Wang, Xu and Huang, Zhisheng and van Harmelen, Frank},
  title     = {Evaluating Similarity Measures for Dataset Search},
  booktitle = {Web Information Systems Engineering -- {WISE} 2020},
  editor    = {Huang, Zhisheng and Beek, Wouter and Wang, Hua and Zhang, Yanchun and Zhou, Rui},
  series    = {Lecture Notes in Computer Science},
  volume    = {2},
  pages     = {38--51},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2020},
  isbn      = {9783030620073},
  doi       = {10.1007/978-3-030-62008-0_3},
  language  = {English},
  keywords  = {Data science, Dataset search, Google Distance, Ontology-based similarity, Semantic similarity},
  abstract  = {Dataset search engines help scientists to find research datasets for scientific experiments. Current dataset search engines are query-driven, making them limited by the appropriate specification of search queries. An alternative would be to adopt a recommendation paradigm (``if you like this dataset, you{\textquoteright}ll also like..''). Such a recommendation service requires an appropriate similarity metric between datasets. Various similarity measures have been proposed in computational linguistics and informational retrieval. The goal of this paper is to determine which similarity measure is suitable for a dataset search engine. We will report our experiments on different similarity measures over datasets. We will evaluate these similarity measures against the gold standards which are developed for Elsevier DataSearch, a commercial dataset search engine. With the help of F-measure evaluation measure and nDCG evaluation measure, we find that Wu-Palmer Similarity, a similarity measure which is based on hierarchical terminologies, can score quite good in our benchmarks.},
  note      = {21st International Conference on Web Information Systems Engineering, WISE 2020 ; Conference date: 20-10-2020 Through 24-10-2020},
}