Database systems use static analysis to determine up front which data is needed for answering a query, and use indexes and other physical design techniques to speed up access to that data. However, for important classes of queries, e.g., HAVING and top-k queries, it is impossible to determine up front what data is relevant. To overcome this limitation, we develop provenance-based data skipping (PBDS), a novel approach that generates provenance sketches to concisely encode what data is relevant for a query. Once a provenance sketch has been captured, it is used to speed up subsequent queries. PBDS can exploit physical design artifacts such as indexes and zone maps.
@article{NL21,
  author               = {Niu, Xing and Liu, Ziyu and Li, Pengyuan and Glavic, Boris and Gawlick, Dieter and Krishnaswamy, Vasudha and Liu, Zhen Hua and Porobic, Danica},
  keywords             = {Provenance, Data Skipping, Relevance-based Data Management},
  title                = {Provenance-based Data Skipping},
  journal              = {Proceedings of the VLDB Endowment},
  projects             = {Relevance-based Data Management},
  pages                = {451--464},
  volume               = {15},
  number               = {3},
  issue                = {3},
  year                 = {2021},
  doi                  = {10.14778/3494124.3494130},
  reproducibility      = {https://github.com/IITDBGroup/2021_pbds_reproducibility},
  reproducibilitybatch = {available},
  venueshort           = {{PVLDB}},
  pdfurl               = {https://vldb.org/pvldb/vol15/p451-niu.pdf},
  longversionurl       = {https://arxiv.org/pdf/2104.12815}
}