From 77ca90c474f1a385e9eaa34777a2789887bc9cdc Mon Sep 17 00:00:00 2001 From: Jon Date: Mon, 14 Oct 2024 01:00:58 -0700 Subject: [PATCH 1/3] WIP: bloom filter. fix issue where t-digest default_k didn't export (dunno why) and temporarily build off master cpp branch --- CMakeLists.txt | 8 +- src/bloom_wrapper.cpp | 161 ++++++++++++++++++++++++++++++++++++++++ src/datasketches.cpp | 2 + src/tdigest_wrapper.cpp | 2 +- 4 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 src/bloom_wrapper.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e9ba96a..89839862 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,11 @@ cmake_minimum_required(VERSION 3.16.0) +# version of the code library +set(DATASKETCHES_CPP_VERSION master) + +set(CMAKE_BUILD_TYPE "Debug") + string(TIMESTAMP DT %Y%m%d UTC) string(TIMESTAMP HHMM %H%M UTC) configure_file(version.cfg.in version.cfg @ONLY) @@ -109,6 +114,7 @@ target_sources(python src/ks_wrapper.cpp src/count_wrapper.cpp src/tdigest_wrapper.cpp + src/bloom_wrapper.cpp src/vector_of_kll.cpp src/py_serde.cpp ) @@ -117,7 +123,7 @@ cmake_policy(SET CMP0097 NEW) include(ExternalProject) ExternalProject_Add(datasketches GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git - GIT_TAG 5.1.0 + GIT_TAG ${DATASKETCHES_CPP_VERSION} GIT_SHALLOW true GIT_SUBMODULES "" INSTALL_DIR /tmp/datasketches diff --git a/src/bloom_wrapper.cpp b/src/bloom_wrapper.cpp new file mode 100644 index 00000000..c40c4728 --- /dev/null +++ b/src/bloom_wrapper.cpp @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include // TODO: remove when bloom_filter_impl.hpp includes this + +#include +#include +#include + +#include "bloom_filter.hpp" +#include "common_defs.hpp" + +namespace nb = nanobind; + +void init_bloom(nb::module_ &m) { + using namespace datasketches; + + nb::class_(m, "bloom_filter") + .def_static("suggest_num_hashes", + nb::overload_cast(&bloom_filter::builder::suggest_num_hashes), + nb::arg("max_distinct_items"), + nb::arg("num_filter_bits") + ) + .def_static("suggest_num_hashes", + nb::overload_cast(&bloom_filter::builder::suggest_num_hashes), + nb::arg("target_false_positive_prob") + ) + .def_static("suggest_num_filter_bits", + nb::overload_cast(&bloom_filter::builder::suggest_num_filter_bits), + nb::arg("max_distinct_items"), + nb::arg("target_false_positive_prob") + ) + .def_static("create_by_accuracy", + [](uint64_t max_distinct_items, double target_fpp, std::optional seed) { + return bloom_filter::builder::create_by_accuracy(max_distinct_items, + target_fpp, + seed.value_or(bloom_filter::builder::generate_random_seed())); + }, + nb::arg("max_distinct_items"), + nb::arg("target_false_positive_prob"), + nb::arg("seed") = nb::none() + ) + .def_static("create_by_size", + [](uint64_t num_bits, uint16_t num_hashes, std::optional seed) { + return bloom_filter::builder::create_by_size(num_bits, + num_hashes, + seed.value_or(bloom_filter::builder::generate_random_seed())); + }, + nb::arg("num_bits"), + nb::arg("num_hashes"), + nb::arg("seed") = nb::none() + ) + + + .def_static("deserialize", + [](const nb::bytes& bytes) { return 
bloom_filter::deserialize(bytes.c_str(), bytes.size()); }, + nb::arg("bytes"), + "Reads a bytes object and returns the corresponding bloom_filter" + ) + .def_static( + "wrap", + [](const nb::bytes& bytes) { return bloom_filter::wrap(bytes.c_str(), bytes.size()); }, + nb::arg("bytes"), + "Wraps the provided bytearray as a read-only Bloom filter" + ) + .def_static( + "writable_wrap", + [](const nb::bytearray& bytearray) { + return bloom_filter::writable_wrap(const_cast(bytearray.c_str()), bytearray.size()); + }, + nb::arg("bytearray"), + "Wraps the provided bytearray as a writable Bloom filter" + ) + .def("serialize", + [](const bloom_filter& bf) { + auto bytes = bf.serialize(); + return nb::bytes(reinterpret_cast(bytes.data()), bytes.size()); + }, + "Serializes the filter into a bytes object" + ) + + .def("__copy__", [](const bloom_filter& bf){ return bloom_filter(bf); }) + .def("__str__", [](const bloom_filter& bf) { return bf.to_string(false); }, + "Produces a string summary of the filter") + .def("to_string", &bloom_filter::to_string, + nb::arg("print_filter") = false, + "Produces a string summary of the filter") + + .def("update", nb::overload_cast(&bloom_filter::update), nb::arg("item"), + "Updates the sketch with the given 64-bit integer value") + .def("update", nb::overload_cast(&bloom_filter::update), nb::arg("item"), + "Updates the sketch with the given 64-bit floating point value") + .def("update", nb::overload_cast(&bloom_filter::update), nb::arg("item"), + "Updates the sketch with the given string") + + .def("query_and_update", nb::overload_cast(&bloom_filter::query_and_update), nb::arg("item"), + "Updates the sketch with the given 64-bit integer value\n" + "and returns the value from querying prior to the update") + .def("query_and_update", nb::overload_cast(&bloom_filter::query_and_update), nb::arg("item"), + "Updates the sketch with the given 64-bit floating point value\n" + "and returns the value from querying prior to the update") + 
.def("query_and_update", nb::overload_cast(&bloom_filter::query_and_update), nb::arg("item"), + "Updates the sketch with the given string and returns\n" + "the value from querying prior to the update") + + .def("query", nb::overload_cast(&bloom_filter::query, nb::const_), nb::arg("item"), + "Queries the sketch with the given 64-bit integer value and\n" + "returns whether the value might have been seen previously") + .def("query", nb::overload_cast(&bloom_filter::query, nb::const_), nb::arg("item"), + "Queries the sketch with the given 64-bit floating point value and\n" + "returns whether the value might have been seen previously") + .def("query", nb::overload_cast(&bloom_filter::query, nb::const_), nb::arg("item"), + "Queries the sketch with the given string and returns whether the\n" + "string might have been seen previously") + + .def("union", &bloom_filter::union_with, nb::arg("other"), + "Unions two Bloom filters by applying a logical OR. The result will\n" + "recognize any values seen by either filter, as well as false positives") + .def("intersect", &bloom_filter::intersect, nb::arg("other"), + "Intersects two Bloom filters by applying a logical AND. The result will\n" + "recognize only values seen by both filters, as well as false positives") + .def("invert", &bloom_filter::invert, + "Inverts all the bits of the filter.
Approximately inverts the notion of set-membership.") + + .def("is_empty", &bloom_filter::is_empty, + "Returns True if the filter is empty, otherwise False") + .def("get_bits_used", &bloom_filter::get_bits_used, + "Returns the number of bits in the Bloom filter that are set to 1") + .def("get_capacity", &bloom_filter::get_capacity, + "Returns the total number of bits in the Bloom filter") + .def("get_num_hashes", &bloom_filter::get_num_hashes, + "Returns the configured number of hash functions for this Bloom filter") + .def("get_seed", &bloom_filter::get_seed, + "Returns the hash seed for this Bloom filter") + .def("reset", &bloom_filter::reset, + "Returns the Bloom filter to its original empty state") + + .def("get_serialized_size_bytes", [](const bloom_filter& bf) { return bf.get_serialized_size_bytes(); }, + "Returns the number of bytes needed to serialize the Bloom filter") + .def_static("get_serialized_size_bytes_given_bits", [](uint64_t num_bits) { return bloom_filter::get_serialized_size_bytes(num_bits); }, + nb::arg("num_bits"), + "Returns the number of bytes needed to serialize a Bloom filter with a capacity of num_bits") + ; + +} diff --git a/src/datasketches.cpp b/src/datasketches.cpp index 118683bc..121f9a54 100644 --- a/src/datasketches.cpp +++ b/src/datasketches.cpp @@ -40,6 +40,7 @@ void init_quantiles(nb::module_& m); void init_count_min(nb::module_& m); void init_density(nb::module_& m); void init_tdigest(nb::module_& m); +void init_bloom(nb::module_& m); void init_vector_of_kll(nb::module_& m); // supporting objects @@ -72,6 +73,7 @@ NB_MODULE(_datasketches, m) { init_count_min(m); init_density(m); init_tdigest(m); + init_bloom(m); init_vector_of_kll(m); init_kolmogorov_smirnov(m); diff --git a/src/tdigest_wrapper.cpp b/src/tdigest_wrapper.cpp index 059ee1d6..1f240dfb 100644 --- a/src/tdigest_wrapper.cpp +++ b/src/tdigest_wrapper.cpp @@ -36,7 +36,7 @@ void bind_tdigest(nb::module_ &m, const char* name) { using namespace datasketches; auto 
tdigest_class = nb::class_>(m, name) - .def(nb::init(), nb::arg("k")=tdigest::DEFAULT_K, + .def(nb::init(), nb::arg("k")=static_cast(tdigest::DEFAULT_K), "Creates a tdigest instance with the given value of k.\n\n" ":param k: Controls the size/accuracy trade-off of the sketch. Default is 200.\n" ":type k: int, optional" From ae2e5b359b455b30063e5cae518527de0c4bcaf6 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 22 Oct 2024 11:36:02 -0700 Subject: [PATCH 2/3] add additional methods to bloom wrapper --- src/bloom_wrapper.cpp | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/bloom_wrapper.cpp b/src/bloom_wrapper.cpp index c40c4728..0c23bbd5 100644 --- a/src/bloom_wrapper.cpp +++ b/src/bloom_wrapper.cpp @@ -56,6 +56,20 @@ void init_bloom(nb::module_ &m) { nb::arg("target_false_positive_prob"), nb::arg("seed") = nb::none() ) + .def_static("initialize_by_accuracy", + [](nb::bytearray& bytearray, uint64_t max_distinct_items, double target_fpp, std::optional seed) { + return bloom_filter::builder::initialize_by_accuracy( + PyByteArray_AsString(bytearray.ptr()), + bytearray.size(), + max_distinct_items, + target_fpp, + seed.value_or(bloom_filter::builder::generate_random_seed())); + }, + nb::arg("memory"), + nb::arg("max_distinct_items"), + nb::arg("target_false_positive_prob"), + nb::arg("seed") = nb::none() + ) .def_static("create_by_size", [](uint64_t num_bits, uint16_t num_hashes, std::optional seed) { return bloom_filter::builder::create_by_size(num_bits, @@ -66,6 +80,20 @@ void init_bloom(nb::module_ &m) { nb::arg("num_hashes"), nb::arg("seed") = nb::none() ) + .def_static("initialize_by_size", + [](nb::bytearray& bytearray, uint64_t num_bits, uint16_t num_hashes, std::optional seed) { + return bloom_filter::builder::initialize_by_size( + PyByteArray_AsString(bytearray.ptr()), + bytearray.size(), + num_bits, + num_hashes, + seed.value_or(bloom_filter::builder::generate_random_seed())); + }, +
nb::arg("memory"), + nb::arg("num_bits"), + nb::arg("num_hashes"), + nb::arg("seed") = nb::none() + ) .def_static("deserialize", @@ -81,7 +109,7 @@ void init_bloom(nb::module_ &m) { ) .def_static( "writable_wrap", - [](const nb::bytearray& bytearray) { + [](nb::bytearray& bytearray) { return bloom_filter::writable_wrap(const_cast(bytearray.c_str()), bytearray.size()); }, nb::arg("bytearray"), @@ -150,7 +178,16 @@ void init_bloom(nb::module_ &m) { "Returns the hash seed for this Bloom filter") .def("reset", &bloom_filter::reset, "Returns the Bloom filter to its original empty state") - + .def("is_read_only", &bloom_filter::is_read_only, + "Returns True if the Bloom filter is read-only, otherwise False") + .def("is_memory_owned", &bloom_filter::is_memory_owned, + "Returns True if the Bloom filter owns the backing array, otherwise False") + .def("is_wrapped", &bloom_filter::is_wrapped, + "Returns True if the Bloom filter was created by wrapping memory\n" + "whether writable or not, otherwise False") + .def("is_compatible", &bloom_filter::is_compatible, + nb::arg("other"), + "Returns True iff the two Bloom filters may be unioned or intersected") .def("get_serialized_size_bytes", [](const bloom_filter& bf) { return bf.get_serialized_size_bytes(); }, "Returns the number of bytes needed to serialize the Bloom filter") .def_static("get_serialized_size_bytes_given_bits", [](uint64_t num_bits) { return bloom_filter::get_serialized_size_bytes(num_bits); }, From 5e3105f2e46d46dcc86496d3f421f39a67b9b02e Mon Sep 17 00:00:00 2001 From: Jon Malkin <786705+jmalkin@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:44:10 -0700 Subject: [PATCH 3/3] modify tdigest merge input to be const --- src/tdigest_wrapper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tdigest_wrapper.cpp b/src/tdigest_wrapper.cpp index 1f240dfb..b159cce0 100644 --- a/src/tdigest_wrapper.cpp +++ b/src/tdigest_wrapper.cpp @@ -44,7 +44,7 @@ void bind_tdigest(nb::module_ &m, const char*
name) { .def("__copy__", [](const tdigest& sk) { return tdigest(sk); }) .def("update", (void(tdigest::*)(T)) &tdigest::update, nb::arg("item"), "Updates the sketch with the given value") - .def("merge", (void(tdigest::*)(tdigest&)) &tdigest::merge, nb::arg("sketch"), + .def("merge", (void(tdigest::*)(const tdigest&)) &tdigest::merge, nb::arg("sketch"), "Merges the provided sketch into this one") .def("__str__", [](const tdigest& sk) { return sk.to_string(); }, "Produces a string summary of the sketch")