cd24c84217424e3f8bde3f51a063eefc/doc-preview/bloom_8h_source.html

 /*

  * SPDX-FileCopyrightText: 2014 Freie Universität Berlin

  * SPDX-License-Identifier: LGPL-2.1-only

  */


 #pragma once


 /*

  * bloom.c

  *

  * Bloom filters

  *

  * HISTORY

  *                                                   {x,  y,  z}

  * A Bloom filter is a probabilistic                  :   :   :

  * data structure with several interesting           /|\ /|\ /|\

  * properties, such as low memory usage,            / | X | X | \

  * asymmetric query confidence, and a very         /  |/ \|/ \|  \

  * speedy O(k) membership test.                   /   |   |   \   \

  *                                               /   /|  /|\  |\   \

  * Because a Bloom filter can                   .   . . . . . . .   .

  * accept any input that can be       00000000001000101010101010100010000000000

  * hashed effectively (such as                       "    "    "

  * strings), that membership test                     \   |   /

  * tends to draw a crowd. TANSTAAFL, but               \  |  /

  * as caveats go, the Bloom filters' are                \ | /

  * more interesting than incapacitating.                 \|/

  *                                                        :

  * Most notably, it can tell you with certainty          {w}

  * that an item 'i' is *not* a member of set 's',

  * but it can only tell you with some finite

  * probability whether an item 'i' *is* a member

  * of set 's'.

  *

  * Still, along with the intriguing possibility of using bitwise AND and OR

  * to compute the logical union and intersection of two filters, the cheap

  * cost of adding elements to the filter set, and the low memory requirements,

  * the Bloom filter is a good choice for many applications.

  *

  * NOTES

  *

  * Let's look more closely at the probability values.

  *

  * Assume that a hash function selects each array position with equal

  * probability. If m is the number of bits in the array, and k is the number

  * of hash functions, then the probability that a certain bit is not set

  * to 1 by a certain hash function during the insertion of an element is

  *

  *      1-(1/m).

  *

  * The probability that it is not set to 1 by any of the hash functions is

  *

  *      (1-(1/m))^k.

  *

  * If we have inserted n elements, the probability that a certain bit is

  * still 0 is

  *

  *      (1-(1/m))^kn,

  *

  * Meaning that the probability said bit is set to 1 is therefore

  *

  *      1-([1-(1/m)]^kn).

  *

  * Now test membership of an element that is not in the set. Each of the k

  * array positions computed by the hash functions is 1 with a probability

  * as above. The probability of all of them being 1, which would cause the

  * algorithm to erroneously claim that the element is in the set, is often

  * given as

  *

  *      (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.

  *

  * This is not strictly correct as it assumes independence for the

  * probabilities of each bit being set. However, assuming it is a close

  * approximation we have that the probability of false positives decreases

  * as m (the number of bits in the array) increases, and increases as n

  * (the number of inserted elements) increases. For a given m and n, the

  * value of k (the number of hash functions) that minimizes the probability

  * is

  *

  *      (m/n)ln(2) ~~ 0.7(m/n),

  *

  * which gives the false positive probability of

  *

  *      2^-k ~~ 0.6185^(m/n).

  *

  * The required number of bits m, given n and a desired false positive

  * probability p (and assuming the optimal value of k is used) can be

  * computed by substituting the optimal value of k in the probability

  * expression above:

  *

  *      p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),

  *

  * which simplifies to

  *

  *      ln(p) = -(m/n) * (ln2)^2.

  *

  * This results in the equation

  *

  *      m = -((n*ln(p)) / ((ln(2))^2))

  *

  * The classic filter uses

  *

  *       1.44*log2(1/eta)

  *

  * bits of space per inserted key, where eta is the false positive rate of

  * the Bloom filter.

  *

  */


 #include <stdlib.h>

 #include <stdbool.h>

 #include <stdint.h>


 #ifdef __cplusplus

 extern "C" {

 #endif


 /**
  * Hash function to use in the filter.
  *
  * Takes a pointer to a read-only byte buffer and a length @p len, and
  * returns a 32-bit unsigned hash value.
  *
  * NOTE(review): presumably @p len is the number of bytes readable through
  * the buffer pointer -- confirm against the hash implementations in use.
  */
 typedef uint32_t (*hashfp_t)(const uint8_t *, size_t len);


 /**
  * Bloom filter object.
  *
  * Bundles the filter parameters (m, k as used in the header notes above)
  * with pointers to the bit array and to the hash functions.
  */
 typedef struct {

     size_t m;        /**< number of bits in the bloom array */

     size_t k;        /**< number of hash functions */

     uint8_t *a;      /**< the bloom array */

     hashfp_t *hash;  /**< the hash functions */

 } bloom_t;


 /**
  * Initialize a Bloom filter.
  *
  * @param bloom         filter object to initialize
  * @param size          NOTE(review): presumably the number of bits in the
  *                      bloom array (cf. bloom_t::m) -- confirm in bloom.c
  * @param bitfield      NOTE(review): presumably caller-provided storage used
  *                      as the bloom array (cf. bloom_t::a) -- confirm
  * @param hashes        NOTE(review): presumably the array of hash functions
  *                      stored in bloom_t::hash -- confirm
  * @param hashes_numof  NOTE(review): presumably the number of entries in
  *                      @p hashes (cf. bloom_t::k) -- confirm; note it is an
  *                      int here while bloom_t::k is a size_t
  */
 void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof);


 /**
  * Delete a Bloom filter.
  *
  * @param bloom  the filter to delete
  *
  * NOTE(review): this header does not show whether the bit array or the
  * hash function array referenced by @p bloom are released or merely
  * detached -- check the implementation before relying on either.
  */
 void bloom_del(bloom_t *bloom);


 /**
  * Add a string to a Bloom filter.
  *
  * @param bloom  the filter to add to
  * @param buf    read-only byte buffer holding the string to add
  * @param len    NOTE(review): presumably the length of @p buf in bytes --
  *               confirm against the implementation
  */
 void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len);


 /**
  * Determine if a string is in the Bloom filter.
  *
  * As described in the notes at the top of this file, a Bloom filter can
  * state with certainty that an item is *not* in the set, but membership
  * is only reported with some finite probability (false positives occur).
  *
  * @param bloom  the filter to query
  * @param buf    read-only byte buffer holding the string to look up
  * @param len    NOTE(review): presumably the length of @p buf in bytes --
  *               confirm against the implementation
  *
  * @return NOTE(review): presumably false when the string is definitely not
  *         in the filter and true when it may be -- confirm in bloom.c
  */
 bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len);


 #ifdef __cplusplus

 }

 #endif


hashfp_t
uint32_t(* hashfp_t)(const uint8_t *, size_t len)
hash function to use in the filter
Definition: bloom.h:133

bloom_init
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof)
Initialize a Bloom Filter.

bloom_del
void bloom_del(bloom_t *bloom)
Delete a Bloom filter.

bloom_check
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len)
Determine if a string is in the Bloom filter.

bloom_add
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len)
Add a string to a Bloom filter.

bloom_t
bloom_t bloom filter object
Definition: bloom.h:138

bloom_t::m
size_t m
number of bits in the bloom array
Definition: bloom.h:140

bloom_t::k
size_t k
number of hash functions
Definition: bloom.h:142

bloom_t::hash
hashfp_t * hash
the hash functions
Definition: bloom.h:146

bloom_t::a
uint8_t * a
the bloom array
Definition: bloom.h:144