bloom.h
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: 2014 Freie Universität Berlin
3  * SPDX-License-Identifier: LGPL-2.1-only
4  */
5 
6 #pragma once
7 
8 /*
9  * bloom.c
10  *
11  * Bloom filters
12  *
13  * HISTORY
14  * {x, y, z}
15  * A Bloom filter is a probibalistic : : :
16  * data structure with several interesting /|\ /|\ /|\
17  * properties, such as low memory usage, / | X | X | \
18  * asymmetric query confidence, and a very / |/ \|/ \| \
19  * speedy O(k) membership test. / | | \ \
20  * / /| /|\ |\ \
21  * Because a Bloom filter can . . . . . . . . .
22  * accept any input that can be 00000000001000101010101010100010000000000
23  * hashed effectively (such as " " "
24  * strings), that membership test \ | /
25  * tends to draw a crowd. TNSTAAFL, but \ | /
26  * as caveats go, the Bloom filters' are \ | /
27  * more interesting than incapacitating. \|/
28  * :
29  * Most notably, it can tell you with certainty {w}
30  * that an item 'i' is *not* a member of set 's',
31  * but it can only tell you with some finite
32  * probability whether an item 'i' *is* a member
33  * of set 's'.
34  *
35  * Still, along with the intriguing possibility of using bitwise AND and OR
36  * to compute the logical union and intersection of two filters, the cheap
37  * cost of adding elements to the filter set, and the low memory requirements,
38  * the Bloom filter is a good choice for many applications.
39  *
40  * NOTES
41  *
42  * Let's look more closely at the probability values.
43  *
44  * Assume that a hash function selects each array position with equal
45  * probability. If m is the number of bits in the array, and k is the number
46  * of hash functions, then the probability that a certain bit is not set
47  * to 1 by a certain hash function during the insertion of an element is
48  *
49  * 1-(1/m).
50  *
51  * The probability that it is not set to 1 by any of the hash functions is
52  *
53  * (1-(1/m))^k.
54  *
55  * If we have inserted n elements, the probability that a certain bit is
56  * set 0 is
57  *
58  * (1-(1/m))^kn,
59  *
60  * Meaning that the probability said bit is set to 1 is therefore
61  *
62  * 1-([1-(1/m)]^kn).
63  *
64  * Now test membership of an element that is not in the set. Each of the k
65  * array positions computed by the hash functions is 1 with a probability
66  * as above. The probability of all of them being 1, which would cause the
67  * algorithm to erroneously claim that the element is in the set, is often
68  * given as
69  *
70  * (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
71  *
72  * This is not strictly correct as it assumes independence for the
73  * probabilities of each bit being set. However, assuming it is a close
74  * approximation we have that the probability of false positives decreases
75  * as m (the number of bits in the array) increases, and increases as n
76  * (the number of inserted elements) increases. For a given m and n, the
77  * value of k (the number of hash functions) that minimizes the probability
78  * is
79  *
80  * (m/n)ln(2) ~~ 0.7(m/n),
81  *
82  * which gives the false positive probability of
83  *
84  * 2^-k ~~ 0.6185^(m/n).
85  *
86  * The required number of bits m, given n and a desired false positive
87  * probability p (and assuming the optimal value of k is used) can be
88  * computed by substituting the optimal value of k in the probability
89  * expression above:
90  *
91  * p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
92  *
93  * which simplifies to
94  *
95  * ln(p) = -(m/n) * (ln2)^2.
96  *
97  * This results in the equation
98  *
99  * m = -((n*ln(p)) / ((ln(2))^2))
100  *
101  * The classic filter uses
102  *
103  * 1.44*log2(1/eta)
104  *
105  * bits of space per inserted key, where eta is the false positive rate of
106  * the Bloom filter.
107  *
108  */
109 
122 #include <stdlib.h>
123 #include <stdbool.h>
124 #include <stdint.h>
125 
126 #ifdef __cplusplus
127 extern "C" {
128 #endif
129 
133 typedef uint32_t (*hashfp_t)(const uint8_t *, size_t len);
134 
138 typedef struct {
140  size_t m;
142  size_t k;
144  uint8_t *a;
147 } bloom_t;
148 
162 void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof);
163 
169 void bloom_del(bloom_t *bloom);
170 
181 void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len);
182 
221 bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len);
222 
223 #ifdef __cplusplus
224 }
225 #endif
226 
uint32_t(* hashfp_t)(const uint8_t *, size_t len)
hash function to use in thee filter
Definition: bloom.h:133
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof)
Initialize a Bloom Filter.
void bloom_del(bloom_t *bloom)
Delete a Bloom filter.
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len)
Determine if a string is in the Bloom filter.
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len)
Add a string to a Bloom filter.
bloom_t bloom filter object
Definition: bloom.h:138
size_t m
number of bits in the bloom array
Definition: bloom.h:140
size_t k
number of hash functions
Definition: bloom.h:142
hashfp_t * hash
the hash functions
Definition: bloom.h:146
uint8_t * a
the bloom array
Definition: bloom.h:144