bloom.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2014 Freie Universität Berlin
3  *
4  * This file is subject to the terms and conditions of the GNU Lesser
5  * General Public License v2.1. See the file LICENSE in the top level
6  * directory for more details.
7  */
8 
9 #pragma once
10 
11 /*
12  * bloom.c
13  *
14  * Bloom filters
15  *
16  * HISTORY
17  * {x, y, z}
18  * A Bloom filter is a probibalistic : : :
19  * data structure with several interesting /|\ /|\ /|\
20  * properties, such as low memory usage, / | X | X | \
21  * asymmetric query confidence, and a very / |/ \|/ \| \
22  * speedy O(k) membership test. / | | \ \
23  * / /| /|\ |\ \
24  * Because a Bloom filter can . . . . . . . . .
25  * accept any input that can be 00000000001000101010101010100010000000000
26  * hashed effectively (such as " " "
27  * strings), that membership test \ | /
28  * tends to draw a crowd. TNSTAAFL, but \ | /
29  * as caveats go, the Bloom filters' are \ | /
30  * more interesting than incapacitating. \|/
31  * :
32  * Most notably, it can tell you with certainty {w}
33  * that an item 'i' is *not* a member of set 's',
34  * but it can only tell you with some finite
35  * probability whether an item 'i' *is* a member
36  * of set 's'.
37  *
38  * Still, along with the intriguing possibility of using bitwise AND and OR
39  * to compute the logical union and intersection of two filters, the cheap
40  * cost of adding elements to the filter set, and the low memory requirements,
41  * the Bloom filter is a good choice for many applications.
42  *
43  * NOTES
44  *
45  * Let's look more closely at the probability values.
46  *
47  * Assume that a hash function selects each array position with equal
48  * probability. If m is the number of bits in the array, and k is the number
49  * of hash functions, then the probability that a certain bit is not set
50  * to 1 by a certain hash function during the insertion of an element is
51  *
52  * 1-(1/m).
53  *
54  * The probability that it is not set to 1 by any of the hash functions is
55  *
56  * (1-(1/m))^k.
57  *
58  * If we have inserted n elements, the probability that a certain bit is
59  * set 0 is
60  *
61  * (1-(1/m))^kn,
62  *
63  * Meaning that the probability said bit is set to 1 is therefore
64  *
65  * 1-([1-(1/m)]^kn).
66  *
67  * Now test membership of an element that is not in the set. Each of the k
68  * array positions computed by the hash functions is 1 with a probability
69  * as above. The probability of all of them being 1, which would cause the
70  * algorithm to erroneously claim that the element is in the set, is often
71  * given as
72  *
73  * (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
74  *
75  * This is not strictly correct as it assumes independence for the
76  * probabilities of each bit being set. However, assuming it is a close
77  * approximation we have that the probability of false positives decreases
78  * as m (the number of bits in the array) increases, and increases as n
79  * (the number of inserted elements) increases. For a given m and n, the
80  * value of k (the number of hash functions) that minimizes the probability
81  * is
82  *
83  * (m/n)ln(2) ~~ 0.7(m/n),
84  *
85  * which gives the false positive probability of
86  *
87  * 2^-k ~~ 0.6185^(m/n).
88  *
89  * The required number of bits m, given n and a desired false positive
90  * probability p (and assuming the optimal value of k is used) can be
91  * computed by substituting the optimal value of k in the probability
92  * expression above:
93  *
94  * p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
95  *
96  * which simplifies to
97  *
98  * ln(p) = -(m/n) * (ln2)^2.
99  *
100  * This results in the equation
101  *
102  * m = -((n*ln(p)) / ((ln(2))^2))
103  *
104  * The classic filter uses
105  *
106  * 1.44*log2(1/eta)
107  *
108  * bits of space per inserted key, where eta is the false positive rate of
109  * the Bloom filter.
110  *
111  */
112 
125 #include <stdlib.h>
126 #include <stdbool.h>
127 #include <stdint.h>
128 
129 #ifdef __cplusplus
130 extern "C" {
131 #endif
132 
136 typedef uint32_t (*hashfp_t)(const uint8_t *, size_t len);
137 
141 typedef struct {
143  size_t m;
145  size_t k;
147  uint8_t *a;
150 } bloom_t;
151 
165 void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof);
166 
172 void bloom_del(bloom_t *bloom);
173 
184 void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len);
185 
224 bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len);
225 
226 #ifdef __cplusplus
227 }
228 #endif
229 
uint32_t(* hashfp_t)(const uint8_t *, size_t len)
hash function to use in thee filter
Definition: bloom.h:136
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof)
Initialize a Bloom Filter.
void bloom_del(bloom_t *bloom)
Delete a Bloom filter.
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len)
Determine if a string is in the Bloom filter.
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len)
Add a string to a Bloom filter.
bloom_t bloom filter object
Definition: bloom.h:141
size_t m
number of bits in the bloom array
Definition: bloom.h:143
size_t k
number of hash functions
Definition: bloom.h:145
hashfp_t * hash
the hash functions
Definition: bloom.h:149
uint8_t * a
the bloom array
Definition: bloom.h:147