// infotheory/datagen.rs
//! # Datagen: Synthetic Data Generators for Validation
//!
//! This module provides generators that produce synthetic data with known
//! theoretical entropy values. Useful for validating entropy estimators and
//! creating reproducible test cases.
//!
//! ## Example
//! ```rust
//! use infotheory::datagen::{bernoulli, uniform_random};
//!
//! // Bernoulli(0.5) has H(p) = 1.0 bit
//! let data = bernoulli(1000, 0.5, 42);
//!
//! // Uniform random bytes have H = 8.0 bits/byte
//! let uniform = uniform_random(1000, 42);
//! ```
/// Simple PRNG (xorshift64) for reproducible generation.
///
/// Not cryptographically secure; intended only for deterministic test data.
struct Xorshift64 {
    state: u64,
}

impl Xorshift64 {
    /// Create a generator from `seed`.
    ///
    /// A zero seed would make xorshift64 emit zeros forever (zero is a fixed
    /// point of the update), so it is remapped to a fixed nonzero constant.
    fn new(seed: u64) -> Self {
        Self {
            state: if seed == 0 { 0xDEADBEEF } else { seed },
        }
    }

    /// Advance the state and return the next 64-bit value (xorshift64 step).
    fn next(&mut self) -> u64 {
        let mut x = self.state;
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        self.state = x;
        x
    }

    /// Uniform `f64` in the half-open interval [0, 1).
    ///
    /// Uses the top 53 bits (the f64 mantissa width) divided by 2^53, so the
    /// result is exactly representable and can never equal 1.0. The previous
    /// form, `next() as f64 / u64::MAX as f64`, rounds large outputs up to
    /// exactly 1.0 (since `u64::MAX as f64` == 2^64), which breaks
    /// comparisons like `next_f64() < p` when `p == 1.0`.
    fn next_f64(&mut self) -> f64 {
        (self.next() >> 11) as f64 / (1u64 << 53) as f64
    }
}
// ============================================================================
// Public Generators
// ============================================================================
48/// Generate a Bernoulli bit sequence with probability `p` of each bit being 1.
49///
50/// **Theoretical entropy**: `H(p) = -p*log2(p) - (1-p)*log2(1-p)` bits/symbol.
51///
52/// Returns bytes where each byte is a sample (0 or 1).
53///
54/// # Arguments
55/// * `n` - Number of samples to generate
56/// * `p` - Probability of 1 (must be in [0, 1])
57/// * `seed` - Random seed for reproducibility
58///
59/// # Returns
60/// Vector of n bytes, each being 0 or 1.
61pub fn bernoulli(n: usize, p: f64, seed: u64) -> Vec<u8> {
62    let mut rng = Xorshift64::new(seed);
63    (0..n)
64        .map(|_| if rng.next_f64() < p { 1 } else { 0 })
65        .collect()
66}
67
/// Theoretical entropy for a Bernoulli(p) source in bits.
///
/// Returns 0.0 for degenerate (or out-of-range) `p`, where the source is
/// deterministic and carries no information.
pub fn bernoulli_entropy(p: f64) -> f64 {
    if p <= 0.0 || p >= 1.0 {
        return 0.0;
    }
    let q = 1.0 - p;
    -(p * p.log2() + q * q.log2())
}
76/// Generate uniform random bytes.
77///
78/// **Theoretical entropy**: 8.0 bits/byte exactly.
79///
80/// # Arguments
81/// * `n` - Number of bytes to generate
82/// * `seed` - Random seed for reproducibility
83pub fn uniform_random(n: usize, seed: u64) -> Vec<u8> {
84    let mut rng = Xorshift64::new(seed);
85    (0..n).map(|_| (rng.next() & 0xFF) as u8).collect()
86}
87
/// Generate a periodic sequence by repeating a pattern.
///
/// **Theoretical entropy rate**: Approaches 0 as context order → ∞.
/// For order >= pattern length, H_rate = 0.
///
/// # Arguments
/// * `n` - Total length of output
/// * `pattern` - The pattern to repeat
pub fn periodic(n: usize, pattern: &[u8]) -> Vec<u8> {
    match pattern {
        // An empty pattern degenerates to the constant all-zero sequence.
        [] => vec![0; n],
        _ => pattern.iter().copied().cycle().take(n).collect(),
    }
}
103/// Generate a first-order binary Markov chain.
104///
105/// **Theoretical entropy rate**: Can be computed from the stationary distribution
106/// and transition probabilities.
107///
108/// # Arguments
109/// * `n` - Number of bytes to generate (each 0 or 1)
110/// * `p00` - P(next=0 | current=0)
111/// * `p11` - P(next=1 | current=1)
112/// * `seed` - Random seed for reproducibility
113///
114/// # Returns
115/// Vector of n bytes, each being 0 or 1.
116pub fn markov_1_binary(n: usize, p00: f64, p11: f64, seed: u64) -> Vec<u8> {
117    if n == 0 {
118        return Vec::new();
119    }
120    let mut rng = Xorshift64::new(seed);
121    let mut out = Vec::with_capacity(n);
122
123    // Start with stationary distribution
124    // Stationary: pi_0 = (1-p11) / (2 - p00 - p11), pi_1 = (1-p00) / (2 - p00 - p11)
125    let denom = 2.0 - p00 - p11;
126    let pi_0 = if denom.abs() < 1e-12 {
127        0.5
128    } else {
129        (1.0 - p11) / denom
130    };
131    let mut current: u8 = if rng.next_f64() < pi_0 { 0 } else { 1 };
132    out.push(current);
133
134    for _ in 1..n {
135        let stay_prob = if current == 0 { p00 } else { p11 };
136        if rng.next_f64() < stay_prob {
137            // Stay in same state
138        } else {
139            current = 1 - current;
140        }
141        out.push(current);
142    }
143    out
144}
145
146/// Theoretical entropy rate for a binary Markov chain with transition probs p00, p11.
147///
148/// H_rate = pi_0 * h(1-p00) + pi_1 * h(1-p11)
149/// where h(p) = -p*log2(p) - (1-p)*log2(1-p) is binary entropy.
150pub fn markov_1_binary_entropy_rate(p00: f64, p11: f64) -> f64 {
151    let denom = 2.0 - p00 - p11;
152    if denom.abs() < 1e-12 {
153        // Degenerate case: p00 = p11 = 1.0 means deterministic (always stay)
154        // This has entropy rate = 0
155        return 0.0;
156    }
157    let pi_0 = (1.0 - p11) / denom;
158    let pi_1 = (1.0 - p00) / denom;
159
160    pi_0 * bernoulli_entropy(1.0 - p00) + pi_1 * bernoulli_entropy(1.0 - p11)
161}
162
163/// Generate two identical sequences (for testing identity properties).
164///
165/// MI(X,X) = H(X), NED(X,X) = 0, NTE(X,X) = 0
166pub fn identical_pair(n: usize, seed: u64) -> (Vec<u8>, Vec<u8>) {
167    let data = uniform_random(n, seed);
168    (data.clone(), data)
169}
170
171/// Generate two independent random sequences (for testing independence properties).
172///
173/// MI(X,Y) ≈ 0, NED(X,Y) ≈ 1, NTE(X,Y) ≈ 2 (for similar entropies)
174/// Generate two independent random sequences (for testing independence properties).
175///
176/// MI(X,Y) ≈ 0, NED(X,Y) ≈ 1, NTE(X,Y) ≈ 2 (for similar entropies)
177pub fn independent_pair(n: usize, seed1: u64, seed2: u64) -> (Vec<u8>, Vec<u8>) {
178    (uniform_random(n, seed1), uniform_random(n, seed2))
179}
180
181/// Generate three sequences X, Y, Z where Z = X XOR Y.
182///
183/// X, Y are independent uniform random bits.
184/// Information content: I(X;Z)=0, I(Y;Z)=0, but I(X,Y;Z) > 0.
185/// This forms a classic example where pairwise independence does not imply mutual independence.
186pub fn xor_pair(n: usize, seed: u64) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
187    let x = bernoulli(n, 0.5, seed);
188    let y = bernoulli(n, 0.5, seed.wrapping_add(1337));
189    let z: Vec<u8> = x.iter().zip(y.iter()).map(|(&a, &b)| a ^ b).collect();
190    (x, y, z)
191}
192
193/// Generate a Binary Symmetric Channel (BSC) output.
194///
195/// Input X is uniform random bits. Y is X with bits flipped with probability `flip_prob`.
196/// Theoretical MI: 1 - H(flip_prob).
197pub fn noisy_channel(n: usize, flip_prob: f64, seed: u64) -> (Vec<u8>, Vec<u8>) {
198    let x = bernoulli(n, 0.5, seed);
199    let noise = bernoulli(n, flip_prob, seed.wrapping_add(9999));
200    let y: Vec<u8> = x.iter().zip(noise.iter()).map(|(&a, &b)| a ^ b).collect();
201    (x, y)
202}
203
204/// Generate a functionally dependent pair Y = f(X).
205///
206/// X is uniform random bytes. Y is a deterministic transformation of X.
207/// I(X;Y) = H(Y). H(Y|X) = 0.
208///
209/// `f` maps a byte to a byte.
210pub fn deterministic_func<F>(n: usize, seed: u64, f: F) -> (Vec<u8>, Vec<u8>)
211where
212    F: Fn(u8) -> u8,
213{
214    let x = uniform_random(n, seed);
215    let y: Vec<u8> = x.iter().map(|&b| f(b)).collect();
216    (x, y)
217}
218
219/// Generate a highly compressible string (repeating pattern).
220///
221/// Rate-based entropy should approach 0.
222pub fn highly_compressible(n: usize, period: usize) -> Vec<u8> {
223    let pattern: Vec<u8> = (0..period).map(|i| (i % 256) as u8).collect();
224    periodic(n, &pattern)
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn bernoulli_generates_correct_length() {
        assert_eq!(bernoulli(100, 0.5, 42).len(), 100);
    }

    #[test]
    fn bernoulli_respects_probability() {
        // p = 1.0 must yield only ones; p = 0.0 only zeros.
        assert!(bernoulli(100, 1.0, 42).into_iter().all(|b| b == 1));
        assert!(bernoulli(100, 0.0, 42).into_iter().all(|b| b == 0));
    }

    #[test]
    fn bernoulli_entropy_bounds() {
        assert!((bernoulli_entropy(0.5) - 1.0).abs() < 1e-10);
        assert_eq!(bernoulli_entropy(0.0), 0.0);
        assert_eq!(bernoulli_entropy(1.0), 0.0);
    }

    #[test]
    fn periodic_repeats_correctly() {
        assert_eq!(periodic(10, b"ABC"), b"ABCABCABCA".to_vec());
    }

    #[test]
    fn markov_gives_valid_outputs() {
        let chain = markov_1_binary(100, 0.9, 0.9, 42);
        assert_eq!(chain.len(), 100);
        assert!(chain.into_iter().all(|b| b < 2));
    }

    #[test]
    fn markov_entropy_rate_symmetric() {
        // p00 = p11 = 0.5 behaves like a fair coin: 1 bit per symbol.
        let h = markov_1_binary_entropy_rate(0.5, 0.5);
        assert!((h - 1.0).abs() < 1e-10);
    }

    #[test]
    fn markov_entropy_rate_deterministic() {
        // p00 = p11 = 1.0: no transition ever happens, so the rate is 0.
        let h = markov_1_binary_entropy_rate(1.0, 1.0);
        assert!(
            h.abs() < 1e-10,
            "deterministic chain should have H=0, got {}",
            h
        );
    }
}