// infotheory/datagen.rs
1//! # Datagen: Synthetic Data Generators for Validation
2//!
3//! This module provides generators that produce synthetic data with known
4//! theoretical entropy values. Useful for validating entropy estimators and
5//! creating reproducible test cases.
6//!
7//! ## Example
8//! ```rust
9//! use infotheory::datagen::{bernoulli, uniform_random};
10//!
11//! // Bernoulli(0.5) has H(p) = 1.0 bit
12//! let data = bernoulli(1000, 0.5, 42);
13//!
14//! // Uniform random bytes have H = 8.0 bits/byte
15//! let uniform = uniform_random(1000, 42);
16//! ```
17
/// Simple PRNG (xorshift64) for reproducible generation.
///
/// Not cryptographically secure; intended only for deterministic test data.
struct Xorshift64 {
    // Internal state; must never be zero (the all-zero state is a fixed
    // point of the xorshift recurrence and would emit zeros forever).
    state: u64,
}

impl Xorshift64 {
    /// Create a generator from `seed`. A zero seed is remapped to a fixed
    /// non-zero constant so the generator never starts in the dead state.
    fn new(seed: u64) -> Self {
        Self {
            state: if seed == 0 { 0xDEADBEEF } else { seed },
        }
    }

    /// Advance the state and return the next 64-bit value (never 0 for a
    /// non-zero state).
    fn next(&mut self) -> u64 {
        let mut x = self.state;
        x ^= x << 13;
        x ^= x >> 7;
        x ^= x << 17;
        self.state = x;
        x
    }

    /// Return a uniform sample in the half-open interval [0, 1).
    ///
    /// Uses the top 53 bits of the raw draw so the result is an exactly
    /// representable f64 strictly below 1.0. The previous formulation,
    /// `next() as f64 / u64::MAX as f64`, was subtly wrong: `u64::MAX as
    /// f64` rounds to 2^64, and raw values within half an ulp of 2^64 also
    /// round up, so the quotient could be exactly 1.0 — which broke
    /// `sample < p` for p = 1.0 in the generators below.
    fn next_f64(&mut self) -> f64 {
        (self.next() >> 11) as f64 * (1.0 / (1u64 << 53) as f64)
    }
}
43
44// ============================================================================
45// Public Generators
46// ============================================================================
47
48/// Generate a Bernoulli bit sequence with probability `p` of each bit being 1.
49///
50/// **Theoretical entropy**: `H(p) = -p*log2(p) - (1-p)*log2(1-p)` bits/symbol.
51///
52/// Returns bytes where each byte is a sample (0 or 1).
53///
54/// # Arguments
55/// * `n` - Number of samples to generate
56/// * `p` - Probability of 1 (must be in [0, 1])
57/// * `seed` - Random seed for reproducibility
58///
59/// # Returns
60/// Vector of n bytes, each being 0 or 1.
61pub fn bernoulli(n: usize, p: f64, seed: u64) -> Vec<u8> {
62 let mut rng = Xorshift64::new(seed);
63 (0..n)
64 .map(|_| if rng.next_f64() < p { 1 } else { 0 })
65 .collect()
66}
67
/// Theoretical entropy for a Bernoulli(p) source in bits.
///
/// Degenerate sources (p at or outside the boundaries 0 and 1) are
/// deterministic and therefore carry 0 bits.
pub fn bernoulli_entropy(p: f64) -> f64 {
    if p <= 0.0 || p >= 1.0 {
        0.0
    } else {
        let q = 1.0 - p;
        -p * p.log2() - q * q.log2()
    }
}
75
76/// Generate uniform random bytes.
77///
78/// **Theoretical entropy**: 8.0 bits/byte exactly.
79///
80/// # Arguments
81/// * `n` - Number of bytes to generate
82/// * `seed` - Random seed for reproducibility
83pub fn uniform_random(n: usize, seed: u64) -> Vec<u8> {
84 let mut rng = Xorshift64::new(seed);
85 (0..n).map(|_| (rng.next() & 0xFF) as u8).collect()
86}
87
/// Generate a periodic sequence by repeating a pattern.
///
/// **Theoretical entropy rate**: Approaches 0 as context order → ∞.
/// For order >= pattern length, H_rate = 0.
///
/// # Arguments
/// * `n` - Total length of output
/// * `pattern` - The pattern to repeat
pub fn periodic(n: usize, pattern: &[u8]) -> Vec<u8> {
    match pattern {
        // An empty pattern degenerates to a constant zero sequence.
        [] => vec![0; n],
        _ => pattern.iter().copied().cycle().take(n).collect(),
    }
}
102
103/// Generate a first-order binary Markov chain.
104///
105/// **Theoretical entropy rate**: Can be computed from the stationary distribution
106/// and transition probabilities.
107///
108/// # Arguments
109/// * `n` - Number of bytes to generate (each 0 or 1)
110/// * `p00` - P(next=0 | current=0)
111/// * `p11` - P(next=1 | current=1)
112/// * `seed` - Random seed for reproducibility
113///
114/// # Returns
115/// Vector of n bytes, each being 0 or 1.
116pub fn markov_1_binary(n: usize, p00: f64, p11: f64, seed: u64) -> Vec<u8> {
117 if n == 0 {
118 return Vec::new();
119 }
120 let mut rng = Xorshift64::new(seed);
121 let mut out = Vec::with_capacity(n);
122
123 // Start with stationary distribution
124 // Stationary: pi_0 = (1-p11) / (2 - p00 - p11), pi_1 = (1-p00) / (2 - p00 - p11)
125 let denom = 2.0 - p00 - p11;
126 let pi_0 = if denom.abs() < 1e-12 {
127 0.5
128 } else {
129 (1.0 - p11) / denom
130 };
131 let mut current: u8 = if rng.next_f64() < pi_0 { 0 } else { 1 };
132 out.push(current);
133
134 for _ in 1..n {
135 let stay_prob = if current == 0 { p00 } else { p11 };
136 if rng.next_f64() < stay_prob {
137 // Stay in same state
138 } else {
139 current = 1 - current;
140 }
141 out.push(current);
142 }
143 out
144}
145
/// Theoretical entropy rate for a binary Markov chain with transition probs p00, p11.
///
/// H_rate = pi_0 * h(1-p00) + pi_1 * h(1-p11)
/// where h(p) = -p*log2(p) - (1-p)*log2(1-p) is binary entropy.
pub fn markov_1_binary_entropy_rate(p00: f64, p11: f64) -> f64 {
    // Binary entropy in bits; degenerate probabilities contribute 0.
    fn h(p: f64) -> f64 {
        if p <= 0.0 || p >= 1.0 {
            return 0.0;
        }
        -p * p.log2() - (1.0 - p) * (1.0 - p).log2()
    }

    let denom = 2.0 - p00 - p11;
    if denom.abs() < 1e-12 {
        // Degenerate case p00 = p11 = 1.0: the chain never transitions,
        // so it is deterministic and its entropy rate is 0.
        return 0.0;
    }

    // Stationary distribution of the two-state chain.
    let pi_0 = (1.0 - p11) / denom;
    let pi_1 = (1.0 - p00) / denom;

    pi_0 * h(1.0 - p00) + pi_1 * h(1.0 - p11)
}
162
163/// Generate two identical sequences (for testing identity properties).
164///
165/// MI(X,X) = H(X), NED(X,X) = 0, NTE(X,X) = 0
166pub fn identical_pair(n: usize, seed: u64) -> (Vec<u8>, Vec<u8>) {
167 let data = uniform_random(n, seed);
168 (data.clone(), data)
169}
170
171/// Generate two independent random sequences (for testing independence properties).
172///
173/// MI(X,Y) ≈ 0, NED(X,Y) ≈ 1, NTE(X,Y) ≈ 2 (for similar entropies)
174/// Generate two independent random sequences (for testing independence properties).
175///
176/// MI(X,Y) ≈ 0, NED(X,Y) ≈ 1, NTE(X,Y) ≈ 2 (for similar entropies)
177pub fn independent_pair(n: usize, seed1: u64, seed2: u64) -> (Vec<u8>, Vec<u8>) {
178 (uniform_random(n, seed1), uniform_random(n, seed2))
179}
180
181/// Generate three sequences X, Y, Z where Z = X XOR Y.
182///
183/// X, Y are independent uniform random bits.
184/// Information content: I(X;Z)=0, I(Y;Z)=0, but I(X,Y;Z) > 0.
185/// This forms a classic example where pairwise independence does not imply mutual independence.
186pub fn xor_pair(n: usize, seed: u64) -> (Vec<u8>, Vec<u8>, Vec<u8>) {
187 let x = bernoulli(n, 0.5, seed);
188 let y = bernoulli(n, 0.5, seed.wrapping_add(1337));
189 let z: Vec<u8> = x.iter().zip(y.iter()).map(|(&a, &b)| a ^ b).collect();
190 (x, y, z)
191}
192
193/// Generate a Binary Symmetric Channel (BSC) output.
194///
195/// Input X is uniform random bits. Y is X with bits flipped with probability `flip_prob`.
196/// Theoretical MI: 1 - H(flip_prob).
197pub fn noisy_channel(n: usize, flip_prob: f64, seed: u64) -> (Vec<u8>, Vec<u8>) {
198 let x = bernoulli(n, 0.5, seed);
199 let noise = bernoulli(n, flip_prob, seed.wrapping_add(9999));
200 let y: Vec<u8> = x.iter().zip(noise.iter()).map(|(&a, &b)| a ^ b).collect();
201 (x, y)
202}
203
204/// Generate a functionally dependent pair Y = f(X).
205///
206/// X is uniform random bytes. Y is a deterministic transformation of X.
207/// I(X;Y) = H(Y). H(Y|X) = 0.
208///
209/// `f` maps a byte to a byte.
210pub fn deterministic_func<F>(n: usize, seed: u64, f: F) -> (Vec<u8>, Vec<u8>)
211where
212 F: Fn(u8) -> u8,
213{
214 let x = uniform_random(n, seed);
215 let y: Vec<u8> = x.iter().map(|&b| f(b)).collect();
216 (x, y)
217}
218
/// Generate a highly compressible string (repeating pattern).
///
/// Rate-based entropy should approach 0.
pub fn highly_compressible(n: usize, period: usize) -> Vec<u8> {
    if period == 0 {
        // Degenerate period: constant zero sequence, matching `periodic`
        // with an empty pattern.
        return vec![0; n];
    }
    // Position i carries byte (i mod period) mod 256.
    (0..n).map(|i| ((i % period) % 256) as u8).collect()
}
226
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn bernoulli_generates_correct_length() {
        assert_eq!(bernoulli(100, 0.5, 42).len(), 100);
    }

    #[test]
    fn bernoulli_respects_probability() {
        // p = 1.0 forces every sample to 1; p = 0.0 forces every sample to 0.
        for &b in &bernoulli(100, 1.0, 42) {
            assert_eq!(b, 1);
        }
        for &b in &bernoulli(100, 0.0, 42) {
            assert_eq!(b, 0);
        }
    }

    #[test]
    fn bernoulli_entropy_bounds() {
        // A fair coin carries exactly one bit; degenerate sources carry none.
        assert!((bernoulli_entropy(0.5) - 1.0).abs() < 1e-10);
        assert_eq!(bernoulli_entropy(0.0), 0.0);
        assert_eq!(bernoulli_entropy(1.0), 0.0);
    }

    #[test]
    fn periodic_repeats_correctly() {
        assert_eq!(periodic(10, b"ABC"), b"ABCABCABCA");
    }

    #[test]
    fn markov_gives_valid_outputs() {
        let chain = markov_1_binary(100, 0.9, 0.9, 42);
        assert_eq!(chain.len(), 100);
        // Every symbol must be a valid binary state.
        assert!(chain.iter().all(|&s| s <= 1));
    }

    #[test]
    fn markov_entropy_rate_symmetric() {
        // p00 = p11 = 0.5 behaves like a fair coin: 1 bit per step.
        assert!((markov_1_binary_entropy_rate(0.5, 0.5) - 1.0).abs() < 1e-10);
    }

    #[test]
    fn markov_entropy_rate_deterministic() {
        // p00 = p11 = 1.0 never transitions, so the entropy rate is 0.
        let h = markov_1_binary_entropy_rate(1.0, 1.0);
        assert!(
            h.abs() < 1e-10,
            "deterministic chain should have H=0, got {}",
            h
        );
    }
}
287}