aboutsummaryrefslogtreecommitdiffstats
path: root/tests/zipf-test.scm
blob: 2fa99c260b7c2ef3f5685b951a563aca119f88d7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
; SPDX-FileCopyrightText: 2020 Arvydas Silanskas
; SPDX-FileCopyrightText: 2020 Linas Vepštas
; SPDX-License-Identifier: MIT
;
; zipf-test.scm
; Unit tests for the Zipf (zeta) distribution.
;
; Created by Linas Vepstas 10 July 2020
; Part of srfi-194

; srfi-133 compatible API
(define zvector-map vector-map)
(define z2vector-map vector-map)
(define zvector-fold vector-fold)

; srfi-43 compatible API
;(define (zvector-map f vec)
;  (vector-map (lambda (i x) (f x)) vec))
;(define (z2vector-map f veca vecb)
;  (vector-map (lambda (i x y) (f x y)) veca vecb))
;(define (zvector-fold f knil vec)
;  (vector-fold (lambda (i x elt) (f x elt)) knil vec))

; ------------------------------------------------------------------
; Debug utility for gnuplot graphing.
; You can use this to dump a vector to a tab-delimited file.
(define (vector-to-file vec filename)
  (define (write-vec)
    (for-each
      (lambda (i)
        (define index (+ i 1))
        (define val (vector-ref vec i))
        (display index)
        (display "  ")
        (display val)
        (newline))
      (iota (vector-length vec))))
  (with-output-to-file filename write-vec))

; ------------------------------------------------------------------
; Test harness for exploring the Zipf (Riemann/Hurwicz zeta) distribution
; parameter space.
;
;     (test-zipf TEST-ID NVOCAB ESS QUE REPS TOL)
;
; * TEST-ID -- String ID, for debugging.
; * The next three parameters are presented to the generator as
;     (make-zipf-generator NVOCAB ESS QUE)
;   ++  NVOCAB -- Size of vocabulary to select from.
;   ++  ESS -- The Riemann zeta "s" exponent.
;   ++  QUE -- The Hurwicz zeta "q" offset.
; * REPS -- The number of samples to draw from the distribution.
; * TOL -- The test tolerance, governing the expected failure rate.
;
; The algorithm is roughly:
;   Take REPS samples (make-zipf-generator NVOCAB ESS QUE)
;   Accumulate them into NVOCAB histogram bins.
;   Normalize counts to unit probability (i.e. divide by NVOCAB)
;
; The resulting distribution should uniformly converge to C/(k+q)^s
; for 1 <= k <= NVOCAB where C is a normalization constant.
;
; This compares the actual distribution to the expected convergent
; and reports an error if it is not within TOL of the convergent.
; i.e. it computes the Banach l_0 norm of (distribution-convergent)
; TOL is to be given in units of standard deviations. So, for example,
; setting TOL to 6 gives a six-sigma bandpass, allowing the tests to
; usually pass.
;
; The keyword here is "usually". The tail of the Zipf distribution is
; generally quite thin, and experiences a lot of statistical variation.
; There does not seem to be any published theory of exactly how the
; central limit theorm can be applied to estimate the distribution of
; the tail. The code below assumes an approximate gaussian distribution,
; computes it and tests it; however certain parameter ranges violate
; this assumption. Thus, this test will fail from time to time,
; depending on the luck of the draw. If it fails, it should be repeated
; once or twice, ubtil it passes.
;
(define (test-zipf-once TEST-ID NVOCAB ESS QUE REPS TOL)

  ; Default random number generator
  (define ZGEN make-zipf-generator)

  ; Bin-counter containing accumulated histogram.
  (define bin-counts
    (let ((bin-counts (make-vector NVOCAB 0)))
     ; Accumulate samples into the histogram.
     (generator-for-each
       (lambda (SAMP)
         (define offset (- SAMP 1))
         (vector-set! bin-counts offset (+ 1 (vector-ref bin-counts offset))))
       (gtake (ZGEN NVOCAB ESS QUE) REPS))
     bin-counts))

  ; Verify the distribution is within tolerance.
  ; This is written out long-hand for easier debuggability.

  ; Frequency is normalized to be 0.0 to 1.0
  (define frequency (zvector-map (lambda (n) (/ n REPS)) bin-counts))
  (define probility (zvector-map (lambda (n) (inexact n)) frequency))

  ; Sequence 1..NVOCAB
  (define seq (list->vector (iota NVOCAB 1)))

  ; Sequence  1/(k+QUE)^ESS
  (define inv-pow
    (zvector-map (lambda (k) (expt (+ k QUE) (- (inexact ESS)))) seq))

  ; Hurwicz harmonic number sum_1..NVOCAB 1/(k+QUE)^ESS
  (define hnorm
    (zvector-fold
      (lambda (sum cnt) (+ sum cnt)) 0 inv-pow))

  ; The expected distribution.
  (define expect
    (zvector-map (lambda (x) (/ x hnorm)) inv-pow))

  ; Convert to floating point.
  (define prexpect (zvector-map (lambda (x) (inexact x)) expect))

  ; The difference.
  (define diff (z2vector-map (lambda (x y) (- x y)) probility prexpect))

  ; Re-weight the tail by k^{s/2}. This seems give a normal error
  ; distribution. ... at least, for small q. Problems for large q
  ; and with undersampling; so we hack around that.
  (define err-dist
    (if (< 10 QUE) diff
        (z2vector-map (lambda (i x) (* x (expt (+ i 1) (* 0.5 ESS))))
                    (list->vector (iota (vector-length diff)))
                    diff)))

  ; Normalize to unit root-mean-square.
  (define rms (/ 1 (sqrt (* 2 3.141592653 REPS))))
  (define norm-dist (zvector-map (lambda (x) (/ x rms)) err-dist))

  ; Maximum deviation from expected distribution (l_0 norm)
  (define l0-norm
    (zvector-fold
      (lambda (sum x) (if (< sum (abs x)) (abs x) sum)) 0 norm-dist))

  ; The mean.
  (define mean
    (/ (zvector-fold (lambda (sum x) (+ sum x)) 0 norm-dist)
       NVOCAB))

  (define root-mean-square
    (sqrt (/ (zvector-fold (lambda (sum x) (+ sum (* x x))) 0 norm-dist)
             NVOCAB)))

  ; Test for uniform convergence.
  ; (test-assert TEST-ID (<= l0-norm TOL))
  (define tol-result (<= l0-norm TOL))

  ; Should not random walk too far away.
  ; Could tighten this with a proper theory of the error distribution.
  ; (test-assert TEST-ID (< (abs mean) 3))
  (define mean-result (< (abs mean) 3))
  ; I don't understand the error distribution ....
  ; (test-assert (and (< 0.4 root-mean-square) (< root-mean-square 1.5)))

  ; Sanity check. The total counts in the bins should be equal to REPS.
  ; If this fails, the test harness itself is broken.
  (test-assert TEST-ID
    (equal? REPS
            (zvector-fold
              (lambda (sum cnt) (+ sum cnt)) 0 bin-counts)))

  ; Utility debug printing
  ;(vector-to-file probility "probility.dat")
  ;(vector-to-file prexpect "prexpect.dat")
  ;(vector-to-file diff "diff.dat")

  (list tol-result mean-result))

; ------------------------------------------------------------------
; Sometimes the Zip test fails, due to random variation. It should
; pass, if attempted a second time. This gives it three chances.
; If it fails three times, then something bad is happening.
(define (test-zipf TEST-ID NVOCAB ESS QUE REPS TOL)

  ;; Three strikes, you're out.
  (define RETRY 3)

  (define num-failures
    (list-index
     (lambda (n)
       (define rc (test-zipf-once TEST-ID NVOCAB ESS QUE REPS TOL))
       (and (first rc) (second rc)))
     (iota RETRY)))

  ; `num-failures` will be #f if it failed each and every time.
  ;(if (number? num-failures)
  ;  (if (< 0 num-failures)
  ;    (format #t "Test '~A' out of bounds ~A times.\n" TEST-ID num-failures))
  ;  (format #t "Error: Test '~A' failed every time!\n" TEST-ID))

  ; Announce excessive repeated failures.
  (test-assert TEST-ID num-failures)
)

; ------------------------------------------------------------------
; Explore the parameter space.
(define (zipf-test-group)
  ; (test-begin "srfi-194-zipf")

  ; The unit test computes something that is "almost" a standard
  ; deviation for the error distribution. Except, maybe not quite,
  ; I don't fully understand the theory. So most tests seem to come
  ; in fine in well-under a six-sigma deviation, but some of the wilder
  ; parameter choices misbehave, so six-sigma doesn't always work.
  ; Also, when the number of bins is large, its easy to under-sample;
  ; some bins end up empty and the std-dev is thrown off as a result.
  ; Thus, the tolerance bounds below are hand-adjusted.
  (define six-sigma 6.0)

  (define hack-que 3.0)

  ; Zoom into s->1
  (test-zipf "zoom-1"   30 1.1     0 1000 six-sigma)
  (test-zipf "zoom-2"   30 1.01    0 1000 six-sigma)
  (test-zipf "zoom-3"   30 1.001   0 1000 six-sigma)
  (test-zipf "zoom-4"   30 1.0001  0 1000 six-sigma)
  (test-zipf "zoom-5"   30 1.00001 0 1000 six-sigma)

  (test-zipf "zoom-6"   30 (+ 1 1e-6)  0 1000 six-sigma)
  (test-zipf "zoom-8"   30 (+ 1 1e-8)  0 1000 six-sigma)
  (test-zipf "zoom-10"  30 (+ 1 1e-10) 0 1000 six-sigma)
  (test-zipf "zoom-12"  30 (+ 1 1e-12) 0 1000 six-sigma)
  (test-zipf "zoom-14"  30 (+ 1 1e-14) 0 1000 six-sigma)
  (test-zipf "zoom-inf" 30 1           0 1000 six-sigma)

  ; Verify improving uniform convergence
  (test-zipf "uniform-1" 30 1  0 10000   six-sigma)
  (test-zipf "uniform-2" 30 1  0 100000  six-sigma)

  ; Larger vocabulary
  (test-zipf "mid-voc-1" 300 1.1     0 10000 six-sigma)
  (test-zipf "mid-voc-2" 300 1.01    0 10000 six-sigma)
  (test-zipf "mid-voc-3" 300 1.001   0 10000 six-sigma)
  (test-zipf "mid-voc-4" 300 1.0001  0 10000 six-sigma)
  (test-zipf "mid-voc-5" 300 1.00001 0 10000 six-sigma)

  ; Larger vocabulary. Take more samples....
  (test-zipf "large-voc-1" 3701 1.1     0 40000 six-sigma)
  (test-zipf "large-voc-2" 3701 1.01    0 40000 six-sigma)
  (test-zipf "large-voc-3" 3701 1.001   0 40000 six-sigma)
  (test-zipf "large-voc-4" 3701 1.0001  0 40000 six-sigma)
  (test-zipf "large-voc-5" 3701 1.00001 0 40000 six-sigma)

  ; Huge vocabulary; few samples. Many bins will be empty,
  ; causing the std-dev to get large.
  (test-zipf "huge-voc-1" 43701 (+ 1 1e-6)  0 60000 9.5)
  (test-zipf "huge-voc-2" 43701 (+ 1 1e-7)  0 60000 9.5)
  (test-zipf "huge-voc-3" 43701 (+ 1 1e-9)  0 60000 9.5)
  (test-zipf "huge-voc-4" 43701 (+ 1 1e-12) 0 60000 9.5)
  (test-zipf "huge-voc-5" 43701 1           0 60000 9.5)

  ; Large s, small range
  (test-zipf "big-s-lo-1" 5 1.1     0 1000 six-sigma)
  (test-zipf "big-s-lo-2" 5 2.01    0 1000 six-sigma)
  (test-zipf "big-s-lo-3" 5 4.731   0 1000 six-sigma)
  (test-zipf "big-s-lo-4" 5 9.09001 0 1000 six-sigma)
  (test-zipf "big-s-lo-5" 5 13.45   0 1000 8.0)

  ; Large s, larger range. Most histogram bins will be empty
  ; so allow much larger error margins. There are excessively
  ; frequent large failures in this bunch.
  (test-zipf "bis-mid-1" 130 1.5     0 30000 six-sigma)
  (test-zipf "bis-mid-2" 130 2.03    0 30000 9.0)
  (test-zipf "bis-mid-3" 130 4.5     0 30000 36.0) ; This one is problematic
  (test-zipf "bis-mid-4" 130 6.66    0 30000 24.0)

  ; Verify that accuracy improves with more samples.
  (test-zipf "samp-bi-1" 129 1.1     0 10000 six-sigma)
  (test-zipf "samp-bi-2" 129 1.01    0 10000 six-sigma)
  (test-zipf "samp-bi-3" 129 1.001   0 10000 six-sigma)
  (test-zipf "samp-bi-4" 129 1.0001  0 10000 six-sigma)
  (test-zipf "samp-bi-5" 129 1.00001 0 10000 six-sigma)

  ; Non-zero Hurwicz parameter
  (test-zipf "hurw-1" 131 1.1     0.3    10000 six-sigma)
  (test-zipf "hurw-2" 131 1.1     1.3    10000 six-sigma)
  (test-zipf "hurw-3" 131 1.1     6.3    10000 six-sigma)
  (test-zipf "hurw-4" 131 1.1     20.23  10000 six-sigma)

  ; Negative Hurwicz parameter. Must be greater than branch point at -0.5.
  (test-zipf "hneg-1" 81 1.1     -0.1   1000 six-sigma)
  (test-zipf "hneg-2" 81 1.1     -0.3   1000 six-sigma)
  (test-zipf "hneg-3" 81 1.1     -0.4   1000 six-sigma)
  (test-zipf "hneg-4" 81 1.1     -0.499 1000 six-sigma)

  ; A walk into a stranger corner of the parameter space.
  (test-zipf "big-h-1" 131 1.1     41.483 10000 hack-que)
  (test-zipf "big-h-2" 131 2.1     41.483 10000 hack-que)
  (test-zipf "big-h-3" 131 6.1     41.483 10000 hack-que)
  (test-zipf "big-h-4" 131 16.1    41.483 10000 hack-que)
  (test-zipf "big-h-5" 131 46.1    41.483 10000 hack-que)
  (test-zipf "big-h-6" 131 96.1    41.483 10000 hack-que)

  ; A still wilder corner of the parameter space.
  (test-zipf "huhu-1" 131 1.1     1841.4 10000 hack-que)
  (test-zipf "huhu-2" 131 1.1     1.75e6 10000 hack-que)
  (test-zipf "huhu-3" 131 2.1     1.75e6 10000 hack-que)
  (test-zipf "huhu-4" 131 12.1    1.75e6 10000 hack-que)
  (test-zipf "huhu-5" 131 42.1    1.75e6 10000 hack-que)

  ; Lets try s less than 1
  (test-zipf "small-s-1" 35 0.9     0 1000 six-sigma)
  (test-zipf "small-s-2" 35 0.99    0 1000 six-sigma)
  (test-zipf "small-s-3" 35 0.999   0 1000 six-sigma)
  (test-zipf "small-s-4" 35 0.9999  0 1000 six-sigma)
  (test-zipf "small-s-5" 35 0.99999 0 1000 six-sigma)

  ; Attempt to force an overflow
  (test-zipf "ovfl-1" 437 (- 1 1e-6)  0 1000 six-sigma)
  (test-zipf "ovfl-2" 437 (- 1 1e-7)  0 1000 six-sigma)
  (test-zipf "ovfl-3" 437 (- 1 1e-9)  0 1000 six-sigma)
  (test-zipf "ovfl-4" 437 (- 1 1e-12) 0 1000 six-sigma)

  ; Almost flat distribution
  (test-zipf "flat-1" 36 0.8     0 1000 six-sigma)
  (test-zipf "flat-2" 36 0.5     0 1000 six-sigma)
  (test-zipf "flat-3" 36 0.1     0 1000 six-sigma)

  ; A visit to crazy-town -- increasing, not decreasing exponent
  (test-zipf "neg-s-1" 36 0.0     0 1000 six-sigma)
  (test-zipf "neg-s-2" 36 -0.1    0 1000 six-sigma)
  (test-zipf "neg-s-3" 36 -1.0    0 1000 six-sigma)
  (test-zipf "neg-s-4" 36 -3.0    0 1000 six-sigma)

  ; More crazy with some Hurwicz on top.
  (test-zipf "neg-shu-1" 16 0.0     0.5 1000 six-sigma)
  (test-zipf "neg-shu-2" 16 -0.2    2.5 1000 six-sigma)
  (test-zipf "neg-shu-3" 16 -1.3    10  1000 six-sigma)
  (test-zipf "neg-shu-4" 16 -2.9    100 1000 six-sigma)

  ; (test-end "srfi-194-zipf")
  )