diff options
| author | 2025-01-10 16:22:02 -0500 | |
|---|---|---|
| committer | 2025-01-10 16:22:02 -0500 | |
| commit | cd78c57222dfe6ad9906b95e6c0353629469c10f (patch) | |
| tree | f0932a3d2f2be771f4ae6f4f528d6a6eecdb58c0 /tests/zipf-test.scm | |
init
Diffstat (limited to 'tests/zipf-test.scm')
| -rw-r--r-- | tests/zipf-test.scm | 340 |
1 files changed, 340 insertions, 0 deletions
diff --git a/tests/zipf-test.scm b/tests/zipf-test.scm new file mode 100644 index 0000000..2fa99c2 --- /dev/null +++ b/tests/zipf-test.scm @@ -0,0 +1,340 @@ +; SPDX-FileCopyrightText: 2020 Arvydas Silanskas +; SPDX-FileCopyrightText: 2020 Linas Vepštas +; SPDX-License-Identifier: MIT +; +; zipf-test.scm +; Unit tests for the Zipf (zeta) distribution. +; +; Created by Linas Vepstas 10 July 2020 +; Part of srfi-194 + +; srfi-133 compatible API +(define zvector-map vector-map) +(define z2vector-map vector-map) +(define zvector-fold vector-fold) + +; srfi-43 compatible API +;(define (zvector-map f vec) +; (vector-map (lambda (i x) (f x)) vec)) +;(define (z2vector-map f veca vecb) +; (vector-map (lambda (i x y) (f x y)) veca vecb)) +;(define (zvector-fold f knil vec) +; (vector-fold (lambda (i x elt) (f x elt)) knil vec)) + +; ------------------------------------------------------------------ +; Debug utility for gnuplot graphing. +; You can use this to dump a vector to a tab-delimited file. +(define (vector-to-file vec filename) + (define (write-vec) + (for-each + (lambda (i) + (define index (+ i 1)) + (define val (vector-ref vec i)) + (display index) + (display " ") + (display val) + (newline)) + (iota (vector-length vec)))) + (with-output-to-file filename write-vec)) + +; ------------------------------------------------------------------ +; Test harness for exploring the Zipf (Riemann/Hurwicz zeta) distribution +; parameter space. +; +; (test-zipf TEST-ID NVOCAB ESS QUE REPS TOL) +; +; * TEST-ID -- String ID, for debugging. +; * The next three parameters are presented to the generator as +; (make-zipf-generator NVOCAB ESS QUE) +; ++ NVOCAB -- Size of vocabulary to select from. +; ++ ESS -- The Riemann zeta "s" exponent. +; ++ QUE -- The Hurwicz zeta "q" offset. +; * REPS -- The number of samples to draw from the distribution. +; * TOL -- The test tolerance, governing the expected failure rate. +; +; The algorithm is roughly: +; Take REPS samples (make-zipf-generator NVOCAB ESS QUE) +; Accumulate them into NVOCAB histogram bins. +; Normalize counts to unit probability (i.e. divide by NVOCAB) +; +; The resulting distribution should uniformly converge to C/(k+q)^s +; for 1 <= k <= NVOCAB where C is a normalization constant. +; +; This compares the actual distribution to the expected convergent +; and reports an error if it is not within TOL of the convergent. +; i.e. it computes the Banach l_0 norm of (distribution-convergent) +; TOL is to be given in units of standard deviations. So, for example, +; setting TOL to 6 gives a six-sigma bandpass, allowing the tests to +; usually pass. +; +; The keyword here is "usually". The tail of the Zipf distribution is +; generally quite thin, and experiences a lot of statistical variation. +; There does not seem to be any published theory of exactly how the +; central limit theorm can be applied to estimate the distribution of +; the tail. The code below assumes an approximate gaussian distribution, +; computes it and tests it; however certain parameter ranges violate +; this assumption. Thus, this test will fail from time to time, +; depending on the luck of the draw. If it fails, it should be repeated +; once or twice, ubtil it passes. +; +(define (test-zipf-once TEST-ID NVOCAB ESS QUE REPS TOL) + + ; Default random number generator + (define ZGEN make-zipf-generator) + + ; Bin-counter containing accumulated histogram. + (define bin-counts + (let ((bin-counts (make-vector NVOCAB 0))) + ; Accumulate samples into the histogram. + (generator-for-each + (lambda (SAMP) + (define offset (- SAMP 1)) + (vector-set! bin-counts offset (+ 1 (vector-ref bin-counts offset)))) + (gtake (ZGEN NVOCAB ESS QUE) REPS)) + bin-counts)) + + ; Verify the distribution is within tolerance. + ; This is written out long-hand for easier debuggability. + + ; Frequency is normalized to be 0.0 to 1.0 + (define frequency (zvector-map (lambda (n) (/ n REPS)) bin-counts)) + (define probility (zvector-map (lambda (n) (inexact n)) frequency)) + + ; Sequence 1..NVOCAB + (define seq (list->vector (iota NVOCAB 1))) + + ; Sequence 1/(k+QUE)^ESS + (define inv-pow + (zvector-map (lambda (k) (expt (+ k QUE) (- (inexact ESS)))) seq)) + + ; Hurwicz harmonic number sum_1..NVOCAB 1/(k+QUE)^ESS + (define hnorm + (zvector-fold + (lambda (sum cnt) (+ sum cnt)) 0 inv-pow)) + + ; The expected distribution. + (define expect + (zvector-map (lambda (x) (/ x hnorm)) inv-pow)) + + ; Convert to floating point. + (define prexpect (zvector-map (lambda (x) (inexact x)) expect)) + + ; The difference. + (define diff (z2vector-map (lambda (x y) (- x y)) probility prexpect)) + + ; Re-weight the tail by k^{s/2}. This seems give a normal error + ; distribution. ... at least, for small q. Problems for large q + ; and with undersampling; so we hack around that. + (define err-dist + (if (< 10 QUE) diff + (z2vector-map (lambda (i x) (* x (expt (+ i 1) (* 0.5 ESS)))) + (list->vector (iota (vector-length diff))) + diff))) + + ; Normalize to unit root-mean-square. + (define rms (/ 1 (sqrt (* 2 3.141592653 REPS)))) + (define norm-dist (zvector-map (lambda (x) (/ x rms)) err-dist)) + + ; Maximum deviation from expected distribution (l_0 norm) + (define l0-norm + (zvector-fold + (lambda (sum x) (if (< sum (abs x)) (abs x) sum)) 0 norm-dist)) + + ; The mean. + (define mean + (/ (zvector-fold (lambda (sum x) (+ sum x)) 0 norm-dist) + NVOCAB)) + + (define root-mean-square + (sqrt (/ (zvector-fold (lambda (sum x) (+ sum (* x x))) 0 norm-dist) + NVOCAB))) + + ; Test for uniform convergence. + ; (test-assert TEST-ID (<= l0-norm TOL)) + (define tol-result (<= l0-norm TOL)) + + ; Should not random walk too far away. + ; Could tighten this with a proper theory of the error distribution. + ; (test-assert TEST-ID (< (abs mean) 3)) + (define mean-result (< (abs mean) 3)) + ; I don't understand the error distribution .... + ; (test-assert (and (< 0.4 root-mean-square) (< root-mean-square 1.5))) + + ; Sanity check. The total counts in the bins should be equal to REPS. + ; If this fails, the test harness itself is broken. + (test-assert TEST-ID + (equal? REPS + (zvector-fold + (lambda (sum cnt) (+ sum cnt)) 0 bin-counts))) + + ; Utility debug printing + ;(vector-to-file probility "probility.dat") + ;(vector-to-file prexpect "prexpect.dat") + ;(vector-to-file diff "diff.dat") + + (list tol-result mean-result)) + +; ------------------------------------------------------------------ +; Sometimes the Zip test fails, due to random variation. It should +; pass, if attempted a second time. This gives it three chances. +; If it fails three times, then something bad is happening. +(define (test-zipf TEST-ID NVOCAB ESS QUE REPS TOL) + + ;; Three strikes, you're out. + (define RETRY 3) + + (define num-failures + (list-index + (lambda (n) + (define rc (test-zipf-once TEST-ID NVOCAB ESS QUE REPS TOL)) + (and (first rc) (second rc))) + (iota RETRY))) + + ; `num-failures` will be #f if it failed each and every time. + ;(if (number? num-failures) + ; (if (< 0 num-failures) + ; (format #t "Test '~A' out of bounds ~A times.\n" TEST-ID num-failures)) + ; (format #t "Error: Test '~A' failed every time!\n" TEST-ID)) + + ; Announce excessive repeated failures. + (test-assert TEST-ID num-failures) +) + +; ------------------------------------------------------------------ +; Explore the parameter space. +(define (zipf-test-group) + ; (test-begin "srfi-194-zipf") + + ; The unit test computes something that is "almost" a standard + ; deviation for the error distribution. Except, maybe not quite, + ; I don't fully understand the theory. So most tests seem to come + ; in fine in well-under a six-sigma deviation, but some of the wilder + ; parameter choices misbehave, so six-sigma doesn't always work. + ; Also, when the number of bins is large, its easy to under-sample; + ; some bins end up empty and the std-dev is thrown off as a result. + ; Thus, the tolerance bounds below are hand-adjusted. + (define six-sigma 6.0) + + (define hack-que 3.0) + + ; Zoom into s->1 + (test-zipf "zoom-1" 30 1.1 0 1000 six-sigma) + (test-zipf "zoom-2" 30 1.01 0 1000 six-sigma) + (test-zipf "zoom-3" 30 1.001 0 1000 six-sigma) + (test-zipf "zoom-4" 30 1.0001 0 1000 six-sigma) + (test-zipf "zoom-5" 30 1.00001 0 1000 six-sigma) + + (test-zipf "zoom-6" 30 (+ 1 1e-6) 0 1000 six-sigma) + (test-zipf "zoom-8" 30 (+ 1 1e-8) 0 1000 six-sigma) + (test-zipf "zoom-10" 30 (+ 1 1e-10) 0 1000 six-sigma) + (test-zipf "zoom-12" 30 (+ 1 1e-12) 0 1000 six-sigma) + (test-zipf "zoom-14" 30 (+ 1 1e-14) 0 1000 six-sigma) + (test-zipf "zoom-inf" 30 1 0 1000 six-sigma) + + ; Verify improving uniform convergence + (test-zipf "uniform-1" 30 1 0 10000 six-sigma) + (test-zipf "uniform-2" 30 1 0 100000 six-sigma) + + ; Larger vocabulary + (test-zipf "mid-voc-1" 300 1.1 0 10000 six-sigma) + (test-zipf "mid-voc-2" 300 1.01 0 10000 six-sigma) + (test-zipf "mid-voc-3" 300 1.001 0 10000 six-sigma) + (test-zipf "mid-voc-4" 300 1.0001 0 10000 six-sigma) + (test-zipf "mid-voc-5" 300 1.00001 0 10000 six-sigma) + + ; Larger vocabulary. Take more samples.... + (test-zipf "large-voc-1" 3701 1.1 0 40000 six-sigma) + (test-zipf "large-voc-2" 3701 1.01 0 40000 six-sigma) + (test-zipf "large-voc-3" 3701 1.001 0 40000 six-sigma) + (test-zipf "large-voc-4" 3701 1.0001 0 40000 six-sigma) + (test-zipf "large-voc-5" 3701 1.00001 0 40000 six-sigma) + + ; Huge vocabulary; few samples. Many bins will be empty, + ; causing the std-dev to get large. + (test-zipf "huge-voc-1" 43701 (+ 1 1e-6) 0 60000 9.5) + (test-zipf "huge-voc-2" 43701 (+ 1 1e-7) 0 60000 9.5) + (test-zipf "huge-voc-3" 43701 (+ 1 1e-9) 0 60000 9.5) + (test-zipf "huge-voc-4" 43701 (+ 1 1e-12) 0 60000 9.5) + (test-zipf "huge-voc-5" 43701 1 0 60000 9.5) + + ; Large s, small range + (test-zipf "big-s-lo-1" 5 1.1 0 1000 six-sigma) + (test-zipf "big-s-lo-2" 5 2.01 0 1000 six-sigma) + (test-zipf "big-s-lo-3" 5 4.731 0 1000 six-sigma) + (test-zipf "big-s-lo-4" 5 9.09001 0 1000 six-sigma) + (test-zipf "big-s-lo-5" 5 13.45 0 1000 8.0) + + ; Large s, larger range. Most histogram bins will be empty + ; so allow much larger error margins. There are excessively + ; frequent large failures in this bunch. + (test-zipf "bis-mid-1" 130 1.5 0 30000 six-sigma) + (test-zipf "bis-mid-2" 130 2.03 0 30000 9.0) + (test-zipf "bis-mid-3" 130 4.5 0 30000 36.0) ; This one is problematic + (test-zipf "bis-mid-4" 130 6.66 0 30000 24.0) + + ; Verify that accuracy improves with more samples. + (test-zipf "samp-bi-1" 129 1.1 0 10000 six-sigma) + (test-zipf "samp-bi-2" 129 1.01 0 10000 six-sigma) + (test-zipf "samp-bi-3" 129 1.001 0 10000 six-sigma) + (test-zipf "samp-bi-4" 129 1.0001 0 10000 six-sigma) + (test-zipf "samp-bi-5" 129 1.00001 0 10000 six-sigma) + + ; Non-zero Hurwicz parameter + (test-zipf "hurw-1" 131 1.1 0.3 10000 six-sigma) + (test-zipf "hurw-2" 131 1.1 1.3 10000 six-sigma) + (test-zipf "hurw-3" 131 1.1 6.3 10000 six-sigma) + (test-zipf "hurw-4" 131 1.1 20.23 10000 six-sigma) + + ; Negative Hurwicz parameter. Must be greater than branch point at -0.5. + (test-zipf "hneg-1" 81 1.1 -0.1 1000 six-sigma) + (test-zipf "hneg-2" 81 1.1 -0.3 1000 six-sigma) + (test-zipf "hneg-3" 81 1.1 -0.4 1000 six-sigma) + (test-zipf "hneg-4" 81 1.1 -0.499 1000 six-sigma) + + ; A walk into a stranger corner of the parameter space. + (test-zipf "big-h-1" 131 1.1 41.483 10000 hack-que) + (test-zipf "big-h-2" 131 2.1 41.483 10000 hack-que) + (test-zipf "big-h-3" 131 6.1 41.483 10000 hack-que) + (test-zipf "big-h-4" 131 16.1 41.483 10000 hack-que) + (test-zipf "big-h-5" 131 46.1 41.483 10000 hack-que) + (test-zipf "big-h-6" 131 96.1 41.483 10000 hack-que) + + ; A still wilder corner of the parameter space. + (test-zipf "huhu-1" 131 1.1 1841.4 10000 hack-que) + (test-zipf "huhu-2" 131 1.1 1.75e6 10000 hack-que) + (test-zipf "huhu-3" 131 2.1 1.75e6 10000 hack-que) + (test-zipf "huhu-4" 131 12.1 1.75e6 10000 hack-que) + (test-zipf "huhu-5" 131 42.1 1.75e6 10000 hack-que) + + ; Lets try s less than 1 + (test-zipf "small-s-1" 35 0.9 0 1000 six-sigma) + (test-zipf "small-s-2" 35 0.99 0 1000 six-sigma) + (test-zipf "small-s-3" 35 0.999 0 1000 six-sigma) + (test-zipf "small-s-4" 35 0.9999 0 1000 six-sigma) + (test-zipf "small-s-5" 35 0.99999 0 1000 six-sigma) + + ; Attempt to force an overflow + (test-zipf "ovfl-1" 437 (- 1 1e-6) 0 1000 six-sigma) + (test-zipf "ovfl-2" 437 (- 1 1e-7) 0 1000 six-sigma) + (test-zipf "ovfl-3" 437 (- 1 1e-9) 0 1000 six-sigma) + (test-zipf "ovfl-4" 437 (- 1 1e-12) 0 1000 six-sigma) + + ; Almost flat distribution + (test-zipf "flat-1" 36 0.8 0 1000 six-sigma) + (test-zipf "flat-2" 36 0.5 0 1000 six-sigma) + (test-zipf "flat-3" 36 0.1 0 1000 six-sigma) + + ; A visit to crazy-town -- increasing, not decreasing exponent + (test-zipf "neg-s-1" 36 0.0 0 1000 six-sigma) + (test-zipf "neg-s-2" 36 -0.1 0 1000 six-sigma) + (test-zipf "neg-s-3" 36 -1.0 0 1000 six-sigma) + (test-zipf "neg-s-4" 36 -3.0 0 1000 six-sigma) + + ; More crazy with some Hurwicz on top. + (test-zipf "neg-shu-1" 16 0.0 0.5 1000 six-sigma) + (test-zipf "neg-shu-2" 16 -0.2 2.5 1000 six-sigma) + (test-zipf "neg-shu-3" 16 -1.3 10 1000 six-sigma) + (test-zipf "neg-shu-4" 16 -2.9 100 1000 six-sigma) + + ; (test-end "srfi-194-zipf") + ) |
