; SPDX-FileCopyrightText: 2020 Arvydas Silanskas
; SPDX-FileCopyrightText: 2020 Linas Vepštas
; SPDX-License-Identifier: MIT
;
; zipf-test.scm
; Unit tests for the Zipf (zeta) distribution.
;
; Created by Linas Vepstas 10 July 2020
; Part of srfi-194

; srfi-133 compatible API
(define zvector-map vector-map)
(define z2vector-map vector-map)
(define zvector-fold vector-fold)

; srfi-43 compatible API
;(define (zvector-map f vec)
;  (vector-map (lambda (i x) (f x)) vec))
;(define (z2vector-map f veca vecb)
;  (vector-map (lambda (i x y) (f x y)) veca vecb))
;(define (zvector-fold f knil vec)
;  (vector-fold (lambda (i x elt) (f x elt)) knil vec))

; ------------------------------------------------------------------
; Debug utility for gnuplot graphing.
; You can use this to dump a vector to a tab-delimited file.
(define (vector-to-file vec filename)
  (define (write-vec)
    (for-each
      (lambda (i)
        (define index (+ i 1))
        (define val (vector-ref vec i))
        (display index)
        (display "  ")
        (display val)
        (newline))
      (iota (vector-length vec))))
  (with-output-to-file filename write-vec))

; ------------------------------------------------------------------
; Test harness for exploring the Zipf (Riemann/Hurwicz zeta) distribution
; parameter space.
;
;     (test-zipf TEST-ID NVOCAB ESS QUE REPS TOL)
;
; * TEST-ID -- String ID, for debugging.
; * The next three parameters are presented to the generator as
;     (make-zipf-generator NVOCAB ESS QUE)
;   ++  NVOCAB -- Size of vocabulary to select from.
;   ++  ESS -- The Riemann zeta "s" exponent.
;   ++  QUE -- The Hurwicz zeta "q" offset.
; * REPS -- The number of samples to draw from the distribution.
; * TOL -- The test tolerance, governing the expected failure rate.
;
; The algorithm is roughly:
;   Take REPS samples (make-zipf-generator NVOCAB ESS QUE)
;   Accumulate them into NVOCAB histogram bins.
;   Normalize counts to unit probability (i.e. divide by NVOCAB)
;
; The resulting distribution should uniformly converge to C/(k+q)^s
; for 1 <= k <= NVOCAB where C is a normalization constant.
;
; This compares the actual distribution to the expected convergent
; and reports an error if it is not within TOL of the convergent.
; i.e. it computes the Banach l_0 norm of (distribution-convergent)
; TOL is to be given in units of standard deviations. So, for example,
; setting TOL to 6 gives a six-sigma bandpass, allowing the tests to
; usually pass.
;
; The keyword here is "usually". The tail of the Zipf distribution is
; generally quite thin, and experiences a lot of statistical variation.
; There does not seem to be any published theory of exactly how the
; central limit theorm can be applied to estimate the distribution of
; the tail. The code below assumes an approximate gaussian distribution,
; computes it and tests it; however certain parameter ranges violate
; this assumption. Thus, this test will fail from time to time,
; depending on the luck of the draw. If it fails, it should be repeated
; once or twice, ubtil it passes.
;
(define (test-zipf-once TEST-ID NVOCAB ESS QUE REPS TOL)

  ; Default random number generator
  (define ZGEN make-zipf-generator)

  ; Bin-counter containing accumulated histogram.
  (define bin-counts
    (let ((bin-counts (make-vector NVOCAB 0)))
     ; Accumulate samples into the histogram.
     (generator-for-each
       (lambda (SAMP)
         (define offset (- SAMP 1))
         (vector-set! bin-counts offset (+ 1 (vector-ref bin-counts offset))))
       (gtake (ZGEN NVOCAB ESS QUE) REPS))
     bin-counts))

  ; Verify the distribution is within tolerance.
  ; This is written out long-hand for easier debuggability.

  ; Frequency is normalized to be 0.0 to 1.0
  (define frequency (zvector-map (lambda (n) (/ n REPS)) bin-counts))
  (define probility (zvector-map (lambda (n) (inexact n)) frequency))

  ; Sequence 1..NVOCAB
  (define seq (list->vector (iota NVOCAB 1)))

  ; Sequence  1/(k+QUE)^ESS
  (define inv-pow
    (zvector-map (lambda (k) (expt (+ k QUE) (- (inexact ESS)))) seq))

  ; Hurwicz harmonic number sum_1..NVOCAB 1/(k+QUE)^ESS
  (define hnorm
    (zvector-fold
      (lambda (sum cnt) (+ sum cnt)) 0 inv-pow))

  ; The expected distribution.
  (define expect
    (zvector-map (lambda (x) (/ x hnorm)) inv-pow))

  ; Convert to floating point.
  (define prexpect (zvector-map (lambda (x) (inexact x)) expect))

  ; The difference.
  (define diff (z2vector-map (lambda (x y) (- x y)) probility prexpect))

  ; Re-weight the tail by k^{s/2}. This seems give a normal error
  ; distribution. ... at least, for small q. Problems for large q
  ; and with undersampling; so we hack around that.
  (define err-dist
    (if (< 10 QUE) diff
        (z2vector-map (lambda (i x) (* x (expt (+ i 1) (* 0.5 ESS))))
                    (list->vector (iota (vector-length diff)))
                    diff)))

  ; Normalize to unit root-mean-square.
  (define rms (/ 1 (sqrt (* 2 3.141592653 REPS))))
  (define norm-dist (zvector-map (lambda (x) (/ x rms)) err-dist))

  ; Maximum deviation from expected distribution (l_0 norm)
  (define l0-norm
    (zvector-fold
      (lambda (sum x) (if (< sum (abs x)) (abs x) sum)) 0 norm-dist))

  ; The mean.
  (define mean
    (/ (zvector-fold (lambda (sum x) (+ sum x)) 0 norm-dist)
       NVOCAB))

  (define root-mean-square
    (sqrt (/ (zvector-fold (lambda (sum x) (+ sum (* x x))) 0 norm-dist)
             NVOCAB)))

  ; Test for uniform convergence.
  ; (test-assert TEST-ID (<= l0-norm TOL))
  (define tol-result (<= l0-norm TOL))

  ; Should not random walk too far away.
  ; Could tighten this with a proper theory of the error distribution.
  ; (test-assert TEST-ID (< (abs mean) 3))
  (define mean-result (< (abs mean) 3))
  ; I don't understand the error distribution ....
  ; (test-assert (and (< 0.4 root-mean-square) (< root-mean-square 1.5)))

  ; Sanity check. The total counts in the bins should be equal to REPS.
  ; If this fails, the test harness itself is broken.
  (test-assert TEST-ID
    (equal? REPS
            (zvector-fold
              (lambda (sum cnt) (+ sum cnt)) 0 bin-counts)))

  ; Utility debug printing
  ;(vector-to-file probility "probility.dat")
  ;(vector-to-file prexpect "prexpect.dat")
  ;(vector-to-file diff "diff.dat")

  (list tol-result mean-result))

; ------------------------------------------------------------------
; Sometimes the Zip test fails, due to random variation. It should
; pass, if attempted a second time. This gives it three chances.
; If it fails three times, then something bad is happening.
(define (test-zipf TEST-ID NVOCAB ESS QUE REPS TOL)

  ;; Three strikes, you're out.
  (define RETRY 3)

  (define num-failures
    (list-index
     (lambda (n)
       (define rc (test-zipf-once TEST-ID NVOCAB ESS QUE REPS TOL))
       (and (first rc) (second rc)))
     (iota RETRY)))

  ; `num-failures` will be #f if it failed each and every time.
  ;(if (number? num-failures)
  ;  (if (< 0 num-failures)
  ;    (format #t "Test '~A' out of bounds ~A times.\n" TEST-ID num-failures))
  ;  (format #t "Error: Test '~A' failed every time!\n" TEST-ID))

  ; Announce excessive repeated failures.
  (test-assert TEST-ID num-failures)
)

; ------------------------------------------------------------------
; Explore the parameter space.
(define (zipf-test-group)
  ; (test-begin "srfi-194-zipf")

  ; The unit test computes something that is "almost" a standard
  ; deviation for the error distribution. Except, maybe not quite,
  ; I don't fully understand the theory. So most tests seem to come
  ; in fine in well-under a six-sigma deviation, but some of the wilder
  ; parameter choices misbehave, so six-sigma doesn't always work.
  ; Also, when the number of bins is large, its easy to under-sample;
  ; some bins end up empty and the std-dev is thrown off as a result.
  ; Thus, the tolerance bounds below are hand-adjusted.
  (define six-sigma 6.0)

  (define hack-que 3.0)

  ; Zoom into s->1
  (test-zipf "zoom-1"   30 1.1     0 1000 six-sigma)
  (test-zipf "zoom-2"   30 1.01    0 1000 six-sigma)
  (test-zipf "zoom-3"   30 1.001   0 1000 six-sigma)
  (test-zipf "zoom-4"   30 1.0001  0 1000 six-sigma)
  (test-zipf "zoom-5"   30 1.00001 0 1000 six-sigma)

  (test-zipf "zoom-6"   30 (+ 1 1e-6)  0 1000 six-sigma)
  (test-zipf "zoom-8"   30 (+ 1 1e-8)  0 1000 six-sigma)
  (test-zipf "zoom-10"  30 (+ 1 1e-10) 0 1000 six-sigma)
  (test-zipf "zoom-12"  30 (+ 1 1e-12) 0 1000 six-sigma)
  (test-zipf "zoom-14"  30 (+ 1 1e-14) 0 1000 six-sigma)
  (test-zipf "zoom-inf" 30 1           0 1000 six-sigma)

  ; Verify improving uniform convergence
  (test-zipf "uniform-1" 30 1  0 10000   six-sigma)
  (test-zipf "uniform-2" 30 1  0 100000  six-sigma)

  ; Larger vocabulary
  (test-zipf "mid-voc-1" 300 1.1     0 10000 six-sigma)
  (test-zipf "mid-voc-2" 300 1.01    0 10000 six-sigma)
  (test-zipf "mid-voc-3" 300 1.001   0 10000 six-sigma)
  (test-zipf "mid-voc-4" 300 1.0001  0 10000 six-sigma)
  (test-zipf "mid-voc-5" 300 1.00001 0 10000 six-sigma)

  ; Larger vocabulary. Take more samples....
  (test-zipf "large-voc-1" 3701 1.1     0 40000 six-sigma)
  (test-zipf "large-voc-2" 3701 1.01    0 40000 six-sigma)
  (test-zipf "large-voc-3" 3701 1.001   0 40000 six-sigma)
  (test-zipf "large-voc-4" 3701 1.0001  0 40000 six-sigma)
  (test-zipf "large-voc-5" 3701 1.00001 0 40000 six-sigma)

  ; Huge vocabulary; few samples. Many bins will be empty,
  ; causing the std-dev to get large.
  (test-zipf "huge-voc-1" 43701 (+ 1 1e-6)  0 60000 9.5)
  (test-zipf "huge-voc-2" 43701 (+ 1 1e-7)  0 60000 9.5)
  (test-zipf "huge-voc-3" 43701 (+ 1 1e-9)  0 60000 9.5)
  (test-zipf "huge-voc-4" 43701 (+ 1 1e-12) 0 60000 9.5)
  (test-zipf "huge-voc-5" 43701 1           0 60000 9.5)

  ; Large s, small range
  (test-zipf "big-s-lo-1" 5 1.1     0 1000 six-sigma)
  (test-zipf "big-s-lo-2" 5 2.01    0 1000 six-sigma)
  (test-zipf "big-s-lo-3" 5 4.731   0 1000 six-sigma)
  (test-zipf "big-s-lo-4" 5 9.09001 0 1000 six-sigma)
  (test-zipf "big-s-lo-5" 5 13.45   0 1000 8.0)

  ; Large s, larger range. Most histogram bins will be empty
  ; so allow much larger error margins. There are excessively
  ; frequent large failures in this bunch.
  (test-zipf "bis-mid-1" 130 1.5     0 30000 six-sigma)
  (test-zipf "bis-mid-2" 130 2.03    0 30000 9.0)
  (test-zipf "bis-mid-3" 130 4.5     0 30000 36.0) ; This one is problematic
  (test-zipf "bis-mid-4" 130 6.66    0 30000 24.0)

  ; Verify that accuracy improves with more samples.
  (test-zipf "samp-bi-1" 129 1.1     0 10000 six-sigma)
  (test-zipf "samp-bi-2" 129 1.01    0 10000 six-sigma)
  (test-zipf "samp-bi-3" 129 1.001   0 10000 six-sigma)
  (test-zipf "samp-bi-4" 129 1.0001  0 10000 six-sigma)
  (test-zipf "samp-bi-5" 129 1.00001 0 10000 six-sigma)

  ; Non-zero Hurwicz parameter
  (test-zipf "hurw-1" 131 1.1     0.3    10000 six-sigma)
  (test-zipf "hurw-2" 131 1.1     1.3    10000 six-sigma)
  (test-zipf "hurw-3" 131 1.1     6.3    10000 six-sigma)
  (test-zipf "hurw-4" 131 1.1     20.23  10000 six-sigma)

  ; Negative Hurwicz parameter. Must be greater than branch point at -0.5.
  (test-zipf "hneg-1" 81 1.1     -0.1   1000 six-sigma)
  (test-zipf "hneg-2" 81 1.1     -0.3   1000 six-sigma)
  (test-zipf "hneg-3" 81 1.1     -0.4   1000 six-sigma)
  (test-zipf "hneg-4" 81 1.1     -0.499 1000 six-sigma)

  ; A walk into a stranger corner of the parameter space.
  (test-zipf "big-h-1" 131 1.1     41.483 10000 hack-que)
  (test-zipf "big-h-2" 131 2.1     41.483 10000 hack-que)
  (test-zipf "big-h-3" 131 6.1     41.483 10000 hack-que)
  (test-zipf "big-h-4" 131 16.1    41.483 10000 hack-que)
  (test-zipf "big-h-5" 131 46.1    41.483 10000 hack-que)
  (test-zipf "big-h-6" 131 96.1    41.483 10000 hack-que)

  ; A still wilder corner of the parameter space.
  (test-zipf "huhu-1" 131 1.1     1841.4 10000 hack-que)
  (test-zipf "huhu-2" 131 1.1     1.75e6 10000 hack-que)
  (test-zipf "huhu-3" 131 2.1     1.75e6 10000 hack-que)
  (test-zipf "huhu-4" 131 12.1    1.75e6 10000 hack-que)
  (test-zipf "huhu-5" 131 42.1    1.75e6 10000 hack-que)

  ; Lets try s less than 1
  (test-zipf "small-s-1" 35 0.9     0 1000 six-sigma)
  (test-zipf "small-s-2" 35 0.99    0 1000 six-sigma)
  (test-zipf "small-s-3" 35 0.999   0 1000 six-sigma)
  (test-zipf "small-s-4" 35 0.9999  0 1000 six-sigma)
  (test-zipf "small-s-5" 35 0.99999 0 1000 six-sigma)

  ; Attempt to force an overflow
  (test-zipf "ovfl-1" 437 (- 1 1e-6)  0 1000 six-sigma)
  (test-zipf "ovfl-2" 437 (- 1 1e-7)  0 1000 six-sigma)
  (test-zipf "ovfl-3" 437 (- 1 1e-9)  0 1000 six-sigma)
  (test-zipf "ovfl-4" 437 (- 1 1e-12) 0 1000 six-sigma)

  ; Almost flat distribution
  (test-zipf "flat-1" 36 0.8     0 1000 six-sigma)
  (test-zipf "flat-2" 36 0.5     0 1000 six-sigma)
  (test-zipf "flat-3" 36 0.1     0 1000 six-sigma)

  ; A visit to crazy-town -- increasing, not decreasing exponent
  (test-zipf "neg-s-1" 36 0.0     0 1000 six-sigma)
  (test-zipf "neg-s-2" 36 -0.1    0 1000 six-sigma)
  (test-zipf "neg-s-3" 36 -1.0    0 1000 six-sigma)
  (test-zipf "neg-s-4" 36 -3.0    0 1000 six-sigma)

  ; More crazy with some Hurwicz on top.
  (test-zipf "neg-shu-1" 16 0.0     0.5 1000 six-sigma)
  (test-zipf "neg-shu-2" 16 -0.2    2.5 1000 six-sigma)
  (test-zipf "neg-shu-3" 16 -1.3    10  1000 six-sigma)
  (test-zipf "neg-shu-4" 16 -2.9    100 1000 six-sigma)

  ; (test-end "srfi-194-zipf")
  )