# -*- coding: utf-8 -*-
#
# Kullback-Leibler or Jensen-Shannon divergence between two distributions
#
# The Kullback-Leibler divergence is given by:
#   D_{KL}(P(x), Q(x)) = sum[ P(x) * log(P(x) / Q(x)) ]
# where P(x) is the underground true distribution, and Q(x) the approximation
# distribution.  Thus KL divergence measures the information lost when Q is
# used to approximate P.
#
# The Jensen-Shannon divergence is given by:
#   D_{JS}(P, Q) = 0.5 * D_{KL}(P, M) + 0.5 * D_{KL}(Q, M);  M = (P+Q)/2
# This is a symmetrised divergence, and is equal to 1/2 the so-called
# Jeffrey divergence.
#
# Credits:
# [1] Wikipedia - Kullback-Leibler divergence
#     https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
# [2] David Fass, KLDIV
#     http://www.mathworks.com/matlabcentral/fileexchange/13089-kldiv/content//kldiv.m
#
# Aaron LI
# 2015/09/04
#


# Calculate the entropy of the probability mass distribution.
# The zeros are ignored.
#
# Arguments:
#   x - probability mass distribution
#
# Return:
#   entropy in unit "bits"
#
calc.entropy <- function(x) {
    x.gt0 <- x[x>0]
    return(sum(-x.gt0 * log2(x.gt0)))
}


# Calculate the KL divergence of distribution P from Q, or the JS divergence
# between the P and Q distributions.
#
# TODO:
# * to add other methods to deal with zero probabilities:
#   - add eps to p.m.f and renormalize
#   - Byesian prior
#   - smoothing
#
# Credits:
# [1] Calculate the Kullback-Leibler Divergence in practice?
#     http://stats.stackexchange.com/questions/97938/calculate-the-kullback-leibler-divergence-in-practice
# [2] How to compute KL-divergence when PMF contains 0s?
#     http://mathoverflow.net/questions/72668/how-to-compute-kl-divergence-when-pmf-contains-0s
#
# Arguments:
#   p     - probabilities representing the distribution P (underground true)
#   q     - probabilities representing the distribution Q (approximation)
#   type  - which type of divergence to be calculated
#           + "kl": (default) Kullback-Leibler divergence
#           + "klsym": symmetric variant of the Kullback-Leibler divergence,
#                      which given by (KL(p, q) + KL(q, p))/2
#           + "js": Jensen-Shannon divergence
#   zeros - how to deal with the zeros in each distribution probabilities
#           + "ignore": just ignore the data points with probability of zero
#
#   Note that the vectors p and q must have the same length, and the
#   sum of probabilities p and q must be 1 +/- 1e-5
#
# Return:
#   calculate divergence value in unit "bits"
#
kldiv <- function(p, q, type="kl", zeros="ignore") {
    # check length of vectors
    stopifnot(length(p) == length(q))
    # validate probabilities
    eps_prob <- 1e-5
    stopifnot(abs(sum(p) - 1) <= eps_prob, abs(sum(q) - 1) <= eps_prob)
    # how to deal with zero probabilities
    if (zeros == "ignore") {
        # just ignore the zeros in probabilities
        nonzeros <- (p > 0) & (q > 0)
        p <- p[nonzeros]
        q <- q[nonzeros]
    } else {
        stop(paste("Unsupported parameter value zeros=", zeros, "\n", sep=""))
    }
    # check divergence type
    if (type == "kl") {
        # Kullback-Leibler divergence
        div <- sum(p * (log2(p) - log2(q)))
    } else if (type == "klsym") {
        # symmetric variant KL divergence
        div <- 0.5 * (sum(p * (log2(p) - log2(q))) +
                      sum(q * (log2(q) - log2(p))))
    } else if (type == "js") {
        # Jensen-Shannon divergence
        m <- (p + q) / 2
        div <- 0.5 * (sum(p * (log2(p) - log2(m))) +
                      sum(q * (log2(q) - log2(m))))
    } else {
        stop(paste("Unsupported parameter value type=", type, "\n", sep=""))
    }
    return(div)
}


# Estimate the probability mass distribution for the observation data,
# using "density()".
# The range of output coordinates of points is set to be:
#   from: min(x) - cut*bw
#   to:   max(x) + cut*bw
# And the probability mass distribution is normalized.
#
# Arguments:
#   x      - input observation data
#   n      - number of equally spaced points at which the probability mass is
#            to be estimated.
#   bw     - bandwidth to be used
#   kernel - the smoothing kernel
#   from   - coordinate of the left-most point
#   to     - coordinate of the right-most point
#   cut    - c(left, right). Number of bandwidths beyond the left and right
#            extremes of the input data.
#            This allows the estimated density to drop to approximately zero
#            at the extremes.
#            If "from" and "to" specified, then "cut" is ignored.
#
# Returns:
#   list with following components:
#   x      - the coordinates of the points where probability mass estimated
#   y      - the estimated probability mass
#   bw     - bandwidth used
#   kernel - kernel used
#   n      - sample size
#   cut    - left and right cut used
#   from   - coordinate of the left-most point used
#   to     - coordinate of the right-most point used
#
estimate.prob.mass <- function(x, bw="nrd0", kernel="gaussian", n=512,
                               from=NULL, to=NULL, cut=c(3,3)) {
    data <- x[!is.na(x)]
    # calculate the bandwidth
    bw <- get(paste("bw.", bw, sep=""))(data)
    # determine the estimation range
    if (is.null(from)) {
        from <- min(data) - cut[1] * bw
    }
    if (is.null(to)) {
        to <- max(data) + cut[2] * bw
    }
    # estimate with "density()"
    d <- density(data, bw=bw, kernel=kernel, n=n, from=from, to=to)
    # renormalize the probability mass distribution
    pmass <- d$y / sum(d$y)
    prob.mass <- list(x=d$x, y=pmass, bw=bw, kernel=kernel,
                      n=n, from=from, to=to, cut=cut)
    return(prob.mass)
}


# Estimate the probability mass distribution for the source and corresponding
# background data using 'estimate.prob.mass()'.
#
# The coordinates at which the probability masses are estimated are the same
# for the source and corresponding background probability mass distributions.
# Therefore we can calculate the KL divergence between these two distributions.
#
# Argument:
#   srcdata - raw counts data drawn from the source region
#   bkgdata - raw counts data drawn from the background region
#
# Return:
#   data.frame with the following components:
#   x   - the coordinates of the points where probability mass estimated
#   src - the estimated probability masses of the source data
#   bkg - the estimated probability masses of the background data
#
pm.src.bkg <- function(srcdata, bkgdata) {
    # compare the data ranges
    if (max(srcdata) > max(bkgdata)) {
        pm.src <- estimate.prob.mass(srcdata)
        from <- pm.src$from
        to <- pm.src$to
        pm.bkg <- estimate.prob.mass(bkgdata, from=from, to=to)
    } else {
        pm.bkg <- estimate.prob.mass(bkgdata)
        from <- pm.bkg$from
        to <- pm.bkg$to
        pm.src <- estimate.prob.mass(srcdata, from=from, to=to)
    }
    df <- data.frame(x=pm.src$x, src=pm.src$y, bkg=pm.bkg$y)
    return(df)
}


# Calculate the entropies and KL/JS divergences of the source and background
# probability mass distribution group.
#
# Arguments:
#   pmdf - data.frame of the probability mass distribution
#   comp - components to be calculated
#          + "entropy": entropy of the source and background
#          + "kl": KL divergences from source to background and vice versa
#          + "klsym": symmetric variant of KL divergence
#          + "js": JS divergence
#
# Return:
#   list with following components:
#   entropy.src - entropy of the source distribution
#   entropy.bkg - entropy of the background distribution
#   kl.src2bkg  - KL divergence from source to background
#   kl.bkg2src  - KL divergence from background to source
#   klsym       - symmetric variant KL divergence
#   js          - JS divergence
info.src.bkg <- function(pmdf, comp=c("entropy", "kl", "klsym", "js")) {
    pm.src <- pmdf$src
    pm.bkg <- pmdf$bkg
    entropy.src <- NULL
    entropy.bkg <- NULL
    kl.src2bkg  <- NULL
    kl.bkg2src  <- NULL
    klsym       <- NULL
    js          <- NULL
    if ("entropy" %in% comp) {
        entropy.src <- calc.entropy(pm.src)
        entropy.bkg <- calc.entropy(pm.bkg)
    }
    if ("kl" %in% comp) {
        kl.src2bkg <- kldiv(pm.src, pm.bkg, type="kl")
        kl.bkg2src <- kldiv(pm.bkg, pm.src, type="kl")
    }
    if ("klsym" %in% comp) {
        klsym <- kldiv(pm.src, pm.bkg, type="klsym")
    }
    if ("js" %in% comp) {
        js <- kldiv(pm.src, pm.bkg, type="js")
    }
    return(list(entropy.src=entropy.src, entropy.bkg=entropy.bkg,
                kl.src2bkg=kl.src2bkg, kl.bkg2src=kl.bkg2src,
                klsym=klsym, js=js))
}


# Calculate the entropies and KL/JS divergences of the source density
# histogram with respect to the corresponding background data which
# drawn from the estimated Poisson mass distribution.
#
# Arguments:
#   src  - raw counts data of the source region
#   comp - components to be calculated
#          + "entropy": entropy of the source and background
#          + "kl": KL divergences from source to background and vice versa
#          + "klsym": symmetric variant of KL divergence
#          + "js": JS divergence
#
# Return:
#   list with following components:
#   entropy.src - entropy of the source distribution
#   entropy.bkg - entropy of the background distribution
#   kl.src2bkg  - KL divergence from source to background
#   kl.bkg2src  - KL divergence from background to source
#   klsym       - symmetric variant KL divergence
#   js          - JS divergence
#
info.src.pois <- function(src, comp=c("entropy", "kl", "klsym", "js")) {
    # make the density histogram of the source counts data
    hist.src <- hist(src, breaks=(min(src):(max(src)+1)-0.5), plot=FALSE)
    x <- hist.src$mids
    pm.src <- hist.src$density
    # calculate the corresponding theoretical Poisson density/mass distribution
    # as the estimated background
    lambda <- mean(src)
    pm.pois <- dpois(x, lambda)
    pm.pois <- pm.pois / sum(pm.pois)
    # calculate the entropy, KL/JS divergences
    entropy.src <- NULL
    entropy.bkg <- NULL
    kl.src2bkg  <- NULL
    kl.bkg2src  <- NULL
    klsym       <- NULL
    js          <- NULL
    if ("entropy" %in% comp) {
        entropy.src <- calc.entropy(pm.src)
        entropy.bkg <- calc.entropy(pm.pois)
    }
    if ("kl" %in% comp) {
        kl.src2bkg <- kldiv(pm.src, pm.pois, type="kl")
        kl.bkg2src <- kldiv(pm.pois, pm.src, type="kl")
    }
    if ("klsym" %in% comp) {
        klsym <- kldiv(pm.src, pm.pois, type="klsym")
    }
    if ("js" %in% comp) {
        js <- kldiv(pm.src, pm.pois, type="js")
    }
    return(list(entropy.src=entropy.src, entropy.bkg=entropy.bkg,
                kl.src2bkg=kl.src2bkg, kl.bkg2src=kl.bkg2src,
                klsym=klsym, js=js))
}


# Calculate the information (e.g., entropy, divergences) for each group of
# region data.
# If the background data are not provided, then the background is estimated
# with a Poisson density/mass distribution.
info.reglist <- function(srcdatalist, bkgdatalist=NULL) {
    if (is.null(bkgdatalist)) {
        infofunc <- "info.src.pois"
    } else {
        infofunc <- "info.src.bkg"
        stopifnot(length(srcdatalist) == length(bkgdatalist))
    }
    l <- length(srcdatalist)
    infodf <- data.frame(entropy.src=numeric(l), entropy.bkg=numeric(l),
                         kl.src2bkg=numeric(l), kl.bkg2src=numeric(l),
                         klsym=numeric(l), js=numeric(l))
    for (i in 1:length(srcdatalist)) {
        #cat(i, "\n")
        if (is.null(bkgdatalist)) {
            if (sum(srcdatalist[[i]]) == 0) {
                # srcdata all zeros
                cat(i, ": WARNING: srcdata are all zeros!\n")
                info <- list(entropy.src=NA, entropy.bkg=NA,
                             kl.src2bkg=NA, kl.bkg2src=NA,
                             klsym=NA, js=NA)
            } else {
                info <- get(infofunc)(srcdatalist[[i]])
            }
        } else {
            if (sum(srcdatalist[[i]]) == 0 || sum(bkgdatalist[[i]]) == 0) {
                # srcdata / bkgdata all zeros
                cat(i, ": WARNING: srcdata/bkgdata are all zeros!\n")
                info <- list(entropy.src=NA, entropy.bkg=NA,
                             kl.src2bkg=NA, kl.bkg2src=NA,
                             klsym=NA, js=NA)
            } else {
                pmdf <- pm.src.bkg(srcdatalist[[i]], bkgdatalist[[i]])
                info <- get(infofunc)(pmdf)
            }
        }
        infodf[i, ] <- info
    }
    return(infodf)
}


# vim: set ts=8 sw=4 tw=0 fenc=utf-8 ft=r: #