diff options
Diffstat (limited to 'r/kldiv.R')
-rw-r--r-- | r/kldiv.R | 349 |
1 files changed, 349 insertions, 0 deletions
diff --git a/r/kldiv.R b/r/kldiv.R new file mode 100644 index 0000000..a767038 --- /dev/null +++ b/r/kldiv.R @@ -0,0 +1,349 @@ +# -*- coding: utf-8 -*- +# +# Kullback-Leibler or Jensen-Shannon divergence between two distributions +# +# The Kullback-Leibler divergence is given by: +# D_{KL}(P(x), Q(x)) = sum[ P(x) * log(P(x) / Q(x)) ] +# where P(x) is the underground true distribution, and Q(x) the approximation +# distribution. Thus KL divergence measures the information lost when Q is +# used to approximate P. +# +# The Jensen-Shannon divergence is given by: +# D_{JS}(P, Q) = 0.5 * D_{KL}(P, M) + 0.5 * D_{KL}(Q, M); M = (P+Q)/2 +# This is a symmetrised divergence, and is equal to 1/2 the so-called +# Jeffrey divergence. +# +# Credits: +# [1] Wikipedia - Kullback-Leibler divergence +# https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence +# [2] David Fass, KLDIV +# http://www.mathworks.com/matlabcentral/fileexchange/13089-kldiv/content//kldiv.m +# +# Aaron LI +# 2015/09/04 +# + + +# Calculate the entropy of the probability mass distribution. +# The zeros are ignored. +# +# Arguments: +# x - probability mass distribution +# +# Return: +# entropy in unit "bits" +# +calc.entropy <- function(x) { + x.gt0 <- x[x>0] + return(sum(-x.gt0 * log2(x.gt0))) +} + + +# Calculate the KL divergence of distribution P from Q, or the JS divergence +# between the P and Q distributions. +# +# TODO: +# * to add other methods to deal with zero probabilities: +# - add eps to p.m.f and renormalize +# - Byesian prior +# - smoothing +# +# Credits: +# [1] Calculate the Kullback-Leibler Divergence in practice? +# http://stats.stackexchange.com/questions/97938/calculate-the-kullback-leibler-divergence-in-practice +# [2] How to compute KL-divergence when PMF contains 0s? +# http://mathoverflow.net/questions/72668/how-to-compute-kl-divergence-when-pmf-contains-0s +# +# Arguments: +# p - probabilities representing the distribution P (underground true) +# q - probabilities representing the distribution Q (approximation) +# type - which type of divergence to be calculated +# + "kl": (default) Kullback-Leibler divergence +# + "klsym": symmetric variant of the Kullback-Leibler divergence, +# which given by (KL(p, q) + KL(q, p))/2 +# + "js": Jensen-Shannon divergence +# zeros - how to deal with the zeros in each distribution probabilities +# + "ignore": just ignore the data points with probability of zero +# +# Note that the vectors p and q must have the same length, and the +# sum of probabilities p and q must be 1 +/- 1e-5 +# +# Return: +# calculate divergence value in unit "bits" +# +kldiv <- function(p, q, type="kl", zeros="ignore") { + # check length of vectors + stopifnot(length(p) == length(q)) + # validate probabilities + eps_prob <- 1e-5 + stopifnot(abs(sum(p) - 1) <= eps_prob, abs(sum(q) - 1) <= eps_prob) + # how to deal with zero probabilities + if (zeros == "ignore") { + # just ignore the zeros in probabilities + nonzeros <- (p > 0) & (q > 0) + p <- p[nonzeros] + q <- q[nonzeros] + } else { + stop(paste("Unsupported parameter value zeros=", zeros, "\n", sep="")) + } + # check divergence type + if (type == "kl") { + # Kullback-Leibler divergence + div <- sum(p * (log2(p) - log2(q))) + } else if (type == "klsym") { + # symmetric variant KL divergence + div <- 0.5 * (sum(p * (log2(p) - log2(q))) + + sum(q * (log2(q) - log2(p)))) + } else if (type == "js") { + # Jensen-Shannon divergence + m <- (p + q) / 2 + div <- 0.5 * (sum(p * (log2(p) - log2(m))) + + sum(q * (log2(q) - log2(m)))) + } else { + stop(paste("Unsupported parameter value type=", type, "\n", sep="")) + } + return(div) +} + + +# Estimate the probability mass distribution for the observation data, +# using "density()". +# The range of output coordinates of points is set to be: +# from: min(x) - cut*bw +# to: max(x) + cut*bw +# And the probability mass distribution is normalized. +# +# Arguments: +# x - input observation data +# n - number of equally spaced points at which the probability mass is +# to be estimated. +# bw - bandwidth to be used +# kernel - the smoothing kernel +# from - coordinate of the left-most point +# to - coordinate of the right-most point +# cut - c(left, right). Number of bandwidths beyond the left and right +# extremes of the input data. +# This allows the estimated density to drop to approximately zero +# at the extremes. +# If "from" and "to" specified, then "cut" is ignored. +# +# Returns: +# list with following components: +# x - the coordinates of the points where probability mass estimated +# y - the estimated probability mass +# bw - bandwidth used +# kernel - kernel used +# n - sample size +# cut - left and right cut used +# from - coordinate of the left-most point used +# to - coordinate of the right-most point used +# +estimate.prob.mass <- function(x, bw="nrd0", kernel="gaussian", n=512, + from=NULL, to=NULL, cut=c(3,3)) { + data <- x[!is.na(x)] + # calculate the bandwidth + bw <- get(paste("bw.", bw, sep=""))(data) + # determine the estimation range + if (is.null(from)) { + from <- min(data) - cut[1] * bw + } + if (is.null(to)) { + to <- max(data) + cut[2] * bw + } + # estimate with "density()" + d <- density(data, bw=bw, kernel=kernel, n=n, from=from, to=to) + # renormalize the probability mass distribution + pmass <- d$y / sum(d$y) + prob.mass <- list(x=d$x, y=pmass, bw=bw, kernel=kernel, + n=n, from=from, to=to, cut=cut) + return(prob.mass) +} + + +# Estimate the probability mass distribution for the source and corresponding +# background data using 'estimate.prob.mass()'. +# +# The coordinates at which the probability masses are estimated are the same +# for the source and corresponding background probability mass distributions. +# Therefore we can calculate the KL divergence between these two distributions. +# +# Argument: +# srcdata - raw counts data drawn from the source region +# bkgdata - raw counts data drawn from the background region +# +# Return: +# data.frame with the following components: +# x - the coordinates of the points where probability mass estimated +# src - the estimated probability masses of the source data +# bkg - the estimated probability masses of the background data +# +pm.src.bkg <- function(srcdata, bkgdata) { + # compare the data ranges + if (max(srcdata) > max(bkgdata)) { + pm.src <- estimate.prob.mass(srcdata) + from <- pm.src$from + to <- pm.src$to + pm.bkg <- estimate.prob.mass(bkgdata, from=from, to=to) + } else { + pm.bkg <- estimate.prob.mass(bkgdata) + from <- pm.bkg$from + to <- pm.bkg$to + pm.src <- estimate.prob.mass(srcdata, from=from, to=to) + } + df <- data.frame(x=pm.src$x, src=pm.src$y, bkg=pm.bkg$y) + return(df) +} + + +# Calculate the entropies and KL/JS divergences of the source and background +# probability mass distribution group. +# +# Arguments: +# pmdf - data.frame of the probability mass distribution +# comp - components to be calculated +# + "entropy": entropy of the source and background +# + "kl": KL divergences from source to background and vice versa +# + "klsym": symmetric variant of KL divergence +# + "js": JS divergence +# +# Return: +# list with following components: +# entropy.src - entropy of the source distribution +# entropy.bkg - entropy of the background distribution +# kl.src2bkg - KL divergence from source to background +# kl.bkg2src - KL divergence from background to source +# klsym - symmetric variant KL divergence +# js - JS divergence +info.src.bkg <- function(pmdf, comp=c("entropy", "kl", "klsym", "js")) { + pm.src <- pmdf$src + pm.bkg <- pmdf$bkg + entropy.src <- NULL + entropy.bkg <- NULL + kl.src2bkg <- NULL + kl.bkg2src <- NULL + klsym <- NULL + js <- NULL + if ("entropy" %in% comp) { + entropy.src <- calc.entropy(pm.src) + entropy.bkg <- calc.entropy(pm.bkg) + } + if ("kl" %in% comp) { + kl.src2bkg <- kldiv(pm.src, pm.bkg, type="kl") + kl.bkg2src <- kldiv(pm.bkg, pm.src, type="kl") + } + if ("klsym" %in% comp) { + klsym <- kldiv(pm.src, pm.bkg, type="klsym") + } + if ("js" %in% comp) { + js <- kldiv(pm.src, pm.bkg, type="js") + } + return(list(entropy.src=entropy.src, entropy.bkg=entropy.bkg, + kl.src2bkg=kl.src2bkg, kl.bkg2src=kl.bkg2src, + klsym=klsym, js=js)) +} + + +# Calculate the entropies and KL/JS divergences of the source density +# histogram with respect to the corresponding background data which +# drawn from the estimated Poisson mass distribution. +# +# Arguments: +# src - raw counts data of the source region +# comp - components to be calculated +# + "entropy": entropy of the source and background +# + "kl": KL divergences from source to background and vice versa +# + "klsym": symmetric variant of KL divergence +# + "js": JS divergence +# +# Return: +# list with following components: +# entropy.src - entropy of the source distribution +# entropy.bkg - entropy of the background distribution +# kl.src2bkg - KL divergence from source to background +# kl.bkg2src - KL divergence from background to source +# klsym - symmetric variant KL divergence +# js - JS divergence +# +info.src.pois <- function(src, comp=c("entropy", "kl", "klsym", "js")) { + # make the density histogram of the source counts data + hist.src <- hist(src, breaks=(min(src):(max(src)+1)-0.5), plot=FALSE) + x <- hist.src$mids + pm.src <- hist.src$density + # calculate the corresponding theoretical Poisson density/mass distribution + # as the estimated background + lambda <- mean(src) + pm.pois <- dpois(x, lambda) + pm.pois <- pm.pois / sum(pm.pois) + # calculate the entropy, KL/JS divergences + entropy.src <- NULL + entropy.bkg <- NULL + kl.src2bkg <- NULL + kl.bkg2src <- NULL + klsym <- NULL + js <- NULL + if ("entropy" %in% comp) { + entropy.src <- calc.entropy(pm.src) + entropy.bkg <- calc.entropy(pm.pois) + } + if ("kl" %in% comp) { + kl.src2bkg <- kldiv(pm.src, pm.pois, type="kl") + kl.bkg2src <- kldiv(pm.pois, pm.src, type="kl") + } + if ("klsym" %in% comp) { + klsym <- kldiv(pm.src, pm.pois, type="klsym") + } + if ("js" %in% comp) { + js <- kldiv(pm.src, pm.pois, type="js") + } + return(list(entropy.src=entropy.src, entropy.bkg=entropy.bkg, + kl.src2bkg=kl.src2bkg, kl.bkg2src=kl.bkg2src, + klsym=klsym, js=js)) +} + + +# Calculate the information (e.g., entropy, divergences) for each group of +# region data. +# If the background data are not provided, then the background is estimated +# with a Poisson density/mass distribution. +info.reglist <- function(srcdatalist, bkgdatalist=NULL) { + if (is.null(bkgdatalist)) { + infofunc <- "info.src.pois" + } else { + infofunc <- "info.src.bkg" + stopifnot(length(srcdatalist) == length(bkgdatalist)) + } + l <- length(srcdatalist) + infodf <- data.frame(entropy.src=numeric(l), entropy.bkg=numeric(l), + kl.src2bkg=numeric(l), kl.bkg2src=numeric(l), + klsym=numeric(l), js=numeric(l)) + for (i in 1:length(srcdatalist)) { + #cat(i, "\n") + if (is.null(bkgdatalist)) { + if (sum(srcdatalist[[i]]) == 0) { + # srcdata all zeros + cat(i, ": WARNING: srcdata are all zeros!\n") + info <- list(entropy.src=NA, entropy.bkg=NA, + kl.src2bkg=NA, kl.bkg2src=NA, + klsym=NA, js=NA) + } else { + info <- get(infofunc)(srcdatalist[[i]]) + } + } else { + if (sum(srcdatalist[[i]]) == 0 || sum(bkgdatalist[[i]]) == 0) { + # srcdata / bkgdata all zeros + cat(i, ": WARNING: srcdata/bkgdata are all zeros!\n") + info <- list(entropy.src=NA, entropy.bkg=NA, + kl.src2bkg=NA, kl.bkg2src=NA, + klsym=NA, js=NA) + } else { + pmdf <- pm.src.bkg(srcdatalist[[i]], bkgdatalist[[i]]) + info <- get(infofunc)(pmdf) + } + } + infodf[i, ] <- info + } + return(infodf) +} + + +# vim: set ts=8 sw=4 tw=0 fenc=utf-8 ft=r: # |