#----------------------------------------------------------------- # # Statistics for LDD distributions # (Luria-Delbruck with deaths) # Version 1.0 november 28, 2012 # # B. Ycart # Universite de Grenoble and CNRS, France # http://ljk.imag.fr/membres/Bernard.Ycart/LD/ #----------------------------------------------------------------- #-------------------------- Data sets ---------------------------- # From p.504 of D. E. Luria and M. Delbruck, Mutations of bacteria from # virus sensitivity to virus resistance, Genetics, 28, (1943), 491-511 LD43a<-{c(10,18,125,10,14,27,3,17,17,29,41,17,20,31,30,7,17,30,10,40,45, 183,12,173,23,57,51,6,5,10,8,24,13,165,15,6,10,38,28,35,107,13)} LD43b<-{c(1,0,3,0,0,5,0,5,0,6,107,0,0,0,1,0,0,64,0,35, 1,0,0,7,0,303,0,0,3,48,1,4)} LD43c<-{c(0,0,0,0,8,1,0,1,0,15,0,0,19,0,0,17,11,0,0)} # From L. Boe, T. Tolker-Nielsen, K. M. Eegholm, H. Spliid and A. Vrang, # Fluctuation analysis of mutations to nalidixic acid resistance in # Escherichia Coli, J. Bacteriol., 176(10) (1994) 2781-2787 B94<-{c(rep(0,543),rep(1,169),rep(2,92),rep(3,57),rep(4,42),rep(5,25), rep(6,23),rep(7,13),rep(8,11),rep(9,5),rep(10,8),rep(11,8), rep(12,9),rep(13,5),rep(14,5),15,rep(16,8),rep(17,2),rep(18,5), rep(19,6),rep(20,2),rep(21,5),23,rep(24,2),25,rep(26,5),rep(27,3), 28,rep(29,3),rep(30,3),31,32,34,35,36,37,rep(39,2),40,41,41,49,52, 57,59,66,68,69,rep(73,2),74,105,107,116,132,137,140,146,151,152, 192,258,265,320,482,rep(513,4))} # From p.22 of W. A. Rosche and P. L. Foster, Determining mutation rates # in bacterial populations, Methods, 20(1), (2000), 1-17 RF00<-{c(rep(0,11),rep(1,17),rep(2,12),rep(3,3),rep(4,4),5,6,rep(7,2),9)} # From p.239 of Q. Zheng, Statistical and algorithmic methods for # fluctuation analysis with SALVADOR as an implementation, # Mathematical Biosciences, 176, (2002), 237-252 Z02<-{c(33,18,839,47,13,126,48,80,9,71,196,66,28,17,27,37, 126,33,12,44,28,67,730,168,44,50,583,23,17,24)} #------------------------ Distributions -------------------------- rYD=function(n,rho,delta){ # returns a sample of size n for the YD distribution # with parameters rho and delta. # t <- rexp(n,rho) # exponential times up <- (1-2*delta)/(1-delta*(1+exp(-t)))# 1-p0 p0 <- 1-up # probability of 0 pi <- up*exp(-t) # parameters of geometric s <- rbinom(n,1,up) # Bernoulli random variables ind <- which(s==1) # indices of positive values m <- length(ind) # number of positive values s[ind] <- 1+rgeom(m,pi[ind]) # draw geometric values return(s) # sample of YD distribution } # end function rYD rLDD=function(n,alpha,rho,delta){ # returns a sample of size n for the LDD distribution # with parameters alpha, rho, and delta # s <- rep(0,n) # initialize sample mut <- rpois(n,alpha) # Poisson number of mutations for (i in 1:n){ # loop on each Poisson number if (mut[i]>0){ # if there are mutations {s[i] <- sum(rYD(mut[i],rho,delta))} # Poisson sum of YD variables } # end if } # end loop return(s) } # end function rLDD dYD=function(m,rho,delta,m0=1000){ # returns the probabilities pk for k in 0:m for the # YD distribution with parameters rho and delta # uses an equivalent for m>m0 # tol <- 10^(-4) # tolerance if (delta0.5 if (delta>(0.5-tol)){ return("delta must be < 0.5") }else{ # general case d1 <- delta/(1-delta) # multiplicative coefficient f <- function(v){((1-v)*(v^(rho-1)))/ (1-d1*v)} # integrand I <- integrate(f,lower=0,upper=1) # integrate from 0 to 1 p0 <- I$value*d1*rho # probability of 0 P <- p0 # initialize vector if (m>0){ # more values to compute d2 <- ((1-2*delta)/(1-delta))^2 # multiplicative coefficient f <- function(v){v^rho/(1-v*d1)^2} # integrand I <- integrate(f,lower=0,upper=1) # integrate from 0 to 1 p1 <- I$value*d2*rho # probability of 1 P <- c(P,p1) # update vector if (m>1){ # more values to compute m1 <- min(m,m0) # maximum for exact computation for (k in 2:m1){ f <- function(v){v^rho* (1-v)^(k-1)/ (1-v*d1)^(k+1)} # integrand I <- {integrate(f,lower=0, upper=1)}# integrate from 0 to 1 pk <- I$value*d2*rho # probability of k P <- c(P,pk) # update vector } # end for if (m>m0){ # computation by equivalent a <- {(d2^((1-rho)/2))* rho*gamma(rho+1)} # constant in equivalent pk<- a*(seq((m0+1),m))^(-rho-1)# probabilities for large k's P <- c(P,pk) # update vector }}}}} # end ifs return(P) } # end function dYD dLDD=function(m,alpha,rho,delta){ # returns the probabilites qk for k in 0:m for the # LDD distribution with parameters alpha, rho, and delta # P <- dYD(m,rho,delta) # vector of YD probabilities Q <- rep(0,m+1) # vector of LDS probabilities Q[1] <- exp(-alpha*(1-P[1])) # initial probability q0 P <- P[-1] # remove p0 if (m>0){ # if any other needed for (k in 1:m){ # loop on values I <- seq(1,k) # increasing indices D <- seq(k,1,by=-1) # decreasing indices Q[k+1]<-{(alpha/k)* # coefficient sum(I*P[I]*Q[D])} # convolution } # end for } # end if return(Q) } # end function dLDD pYD=function(m,rho,delta){ # returns the probability to be lesser or equal to m # for the YD with parameters rho and delta # P <- dYD(m,rho,delta); # vector of YD probabilities return(sum(P)) } # end function pYD pLDD=function(m,alpha,rho,delta){ # returns the probability to be lesser or equal to m for the # LDD distribution with parameters alpha, rho, and delta # Q <- dLDD(m,alpha,rho,delta) # vector of LDD probabilities return(sum(Q)) } # end function pLDD qYD=function(p,rho,delta){ # returns the quantile of order p 0M){ # if p not yet in the table M <- 2*M # double the table P <- cumsum(dYD(M,rho,delta)) # truncated distribution function k <- max(which(PM){ # if p not yet in the table M <- 2*M # double the table P <- {cumsum(dLDD(M,alpha, rho,delta))} # truncated distribution function k <- max(which(Peps) # check precision it <- it+1 # count iterations if (it>10){ crit <- FALSE # convergence failure rn <- rb # default grid search result }else{ro <- rn} # next value } # end while return(rn) } # end function h1rd.solve gard=function(alpha,rho,delta,z){ # returns the evaluation at z of the generating function # of the LDD(alpha,rho,delta) # g <- exp(alpha*(hrd(rho,delta,z)-1)) # from the gf of the YD distribution return(g) } # end function gard gard.solve=function(g1,g2,g3,z1,z2,z3){ # solves in a,r,d the sytem a*(hrd(r,d,zi)-1)=yi # for i=1,2,3 # f1 <- function(a,r,d){a*(hrd(r,d,z1)-1)-g1} f2 <- function(a,r,d){a*(hrd(r,d,z2)-1)-g2} f3 <- function(a,r,d){a*(hrd(r,d,z3)-1)-g3} df11 <- function(a,r,d){hrd(r,d,z1)-1} df12 <- function(a,r,d){a*h1rd(r,d,z1)} df13 <- function(a,r,d){a*h2rd(r,d,z1)} df21 <- function(a,r,d){hrd(r,d,z2)-1} df22 <- function(a,r,d){a*h1rd(r,d,z2)} df23 <- function(a,r,d){a*h2rd(r,d,z2)} df31 <- function(a,r,d){hrd(r,d,z3)-1} df32 <- function(a,r,d){a*h1rd(r,d,z3)} df33 <- function(a,r,d){a*h2rd(r,d,z3)} d0 <- 0 # first d value r0 <- h1rd.solve(g1/g2,z1,z2,d0) # first r value a0 <- g3/(hrd(r0,d0,z3)-1) # first a value ard <- c(a0,r0,d0) # initialize eps <- 10^(-4) # precision crit <- {max(abs(c(f1(a0,r0,d0), f2(a0,r0,d0), f3(a0,r0,d0))))} # stopping criterion it <- 1 # number of iterations while (crit>eps){ # main loop a<-ard[1];r<-ard[2];d<-ard[3] # get values V <- {c(f1(a,r,d), f2(a,r,d), f3(a,r,d))} # vector value dM <- {rbind(c(df11(a,r,d), df12(a,r,d), df13(a,r,d)), c(df21(a,r,d), df22(a,r,d), df23(a,r,d)), c(df31(a,r,d), df32(a,r,d), df33(a,r,d)))} # matrix of derivatives ard <- ard-solve(dM)%*%V # next value crit <- max(abs(V)) # stopping criterion it <- it+1 # count iterations if (it>20){ crit <- FALSE # failure to converge ard <- c(a0,r0,d0) # return initialization } } # end while return(ard) } covgd=function(alpha,rho,delta,z1,z2){ # returns the covariance of (z1^X,z2^X), where X # follows the LD(alpha,rho) # v <- {gard(alpha,rho,delta,z1*z2)- # expectation of product gard(alpha,rho,delta,z1)* gard(alpha,rho,delta,z2)} # product of expectations return(v) } # end function covgd GFD.covar.alpharho=function(alpha,rho,delta,z1,z2,z3){ # returns the covariance matrix of the GF estimator of # (alpha,rho) for the LDD(alpha,rho,delta) if delta is known # c11 <- covgd(alpha,rho,delta,z1,z1) # variance of z1^X c22 <- covgd(alpha,rho,delta,z2,z2) # variance of z2^X c33 <- covgd(alpha,rho,delta,z3,z3) # variance of z3^X c12 <- covgd(alpha,rho,delta,z1,z2) # covariance (z1^X,z2^X) c13 <- covgd(alpha,rho,delta,z1,z3) # covariance (z1^X,z3^X) c23 <- covgd(alpha,rho,delta,z2,z3) # covariance (z2^X,z3^X) M1 <- {rbind(c(c11,c12,c13), c(c12,c22,c23), c(c13,c23,c33))} # covariance matrix D <- {alpha*((hrd(rho,delta,z2)-1)* h1rd(rho,delta,z1) -(hrd(rho,delta,z1)-1)* h1rd(rho,delta,z2))} # common denominator R1 <- {(hrd(rho,delta,z2)-1)/ (D*gard(alpha,rho,delta,z1))} # coefficient of first coordinate R2 <- {-(hrd(rho,delta,z1)-1)/ (D*gard(alpha,rho,delta,z2))} # coefficient of second coordinate R3 <- 0 # coefficient of third coordinate A1 <- {(alpha*h1rd(rho,delta,z3)*R1)/ (1-hrd(rho,delta,z3))} # coefficient of first coordinate A2 <- {(alpha*h1rd(rho,delta,z3)*R2)/ (1-hrd(rho,delta,z3))} # coefficient of second coordinate A3 <- {1/(gard(alpha,rho,delta,z3)* (hrd(rho,delta,z3)-1))} # coefficient of third coordinate CO <- rbind(c(A1,R1),c(A2,R2),c(A3,R3))# matrix of coefficients V <- t(CO)%*%M1%*%CO # covariance matrix return(V) } # end function GFD.covar.alpharho GFD.covar=function(alpha,rho,delta,z1,z2,z3){ # returns the covariance matrix of the GF estimator of # (alpha,rho,delta) for the LDD(alpha,rho,delta) # a <- alpha; r <- rho; d <- delta # rename parameters c11 <- covgd(a,r,d,z1,z1) # variance of z1^X c22 <- covgd(a,r,d,z2,z2) # variance of z2^X c33 <- covgd(a,r,d,z3,z3) # variance of z3^X c12 <- covgd(a,r,d,z1,z2) # covariance (z1^X,z2^X) c13 <- covgd(a,r,d,z1,z3) # covariance (z1^X,z3^X) c23 <- covgd(a,r,d,z2,z3) # covariance (z2^X,z3^X) M <- {rbind(c(c11,c12,c13), c(c12,c22,c23), c(c13,c23,c33))} # covariance matrix j11 <- gard(a,r,d,z1)*(hrd(r,d,z1)-1) j12 <- gard(a,r,d,z1)*(a*h1rd(r,d,z1)) j13 <- gard(a,r,d,z1)*(a*h2rd(r,d,z1)) j21 <- gard(a,r,d,z2)*(hrd(r,d,z2)-1) j22 <- gard(a,r,d,z2)*(a*h1rd(r,d,z2)) j23 <- gard(a,r,d,z2)*(a*h2rd(r,d,z2)) j31 <- gard(a,r,d,z3)*(hrd(r,d,z3)-1) j32 <- gard(a,r,d,z3)*(a*h1rd(r,d,z3)) j33 <- gard(a,r,d,z3)*(a*h2rd(r,d,z3)) J <- {rbind(c(j11,j12,j13), c(j12,j22,j23), c(j13,j23,j33))} # jacobian matrix J <- solve(J) # its inverse V <- J%*%M%*%t(J) # covariance matrix return(V) } # end function GFD.covar #--------------- Generating Function Estimates ------------------- tuning=function(){ # returns the tuning constants for the GF estimators # tu <- list(z1=0.1,z2=0.9,z3=0.8,q=0.1) return(tu) } # end function tuning GFD.est.alpha=function(S,rho,delta,b){ # returns the GF estimate of alpha for a sample S of the # LDD(alpha,rho,delta), given rho, delta, and a scaling factor b # tu <- tuning() # tuning constants z3 <- tu$z3 # fixed value g <- mean(z3^(S/b)) # empirical generating function a <- log(g)/(hrd(rho,delta,z3^(1/b))-1)# estimate of alpha return(a) # return estimate } # end function GF.est.alpha GFD.est.rho=function(S,delta,b){ # returns the GF estimate of rho for a sample S of the # LDD(alpha,rho,delta), given delta and a scaling factor b # tu <- tuning() # tuning constants z1 <- tu$z1 # lower value z2 <- tu$z2 # higher value g1 <- mean(z1^(S/b)) # empirical generating function at z1 g2 <- mean(z2^(S/b)) # empirical generating function at z2 y <- log(g1)/log(g2) # get ratio of logs z1<-z1^(1/b); z2<-z2^(1/b) # rescale variables r <- h1rd.solve(y,z1,z2,delta) # find corresponding r return(r) # return estimate } # end function GF.est.rho GFD.est.alpharho=function(S,delta){ # returns the GF estimate of alpha and rho for a sample S of the # LDD(alpha,rho,delta), given delta # tu <- tuning() # tuning constants q <- tu$q # quantile b <- quantile(S,q,names=F)+1 # scaling factor r <- GFD.est.rho(S,delta,b) # estimate of rho a <- GFD.est.alpha(S,r,delta,b) # estimate of alpha return(c(a,r)) # return estimates } # end function GF.est.alpharho GFD.confint.alpharho=function(S,delta=0,level=0.95){ # returns confidence intervals on the GF estimates of # alpha and rho for a sample S of the LDD(alpha,rho,delta) # (delta is supposed to be known) # tu <- tuning() # tuning constants z1 <- tu$z1 # fixed value z2 <- tu$z2 # fixed value z3 <- tu$z3 # fixed value q <- tu$q # quantile b <- quantile(S,q,names=F)+1 # scaling factor ar <- GFD.est.alpharho(S,delta) # point estimates a <- ar[1]; r <- ar[2] # estimates of alpha and rho z1 <- z1^(1/b) # rescale z1 z2 <- z2^(1/b) # rescale z2 z3 <- z3^(1/b) # rescale z3 M <- {GFD.covar.alpharho(a,r,delta, z1,z2,z3)} # asymptotic covariance matrix sig <- sqrt(diag(M)) # asymptotic standard deviations p <- (1-level)/2 # lower probability ma <- {qnorm(1-p)* # quantile of standard normal sig/sqrt(length(S))} # margin of error M <- cbind(ar-ma,ar+ma) # matrix of two intervals M <- data.frame(M) # make date frame row.names(M) <- c("alpha","rho") # row labels cn <- {c(paste(as.character(round(p*100,1)),"%"), paste(as.character(round((1-p)*100,1)),"%"))} names(M) <- cn # column labels return(M) # confidence intervals } # end function GF.confint.alpha GFD.est=function(S,dmin=0,dmax=0.3,dstep=0.01){ # returns the estimates of alpha, rho, delta in the sense of the # minimal distance between the empirical generating function of # a sample S and the generating function of the LDD(alpha,rho,delta), # where alpha and rho are estimated using delta from dmin to dmax # by dstep # tu <- tuning() # tuning constants q <- tu$q # quantile b <- quantile(S,q,names=F)+1 # scaling factor Z <- seq(0,0.99,0.01) # vector of z values nz <- length(Z) # number of z values Yh <- rep(0,nz) # vector of estimated g(z) for (j in 1:nz){ Yh[j] <- mean(Z[j]^(S/b)) } Z <- (seq(0,0.99,0.01))^(1/b) # rescaled z values Y <- rep(0,nz) # vector of computed g(z) D <- seq(dmin,dmax,dstep) # vector of delta values nd <- length(D) # number of delta values dist <- rep(0,nd) # vector of distances R <- rep(0,nd) # vector of rho values A <- rep(0,nd) # vector of alpha values for (i in 1:nd){ d <- D[i] r <- GFD.est.rho(S,d,b) # estimate of rho a <- GFD.est.alpha(S,r,d,b) # estimate of alpha A[i] <- a; R[i] <- r # stack estimates for (j in 1:nz){ Y[j] <- gard(a,r,d,Z[j]) # generating function value } dist[i] <- max(abs(Y-Yh)) # distance } ib <- which.min(dist) # index of best distance a <- A[ib]; r<- R[ib]; d <- D[ib] # best estimates return(c(a,r,d)) } # end function GFD.best GFD.confint=function(S,level=0.95){ # returns confidence intervals on the GF estimates of # alpha, rho, and delta for a sample S of the # LDD(alpha,rho,delta) # tu <- tuning() # tuning constants z1 <- tu$z1 # fixed value z2 <- tu$z2 # fixed value z3 <- tu$z3 # fixed value q <- tu$q # quantile b <- quantile(S,q,names=F)+1 # scaling factor ard <- GFD.est(S) # point estimates a <- ard[1]; r <- ard[2]; d <- ard[3] # estimates of alpha, rho, delta z1 <- z1^(1/b) # rescale z1 z2 <- z2^(1/b) # rescale z2 z3 <- z3^(1/b) # rescale z3 M <- GFD.covar(a,r,d,z1,z2,z3) # asymptotic covariance matrix sig <- sqrt(diag(M)) # asymptotic standard deviations p <- (1-level)/2 # lower probability ma <- {qnorm(1-p)* # quantile of standard normal sig/sqrt(length(S))} # margin of error M <- cbind(ard-ma,ard+ma) # matrix of two intervals M <- data.frame(M) # make date frame row.names(M) <- {c("alpha", "rho", "delta")} # row labels cn <- {c( paste(as.character(round(p*100,1)),"%"), paste(as.character(round((1-p)*100,1)),"%"))} names(M) <- cn # column labels return(M) # confidence intervals } # end function GF.confint