#----------------------------------------------------------------- # # Statistics for the Luria-Delbruck distribution # # Version 1.0 03/15/2012 # # A. Hamon and B. Ycart # Universite de Grenoble and CNRS, France # http://ljk.imag.fr/membres/Bernard.Ycart/LD/ #----------------------------------------------------------------- #-------------------------- Data sets ---------------------------- # From p.504 of D. E. Luria and M. Delbruck, Mutations of bacteria from # virus sensitivity to virus resistance, Genetics, 28, (1943), 491-511 LD43a<-{c(10,18,125,10,14,27,3,17,17,29,41,17,20,31,30,7,17,30,10,40,45, 183,12,173,23,57,51,6,5,10,8,24,13,165,15,6,10,38,28,35,107,13)} LD43b<-{c(1,0,3,0,0,5,0,5,0,6,107,0,0,0,1,0,0,64,0,35, 1,0,0,7,0,303,0,0,3,48,1,4)} LD43c<-{c(0,0,0,0,8,1,0,1,0,15,0,0,19,0,0,17,11,0,0)} # From L. Boe, T. Tolker-Nielsen, K. M. Eegholm, H. Spliid and A. Vrang, # Fluctuation analysis of mutations to nalidixic acid resistance in # Escherichia Coli, J. Bacteriol., 176(10) (1994) 2781-2787 B94<-{c(rep(0,543),rep(1,169),rep(2,92),rep(3,57),rep(4,42),rep(5,25), rep(6,23),rep(7,13),rep(8,11),rep(9,5),rep(10,8),rep(11,8), rep(12,9),rep(13,5),rep(14,5),15,rep(16,8),rep(17,2),rep(18,5), rep(19,6),rep(20,2),rep(21,5),23,rep(24,2),25,rep(26,5),rep(27,3), 28,rep(29,3),rep(30,3),31,32,34,35,36,37,rep(39,2),40,41,41,49,52, 57,59,66,68,69,rep(73,2),74,105,107,116,132,137,140,146,151,152, 192,258,265,320,482,rep(513,4))} # From p.22 of W. A. Rosche and P. L. Foster, Determining mutation rates # in bacterial populations, Methods, 20(1), (2000), 1-17 RF00<-{c(rep(0,11),rep(1,17),rep(2,12),rep(3,3),rep(4,4),5,6,rep(7,2),9)} # From p.239 of Q. Zheng, Statistical and algorithmic methods for # fluctuation analysis with SALVADOR as an implementation, # Mathematical Biosciences, 176, (2002), 237-252 Z02<-{c(33,18,839,47,13,126,48,80,9,71,196,66,28,17,27,37, 126,33,12,44,28,67,730,168,44,50,583,23,17,24)} #------------------------ Distributions -------------------------- rY<-function(n,rho){ # returns a sample of size n for the Yule distribution # with parameter rho. # t <- rexp(n,rho) # exponential times p <- exp(-t) # parameters for geometric s <- 1+rgeom(n,p) # sample of Yule distribution return(s) } # end function rY rLD<-function(n,alpha,rho){ # returns a sample of size n for the Luria-Delbruck distribution # with parameters alpha and rho # s <- rep(0,n) # initialize sample mut <- rpois(n,alpha) # Poisson number of mutations for (i in 1:n){ # loop on each Poisson number if (mut[i]>0){ # if there are mutations s[i] <- sum(rY(mut[i],rho)) # Poisson sum of Yule variables } # end if } # end loop return(s) } # end function rLD dY<-function(m,rho){ # returns the probabilities pk for k in 0:m # for the Yule distribution with parameter rho # P<-rho*beta((rho+1)*rep(1,m),1:m); # compute the probabilities P <- append(0,P) # null for 0 return(P) } # end function dY dLD<-function(m,alpha,rho){ # returns the probabilites qk for k in 0:m for the # Luria-Delbruck distribution with parameters alpha and rho # P <- dY(m,rho) # vector of Yule probabilities P <- P[-1] # remove null value Q <- rep(0,m+1) # vector of LD probabilities Q[1] <- exp(-alpha) # initial probability q0 if (m>0){ # if any other needed for (k in 1:m){ # loop on values I <- seq(1,k) # increasing indices D <- seq(k,1,by=-1) # decreasing indices Q[k+1]<-{(alpha/k)* # coefficient sum(I*P[I]*Q[D])} # convolution } # end for } # end if return(Q) } # end function dLD pY<-function(m,rho){ # returns the probability to be lesser or equal to m # for the Yule distribution with parameter rho # p <- 1-rho*beta(rho,m+1); # compute the probability return(p) } # end function pY pLD<-function(m,alpha,rho){ # returns the probability to be lesser or equal to m for the # Luria-Delbruck distribution with parameters alpha and rho # Q <- dLD(m,alpha,rho) # vector of LD probabilities return(sum(Q)) } # end function pLD qY<-function(p,rho){ # returns the quantile of order p 0
M){ # if p not yet in the table M <- 2*M # double the table P <- cumsum(dLD(M,alpha,rho)) # truncated distribution function k <- max(which(P
eps) # check precision
it <- it+1 # count iteration
if (it>10){
crit <- FALSE # convergence failure
rn <- rb # default grid search result
}else{ro <- rn} # next value
} # end while
return(rn)
} # end function hr.solve
gar<-function(alpha,rho,z){
# returns the evaluation at z of the generating function
# of the LD(alpha,rho)
#
g <- exp(alpha*(hr(rho,z)-1)) # from the gf of the Yule distribution
return(g)
} # end function gar
covg<-function(alpha,rho,z1,z2){
# returns the covariance of (z1^X,z2^X), where X
# follows the LD(alpha,rho)
#
v <- {gar(alpha,rho,z1*z2)- # expectation of product
gar(alpha,rho,z1)*
gar(alpha,rho,z2)} # product of expectations
return(v)
} # end function covg
GF.covar<-function(alpha,rho,z1,z2,z3){
# returns the covariance matrix of the GF estimator of
# (alpha,rho) for the LD(alpha,rho)
#
c11 <- covg(alpha,rho,z1,z1) # variance of z1^X
c22 <- covg(alpha,rho,z2,z2) # variance of z2^X
c33 <- covg(alpha,rho,z3,z3) # variance of z3^X
c12 <- covg(alpha,rho,z1,z2) # covariance (z1^X,z2^X)
c13 <- covg(alpha,rho,z1,z3) # covariance (z1^X,z3^X)
c23 <- covg(alpha,rho,z2,z3) # covariance (z2^X,z3^X)
M1 <- {rbind(c(c11,c12,c13),
c(c12,c22,c23),
c(c13,c23,c33))} # covariance matrix
D <- {alpha*((hr(rho,z2)-1)*h1r(rho,z1)
-(hr(rho,z1)-1)*h1r(rho,z2))} # common denominator
R1 <- {(hr(rho,z2)-1)/
(D*gar(alpha,rho,z1))} # coefficient of first coordinate
R2 <- {-(hr(rho,z1)-1)/
(D*gar(alpha,rho,z2))} # coefficient of second coordinate
R3 <- 0 # coefficient of third coordinate
A1 <- {(alpha*h1r(rho,z3)*R1)/
(1-hr(rho,z3))} # coefficient of first coordinate
A2 <- {(alpha*h1r(rho,z3)*R2)/
(1-hr(rho,z3))} # coefficient of second coordinate
A3 <- {1/(gar(alpha,rho,z3)*
(hr(rho,z3)-1))} # coefficient of third coordinate
CO <- rbind(c(A1,R1),c(A2,R2),c(A3,R3))# matrix of coefficients
V <- t(CO)%*%M1%*%CO # covariance matrix
return(V)
} # end function GF.covar
#--------------- Generating Function Estimates -------------------
tuning<-function(){
# returns the tuning constants for the GF estimators
#
tu <- list(z1=0.1,z2=0.9,z3=0.8,q=0.1)
return(tu)
} # end function tuning
GF.est.alpha<-function(S,rho,b){
# returns the GF estimate of alpha for a sample S of the
# LD(alpha,rho), given rho and a scaling factor b
#
tu <- tuning() # tuning constants
z3 <- tu$z3 # fixed value
g <- mean(z3^(S/b)) # empirical generating function
a <- log(g)/(hr(rho,z3^(1/b))-1) # estimate of alpha
return(a) # return estimate
} # end function GF.est.alpha
GF.est.rho<-function(S,b){
# returns the GF estimate of rho for a sample S of the
# LD(alpha,rho), given a scaling factor b
#
tu <- tuning() # tuning constants
z1 <- tu$z1 # lower value
z2 <- tu$z2 # higher value
g1 <- mean(z1^(S/b)) # empirical generating function at z1
g2 <- mean(z2^(S/b)) # empirical generating function at z2
y <- log(g1)/log(g2) # get ratio of logs
r <- hr.solve(y,z1^(1/b),z2^(1/b)) # find corresponding r
return(r) # return estimate
} # end function GF.est.rho
GF.est<-function(S){
# returns the GF estimates of alpha and rho
# for a sample S of the LD(alpha,rho)
#
tu <- tuning() # tuning constants
q <- tu$q # quantile
b <- quantile(S,q,names=F)+1 # scaling factor
r <- GF.est.rho(S,b) # first estimate of rho
a <- GF.est.alpha(S,r,b) # first estimate of alpha
return(c(a,r)) # return estimates
} # end function GF.est
GF.confint<-function(S,level=0.95){
# returns confidence intervals on the GF estimates of
# alpha and rho for a sample S of the LD(alpha,rho)
#
tu <- tuning() # tuning constants
z1 <- tu$z1 # fixed value
z2 <- tu$z2 # fixed value
z3 <- tu$z3 # fixed value
q <- tu$q # quantile
b <- quantile(S,q,names=F)+1 # scaling factor
ar <- GF.est(S) # point estimates
a <- ar[1]; r <- ar[2] # estimates of alpha and rho
z1 <- z1^(1/b) # rescale z1
z2 <- z2^(1/b) # rescale z2
z3 <- z3^(1/b) # rescale z3
M <- GF.covar(a,r,z1,z2,z3) # asymptotic covariance matrix
sig <- sqrt(diag(M)) # asymptotic standard deviations
p <- (1-level)/2 # lower probability
ma <- {qnorm(1-p)* # quantile of standard normal
sig/sqrt(length(S))} # margin of error
M <- cbind(ar-ma,ar+ma) # matrix of two intervals
M <- data.frame(M) # make date frame
row.names(M) <- c("alpha","rho") # row labels
cn <- {c(
paste(as.character(round(p*100,1)),"%"),
paste(as.character(round((1-p)*100,1)),"%"))}
names(M) <- cn # column labels
return(M) # confidence intervals
} # end function GF.confint.alpha
GF.test.alpha<-function(S,a0){
# returns the p-value of the one-sided test of alpha=a0,
# using the GF estimate of alpha for a sample S of the
# LD(alpha,rho), rho being unknown
#
tu <- tuning() # tuning constants
z1 <- tu$z1 # fixed value
z2 <- tu$z2 # fixed value
z3 <- tu$z3 # fixed value
q <- tu$q # quantile
b <- quantile(S,q,names=F)+1 # scaling factor
estim <- GF.est(S) # point estimates
a <- estim[1]; r <- estim[2] # estimates of alpha and rho
z1 <- z1^(1/b) # rescale z1
z2 <- z2^(1/b) # rescale z2
z3 <- z3^(1/b) # rescale z3
M <- GF.covar(a,r,z1,z2,z3) # asymptotic covariance matrix
V <- M[[1,1]] # asymptotic variance of alpha
T <- (a-a0) # difference with null hypothesis
T <- T/sqrt(V) # standard deviation
T <- T*sqrt(length(S)) # test statistic
if (T<0){
p <- pnorm(T) # left tail
}else{
p <- 1-pnorm(T) # right tail
} # end if
return(p) # p-value
} # end function GF.test.alpha
GF.test.rho<-function(S,r0){
# returns the p-value of the one-sided test of rho=r0,
# using the GF estimate of rho for a sample S of the
# LD(alpha,rho)
#
tu <- tuning() # tuning constants
z1 <- tu$z1 # fixed value
z2 <- tu$z2 # fixed value
z3 <- tu$z3 # fixed value
q <- tu$q # quantile
b <- quantile(S,q,names=F)+1 # scaling factor
estim <- GF.est(S) # point estimates
a <- estim[1]; r <- estim[2] # estimates of alpha and rho
z1 <- z1^(1/b) # rescale z1
z2 <- z2^(1/b) # rescale z2
z3 <- z3^(1/b) # rescale z3
M <- GF.covar(a,r,z1,z2,z3) # asymptotic covariance matrix
V <- M[[2,2]] # asymptotic variance of rho
T <- (r-r0) # difference with null hypothesis
T <- T/sqrt(V) # standard deviation
T <- T*sqrt(length(S)) # test statistic
if (T<0){
p <- pnorm(T) # left tail
}else{
p <- 1-pnorm(T) # right tail
} # end if
return(p) # p-value
} # end function GF.test.rho
#-------------------- Maximum Likelihood -------------------------
diff.rho<-function(rho,m){
# returns a 3 by m matrix
# first row : the probabilities of the Yule distribution
# second and third row : first and second derivatives
# with respect to rho
B1<-c(1/(1+rho),1/(1+rho)*cumprod(1:(m-1)/(2:m+rho)))
P1<- rho*B1
eta<-cumsum(1/(1:m+rho))
d0<-c(0,P1)
d1<-c(0,B1-P1*eta)
d2<-c(0,B1*(rho*eta^2-eta-cumsum((1:m)/((1:m+rho)^2))))
d<-rbind(d0,d1,d2)
return(d)
}
diff.Q<-function(alpha,rho,P,d_rho_P,dd_rho_P,m){
#
# WARNING ! this function requires the probabilities of the Yule distribution and their first and second order derivatives
#
# it returns a 6 by m matrix
# first row : the probabilites qk for k in 0:m for the Luria-Delbruck distribution with parameters alpha and rho
# 2-3 rows : the partial derivatives of qk with respect to alpha and rho
# 4-6 rows : the second-order partial derivatives of qk with respect to alpha, rho and alpha-rho
Q <- rep(0,m+1)
Q[1] <- exp(-alpha)
for (i in 1:m){
Ii <- seq(1,i)
Id <- seq(i,1,by=-1)
Q[i+1]<-(alpha/i)*sum(Ii*P[Ii+1]*Q[Id])}
# ************************************************************* #
d_alpha_Q<-rep(0,m+1)
d_alpha_Q[1] <- -exp(-alpha)
for (k in 1:m){
Ii <- seq(1,k)
Id <- seq(k,1,by=-1)
d_alpha_Q[k+1]<- sum(P[Ii+1]*Q[Id])-Q[k+1]}
# ************************************************************* #
d_rho_Q<-rep(0,m+1)
d_rho_Q[1] <- 0
for (k in 1:m){
Ii <- seq(1,k)
Id <- seq(k,1,by=-1)
d_rho_Q[k+1]<- alpha*sum(d_rho_P[Ii+1]*Q[Id])}
# ************************************************************* #
dd_alpha_rho_Q<-rep(0,m+1)
dd_alpha_rho_Q[1] <-0
for (k in 1:m){
Ii <- seq(1,k)
Id <- seq(k,1,by=-1)
dd_alpha_rho_Q[k+1]<- sum(d_rho_P[Ii+1]* (Q[Id]+alpha*d_alpha_Q[Id]) )}
# ************************************************************* #
dd_rho_Q<-rep(0,m+1)
dd_rho_Q[1] <- 0
for (k in 1:m){
Ii <- seq(1,k)
Id <- seq(k,1,by=-1)
dd_rho_Q[k+1]<- alpha*sum(dd_rho_P[Ii+1]*Q[Id]+d_rho_P[Ii+1]*d_rho_Q[Id])}
# ************************************************************* #
dd_alpha_Q<-rep(0,m+1)
dd_alpha_Q[1] <- Q[1]
for (k in 1:m){
Ii <- seq(1,k)
Id <- seq(k,1,by=-1)
dd_alpha_Q[k+1]<- sum(P[Ii+1]*d_alpha_Q[Id])-d_alpha_Q[k+1]}
results<-rbind(Q,d_alpha_Q,d_rho_Q,dd_alpha_Q,dd_rho_Q,dd_alpha_rho_Q)
return(results)
}
ML.est<-function(sample,init,win=500,max.iter=20,precision=0.0001,details=FALSE){
# returns the maximum likelihood estimates of alpha and rho
# if the user gives a value for win then the sample is winsorized i.e. all the data below win is set to the value win
# the Newton-Raphson method is used to perform the maximization :
# - init is the vector of initial estimates if missing GF.est is used
# - max.iter is the maximum number of iterations allowed in NR
# - precision is the stopping condition
# if the maximum value of the sample is > maximum then the function returns "NA"
if (missing(init)) {init<-GF.est(sample)} # if no initial estimates are given then the default values are set to the GF estimates
max.sample<-max(sample)
fin<-max.sample+1
if (max.sample>win) {
new.sample<-sample # winsorization = transformation of the data
new.sample[which(sample>=win)]<-win # extreme values (> win) are replaced by win
counts<-rowSums(t(matrix(new.sample,length(new.sample),win+1))==0:win)}
else counts<-rowSums(t(matrix(sample,length(sample),fin))==0:max.sample)
m<-length(counts)-1
iter<-1 # initializations
new.val<-init
critere<-1
# ************************************************
# ************************************************
while ( (iter