Source code for curveball.models

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# This file is part of curveball.
# https://github.com/yoavram/curveball

# Licensed under the MIT license:
# http://www.opensource.org/licenses/MIT-license
# Copyright (c) 2015, Yoav Ram <yoav@yoavram.com>
from __future__ import print_function
from __future__ import division
from builtins import str
from builtins import range
import sys
import numbers
from warnings import warn
import numpy as np
import matplotlib.pyplot as plt
import collections
try:
	from scipy.stats.distributions import chi2
	chisqprob = chi2.sf
except ImportError:
	from scipy.stats import chisqprob
from scipy.stats import linregress
from scipy.misc import derivative
import pandas as pd
import copy
import inspect
import lmfit
import sympy
import seaborn as sns
sns.set_style("ticks")
import curveball
import curveball.baranyi_roberts_model
from curveball.utils import smooth


[docs]def is_model(cls):
    """Returns :const:`True` if the input is a subclass of :py:class:`lmfit.model.Model`.

    Parameters
    ----------
    cls : class

    Returns
    -------
    bool
    """
    return inspect.isclass(cls) and issubclass(cls, lmfit.model.Model)


[docs]def get_models(module):
    """Finds and returns all models in `module`.

    Parameters
    ----------
    module : module
        the module in which to look for models

    Returns
    -------
    list
        list of subclasses of :py:class:`lmfit.model.Model` that can be used with :py:func:`curveball.models.fit_model`

    See also
    --------
    fit_model
    is_model
    curveball.baranyi_roberts_model
    """
    return [m[1] for m in inspect.getmembers(module, curveball.models.is_model)]


[docs]def bootstrap_params(df, model_result, nsamples, unit='Well', fit_kws=None):
    """Sample model parameters by fitting the model to resampled data.

    The data is bootstraped by drawing growth curves (sampling from the `unit` column in `df`) with replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        the data to fit
    model_result : lmfit.model.ModelResult
        result of fitting a :py:class:`lmfit.model.Model` to data in ``df``
    nsamples : int
        number of samples to draw
    unit : str, optional
        the name of the column in `df` that identifies a resampling unit, defaults to ``Well``
    fit_kws : dict, optional
        dict of kwargs for `fit_model`
    Returns
    -------
    pandas.DataFrame
        data frame of samples; each row is a sample, each column is a parameter.

    Raises
    ------
    ValueError : if `model_result` isn't an instance of :py:class:`lmfit.model.ModelResult`
    ValueError : if `df` is empty

    See also
    --------
    sample_params
    """
    if not isinstance(model_result, lmfit.model.ModelResult):
        raise TypeError(
            "Input model_class must be a {0}, but it is {1}".format(
                lmfit.model.ModelResult.__name__, 
                model_result.__class__.__name__))
    if df.empty:
        raise ValueError("Input data frame df is empty")
        
    if fit_kws is None:
        fit_kws = {}
    if not 'param_fix' in fit_kws:
        fit_kws['param_fix'] = {pname for pname, param in model_result.params.items() if not param.vary}
    if not 'param_max' in fit_kws:
        fit_kws['param_max'] = {pname: param.max for pname, param in model_result.params.items()}
    if not 'param_max' in fit_kws:
        fit_kws['param_min'] = {pname: param.min for pname, param in model_result.params.items()}
    model_class = type(model_result.model)

    unique_units = pd.Series(df[unit].unique())
    grouped = df.groupby(unit)
    param_samples = [None] * nsamples

    for i in range(nsamples):
        sampled_units = unique_units.sample(frac=1, replace=True)
        _df = pd.concat(grouped.get_group(grp_id) for grp_id in sampled_units.values)
        assert (_df[unit].unique() == sampled_units.unique()).all()
        assert _df.shape == df.shape    
        model_fit = curveball.models.fit_model(_df, models=model_class, 
                                               PLOT=False, PRINT=False, 
                                               **fit_kws)[0]
        param_samples[i] = model_fit.best_values
    return pd.DataFrame(param_samples)


[docs]def sample_params(model_fit, nsamples, params=None, covar=None):
    """Random sample of parameter values from a truncated multivariate normal distribution defined by the 
    covariance matrix of the a model fitting result.

    Parameters
    ----------
    model_fit : lmfit.model.ModelResult
        the model fit that defines the sampled distribution
    nsamples : int
        number of samples to make
    params : dict, optional
        a dictionary of model parameter values; if given, overrides values from `model_fit`
    covar : numpy.ndarray, optional
        an array containing the parameters covariance matrix; if given, overrides values from `model_fit`

    Returns
    -------
    pandas.DataFrame
        data frame of samples; each row is a sample, each column is a parameter.

    See also
    --------
    bootstrap_params
    """
    if params is None:
        params = model_fit.params
    else:
        _params = model_fit.params.copy()
        for pname, pvalue in params.items():
            _params[pname].value = pvalue
        params = _params
    if covar is None:
        covar = model_fit.covar
    if covar is None:
        raise ValueError("Covariance matrix for {0} is invalid (None).".format(model_fit.model))
    if covar.ndim != 2:
        warn("Covariance matrix doesn't have 2 dimensions: \n{}".format(covar))
    w, h = covar.shape
    if w != h:
        warn("Covariance matrix is not square: \n{}".format(covar))
    names = [p.name for p in params.values() if p.vary]
    means = [p.value for p in params.values() if p.vary]
        
    param_samples = np.random.multivariate_normal(means, covar, nsamples)
    param_samples = pd.DataFrame(param_samples, columns=names)
    idx = np.zeros(nsamples) == 0
    for p in params.values():
        if not p.vary:
            param_samples[p.name] = p.value
        idx = idx & (param_samples[p.name] >= p.min) & (param_samples[p.name] <= p.max)
    if param_samples.shape[0] < nsamples:
        warn("Warning: truncated {0} parameter samples; please report at {1}, including the data and use case.".format(nsamples - param_samples.shape[0], "https://github.com/yoavram/curveball/issues"))
    param_samples = param_samples[idx]
    return param_samples
    


def noisify_normal_additive(data, std, rng=None):
    if not rng:
        rng = np.random
    return data +  rng.normal(0, std, data.shape)


def noisify_lognormal_multiplicative(data, std, random_seed=None):
    if random_seed is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_seed)
    return data * rng.lognormal(0, std, data.shape)


def randomize(t=12, y0=0.1, K=1.0, r=0.1, nu=1.0, q0=np.inf, v=np.inf, 
        func=curveball.baranyi_roberts_model.baranyi_roberts_function, reps=1, noise_std=0.02, 
        noise_func=noisify_lognormal_multiplicative, random_seed=None, as_df=True, 
        data_label='OD', time_label='Time', replicate_label='Well'):
    if isinstance(t, numbers.Number):
        t = np.linspace(0, t)
    y = func(t, y0, K, r, nu, q0, v)
    y.resize((len(t),))
    y = y.repeat(reps)
    y.resize((len(t), reps))
    well = np.arange(reps).repeat(len(t))#.resize(len(t), reps)
    if noise_std > 0:
        y = noise_func(y, noise_std, random_seed)
    y[y < 0] = 0.0
    y = y.flatten()
    t = t.repeat(reps)
    well = well.flatten()
    if as_df:
        return pd.DataFrame({data_label: y, time_label: t, replicate_label: well})
    else:
        return t, y


[docs]def lrtest(m0, m1, alfa=0.05):
    r"""Performs a likelihood ratio test on two nested models.

    For two models, one nested in the other 
    (meaning that the nested model estimated parameters are a subset of the nesting model), 
    the test statistic :math:`D` is:

    .. math::

        \Lambda = \Big( \Big(\frac{\sum{(X_i - \hat{X_i}(\theta_1))^2}}{\sum{(X_i - \hat{X_i}(\theta_0))^2}}\Big)^{n/2} \Big)

        D = -2 log \Lambda

        lim_{n \to \infty} D \sim \chi^2_{df=\Delta}


    where :math:`\Lambda` is the likelihood ratio, :math:`D` is the statistic, 
    :math:`X_{i}` are the data points, :math:`\hat{X_i}(\theta)` is the 
    model prediction with parameters :math:`\theta`, 
    :math:`\theta_i` is the parameters estimation for model :math:`i`, 
    :math:`n` is the number of data points, and :math:`\Delta` is the 
    difference in number of parameters between the models.

    The function compares between two :py:class:`lmfit.model.ModelResult` objects. 
    These are the results of fitting models to the same dataset 
    using the `lmfit <lmfit.github.io/lmfit-py>`_ package.

    The function compares between model fit `m0` and `m1` and assumes that `m0` is nested in `m1`, 
    meaning that the set of varying parameters of `m0` is a subset of the varying parameters of `m1`. 
    :py:attr:`lmfit.model.ModelResult.chisqr` is the sum of the square of the residuals of the fit. 
    :py:attr:`lmfit.model.ModelResult.ndata` is the number of data points. 
    :py:attr:`lmfit.model.ModelResult.nvarys` is the number of varying parameters.

    Parameters
    ----------
    m0, m1 : lmfit.model.ModelResult
        objects representing two model fitting results. `m0` is assumed to be nested in `m1`.
    alfa : float, optional
        test significance level, defaults to 0.05 = 5%.

    Returns
    -------
    prefer_m1 : bool
        should we prefer `m1` over `m0`
    pval : float
        the test p-value
    D : float
        the test statistic
    ddf : int 
        the number of degrees of freedom

    Notes
    ----------
    - `Generalised Likelihood Ratio Test Example <http://www.stat.sc.edu/~habing/courses/703/GLRTExample.pdf>`_.
    - `IPython notebook <http://nbviewer.ipython.org/github/yoavram/ipython-notebooks/blob/master/likelihood%20ratio%20test.ipynb>`_.
    """
    n0 = m0.ndata
    k0 = m0.nvarys
    chisqr0 = m0.chisqr
    assert chisqr0 > 0, chisqr0
    n1 = m1.ndata
    assert n0 == n1
    k1 = m1.nvarys
    chisqr1 = m1.chisqr
    assert chisqr1 > 0, chisqr1
    D = -n0 * (np.log(m1.chisqr) - np.log(m0.chisqr))
    assert D > 0, D
    ddf = k1 - k0
    assert ddf > 0, ddf
    pval = chisqprob(D, ddf)
    prefer_m1 = pval < alfa
    return prefer_m1, pval, D, ddf


[docs]def find_max_growth(model_fit, params=None, after_lag=True):
    r"""Estimates the maximum population and specific growth rates from the model fit.

    The function calculates the maximum population growth rate :math:`a=\max{\frac{dy}{dt}}` 
    as the derivative of the model curve and calculates its maximum. 
    It also calculates the maximum of the per capita growth rate :math:`\mu = \max{\frac{dy}{y \cdot dt}}`.
    The latter is more useful as a metric to compare different strains or treatments 
    as it does not depend on the population size/density.    

    For example, in the logistic model the population growth rate is a quadratic function of :math:`y` 
    so the maximum is realized when the 2nd derivative is zero:

    .. math::

        \frac{dy}{dt} = r y (1 - \frac{y}{K}) \Rightarrow

        \frac{d^2 y}{dt^2} = r - 2\frac{r}{K}y \Rightarrow

        \frac{d^2 y}{dt^2} = 0 \Rightarrow y = \frac{K}{2} \Rightarrow

        \max{\frac{dy}{dt}} = \frac{r K}{4}

    In contrast, the per capita growth rate a linear function of :math:`y` 
    and so its maximum is realized when :math:`y=y_0`:

    .. math::

        \frac{dy}{y \cdot dt} = r (1 - \frac{y}{K}) \Rightarrow

        \max{\frac{dy}{y \cdot dt}} = \frac{dy}{y \cdot dt}(y=y_0) = r (1 - \frac{y_0}{K})     

    Parameters
    ----------
    model_fit : lmfit.model.ModelResult
        the result of a model fitting procedure
    params : lmfit.parameter.Parameters, optional
        if provided, these parameters will override `model_fit`'s parameters
    after_lag : bool
        if true, only explore the time after the lag phase. Otherwise start at time zero. Defaults to :const:`True`.

    Returns
    -------
    t1 : float
        the time when the maximum population growth rate is achieved in the units of the `model_fit` ``Time`` variable.
    y1 : float
        the population size or density (OD) for which the maximum population growth rate is achieved.
    a : float
        the maximum population growth rate.
    t2 : float
        the time when the maximum per capita growth rate is achieved in the units of the `model_fit` `Time` variable.
    y2 : float
        the population size or density (OD) for which the maximum per capita growth rate is achieved.
    mu : float
        the the maximum specific (per capita) growth rate.

    See also
    --------
    find_max_growth_ci
    """
    if params is None:
        params = model_fit.params

    y0 = params['y0'].value
    K  = params['K'].value

    t0 = find_lag(model_fit) if after_lag else 0
    t0 = max(t0, 0)
    t1 = model_fit.userkws['t'].max()
    t = np.linspace(t0, t1)     
    def f(t): 
        return model_fit.model.eval(t=t, params=params)
    y = f(t)
    dfdt = derivative(f, t)

    a = dfdt.max()
    i = dfdt.argmax()
    t1 = t[i]
    y1 = y[i]

    dfdt_y = dfdt / y
    mu = dfdt_y.max()
    i = dfdt_y.argmax()
    t2 = t[i]
    y2 = y[i]
    
    return t1, y1, a, t2, y2, mu


[docs]def find_max_growth_ci(model_fit, param_samples, after_lag=True, ci=0.95):
    """Estimates a confidence interval for the maximum population/specific growth rates from the model fit.
    
    The maximum population/specific growth rate for each parameter sample is calculated.
    The confidence interval of the rate is the lower and higher percentiles such that 
    `ci` percent of the random rates are within the confidence interval.

    Parameters
    ----------
    model_fit : lmfit.model.ModelResult
        the result of a model fitting procedure
    param_samples : pandas.DataFrame
        parameter samples, generated using :function:`sample_params` or :function:`bootstrap_params`
    after_lag : bool
        if true, only explore the time after the lag phase. Otherwise start at time zero. Defaults to :const:`True`        
    ci : float, optional
        the fraction of lag durations that should be within the calculated limits. 0 < `ci` <, defaults to 0.95
    
    Returns
    -------
    low_a, high_a : float
        the lower and the higher boundaries of the confidence interval of the the maximum population growth rate in the units of the `model_fit` ``OD``/``Time`` (usually OD/hours).
    low_mu, high_mu : float
        the lower and the higher boundaries of the confidence interval of the the maximum specific growth rate in the units of the `model_fit` 1/``Time`` variable (usually 1/hours).


    See also
    --------
    find_max_growth
    """
    if not 0 <= ci <= 1:
        raise ValueError("ci must be between 0 and 1")
    nsamples = param_samples.shape[0]
    aa = np.zeros(nsamples)
    mumu = np.zeros(nsamples)        
    for i in range(param_samples.shape[0]):
        sample = param_samples.iloc[i,:]
        params = model_fit.params.copy()
        for k,v in params.items():
            if v.vary:
                params[k].set(value=sample[k])
        _, _, a, _, _, mu = find_max_growth(model_fit, params=params, after_lag=after_lag)
        aa[i] = a
        mumu[i] = mu

    margin = (1.0 - ci) * 50.0

    idx = np.isfinite(aa) & (aa >= 0)
    if not idx.all():
        warn("Warning: omitting {0} non-finite growth rate values".format(len(aa) - idx.sum()))
    aa = aa[idx]
    low_a = np.percentile(aa, margin)
    high_a = np.percentile(aa, ci * 100.0 + margin)
    assert high_a >= low_a, aa.tolist()

    idx = np.isfinite(mumu) & (mumu >= 0)
    if not idx.all():
        warn("Warning: omitting {0} non-finite growth rate values".format(len(mumu) - idx.sum()))
    mumu = mumu[idx]
    low_mu = np.percentile(mumu, margin)
    high_mu = np.percentile(mumu, ci * 100.0 + margin)
    assert high_mu >= low_mu, mumu.tolist()
    return low_a, high_a, low_mu, high_mu


[docs]def find_min_doubling_time(model_fit, params=None, PLOT=False):
    """Estimates the minimal doubling time from the model fit.

    The function evaluates a growth curves based on the model fit and supplied parameters (if any) 
    and calculates that time required to double the population density at each time point. 
    It then returns the minimal doubling time.

    Parameters
    ----------
    model_fit : lmfit.model.ModelResult
        the result of a model fitting procedure
    params : lmfit.parameter.Parameters, optional
        if provided, these parameters will override `model_fit`'s parameters
    PLOT : bool, optional
        if :const:`True`, the function will plot the Cook's distances of the wells and the threshold. # FIXME

    Returns
    -------
    min_dbl : float
        the minimal time it takes the population density to double.
    fig : matplotlib.figure.Figure
        if the argument `PLOT` was :const:`True`, the generated figure.
    ax : matplotlib.axes.Axes
        if the argument `PLOT` was :const:`True`, the generated axis.
    """
    if params is None:
        params = model_fit.params
    def f(t): 
        return model_fit.model.eval(t=t, params=params)

    t1 = model_fit.userkws['t'].max()
    t = np.linspace(0, t1, 1000)
    y = f(t)

    def find_point(point):
        return abs(y - point).argmin()

    imax = find_point(y.max() / 2) + 1
    doubling_times = np.zeros(imax)
    for i0, y0 in enumerate(y[:imax]):
        i1 = find_point(2 * y0)
        doubling_times[i0] = t[i1] - t[i0]
    min_dbl = doubling_times.min()

    if PLOT:
        fig, ax = plt.subplots(1, 1)
        ax.plot(t[:imax], doubling_times, '-')
        ax.axhline(min_dbl, ls='--', color='k')
        ax.set(xlabel='Time', ylabel='Doubling time')
        sns.despine()
        return min_dbl, fig, ax

    return min_dbl


[docs]def find_min_doubling_time_ci(model_fit, param_samples, ci=0.95):
    """Estimates a confidence interval for the minimal doubling time from the model fit.

    The minimal doubling time for each parameter sample is calculated.
    The confidence interval of the doubling time is the lower and higher percentiles such that 
    `ci` percent of the random lag durations are within the confidence interval.

    Parameters
    ----------
    model_fit : lmfit.model.ModelResult
        the result of a model fitting procedure
    param_samples : pandas.DataFrame
        parameter samples, generated using :function:`sample_params` or :function:`bootstrap_params`    
    ci : float, optional
        the fraction of doubling times that should be within the calculated limits. 0 < `ci` <, defaults to 0.95.
    
    Returns
    -------
    low, high : float
        the lower and the higher boundaries of the confidence interval of the minimal doubling times in the units of the `model_fit` ``Time`` variable (usually hours).

    See also
    --------
    find_min_doubling_time
    """
    if not 0 <= ci <= 1:
        raise ValueError("ci must be between 0 and 1")
    nsamples = param_samples.shape[0]
    dbls = np.zeros(nsamples)    
    for i in range(param_samples.shape[0]):
        sample = param_samples.iloc[i,:]
        params = model_fit.params.copy()
        for k, v in params.items():
            if v.vary:
                v.value = sample[k]
        dbls[i] = find_min_doubling_time(model_fit, params=params)

    margin = (1.0 - ci) * 50.0
    idx = np.isfinite(dbls) & (dbls >= 0)
    if not idx.all():
        warn("Warning: omitting {0} non-finite doubling time values".format(len(dbls) - idx.sum()))
    dbls = dbls[idx]
    low = np.percentile(dbls, margin)
    high = np.percentile(dbls, ci * 100.0 + margin)
    assert high >= low, dbls.tolist()
    return low, high


[docs]def find_K_ci(param_samples, ci=0.95):
    """Estimates a confidence interval for ``K``, the maximum population density from the model fit.

    The confidence interval of the doubling time is the lower and higher percentiles such that 
    `ci` percent of the max densities are within the confidence interval.

    Parameters
    ----------
    param_samples : pandas.DataFrame
        parameter samples, generated using :function:`sample_params` or :function:`bootstrap_params`    
    ci : float, optional
        the fraction of doubling times that should be within the calculated limits. 0 < `ci` <, defaults to 0.95.
    
    Returns
    -------
    low, high : float
        the lower and the higher boundaries of the confidence interval of the maximum population density.
    
    """
    if not 0 <= ci <= 1:
        raise ValueError("ci must be between 0 and 1")    
    Ks = param_samples['K']    
    
    margin = (1.0 - ci) * 50.0
    idx = np.isfinite(Ks) & (Ks >= 0)
    if not idx.all():
        warn("Warning: omitting {0} non-finite K values".format(len(Ks) - idx.sum()))
    Ks = Ks[idx]
    low = np.percentile(Ks, margin)
    high = np.percentile(Ks, ci * 100.0 + margin)
    assert high >= low, Ks.tolist()
    return low, high


[docs]def find_lag(model_fit, params=None):
    """Estimates the lag duration from the model fit.

    The function calculates the tangent line to the model curve at the point of maximum derivative (the inflection point). 
    The time when this line intersects with :math:`N_0` (the initial population size) 
    is labeled :math:`\lambda` and is called the lag duration time [fig2.2]_.

    Parameters
    ----------
    model_fit : lmfit.model.ModelResult
        the result of a model fitting procedure
    params : lmfit.parameter.Parameters, optional
        if provided, these parameters will override `model_fit`'s parameters

    Returns
    -------
    lam : float
        the lag phase duration in the units of the `model_fit` ``Time`` variable (usually hours).    

    References
    ----------
    .. [fig2.2] Fig. 2.2 pg. 19 in Baranyi, J., 2010. `Modelling and parameter estimation of bacterial growth with distributed lag time. <http://www2.sci.u-szeged.hu/fokozatok/PDF/Baranyi_Jozsef/Disszertacio.pdf>`_.

    See also
    --------
    find_lag_ci
    has_lag
    """
    if params is None:
        params = model_fit.params
    
    y0 = params['y0'].value
    K  = params['K'].value

    t = model_fit.userkws['t']
    t = np.linspace(t.min(), t.max())
    def f(t): 
        return model_fit.model.eval(t=t, params=params)
    y = f(t)    
    dfdt = derivative(f, t)
    idx = y > K / np.e
    if idx.sum() == 0:
        warn("All values are below K/e")
        return np.nan
    t = t[idx]
    y = y[idx]
    dfdt = dfdt[idx]

    a = dfdt.max()
    i = dfdt.argmax()
    t1 = t[i]
    y1 = y[i]
    b = y1 - a * t1
    lam = (y0 - b) / a
    return lam


[docs]def find_lag_ci(model_fit, param_samples, ci=0.95):
    """Estimates a confidence interval for the lag duration from the model fit.

    The lag duration for each parameter sample is calculated.
    The confidence interval of the lag is the lower and higher percentiles such that 
    `ci` percent of the random lag durations are within the confidence interval.

    Parameters
    ----------
    model_fit : lmfit.model.ModelResult
        the result of a model fitting procedure
    param_samples : pandas.DataFrame
        parameter samples, generated using :function:`sample_params` or :function:`bootstrap_params`    
    ci : float, optional
        the fraction of lag durations that should be within the calculated limits. 0 < `ci` <, defaults to 0.95.
    
    Returns
    -------
    low, high : float
        the lower and the higher boundaries of the confidence interval of the lag phase duration in the units of the `model_fit` ``Time`` variable (usually hours).

    See also
    --------
    find_lag
    has_lag    
    """    
    if not 0 <= ci <= 1:
        raise ValueError("ci must be between 0 and 1")
    nsamples = param_samples.shape[0]
    lags = np.zeros(nsamples)    
    for i in range(param_samples.shape[0]):
        sample = param_samples.iloc[i,:]
        params = model_fit.params.copy()
        for k,v in params.items():
            if v.vary:
                params[k].set(value=sample[k])
        lags[i] = find_lag(model_fit, params=params)

    margin = (1.0 - ci) * 50.0
    idx = np.isfinite(lags)
    if not idx.all():
        warn("Warning: omitting {0} non-finite lag values".format(len(lags) - idx.sum()))
    lags = lags[idx]
    idx = (lags >= 0)
    if not idx.all():
        warn("Warning: omitting {0} negative lag values".format(len(lags) - idx.sum()))
    if not idx.any(): # no legal lag values left
        return np.nan, np.nan, np.nan
    lags = lags[idx]
    low = np.percentile(lags, margin)
    high = np.percentile(lags, ci * 100.0 + margin)
    assert high >= low, lags.tolist()
    return low, high


[docs]def has_lag(model_fits, alfa=0.05, PRINT=False):
    r"""Checks if if the best fit has statisticaly significant lag phase :math:`\lambda > 0`.

    If the best fitted model doesn't has a lag phase to begin with, return :const:`False`. 
    This includes the logistic model and Richards model.

    Otherwise, a likelihood ratio test will be perfomed with nesting determined according to Figure 1. 
    The null hypothesis of the test is that :math:`\frac{1}{v} = 0` , 
    i.e. the adjustment rate :math:`v` is infinite and therefore there is no lag phase.

    The function will return :const:`True` if the null hypothesis is rejected, 
    otherwise it will return :const:`False`.

    Parameters
    ----------
    model_fits : sequence of lmfit.model.ModelResult
        the results of several model fitting procedures, ordered by their statistical preference. Generated by :py:func:`fit_model`.
    alfa : float, optional
        test significance level, defaults to 0.05 = 5%.
    PRINT : bool, optional
        if :const:`True`, the function will print the result of the underlying statistical test; defaults to :const:`False`.

    Returns
    -------
    bool
        the result of the hypothesis test. :const:`True` if the null hypothesis was rejected and the data suggest that there is a significant lag phase.

    Raises
    ------
    ValueError
        raised if the fittest of the :py:class:`lmfit.model.ModelResult` objects in `model_fits` is of an unknown model.
    """
    m1 = model_fits[0]
    if np.isposinf(m1.best_values.get('q0', np.inf)) and np.isposinf(m1.best_values.get('v', np.inf)):
        if PRINT:
            print('H1 model has no lag')
        return False
        
    try:
        m0_model_class = m1.model.nested_models['lag']
    except KeyError:
        raise ValueError("The best fit model {0} has no nested model for testing lag".format(m1.model.name))
    try:
        m0 = [m for m in model_fits if isinstance(m.model, m0_model_class)][0]
    except IndexError:
        raise ValueError("No {0} in model results.".format(m0_model_class.name))
    
    prefer_m1, pval, D, ddf = lrtest(m0, m1, alfa=alfa)
    if PRINT:
        print("Tested H0: %s vs. H1: %s; D=%.2g, ddf=%d, p-value=%.2g" % (m0.model.name, m1.model.name, D, ddf, pval))    
    return prefer_m1


[docs]def has_nu(model_fits, alfa=0.05, PRINT=False):
    r"""Checks if if the best fit has :math:`\nu \ne 1` and if so if that is statisticaly significant.

    If the best fitted model has :math:`\nu = 1` to begin with, return :const:`False`. This includes the logistic model.
    Otherwise, a likelihood ratio test will be perfomed with nesting determined according to Figure 1. 
    The null hypothesis of the test is that :math:`\nu = 1`; if it is rejected than the function will return :const:`True`.
    Otherwise it will return :const:`False`.

    Parameters
    ----------
    model_fits : list lmfit.model.ModelResult
        the results of several model fitting procedures, ordered by their statistical preference. Generated by :py:func:`fit_model`.
    alfa : float, optional
        test significance level, defaults to 0.05 = 5%.
    PRINT : bool, optional
        if :const:`True`, the function will print the result of the underlying statistical test; defaults to :const:`False`.

    Returns
    -------
    bool
        the result of the hypothesis test. :const:`True` if the null hypothesis was rejected and the data suggest that :math:`\nu` is significantly different from one.

    Raises
    ------
    ValueError
        raised if the fittest of the :py:class:`lmfit.model.ModelResult` objects in `model_fits` is of an unknown model.
    """
    m1 = model_fits[0]
    if m1.best_values.get('nu', 1.0) == 1.0:
        return False

    try:
        m0_model_class = m1.model.nested_models['nu']
    except KeyError:
        raise ValueError("The best fit model {} has no nested model for testing nu".format(m1.model.name))
    try:
        m0 = [m for m in model_fits if isinstance(m.model, m0_model_class)][0]
    except IndexError:
        raise ValueError("No {} in model results.".format(m0_model_class.name))

    prefer_m1, pval, D, ddf = lrtest(m0, m1, alfa=alfa)
    if PRINT:
        msg = "Tested H0: %s (nu=%.2g) vs. H1: %s (nu=%.2g); D=%.2g, ddf=%d, p-value=%.2g"  
        print(msg % (m0.model.name, m0.best_values.get('nu', 1), m1.model.name, m1.best_values.get('nu', 1), D, ddf, pval))
    return prefer_m1


def make_Dfun(model, params):
    expr, t, args = model.get_sympy_expr(params)
    partial_derivs = [None]*len(args)
    for i,x in enumerate(args):
        dydx = expr.diff(x)
        dydx = sympy.lambdify(args=(t,) + args, expr=dydx, modules="numpy")
        partial_derivs[i] = dydx        
    def Dfun(params, y, a, t):
        values = [ par.value for par in params.values() if par.vary ]
        res = np.array([dydx(t, *values) for dydx in partial_derivs])
        expected_shape = (len(values), len(t))
        if res.shape != expected_shape:
            raise TypeError("Dfun result shape for {0} is incorrect, expected {1} but it is {2}.".format(model.name, expected_shape, res.shape))
        return res
    return Dfun


[docs]def cooks_distance(df, model_fit, use_weights=True):
    """Calculates Cook's distance of each well given a specific model fit. 

    Cook's distance is an estimate of the influence of a data curve when performing model fitting; 
    it is used to find wells (growth curve replicates) that are suspicious as outliers.
    The higher the distance, the more suspicious the curve.

    Parameters
    ----------
    df : pandas.DataFrame
        growth curve data, see :py:mod:`curveball.ioutils` for a detailed definition.
    model_fit : lmfit.model.ModelResult
        result of model fitting procedure
    use_weights : bool, optional
        should the function use standard deviation across replicates as weights for the fitting procedure, defaults to :const:`True`.

    Returns
    -------
    dict
        a dictionary of Cook's distances: keys are wells (from the `Well` column in `df`), values are Cook's distances.

    Notes
    -----
    `Wikipedia <https://en.wikipedia.org/wiki/Cook's_distance>`_
    """
    p = model_fit.nvarys
    MSE = model_fit.chisqr / model_fit.ndata
    wells = df.Well.unique()
    D = {}
    
    for well in wells:    
        _df = df[df.Well != well]
        time = _df['Time'].values
        OD = _df['OD'].values
        weights =  calc_weights(_df) if use_weights else None
        model_fit_i = copy.deepcopy(model_fit)
        model_fit_i.fit(data=OD, t=time, weights=weights)
        D[well] = model_fit_i.chisqr / (p * MSE)
    return D


[docs]def find_outliers(df, model_fit, deviations=2, use_weights=True, ax=None, PLOT=False):
    """Find outlier wells in growth curve data.

    Uses the Cook's distance approach (`cooks_distance`); 
    values of Cook's distance that are `deviations` standard deviations **above** the mean
    are defined as outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        growth curve data, see :py:mod:`curveball.ioutils` for a detailed definition.
    model_fit : lmfit.model.ModelResult
        result of model fitting procedure
    deviations : float, optional
        the number of standard deviations that defines an outlier, defaults to 2.
    use_weights : bool, optional
        should the function use standard deviation across replicates as weights for the fitting procedure, defaults to :const:`True`.
    ax : matplotlib.axes.Axes, optional
        an axes to plot into; if not provided, a new one is created.
    PLOT : bool, optional
        if :const:`True`, the function will plot the Cook's distances of the wells and the threshold.

    Returns
    -------
    outliers : list
        the labels of the outlier wells.
    fig : matplotlib.figure.Figure
        if the argument `PLOT` was :const:`True`, the generated figure.
    ax : matplotlib.axes.Axes
        if the argument `PLOT` was :const:`True`, the generated axis.
    """
    D = cooks_distance(df, model_fit, use_weights=use_weights)
    D = sorted(D.items()) # TODO SortedDict?
    distances = [x[1] for x in D]        
    dist_mean, dist_std = np.mean(distances), np.std(distances)
    outliers = [well for well,dist in D if dist > dist_mean + deviations * dist_std]
    if PLOT:
        if ax is None:
            fig,ax = plt.subplots(1,1)
        else:
            fig = ax.get_figure()
        wells = [x[0] for x in D]            
        ax.stem(distances, linefmt='k-', basefmt='')
        ax.axhline(y=dist_mean, ls='-', color='k')
        ax.axhline(y=dist_mean + deviations * dist_std, ls='--', color='k')
        ax.axhline(y=dist_mean - deviations * dist_std, ls='--', color='k')
        ax.set_xticks(range(len(wells)))
        ax.set_xlim(-0.5, len(wells) - 0.5)
        ax.set_xticklabels(wells, rotation=90)
        ax.set_xlabel('Well')
        ax.set_ylabel("Cook's distance")
        sns.despine()
        return outliers,fig,ax
    return outliers


[docs]def find_all_outliers(df, model_fit, deviations=2, max_outlier_fraction=0.1, use_weights=True, PLOT=False):    
    """Iteratively find outlier wells in growth curve data.

    At each iteration, calls :py:func:`find_outliers`.
    Iterations stop when no more outliers are found or when the fraction of wells defined as outliers
    is above `max_outlier_fraction`.
        
    Parameters
    ----------
    df : pandas.DataFrame
        growth curve data, see :py:mod:`curveball.ioutils` for a detailed definition.
    model_fit : lmfit.model.ModelResult
        result of model fitting procedure
    deviations : float, optional
        the number of standard deviations that defines an outlier, defaults to 2.
    max_outlier_fraction : float, optional
        maximum fraction of wells to define as outliers, defaults to 0.1 = 10%.
    use_weights : bool, optional
        should the function use standard deviation across replicates as weights for the fitting procedure, defaults to :const:`True`.
    ax : matplotlib.axes.Axes
        an axes to plot into; if not provided, a new one is created.
    PLOT : bool, optional
        if :const:`True`, the function will plot the Cook's distances of the wells and the threshold.

    Returns
    -------
    outliers : list
        a list of lists: the list nested at index *i* contains the labels of the outlier wells found at iteration *i*.
    fig : matplotlib.figure.Figure
        if the argument `PLOT` was :const:`True`, the generated figure.
    ax : matplotlib.axes.Axes
        if the argument `PLOT` was :const:`True`, the generated axis.
    """
    outliers = []
    num_wells = len(df.Well.unique())
    df = copy.deepcopy(df)
    if PLOT:
        fig = plt.figure()
        o, fig, ax = find_outliers(df, model_fit, deviations=deviations, use_weights=use_weights, ax=fig.add_subplot(), PLOT=PLOT)
    else:
        o = find_outliers(df, model_fit, deviations=deviations, use_weights=use_weights, PLOT=PLOT)
    outliers.append(o)
    while len(outliers[-1]) != 0 and len(sum(outliers, [])) <  max_outlier_fraction * num_wells:
        df = df[~df.Well.isin(outliers[-1])]
        assert df.shape[0] > 0, df.shape[0] 
        model_fit = fit_model(df, use_weights=use_weights, PLOT=False, PRINT=False)[0]
        if PLOT:
            o, fig, ax = find_outliers(df, model_fit, deviations=deviations, use_weights=use_weights, ax=fig.add_subplot(), PLOT=PLOT)            
        else:
            o = find_outliers(df, model_fit, deviations=deviations, use_weights=use_weights, PLOT=PLOT)
        outliers.append(o)
    if PLOT:
        return outliers[:-1],fig,ax
    return outliers[:-1]


[docs]def calc_weights(df, PLOT=False):
    """Calculate weights for the fittiing procedure based on the standard deviations at each time point.

    If there is more than one replicate, use the standard deviations as weight.
    Warn about NaN and infinite values.

    Parameters
    ----------
    df : pandas.DataFrame
        data frame with `Time` and `OD` columns
    PLOT : bool, optional
        if :const:`True`, plot the weights by time

    Returns
    -------
    weights : np.ndarray
        array of weights, calculated as the inverse of the standard deviations at each time point
    fig : matplotlib.figure.Figure
        if the argument `PLOT` was :const:`True`, the generated figure.
    ax : matplotlib.axes.Axes
        if the argument `PLOT` was :const:`True`, the generated axis.
    """
    deviations = df.groupby('Time')['OD'].transform(lambda x: np.repeat(x.std(), len(x))).values
    if np.isnan(deviations).any():
        warn("NaN in deviations, can't use weights")
        weights = None
    else:
        weights = 1.0 / deviations
        # if any weight is nan, raise error
        idx = np.isnan(weights)
        if idx.any():
            raise ValueError("NaN weights are illegal, indices: {0}".format(idx))
        # if any weight is infinite, change to the max
        idx = np.isinf(weights)
        if idx.all():
            warn("All weights are infinite, proceeding without weights)")
            weights = None
        elif idx.any():
            warn("Found infinite weight, changing to maximum ({0} occurences)".format(idx.sum()))
            weights[idx] = weights[~idx].max()
    if PLOT:
        fig, ax = plt.subplots(1, 1)
        ax.plot(df.Time, weights, 'o')
        ax.set_xlabel('Time')
        ax.set_ylabel('Weight')
        sns.despine()
        return weights, fig, ax
    return weights


[docs]def information_criteria_weights(results):
    r"""Calculate weighted AIC and BIC for model results.

    .. :math::

        w_i = \frac{e^{-\frac{x_i - \bar{x}}{2}}}{\sum_j{e^{-\frac{x_j - \bar{x}}{2}}}}

    where :math:`w_i` is the weighted measure for model result :math:`i` and
    :math:`x_i` is the AIC or BIC of model result :math:`i`.

    Parameters
    ----------
    results : sequence of lmfit.model.ModelResult
        use the :py:attr:`lmfit.attr.ModelResult.aic` and :py:attr:`lmfit.model.ModelResult.bic` attributes to add a `weighted_aic` and `weighted_bic` attribute.

    Notes
    -----
    - `Weighted AIC explained <http://stats.stackexchange.com/questions/35010/aic-bic-number-interpretation/44360#44360>`_.
    """
    aics = np.array([m.aic for m in results])
    bics = np.array([m.bic for m in results])
    weighted_aics = np.exp(-0.5 * (aics - aics.min()))
    weighted_aics /= weighted_aics.sum()
    weighted_bics = np.exp(-0.5 * (bics - bics.min()))
    weighted_bics /= weighted_bics.sum()
    for m, b, a in zip(results, weighted_bics, weighted_aics):
        m.weighted_aic = a
        m.weighted_bic = b


def nvarys(params):
    return len([p for p in params.values() if p.vary])


[docs]def fit_exponential_growth_phase(t, N, k=2):
    r"""Fits an exponential model to the exponential growth phase.

    Fits a polynomial p(t)~N, finds tmax the time of the maximum of the derivative dp/dt,
    and fits a linear function to log(N) around tmax.
    The resulting slope (a) and intercept (b) are the parameters of the exponential model:

    .. math::

        N(t) = N_0 e^{at}

        N_0 = e^b

    Arguments
    ---------
    t : np.ndarray
        time
    N : np.ndarray
        `N[i]` is the population size at time `t[i]`
    k : int
        number of points to take around tmax, defaults to 2 for a total of 5 points

    Returns
    -------
    slope : float
        slope of the linear regression, a
    intercept : float
        intercept of the linear regression, b
    """
    N_smooth = smooth(t, N)
    dNdt = derivative(N_smooth, t)
    imax = dNdt.argmax()
    idx = np.arange(imax - k, imax + k)
    slope, intercept = linregress(t[idx], np.log(N[idx]))[:2]
    return slope, intercept


[docs]def fit_model(df, param_guess=None, param_min=None, param_max=None, param_fix=None, 
              models=None, use_weights=False, use_Dfun=False, method='leastsq', ax=None, PLOT=True, PRINT=True):
    r"""Fit and select a growth model to growth curve data.

    This function fits several growth models to growth curve data (``OD`` as a function of ``Time``).

    Parameters
    ----------
    df : pandas.DataFrame
        growth curve data, see :py:mod:`curveball.ioutils` for a detailed definition.
    param_guess : dict, optional
        a dictionary of parameter guesses to use (key: :py:class:`str` of param name; value: :py:class:`float` of param guess).
    param_min : dict, optional
        a dictionary of parameter minimum bounds to use (key: :py:class:`str` of param name; value: :py:class:`float` of param min bound).
    param_max : dict, optional
        a dictionary of parameter maximum bounds to use (key: :py:class:`str` of param name; value: :py:class:`float` of param max bound).
    param_fix : set, optional
        a set of names (:py:class:`str`) of parameters to fix rather then vary, while fitting the models.
    use_weights : bool, optional
        should the function use the deviation across replicates as weights for the fitting procedure, defaults to :const:`False`.
    use_Dfun : bool, optional
        should the function calculate the partial derivatives of the model functions to be used in the fitting procedure, defaults to :const:`False`.
    models : one or more model classes, optional
        model classes (not instances) to use for fitting; defaults to all model classes in `curveball.baranyi_roberts_model`.
    method : str, optional
        the minimization method to use, defaults to `leastsq`, 
        can be anything accepted by :py:func:`lmfit.minimizer.Minimizer.minimize` or :py:func:`lmfit.minimizer.Minimizer.scalar_minimize`.
    ax : matplotlib.axes.Axes, optional
        an axes to plot into; if not provided, a new one is created.
    PLOT : bool, optional
        if :const:`True`, the function will plot the all model fitting results.
    PRINT : bool, optional
        if :const:`True`, the function will print the all model fitting results.

    Returns
    -------
    models : list of lmfit.model.ModelResult
        all model fitting results, sorted by increasing BIC (a measure of fit quality).
    fig : matplotlib.figure.Figure
        figure object.
    ax : numpy.ndarray
        array of :py:class:`matplotlib.axes.Axes` objects, one for each model result, with the same order as `models`.

    Raises
    ------
    TypeError
        if one of the input parameters is of the wrong type (not guaranteed).
    ValueError
        if the input is bad, for example, `df` is empty (not guaranteed).
    AssertionError
        if any of the intermediate calculated values are inconsistent (for example, ``y0<0``).

    Examples
    --------
    >>> import curveball
    >>> import pandas as pd
    >>> import matplotlib.pyplot as plt
    >>> plate = pd.read_csv('plate_templates/G-RG-R.csv')
    >>> df = curveball.ioutils.read_tecan_xlsx('data/Tecan_280715.xlsx', label='OD', plate=plate)
    >>> green_models = curveball.models.fit_model(df[df.Strain == 'G'], PLOT=True)
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input df must be a %s, but it is %s" % (pd.DataFrame.__name__, df.__class__.__name__))
    if df.shape[0] == 0:
        raise ValueError("No rows in input df")
    
    if param_guess is None:
        param_guess = dict()
    if param_fix is None:
        param_fix = set()

    df = df.sort_values(by=['Time', 'OD'])
    time = df['Time'].values
    OD = df['OD'].values
    weights =  calc_weights(df) if use_weights else None
    # TODO why should we use weights if we use the whole data set?
    ODerr = df.groupby('Time').OD.transform(lambda x: np.repeat(x.std(), len(x))).values
   
    if models is None:
        models = get_models(curveball.baranyi_roberts_model)
    elif is_model(models):
        models = [models]
    results = [None] * len(models)
    for i, model_class in enumerate(models):
        model = model_class()
        params = model.guess(data=OD, t=time, param_guess=param_guess, param_min=param_min, param_max=param_max, param_fix=param_fix)    
        fit_kws = {'Dfun': make_Dfun(model, params), "col_deriv":True} if use_Dfun else {}        
        model_result = model.fit(data=OD, t=time, params=params, weights=weights, fit_kws=fit_kws, method=method)
        results[i] = model_result

    # sort by increasing BIC
    information_criteria_weights(results)
    results.sort(key=lambda m: m.bic)

    if PRINT:
        print(results[0].fit_report(show_correl=False))
    if PLOT:        
        dy = df['OD'].max() / 50.0
        dx = df['Time'].max() / 25.0
        columns = min(3, len(results))
        rows = int(np.ceil(len(results) / columns))
        w = max(8, 4 * columns)
        h = max(6, 3*rows)
        fig, ax = plt.subplots(rows, columns, sharex=True, sharey=True, figsize=(w, h))
        if not hasattr(ax, '__iter__'):
            ax = np.array(ax, ndmin=2)
        for i,fit in enumerate(results):
            row = i // columns
            col = i % columns
            _ax = ax[row, col]
            vals = fit.best_values
            fit.plot_fit(ax=_ax, datafmt='.', data_kws={'alpha':0.3}, fit_kws={'lw': 4}, yerr=ODerr)
            _ax.axhline(y=vals.get('y0', 0), color='k', ls='--')
            _ax.axhline(y=vals.get('K', 0), color='k', ls='--')          
            title = '%s %dp\nBIC: %.3f\ny0=%.2f, K=%.2f, r=%.2g\n' + r'$\nu$=%.2g, $q_0$=%.2g, v=%.2g'
            title = title % (fit.model.name, fit.nvarys, fit.bic, vals.get('y0', np.nan), vals.get('K', np.nan), vals.get('r', np.nan), vals.get('nu', np.nan), vals.get('q0', np.nan), vals.get('v', np.nan))
            _ax.set_title(title)
            _ax.get_legend().set_visible(False)
            _ax.set_xlabel('')
            _ax.set_ylabel('')
            if col == 0:
                _ax.set_ylabel('OD')
            if row == rows - 1:
                _ax.set_xlabel('Time')
        _ax.set_xlim(0, 1.1 * df['Time'].max())
        _ax.set_ylim(0.9 * df['OD'].min(), 1.1 * df['OD'].max())
        sns.despine()
        fig.tight_layout()
        return results, fig, ax
    return results


if __name__ == '__main__':    
    # simulate 30 growth curves
    df = randomize(t=12, y0=0.12, K=0.56, r=0.8, nu=1.8, q0=0.2, v=0.8, reps=30, noise_std=0.04, random_seed=0)

    # fit models to growth curves
    results, fig, ax = fit_model(df, use_Dfun=True, PLOT=True, PRINT=True)    
    plt.savefig('test_models.png')
    plt.show()