Source code for curveball.ioutils

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# This file is part of curveball.
# https://github.com/yoavram/curveball

# Licensed under the MIT license:
# http://www.opensource.org/licenses/MIT-license
# Copyright (c) 2015, Yoav Ram <yoav@yoavram.com>
from __future__ import print_function
from __future__ import division
from builtins import zip
from builtins import str
from builtins import filter
from builtins import map
from builtins import range
from six import string_types
from past.utils import old_div
import numpy as np
import pandas as pd
from string import ascii_uppercase
from scipy.io import loadmat
import re
import datetime
import dateutil.parser
from glob import glob
import os.path
from warnings import warn


MAT_VERSION = u'1.0'


def _fix_dtypes(df):
    """Fix the dtypes of categorial columns.

    Parameters
    ----------
    df : pandas.DataFrame
        data frame to fix

    Returns
    -------
    None
    """
    if u'Strain' in df.columns:
        df[u'Strain'] = df[u'Strain'].astype(str)
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].astype(str)
    

[docs]def read_curveball_csv(filename, max_time=None, plate=None):
    """Reads growth measurements from a Curveball csv (comma separated values) file.

    Parameters
    ----------
    filename : str
        path to the file.
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame

    Examples
    --------   
    >>> df = curveball.ioutils.read_curveball_csv("data/Tecan_210115.csv")
    """
    df = pd.read_csv(filename, encoding='utf8')
    if plate is None and 'Strain' not in df.columns:
        df[u'Strain'] = u'0'
    if plate is None and 'Color' not in df.columns:
        df[u'Color'] = u'#000000'
    if max_time is not None:
        df = df[df.Time <= max_time]
    _fix_dtypes(df)
    return df


[docs]def write_curveball_csv(df, filename):
    """Reads growth measurements from a Curveball csv (comma separated values) file.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        data frame to write
    filename : str
        path to the output file        
    """
    df.to_csv(filename, index=False, encoding='utf8')


[docs]def read_tecan_xlsx(filename, label=u'OD', sheets=None, max_time=None, plate=None, PRINT=False):
    """Reads growth measurements from a Tecan Infinity Excel output file.

    Parameters
    ----------
    filename : str
        path to the file.
    label : str / sequence of str
        a string or sequence of strings containing measurment names used as titles of the data tables in the file.
    sheets : list, optional
        list of sheet numbers, if known. Otherwise the function will try to all the sheets.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame
        Data frame containing the columns:

        - ``Time`` (:py:class:`float`, in hours)
        - ``Temp. [°C]`` (:py:class:`float`)
        - ``Cycle Nr.`` (:py:class:`int`)
        - ``Well`` (:py:class:`str`): the well name, usually a letter for the row and a number of the column.
        - ``Row`` (:py:class:`str`): the letter corresponding to the well row.
        - ``Col`` (:py:class:`str`): the number corresponding to the well column.
        - ``Strain`` (:py:class:`str`): if a `plate` was given, this is the strain name corresponding to the well from the plate.
        - ``Color`` (:py:class:`str`, hex format): if a `plate` was given, this is the strain color corresponding to the well from the plate.

        There will also be a separate column for each label, and if there is more than one label, a separate `Time` and `Temp. [°C]` column for each label.

    Raises
    ------
    ValueError
        if not data was parsed from the file.

    Examples
    --------
    
    >>> plate = pd.read_csv("plate_templates/G-RG-R.csv")
    >>> df = curveball.ioutils.read_tecan_xlsx("data/Tecan_210115.xlsx", label=('OD','Green','Red'), max_time=12, plate=plate)
    >>> df.shape
    (8544, 9)
    """
    import xlrd
    wb = xlrd.open_workbook(filename)
    dateandtime = datetime.datetime.now() # default

    if isinstance(label, string_types):
        label = [label]
    if sheets is None:
        sheets = range(wb.nsheets)
    if PRINT: print("Reading {0} worksheets from workbook {1}".format(len(sheets), filename))
    label_dataframes = []
    for lbl in label:
        sheet_dataframes = []        
        ## FOR sheet
        for sh_i in sheets:
            sh = wb.sheet_by_index(sh_i)
            if sh.nrows == 0:
                continue # to next sheet
        
            for i in range(sh.nrows):
                ## FOR row
                row = sh.row_values(i)
                if row[0].startswith(u'Date'):
                    if isinstance(row[1], string_types):
                        date = ''.join(row[1:])
                        next_row = sh.row_values(i + 1)
                        if next_row[0].startswith(u'Time'):
                            time = ''.join(next_row[1:])
                        else:
                            warn(u"Warning: time row missing (sheet '{0}', row{1}), found row starting with {2}".format(sh.name, i, row[0]))
                        dateandtime = dateutil.parser.parse("%s %s" % (date, time))
                    elif isinstance(row[1], float):
                        date_tuple = xlrd.xldate_as_tuple(row[1], wb.datemode)
                        next_row = sh.row_values(i + 1)
                        if next_row[0].startswith(u'Time'):
                            time = tuple(map(int, next_row[1].split(':')))[:3]
                            date_tuple = date_tuple[:3] + time
                        else:
                            warn(u"Warning: time row missing (sheet '{0}', row{1}), found row starting with {2}".format(sh.name, i, row[0]))
                        dateandtime = datetime.datetime(*date_tuple)
                    else:
                        warn(u"Warning: date row (sheet '{2}', row {3}) could not be parsed: {0} {1}".format(row[1], type(row[1]), sh.name, i))
                if row[0] == lbl:
                    break
                ## FOR row ENDS
            
            data = {}            
            for j in range(i + 1, sh.nrows):
                ## FOR row
                row = sh.row(j)
                if not row[0].value:
                    break
                data[row[0].value] = [x.value for x in row[1:] if x.ctype == 2]
                ## FOR row ENDS

            if not data:
                raise ValueError("No data found in sheet {1} of workbook {0}".format(filename, sh_i))

            min_length = min(map(len, data.values()))
            for k,v in data.items():
                data[k] =  v[:min_length]

            df = pd.DataFrame(data)
            df = pd.melt(df, id_vars=(u'Time [s]', u'Temp. [°C]', u'Cycle Nr.'), var_name=u'Well', value_name=lbl)
            df.rename(columns={u'Time [s]': u'Time'}, inplace=True)
            df.Time = [dateandtime + datetime.timedelta(0, t) for t in df.Time]        
            df[u'Row'] = [x[0] for x in df.Well]
            df[u'Col'] = [int(x[1:]) for x in df.Well]
            sheet_dataframes.append(df)
        ## FOR sheet ENDS

        n_sheet_dataframes = len(sheet_dataframes)
        if n_sheet_dataframes == 0:
            df = pd.DataFrame()
        elif n_sheet_dataframes == 1:
            df = sheet_dataframes[0]
        else:
            df = pd.concat(sheet_dataframes)

        min_time = df.Time.min()
        if PRINT:
            print("Starting time", min_time)
        df.Time = [(t - min_time).total_seconds() / 3600.0 for t in df.Time]
        if max_time is not None:
            df = df[df.Time <= max_time]
        df = df.sort_values([u'Row', u'Col', u'Time'])
        label_dataframes.append((lbl,df))

    n_label_dataframes = len(label_dataframes)
    if n_label_dataframes == 0: # no dataframes
        return pd.DataFrame()
    if n_label_dataframes == 1: # just one dataframe
        df = label_dataframes[0][1]
    else: # multiple dataframes, merge together
        # FIXME last label isn't used as a suffix, not sure why
        lbl, df = label_dataframes[0]
        lbl = '_' + lbl
        for lbli, dfi in label_dataframes[1:]:
            lbli = '_' + lbli
            df = pd.merge(df, dfi, on=(u'Cycle Nr.', u'Well', u'Row', u'Col'), suffixes=(lbl,lbli))
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if PRINT: print("Read {0} records from workbook".format(df.shape[0]))
    _fix_dtypes(df)
    return df


[docs]def read_tecan_mat(filename, time_label=u'tps', value_label=u'plate_mat', value_name=u'OD', plate_width=12, max_time=None, plate=None):
    """Reads growth measurements from a Matlab file generated by a propriety script at the *Pilpel lab*.

    Parameters
    ----------
    filename : str
        name of the XML file to be read. Use ``*`` and ``?`` in filename to read multiple files and parse them into a single data frame.
    time_label : str, optional
        name of the field used to store the time values, defaults to ``tps``.
    label : str
        name of the field used to store the OD values, defaults to ``plate_mat``.
    plate_width : int
        width of the microwell in plate in number of wells, defaults to 12.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame
        Data frame containing the columns:

        - ``Time`` (:py:class:`float`, in hours)
        - ``OD`` (:py:class:`float`)
        - ``Well`` (:py:class:`str`): the well name, usually a letter for the row and a number of the column.
        - ``Row`` (:py:class:`str`): the letter corresponding to the well row.
        - ``Col`` (:py:class:`str`): the number corresponding to the well column.
        - ``Filename`` (:py:class:`str`): the filename from which this measurement was read.
        - ``Strain`` (:py:class:`str`): if a `plate` was given, this is the strain name corresponding to the well from the plate.
        - ``Color`` (:py:class:`str`, hex format): if a `plate` was given, this is the strain color corresponding to the well from the plate.
    """
    mat = loadmat(filename, appendmat=True)
    if mat[u'__version__'] != MAT_VERSION:
        warn(u"Warning: expected mat file version {0} but got {1}".format(MAT_VERSION, mat[u'__version__']))
    t = mat[time_label]
    t = t.reshape(max(t.shape))
    y = mat[value_label]
    assert y.shape[1] == t.shape[0]

    df = pd.DataFrame(y.T, columns=np.arange(y.shape[0]) + 1)
    df[u'Time'] = old_div(t, 3600.)
    df[u'Cycle Nr.'] = np.arange(1, 1 + len(t))
    df = pd.melt(df, id_vars=(u'Cycle Nr.', u'Time'), var_name=u'Well', value_name=value_name)
    df[u'Well'] = [ascii_uppercase[old_div((int(w) - 1), plate_width)] + str(w % plate_width if w % plate_width > 0 else plate_width) for w in df[u'Well']]
    df[u'Row'] = [w[0] for w in df[u'Well']]
    df[u'Col'] = [int(w[1:]) for w in df[u'Well']]
    
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if not max_time:
        max_time = df.Time.max()
    df = df[df.Time < max_time]
    df.sort_values([u'Row', u'Col', u'Time'], inplace=True)    
    _fix_dtypes(df)
    return df


[docs]def read_tecan_xml(filename, label=u'OD', max_time=None, plate=None):
    """Reads growth measurements from a Tecan Infinity XML output files.

    Parameters
    ----------
    filename : str
        pattern of the XML files to be read. Use ``*`` and ``?`` in filename to read multiple files and parse them into a single data frame.
    label : str, optional
        measurment name used as ``Name`` in the measurement sections in the file, defaults to ``OD``.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame
        Data frame containing the columns:

        - ``Time`` (:py:class:`float`, in hours)
        - ``Well`` (:py:class:`str`): the well name, usually a letter for the row and a number of the column.
        - ``Row`` (:py:class:`str`): the letter corresponding to the well row.
        - ``Col`` (:py:class:`str`): the number corresponding to the well column.
        - ``Filename`` (:py:class:`str`): the filename from which this measurement was read.
        - ``Strain`` (:py:class:`str`): if a `plate` was given, this is the strain name corresponding to the well from the plate.
        - ``Color`` (:py:class:`str`, hex format): if a `plate` was given, this is the strain color corresponding to the well from the plate.

    There will also be a separate column for the value of the label.

    Examples
    --------    

    >>> import zipfile
    >>> with zipfile.ZipFile("data/20130211_dh.zip") as z:
        z.extractall("data/20130211_dh")
    >>> plate = pd.read_csv("plate_templates/checkerboard.csv")
    >>> df = curveball.ioutils.read_tecan_xlsx("data/20130211_dh/*.xml", 'OD', plate=plate)
    >>> df.shape
    (2016, 8)

    Notes
    -----
    This function was adapted from `choderalab/assaytools <https://github.com/choderalab/assaytools/blob/908471e7976e207df3f9b0e31b2a89f84da40607/AssayTools/platereader.py>`_ (licensed under LGPL).
    """
    from lxml import etree
    dataframes = []
    for filename in glob(filename):
        # Parse XML file into nodes.
        root_node = etree.parse(filename)

        # Build a dict of section nodes.
        section_nodes = { section_node.get(u'Name') : section_node for section_node in root_node.xpath(u"/*/Section") }

        # Process all sections.
        if label not in section_nodes:
            return pd.DataFrame()

        section_node = section_nodes[label]
        
        # Get the time of measurement
        time_start = section_node.attrib[u'Time_Start']

        # Get a list of all well nodes
        well_nodes = section_node.xpath(u"*/Well")
        
        # Process all wells into data.
        well_data = []
        for well_node in well_nodes:
            well = well_node.get(u'Pos')
            value = float(well_node.xpath(u"string()"))
            well_data.append({u'Well': well, label: value})

        # Add to data frame
        df = pd.DataFrame(well_data)
        df[u'Row'] = [x[0] for x in df.Well]
        df[u'Col'] = [int(x[1:]) for x in df.Well]
        df[u'Time'] = dateutil.parser.parse(time_start)
        df[u'Filename'] = os.path.split(filename)[-1]
        dataframes.append(df)
    df = pd.concat(dataframes)
    min_time = df.Time.min()
    df.Time = [(t - min_time).total_seconds() / 3600.0 for t in df.Time]
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if max_time is not None:
        df = df[df.Time <= max_time]
    df.sort_values([u'Row', u'Col', u'Time'], inplace=True)
    _fix_dtypes(df)
    return df


[docs]def read_sunrise_xlsx(filename, label=u'OD', max_time=None, plate=None):
    """Reads growth measurements from a Tecan Sunrise Excel output file.

    Parameters
    ----------
    filename : str
        pattern of the XLSX files to be read. Use * and ? in filename to read multiple files and parse them into a single data frame.    label : str, optional
    label : str, optional
        measurment name to use for the data in the file, defaults to ``OD``.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame
        Data frame containing the columns:

        - ``Time`` (:py:class:`float`, in hours)
        - ``OD`` (or the value of `label`, if given)
        - ``Well`` (:py:class:`str`): the well name, usually a letter for the row and a number of the column.
        - ``Row`` (:py:class:`str`): the letter corresponding to the well row.
        - ``Col`` (:py:class:`str`): the number corresponding to the well column.
        - ``Filename`` (:py:class:`str`): the filename from which this measurement was read.
        - ``Strain`` (:py:class:`str`): if a `plate` was given, this is the strain name corresponding to the well from the plate.
        - ``Color`` (:py:class:`str`, hex format): if a `plate` was given, this is the strain color corresponding to the well from the plate.
    """
    import xlrd
    dataframes = []
    files = glob(filename)
    if not files:
        return pd.DataFrame()
    for filename in files:
        wb = xlrd.open_workbook(filename)
        for sh in wb.sheets():
            if sh.nrows > 0:
                break
        parse_data = False # start with metadata
        index = []
        data = []

        for i in range(sh.nrows):
            row = sh.row_values(i)
            if row[0] == u'Date:':
                date = next(filter(lambda x: isinstance(x, float), row[1:]))
                date = xlrd.xldate_as_tuple(date, 0)        
            elif row[0] == u'Time:':
                time = next(filter(lambda x: isinstance(x, float), row[1:]))
                time = xlrd.xldate_as_tuple(time, 0)
            elif row[0] == u'<>':
                columns = list(map(int, row[1:]))
                parse_data = True
            elif len(row[0]) == 0 and parse_data:
                break
            elif parse_data:
                index.append(row[0])
                data.append(list(map(float, row[1:])))
                
        dateandtime = date[:3] + time[-3:]
        dateandtime = datetime.datetime(*dateandtime)
        
        df = pd.DataFrame(data, columns=columns, index=index)
        df[u'Row'] = index
        df = pd.melt(df, id_vars=u'Row', var_name=u'Col', value_name=label)
        df[u'Time'] = dateandtime
        df[u'Well'] = [x[0] + str(x[1]) for x in zip(df.Row, df.Col)]
        df[u'Filename'] = os.path.split(filename)[-1]
        dataframes.append(df)
    df = pd.concat(dataframes)
    min_time = df.Time.min()
    df.Time = [(t - min_time).total_seconds() / 3600.0 for t in df.Time]
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if max_time is not None:
        df = df[df.Time <= max_time]
    df.sort_values([u'Row', u'Col', u'Time'], inplace=True)
    _fix_dtypes(df)
    return df


def read_biotek_xlsx(filename, max_time=None, plate=None, PRINT=False):
    import xlrd
    wb = xlrd.open_workbook(filename)
    for sh_i in range(wb.nsheets):
        sh = wb.sheet_by_index(sh_i)
        if sh.nrows > 0:
            break
    else: # all sheets are empty
        warnings.warn('All sheets are empty in {}'.format(filename))
        return pd.DataFrame()

    if PRINT:
        print("Reading worksheet {0} with {2} lines from workbook {1}".format(sh_i, filename, sh.nrows))

    in_data = False
    data = []
    dfs = []
    i = 0
    while i < sh.nrows:
        row = sh.row_values(i)
        if not in_data and row[1] == 'Time':
            columns = row[1:]
            in_data = True
        elif in_data and row[1] != '':
            row[1] *= 24 # days -> hours
            data.append(row[1:])
        elif in_data and row[1] == '':
            in_data = False
            df = pd.DataFrame(data, columns=columns)
            df = pd.melt(df, [u'Time', u'T° 600'], var_name=u'Well', value_name=u'OD')
            dfs.append(df)
        i += 1
    df = pd.concat(dfs )

    df[u'Row'] = [x[0] for x in df.Well]
    df[u'Col'] = [int(x[1:]) for x in df.Well]

    min_time = df.Time.min()
    if PRINT:
        print("Starting time", min_time)
    if max_time is not None:
        df = df[df.Time <= max_time]
    df.sort_values([u'Row', u'Col', u'Time'], inplace=True)

    if df.shape[0] == 0: # no dataframes
        return pd.DataFrame()

    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if PRINT: print("Read {0} records from workbook".format(df.shape[0]))
    _fix_dtypes(df)
    return df