Source code for curveball.ioutils

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# This file is part of curveball.
# https://github.com/yoavram/curveball

# Licensed under the MIT license:
# http://www.opensource.org/licenses/MIT-license
# Copyright (c) 2015, Yoav Ram <yoav@yoavram.com>
from __future__ import print_function
from __future__ import division
from builtins import zip
from builtins import str
from builtins import filter
from builtins import map
from builtins import range
from six import string_types
from past.utils import old_div
import numpy as np
import pandas as pd
from string import ascii_uppercase
from scipy.io import loadmat
import re
import datetime
import dateutil.parser
from glob import glob
import os.path
from warnings import warn


MAT_VERSION = u'1.0'


def _fix_dtypes(df):
    """Fix the dtypes of categorial columns.

    Parameters
    ----------
    df : pandas.DataFrame
        data frame to fix

    Returns
    -------
    None
    """
    if u'Strain' in df.columns:
        df[u'Strain'] = df[u'Strain'].astype(str)
    for col in df.columns:
        if df[col].dtype == object:
            df[col] = df[col].astype(str)
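
# Usage sketch for _fix_dtypes (not part of the public API): it mutates the
# data frame in place, converting the 'Strain' column and any object-dtype
# column to strings. The toy data frame below is hypothetical.
def _example_fix_dtypes():
    df = pd.DataFrame({u'Strain': [1, 2], u'OD': [0.1, 0.2]})
    _fix_dtypes(df)
    return df.dtypes  # 'Strain' values are now strings; 'OD' stays float
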
    

def read_curveball_csv(filename, max_time=None, plate=None):
    """Reads growth measurements from a Curveball csv (comma separated values) file.

    Parameters
    ----------
    filename : str
        path to the file.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame

    Examples
    --------
    >>> df = curveball.ioutils.read_curveball_csv("data/Tecan_210115.csv")
    """
    df = pd.read_csv(filename, encoding='utf8')
    if plate is None and 'Strain' not in df.columns:
        df[u'Strain'] = u'0'
    if plate is None and 'Color' not in df.columns:
        df[u'Color'] = u'#000000'
    if max_time is not None:
        df = df[df.Time <= max_time]
    _fix_dtypes(df)
    return df
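
# Usage sketch for read_curveball_csv, assuming a hypothetical file
# "my_growth.csv" with at least Time, Well and OD columns (for example, a
# file previously written by write_curveball_csv). Missing Strain/Color
# columns are filled with defaults when no plate is given.
def _example_read_curveball_csv(path="my_growth.csv"):
    df = read_curveball_csv(path, max_time=24)   # keep the first 24 hours
    return df.groupby(u'Well')[u'OD'].max()      # peak OD per well
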
def write_curveball_csv(df, filename):
    """Writes growth measurements to a Curveball csv (comma separated values) file.

    Parameters
    ----------
    df : pandas.DataFrame
        data frame to write.
    filename : str
        path to the output file.
    """
    df.to_csv(filename, index=False, encoding='utf8')
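
# Round-trip sketch: write a data frame with write_curveball_csv and read it
# back with read_curveball_csv. The file name and toy data are hypothetical.
def _example_write_curveball_csv(path="roundtrip.csv"):
    df = pd.DataFrame({u'Time': [0.0, 1.0], u'Well': [u'A1', u'A1'], u'OD': [0.05, 0.12]})
    write_curveball_csv(df, path)
    return read_curveball_csv(path)  # Strain/Color defaults are added on read
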
def read_tecan_xlsx(filename, label=u'OD', sheets=None, max_time=None, plate=None, PRINT=False):
    """Reads growth measurements from a Tecan Infinity Excel output file.

    Parameters
    ----------
    filename : str
        path to the file.
    label : str / sequence of str
        a string or sequence of strings containing measurement names used as titles of the data tables in the file.
    sheets : list, optional
        list of sheet numbers, if known. Otherwise the function will try to read all the sheets.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame
        Data frame containing the columns:

        - ``Time`` (:py:class:`float`, in hours)
        - ``Temp. [°C]`` (:py:class:`float`)
        - ``Cycle Nr.`` (:py:class:`int`)
        - ``Well`` (:py:class:`str`): the well name, usually a letter for the row and a number of the column.
        - ``Row`` (:py:class:`str`): the letter corresponding to the well row.
        - ``Col`` (:py:class:`str`): the number corresponding to the well column.
        - ``Strain`` (:py:class:`str`): if a `plate` was given, this is the strain name corresponding to the well from the plate.
        - ``Color`` (:py:class:`str`, hex format): if a `plate` was given, this is the strain color corresponding to the well from the plate.

        There will also be a separate column for each label, and if there is more than one label,
        a separate `Time` and `Temp. [°C]` column for each label.

    Raises
    ------
    ValueError
        if no data was parsed from the file.

    Examples
    --------
    >>> plate = pd.read_csv("plate_templates/G-RG-R.csv")
    >>> df = curveball.ioutils.read_tecan_xlsx("data/Tecan_210115.xlsx", label=('OD','Green','Red'), max_time=12, plate=plate)
    >>> df.shape
    (8544, 9)
    """
    import xlrd
    wb = xlrd.open_workbook(filename)
    dateandtime = datetime.datetime.now() # default
    if isinstance(label, string_types):
        label = [label]
    if sheets is None:
        sheets = range(wb.nsheets)
    if PRINT:
        print("Reading {0} worksheets from workbook {1}".format(len(sheets), filename))
    label_dataframes = []
    for lbl in label:
        sheet_dataframes = []
        ## FOR sheet
        for sh_i in sheets:
            sh = wb.sheet_by_index(sh_i)
            if sh.nrows == 0:
                continue # to next sheet
            for i in range(sh.nrows): ## FOR row
                row = sh.row_values(i)
                if row[0].startswith(u'Date'):
                    if isinstance(row[1], string_types):
                        date = ''.join(row[1:])
                        next_row = sh.row_values(i + 1)
                        if next_row[0].startswith(u'Time'):
                            time = ''.join(next_row[1:])
                        else:
                            warn(u"Warning: time row missing (sheet '{0}', row {1}), found row starting with {2}".format(sh.name, i, row[0]))
                        dateandtime = dateutil.parser.parse("%s %s" % (date, time))
                    elif isinstance(row[1], float):
                        date_tuple = xlrd.xldate_as_tuple(row[1], wb.datemode)
                        next_row = sh.row_values(i + 1)
                        if next_row[0].startswith(u'Time'):
                            time = tuple(map(int, next_row[1].split(':')))[:3]
                            date_tuple = date_tuple[:3] + time
                        else:
                            warn(u"Warning: time row missing (sheet '{0}', row {1}), found row starting with {2}".format(sh.name, i, row[0]))
                        dateandtime = datetime.datetime(*date_tuple)
                    else:
                        warn(u"Warning: date row (sheet '{2}', row {3}) could not be parsed: {0} {1}".format(row[1], type(row[1]), sh.name, i))
                if row[0] == lbl:
                    break
            ## FOR row ENDS
            data = {}
            for j in range(i + 1, sh.nrows): ## FOR row
                row = sh.row(j)
                if not row[0].value:
                    break
                data[row[0].value] = [x.value for x in row[1:] if x.ctype == 2]
            ## FOR row ENDS
            if not data:
                raise ValueError("No data found in sheet {1} of workbook {0}".format(filename, sh_i))
            min_length = min(map(len, data.values()))
            for k, v in data.items():
                data[k] = v[:min_length]
            df = pd.DataFrame(data)
            df = pd.melt(df, id_vars=(u'Time [s]', u'Temp. [°C]', u'Cycle Nr.'), var_name=u'Well', value_name=lbl)
            df.rename(columns={u'Time [s]': u'Time'}, inplace=True)
            df.Time = [dateandtime + datetime.timedelta(0, t) for t in df.Time]
            df[u'Row'] = [x[0] for x in df.Well]
            df[u'Col'] = [int(x[1:]) for x in df.Well]
            sheet_dataframes.append(df)
        ## FOR sheet ENDS
        n_sheet_dataframes = len(sheet_dataframes)
        if n_sheet_dataframes == 0:
            df = pd.DataFrame()
        elif n_sheet_dataframes == 1:
            df = sheet_dataframes[0]
        else:
            df = pd.concat(sheet_dataframes)
        min_time = df.Time.min()
        if PRINT:
            print("Starting time", min_time)
        df.Time = [(t - min_time).total_seconds() / 3600.0 for t in df.Time]
        if max_time is not None:
            df = df[df.Time <= max_time]
        df = df.sort_values([u'Row', u'Col', u'Time'])
        label_dataframes.append((lbl, df))

    n_label_dataframes = len(label_dataframes)
    if n_label_dataframes == 0:
        # no dataframes
        return pd.DataFrame()
    if n_label_dataframes == 1:
        # just one dataframe
        df = label_dataframes[0][1]
    else:
        # multiple dataframes, merge together
        # FIXME last label isn't used as a suffix, not sure why
        lbl, df = label_dataframes[0]
        lbl = '_' + lbl
        for lbli, dfi in label_dataframes[1:]:
            lbli = '_' + lbli
            df = pd.merge(df, dfi, on=(u'Cycle Nr.', u'Well', u'Row', u'Col'), suffixes=(lbl, lbli))
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if PRINT:
        print("Read {0} records from workbook".format(df.shape[0]))
    _fix_dtypes(df)
    return df
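
# Usage sketch for read_tecan_xlsx with multiple labels, following the
# docstring example; the workbook and plate paths are placeholders. With more
# than one label the result has one column per label, plus per-label Time and
# Temp. columns distinguished by '_<label>' suffixes.
def _example_read_tecan_xlsx(path="data/Tecan_210115.xlsx"):
    plate = pd.read_csv("plate_templates/G-RG-R.csv")
    df = read_tecan_xlsx(path, label=(u'OD', u'Green', u'Red'), max_time=12, plate=plate)
    return df.groupby(u'Strain')[u'OD'].describe()  # OD summary per strain
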
def read_tecan_mat(filename, time_label=u'tps', value_label=u'plate_mat', value_name=u'OD', plate_width=12, max_time=None, plate=None):
    """Reads growth measurements from a Matlab file generated by a proprietary script at the *Pilpel lab*.

    Parameters
    ----------
    filename : str
        name of the MAT file to be read.
    time_label : str, optional
        name of the field used to store the time values, defaults to ``tps``.
    value_label : str, optional
        name of the field used to store the OD values, defaults to ``plate_mat``.
    value_name : str, optional
        name of the value column in the resulting data frame, defaults to ``OD``.
    plate_width : int
        width of the microwell plate, in number of wells, defaults to 12.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame
        Data frame containing the columns:

        - ``Time`` (:py:class:`float`, in hours)
        - ``OD`` (:py:class:`float`)
        - ``Well`` (:py:class:`str`): the well name, usually a letter for the row and a number of the column.
        - ``Row`` (:py:class:`str`): the letter corresponding to the well row.
        - ``Col`` (:py:class:`str`): the number corresponding to the well column.
        - ``Filename`` (:py:class:`str`): the filename from which this measurement was read.
        - ``Strain`` (:py:class:`str`): if a `plate` was given, this is the strain name corresponding to the well from the plate.
        - ``Color`` (:py:class:`str`, hex format): if a `plate` was given, this is the strain color corresponding to the well from the plate.
    """
    mat = loadmat(filename, appendmat=True)
    if mat[u'__version__'] != MAT_VERSION:
        warn(u"Warning: expected mat file version {0} but got {1}".format(MAT_VERSION, mat[u'__version__']))
    t = mat[time_label]
    t = t.reshape(max(t.shape))
    y = mat[value_label]
    assert y.shape[1] == t.shape[0]
    df = pd.DataFrame(y.T, columns=np.arange(y.shape[0]) + 1)
    df[u'Time'] = old_div(t, 3600.)
    df[u'Cycle Nr.'] = np.arange(1, 1 + len(t))
    df = pd.melt(df, id_vars=(u'Cycle Nr.', u'Time'), var_name=u'Well', value_name=value_name)
    df[u'Well'] = [ascii_uppercase[old_div((int(w) - 1), plate_width)] + str(w % plate_width if w % plate_width > 0 else plate_width) for w in df[u'Well']]
    df[u'Row'] = [w[0] for w in df[u'Well']]
    df[u'Col'] = [int(w[1:]) for w in df[u'Well']]
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if not max_time:
        max_time = df.Time.max()
    df = df[df.Time < max_time]
    df.sort_values([u'Row', u'Col', u'Time'], inplace=True)
    _fix_dtypes(df)
    return df
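
# Usage sketch for read_tecan_mat, assuming a hypothetical MAT file that
# stores times in seconds under a 'tps' field and OD readings under a
# 'plate_mat' field, matching the defaults above.
def _example_read_tecan_mat(path="experiment.mat"):
    df = read_tecan_mat(path, plate_width=12)
    return df.pivot_table(index=u'Time', columns=u'Well', values=u'OD')
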
def read_tecan_xml(filename, label=u'OD', max_time=None, plate=None):
    """Reads growth measurements from Tecan Infinity XML output files.

    Parameters
    ----------
    filename : str
        pattern of the XML files to be read. Use ``*`` and ``?`` in filename to read multiple files and parse them into a single data frame.
    label : str, optional
        measurement name used as ``Name`` in the measurement sections in the file, defaults to ``OD``.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame
        Data frame containing the columns:

        - ``Time`` (:py:class:`float`, in hours)
        - ``Well`` (:py:class:`str`): the well name, usually a letter for the row and a number of the column.
        - ``Row`` (:py:class:`str`): the letter corresponding to the well row.
        - ``Col`` (:py:class:`str`): the number corresponding to the well column.
        - ``Filename`` (:py:class:`str`): the filename from which this measurement was read.
        - ``Strain`` (:py:class:`str`): if a `plate` was given, this is the strain name corresponding to the well from the plate.
        - ``Color`` (:py:class:`str`, hex format): if a `plate` was given, this is the strain color corresponding to the well from the plate.

        There will also be a separate column for the value of the label.

    Examples
    --------
    >>> import zipfile
    >>> with zipfile.ZipFile("data/20130211_dh.zip") as z:
    ...     z.extractall("data/20130211_dh")
    >>> plate = pd.read_csv("plate_templates/checkerboard.csv")
    >>> df = curveball.ioutils.read_tecan_xml("data/20130211_dh/*.xml", 'OD', plate=plate)
    >>> df.shape
    (2016, 8)

    Notes
    -----
    This function was adapted from `choderalab/assaytools <https://github.com/choderalab/assaytools/blob/908471e7976e207df3f9b0e31b2a89f84da40607/AssayTools/platereader.py>`_ (licensed under LGPL).
    """
    from lxml import etree
    dataframes = []
    for filename in glob(filename):
        # Parse XML file into nodes.
        root_node = etree.parse(filename)
        # Build a dict of section nodes.
        section_nodes = {section_node.get(u'Name'): section_node for section_node in root_node.xpath(u"/*/Section")}
        # Process all sections.
        if label not in section_nodes:
            return pd.DataFrame()
        section_node = section_nodes[label]
        # Get the time of measurement
        time_start = section_node.attrib[u'Time_Start']
        # Get a list of all well nodes
        well_nodes = section_node.xpath(u"*/Well")
        # Process all wells into data.
        well_data = []
        for well_node in well_nodes:
            well = well_node.get(u'Pos')
            value = float(well_node.xpath(u"string()"))
            well_data.append({u'Well': well, label: value})
        # Add to data frame
        df = pd.DataFrame(well_data)
        df[u'Row'] = [x[0] for x in df.Well]
        df[u'Col'] = [int(x[1:]) for x in df.Well]
        df[u'Time'] = dateutil.parser.parse(time_start)
        df[u'Filename'] = os.path.split(filename)[-1]
        dataframes.append(df)
    df = pd.concat(dataframes)
    min_time = df.Time.min()
    df.Time = [(t - min_time).total_seconds() / 3600.0 for t in df.Time]
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if max_time is not None:
        df = df[df.Time <= max_time]
    df.sort_values([u'Row', u'Col', u'Time'], inplace=True)
    _fix_dtypes(df)
    return df
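
# Usage sketch for read_tecan_xml: the glob pattern below is a placeholder
# for a directory of single-timepoint Tecan XML files; each file contributes
# one time point per well and the files are concatenated into one data frame.
def _example_read_tecan_xml(pattern="data/20130211_dh/*.xml"):
    df = read_tecan_xml(pattern, label=u'OD')
    return df.groupby(u'Filename')[u'Time'].first()  # time point read from each file
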
def read_sunrise_xlsx(filename, label=u'OD', max_time=None, plate=None):
    """Reads growth measurements from a Tecan Sunrise Excel output file.

    Parameters
    ----------
    filename : str
        pattern of the XLSX files to be read. Use * and ? in filename to read multiple files and parse them into a single data frame.
    label : str, optional
        measurement name to use for the data in the file, defaults to ``OD``.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.

    Returns
    -------
    pandas.DataFrame
        Data frame containing the columns:

        - ``Time`` (:py:class:`float`, in hours)
        - ``OD`` (or the value of `label`, if given)
        - ``Well`` (:py:class:`str`): the well name, usually a letter for the row and a number of the column.
        - ``Row`` (:py:class:`str`): the letter corresponding to the well row.
        - ``Col`` (:py:class:`str`): the number corresponding to the well column.
        - ``Filename`` (:py:class:`str`): the filename from which this measurement was read.
        - ``Strain`` (:py:class:`str`): if a `plate` was given, this is the strain name corresponding to the well from the plate.
        - ``Color`` (:py:class:`str`, hex format): if a `plate` was given, this is the strain color corresponding to the well from the plate.
    """
    import xlrd
    dataframes = []
    files = glob(filename)
    if not files:
        return pd.DataFrame()
    for filename in files:
        wb = xlrd.open_workbook(filename)
        for sh in wb.sheets():
            if sh.nrows > 0:
                break
        parse_data = False # start with metadata
        index = []
        data = []
        for i in range(sh.nrows):
            row = sh.row_values(i)
            if row[0] == u'Date:':
                date = next(filter(lambda x: isinstance(x, float), row[1:]))
                date = xlrd.xldate_as_tuple(date, 0)
            elif row[0] == u'Time:':
                time = next(filter(lambda x: isinstance(x, float), row[1:]))
                time = xlrd.xldate_as_tuple(time, 0)
            elif row[0] == u'<>':
                columns = list(map(int, row[1:]))
                parse_data = True
            elif len(row[0]) == 0 and parse_data:
                break
            elif parse_data:
                index.append(row[0])
                data.append(list(map(float, row[1:])))
        dateandtime = date[:3] + time[-3:]
        dateandtime = datetime.datetime(*dateandtime)
        df = pd.DataFrame(data, columns=columns, index=index)
        df[u'Row'] = index
        df = pd.melt(df, id_vars=u'Row', var_name=u'Col', value_name=label)
        df[u'Time'] = dateandtime
        df[u'Well'] = [x[0] + str(x[1]) for x in zip(df.Row, df.Col)]
        df[u'Filename'] = os.path.split(filename)[-1]
        dataframes.append(df)
    df = pd.concat(dataframes)
    min_time = df.Time.min()
    df.Time = [(t - min_time).total_seconds() / 3600.0 for t in df.Time]
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if max_time is not None:
        df = df[df.Time <= max_time]
    df.sort_values([u'Row', u'Col', u'Time'], inplace=True)
    _fix_dtypes(df)
    return df
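
# Usage sketch for read_sunrise_xlsx: the pattern is a placeholder for a set
# of Tecan Sunrise Excel files, one plate read per file; the reads are
# concatenated and the Time column is converted to hours since the first read.
def _example_read_sunrise_xlsx(pattern="sunrise/*.xlsx"):
    df = read_sunrise_xlsx(pattern, label=u'OD', max_time=48)
    return df.sort_values(u'Time').groupby(u'Well').tail(1)  # last read per well
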
def read_biotek_xlsx(filename, max_time=None, plate=None, PRINT=False):
    """Reads growth measurements from a BioTek Excel output file.

    Parameters
    ----------
    filename : str
        path to the file.
    max_time : float, optional
        maximal time in hours, defaults to infinity
    plate : pandas.DataFrame, optional
        data frame representing a plate, usually generated by reading a CSV file generated by `Plato <http://plato.yoavram.com/>`_.
    PRINT : bool, optional
        whether to print progress messages, defaults to False.

    Returns
    -------
    pandas.DataFrame
    """
    import xlrd
    wb = xlrd.open_workbook(filename)
    for sh_i in range(wb.nsheets):
        sh = wb.sheet_by_index(sh_i)
        if sh.nrows > 0:
            break
    else:
        # all sheets are empty
        warn('All sheets are empty in {}'.format(filename))
        return pd.DataFrame()
    if PRINT:
        print("Reading worksheet {0} with {2} lines from workbook {1}".format(sh_i, filename, sh.nrows))
    in_data = False
    data = []
    dfs = []
    i = 0
    while i < sh.nrows:
        row = sh.row_values(i)
        if not in_data and row[1] == 'Time':
            columns = row[1:]
            in_data = True
        elif in_data and row[1] != '':
            row[1] *= 24 # days -> hours
            data.append(row[1:])
        elif in_data and row[1] == '':
            in_data = False
            df = pd.DataFrame(data, columns=columns)
            df = pd.melt(df, [u'Time', u'T° 600'], var_name=u'Well', value_name=u'OD')
            dfs.append(df)
        i += 1
    df = pd.concat(dfs)
    df[u'Row'] = [x[0] for x in df.Well]
    df[u'Col'] = [int(x[1:]) for x in df.Well]
    min_time = df.Time.min()
    if PRINT:
        print("Starting time", min_time)
    if max_time is not None:
        df = df[df.Time <= max_time]
    df.sort_values([u'Row', u'Col', u'Time'], inplace=True)
    if df.shape[0] == 0:
        # no dataframes
        return pd.DataFrame()
    if plate is None:
        df[u'Strain'] = u'0'
        df[u'Color'] = u'#000000'
    else:
        df = pd.merge(df, plate, on=(u'Row', u'Col'))
    if PRINT:
        print("Read {0} records from workbook".format(df.shape[0]))
    _fix_dtypes(df)
    return df
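
# Usage sketch for read_biotek_xlsx, assuming a hypothetical BioTek export in
# which each data table starts with a 'Time' header row; times are converted
# from days to hours while parsing.
def _example_read_biotek_xlsx(path="biotek_export.xlsx"):
    df = read_biotek_xlsx(path, max_time=24, PRINT=True)
    return df.pivot_table(index=u'Time', columns=u'Well', values=u'OD')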