Source code for pypmi.loaders

# -*- coding: utf-8 -*-
"""
Functions for loading data downloaded from the PPMI database
"""

from functools import reduce
import itertools
import os
import re
from typing import List

import numpy as np
import pandas as pd

from ._info import BEHAVIORAL_INFO, DEMOGRAPHIC_INFO, VISITS
from .utils import _get_data_dir


def load_biospecimen(path: str = None,
                     measures: List[str] = None) -> pd.DataFrame:
    """
    Loads biospecimen data into tidy dataframe

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. If not specified this
        function will, in order, look (1) for an environmental variable
        $PPMI_PATH and (2) in the current directory. Default: None
    measures : list, optional
        Which measures to keep in the final dataframe. If not specified only
        a default set of four measures ('abeta_1-42', 'csf_alpha-synuclein',
        'ptau', and 'ttau') is kept. A single measure can also be provided as
        a string. Specifying `measures='all'` will retain everything, but this
        will significantly increase load time. It is highly recommended to
        specify which measures to keep; available biospecimen measures can be
        viewed with :py:func:`pypmi.available_biospecimen`. Default: None

    Returns
    -------
    data : :obj:`pandas.DataFrame`
        Biospecimen data with one row per participant / visit and one column
        per retained measure. Scores that cannot be converted to numbers are
        treated as missing, and multiple scores for the same participant,
        visit, and measure are averaged

    See Also
    --------
    pypmi.available_biospecimen
    """

    rename_cols = dict(PATNO='participant', CLINICAL_EVENT='visit',
                       TESTNAME='test', TESTVALUE='score')
    dtype = dict(PATNO=int, CLINICAL_EVENT=VISITS, TESTNAME=str, TESTVALUE=str)

    # check for file and get data directory path
    fname = 'Current_Biospecimen_Analysis_Results.csv'
    path = os.path.join(_get_data_dir(path=path, fnames=[fname]), fname)

    # load data, make scores numeric, and clean up test names (no spaces!)
    data = pd.read_csv(path, dtype=dtype, usecols=list(rename_cols))
    data = data.rename(columns=rename_cols)
    data['score'] = pd.to_numeric(data['score'], errors='coerce')
    # the .str accessor leaves missing test names as NaN instead of raising
    data['test'] = (data['test'].str.replace(' ', '_', regex=False)
                                .str.lower())

    # keep only desired measures
    if measures is None:
        measures = ['abeta_1-42', 'csf_alpha-synuclein', 'ptau', 'ttau']
    elif isinstance(measures, str):
        if measures == 'all':
            measures = data['test'].dropna().unique().tolist()
        else:
            measures = [measures]
    # filter with isin rather than building a query string from user input
    data = data[data['test'].isin(list(measures))]

    # convert to tidy dataframe; 'mean' ignores missing scores
    tidy = (data.groupby(['participant', 'visit', 'test'])
                .agg({'score': 'mean'})
                .unstack(level='test')
                .get('score')
                .reset_index()
                .rename_axis(None, axis=1))

    # (try to) add visit date information
    tidy = _add_dates(tidy, path=os.path.dirname(path),
                      fnames=['Lumbar_Puncture_Sample_Collection.csv'])

    return tidy.sort_values(['participant', 'visit']).reset_index(drop=True)


def available_biospecimen(path: str = None) -> List[str]:
    """
    Lists measures available in :py:func:`pypmi.load_biospecimen`

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. If not specified this
        function will, in order, look (1) for an environmental variable
        $PPMI_PATH and (2) in the current directory. Default: None

    Returns
    -------
    measures : list
        Sorted, de-duplicated biospecimen measures (i.e., "tests"), lowercased
        and with spaces replaced by underscores

    See Also
    --------
    pypmi.load_biospecimen
    """

    # locate the biospecimen results file inside the data directory
    fname = 'Current_Biospecimen_Analysis_Results.csv'
    csv_path = os.path.join(_get_data_dir(path=path, fnames=[fname]), fname)

    # only the test name column is needed to enumerate the measures
    test_names = pd.read_csv(csv_path, usecols=['TESTNAME'])['TESTNAME']
    cleaned = {name.replace(' ', '_').lower() for name in test_names.unique()}

    return sorted(cleaned)


def load_datscan(path: str = None,
                 measures: List[str] = None) -> pd.DataFrame:
    """
    Loads DaT scan data into tidy dataframe

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. If not specified this
        function will, in order, look (1) for an environmental variable
        $PPMI_PATH and (2) in the current directory. Default: None
    measures : list, optional
        Which measures to keep in the final dataframe. A single measure can
        also be provided as a string. If not specified (or if
        `measures='all'`) all measures are retained; available DaT scan
        measures can be viewed with :py:func:`pypmi.available_datscan`.
        Default: None

    Returns
    -------
    data : :obj:`pandas.DataFrame`
        DaTScan data. If the data file provides scan dates they are returned
        in the 'date' column; otherwise visit dates are looked up from other
        PPMI data files when those are available

    Raises
    ------
    ValueError
        If any entry of `measures` is not a column of the DaT scan data

    See Also
    --------
    pypmi.available_datscan
    """

    rename_cols = dict(PATNO='participant', EVENT_ID='visit', SCAN_DATE='date')
    dtype = dict(PATNO=int, EVENT_ID=VISITS, SCAN_DATE=str)

    # check for file and get data directory path
    fname = 'DATScan_Analysis.csv'
    path = os.path.join(_get_data_dir(path=path, fnames=[fname]), fname)

    # load data and coerce into standard format
    raw = pd.read_csv(path, dtype=dtype)
    tidy = raw.rename(columns=rename_cols).dropna(subset=['visit'])
    tidy.columns = [f.lower() for f in tidy.columns]

    # keep only desired measures
    if measures is not None:
        if isinstance(measures, str):
            if measures == 'all':
                measures = available_datscan(path=os.path.dirname(path))
            else:
                # a lone measure name must not be split into characters
                measures = [measures]
        else:
            measures = list(measures)
        for m in measures:
            if m not in tidy.columns:
                raise ValueError(f'Specified measure {m} is not valid. Please '
                                 'see available datscan measures with `pypmi.'
                                 'available_datscan()`.')
        # retain scan dates from the file (when present) so that selecting
        # measures gives the same dates as loading everything
        keep = ['participant', 'visit']
        if 'date' in tidy.columns:
            keep.append('date')
        tidy = tidy[keep + [m for m in measures if m not in keep]]

    if 'date' in tidy.columns:
        tidy = tidy.assign(date=pd.to_datetime(tidy['date'],
                                               format='%Y-%m-%d',
                                               errors='coerce'))
    else:
        tidy = _add_dates(tidy, path=os.path.dirname(path))

    return tidy.sort_values(['participant', 'visit']).reset_index(drop=True)


def available_datscan(path: str = None) -> List[str]:
    """
    Lists measures available in :py:func:`pypmi.load_datscan`

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. If not specified this
        function will, in order, look (1) for an environmental variable
        $PPMI_PATH and (2) in the current directory. Default: None

    Returns
    -------
    measures : list
        Sorted, lowercased DaT scan measures taken from the header row of the
        data file

    See Also
    --------
    pypmi.load_datscan
    """

    # locate the DaT scan file inside the data directory
    fname = 'DATScan_Analysis.csv'
    csv_path = os.path.join(_get_data_dir(path=path, fnames=[fname]), fname)

    # the header row is all that is required
    with open(csv_path, 'r') as src:
        header = src.readline()

    # skip the first two columns; they are not treated as measures
    columns = header.strip().replace('"', '').split(',')[2:]

    # when a scan date is present, the next leading column is skipped as well
    if 'SCAN_DATE' in columns:
        columns = columns[1:]

    lowered = [name.lower() for name in columns]
    lowered.sort()

    return lowered


def load_behavior(path: str = None,
                  measures: List[str] = None) -> pd.DataFrame:
    """
    Loads clinical-behavioral data into tidy dataframe

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. If not specified this
        function will, in order, look (1) for an environmental variable
        $PPMI_PATH and (2) in the current directory. Default: None
    measures : list, optional
        Which measures to keep in the final dataframe. A single measure can
        also be provided as a string. If not specified (or if
        `measures='all'`) all measures are retained; available behavioral
        measures can be viewed with :py:func:`pypmi.available_behavior`.
        Requesting 'moca' automatically loads the 'education' information
        used to adjust MoCA scores. Default: None

    Returns
    -------
    df : :obj:`pandas.DataFrame`
        Tidy DataFrame with all clinical-behavioral assessments; an empty
        dataframe with columns ['participant', 'visit', 'date'] is returned if
        none of the requested measures are recognized

    See Also
    --------
    pypmi.available_behavior
    """

    rename_cols = dict(PATNO='participant', EVENT_ID='visit', INFODT='date')

    # determine measures; a new dictionary is always created so that the
    # shared BEHAVIORAL_INFO is never modified in place
    if measures is None:
        beh_info = dict(BEHAVIORAL_INFO)
    else:
        if isinstance(measures, str) and measures == 'all':
            beh_info = dict(BEHAVIORAL_INFO)
        else:
            if isinstance(measures, str):
                # avoid substring matching of measure names against a string
                measures = [measures]
            beh_info = {d: v for d, v in BEHAVIORAL_INFO.items()
                        if d in measures}
        if 'moca' in beh_info:
            # MoCA adjustment (below) needs education, even if not requested
            if 'education' in BEHAVIORAL_INFO:
                beh_info['education'] = BEHAVIORAL_INFO['education']
        else:
            # education is only useful for adjusting MoCA scores
            beh_info.pop('education', None)

    if not beh_info:
        return pd.DataFrame(columns=['participant', 'visit', 'date'])

    # check for files and get data directory path
    fnames = []
    for info in beh_info.values():
        fnames.extend(info.get('files', {}))
    path = _get_data_dir(path=path, fnames=set(fnames))

    frames = []
    # iterate through all keys in dictionary
    for key, info in beh_info.items():
        cextra = info.get('extra', ['PATNO', 'EVENT_ID', 'INFODT', 'PAG_NAME'])
        capply = info.get('applymap', itertools.repeat(lambda x: x))
        copera = info.get('operation', itertools.repeat(np.sum))

        temp_scores = []
        # go through relevant files and items for current key and grab scores
        for fname, items in info['files'].items():
            # read in file
            data = pd.read_csv(os.path.join(path, fname))
            # iterate through items to be retrieved and apply operations
            for it, ap, ope in zip(items, capply, copera):
                score = ope(data[it].applymap(ap), axis=1)
                # number score columns across files so names never collide
                score = pd.Series(score, name=len(temp_scores))
                temp_scores.append(data[cextra].join(score))

        # merge temp score DataFrames
        curr_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=cextra),
                         temp_scores)
        # combine individual scores for key with joinfunc and add to extra info
        joinfunc = info.get('joinfunc', np.sum)
        score = pd.Series(joinfunc(curr_df.drop(cextra, axis=1), axis=1)
                          .astype('float'), name='score')
        frames.append(curr_df[cextra].astype('str').join(score)
                                     .assign(test=key))

    # stack the per-measure frames (DataFrame.append is gone in pandas 2.0)
    df = pd.concat(frames, ignore_index=True, sort=True)

    # rename post-treatment UDPRS III scores so there's no collision
    # pivot_table would average between the two by default. we don't want that!
    if 'PAG_NAME' in df.columns:
        df.loc[df['PAG_NAME'] == "NUPDRS3A", 'test'] = 'updrs_iii_a'

    # clean up column names and convert to tidy dataframe
    df = df.rename(columns=rename_cols)
    tidy = pd.pivot_table(df, index=['participant', 'visit', 'date'],
                          columns='test', values='score').reset_index()
    tidy = tidy.rename_axis(None, axis=1)

    # get adjusted MOCA scores (add 'education' variable)
    if 'moca' in tidy.columns and 'education' in tidy.columns:
        adjust = tidy['moca'] < 30
        tidy.loc[adjust, 'moca'] += tidy.loc[adjust, 'education'].fillna(0)
        tidy = tidy.drop(['education'], axis=1)

    # coerce data types to desired format
    tidy['participant'] = tidy['participant'].astype(int)
    tidy['visit'] = tidy['visit'].astype(VISITS)
    tidy['date'] = pd.to_datetime(tidy['date'], format='%m/%Y',
                                  errors='coerce')

    return tidy.sort_values(['participant', 'visit']).reset_index(drop=True)


def available_behavior(path: str = None) -> List[str]:
    """
    Lists measures available in :py:func:`pypmi.load_behavior`

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. This argument is
        accepted for consistency with the other `available_*` functions but
        is not used here. Default: None

    Returns
    -------
    measures : list
        Sorted behavioral measures: the keys of the behavioral information
        dictionary plus 'updrs_iii_a', minus 'education'

    See Also
    --------
    pypmi.load_behavior
    """

    names = [*BEHAVIORAL_INFO, 'updrs_iii_a']
    names.sort()
    # 'education' is not reported as a measure in its own right
    names.remove('education')

    return names


def load_demographics(path: str = None,
                      measures: List[str] = None) -> pd.DataFrame:
    """
    Loads demographic data into tidy dataframe

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. If not specified this
        function will, in order, look (1) for an environmental variable
        $PPMI_PATH and (2) in the current directory. Default: None
    measures : list, optional
        Which measures to keep in the final dataframe. A single measure can
        also be provided as a string. If not specified (or if
        `measures='all'`) all measures are retained; available demographics
        measures can be viewed with :py:func:`pypmi.available_demographics`.
        Default: None

    Returns
    -------
    demographics : :obj:`pandas.DataFrame`
        Tidy data frame containing demographic information for PPMI subjects,
        with one row per participant

    See Also
    --------
    pypmi.available_demographics
    """

    rename_cols = dict(PATNO='participant', EVENT_ID='visit')
    dtype = dict(PATNO=int)

    # determine measures
    if measures is None or (isinstance(measures, str) and measures == 'all'):
        dem_info = DEMOGRAPHIC_INFO
    else:
        if isinstance(measures, str):
            # avoid substring matching of measure names against a string
            measures = [measures]
        dem_info = {d: v for d, v in DEMOGRAPHIC_INFO.items()
                    if d in measures}

    # check for files and get data directory path
    fnames = []
    for info in dem_info.values():
        fnames.extend(info.get('files', {}))
    path = _get_data_dir(path=path, fnames=set(fnames))

    # empty data frame to hold information
    tidy = pd.DataFrame([], columns=['PATNO'])

    # iterate through demographic info to wrangle
    for key, curr_key in dem_info.items():
        # reset for every measure so a previous measure's scores are never
        # merged again when the current measure has no files
        temp_scores = None
        for fname, items in curr_key.get('files', {}).items():
            data = pd.read_csv(os.path.join(path, fname), dtype=dtype)
            curr_score = data[items]
            # any other entry whose name matches a method of the selected
            # data is called with its 'input' and 'kwargs'
            for attr in [f for f in curr_key if f != 'files']:
                if hasattr(curr_score, attr):
                    fnc = getattr(curr_score, attr)
                    curr_score = fnc(curr_key[attr].get('input', None),
                                     **curr_key[attr].get('kwargs', {}))
            curr_score = pd.Series(curr_score, name=key)
            # NOTE(review): if a measure lists several files only the last
            # one is kept -- confirm this is intended
            temp_scores = data[['PATNO']].join(curr_score)
        if temp_scores is None:
            continue
        tidy = pd.merge(tidy, temp_scores, on='PATNO', how='outer')

    # rename columns and remove duplicates (how are there duplicates???)
    tidy = (tidy.rename(columns=rename_cols)
                .drop_duplicates(subset=['participant']))
    # merging onto the empty placeholder can leave participant IDs as generic
    # objects, so coerce them back to integers
    tidy = tidy.assign(participant=tidy['participant'].astype(int))

    return tidy.sort_values('participant').reset_index(drop=True)


def available_demographics(path: str = None) -> List[str]:
    """
    Lists measures available in :py:func:`pypmi.load_demographics`

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. This argument is
        accepted for consistency with the other `available_*` functions but
        is not used here. Default: None

    Returns
    -------
    measures : list
        Available demographic measures, i.e., the keys of the demographic
        information dictionary

    See Also
    --------
    pypmi.load_demographics
    """

    measures = [name for name in DEMOGRAPHIC_INFO]

    return measures


def _load_dates(path: str = None,
                fnames: List[str] = None) -> pd.DataFrame:
    """
    Loads visit date information into tidy dataframe

    Parameters
    ----------
    path : str, optional
        Filepath to directory containing PPMI data files. If not specified this
        function will, in order, look (1) for an environmental variable
        $PPMI_PATH and (2) in the current directory. Default: None
    fnames : list, optional
        List of PPMI data files that may contain additional date information
        beyond the "default" files used (i.e., Inclusion_Exclusion.csv,
        Signature_Form.csv, Socio-Economics.csv, and Vital_Signs.csv). These
        are read before the default files, so their dates are preferred when
        the same participant / visit appears more than once. If not specified
        only default files are used. Default: None

    Returns
    -------
    dates : :obj:`pandas.DataFrame`
        Tidy data frame with columns ['participant', 'visit', 'date'] for
        linking each visit with a date; dates are parsed from month/year
        strings and entries that cannot be parsed are left missing
    """

    column_map = {'PATNO': 'participant', 'EVENT_ID': 'visit',
                  'INFODT': 'date'}
    column_types = {'PATNO': int, 'EVENT_ID': VISITS}

    # four files are used to capture as much "visit date" info as possible
    default_files = [
        'Inclusion_Exclusion.csv',
        'Signature_Form.csv',
        'Socio-Economics.csv',
        'Vital_Signs.csv',
    ]
    # datatype-specific files (if any) go first, then get the data path
    sources = default_files if fnames is None else fnames + default_files
    data_dir = _get_data_dir(path=path, fnames=sources)

    # read the three relevant columns out of every file
    frames = []
    for source in sources:
        frames.append(pd.read_csv(os.path.join(data_dir, source),
                                  dtype=column_types,
                                  usecols=column_map.keys()))

    # stack everything and keep the first date for each participant / visit
    dates = pd.concat(frames).rename(columns=column_map)
    dates = dates.get(list(column_map.values()))
    dates = dates.dropna()
    dates = dates.drop_duplicates(subset=['participant', 'visit'])
    dates['date'] = pd.to_datetime(dates['date'], format='%m/%Y',
                                   errors='coerce')

    return dates.sort_values(['participant', 'visit']).reset_index(drop=True)


def _add_dates(df: pd.DataFrame,
               path: str = None,
               fnames: List[str] = None) -> pd.DataFrame:
    """
    Attempts to add visit date to information to dataframe `df`

    If files required for visit date information cannot be found then `df` is
    returned, unaltered

    Parameters
    ----------
    df : :obj:`pandas.DataFrame`
        Data frame to add date information to. Must have 'participant' and
        'visit' columns
    path : str, optional
        Filepath to directory containing PPMI data files. If not specified this
        function will, in order, look (1) for an environmental variable
        $PPMI_PATH and (2) in the current directory. Default: None
    fnames : list, optional
        List of PPMI data files that may contain additional date information
        beyond the "default" files used (i.e., Inclusion_Exclusion.csv,
        Signature_Form.csv, Socio-Economics.csv, and Vital_Signs.csv). If not
        specified only default files are used. Default: None

    Returns
    -------
    df : :obj:`pandas.DataFrame`
        Provided `df` with new 'date' column; 'participant', 'visit', and
        'date' come first, followed by the remaining columns in sorted order
    """

    # only the date loading is best-effort; without the required files the
    # input is handed back untouched
    try:
        dates = _load_dates(path=path, fnames=fnames)
    except FileNotFoundError:
        return df

    tidy = pd.merge(df, dates, on=['participant', 'visit'], how='left')
    # reorder columns so that 'participant', 'visit', and 'date' are first
    cols = ['participant', 'visit', 'date']

    return tidy[cols + np.setdiff1d(tidy.columns, cols).tolist()]


def load_genetics(fname: str,
                  gene_list: str = None) -> (pd.DataFrame, pd.DataFrame):
    """
    Loads PPMI genotyping data stored at `fname`

    Parameters
    ----------
    fname : str
        Filepath to genotyping PLINK files
    gene_list : str, optional
        Path to pandas-compatible csv with at least 'snp', 'target',
        'odds_ratio', and 'study' columns denoting rs#, target (effect)
        allele, odds ratio of target allele in population, and source study.
        If provided, only SNPs in this list are retained and SNPs whose
        target allele is not the counted allele are reverse-coded.
        Default: None

    Returns
    -------
    data : (N, G) :obj:`pandas.DataFrame`
        Wide-format genetics data where `N` is participants and `G` is SNPs
    info : :obj:`pandas.DataFrame`
        Information on SNPs in `data`, one row per SNP. Has columns 'snp',
        'odds_ratio', and 'study' if `gene_list` is provided, and columns
        'snp', 'a0', and 'a1' otherwise

    Raises
    ------
    ImportError
        If the `pandas_plink` module is not installed
    """

    try:
        from pandas_plink import read_plink
    except ImportError:
        raise ImportError('Loading genotyping data requires installing the '
                          '`pandas_plink` module. Please install that and try '
                          'again.')

    # make helper function for extracting SNP rs# from PLINK files
    def extract(x):
        try:
            return re.findall(r'[-_]*(rs[0-9]+)[-_]*', x)[0]
        except IndexError:
            return None

    # load PLINK data
    bim, fam, gen = read_plink(fname, verbose=False)
    # .values replaces Series.get_values(), which was removed in pandas 1.0
    participant_id = pd.Series(fam.fid.values, name='participant')
    cols = ['snp', 'a0', 'a1']

    if gene_list is not None:
        # load gene list
        gene_info = pd.read_csv(gene_list).drop_duplicates(subset=['snp'])

        # check where SNPs match desired gene list & subset data
        rsids = bim.snp.apply(extract)
        inds = rsids.isin(gene_info.snp.dropna()).values

        # clean up ugly bim.snp names with just rs# of SNPs
        bim, gen = bim[inds].assign(snp=rsids[inds]), gen[inds]

        # get allele info for making sense of the data
        cols += ['target', 'odds_ratio', 'study']
        info = pd.merge(bim, gene_info, on='snp')[cols]

        # if a0/a1 alleles don't match target, confusion ensues
        # drop the non-matched ones and then grab SNPs that need to be reversed
        info = info[(info.a0 == info.target) | (info.a1 == info.target)]
        flip = info[info.a1 != info.target].snp
        info = info[['snp', 'odds_ratio', 'study']]
    else:
        # placeholders so below code doesn't fail
        info = bim[cols]
        flip = pd.Series([], name='snp', dtype=object)

    # make wide-format participant x SNP dataframe
    data = pd.DataFrame(gen.compute().T, index=participant_id, columns=bim.snp)
    # if multiple columns represent same snp, combine them
    # THEY SHOULD ALL BE THE SAME -- if they aren't, that's bad...
    # (grouping the transpose avoids the deprecated groupby(axis=1))
    data = (data.dropna(axis=1, how='all')
                .T.groupby(level=0).mean().T
                .dropna(axis=0, how='all')
                .sort_index())

    # flip reverse-coded SNPs (0 <-> 2), skipping any that were dropped above
    # for being entirely missing; missing genotypes stay missing
    to_flip = set(flip)
    flip_cols = [snp for snp in data.columns if snp in to_flip]
    if flip_cols:
        data[flip_cols] = 2 - data[flip_cols]

    # retain only relevant SNPs in allele
    info = info[info.snp.isin(data.columns)]
    info = info.drop_duplicates(subset=['snp']).reset_index(drop=True)

    # return sorted data and info
    return data[info.snp], info