Experiment 03 - Data#

Create data splits

Hide code cell source

import logging
from functools import partial
from pathlib import Path
from typing import List, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from IPython.display import display
from sklearn.model_selection import train_test_split

import pimmslearn
import pimmslearn.io.load
from pimmslearn.analyzers import analyzers
from pimmslearn.io.datasplits import DataSplits
from pimmslearn.sampling import feature_frequency
from pimmslearn.sklearn import get_PCA

logger = pimmslearn.logging.setup_nb_logger()
logger.info("Split data and make diagnostic plots")
logging.getLogger('fontTools').setLevel(logging.WARNING)


def align_meta_data(df: pd.DataFrame, df_meta: pd.DataFrame):
    try:
        df = df.loc[df_meta.index]
    except KeyError as e:
        logger.warning(e)
        logger.warning("Ignore missing samples in quantified samples")
        df = df.loc[df.index.intersection(
            df_meta.index)]
    return df_meta


pd.options.display.max_columns = 32
plt.rcParams['figure.figsize'] = [4, 2]

pimmslearn.plotting.make_large_descriptors(7)

figures = {}  # collection of ax or figures
dumps = {}  # collection of data dumps
root - INFO     Split data and make diagnostic plots

Arguments#

Hide code cell source

# catch passed parameters
args = None
args = dict(globals()).keys()
FN_INTENSITIES: str = 'data/dev_datasets/HeLa_6070/protein_groups_wide_N50.csv'  # Sample (rows), features (columns)
index_col: Union[str, int] = 0  # Can be either a string or position (default 0 for first column), or a list of these.
column_names: List[str] = ["Gene Names"]  # Manuelly set column names (of Index object in columns)
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'  # metadata for samples (rows)
feat_prevalence: Union[int, float] = 0.25  # Minimum number or fraction of feature prevalence across samples to be kept
sample_completeness: Union[int, float] = 0.5  # Minimum number or fraction of total requested features per Sample
select_N: int = None  # only use latest N samples
sample_N: bool = False  # if select_N, sample N randomly instead of using latest N
random_state: int = 42  # random state for reproducibility of splits
logarithm: str = 'log2'  # Log transformation of initial data (select one of the existing in numpy)
folder_experiment: str = 'runs/example'  # folder to save figures and data dumps
folder_data: str = ''  # specify special data directory if needed
file_format: str = 'csv'  # file format of create splits, default pickle (pkl)
use_every_nth_xtick: int = 1  # use every nth xtick in plots (default 1, i.e. every xtick is kept)
# metadata -> defaults for metadata extracted from machine data, used for plotting
meta_date_col: str = None  # date column in meta data
meta_cat_col: str = None  # category column in meta data
# train, validation and test data splits
frac_non_train: float = 0.1  # fraction of non training data (validation and test split)
frac_mnar: float = 0.0  # fraction of missing not at random data, rest: missing completely at random
prop_sample_w_sim: float = 1.0  # proportion of samples with simulated missing values
feat_name_display: str = None  # display name for feature name (e.g. 'protein group')
# Parameters
FN_INTENSITIES = "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv"
sample_completeness = 0.5
feat_prevalence = 0.25
column_names = ["protein groups"]
index_col = 0
meta_cat_col = "_collection site"
meta_date_col = None
frac_mnar = 0.25
frac_non_train = 0.1
fn_rawfile_metadata = "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv"
folder_experiment = "runs/alzheimer_study"

Hide code cell source

args = pimmslearn.nb.get_params(args, globals=globals())
args
root - INFO     Removed from global namespace: FN_INTENSITIES
root - INFO     Removed from global namespace: index_col
root - INFO     Removed from global namespace: column_names
root - INFO     Removed from global namespace: fn_rawfile_metadata
root - INFO     Removed from global namespace: feat_prevalence
root - INFO     Removed from global namespace: sample_completeness
root - INFO     Removed from global namespace: select_N
root - INFO     Removed from global namespace: sample_N
root - INFO     Removed from global namespace: random_state
root - INFO     Removed from global namespace: logarithm
root - INFO     Removed from global namespace: folder_experiment
root - INFO     Removed from global namespace: folder_data
root - INFO     Removed from global namespace: file_format
root - INFO     Removed from global namespace: use_every_nth_xtick
root - INFO     Removed from global namespace: meta_date_col
root - INFO     Removed from global namespace: meta_cat_col
root - INFO     Removed from global namespace: frac_non_train
root - INFO     Removed from global namespace: frac_mnar
root - INFO     Removed from global namespace: prop_sample_w_sim
root - INFO     Removed from global namespace: feat_name_display
{'FN_INTENSITIES': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv',
 'index_col': 0,
 'column_names': ['protein groups'],
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'feat_prevalence': 0.25,
 'sample_completeness': 0.5,
 'select_N': None,
 'sample_N': False,
 'random_state': 42,
 'logarithm': 'log2',
 'folder_experiment': 'runs/alzheimer_study',
 'folder_data': '',
 'file_format': 'csv',
 'use_every_nth_xtick': 1,
 'meta_date_col': None,
 'meta_cat_col': '_collection site',
 'frac_non_train': 0.1,
 'frac_mnar': 0.25,
 'prop_sample_w_sim': 1.0,
 'feat_name_display': None}

Hide code cell source

args = pimmslearn.nb.args_from_dict(args)
args
{'FN_INTENSITIES': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv',
 'column_names': ['protein groups'],
 'data': PosixPath('runs/alzheimer_study/data'),
 'feat_name_display': None,
 'feat_prevalence': 0.25,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'folder_data': '',
 'folder_experiment': PosixPath('runs/alzheimer_study'),
 'frac_mnar': 0.25,
 'frac_non_train': 0.1,
 'index_col': 0,
 'logarithm': 'log2',
 'meta_cat_col': '_collection site',
 'meta_date_col': None,
 'out_figures': PosixPath('runs/alzheimer_study/figures'),
 'out_folder': PosixPath('runs/alzheimer_study'),
 'out_metrics': PosixPath('runs/alzheimer_study'),
 'out_models': PosixPath('runs/alzheimer_study'),
 'out_preds': PosixPath('runs/alzheimer_study/preds'),
 'prop_sample_w_sim': 1.0,
 'random_state': 42,
 'sample_N': False,
 'sample_completeness': 0.5,
 'select_N': None,
 'use_every_nth_xtick': 1}

Hide code cell source

if not 0.0 <= args.frac_mnar <= 1.0:
    raise ValueError("Invalid MNAR float value (should be betw. 0 and 1):"
                     f" {args.frac_mnar}")

if isinstance(args.index_col, str) or isinstance(args.index_col, int):
    args.overwrite_entry('index_col', [args.index_col])
args.index_col  # make sure it is an iterable
[0]

Raw data#

process arguments

Hide code cell source

logger.info(f"{args.FN_INTENSITIES = }")


FILE_FORMAT_TO_CONSTRUCTOR = {'csv': 'from_csv',
                              'pkl': 'from_pickle',
                              'pickle': 'from_pickle',
                              }

FILE_EXT = Path(args.FN_INTENSITIES).suffix[1:]
logger.info(
    f"File format (extension): {FILE_EXT}  (!specifies data loading function!)")
root - INFO     args.FN_INTENSITIES = 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv'
root - INFO     File format (extension): csv  (!specifies data loading function!)

Hide code cell source

# ! factor out file reading to a separate module, not class
# AnalyzePeptides.from_csv
constructor = getattr(pimmslearn.io.load, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT])
df = constructor(fname=args.FN_INTENSITIES,
                 index_col=args.index_col,
                 )
if args.column_names:
    df.columns.names = args.column_names
if args.feat_name_display is None:
    args.overwrite_entry('feat_name_display', 'features')
    if args.column_names:
        args.overwrite_entry('feat_name_display', args.column_names[0])


if not df.index.name:
    logger.warning("No sample index name found, setting to 'Sample ID'")
    df.index.name = 'Sample ID'

if args.logarithm:
    log_fct = getattr(np, args.logarithm)
    df = log_fct(df)  # ! potentially add check to increase value by 1 if 0 is present (should be part of preprocessing)
df
pimmslearn.io.load - WARNING  Passed unknown kwargs: {}
protein groups A0A024QZX5;A0A087X1N8;P35237 A0A024R0T9;K7ER74;P02655 A0A024R3B9;E9PJL7;E9PNH7;E9PR44;E9PRA8;P02511 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 A0A075B6H7 A0A075B6H9 A0A075B6I0 A0A075B6I1 A0A075B6I6 A0A075B6I9 A0A075B6J1 A0A075B6J9 A0A075B6K4 A0A075B6K5 A0A075B6P5;P01615 ... Q9Y4L1 Q9Y5F6;Q9Y5F6-2 Q9Y5I4;Q9Y5I4-2 Q9Y5Y7 Q9Y617 Q9Y646 Q9Y653;Q9Y653-2;Q9Y653-3 Q9Y696 Q9Y6C2 Q9Y6N6 Q9Y6N7;Q9Y6N7-2;Q9Y6N7-4 Q9Y6R7 Q9Y6X5 Q9Y6Y8;Q9Y6Y8-2 Q9Y6Y9 S4R3U6
Sample ID
Sample_000 15.912 16.852 NaN 15.570 16.481 17.301 20.246 16.764 17.584 16.988 20.054 15.164 NaN 16.148 17.343 17.016 ... 18.598 16.469 17.187 18.840 16.859 19.322 16.012 15.178 NaN 15.050 16.842 19.863 NaN 19.563 12.837 12.805
Sample_001 15.936 16.874 NaN 15.519 16.387 13.796 19.941 18.786 17.144 NaN 19.067 NaN 16.188 16.127 17.417 17.779 ... 18.476 15.782 17.447 19.195 16.799 19.190 15.528 15.576 NaN 14.833 16.597 20.299 15.556 19.386 13.970 12.442
Sample_002 16.111 14.523 NaN 15.935 16.416 18.175 19.251 16.832 15.671 17.012 18.569 NaN NaN 15.387 17.236 17.431 ... 18.991 17.015 17.410 19.088 16.288 19.702 15.229 14.728 13.757 15.118 17.440 19.598 15.735 20.447 12.636 12.505
Sample_003 16.107 17.032 NaN 15.802 16.979 15.963 19.628 17.852 18.877 14.182 18.985 13.050 13.438 16.565 16.267 16.990 ... 18.560 16.529 17.545 18.715 17.075 19.760 15.495 14.590 14.682 15.140 17.356 19.429 NaN 20.216 12.627 12.445
Sample_004 15.603 15.331 NaN 15.375 16.679 15.473 20.450 18.682 17.081 14.140 19.686 15.703 14.495 16.418 17.390 17.493 ... 18.305 16.285 17.297 18.668 16.736 19.624 14.757 15.094 14.048 15.256 17.075 19.582 15.328 19.867 13.145 12.235
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 15.682 16.886 14.061 14.910 16.482 NaN 17.705 17.039 NaN 16.413 19.102 NaN 16.064 15.350 17.154 17.175 ... 18.290 15.968 17.104 18.726 15.808 19.894 15.235 15.684 14.236 15.415 17.551 17.922 16.340 19.928 12.929 11.802
Sample_206 15.798 17.554 15.266 15.600 15.938 NaN 18.154 18.152 16.503 16.860 18.538 NaN 15.288 16.582 17.902 19.575 ... 17.977 16.885 17.109 18.460 15.035 20.015 15.422 16.106 NaN 15.345 17.084 18.708 14.249 19.433 NaN NaN
Sample_207 15.739 16.877 NaN 15.469 16.898 NaN 18.636 17.950 16.321 16.401 18.849 NaN 17.580 15.768 17.519 19.306 ... 18.149 15.878 16.938 19.502 16.283 20.306 15.808 16.098 14.403 15.715 16.586 18.725 16.138 19.599 13.637 11.174
Sample_208 15.477 16.779 NaN 14.995 16.132 NaN 14.908 17.530 NaN 16.119 18.368 NaN 15.202 17.560 17.502 18.977 ... 17.881 15.554 17.155 18.892 15.920 20.203 15.157 16.712 NaN 14.640 16.533 19.411 15.807 19.545 13.216 NaN
Sample_209 15.727 17.261 15.344 15.175 16.235 NaN 17.893 17.744 16.371 15.780 18.806 NaN 16.532 16.338 16.989 18.688 ... 18.125 16.575 16.776 18.675 15.713 20.042 15.237 15.652 15.211 14.205 16.749 19.275 15.732 19.577 11.042 11.791

210 rows × 1542 columns

Hide code cell source

ax = (df
      .notna()
      .sum(axis=0)
      .to_frame(df.columns.name)
      .plot
      .box()
      )
ax.set_ylabel('Frequency')
Text(0, 0.5, 'Frequency')
_images/83d00543442218feeb05387c6db7af5ca3d15238078a4cd0727e4c3e7288819d.png

Hide code cell source

fname = args.out_folder / '01_0_data_stats.xlsx'
dumps[fname.name] = fname.as_posix()
writer = pd.ExcelWriter(fname)

notna = df.notna()
data_stats_original = pd.concat(
    [
        notna.sum().describe().rename('feat_stats'),
        notna.sum(axis=1).describe().rename('sample_stats')
    ],
    axis=1)
data_stats_original.to_excel(writer, sheet_name='data_stats_original')
data_stats_original
feat_stats sample_stats
count 1,542.000 210.000
mean 165.477 1,215.071
std 57.677 105.904
min 1.000 801.000
25% 136.000 1,155.000
50% 195.000 1,236.000
75% 209.000 1,297.750
max 210.000 1,377.000

In case there are multiple features for each intensity values (currenlty: peptide sequence and charge), combine the column names to a single str index.

The Collaborative Modeling approach will need a single feature column.

Hide code cell source

def join_as_str(seq):
    ret = "_".join(str(x) for x in seq)
    return ret


if isinstance(df.columns, pd.MultiIndex):
    logger.warning("combine MultiIndex columns to one feature column")
    print(df.columns[:10].map(join_as_str))
    _new_name = join_as_str(df.columns.names)
    df.columns = df.columns.map(join_as_str)
    df.columns.name = _new_name
    logger.warning(f"New name: {df.columns.names = }")

Machine metadata#

Hide code cell source

if args.fn_rawfile_metadata:
    df_meta = pd.read_csv(args.fn_rawfile_metadata, index_col=0)
else:
    logger.warning("No metadata for samples provided, create placeholder.")
    if args.meta_date_col:
        raise ValueError(
            f"No metadata provided, but data column set: {args.meta_date_col}")
    if args.meta_cat_col:
        raise ValueError(
            f"No metadata provided, but data column set: {args.meta_cat_col}")
    df_meta = pd.DataFrame(index=df.index)
df_meta = df_meta.loc[df.index.to_list()]  # index is sample index
if df_meta.index.name is None:
    df_meta.index.name = args.index_col[0]
df_meta
_collection site _age at CSF collection _gender _t-tau [ng/L] _p-tau [ng/L] _Abeta-42 [ng/L] _Abeta-40 [ng/L] _Abeta-42/Abeta-40 ratio _primary biochemical AD classification _clinical AD diagnosis _MMSE score
Sample ID
Sample_000 Sweden 71.000 f 703.000 85.000 562.000 NaN NaN biochemical control NaN NaN
Sample_001 Sweden 77.000 m 518.000 91.000 334.000 NaN NaN biochemical AD NaN NaN
Sample_002 Sweden 75.000 m 974.000 87.000 515.000 NaN NaN biochemical AD NaN NaN
Sample_003 Sweden 72.000 f 950.000 109.000 394.000 NaN NaN biochemical AD NaN NaN
Sample_004 Sweden 63.000 f 873.000 88.000 234.000 NaN NaN biochemical AD NaN NaN
... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 Berlin 69.000 f 1,945.000 NaN 699.000 12,140.000 0.058 biochemical AD AD 17.000
Sample_206 Berlin 73.000 m 299.000 NaN 1,420.000 16,571.000 0.086 biochemical control non-AD 28.000
Sample_207 Berlin 71.000 f 262.000 NaN 639.000 9,663.000 0.066 biochemical control non-AD 28.000
Sample_208 Berlin 83.000 m 289.000 NaN 1,436.000 11,285.000 0.127 biochemical control non-AD 24.000
Sample_209 Berlin 63.000 f 591.000 NaN 1,299.000 11,232.000 0.116 biochemical control non-AD 29.000

210 rows × 11 columns

Hide code cell source

if args.meta_date_col:
    df_meta[args.meta_date_col] = pd.to_datetime(
        df_meta[args.meta_date_col])
else:
    args.overwrite_entry('meta_date_col', 'PlaceholderTime')
    df_meta[args.meta_date_col] = range(len(df_meta))
df_meta
_collection site _age at CSF collection _gender _t-tau [ng/L] _p-tau [ng/L] _Abeta-42 [ng/L] _Abeta-40 [ng/L] _Abeta-42/Abeta-40 ratio _primary biochemical AD classification _clinical AD diagnosis _MMSE score PlaceholderTime
Sample ID
Sample_000 Sweden 71.000 f 703.000 85.000 562.000 NaN NaN biochemical control NaN NaN 0
Sample_001 Sweden 77.000 m 518.000 91.000 334.000 NaN NaN biochemical AD NaN NaN 1
Sample_002 Sweden 75.000 m 974.000 87.000 515.000 NaN NaN biochemical AD NaN NaN 2
Sample_003 Sweden 72.000 f 950.000 109.000 394.000 NaN NaN biochemical AD NaN NaN 3
Sample_004 Sweden 63.000 f 873.000 88.000 234.000 NaN NaN biochemical AD NaN NaN 4
... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 Berlin 69.000 f 1,945.000 NaN 699.000 12,140.000 0.058 biochemical AD AD 17.000 205
Sample_206 Berlin 73.000 m 299.000 NaN 1,420.000 16,571.000 0.086 biochemical control non-AD 28.000 206
Sample_207 Berlin 71.000 f 262.000 NaN 639.000 9,663.000 0.066 biochemical control non-AD 28.000 207
Sample_208 Berlin 83.000 m 289.000 NaN 1,436.000 11,285.000 0.127 biochemical control non-AD 24.000 208
Sample_209 Berlin 63.000 f 591.000 NaN 1,299.000 11,232.000 0.116 biochemical control non-AD 29.000 209

210 rows × 12 columns

Hide code cell source

df_meta.describe(percentiles=np.linspace(0.05, 0.95, 10))
_age at CSF collection _t-tau [ng/L] _p-tau [ng/L] _Abeta-42 [ng/L] _Abeta-40 [ng/L] _Abeta-42/Abeta-40 ratio _MMSE score PlaceholderTime
count 197.000 181.000 98.000 181.000 121.000 121.000 83.000 210.000
mean 67.726 553.624 72.449 687.105 10,505.843 0.079 25.723 104.500
std 12.123 372.272 40.869 381.119 5,192.847 0.047 4.028 60.766
min 20.000 78.000 16.000 154.000 2,450.000 0.016 12.000 0.000
5% 42.800 149.000 24.000 249.000 3,959.000 0.031 17.100 10.450
15% 60.000 218.000 33.000 349.000 5,748.000 0.037 21.000 31.350
25% 63.000 275.000 36.750 417.000 6,608.000 0.045 23.500 52.250
35% 67.000 320.000 45.900 478.000 7,866.000 0.052 25.700 73.150
45% 69.000 383.000 60.300 544.000 9,016.000 0.063 27.000 94.050
50% 70.000 441.000 73.500 593.000 9,515.000 0.067 27.000 104.500
55% 71.000 519.000 77.000 629.000 10,171.000 0.078 28.000 114.950
65% 72.000 654.000 87.000 740.000 11,466.000 0.091 28.000 135.850
75% 74.000 802.000 93.750 892.000 12,967.000 0.105 29.000 156.750
85% 77.000 920.000 109.000 1,016.000 16,531.000 0.119 29.000 177.650
95% 83.000 1,183.000 144.150 1,436.000 20,554.000 0.144 30.000 198.550
max 88.000 2,390.000 233.000 2,206.000 26,080.000 0.370 30.000 209.000

Hide code cell source

df_meta = df_meta.sort_values(args.meta_date_col)

Hide code cell source

meta_stats = df_meta.describe(include='all')
meta_stats
_collection site _age at CSF collection _gender _t-tau [ng/L] _p-tau [ng/L] _Abeta-42 [ng/L] _Abeta-40 [ng/L] _Abeta-42/Abeta-40 ratio _primary biochemical AD classification _clinical AD diagnosis _MMSE score PlaceholderTime
count 197 197.000 197 181.000 98.000 181.000 121.000 121.000 197 137 83.000 210.000
unique 4 NaN 2 NaN NaN NaN NaN NaN 2 2 NaN NaN
top Berlin NaN f NaN NaN NaN NaN NaN biochemical control non-AD NaN NaN
freq 83 NaN 99 NaN NaN NaN NaN NaN 109 88 NaN NaN
mean NaN 67.726 NaN 553.624 72.449 687.105 10,505.843 0.079 NaN NaN 25.723 104.500
std NaN 12.123 NaN 372.272 40.869 381.119 5,192.847 0.047 NaN NaN 4.028 60.766
min NaN 20.000 NaN 78.000 16.000 154.000 2,450.000 0.016 NaN NaN 12.000 0.000
25% NaN 63.000 NaN 275.000 36.750 417.000 6,608.000 0.045 NaN NaN 23.500 52.250
50% NaN 70.000 NaN 441.000 73.500 593.000 9,515.000 0.067 NaN NaN 27.000 104.500
75% NaN 74.000 NaN 802.000 93.750 892.000 12,967.000 0.105 NaN NaN 29.000 156.750
max NaN 88.000 NaN 2,390.000 233.000 2,206.000 26,080.000 0.370 NaN NaN 30.000 209.000

subset with variation

Hide code cell source

try:
    display(meta_stats.loc[:, (meta_stats.loc['unique']
            > 1) | (meta_stats.loc['std'] > 0.1)])
except KeyError:
    if 'std' in meta_stats.index:
        display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)])
    if 'unique' in meta_stats.index:
        display(meta_stats.loc[:, (meta_stats.loc['std'] > 0.1)])
_collection site _age at CSF collection _gender _t-tau [ng/L] _p-tau [ng/L] _Abeta-42 [ng/L] _Abeta-40 [ng/L] _primary biochemical AD classification _clinical AD diagnosis _MMSE score PlaceholderTime
count 197 197.000 197 181.000 98.000 181.000 121.000 197 137 83.000 210.000
unique 4 NaN 2 NaN NaN NaN NaN 2 2 NaN NaN
top Berlin NaN f NaN NaN NaN NaN biochemical control non-AD NaN NaN
freq 83 NaN 99 NaN NaN NaN NaN 109 88 NaN NaN
mean NaN 67.726 NaN 553.624 72.449 687.105 10,505.843 NaN NaN 25.723 104.500
std NaN 12.123 NaN 372.272 40.869 381.119 5,192.847 NaN NaN 4.028 60.766
min NaN 20.000 NaN 78.000 16.000 154.000 2,450.000 NaN NaN 12.000 0.000
25% NaN 63.000 NaN 275.000 36.750 417.000 6,608.000 NaN NaN 23.500 52.250
50% NaN 70.000 NaN 441.000 73.500 593.000 9,515.000 NaN NaN 27.000 104.500
75% NaN 74.000 NaN 802.000 93.750 892.000 12,967.000 NaN NaN 29.000 156.750
max NaN 88.000 NaN 2,390.000 233.000 2,206.000 26,080.000 NaN NaN 30.000 209.000

Hide code cell source

df_meta = align_meta_data(df, df_meta=df_meta)

Ensure unique indices

Hide code cell source

assert df.index.is_unique, "Duplicates in index."

Select a subset of samples if specified (reduce the number of samples)#

  • select features if select_N is specifed (for now)

  • for interpolation to make sense, it is best to select a consecutive number of samples:

    • take N most recent samples (-> check that this makes sense for your case)

Hide code cell source

if args.select_N is not None:
    args.select_N = min(args.select_N, len(df_meta))
    if args.sample_N:
        df_meta = df_meta.sample(args.select_N)
    else:
        df_meta = df_meta.iloc[-args.select_N:]

    df = df.loc[df_meta.index].dropna(
        how='all', axis=1)
    ax = df.T.describe().loc['count'].hist()
    _ = ax.set_title('histogram of features for all eligable samples')

First Step: Select features by prevalence#

  • feat_prevalence across samples

Hide code cell source

! add function
freq_per_feature = df.notna().sum()  # on wide format
if isinstance(args.feat_prevalence, float):
    N_samples = len(df)
    logger.info(f"Current number of samples: {N_samples}")
    logger.info(
        f"Feature has to be present in at least {args.feat_prevalence:.2%} of samples")
    args.overwrite_entry('feat_prevalence', int(
        N_samples * args.feat_prevalence))
assert isinstance(args.feat_prevalence, int)
! check that feature prevalence is greater equal to 3 (otherwise train, val, test split is not possible)
logger.info(
    f"Feature has to be present in at least {args.feat_prevalence} of samples")
# select features
mask = freq_per_feature >= args.feat_prevalence
logger.info(f"Drop {(~mask).sum()} features")
freq_per_feature = freq_per_feature.loc[mask]
df = df.loc[:, mask]
df
/bin/bash: line 1: add: command not found
/bin/bash: -c: line 1: syntax error near unexpected token `('
/bin/bash: -c: line 1: ` check that feature prevalence is greater equal to 3 (otherwise train, val, test split is not possible)'
root - INFO     Current number of samples: 210
root - INFO     Feature has to be present in at least 25.00% of samples
root - INFO     Feature has to be present in at least 52 of samples
root - INFO     Drop 121 features
protein groups A0A024QZX5;A0A087X1N8;P35237 A0A024R0T9;K7ER74;P02655 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 A0A075B6H7 A0A075B6H9 A0A075B6I0 A0A075B6I1 A0A075B6I6 A0A075B6I9 A0A075B6J9 A0A075B6K4 A0A075B6K5 A0A075B6P5;P01615 A0A075B6Q5 A0A075B6R2 ... Q9Y4L1 Q9Y5F6;Q9Y5F6-2 Q9Y5I4;Q9Y5I4-2 Q9Y5Y7 Q9Y617 Q9Y646 Q9Y653;Q9Y653-2;Q9Y653-3 Q9Y696 Q9Y6C2 Q9Y6N6 Q9Y6N7;Q9Y6N7-2;Q9Y6N7-4 Q9Y6R7 Q9Y6X5 Q9Y6Y8;Q9Y6Y8-2 Q9Y6Y9 S4R3U6
Sample ID
Sample_000 15.912 16.852 15.570 16.481 17.301 20.246 16.764 17.584 16.988 20.054 NaN 16.148 17.343 17.016 NaN NaN ... 18.598 16.469 17.187 18.840 16.859 19.322 16.012 15.178 NaN 15.050 16.842 19.863 NaN 19.563 12.837 12.805
Sample_001 15.936 16.874 15.519 16.387 13.796 19.941 18.786 17.144 NaN 19.067 16.188 16.127 17.417 17.779 NaN NaN ... 18.476 15.782 17.447 19.195 16.799 19.190 15.528 15.576 NaN 14.833 16.597 20.299 15.556 19.386 13.970 12.442
Sample_002 16.111 14.523 15.935 16.416 18.175 19.251 16.832 15.671 17.012 18.569 NaN 15.387 17.236 17.431 15.128 16.280 ... 18.991 17.015 17.410 19.088 16.288 19.702 15.229 14.728 13.757 15.118 17.440 19.598 15.735 20.447 12.636 12.505
Sample_003 16.107 17.032 15.802 16.979 15.963 19.628 17.852 18.877 14.182 18.985 13.438 16.565 16.267 16.990 NaN 16.777 ... 18.560 16.529 17.545 18.715 17.075 19.760 15.495 14.590 14.682 15.140 17.356 19.429 NaN 20.216 12.627 12.445
Sample_004 15.603 15.331 15.375 16.679 15.473 20.450 18.682 17.081 14.140 19.686 14.495 16.418 17.390 17.493 NaN 17.497 ... 18.305 16.285 17.297 18.668 16.736 19.624 14.757 15.094 14.048 15.256 17.075 19.582 15.328 19.867 13.145 12.235
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 15.682 16.886 14.910 16.482 NaN 17.705 17.039 NaN 16.413 19.102 16.064 15.350 17.154 17.175 NaN 16.598 ... 18.290 15.968 17.104 18.726 15.808 19.894 15.235 15.684 14.236 15.415 17.551 17.922 16.340 19.928 12.929 11.802
Sample_206 15.798 17.554 15.600 15.938 NaN 18.154 18.152 16.503 16.860 18.538 15.288 16.582 17.902 19.575 NaN 17.070 ... 17.977 16.885 17.109 18.460 15.035 20.015 15.422 16.106 NaN 15.345 17.084 18.708 14.249 19.433 NaN NaN
Sample_207 15.739 16.877 15.469 16.898 NaN 18.636 17.950 16.321 16.401 18.849 17.580 15.768 17.519 19.306 16.884 16.065 ... 18.149 15.878 16.938 19.502 16.283 20.306 15.808 16.098 14.403 15.715 16.586 18.725 16.138 19.599 13.637 11.174
Sample_208 15.477 16.779 14.995 16.132 NaN 14.908 17.530 NaN 16.119 18.368 15.202 17.560 17.502 18.977 NaN NaN ... 17.881 15.554 17.155 18.892 15.920 20.203 15.157 16.712 NaN 14.640 16.533 19.411 15.807 19.545 13.216 NaN
Sample_209 15.727 17.261 15.175 16.235 NaN 17.893 17.744 16.371 15.780 18.806 16.532 16.338 16.989 18.688 15.680 16.036 ... 18.125 16.575 16.776 18.675 15.713 20.042 15.237 15.652 15.211 14.205 16.749 19.275 15.732 19.577 11.042 11.791

210 rows × 1421 columns

Hide code cell source

notna = df.notna()
data_stats_filtered = pd.concat(
    [
        notna.sum().describe().rename('feat_stats'),
        notna.sum(axis=1).describe().rename('sample_stats')
    ],
    axis=1)
data_stats_filtered.to_excel(writer, sheet_name='data_stats_filtered')
data_stats_filtered
feat_stats sample_stats
count 1,421.000 210.000
mean 177.346 1,200.043
std 42.341 100.703
min 52.000 800.000
25% 155.000 1,147.000
50% 200.000 1,220.000
75% 210.000 1,278.750
max 210.000 1,342.000

Second step - Sample selection#

Select samples based on completeness

Hide code cell source

if isinstance(args.sample_completeness, float):
    msg = f'Fraction of minimum sample completeness over all features specified with: {args.sample_completeness}\n'
    # assumes df in wide format
    args.overwrite_entry('sample_completeness', int(
        df.shape[1] * args.sample_completeness))
    msg += f'This translates to a minimum number of features per sample (to be included): {args.sample_completeness}'
    logger.info(msg)

sample_counts = df.notna().sum(axis=1)  # if DataFrame
sample_counts.describe()
root - INFO     Fraction of minimum sample completeness over all features specified with: 0.5
This translates to a minimum number of features per sample (to be included): 710
count     210.000
mean    1,200.043
std       100.703
min       800.000
25%     1,147.000
50%     1,220.000
75%     1,278.750
max     1,342.000
dtype: float64

Hide code cell source

mask = sample_counts > args.sample_completeness
msg = f'Drop {len(mask) - mask.sum()} of {len(mask)} initial samples.'
logger.info(msg)
df = df.loc[mask]
df = df.dropna(
    axis=1, how='all')  # drop now missing features
root - INFO     Drop 0 of 210 initial samples.

Hide code cell source

args.N, args.M = df.shape  # save data dimensions
args.used_samples = df.index.to_list()

Histogram of features per sample#

Hide code cell source

group = 1
ax = df.notna().sum(axis=1).hist()
ax.set_xlabel(f'{args.feat_name_display.capitalize()} per eligable sample')
ax.set_ylabel('observations')
fname = args.out_figures / f'0_{group}_hist_features_per_sample'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_1_hist_features_per_sample
_images/f48e6c9a157abe2755cce0c88c956fc1e048d81a747399af1dfc482408b65d26.png

Hide code cell source

ax = df.notna().sum(axis=0).sort_values().plot()
_new_labels = [l_.get_text().split(';')[0] for l_ in ax.get_xticklabels()]
_ = ax.set_xticklabels(_new_labels, rotation=45,
                       horizontalalignment='right')
ax.set_xlabel(f'{args.feat_name_display.capitalize()} prevalence')
ax.set_ylabel('observations')
fname = args.out_figures / f'0_{group}_feature_prevalence'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_1_feature_prevalence
_images/04c01c58ddef0be274d4b2263e1448098ecc9fb4ef2a6583507316a49af7c565.png

Number off observations accross feature value#

Hide code cell source

min_max = pimmslearn.plotting.data.min_max(df.stack())
ax, bins = pimmslearn.plotting.data.plot_histogram_intensities(
    df.stack(), min_max=min_max)
ax.set_xlabel('Intensity binned')
fname = args.out_figures / f'0_{group}_intensity_distribution_overall'

figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_1_intensity_distribution_overall
_images/76bb625b8927dc9860c85d56a13c0cf472ed05b562ced7e4790b8e669f79c1e9.png

Hide code cell source

ax = pimmslearn.plotting.data.plot_feat_median_over_prop_missing(
    data=df, type='scatter')
fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter'
ax.set_xlabel(
    f'{args.feat_name_display.capitalize()} binned by their median intensity'
    f' (N {args.feat_name_display})')
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_1_intensity_median_vs_prop_missing_scatter
_images/2539882cc3f7fd680eee7e18387012cdeebd1bd0f86328b573f3a17c80357905.png

Hide code cell source

ax, _data_feat_median_over_prop_missing = pimmslearn.plotting.data.plot_feat_median_over_prop_missing(
    data=df, type='boxplot', return_plot_data=True)
fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot'
ax.set_xlabel(
    f'{args.feat_name_display.capitalize()} binned by their median intensity'
    f' (N {args.feat_name_display})')
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), fname)
_data_feat_median_over_prop_missing.to_csv(fname.with_suffix('.csv'))
# _data_feat_median_over_prop_missing.to_excel(fname.with_suffix('.xlsx'))
del _data_feat_median_over_prop_missing
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/data.py:327: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  ax = ax[0]  # returned series due to by argument?
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_1_intensity_median_vs_prop_missing_boxplot
_images/a82b70b0d2531921d92d365266313545986927c8ca1670cd2f0219bfbf1c28a8.png

Interactive and Single plots#

Hide code cell source

_feature_display_name = f'identified {args.feat_name_display}'
sample_counts.name = _feature_display_name

Hide code cell source

K = 2
df = df.astype(float)
pcs = get_PCA(df, n_components=K)  # should be renamed to get_PCs
pcs = pcs.iloc[:, :K].join(df_meta).join(sample_counts)

pcs_name = pcs.columns[:2]
pcs_index_name = pcs.index.name
pcs = pcs.reset_index()
pcs
Sample ID principal component 1 (11.62 %) principal component 2 (10.17 %) _collection site _age at CSF collection _gender _t-tau [ng/L] _p-tau [ng/L] _Abeta-42 [ng/L] _Abeta-40 [ng/L] _Abeta-42/Abeta-40 ratio _primary biochemical AD classification _clinical AD diagnosis _MMSE score PlaceholderTime identified protein groups
0 Sample_000 -16.681 -8.492 Sweden 71.000 f 703.000 85.000 562.000 NaN NaN biochemical control NaN NaN 0 1,205
1 Sample_001 -12.801 -5.119 Sweden 77.000 m 518.000 91.000 334.000 NaN NaN biochemical AD NaN NaN 1 1,219
2 Sample_002 -13.784 -12.951 Sweden 75.000 m 974.000 87.000 515.000 NaN NaN biochemical AD NaN NaN 2 1,256
3 Sample_003 -16.440 -10.112 Sweden 72.000 f 950.000 109.000 394.000 NaN NaN biochemical AD NaN NaN 3 1,225
4 Sample_004 -15.129 -6.471 Sweden 63.000 f 873.000 88.000 234.000 NaN NaN biochemical AD NaN NaN 4 1,209
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
205 Sample_205 6.682 -15.297 Berlin 69.000 f 1,945.000 NaN 699.000 12,140.000 0.058 biochemical AD AD 17.000 205 1,339
206 Sample_206 7.405 -6.126 Berlin 73.000 m 299.000 NaN 1,420.000 16,571.000 0.086 biochemical control non-AD 28.000 206 1,258
207 Sample_207 7.329 -11.041 Berlin 71.000 f 262.000 NaN 639.000 9,663.000 0.066 biochemical control non-AD 28.000 207 1,301
208 Sample_208 8.058 -5.501 Berlin 83.000 m 289.000 NaN 1,436.000 11,285.000 0.127 biochemical control non-AD 24.000 208 1,259
209 Sample_209 3.657 -7.572 Berlin 63.000 f 591.000 NaN 1,299.000 11,232.000 0.116 biochemical control non-AD 29.000 209 1,247

210 rows × 16 columns

Hide code cell source

pcs.describe(include='all').T
count unique top freq mean std min 25% 50% 75% max
Sample ID 210 210 Sample_000 1 NaN NaN NaN NaN NaN NaN NaN
principal component 1 (11.62 %) 210.000 NaN NaN NaN -0.000 8.981 -17.550 -11.061 4.635 6.655 9.125
principal component 2 (10.17 %) 210.000 NaN NaN NaN -0.000 8.404 -18.953 -6.310 0.305 5.485 23.359
_collection site 197 4 Berlin 83 NaN NaN NaN NaN NaN NaN NaN
_age at CSF collection 197.000 NaN NaN NaN 67.726 12.123 20.000 63.000 70.000 74.000 88.000
_gender 197 2 f 99 NaN NaN NaN NaN NaN NaN NaN
_t-tau [ng/L] 181.000 NaN NaN NaN 553.624 372.272 78.000 275.000 441.000 802.000 2,390.000
_p-tau [ng/L] 98.000 NaN NaN NaN 72.449 40.869 16.000 36.750 73.500 93.750 233.000
_Abeta-42 [ng/L] 181.000 NaN NaN NaN 687.105 381.119 154.000 417.000 593.000 892.000 2,206.000
_Abeta-40 [ng/L] 121.000 NaN NaN NaN 10,505.843 5,192.847 2,450.000 6,608.000 9,515.000 12,967.000 26,080.000
_Abeta-42/Abeta-40 ratio 121.000 NaN NaN NaN 0.079 0.047 0.016 0.045 0.067 0.105 0.370
_primary biochemical AD classification 197 2 biochemical control 109 NaN NaN NaN NaN NaN NaN NaN
_clinical AD diagnosis 137 2 non-AD 88 NaN NaN NaN NaN NaN NaN NaN
_MMSE score 83.000 NaN NaN NaN 25.723 4.028 12.000 23.500 27.000 29.000 30.000
PlaceholderTime 210.000 NaN NaN NaN 104.500 60.766 0.000 52.250 104.500 156.750 209.000
identified protein groups 210.000 NaN NaN NaN 1,200.043 100.703 800.000 1,147.000 1,220.000 1,278.750 1,342.000

Hide code cell source

if args.meta_cat_col:
    fig, ax = plt.subplots(figsize=(3, 3))
    analyzers.seaborn_scatter(
        pcs[pcs_name], ax, meta=pcs[args.meta_cat_col], title=f"by {args.meta_cat_col}")
    fname = (args.out_figures
             / f'0_{group}_pca_sample_by_{"_".join(args.meta_cat_col.split())}')
    figures[fname.stem] = fname
    pimmslearn.savefig(fig, fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_1_pca_sample_by__collection_site
_images/0b4446e61193d8514f3785efdd4967b4c6d7a9fdac403377821b28e0e1cf1216.png

Hide code cell source

if args.meta_date_col != 'PlaceholderTime':
    fig, ax = plt.subplots()
    analyzers.plot_date_map(
        df=pcs[pcs_name], ax=ax, dates=pcs[args.meta_date_col], title=f'by {args.meta_date_col}')
    fname = args.out_figures / f'0_{group}_pca_sample_by_date'
    figures[fname.stem] = fname
    pimmslearn.savefig(fig, fname)
  • size: number of features in a single sample

Hide code cell source

fig, ax = plt.subplots()
col_identified_feat = _feature_display_name
analyzers.plot_scatter(
    pcs[pcs_name],
    ax,
    pcs[col_identified_feat],
    feat_name_display=args.feat_name_display,
    size=5,
)
fname = (args.out_figures
         / f'0_{group}_pca_sample_by_{"_".join(col_identified_feat.split())}.pdf')
figures[fname.stem] = fname
pimmslearn.savefig(fig, fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_1_pca_sample_by_identified_protein_groups.pdf
_images/4c08bdd6d4592004b6d42af813ec3564180ef5773d3b81b5c658f925665b32b2.png

Hide code cell source

# ! write principal components to excel (if needed)
# pcs.set_index([df.index.name])[[*pcs_name, col_identified_feat]].to_excel(fname.with_suffix('.xlsx'))

Hide code cell source

fig = px.scatter(
    pcs, x=pcs_name[0], y=pcs_name[1],
    hover_name=pcs_index_name,
    # hover_data=analysis.df_meta,
    title=f'First two Principal Components of {args.M} {args.feat_name_display} for {pcs.shape[0]} samples',
    # color=pcs['Software Version'],
    color=col_identified_feat,
    template='none',
    width=1200,  # 4 inches x 300 dpi
    height=600  # 2 inches x 300 dpi
)
fname = (args.out_figures
         / f'0_{group}_pca_sample_by_{"_".join(col_identified_feat.split())}_plotly.pdf')
figures[fname.stem] = fname
fig.write_image(fname)
fig  # stays interactive in html
choreographer.browsers.chromium - INFO     Chromium init'ed with kwargs {}
choreographer.browsers.chromium - INFO     Found chromium path: /usr/bin/google-chrome
choreographer.utils._tmpfile - INFO     Temp directory created: /tmp/tmp8en2t9dh.
choreographer.browser_async - INFO     Opening browser.
choreographer.utils._tmpfile - INFO     Temp directory created: /tmp/tmpesnmtc3y.
choreographer.browsers.chromium - INFO     ldd failed. e: Command '['ldd', '/usr/bin/google-chrome']' returned non-zero exit status 1., stderr: None
choreographer.browsers.chromium - INFO     Temporary directory at: /tmp/tmpesnmtc3y
kaleido.kaleido - INFO     Conforming 1 to file:///tmp/tmp8en2t9dh/index.html
kaleido.kaleido - INFO     Waiting on all navigates
kaleido.kaleido - INFO     All navigates done, putting them all in queue.
kaleido.kaleido - INFO     Getting tab from queue (has 1)
kaleido.kaleido - INFO     Got CCF1
kaleido._kaleido_tab - INFO     Processing First_two_Principal_Components_of_1421_protein_groups_for_210_samples.pdf
kaleido._kaleido_tab - INFO     Sending big command for First_two_Principal_Components_of_1421_protein_groups_for_210_samples.pdf.
kaleido._kaleido_tab - INFO     Sent big command for First_two_Principal_Components_of_1421_protein_groups_for_210_samples.pdf.
kaleido.kaleido - INFO     Reloading tab CCF1 before return.
kaleido.kaleido - INFO     Putting tab CCF1 back (queue size: 0).
kaleido.kaleido - INFO     Waiting for all cleanups to finish.
kaleido.kaleido - INFO     Exiting Kaleido
choreographer.utils._tmpfile - INFO     TemporaryDirectory.cleanup() worked.
choreographer.utils._tmpfile - INFO     shutil.rmtree worked.
kaleido.kaleido - INFO     Cancelling tasks.
kaleido.kaleido - INFO     Exiting Kaleido/Choreo
choreographer.browser_async - INFO     Closing browser.
choreographer.utils._tmpfile - INFO     TemporaryDirectory.cleanup() worked.
choreographer.utils._tmpfile - INFO     shutil.rmtree worked.
kaleido.kaleido - INFO     Cancelling tasks.
kaleido.kaleido - INFO     Exiting Kaleido/Choreo
choreographer.browser_async - INFO     Closing browser.
choreographer.utils._tmpfile - INFO     TemporaryDirectory.cleanup() worked.
choreographer.utils._tmpfile - INFO     shutil.rmtree worked.
choreographer.utils._tmpfile - INFO     TemporaryDirectory.cleanup() worked.
choreographer.utils._tmpfile - INFO     shutil.rmtree worked.

Sample Medians and percentiles#

Hide code cell source

df.head()
protein groups A0A024QZX5;A0A087X1N8;P35237 A0A024R0T9;K7ER74;P02655 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 A0A075B6H7 A0A075B6H9 A0A075B6I0 A0A075B6I1 A0A075B6I6 A0A075B6I9 A0A075B6J9 A0A075B6K4 A0A075B6K5 A0A075B6P5;P01615 A0A075B6Q5 A0A075B6R2 ... Q9Y4L1 Q9Y5F6;Q9Y5F6-2 Q9Y5I4;Q9Y5I4-2 Q9Y5Y7 Q9Y617 Q9Y646 Q9Y653;Q9Y653-2;Q9Y653-3 Q9Y696 Q9Y6C2 Q9Y6N6 Q9Y6N7;Q9Y6N7-2;Q9Y6N7-4 Q9Y6R7 Q9Y6X5 Q9Y6Y8;Q9Y6Y8-2 Q9Y6Y9 S4R3U6
Sample ID
Sample_000 15.912 16.852 15.570 16.481 17.301 20.246 16.764 17.584 16.988 20.054 NaN 16.148 17.343 17.016 NaN NaN ... 18.598 16.469 17.187 18.840 16.859 19.322 16.012 15.178 NaN 15.050 16.842 19.863 NaN 19.563 12.837 12.805
Sample_001 15.936 16.874 15.519 16.387 13.796 19.941 18.786 17.144 NaN 19.067 16.188 16.127 17.417 17.779 NaN NaN ... 18.476 15.782 17.447 19.195 16.799 19.190 15.528 15.576 NaN 14.833 16.597 20.299 15.556 19.386 13.970 12.442
Sample_002 16.111 14.523 15.935 16.416 18.175 19.251 16.832 15.671 17.012 18.569 NaN 15.387 17.236 17.431 15.128 16.280 ... 18.991 17.015 17.410 19.088 16.288 19.702 15.229 14.728 13.757 15.118 17.440 19.598 15.735 20.447 12.636 12.505
Sample_003 16.107 17.032 15.802 16.979 15.963 19.628 17.852 18.877 14.182 18.985 13.438 16.565 16.267 16.990 NaN 16.777 ... 18.560 16.529 17.545 18.715 17.075 19.760 15.495 14.590 14.682 15.140 17.356 19.429 NaN 20.216 12.627 12.445
Sample_004 15.603 15.331 15.375 16.679 15.473 20.450 18.682 17.081 14.140 19.686 14.495 16.418 17.390 17.493 NaN 17.497 ... 18.305 16.285 17.297 18.668 16.736 19.624 14.757 15.094 14.048 15.256 17.075 19.582 15.328 19.867 13.145 12.235

5 rows × 1421 columns

Hide code cell source

df_w_date = df.join(df_meta[args.meta_date_col])
df_w_date = df_w_date.set_index(args.meta_date_col).sort_index()
if not args.meta_date_col == 'PlaceholderTime':
    df_w_date.to_period('min')
df_w_date = df_w_date.T
df_w_date
PlaceholderTime 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 ... 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
A0A024QZX5;A0A087X1N8;P35237 15.912 15.936 16.111 16.107 15.603 15.812 15.500 15.221 15.980 15.679 15.981 15.983 15.949 15.475 15.706 15.744 ... 14.994 15.211 14.609 15.119 15.455 15.614 NaN 15.863 15.656 15.401 NaN 15.682 15.798 15.739 15.477 15.727
A0A024R0T9;K7ER74;P02655 16.852 16.874 14.523 17.032 15.331 18.614 17.409 17.684 16.386 16.590 15.948 16.669 17.694 17.726 17.252 17.340 ... 17.586 18.131 18.024 18.273 17.390 17.834 17.310 16.615 17.953 18.199 17.279 16.886 17.554 16.877 16.779 17.261
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 15.570 15.519 15.935 15.802 15.375 15.624 15.912 15.385 15.894 15.375 15.993 15.654 15.751 15.191 15.469 14.457 ... NaN 15.143 NaN 15.399 NaN 16.456 NaN 15.932 14.859 15.125 15.287 14.910 15.600 15.469 14.995 15.175
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 16.481 16.387 16.416 16.979 16.679 15.958 16.234 16.418 16.271 16.210 16.742 16.159 15.790 16.403 16.433 16.024 ... 16.300 16.166 15.935 16.335 16.116 16.107 15.954 16.030 16.498 16.513 16.513 16.482 15.938 16.898 16.132 16.235
A0A075B6H7 17.301 13.796 18.175 15.963 15.473 18.317 NaN 17.214 17.794 NaN 16.166 16.681 NaN 17.284 17.545 16.999 ... NaN NaN 12.612 NaN NaN NaN NaN 16.483 14.835 13.569 14.483 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Q9Y6R7 19.863 20.299 19.598 19.429 19.582 19.130 18.690 18.996 19.993 18.550 19.727 19.216 20.987 19.837 18.966 19.874 ... 18.905 17.909 18.558 19.101 17.878 19.403 19.233 18.816 18.902 18.813 19.456 17.922 18.708 18.725 19.411 19.275
Q9Y6X5 NaN 15.556 15.735 NaN 15.328 NaN NaN NaN NaN NaN 15.187 15.403 14.979 14.827 NaN 15.801 ... 16.177 14.916 15.355 15.554 14.801 16.005 15.895 15.562 16.089 15.669 15.012 16.340 14.249 16.138 15.807 15.732
Q9Y6Y8;Q9Y6Y8-2 19.563 19.386 20.447 20.216 19.867 19.633 20.057 18.680 20.023 19.948 19.047 20.855 19.851 20.344 19.532 21.643 ... 19.364 19.283 19.382 19.690 18.413 19.706 18.582 19.809 20.204 19.418 17.847 19.928 19.433 19.599 19.545 19.577
Q9Y6Y9 12.837 13.970 12.636 12.627 13.145 12.224 12.817 12.897 NaN 13.685 13.478 12.923 NaN NaN NaN NaN ... 12.398 12.172 11.766 NaN NaN NaN NaN NaN 12.707 12.978 12.287 12.929 NaN 13.637 13.216 11.042
S4R3U6 12.805 12.442 12.505 12.445 12.235 NaN NaN NaN 13.008 12.279 12.534 11.679 NaN NaN NaN NaN ... NaN 10.033 NaN 10.309 NaN 11.089 NaN 10.133 11.529 10.498 10.563 11.802 NaN 11.174 NaN 11.791

1421 rows × 210 columns

Hide code cell source

ax = df_w_date.plot.box(rot=80,
                        figsize=(7, 3),
                        fontsize=7,
                        showfliers=False,
                        showcaps=False,
                        boxprops=dict(linewidth=.4, color='darkblue'),
                        flierprops=dict(markersize=.4, color='lightblue'),
                        )
_ = pimmslearn.plotting.select_xticks(ax)
fig = ax.get_figure()
fname = args.out_figures / f'0_{group}_median_boxplot'
df_w_date.to_pickle(fname.with_suffix('.pkl'))
figures[fname.stem] = fname
pimmslearn.savefig(fig, fname)
del df_w_date
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_1_median_boxplot
_images/830bca6c3075c8d3a1ef7b9c98321ff8241cd6dd5c530d48fbab548be2fde407.png

Percentiles of intensities in dataset

Hide code cell source

df.stack().describe(percentiles=np.linspace(0.05, 0.95, 19).round(2))
count   252,009.000
mean         17.119
std           2.567
min           6.695
5%           13.564
10%          14.280
15%          14.739
20%          15.098
25%          15.404
30%          15.677
35%          15.933
40%          16.194
45%          16.458
50%          16.733
55%          17.026
60%          17.337
65%          17.688
70%          18.087
75%          18.561
80%          19.128
85%          19.729
90%          20.519
95%          21.770
max          30.735
dtype: float64

Plot sample median over time#

  • check if points are equally spaced (probably QC samples are run in close proximity)

  • the machine will be not use for intermediate periods

Hide code cell source

if not args.meta_date_col == 'PlaceholderTime':
    dates = df_meta[args.meta_date_col].sort_values()
    median_sample_intensity = (df
                               .median(axis=1)
                               .to_frame('median intensity'))
    median_sample_intensity = median_sample_intensity.join(dates)

    ax = median_sample_intensity.plot.scatter(x=dates.name, y='median intensity',
                                              rot=90,
                                              #   fontsize=6,
                                              figsize=(8, 2),
                                              s=5,
                                              xticks=pimmslearn.plotting.select_dates(
                                                  median_sample_intensity[dates.name])
                                              )
    fig = ax.get_figure()
    fname = args.out_figures / f'0_{group}_median_scatter'
    figures[fname.stem] = fname
    pimmslearn.savefig(fig, fname)
  • the closer the labels are there denser the samples are measured around that time.

Feature frequency in data#

Hide code cell source

msg = "Total number of samples in data: {}"
logger.info(msg.format(len(df)))
root - INFO     Total number of samples in data: 210

Recalculate feature frequency after selecting samples

Hide code cell source

freq_per_feature = feature_frequency(df)
freq_per_feature
protein groups
A0A024QZX5;A0A087X1N8;P35237                                                     197
A0A024R0T9;K7ER74;P02655                                                         208
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8   185
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                          208
A0A075B6H7                                                                        97
                                                                                  ..
Q9Y6R7                                                                           210
Q9Y6X5                                                                           186
Q9Y6Y8;Q9Y6Y8-2                                                                  210
Q9Y6Y9                                                                           126
S4R3U6                                                                           135
Name: freq, Length: 1421, dtype: int64

Hide code cell source

# freq_per_feature.name = 'Gene names freq' # name it differently?
# index.name is lost when data is stored
fname = args.data / 'freq_features.json'
dumps[fname.name] = fname
freq_per_feature.to_json(fname)
fname = fname.with_suffix('.pkl')
dumps[fname.name] = fname
freq_per_feature.to_pickle(fname)

Split: Train, validation and test data#

Select features as described in

Lazar, Cosmin, Laurent Gatto, Myriam Ferro, Christophe Bruley, and Thomas Burger. 2016. “Accounting for the Multiple Natures of Missing Values in Label-Free Quantitative Proteomics Data Sets to Compare Imputation Strategies.” Journal of Proteome Research 15 (4): 1116–25.

  • select frac_mnar based on threshold matrix on quantile of overall frac of data to be used for validation and test data split, e.g. 0.1 = quantile(0.1)

  • select frac_mnar from intensities selected using threshold matrix

Hide code cell source

splits = DataSplits(is_wide_format=False)
logger.info(f"{splits = }")
splits.__annotations__
root - INFO     splits = DataSplits(train_X=None, val_y=None, test_y=None)
{'is_wide_format': 'bool',
 'train_X': 'pd.DataFrame',
 'val_y': 'pd.DataFrame',
 'test_y': 'pd.DataFrame'}

Create some target values by sampling X% of the validation and test data. Simulated missing values are not used for validation and testing.

Hide code cell source

df_long = pimmslearn.io.datasplits.long_format(df)
df_long.head()
intensity
Sample ID protein groups
Sample_000 A0A024QZX5;A0A087X1N8;P35237 15.912
A0A024R0T9;K7ER74;P02655 16.852
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 15.570
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 16.481
A0A075B6H7 17.301

Hide code cell source

group = 2

splits, thresholds, fake_na_mcar, fake_na_mnar = pimmslearn.sampling.sample_mnar_mcar(
    df_long=df_long,
    frac_non_train=args.frac_non_train,
    frac_mnar=args.frac_mnar,
    random_state=args.random_state,
)
logger.info(f"{splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }")
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/sampling.py:209: FutureWarning:

Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead

/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/sampling.py:210: FutureWarning:

Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead

pimmslearn.sampling - INFO     int(N * frac_non_train) = 25,200
pimmslearn.sampling - INFO     len(fake_na_mnar) = 6,300
pimmslearn.sampling - INFO     len(splits.train_X) = 245,709
pimmslearn.sampling - INFO     len(fake_na) = 25,200
pimmslearn.sampling - INFO     len(fake_na_mcar) = 18,900
root - INFO     splits.train_X.shape = (226809,) - splits.val_y.shape = (12600,) - splits.test_y.shape = (12600,)

Hide code cell source

N = len(df_long)
N_MCAR = len(fake_na_mcar)
N_MNAR = len(fake_na_mnar)

fig, axes = plt.subplots(1, 2, figsize=(6, 2))
ax = axes[0]
plot_histogram_intensities = partial(pimmslearn.plotting.data.plot_histogram_intensities,
                                     min_max=min_max,
                                     alpha=0.8)
plot_histogram_intensities(
    df_long.squeeze(),
    ax=ax,
    label='observed')
plot_histogram_intensities(
    thresholds,
    ax=ax,
    label='thresholds')
if args.use_every_nth_xtick > 1:
    ax.set_xticks(ax.get_xticks()[::2])
ax.legend()
ax = axes[1]
plot_histogram_intensities(
    fake_na_mnar.squeeze(),
    ax=ax,
    label=f'MNAR ({N_MNAR:,d})',
    color='C2')
plot_histogram_intensities(
    fake_na_mcar.squeeze(),
    ax=ax,
    color='C3',
    label=f'MCAR ({N_MCAR:,d})')
if args.use_every_nth_xtick > 1:
    ax.set_xticks(ax.get_xticks()[::2])
ax.legend()
fname = args.out_figures / f'0_{group}_mnar_mcar_histograms.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_2_mnar_mcar_histograms.pdf
_images/852e1d729a527083b7f95baeeec09c0c6d3734df5f96daf608608d2e55bac42b.png

Hide code cell source

counts_per_bin = pimmslearn.pandas.get_counts_per_bin(
    df=pd.concat(
        [df_long.squeeze().to_frame('observed'),
         thresholds.to_frame('threshold'),
         fake_na_mnar.squeeze().to_frame(f'MNAR ({N_MNAR:,d})'),
         fake_na_mcar.squeeze().to_frame(f'MCAR ({N_MCAR:,d})')],
        axis=1),
    bins=range(min_max[0], min_max[1] + 1, 1))
counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
observed threshold MNAR (6,300) MCAR (18,900)
bin
(6, 7] 1 0 0 1
(7, 8] 4 0 2 0
(8, 9] 39 0 8 2
(9, 10] 136 0 20 13
(10, 11] 529 3 114 29
(11, 12] 1,731 387 353 103
(12, 13] 4,644 11,870 953 310
(13, 14] 12,161 77,911 2,085 729
(14, 15] 27,515 117,843 2,120 1,951
(15, 16] 44,723 40,668 611 3,406
(16, 17] 45,981 3,261 33 3,511
(17, 18] 36,274 63 1 2,733
(18, 19] 25,049 3 0 1,975
(19, 20] 20,386 0 0 1,619
(20, 21] 13,495 0 0 1,061
(21, 22] 8,234 0 0 602
(22, 23] 4,885 0 0 367
(23, 24] 2,459 0 0 197
(24, 25] 1,237 0 0 98
(25, 26] 1,392 0 0 108
(26, 27] 643 0 0 46
(27, 28] 88 0 0 4
(28, 29] 216 0 0 15
(29, 30] 163 0 0 18
(30, 31] 24 0 0 2

Keep simulated samples only in a subset of the samples#

In case only a subset of the samples should be used for validation and testing, although these samples can be used for fitting the models, the following cell will select samples stratified by the eventually set meta_cat_col column.

The procedure is experimental and turned off by default.

Hide code cell source

if 0.0 < args.prop_sample_w_sim < 1.0:
    to_stratify = None
    if args.meta_cat_col and df_meta is not None:
        to_stratify = df_meta[args.meta_cat_col].fillna(-1)  # ! fillna with -1 as separate category (sofisticate check)
    train_idx, val_test_idx = train_test_split(splits.train_X.index.levels[0],
                                               test_size=args.prop_sample_w_sim,
                                               stratify=to_stratify,
                                               random_state=42)
    val_idx, test_idx = train_test_split(val_test_idx,
                                         test_size=.5,
                                         stratify=to_stratify.loc[val_test_idx] if to_stratify is not None else None,
                                         random_state=42)
    logger.info(f"Sample in Train: {len(train_idx):,d} - Validation: {len(val_idx):,d} - Test: {len(test_idx):,d}")
    # reassign some simulated missing values to training data:
    splits.train_X = pd.concat(
        [splits.train_X,
         splits.val_y.loc[train_idx],
         splits.test_y.loc[train_idx]
         ])
    splits.val_y = splits.val_y.loc[val_idx]
    splits.test_y = splits.test_y.loc[test_idx]
    logger.info(f"New shapes: {splits.train_X.shape = } - {splits.val_y.shape = } - {splits.test_y.shape = }")

Hide code cell source

splits.test_y.groupby(level=-1).count().describe()
count   1,418.000
mean        8.886
std         3.986
min         1.000
25%         6.000
50%         8.000
75%        11.000
max        32.000
Name: intensity, dtype: float64

Hide code cell source

splits.val_y
Sample ID   protein groups                   
Sample_158  Q9UN70;Q9UN70-2                     14.630
Sample_050  Q9Y287                              15.755
Sample_107  Q8N475;Q8N475-2                     15.029
Sample_199  P06307                              19.376
Sample_067  Q5VUB5                              15.309
                                                 ...  
Sample_111  F6SYF8;Q9UBP4                       22.822
Sample_002  A0A0A0MT36                          18.165
Sample_049  Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4   15.525
Sample_182  Q8NFT8                              14.379
Sample_123  Q16853;Q16853-2                     14.504
Name: intensity, Length: 12600, dtype: float64

Hide code cell source

splits.train_X.groupby(level=-1).count().describe()
count   1,421.000
mean      159.612
std        42.227
min        36.000
25%       134.000
50%       179.000
75%       192.000
max       203.000
Name: intensity, dtype: float64

Hide code cell source

# Check that feature indices and sample indicies overlap between splits
# -> a single feature cannot be only in the validation or test split
# -> single features should be put into the training data
# -> or raise error as feature completness treshold is so low that less than 3 samples
# per feature are allowd.

splits = pimmslearn.sampling.check_split_integrity(splits)

Some tools require at least 4 observation in the training data, which is a good requirment. Due to “MNAR” sampling, most measurments of a features can end up in the validation or test data.

In that case: Move the validation measurments back to the training data. If after this procedure the condition is still not met, a value error is raised.

Hide code cell source

mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4
if mask_min_4_measurments.any():
    idx = mask_min_4_measurments.loc[mask_min_4_measurments].index
    logger.warning(f"Features with less than 4 measurments in training data: {idx.to_list()}")
    to_remove = splits.val_y.loc[pd.IndexSlice[:, idx]]
    logger.info("To remove from validation data: ")
    display(to_remove)
    splits.train_X = pd.concat([splits.train_X, to_remove])
    splits.val_y = splits.val_y.drop(to_remove.index)
    # check condition again
    mask_min_4_measurments = splits.train_X.groupby(level=1).count() < 4
    if mask_min_4_measurments.any():
        idx = mask_min_4_measurments.loc[mask_min_4_measurments].index
        raise ValueError("Some features still have less than 4 measurments in training data"
                         f" after removing the features from the validation data: {idx.to_list()}")

Save in long format#

  • Data in long format: (peptide, sample_id, intensity)

  • no missing values kept

Hide code cell source

# dumps data in long-format
splits_dumped = splits.dump(folder=args.data, file_format=args.file_format)
dumps.update(splits_dumped)
splits_dumped
pimmslearn.io.datasplits - INFO     'test_y' has shape: (12600,)
pimmslearn.io.datasplits - INFO     save 'test_y' to file: runs/alzheimer_study/data/test_y.csv
pimmslearn.io.datasplits - INFO     'train_X' has shape: (226809,)
pimmslearn.io.datasplits - INFO     save 'train_X' to file: runs/alzheimer_study/data/train_X.csv
pimmslearn.io.datasplits - INFO     'val_y' has shape: (12600,)
pimmslearn.io.datasplits - INFO     save 'val_y' to file: runs/alzheimer_study/data/val_y.csv
{'test_y.csv': 'runs/alzheimer_study/data/test_y.csv',
 'train_X.csv': 'runs/alzheimer_study/data/train_X.csv',
 'val_y.csv': 'runs/alzheimer_study/data/val_y.csv'}

Reload from disk#

Hide code cell source

splits = DataSplits.from_folder(args.data, file_format=args.file_format)
pimmslearn.io.datasplits - INFO     Loaded 'train_X' from file: runs/alzheimer_study/data/train_X.csv
pimmslearn.io.datasplits - INFO     Loaded 'val_y' from file: runs/alzheimer_study/data/val_y.csv
pimmslearn.io.datasplits - INFO     Loaded 'test_y' from file: runs/alzheimer_study/data/test_y.csv

plot distribution of splits#

Hide code cell source

splits_df = pd.DataFrame(index=df_long.index)
splits_df['train'] = splits.train_X
splits_df['val'] = splits.val_y
splits_df['test'] = splits.test_y
stats_splits = splits_df.describe()
stats_splits.to_excel(writer, 'stats_splits', float_format='%.3f')
stats_splits
/tmp/ipykernel_3011/3486782611.py:6: FutureWarning:

Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
train val test
count 226,809.000 12,600.000 12,600.000
mean 17.206 16.340 16.339
std 2.533 2.717 2.741
min 7.068 6.695 7.209
25% 15.506 14.395 14.412
50% 16.805 15.958 15.935
75% 18.626 17.931 17.910
max 30.735 29.841 30.140

Hide code cell source

# whitespaces in legends are not displayed correctly...
# max_int_len   = len(str(int(stats_splits.loc['count'].max()))) +1
# _legend = [
#     f'{s:<5} (N={int(stats_splits.loc["count", s]):>{max_int_len},d})'.replace(
#         ' ', '\u00A0')
#     for s in ('train', 'val', 'test')]
_legend = [
    f'{s:<5} (N={int(stats_splits.loc["count", s]):,d})'
    for s in ('train', 'val', 'test')]
print(_legend)
['train (N=226,809)', 'val   (N=12,600)', 'test  (N=12,600)']

Hide code cell source

group = 3
ax = (splits
      .train_X
      .plot
      .hist(
          bins=bins,
          ax=None,
          color='C0',
      ))
_ = (splits
     .val_y
     .plot
     .hist(bins=bins,
           xticks=list(bins),
           ax=ax,
           color='C2',
           legend=True)
     )
ax.legend(_legend[:-1])
if args.use_every_nth_xtick > 1:
    ax.set_xticks(ax.get_xticks()[::2])
ax.set_xlabel('Intensity bins')
fname = args.out_figures / f'0_{group}_val_over_train_split.pdf'
figures[fname.name] = fname
pimmslearn.savefig(ax.get_figure(), fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_3_val_over_train_split.pdf
_images/53ccdee7b26a7851a83b6638b772a69dc9f4239539d414e4d0005a3f87f3e723.png

Hide code cell source

min_bin, max_bin = pimmslearn.plotting.data.min_max(splits.val_y)
bins = range(int(min_bin), int(max_bin) + 1, 1)
ax = splits_df.plot.hist(bins=bins,
                         xticks=list(bins),
                         legend=False,
                         stacked=True,
                         color=['C0', 'C1', 'C2'],
                         )
if args.use_every_nth_xtick > 1:
    ax.set_xticks(ax.get_xticks()[::2])
ax.legend(_legend)
ax.set_xlabel('Intensity bins')
ax.yaxis.set_major_formatter("{x:,.0f}")
fname = args.out_figures / f'0_{group}_splits_freq_stacked.pdf'
figures[fname.name] = fname
pimmslearn.savefig(ax.get_figure(), fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_3_splits_freq_stacked.pdf
_images/347eb95c899b32273999a22ff315c4a7ac1e23e870b7e58b19d1b24f8b864051.png

Hide code cell source

counts_per_bin = pimmslearn.pandas.get_counts_per_bin(df=splits_df, bins=bins)
counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
train val test
bin
(6, 7] 0 1 0
(7, 8] 2 0 2
(8, 9] 29 3 7
(9, 10] 103 15 18
(10, 11] 386 74 69
(11, 12] 1,275 239 217
(12, 13] 3,381 629 634
(13, 14] 9,347 1,420 1,394
(14, 15] 23,444 2,029 2,042
(15, 16] 40,706 1,963 2,054
(16, 17] 42,437 1,757 1,787
(17, 18] 33,540 1,401 1,333
(18, 19] 23,074 1,010 965
(19, 20] 18,767 827 792
(20, 21] 12,434 525 536
(21, 22] 7,632 282 320
(22, 23] 4,518 185 182
(23, 24] 2,262 95 102
(24, 25] 1,139 53 45
(25, 26] 1,284 58 50
(26, 27] 597 21 25
(27, 28] 84 1 3
(28, 29] 201 7 8
(29, 30] 145 5 13

Hide code cell source

ax = splits_df.drop('train', axis=1).plot.hist(bins=bins,
                                               xticks=list(bins),
                                               color=['C1', 'C2'],
                                               legend=False,
                                               stacked=True,
                                               )
if args.use_every_nth_xtick > 1:
    ax.set_xticks(ax.get_xticks()[::2])
ax.legend(_legend[1:])
ax.set_xlabel('Intensity bins')
ax.yaxis.set_major_formatter("{x:,.0f}")
fname = args.out_figures / f'0_{group}_val_test_split_freq_stacked_.pdf'
figures[fname.name] = fname
pimmslearn.savefig(ax.get_figure(), fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_3_val_test_split_freq_stacked_.pdf
_images/f595bb71dc1b9646fd97941ee72f96f43ea093c5727e7c4ed27c0cd3ff967023.png

plot training data missing plots

Hide code cell source

splits.to_wide_format()

Hide code cell source

ax = pimmslearn.plotting.data.plot_feat_median_over_prop_missing(
    data=splits.train_X, type='scatter')
fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_scatter_train'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_3_intensity_median_vs_prop_missing_scatter_train
_images/c2376bf958908e17364d3d2c5bb9f5feccab8ec014cb2c63df8ea46a2fe4d4da.png

Hide code cell source

ax = pimmslearn.plotting.data.plot_feat_median_over_prop_missing(
    data=splits.train_X, type='boxplot')
fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_train'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/data.py:327: FutureWarning:

Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_3_intensity_median_vs_prop_missing_boxplot_train
_images/cc29fa3679a135839a2d22d07379b257d21a68d05c147ff9b945fd4f8b298139.png

Hide code cell source

medians = (splits
           .train_X
           .median()
           .astype(int)
           ).to_frame('median_floor')

feat_with_median = medians.groupby('median_floor').size().rename('M feat')
medians = medians.join(feat_with_median, on='median_floor')
medians = medians.apply(lambda s: "{:02,d} (N={:3,d})".format(*s), axis=1)

fig, ax = plt.subplots(figsize=(6, 2))
s = 1
s_axes = pd.DataFrame({'medians': medians,
                       'Validation split': splits.val_y.notna().sum(),
                       'Training split': splits.train_X.notna().sum()}
                      ).plot.box(by='medians',
                                 boxprops=dict(linewidth=s),
                                 flierprops=dict(markersize=s),
                                 ax=ax)
for ax in s_axes:
    _ = ax.set_xticklabels(ax.get_xticklabels(),
                           rotation=45,
                           horizontalalignment='right')
    ax.set_xlabel(f'{args.feat_name_display.capitalize()} binned by their median intensity '
                  f'(N {args.feat_name_display})')
    _ = ax.set_ylabel('Frequency')
fname = args.out_figures / f'0_{group}_intensity_median_vs_prop_missing_boxplot_val_train'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), fname)
/tmp/ipykernel_3011/657883200.py:16: UserWarning:

To output multiple subplots, the figure containing the passed axes is being cleared.
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/0_3_intensity_median_vs_prop_missing_boxplot_val_train
_images/97961bf21eb8d2a17d9680b2be2a20ebc2241900839456e985f81326f0719bb1.png

Save parameters#

Hide code cell source

fname = args.folder_experiment / 'data_config.yaml'
args.dump(fname)
args
root - INFO     Dumped config to: runs/alzheimer_study/data_config.yaml
{'FN_INTENSITIES': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/proteome.csv',
 'M': 1421,
 'N': 210,
 'column_names': ['protein groups'],
 'data': PosixPath('runs/alzheimer_study/data'),
 'feat_name_display': 'protein groups',
 'feat_prevalence': 52,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'folder_data': '',
 'folder_experiment': PosixPath('runs/alzheimer_study'),
 'frac_mnar': 0.25,
 'frac_non_train': 0.1,
 'index_col': [0],
 'logarithm': 'log2',
 'meta_cat_col': '_collection site',
 'meta_date_col': 'PlaceholderTime',
 'out_figures': PosixPath('runs/alzheimer_study/figures'),
 'out_folder': PosixPath('runs/alzheimer_study'),
 'out_metrics': PosixPath('runs/alzheimer_study'),
 'out_models': PosixPath('runs/alzheimer_study'),
 'out_preds': PosixPath('runs/alzheimer_study/preds'),
 'prop_sample_w_sim': 1.0,
 'random_state': 42,
 'sample_N': False,
 'sample_completeness': 710,
 'select_N': None,
 'use_every_nth_xtick': 1,
 'used_samples': ['Sample_000',
                  'Sample_001',
                  'Sample_002',
                  'Sample_003',
                  'Sample_004',
                  'Sample_005',
                  'Sample_006',
                  'Sample_007',
                  'Sample_008',
                  'Sample_009',
                  'Sample_010',
                  'Sample_011',
                  'Sample_012',
                  'Sample_013',
                  'Sample_014',
                  'Sample_015',
                  'Sample_016',
                  'Sample_017',
                  'Sample_018',
                  'Sample_019',
                  'Sample_020',
                  'Sample_021',
                  'Sample_022',
                  'Sample_023',
                  'Sample_024',
                  'Sample_025',
                  'Sample_026',
                  'Sample_027',
                  'Sample_028',
                  'Sample_029',
                  'Sample_030',
                  'Sample_031',
                  'Sample_032',
                  'Sample_033',
                  'Sample_034',
                  'Sample_035',
                  'Sample_036',
                  'Sample_037',
                  'Sample_038',
                  'Sample_039',
                  'Sample_040',
                  'Sample_041',
                  'Sample_042',
                  'Sample_043',
                  'Sample_044',
                  'Sample_045',
                  'Sample_046',
                  'Sample_047',
                  'Sample_048',
                  'Sample_049',
                  'Sample_050',
                  'Sample_051',
                  'Sample_052',
                  'Sample_053',
                  'Sample_054',
                  'Sample_055',
                  'Sample_056',
                  'Sample_057',
                  'Sample_058',
                  'Sample_059',
                  'Sample_060',
                  'Sample_061',
                  'Sample_062',
                  'Sample_063',
                  'Sample_064',
                  'Sample_065',
                  'Sample_066',
                  'Sample_067',
                  'Sample_068',
                  'Sample_069',
                  'Sample_070',
                  'Sample_071',
                  'Sample_072',
                  'Sample_073',
                  'Sample_074',
                  'Sample_075',
                  'Sample_076',
                  'Sample_077',
                  'Sample_078',
                  'Sample_079',
                  'Sample_080',
                  'Sample_081',
                  'Sample_082',
                  'Sample_083',
                  'Sample_084',
                  'Sample_085',
                  'Sample_086',
                  'Sample_087',
                  'Sample_088',
                  'Sample_089',
                  'Sample_090',
                  'Sample_091',
                  'Sample_092',
                  'Sample_093',
                  'Sample_094',
                  'Sample_095',
                  'Sample_096',
                  'Sample_097',
                  'Sample_098',
                  'Sample_099',
                  'Sample_100',
                  'Sample_101',
                  'Sample_102',
                  'Sample_103',
                  'Sample_104',
                  'Sample_105',
                  'Sample_106',
                  'Sample_107',
                  'Sample_108',
                  'Sample_109',
                  'Sample_110',
                  'Sample_111',
                  'Sample_112',
                  'Sample_113',
                  'Sample_114',
                  'Sample_115',
                  'Sample_116',
                  'Sample_117',
                  'Sample_118',
                  'Sample_119',
                  'Sample_120',
                  'Sample_121',
                  'Sample_122',
                  'Sample_123',
                  'Sample_124',
                  'Sample_125',
                  'Sample_126',
                  'Sample_127',
                  'Sample_128',
                  'Sample_129',
                  'Sample_130',
                  'Sample_131',
                  'Sample_132',
                  'Sample_133',
                  'Sample_134',
                  'Sample_135',
                  'Sample_136',
                  'Sample_137',
                  'Sample_138',
                  'Sample_139',
                  'Sample_140',
                  'Sample_141',
                  'Sample_142',
                  'Sample_143',
                  'Sample_144',
                  'Sample_145',
                  'Sample_146',
                  'Sample_147',
                  'Sample_148',
                  'Sample_149',
                  'Sample_150',
                  'Sample_151',
                  'Sample_152',
                  'Sample_153',
                  'Sample_154',
                  'Sample_155',
                  'Sample_156',
                  'Sample_157',
                  'Sample_158',
                  'Sample_159',
                  'Sample_160',
                  'Sample_161',
                  'Sample_162',
                  'Sample_163',
                  'Sample_164',
                  'Sample_165',
                  'Sample_166',
                  'Sample_167',
                  'Sample_168',
                  'Sample_169',
                  'Sample_170',
                  'Sample_171',
                  'Sample_172',
                  'Sample_173',
                  'Sample_174',
                  'Sample_175',
                  'Sample_176',
                  'Sample_177',
                  'Sample_178',
                  'Sample_179',
                  'Sample_180',
                  'Sample_181',
                  'Sample_182',
                  'Sample_183',
                  'Sample_184',
                  'Sample_185',
                  'Sample_186',
                  'Sample_187',
                  'Sample_188',
                  'Sample_189',
                  'Sample_190',
                  'Sample_191',
                  'Sample_192',
                  'Sample_193',
                  'Sample_194',
                  'Sample_195',
                  'Sample_196',
                  'Sample_197',
                  'Sample_198',
                  'Sample_199',
                  'Sample_200',
                  'Sample_201',
                  'Sample_202',
                  'Sample_203',
                  'Sample_204',
                  'Sample_205',
                  'Sample_206',
                  'Sample_207',
                  'Sample_208',
                  'Sample_209']}

Saved Figures#

Hide code cell source

# saved figures
figures
{'0_1_hist_features_per_sample': PosixPath('runs/alzheimer_study/figures/0_1_hist_features_per_sample'),
 '0_1_feature_prevalence': PosixPath('runs/alzheimer_study/figures/0_1_feature_prevalence'),
 '0_1_intensity_distribution_overall': PosixPath('runs/alzheimer_study/figures/0_1_intensity_distribution_overall'),
 '0_1_intensity_median_vs_prop_missing_scatter': PosixPath('runs/alzheimer_study/figures/0_1_intensity_median_vs_prop_missing_scatter'),
 '0_1_intensity_median_vs_prop_missing_boxplot': PosixPath('runs/alzheimer_study/figures/0_1_intensity_median_vs_prop_missing_boxplot'),
 '0_1_pca_sample_by__collection_site': PosixPath('runs/alzheimer_study/figures/0_1_pca_sample_by__collection_site'),
 '0_1_pca_sample_by_identified_protein_groups': PosixPath('runs/alzheimer_study/figures/0_1_pca_sample_by_identified_protein_groups.pdf'),
 '0_1_pca_sample_by_identified_protein_groups_plotly': PosixPath('runs/alzheimer_study/figures/0_1_pca_sample_by_identified_protein_groups_plotly.pdf'),
 '0_1_median_boxplot': PosixPath('runs/alzheimer_study/figures/0_1_median_boxplot'),
 '0_2_mnar_mcar_histograms': PosixPath('runs/alzheimer_study/figures/0_2_mnar_mcar_histograms.pdf'),
 '0_3_val_over_train_split.pdf': PosixPath('runs/alzheimer_study/figures/0_3_val_over_train_split.pdf'),
 '0_3_splits_freq_stacked.pdf': PosixPath('runs/alzheimer_study/figures/0_3_splits_freq_stacked.pdf'),
 '0_3_val_test_split_freq_stacked_.pdf': PosixPath('runs/alzheimer_study/figures/0_3_val_test_split_freq_stacked_.pdf'),
 '0_3_intensity_median_vs_prop_missing_scatter_train': PosixPath('runs/alzheimer_study/figures/0_3_intensity_median_vs_prop_missing_scatter_train'),
 '0_3_intensity_median_vs_prop_missing_boxplot_train': PosixPath('runs/alzheimer_study/figures/0_3_intensity_median_vs_prop_missing_boxplot_train'),
 '0_3_intensity_median_vs_prop_missing_boxplot_val_train': PosixPath('runs/alzheimer_study/figures/0_3_intensity_median_vs_prop_missing_boxplot_val_train')}

Saved dumps

Hide code cell source

writer.close()
dumps
{'01_0_data_stats.xlsx': 'runs/alzheimer_study/01_0_data_stats.xlsx',
 'freq_features.json': PosixPath('runs/alzheimer_study/data/freq_features.json'),
 'freq_features.pkl': PosixPath('runs/alzheimer_study/data/freq_features.pkl'),
 'test_y.csv': 'runs/alzheimer_study/data/test_y.csv',
 'train_X.csv': 'runs/alzheimer_study/data/train_X.csv',
 'val_y.csv': 'runs/alzheimer_study/data/val_y.csv'}