Compare models#

  1. Load available configurations

  2. Load validation predictions

    • calculate absolute error

    • select top N for plotting by MAE from smallest (best) to largest (worst) (top N as specified, default 5)

    • correlation per sample, correlation per feat, correlation overall

    • MAE plots

  3. Load test data predictions

    • as for validation data

    • top N based on validation data

Hide code cell source

import logging
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from IPython.display import display

import pimmslearn
import pimmslearn.imputation
import pimmslearn.models
import pimmslearn.nb
from pimmslearn.analyzers import compare_predictions
from pimmslearn.io import datasplits
from pimmslearn.models.collect_dumps import collect, select_content

# Keep printed DataFrames compact in the notebook output.
pd.set_option('display.max_rows', 30)
pd.set_option('display.min_rows', 10)
pd.set_option('display.max_colwidth', 100)

# Small default figure size.
plt.rcParams['figure.figsize'] = (4, 2)
# presumably sets the font sizes of plot descriptors — confirm in pimmslearn.plotting
pimmslearn.plotting.make_large_descriptors(7)

# Notebook logger; only warnings and above are shown for 'fontTools'.
logger = pimmslearn.logging.setup_nb_logger()
logging.getLogger('fontTools').setLevel(logging.WARNING)


def load_config_file(fname: Path, first_split='config_') -> tuple:
    """Load a YAML configuration file and derive its key from the file name.

    Parameters
    ----------
    fname : Path
        Path of the YAML file to read.
    first_split : str
        Forwarded to ``select_content`` together with the file stem to
        extract the key.

    Returns
    -------
    tuple
        ``(key, loaded)``: the key as a string and the parsed YAML content.
    """
    with open(fname) as f:
        loaded = yaml.safe_load(f)
    key = f"{select_content(fname.stem, first_split=first_split)}"
    return key, loaded


def build_text(s):
    """Build a short annotation text from a model configuration.

    ``s`` is indexed by ``"latent_dim"`` and ``"hidden_layers"``. A latent
    dimension that is not NaN contributes ``'LD: <int> '``; a non-empty
    sequence of hidden layers contributes ``'HL: <a>,<b>,...'``. Entries
    without a length (e.g. NaN) are skipped.
    """
    pieces = []
    latent_dim = s["latent_dim"]
    if not np.isnan(latent_dim):
        pieces.append(f'LD: {int(latent_dim)} ')
    try:
        layers = s["hidden_layers"]
        if len(layers):
            pieces.append("HL: " + ",".join(map(str, layers)))
    except TypeError:
        # NaN (a float) has no len(): no hidden layers configured
        pass
    return ''.join(pieces)

Hide code cell source

# catch passed parameters
# `args` is bound first so that the name 'args' itself is part of the snapshot
# of global names taken on the next line.
# NOTE(review): presumably the snapshot is used later to tell apart the names
# defined after this cell (the parameters) — confirm in pimmslearn.nb.get_params
args = None
args = dict(globals()).keys()

Papermill script parameters:

# files and folders
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
folder_data: str = ''  # specify data directory if needed
file_format: str = 'csv'  # change default to pickled files
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
models: str = 'Median,CF,DAE,VAE'  # picked models to compare (comma separated)
sel_models: str = ''  # user defined comparison (comma separated)
# Restrict plotting to top N methods for imputation based on error of validation data, maximum 10
plot_to_n: int = 5
feat_name_display: str = None  # display name for feature name in plural (e.g. 'protein groups')
save_agg_pred: bool = False  # save aggregated predictions of validation and test data
# Parameters
# Values set for this run; they re-assign (override) the defaults defined above.
fn_rawfile_metadata = "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv"
folder_experiment = "runs/alzheimer_study"
models = "Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO"

Some argument transformations

Hide code cell source

# Collect the parameters into `args`, handing over the earlier snapshot of
# global names; the log reports each parameter as removed from the global
# namespace, so they are only accessed through `args` afterwards.
args = pimmslearn.nb.get_params(args, globals=globals())
args
root - INFO     Removed from global namespace: folder_experiment
root - INFO     Removed from global namespace: folder_data
root - INFO     Removed from global namespace: file_format
root - INFO     Removed from global namespace: fn_rawfile_metadata
root - INFO     Removed from global namespace: models
root - INFO     Removed from global namespace: sel_models
root - INFO     Removed from global namespace: plot_to_n
root - INFO     Removed from global namespace: feat_name_display
root - INFO     Removed from global namespace: save_agg_pred
{'folder_experiment': 'runs/alzheimer_study',
 'folder_data': '',
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'sel_models': '',
 'plot_to_n': 5,
 'feat_name_display': None,
 'save_agg_pred': False}

Hide code cell source

# Turn the parameter dict into the `args` object that is used with attribute
# access in the rest of the notebook (e.g. `args.models`, `args.out_figures`).
args = pimmslearn.nb.args_from_dict(args)
args
{'data': Path('runs/alzheimer_study/data'),
 'feat_name_display': None,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'folder_data': '',
 'folder_experiment': Path('runs/alzheimer_study'),
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'out_figures': Path('runs/alzheimer_study/figures'),
 'out_folder': Path('runs/alzheimer_study'),
 'out_metrics': Path('runs/alzheimer_study'),
 'out_models': Path('runs/alzheimer_study'),
 'out_preds': Path('runs/alzheimer_study/preds'),
 'plot_to_n': 5,
 'save_agg_pred': False,
 'sel_models': ''}

Hide code cell source

figures = {}  # file stem -> path of each saved figure
dumps = {}  # file stem -> path of each saved table / data dump

Hide code cell source

# Constants shared by the cells below.
TARGET_COL = 'observed'  # column of the prediction table holding the observed values
METRIC = 'MAE'
MIN_FREQ = None
FEAT_NAME_DISPLAY = args.feat_name_display

# Models as passed (comma separated string) and a working copy of that list.
MODELS_PASSED = args.models.split(',')
MODELS = list(MODELS_PASSED)

# Optional user defined selection; stays None when nothing was passed.
SEL_MODELS = args.sel_models.split(',') if args.sel_models else None

Hide code cell source

# list(sns.color_palette().as_hex()) # string representation of colors
# Cap the number of models to plot at 10 and warn when the passed value was larger.
if args.plot_to_n > 10:
    logger.warning("Set maximum of models to 10 (maximum)")
    args.overwrite_entry('plot_to_n', 10)

Hide code cell source

# Load the data splits from the experiment's data folder in the configured
# file format (the log lists train_X, val_y and test_y).
data = datasplits.DataSplits.from_folder(
    args.data, file_format=args.file_format)
pimmslearn.io.datasplits - INFO     Loaded 'train_X' from file: runs/alzheimer_study/data/train_X.csv
pimmslearn.io.datasplits - INFO     Loaded 'val_y' from file: runs/alzheimer_study/data/val_y.csv
pimmslearn.io.datasplits - INFO     Loaded 'test_y' from file: runs/alzheimer_study/data/test_y.csv

Hide code cell source

group = 1  # number used as prefix in the figure file names of this section

# One panel per split with simulated missing values, sharing both axes.
fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)
_panels = (
    (axes[0], data.val_y, 'Validation split'),
    (axes[1], data.test_y, 'Test split'),
)
for _ax, _split, _title in _panels:
    pimmslearn.plotting.data.plot_observations(
        _split.unstack(), ax=_ax, title=_title, size=1, xlabel='')
fig.suptitle("Simulated missing values per sample", size=8)

# Invisible axes spanning the whole figure: only used to carry the common x label.
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', which='both',
                top=False, bottom=False, left=False, right=False)
plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')

# Register the figure and save it.
fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png
_images/bcc830576e6800fb332656d5089810696f99617ab1fb5ce7e49196ead302c780.png

data completeness across entire data#

Hide code cell source

# load frequency of training features...
# NOTE(review): the previous remark "needs to be pickle -> index.name needed"
# looks stale: the file read here is JSON, and the index name is assigned
# explicitly in a later cell.
freq_feat = pimmslearn.io.datasplits.load_freq(args.data, file='freq_features.json')
freq_feat.head()  # training data
A0A024QZX5;A0A087X1N8;P35237                                                     197
A0A024R0T9;K7ER74;P02655                                                         208
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8   185
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                          208
A0A075B6H7                                                                        97
Name: freq, dtype: int64

Hide code cell source

# Feature frequency divided by the number of entries in the first index level
# of the training data (the samples), plotted in ascending order.
_n_first_level = len(data.train_X.index.levels[0])
prop = freq_feat / _n_first_level
(prop
 .sort_values()
 .to_frame()
 .plot(xlabel=f'{data.val_y.index.names[-1]}',
       ylabel='Proportion of identification in samples'))
<Axes: xlabel='protein groups', ylabel='Proportion of identification in samples'>
_images/f5bcc76ed99dbb1c984c6c2b88c5d09907ee5ee566249f664a6d2e18581e1d75.png

View training data in wide format

Hide code cell source

# Switch the splits to wide format: the training data displayed afterwards has
# one row per sample and one column per feature.
data.to_wide_format()
data.train_X
protein groups A0A024QZX5;A0A087X1N8;P35237 A0A024R0T9;K7ER74;P02655 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 A0A075B6H7 A0A075B6H9 A0A075B6I0 A0A075B6I1 A0A075B6I6 A0A075B6I9 ... Q9Y653;Q9Y653-2;Q9Y653-3 Q9Y696 Q9Y6C2 Q9Y6N6 Q9Y6N7;Q9Y6N7-2;Q9Y6N7-4 Q9Y6R7 Q9Y6X5 Q9Y6Y8;Q9Y6Y8-2 Q9Y6Y9 S4R3U6
Sample ID
Sample_000 15.912 16.852 15.570 16.481 17.301 20.246 16.764 17.584 16.988 20.054 ... 16.012 15.178 NaN 15.050 16.842 NaN NaN 19.563 NaN 12.805
Sample_001 NaN 16.874 15.519 16.387 NaN 19.941 18.786 17.144 NaN 19.067 ... 15.528 15.576 NaN 14.833 16.597 20.299 15.556 19.386 13.970 12.442
Sample_002 16.111 NaN 15.935 16.416 18.175 19.251 16.832 15.671 17.012 18.569 ... 15.229 14.728 13.757 15.118 17.440 19.598 15.735 20.447 12.636 12.505
Sample_003 16.107 17.032 15.802 16.979 15.963 19.628 17.852 18.877 14.182 18.985 ... 15.495 14.590 14.682 15.140 17.356 19.429 NaN 20.216 NaN 12.445
Sample_004 15.603 15.331 15.375 16.679 NaN 20.450 18.682 17.081 14.140 19.686 ... 14.757 NaN NaN 15.256 17.075 19.582 15.328 NaN 13.145 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 15.682 16.886 14.910 16.482 NaN 17.705 17.039 NaN 16.413 19.102 ... NaN 15.684 14.236 15.415 17.551 17.922 16.340 19.928 12.929 NaN
Sample_206 15.798 17.554 15.600 15.938 NaN 18.154 18.152 16.503 16.860 18.538 ... 15.422 16.106 NaN 15.345 17.084 18.708 NaN 19.433 NaN NaN
Sample_207 15.739 NaN 15.469 16.898 NaN 18.636 17.950 16.321 16.401 18.849 ... 15.808 16.098 14.403 15.715 NaN 18.725 16.138 19.599 13.637 11.174
Sample_208 15.477 16.779 14.995 16.132 NaN 14.908 NaN NaN 16.119 18.368 ... 15.157 16.712 NaN 14.640 16.533 19.411 15.807 19.545 NaN NaN
Sample_209 NaN 17.261 15.175 16.235 NaN 17.893 17.744 16.371 15.780 18.806 ... 15.237 15.652 15.211 14.205 16.749 19.275 15.732 19.577 11.042 11.791

210 rows × 1421 columns

Number of samples and features:

Hide code cell source

# rows of the wide training data are samples, columns are features
N_SAMPLES, M_FEAT = data.train_X.shape
print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}")
N samples: 210, M features: 1421

Collect outputs in excel file:

Hide code cell source

fname = args.folder_experiment / '01_2_performance_summary.xlsx'
dumps[fname.stem] = fname
# Shared workbook: later cells add sheets through `writer`
# (e.g. 'model_params', 'mae_stats_ordered_val').
# NOTE(review): no close/save of `writer` is visible in this part of the
# notebook — confirm it is closed further down so the file gets written.
writer = pd.ExcelWriter(fname)
print(f"Saving to: {fname}")
Saving to: runs/alzheimer_study/01_2_performance_summary.xlsx

Model specifications#

  • used for bar plot annotations

Hide code cell source

# Gather the dumped model configurations: YAML files in the model folder whose
# name contains 'model_config', loaded with `load_config_file` and put into one
# table indexed by 'id'.
# Open questions kept from earlier notes:
# - model_key could be used as key from the config file
# - load only the specified configs?
# - how to handle the case that no config file is available?
_config_files = (
    path for path in args.out_models.iterdir()
    if path.suffix == '.yaml' and 'model_config' in path.name
)
all_configs = collect(paths=_config_files, load_fn=load_config_file)
model_configs = pd.DataFrame(all_configs).set_index('id')
# transposed: one column per model, one row per configuration entry
model_configs.T.to_excel(writer, sheet_name='model_params')
model_configs.T
id VAE KNN CF DAE KNN5 Median
M 1421 1421 1421 1421 1421 1421
batch_size 64.000 64.000 1,024.000 64.000 64.000 NaN
cuda False True False False True NaN
data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data
epoch_trained 128.000 NaN 19.000 156.000 NaN NaN
epochs_max 300.000 50.000 100.000 300.000 50.000 NaN
file_format csv csv csv csv csv csv
fn_rawfile_metadata https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv
folder_data NaN
folder_experiment runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
hidden_layers [64] NaN NaN [64] NaN NaN
latent_dim 10.000 NaN 50.000 10.000 NaN NaN
meta_cat_col NaN NaN NaN NaN NaN NaN
meta_date_col NaN NaN NaN NaN NaN NaN
model VAE KNN CF DAE KNN Median
model_key VAE KNN CF DAE KNN5 Median
n_params 277998 1 83283 184983 1 1421
out_figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures
out_folder runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_metrics runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_models runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds
patience 50.000 NaN 1.000 25.000 NaN NaN
sample_idx_position 0 0 0 0 0 0
save_pred_real_na True True True True True True
force_train NaN True NaN NaN True NaN
neighbors NaN 3.000 NaN NaN 5.000 NaN
pred_test_Median NaN NaN NaN NaN NaN runs/alzheimer_study/preds/pred_test_Median.csv
pred_val_Median NaN NaN NaN NaN NaN runs/alzheimer_study/preds/pred_val_Median.csv

Set Feature name (columns are features, rows are samples)

Hide code cell source

# name the index of the feature frequencies like the column axis (features)
# of the wide training data
freq_feat.index.name = data.train_X.columns.name
# name of the row index (samples); used later to group predictions per sample
sample_index_name = data.train_X.index.name

Load predictions on validation and test data split#

Validation data#

  • set top N models to plot based on validation data split

Hide code cell source

# Load the validation-split predictions of all passed models into one table;
# the target column is requested as a column shared between the models.
pred_val = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='val',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL])
# the two index levels are the sample id and the feature name
SAMPLE_ID, FEAT_NAME = pred_val.index.names
# fall back to the feature index name when no display name was passed
if not FEAT_NAME_DISPLAY:
    FEAT_NAME_DISPLAY = FEAT_NAME
pred_val[MODELS]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 15.752 15.468 15.750 15.598 15.427 15.449 15.469 16.800 NaN 58.276 ... 7.068 12.648 2,513.638 12.219 14.614 15.766 15.752 17.206 15.700 0
Sample_050 Q9Y287 17.221 16.491 17.023 16.879 17.776 17.314 16.453 17.288 NaN 16.993 ... 7.068 12.464 19.829 12.573 15.944 16.809 17.221 17.807 16.738 0
Sample_107 Q8N475;Q8N475-2 14.846 14.171 13.672 14.352 14.150 14.355 13.110 17.187 NaN -78.084 ... 7.068 11.703 2,582.130 12.107 12.374 14.609 14.846 17.434 13.776 0
Sample_199 P06307 18.973 19.574 18.925 19.046 19.247 19.385 19.639 16.711 NaN 102.283 ... 7.068 13.012 2,483.120 13.407 16.587 19.265 18.973 17.111 19.015 0
Sample_067 Q5VUB5 14.726 15.250 15.290 15.046 15.232 15.040 15.465 16.743 NaN -36.470 ... 7.068 12.321 2,569.564 11.629 13.158 14.951 14.726 17.031 14.699 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.918 23.101 22.936 22.925 22.884 22.899 22.994 17.042 NaN 104.484 ... 7.068 13.197 2,634.108 13.324 22.112 22.892 22.918 17.330 22.872 0
Sample_002 A0A0A0MT36 15.877 15.633 15.529 16.001 16.857 16.142 15.882 16.792 NaN -18.408 ... 7.068 11.981 2,448.503 12.041 13.184 15.891 15.877 16.879 15.671 0
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 16.278 15.475 15.956 15.407 15.840 15.574 15.406 17.032 NaN -27.128 ... 7.068 11.557 2,487.550 13.051 14.732 15.637 16.278 17.215 15.574 0
Sample_182 Q8NFT8 13.995 13.904 14.157 13.615 13.685 13.480 14.322 16.764 NaN -12.434 ... 7.068 12.446 2,426.191 11.649 11.505 13.935 13.995 17.125 14.518 0
Sample_123 Q16853;Q16853-2 14.849 14.329 14.553 14.488 14.612 14.627 14.582 16.686 NaN 78.799 ... 7.068 12.503 2,461.806 12.430 13.516 14.640 14.849 16.981 14.485 0

12600 rows × 24 columns

Compute prediction errors (prediction minus observed value); absolute errors are derived from these below

Hide code cell source

# Signed error per model: prediction minus the observed value, aligned on the
# row index. Absolute values are taken later where needed (`.abs()`).
errors_val = (pred_val
              .drop(TARGET_COL, axis=1)
              .sub(pred_val[TARGET_COL], axis=0)
              [MODELS])
errors_val  # over all samples and all features
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 1.122 0.838 1.119 0.968 0.797 0.819 0.839 2.169 NaN 43.645 ... -7.562 -1.982 2,499.008 -2.411 -0.016 1.135 1.122 2.575 1.070 -14.630
Sample_050 Q9Y287 1.466 0.736 1.268 1.124 2.021 1.559 0.698 1.533 NaN 1.238 ... -8.687 -3.291 4.074 -3.182 0.189 1.054 1.466 2.052 0.983 -15.755
Sample_107 Q8N475;Q8N475-2 -0.183 -0.858 -1.357 -0.677 -0.880 -0.674 -1.919 2.157 NaN -93.113 ... -7.961 -3.326 2,567.100 -2.922 -2.655 -0.420 -0.183 2.405 -1.253 -15.029
Sample_199 P06307 -0.403 0.198 -0.451 -0.330 -0.129 0.009 0.263 -2.665 NaN 82.907 ... -12.308 -6.363 2,463.744 -5.969 -2.789 -0.111 -0.403 -2.265 -0.360 -19.376
Sample_067 Q5VUB5 -0.583 -0.058 -0.019 -0.263 -0.077 -0.269 0.156 1.434 NaN -51.779 ... -8.241 -2.988 2,554.255 -3.680 -2.151 -0.358 -0.583 1.723 -0.610 -15.309
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 0.096 0.279 0.113 0.103 0.062 0.077 0.171 -5.781 NaN 81.662 ... -15.754 -9.625 2,611.285 -9.498 -0.711 0.070 0.096 -5.493 0.050 -22.822
Sample_002 A0A0A0MT36 -2.288 -2.532 -2.636 -2.164 -1.308 -2.023 -2.283 -1.373 NaN -36.573 ... -11.097 -6.184 2,430.338 -6.124 -4.981 -2.274 -2.288 -1.286 -2.494 -18.165
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 0.753 -0.050 0.430 -0.118 0.314 0.049 -0.120 1.507 NaN -42.653 ... -8.457 -3.968 2,472.025 -2.474 -0.793 0.112 0.753 1.690 0.049 -15.525
Sample_182 Q8NFT8 -0.383 -0.475 -0.222 -0.764 -0.694 -0.899 -0.057 2.385 NaN -26.813 ... -7.311 -1.933 2,411.812 -2.730 -2.873 -0.444 -0.383 2.746 0.139 -14.379
Sample_123 Q16853;Q16853-2 0.345 -0.175 0.049 -0.016 0.108 0.123 0.077 2.181 NaN 64.295 ... -7.436 -2.001 2,447.302 -2.074 -0.989 0.135 0.345 2.477 -0.019 -14.504

12600 rows × 24 columns

Select top N for plotting and set colors#

Hide code cell source

# Rank the models by their mean absolute error on the validation data,
# smallest (best) first. A model without any valid error has a NaN mean and
# is sorted to the end (IMPSEQ in the displayed list).
_mae_per_model = errors_val.abs().mean()
ORDER_MODELS = _mae_per_model.sort_values().index.to_list()
ORDER_MODELS
['BPCA',
 'VAE',
 'DAE',
 'CF',
 'TRKNN',
 'RF',
 'KNN5',
 'KNN',
 'KNN_IMPUTE',
 'IRM',
 'ROWMEDIAN',
 'Median',
 'LLS',
 'QRILC',
 'COLMEDIAN',
 'SVDMETHOD',
 'PI',
 'MINDET',
 'MINPROB',
 'MINIMUM',
 'ZERO',
 'IMPSEQROB',
 'MLE',
 'IMPSEQ']

Hide code cell source

# Reorder the columns: observed values first, then the models from best to worst.
pred_val = pred_val[[TARGET_COL] + ORDER_MODELS]
# Optionally dump the aggregated validation predictions and register the file.
if args.save_agg_pred:
    fname = args.folder_experiment / '01_2_agg_pred_val.csv'
    dumps[fname.stem] = fname
    pred_val.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_val
observed BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 14.630 15.469 15.598 15.750 15.468 15.700 15.766 15.449 15.427 15.937 ... 16.800 17.206 12.219 11.916 12.648 7.068 0 58.276 2,513.638 NaN
Sample_050 Q9Y287 15.755 16.453 16.879 17.023 16.491 16.738 16.809 17.314 17.776 16.961 ... 17.288 17.807 12.573 12.900 12.464 7.068 0 16.993 19.829 NaN
Sample_107 Q8N475;Q8N475-2 15.029 13.110 14.352 13.672 14.171 13.776 14.609 14.355 14.150 15.437 ... 17.187 17.434 12.107 12.313 11.703 7.068 0 -78.084 2,582.130 NaN
Sample_199 P06307 19.376 19.639 19.046 18.925 19.574 19.015 19.265 19.385 19.247 18.861 ... 16.711 17.111 13.407 12.285 13.012 7.068 0 102.283 2,483.120 NaN
Sample_067 Q5VUB5 15.309 15.465 15.046 15.290 15.250 14.699 14.951 15.040 15.232 15.079 ... 16.743 17.031 11.629 11.827 12.321 7.068 0 -36.470 2,569.564 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.822 22.994 22.925 22.936 23.101 22.872 22.892 22.899 22.884 22.837 ... 17.042 17.330 13.324 12.161 13.197 7.068 0 104.484 2,634.108 NaN
Sample_002 A0A0A0MT36 18.165 15.882 16.001 15.529 15.633 15.671 15.891 16.142 16.857 15.446 ... 16.792 16.879 12.041 12.586 11.981 7.068 0 -18.408 2,448.503 NaN
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 15.525 15.406 15.407 15.956 15.475 15.574 15.637 15.574 15.840 15.995 ... 17.032 17.215 13.051 12.352 11.557 7.068 0 -27.128 2,487.550 NaN
Sample_182 Q8NFT8 14.379 14.322 13.615 14.157 13.904 14.518 13.935 13.480 13.685 14.675 ... 16.764 17.125 11.649 12.504 12.446 7.068 0 -12.434 2,426.191 NaN
Sample_123 Q16853;Q16853-2 14.504 14.582 14.488 14.553 14.329 14.485 14.640 14.627 14.612 14.824 ... 16.686 16.981 12.430 12.689 12.503 7.068 0 78.799 2,461.806 NaN

12600 rows × 25 columns

Hide code cell source

# Descriptive statistics of the absolute errors per model (columns in ranking
# order), written to the shared workbook and displayed with one row per model.
mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS]
mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f')
mae_stats_ordered_val.T
count mean std min 25% 50% 75% max
BPCA 12,600.000 0.422 0.501 0.000 0.119 0.269 0.534 9.370
VAE 12,600.000 0.432 0.519 0.000 0.121 0.278 0.547 8.994
DAE 12,600.000 0.432 0.521 0.000 0.122 0.272 0.545 9.133
CF 12,600.000 0.449 0.496 0.000 0.141 0.304 0.579 7.485
TRKNN 12,600.000 0.450 0.516 0.000 0.132 0.295 0.569 7.975
RF 12,600.000 0.460 0.529 0.000 0.134 0.300 0.587 9.880
KNN5 12,600.000 0.467 0.546 0.000 0.135 0.305 0.594 10.231
KNN 12,600.000 0.481 0.565 0.000 0.138 0.310 0.618 10.502
KNN_IMPUTE 12,600.000 0.554 0.668 0.000 0.164 0.359 0.692 7.550
IRM 12,600.000 0.588 0.637 0.000 0.176 0.396 0.767 7.953
ROWMEDIAN 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
Median 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
LLS 12,600.000 1.329 54.974 0.000 0.151 0.343 0.662 4,842.571
QRILC 12,600.000 1.649 1.289 0.001 0.833 1.349 2.079 13.928
COLMEDIAN 12,600.000 2.210 1.634 0.000 0.947 1.972 3.094 12.944
SVDMETHOD 12,600.000 2.309 1.635 0.000 1.027 2.091 3.251 12.624
PI 12,600.000 3.817 2.639 0.000 1.774 3.360 5.359 18.101
MINDET 12,600.000 4.108 2.650 0.001 2.089 3.678 5.665 17.920
MINPROB 12,600.000 4.113 2.686 0.000 2.084 3.694 5.708 17.909
MINIMUM 12,600.000 9.272 2.717 0.373 7.327 8.890 10.863 22.773
ZERO 12,600.000 16.340 2.717 6.695 14.395 15.958 17.931 29.841
IMPSEQROB 12,600.000 333.478 793.700 0.002 12.282 33.864 87.298 2,869.299
MLE 12,600.000 2,172.384 865.925 0.009 2,435.415 2,495.362 2,552.718 2,873.681
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Some models have fixed colors; the others are assigned randomly.

Note

  1. The order of “new” models is important for the color assignment.

  2. User-defined model keys for the same model with two different configurations will yield different colors.

Hide code cell source

# Colors are requested for the upper-cased model keys, in ranking order.
_color_keys = [key.upper() for key in ORDER_MODELS]
COLORS_TO_USE = pimmslearn.plotting.defaults.assign_colors(_color_keys)
pimmslearn.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE)
pimmslearn.plotting.defaults - INFO     Reused some colors!
BPCAVAEDAECFTRKNNRFKNN5KNNKNN_IMPUTEIRMROWMEDIANMedianLLSQRILCCOLMEDIANSVDMETHODPIMINDETMINPROBMINIMUMZEROIMPSEQROBMLEIMPSEQ

Hide code cell source

# Keep the N best models and pair each with its color; the colors were
# requested in the order of ORDER_MODELS, so the leading entries line up.
TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n]
TOP_N_COLOR_PALETTE = dict(zip(TOP_N_ORDER, COLORS_TO_USE))
TOP_N_ORDER
['BPCA', 'VAE', 'DAE', 'CF', 'TRKNN']

Correlation per sample#

Hide code cell source

# Correlation between the observed values and each model's predictions,
# computed separately for every sample (one row per sample, one column per model).
corr_per_sample_val = (pred_val
                       .groupby(sample_index_name)
                       .apply(
                           lambda df: df.corr().loc[TARGET_COL]
                       )[ORDER_MODELS])

# Lower y-limit: smallest correlation rounded DOWN to one decimal.
# `int()` truncates towards zero, so a negative minimum (e.g. -0.33 -> -0.3)
# would put the limit above the data; `np.floor` rounds down for both signs.
min_corr = np.floor(corr_per_sample_val.min().min() * 10) / 10
kwargs = dict(ylim=(min_corr, 1), rot=90,
              #     boxprops=dict(linewidth=1.5),
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model pred. per sample on validation data',
              ylabel='correlation per sample')
# Box plot for the top-N models only.
ax = corr_per_sample_val[TOP_N_ORDER].plot.box(**kwargs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

# Dump the correlations next to the figure: summary statistics, all models,
# and the subset that was plotted.
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.xlsx'
dumps[fname.stem] = fname
with pd.ExcelWriter(fname) as w:
    corr_per_sample_val.describe().to_excel(w, sheet_name='summary')
    corr_per_sample_val.to_excel(w, sheet_name='correlations')
    corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf
_images/31ae7ed8debb633b223456ab1954e9dfb689a1fc30615044db119d872196e2aa.png

Identify samples whose correlation is below the smallest lower whisker for at least one of the top N models

Hide code cell source

# Threshold: the minimum over the top-N models of the value returned by
# `get_lower_whiskers` (presumably the lower whisker of each model's box plot).
# NOTE(review): `treshold` is a misspelling of "threshold"; left unchanged
# because other cells may refer to this name.
treshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_val[TOP_N_ORDER]).min()
# samples for which at least one top-N model falls below the threshold
mask = (corr_per_sample_val[TOP_N_ORDER] < treshold).any(axis=1)
# show those samples for all models with the row-wise minimum highlighted
corr_per_sample_val.loc[mask].style.highlight_min(
    axis=1) if mask.sum() else 'Nothing to display'
observed BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID                                                
Sample_010 0.948707 0.928795 0.939680 0.939311 0.944645 0.928248 0.940594 0.946115 0.932235 0.932518 0.869102 0.869102 0.913668 0.782729 nan 0.065802 0.191534 nan -0.174631 nan nan 0.589151 nan nan
Sample_018 0.968582 0.936431 0.920387 0.961592 0.953573 0.923923 0.925531 0.938822 0.939383 0.952858 0.908717 0.908717 0.936909 0.932376 nan 0.161907 0.091454 nan 0.006277 nan nan 0.375658 nan nan
Sample_054 0.932254 0.930614 0.916210 0.926968 0.910271 0.936441 0.936915 0.925876 0.905679 0.913765 0.915748 0.915748 0.929264 0.789307 nan 0.190649 -0.142084 nan -0.117917 nan nan 0.836204 nan nan
Sample_071 0.887866 0.909605 0.907382 0.908911 0.888162 0.906414 0.901240 0.895286 0.880453 0.865003 0.885806 0.885806 0.899799 0.849572 nan 0.178303 0.031775 nan -0.133502 nan nan 0.364226 nan nan
Sample_073 0.930349 0.922763 0.921461 0.916778 0.919876 0.905874 0.933555 0.950641 0.916774 0.901773 0.900178 0.900178 0.909057 0.912581 nan -0.017963 0.010632 nan 0.079804 nan nan 0.356937 nan nan
Sample_095 0.940942 0.918491 0.925223 0.926031 0.927289 0.925051 0.924950 0.930902 0.909714 0.913905 0.878167 0.878167 0.917350 0.757668 nan -0.120269 -0.008036 nan -0.025743 nan nan 0.419195 nan nan
Sample_133 0.919483 0.924200 0.938710 0.918443 0.928251 0.926853 0.903483 0.903370 0.885348 0.878925 0.899233 0.899233 0.881238 0.872025 nan 0.219841 0.261224 nan 0.023684 nan nan 0.409126 nan nan
Sample_139 0.927681 0.928092 0.930150 0.938595 0.957367 0.934729 0.912868 0.901552 0.878475 0.891290 0.907333 0.907333 0.928867 0.858117 nan 0.156894 0.066544 nan 0.342560 nan nan 0.554137 nan nan
Sample_150 0.950334 0.910516 0.877762 0.951996 0.945063 0.927092 0.885565 0.868275 0.930981 0.907849 0.892997 0.892997 0.940619 0.828743 nan 0.166841 0.088846 nan -0.046880 nan nan 0.335988 nan nan
Sample_171 0.924707 0.917611 0.911777 0.933590 0.916959 0.895942 0.902581 0.906699 0.884571 0.881019 0.875433 0.875433 -0.090619 0.911127 nan -0.004823 -0.120075 nan 0.197905 nan nan 0.302006 nan nan
Sample_173 0.916627 0.954449 0.946506 0.939216 0.932711 0.962014 0.939783 0.940274 0.918589 0.916299 0.925428 0.925428 0.926916 0.967632 nan 0.059663 0.194551 nan 0.107264 nan nan 0.334436 nan nan
Sample_174 0.970316 0.886038 0.880650 0.961332 0.967356 0.861425 0.854645 0.846532 0.920737 0.920759 0.887409 0.887409 0.972096 0.788844 nan 0.306279 -0.124010 nan -0.050976 nan nan 0.357612 nan nan
Sample_198 0.914339 0.946290 0.958050 0.944786 0.932612 0.945375 0.955742 0.947627 0.936142 0.946119 0.956493 0.956493 0.924497 0.953493 nan 0.097862 -0.194275 nan -0.328568 nan nan 0.481999 nan nan

Error plot#

Hide code cell source

# Rows where at least one model has an absolute error above `c_error_min`,
# shown sorted by the second index level (the feature).
c_error_min = 4.5
mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1)
errors_val.loc[mask].sort_index(level=1).head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_012 A0A024QZX5;A0A087X1N8;P35237 -0.324 0.151 -0.198 -0.360 -0.246 -0.416 -0.140 0.856 NaN -53.608 ... -8.881 -4.229 0.632 -4.534 -1.020 -0.407 -0.324 1.241 -0.318 -15.949
Sample_017 A0A024QZX5;A0A087X1N8;P35237 0.347 0.292 0.401 0.220 -0.093 -0.022 0.251 1.658 NaN 26.724 ... -8.211 -3.532 1.228 -1.795 -0.442 0.306 0.347 2.214 0.305 -15.279
Sample_050 A0A024QZX5;A0A087X1N8;P35237 0.544 0.199 0.191 0.087 0.024 -0.102 0.178 2.207 NaN 0.348 ... -8.013 -1.029 3.294 -2.022 -0.150 0.137 0.544 2.691 0.238 -15.081
Sample_102 A0A024QZX5;A0A087X1N8;P35237 -0.029 -0.176 -0.152 0.006 0.030 0.067 -0.107 0.942 NaN 19.277 ... -8.586 -3.965 1.609 -3.875 -0.723 -0.083 -0.029 1.168 -0.065 -15.654
Sample_109 A0A024QZX5;A0A087X1N8;P35237 0.343 -0.158 -0.245 0.050 -0.179 -0.004 -0.263 1.518 NaN -28.795 ... -8.215 -4.357 -2.077 -4.131 -0.973 -0.005 0.343 1.968 -0.012 -15.283

5 rows × 24 columns

Hide code cell source

# NOTE: `errors_val` is rebound here. From this cell on it holds the mean
# absolute error per feature (one row per feature) instead of the signed
# error per observation.
errors_val = errors_val.abs().groupby(
    freq_feat.index.name).mean()  # absolute error
# add the training frequency of each feature and order the rows by it
errors_val = errors_val.join(freq_feat)
errors_val = errors_val.sort_values(by=freq_feat.name, ascending=True)
errors_val.head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
Q9Y281;Q9Y281-3 0.415 0.312 0.299 0.322 0.325 0.285 0.266 4.078 NaN 10.519 ... 0.547 2,473.194 0.643 1.316 0.392 0.415 4.472 0.307 12.573 52
K7EPJ5;O60291;O60291-2;O60291-3;O60291-4 0.331 0.455 0.348 0.341 0.281 0.385 0.387 3.029 NaN 13.344 ... 1.377 2,525.930 1.337 1.475 0.301 0.331 3.452 0.257 13.857 52
B1AJQ6;Q86Y82 1.082 0.943 0.591 0.057 0.482 1.226 0.674 3.367 NaN 5.026 ... 1.623 2,441.128 1.108 4.360 0.996 1.082 3.926 0.900 13.380 52
P69892 0.872 1.547 1.615 1.265 1.734 1.286 1.360 1.980 NaN 18.023 ... 3.334 2,532.051 2.260 8.552 1.109 0.872 2.320 0.966 14.768 53
A2RU67 0.689 0.382 0.330 0.500 0.503 0.462 0.539 4.495 NaN 15.116 ... 0.828 1,998.072 0.774 1.736 0.525 0.689 4.870 0.462 12.437 53

5 rows × 25 columns

Hide code cell source

# summary over features of the per-feature mean absolute errors, one row per model
errors_val.describe()[ORDER_MODELS].T  # mean of means
count mean std min 25% 50% 75% max
BPCA 1,419.000 0.408 0.306 0.017 0.222 0.320 0.494 4.195
VAE 1,419.000 0.420 0.321 0.022 0.229 0.328 0.484 3.354
DAE 1,419.000 0.422 0.324 0.039 0.226 0.326 0.495 3.490
CF 1,419.000 0.437 0.295 0.067 0.253 0.363 0.516 3.348
TRKNN 1,419.000 0.437 0.309 0.000 0.241 0.349 0.526 3.647
RF 1,419.000 0.447 0.319 0.014 0.253 0.358 0.518 3.441
KNN5 1,419.000 0.455 0.322 0.039 0.256 0.369 0.540 3.634
KNN 1,419.000 0.468 0.333 0.012 0.267 0.375 0.549 3.693
KNN_IMPUTE 1,419.000 0.531 0.378 0.063 0.296 0.424 0.636 3.430
IRM 1,419.000 0.555 0.372 0.030 0.311 0.449 0.674 3.476
ROWMEDIAN 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
Median 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
LLS 1,419.000 1.088 19.029 0.023 0.279 0.408 0.596 706.018
QRILC 1,419.000 1.619 0.895 0.415 1.035 1.359 1.901 8.552
COLMEDIAN 1,419.000 2.071 1.509 0.038 0.916 1.738 2.812 12.631
SVDMETHOD 1,419.000 2.136 1.467 0.149 0.976 1.893 2.905 12.211
PI 1,419.000 4.119 2.435 0.618 2.260 3.651 5.452 16.889
MINDET 1,419.000 4.438 2.493 0.374 2.622 4.032 5.828 17.100
MINPROB 1,419.000 4.434 2.485 0.405 2.633 3.978 5.789 17.204
MINIMUM 1,419.000 9.620 2.542 3.842 7.854 9.231 11.051 22.371
ZERO 1,419.000 16.688 2.542 10.910 14.922 16.299 18.119 29.439
IMPSEQROB 1,419.000 443.657 892.834 0.830 23.477 43.842 100.325 2,633.136
MLE 1,419.000 2,171.007 331.079 1.453 1,992.846 2,214.845 2,487.619 2,683.431
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Hide code cell source

# Cut-off on the error used to flag rows of `errors_val`.
c_avg_error = 2
# True for rows where at least one of the TOP_N_ORDER columns reaches the cut-off.
mask = (errors_val[TOP_N_ORDER] >= c_avg_error).any(axis=1)
errors_val.loc[mask]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
O60512 2.233 2.164 2.072 1.929 2.293 2.209 1.989 5.769 NaN 8.471 ... 1.725 2,128.612 2.220 2.674 1.975 2.233 6.098 2.559 10.910 58
P18206;P18206-2 2.427 1.861 1.480 2.011 1.744 1.637 1.297 3.821 NaN 7.976 ... 1.151 2,518.839 1.577 2.103 1.983 2.427 4.085 1.581 12.898 97
Q99538 2.502 2.722 2.562 2.319 2.711 2.517 2.464 2.615 NaN 8.517 ... 2.570 1,769.534 2.386 2.831 2.333 2.502 2.767 2.399 14.984 107
P02100 2.192 1.562 1.997 2.688 2.283 2.509 1.033 1.996 NaN 14.829 ... 4.120 2,512.438 3.786 4.227 2.439 2.192 2.106 1.856 16.373 127
A0A0G2JRN3 3.053 3.348 3.490 3.354 3.693 3.634 4.195 3.998 NaN 71.992 ... 7.315 1,986.331 6.704 7.139 3.441 3.053 3.976 3.647 19.496 128
P01817 2.254 2.088 2.118 2.111 1.963 2.117 2.385 2.736 NaN 10.059 ... 2.395 2,369.250 2.556 2.760 2.039 2.254 3.104 2.039 14.053 133
Q15375;Q15375-4 4.171 1.479 1.404 1.361 1.608 1.331 1.981 3.754 NaN 16.223 ... 6.806 2,285.221 6.264 4.400 2.054 4.171 3.566 2.065 19.101 163
P68871 2.331 1.252 2.080 2.427 1.616 1.638 0.571 1.720 NaN 23.608 ... 4.042 2,237.073 4.079 4.090 2.260 2.331 2.014 0.854 16.378 168
P69905 2.793 1.830 3.024 2.994 2.936 2.820 1.032 2.807 NaN 94.049 ... 6.011 1,992.771 5.738 5.797 2.788 2.793 2.626 1.016 18.200 190
P35527 2.216 1.399 2.509 2.285 2.064 2.156 1.295 2.273 NaN 96.343 ... 5.007 2,335.097 4.374 4.485 1.928 2.216 2.403 1.169 17.045 195
P15509;P15509-2;P15509-3;P15509-5;P15509-7;P15509-8 2.252 1.716 1.888 1.805 1.218 1.374 1.336 3.397 NaN 48.350 ... 5.790 1,276.662 5.495 4.765 1.764 2.252 3.146 2.437 18.354 201

11 rows × 25 columns

Error by integer part of the feature median intensity#

  • number of observations in parentheses.

Hide code cell source

# Plot the validation errors of the TOP_N_ORDER models, binned using the
# feature medians of the training data, then register and save the figure.
fig, ax = plt.subplots(figsize=(8, 3))
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred_val[[TARGET_COL, *TOP_N_ORDER]],
    feat_medians=data.train_X.median(),
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    palette=TOP_N_COLOR_PALETTE,
    metric_name=METRIC,
)
ax.set_ylabel(f"Average error ({METRIC})")
# one legend column per plotted model
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
fname = args.out_figures / f'2_{group}_errors_binned_by_feat_median_val.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf
_images/b43ef07949efd2430cde08a281ab70960126ddc02250325a9195509489c29302.png

Hide code cell source

# ! only used for reporting
# Build the table behind the binned error plot from `errors_binned` and
# store it next to the figure as an Excel file (same stem, '.xlsx' suffix).
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC
)
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=94) BPCA 0.715 0.601 0.849
1 11\n(N=94) CF 0.644 0.521 0.793
2 11\n(N=94) DAE 0.690 0.578 0.809
3 11\n(N=94) TRKNN 0.655 0.545 0.784
4 11\n(N=94) VAE 0.632 0.505 0.783
... ... ... ... ... ...
85 29\n(N=5) BPCA 0.175 0.062 0.288
86 29\n(N=5) CF 0.156 0.083 0.257
87 29\n(N=5) DAE 0.137 0.061 0.221
88 29\n(N=5) TRKNN 0.193 0.128 0.257
89 29\n(N=5) VAE 0.114 0.038 0.214

90 rows × 5 columns

Hide code cell source

# Register the CSV dump of the binned errors under the figure's stem and
# write it next to the figure (same name, '.csv' suffix).
dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
# Only the last expression of a cell is displayed; the former leading
# `errors_binned.head()` had no effect and was removed.
errors_binned.head()
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_158 Q9UN70;Q9UN70-2 BPCA 0.839 15 2,398 15\n(N=2,398)
1 Sample_158 Q9UN70;Q9UN70-2 VAE 0.968 15 2,398 15\n(N=2,398)
2 Sample_158 Q9UN70;Q9UN70-2 DAE 1.119 15 2,398 15\n(N=2,398)
3 Sample_158 Q9UN70;Q9UN70-2 CF 0.838 15 2,398 15\n(N=2,398)
4 Sample_158 Q9UN70;Q9UN70-2 TRKNN 1.070 15 2,398 15\n(N=2,398)

Test data#

Hide code cell source

# Load the predictions on the test split for all passed models.
pred_test = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='test',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL],
)
# Keep the target followed by the models in ORDER_MODELS order and attach
# `freq_feat`, joined on the name of its index.
pred_test = (
    pred_test[[TARGET_COL, *ORDER_MODELS]]
    .join(freq_feat, on=freq_feat.index.name)
)
if args.save_agg_pred:
    # optional dump of the aggregated predictions into the experiment folder
    fname = args.folder_experiment / '01_2_agg_pred_test.csv'
    dumps[fname.stem] = fname
    pred_test.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_test
observed BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_000 A0A075B6P5;P01615 17.016 17.483 17.316 17.122 17.317 17.438 17.522 17.207 17.190 18.269 ... 17.496 13.575 12.970 12.434 7.068 0 229.376 2,505.226 NaN 210
A0A087X089;Q16627;Q16627-2 18.280 17.769 18.197 18.151 18.324 17.930 17.544 18.146 18.293 17.797 ... 17.695 12.412 12.970 12.686 7.068 0 -20.319 2,505.226 NaN 210
A0A0B4J2B5;S4R460 21.735 22.459 22.292 22.467 22.549 22.397 22.173 21.959 21.835 22.205 ... 17.493 12.356 12.970 13.380 7.068 0 -10.898 2,505.226 NaN 210
A0A140T971;O95865;Q5SRR8;Q5SSV3 14.603 15.285 15.275 15.161 15.510 15.399 15.215 15.143 15.172 15.557 ... 17.087 12.895 12.970 13.055 7.068 0 -2.819 2,505.226 NaN 145
A0A140TA33;A0A140TA41;A0A140TA52;P22105;P22105-3;P22105-4 16.143 16.583 16.844 16.612 16.357 16.775 16.668 16.743 16.625 16.646 ... 17.508 12.459 12.970 11.446 7.068 0 -42.837 2,505.226 NaN 210
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_209 Q96ID5 16.074 15.866 16.154 15.817 15.787 16.122 16.076 15.981 15.909 15.925 ... 17.133 15.119 12.435 12.352 7.068 0 20.373 17.260 NaN 194
Q9H492;Q9H492-2 13.173 13.249 13.393 13.690 13.643 13.273 13.392 13.432 13.669 13.594 ... 17.109 11.969 12.435 12.264 7.068 0 14.713 19.076 NaN 111
Q9HC57 14.207 13.756 13.975 13.930 14.126 14.589 14.453 14.131 13.962 14.391 ... 17.157 13.594 12.435 12.369 7.068 0 21.445 19.649 NaN 128
Q9NPH3;Q9NPH3-2;Q9NPH3-5 14.962 15.096 15.077 15.057 15.309 15.099 14.996 15.123 15.094 15.117 ... 17.257 13.304 12.435 12.693 7.068 0 35.578 16.125 NaN 199
Q9UGM5;Q9UGM5-2 16.871 16.395 16.471 16.444 16.690 16.429 16.619 16.378 16.255 17.054 ... 17.133 12.097 12.435 11.639 7.068 0 82.601 13.608 NaN 209

12600 rows × 26 columns

Write averages for all models to Excel (using the writer opened earlier)

Hide code cell source

# Absolute errors of the test predictions, computed from `pred_test`.
errors_test_mae = pimmslearn.pandas.calc_errors.get_absolute_error(
    pred_test
)
# Summary statistics per model, with columns ordered as in ORDER_MODELS.
mae_stats_ordered_test = errors_test_mae.describe()[ORDER_MODELS]
mae_stats_ordered_test
BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE IRM ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
count 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 ... 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 0.000
mean 0.432 0.437 0.436 0.458 0.458 0.466 0.469 0.482 0.558 0.587 ... 2.223 2.330 3.798 4.109 4.120 9.271 16.339 334.546 2,186.302 NaN
std 0.518 0.539 0.537 0.512 0.539 0.547 0.546 0.562 0.679 0.647 ... 1.662 1.653 2.682 2.667 2.708 2.741 2.741 793.494 853.899 NaN
min 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.001 0.003 0.001 0.141 7.209 0.001 0.001 NaN
25% 0.121 0.121 0.120 0.141 0.132 0.134 0.138 0.140 0.163 0.175 ... 0.961 1.044 1.738 2.132 2.128 7.344 14.412 12.192 2,436.455 NaN
50% 0.280 0.276 0.273 0.313 0.299 0.301 0.307 0.316 0.364 0.394 ... 1.954 2.098 3.342 3.635 3.659 8.867 15.935 34.192 2,496.971 NaN
75% 0.546 0.546 0.548 0.584 0.584 0.590 0.596 0.612 0.703 0.762 ... 3.119 3.286 5.318 5.610 5.651 10.842 17.910 91.928 2,555.017 NaN
max 7.635 9.299 8.408 7.055 9.111 8.083 8.577 8.171 9.005 7.829 ... 13.272 13.022 18.583 18.317 18.347 23.072 30.140 2,869.824 2,873.005 NaN

8 rows × 24 columns

Hide code cell source

# Add the test error statistics as their own sheet; `writer` is created in
# an earlier cell and closed further below.
mae_stats_ordered_test.to_excel(writer, sheet_name='mae_stats_ordered_test', float_format='%.5f')

Hide code cell source

# Side-by-side comparison of the mean error per model on the validation and
# test split, sorted by the validation error (best model first).
cp_mean_perf = pd.concat([
    mae_stats_ordered_val.loc['mean'],
    mae_stats_ordered_test.loc['mean'],
],
    axis=1,
    keys=['val', 'test']
).sort_values(by='val')
cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f')
cp_mean_perf
val test
BPCA 0.422 0.432
VAE 0.432 0.437
DAE 0.432 0.436
CF 0.449 0.458
TRKNN 0.450 0.458
RF 0.460 0.466
KNN5 0.467 0.469
KNN 0.481 0.482
KNN_IMPUTE 0.554 0.558
IRM 0.588 0.587
ROWMEDIAN 0.598 0.602
Median 0.598 0.602
LLS 1.329 0.874
QRILC 1.649 1.626
COLMEDIAN 2.210 2.223
SVDMETHOD 2.309 2.330
PI 3.817 3.798
MINDET 4.108 4.109
MINPROB 4.113 4.120
MINIMUM 9.272 9.271
ZERO 16.340 16.339
IMPSEQROB 333.478 334.546
MLE 2,172.384 2,186.302
IMPSEQ NaN NaN

Hide code cell source

# Close the writer now that all sheets have been added.
writer.close()

Intensity distribution as histogram#

Plot the predictions of the top 4 models for the intensities in the test data

Hide code cell source

# `min_max` of the observed target values is passed to every histogram call.
min_max = pimmslearn.plotting.data.min_max(pred_test[TARGET_COL])
top_n = 4
fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True)

# One panel per model: the first `top_n` entries of ORDER_MODELS, each drawn
# in its own color from COLORS_TO_USE.
for model, color, ax in zip(ORDER_MODELS[:top_n], COLORS_TO_USE[:top_n], axes):
    # observed values in grey as reference
    ax, bins = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[TARGET_COL],
        color='grey',
        min_max=min_max,
        ax=ax,
    )
    # model predictions drawn semi-transparently on the same axes
    ax, _ = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[model],
        color=color,
        min_max=min_max,
        ax=ax,
        alpha=0.5,
    )
    # vertical tick labels on the x-axis
    _ = plt.setp(ax.get_xticklabels(), rotation=90)
    ax.legend()

# the y-axis is shared, so only the first panel gets a label
axes[0].set_ylabel('Number of observations')

fname = args.out_figures / f'2_{group}_intensity_binned_top_{top_n}_models_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf
_images/3c1905d89b05727a76b9cc4cc18863af95dd7d3df5607cdc3d0e4588a6f18e32.png

Hide code cell source

# Count the values per bin for the target and the first `top_n` models;
# `bins` is the value returned by the last histogram call of the loop.
counts_per_bin = pimmslearn.pandas.get_counts_per_bin(
    df=pred_test,
    bins=bins,
    columns=[TARGET_COL, *ORDER_MODELS[:top_n]],
)

# stored next to the histogram figure (same stem, '.xlsx' suffix)
counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
observed BPCA VAE DAE CF
bin
(7, 8] 2 0 0 0 0
(8, 9] 7 0 0 1 0
(9, 10] 18 2 1 0 0
(10, 11] 69 29 16 30 17
(11, 12] 217 165 122 146 147
(12, 13] 634 531 513 537 569
(13, 14] 1,394 1,248 1,220 1,249 1,213
(14, 15] 2,042 2,033 2,115 2,094 2,078
(15, 16] 2,054 2,359 2,420 2,339 2,368
(16, 17] 1,787 1,867 1,845 1,862 1,846
(17, 18] 1,333 1,363 1,357 1,354 1,401
(18, 19] 965 956 918 941 928
(19, 20] 792 789 839 806 785
(20, 21] 536 528 509 509 530
(21, 22] 320 322 318 321 312
(22, 23] 182 176 180 184 179
(23, 24] 102 92 90 85 91
(24, 25] 45 38 38 43 34
(25, 26] 50 57 59 55 54
(26, 27] 25 20 17 20 25
(27, 28] 3 2 0 1 0
(28, 29] 8 11 12 12 12
(29, 30] 13 11 11 10 11

Correlation per sample#

Hide code cell source

# Per sample: correlate all columns of the test predictions and keep the
# row of the target column, i.e. the correlation of each model with the
# target. Columns are restricted to and ordered by ORDER_MODELS.
corr_per_sample_test = (pred_test
                        .groupby(sample_index_name)
                        .apply(lambda df: df.corr().loc[TARGET_COL])
                        [ORDER_MODELS])
# Attach the number of non-missing target values per sample as 'n_obs'.
corr_per_sample_test = corr_per_sample_test.join(
    pred_test
    .groupby(sample_index_name)[TARGET_COL]
    .count()
    .rename('n_obs')
)
# Samples with fewer than three observations are excluded from the summary.
too_few_obs = corr_per_sample_test['n_obs'] < 3
corr_per_sample_test.loc[~too_few_obs].describe()
BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 ... 210.000 210.000 0.000 210.000 0.000 0.000 210.000 30.000 0.000 210.000
mean 0.969 0.967 0.968 0.968 0.966 0.966 0.965 0.963 0.948 0.952 ... 0.076 -0.007 NaN 0.002 NaN NaN 0.371 -0.018 NaN 60.000
std 0.017 0.019 0.017 0.016 0.019 0.018 0.018 0.019 0.035 0.022 ... 0.192 0.140 NaN 0.124 NaN NaN 0.139 0.151 NaN 9.810
min 0.878 0.864 0.909 0.907 0.858 0.883 0.870 0.888 0.722 0.865 ... -0.402 -0.359 NaN -0.303 NaN NaN 0.021 -0.287 NaN 31.000
25% 0.962 0.962 0.961 0.962 0.960 0.959 0.956 0.953 0.938 0.943 ... -0.059 -0.103 NaN -0.084 NaN NaN 0.288 -0.118 NaN 53.000
50% 0.973 0.971 0.971 0.972 0.970 0.969 0.970 0.968 0.958 0.956 ... 0.067 -0.004 NaN 0.003 NaN NaN 0.368 -0.042 NaN 60.000
75% 0.981 0.981 0.980 0.980 0.979 0.978 0.979 0.978 0.969 0.966 ... 0.200 0.094 NaN 0.088 NaN NaN 0.448 0.062 NaN 67.000
max 0.994 0.992 0.994 0.992 0.992 0.993 0.992 0.990 0.987 0.988 ... 0.546 0.402 NaN 0.328 NaN NaN 0.889 0.393 NaN 86.000

8 rows × 25 columns

Hide code cell source

# Box plot of the per-sample correlations for the TOP_N_ORDER models on the
# test data, restricted to samples with enough observations.
# ! add minimum
kwargs = dict(ylim=(0.7, 1), rot=90,
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model predictions per sample on test data',
              ylabel='correlation per sample')
ax = (corr_per_sample_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs))
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

# Excel dump next to the figure (same stem, '.xlsx' suffix).
dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    # The summary excludes samples with too few observations, matching the
    # summary displayed in the notebook and the per-feature export.
    corr_per_sample_test.loc[~too_few_obs].describe().to_excel(
        w, sheet_name='summary')
    # all samples, including those with too few observations
    corr_per_sample_test.to_excel(w, sheet_name='correlations')
    # exactly the data shown in the box plot
    corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf
_images/bc4562a4ff62fa9955faf80ddf2c6dbd2464eb8a53c06b5051a6a53ce0c7bcdf.png

Identify samples whose correlation falls below the lowest lower whisker of the top models for at least one of these models

Hide code cell source

# Smallest lower whisker over the TOP_N_ORDER models. It is computed on the
# same samples as the box plot (enough observations), so the threshold
# matches the whiskers shown there.
treshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER]).min()
# Flag samples where at least one top model falls below the threshold;
# samples with too few observations are not part of the plot and are skipped.
mask = ((corr_per_sample_test[TOP_N_ORDER] < treshold).any(axis=1)
        & ~too_few_obs)
# Show the flagged samples with the row-wise minimum highlighted.
(corr_per_sample_test.loc[mask].style.highlight_min(axis=1)
 if mask.any() else 'Nothing to display')
  BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
Sample ID                                                  
Sample_015 0.958593 0.922462 0.932853 0.933572 0.967788 0.943706 0.954757 0.964680 0.932223 0.908295 0.882874 0.882874 0.923304 0.947907 nan 0.082354 -0.022672 nan -0.288437 nan nan 0.127719 -0.097389 nan 38
Sample_043 0.949633 0.864473 0.943454 0.911269 0.858409 0.883356 0.870351 0.888093 0.847971 0.882989 0.814366 0.814366 0.828364 0.762636 nan -0.060303 -0.150482 nan -0.029890 nan nan 0.444295 nan nan 57
Sample_047 0.939299 0.941508 0.937551 0.926076 0.916738 0.940193 0.950780 0.945377 0.874472 0.900719 0.896683 0.896683 0.009710 0.887812 nan -0.077002 -0.260646 nan -0.013040 nan nan 0.524493 nan nan 46
Sample_062 0.967359 0.949660 0.926246 0.962460 0.972252 0.946554 0.957733 0.960333 0.963936 0.952356 0.924748 0.924748 0.967603 0.763070 nan 0.247211 0.024737 nan 0.221667 nan nan 0.182505 nan nan 63
Sample_080 0.922142 0.908179 0.932541 0.926535 0.911596 0.905104 0.912309 0.921951 0.902040 0.883475 0.893836 0.893836 0.880736 0.859183 nan 0.093012 -0.136253 nan -0.192582 nan nan 0.383774 nan nan 64
Sample_091 0.878328 0.908694 0.933339 0.914861 0.931055 0.895622 0.918275 0.903784 0.920915 0.864672 0.903019 0.903019 0.910180 0.820676 nan -0.095511 0.291936 nan 0.050907 nan nan 0.341572 nan nan 60
Sample_108 0.929388 0.947307 0.946377 0.943595 0.940255 0.944137 0.951487 0.946047 0.866107 0.915455 0.939810 0.939810 0.929216 0.837153 nan -0.044493 0.057531 nan -0.221811 nan nan 0.407365 nan nan 68
Sample_109 0.937615 0.912115 0.931146 0.939219 0.924847 0.925261 0.898780 0.893626 0.841761 0.879726 0.890426 0.890426 0.931145 0.817513 nan -0.059637 -0.049996 nan 0.173790 nan nan 0.337532 -0.040683 nan 59
Sample_111 0.978525 0.922072 0.961509 0.957757 0.974002 0.922326 0.958219 0.933775 0.923850 0.935239 0.857016 0.857016 0.962568 0.564494 nan -0.130888 -0.049505 nan 0.022072 nan nan 0.451452 nan nan 54
Sample_112 0.942505 0.937217 0.923210 0.941756 0.947678 0.952080 0.962984 0.962546 0.956576 0.944488 0.948732 0.948732 0.947997 0.900140 nan 0.166292 0.019390 nan 0.087408 nan nan 0.504405 nan nan 57
Sample_115 0.891712 0.914102 0.909337 0.907024 0.915296 0.924770 0.928234 0.918842 0.853242 0.874847 0.881285 0.881285 0.901459 0.809391 nan 0.094778 -0.029463 nan -0.004058 nan nan 0.320851 nan nan 63
Sample_134 0.933622 0.928515 0.924094 0.933204 0.907465 0.932700 0.952746 0.935936 0.905001 0.915387 0.865397 0.865397 0.881808 0.916916 nan 0.389792 -0.222527 nan -0.173287 nan nan 0.344741 nan nan 66
Sample_138 0.957581 0.916806 0.928845 0.945503 0.953928 0.934378 0.927573 0.936390 0.943933 0.936371 0.921359 0.921359 0.963983 0.874113 nan 0.001445 0.087341 nan 0.088404 nan nan 0.523470 nan nan 46
Sample_148 0.975203 0.951496 0.917677 0.966507 0.979465 0.960635 0.946668 0.926864 0.929094 0.955283 0.935395 0.935395 0.984939 0.856731 nan 0.037085 -0.054781 nan 0.130882 nan nan 0.362124 nan nan 62
Sample_151 0.947829 0.912317 0.912959 0.933383 0.919188 0.927066 0.937720 0.937262 0.934510 0.915733 0.904552 0.904552 0.917004 0.810306 nan -0.189751 -0.097933 nan -0.024291 nan nan 0.302307 nan nan 70
Sample_152 0.922635 0.932885 0.928967 0.923144 0.926056 0.926985 0.932482 0.931084 0.918127 0.917052 0.909410 0.909410 0.877491 0.879750 nan 0.098949 0.223252 nan -0.002779 nan nan 0.336118 nan nan 64
Sample_162 0.929186 0.942253 0.943409 0.942112 0.933190 0.945806 0.937839 0.949772 0.956867 0.940055 0.937255 0.937255 0.933909 0.946955 nan 0.516397 0.221636 nan 0.014711 nan nan 0.294184 0.087903 nan 51
Sample_167 0.952090 0.935986 0.938566 0.949042 0.931476 0.932797 0.939793 0.936802 0.922116 0.930438 0.905413 0.905413 0.923164 0.888335 nan 0.221299 0.168443 nan -0.200358 nan nan 0.235179 nan nan 65
Sample_171 0.948100 0.914141 0.937970 0.949127 0.901446 0.910509 0.919215 0.909432 0.845442 0.899387 0.863135 0.863135 0.898770 0.811898 nan -0.061550 -0.103004 nan -0.296041 nan nan 0.344922 nan nan 40
Sample_181 0.912274 0.926595 0.912683 0.926865 0.920976 0.922596 0.929397 0.913043 0.869468 0.929034 0.896030 0.896030 0.899227 0.863741 nan -0.243627 0.198538 nan -0.007464 nan nan 0.419029 0.117814 nan 60
Sample_184 0.944725 0.940416 0.927864 0.936760 0.946711 0.940123 0.930853 0.927579 0.934207 0.906590 0.924270 0.924270 0.921680 0.888240 nan 0.133736 -0.128490 nan 0.083020 nan nan 0.517095 nan nan 60
Sample_185 0.949315 0.948668 0.943933 0.954384 0.929238 0.937701 0.936803 0.930556 0.924211 0.929391 0.922411 0.922411 0.899905 0.918327 nan -0.264227 0.003111 nan -0.035621 nan nan 0.576069 nan nan 69
Sample_199 0.928280 0.930880 0.937833 0.938319 0.930234 0.938214 0.917037 0.925542 0.912243 0.918083 0.910943 0.910943 0.937794 0.844058 nan -0.086879 -0.235543 nan -0.024566 nan nan 0.289504 nan nan 45
Sample_200 0.934067 0.937810 0.914547 0.929957 0.933368 0.934597 0.934918 0.918361 0.722446 0.926170 0.891269 0.891269 0.916117 0.670712 nan 0.034387 0.130846 nan -0.000717 nan nan 0.535540 -0.109406 nan 40

Hide code cell source

# Values of the last index level of the test predictions (the features).
feature_names = pred_test.index.levels[-1]
# NOTE(review): despite its name this holds the full index object, not a
# number of samples -- confirm whether it is used further below.
N_SAMPLES = pred_test.index
M = len(feature_names)
# Show all test predictions of one randomly drawn feature (randint is
# inclusive on both ends, hence M - 1).
pred_test.loc[pd.IndexSlice[:, feature_names[random.randint(0, M - 1)]], :]
observed BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_024 P01303 18.058 18.100 18.259 18.245 18.182 18.273 17.979 18.123 17.758 18.115 ... 17.335 11.414 12.658 13.284 7.068 0 29.794 2,508.981 NaN 209
Sample_042 P01303 17.789 17.146 17.135 17.213 16.945 17.321 17.336 17.036 17.159 17.253 ... 17.527 11.878 12.863 12.462 7.068 0 -42.462 2,556.320 NaN 209
Sample_053 P01303 17.259 17.205 17.262 17.427 16.984 17.473 17.628 17.470 17.489 17.693 ... 17.407 12.430 12.620 12.576 7.068 0 -43.766 2,542.848 NaN 209
Sample_057 P01303 17.874 17.931 18.016 18.112 18.170 17.962 18.038 18.348 18.355 17.847 ... 17.632 12.164 12.440 13.117 7.068 0 191.987 2,580.387 NaN 209
Sample_081 P01303 17.774 17.791 17.876 17.852 17.958 17.556 17.818 17.841 17.715 17.642 ... 16.942 13.141 12.067 12.513 7.068 0 -92.519 2,631.820 NaN 209
Sample_126 P01303 17.616 17.085 17.366 17.156 17.271 17.110 17.142 17.419 17.176 17.239 ... 17.703 11.375 12.774 12.622 7.068 0 102.623 2,523.647 NaN 209
Sample_131 P01303 16.847 16.641 16.778 16.487 16.592 16.419 16.717 16.757 16.567 16.637 ... 17.332 11.347 11.902 12.089 7.068 0 29.644 20.128 NaN 209
Sample_155 P01303 17.592 17.483 17.711 17.500 17.504 17.482 17.564 17.810 17.825 17.558 ... 17.206 13.538 12.643 12.690 7.068 0 -58.077 2,472.584 NaN 209
Sample_169 P01303 16.647 17.018 17.027 17.071 17.037 17.171 17.037 16.907 17.152 17.271 ... 17.217 13.205 12.761 13.276 7.068 0 -104.236 2,467.707 NaN 209
Sample_180 P01303 17.804 17.426 17.581 17.138 17.248 17.660 17.622 17.142 17.185 17.492 ... 17.046 15.341 11.714 11.369 7.068 0 -39.225 2,596.437 NaN 209

10 rows × 26 columns

Hide code cell source

# Draw one feature name at random (from the sorted unique names) and show
# all of its test predictions.
options = random.sample(sorted(set(feature_names)), 1)
pred_test.loc[pd.IndexSlice[:, options[0]], :]
observed BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_040 P06307 17.617 17.542 17.840 17.762 17.726 17.638 18.300 17.742 17.242 18.159 ... 17.599 13.758 12.446 12.870 7.068 0 83.496 2,747.808 NaN 210
Sample_062 P06307 19.468 19.383 19.574 19.487 19.364 19.409 19.244 18.976 18.965 19.337 ... 17.173 11.716 11.570 11.522 7.068 0 -38.069 2,524.588 NaN 210
Sample_068 P06307 18.646 18.619 18.361 18.377 18.634 18.330 18.546 18.751 18.567 18.597 ... 17.085 11.597 11.865 11.284 7.068 0 -75.558 2,647.739 NaN 210
Sample_071 P06307 18.665 18.827 18.959 19.350 18.725 19.193 19.231 19.195 19.095 18.977 ... 17.315 12.161 12.212 12.880 7.068 0 -188.697 2,443.160 NaN 210
Sample_146 P06307 19.080 19.100 18.902 18.973 19.022 19.081 19.145 19.275 19.128 18.967 ... 17.414 12.457 12.174 12.467 7.068 0 111.166 2,460.342 NaN 210
Sample_201 P06307 18.634 19.279 19.287 19.137 18.823 18.906 19.019 19.141 19.145 19.260 ... 17.077 12.948 11.973 12.222 7.068 0 99.369 2,545.155 NaN 210

6 rows × 26 columns

Correlation per feature#

Hide code cell source

# Per feature: correlation of every model column with the target column,
# restricted to and ordered by ORDER_MODELS.
corr_per_feat_test = (pred_test
                      .groupby(FEAT_NAME)
                      .apply(lambda df: df.corr().loc[TARGET_COL])
                      [ORDER_MODELS])
# Attach the number of non-missing target values per feature as 'n_obs'.
corr_per_feat_test = corr_per_feat_test.join(
    pred_test
    .groupby(FEAT_NAME)[TARGET_COL]
    .count()
    .rename('n_obs')
)

# Features with fewer than three observations are excluded from the summary.
too_few_obs = corr_per_feat_test['n_obs'] < 3
corr_per_feat_test.loc[~too_few_obs].describe()
BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 ... 1,396.000 1,396.000 1,396.000 1,396.000 0.000 0.000 1,396.000 1,396.000 0.000 1,396.000
mean 0.636 0.624 0.635 0.606 0.607 0.588 0.564 0.538 0.445 0.470 ... -0.008 0.020 0.086 0.055 NaN NaN -0.009 -0.016 NaN 8.999
std 0.337 0.343 0.347 0.341 0.342 0.354 0.359 0.369 0.422 0.388 ... 0.447 0.414 0.441 0.415 NaN NaN 0.411 0.433 NaN 3.913
min -0.998 -0.975 -0.989 -0.907 -0.977 -1.000 -0.983 -0.991 -1.000 -0.999 ... -0.999 -1.000 -0.999 -0.999 NaN NaN -1.000 -0.999 NaN 3.000
25% 0.506 0.487 0.507 0.445 0.455 0.431 0.392 0.359 0.221 0.266 ... -0.341 -0.273 -0.242 -0.223 NaN NaN -0.285 -0.313 NaN 6.000
50% 0.746 0.738 0.747 0.708 0.706 0.698 0.677 0.634 0.554 0.554 ... -0.005 0.027 0.118 0.063 NaN NaN 0.004 -0.016 NaN 8.000
75% 0.880 0.870 0.883 0.861 0.859 0.844 0.833 0.809 0.770 0.768 ... 0.331 0.319 0.425 0.356 NaN NaN 0.268 0.294 NaN 11.000
max 0.999 0.998 1.000 1.000 1.000 0.999 0.999 1.000 1.000 0.998 ... 0.999 0.994 0.999 1.000 NaN NaN 0.992 0.998 NaN 32.000

8 rows × 25 columns

Hide code cell source

# Features excluded above for having too few observations; rows with fewer
# than three non-missing entries are dropped from the view.
corr_per_feat_test.loc[too_few_obs].dropna(thresh=3, axis=0)
BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
protein groups
A0A0A0MS09;P01880;P01880-2 1.000 1.000 -1.000 1.000 1.000 1.000 -1.000 -1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000 1.000 1.000 -1.000 -1.000 1.000 -1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
A0A0C4DH29 -1.000 -1.000 1.000 1.000 -1.000 1.000 1.000 -1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000 1.000 1.000 -1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 ... 1.000 -1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000 1.000 -1.000 -1.000 -1.000 1.000 -1.000 1.000 -1.000 1.000 ... 1.000 1.000 -1.000 1.000 NaN NaN -1.000 -1.000 NaN 2
D6RF35 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
E7EQ64;P07477 1.000 1.000 -1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN 1.000 -1.000 NaN 2
F8WDW9;Q96AP7 -1.000 -1.000 1.000 1.000 -1.000 1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
J3KRP0 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 -1.000 1.000 1.000 NaN NaN 1.000 1.000 NaN 2
O43581-2;O43581-3;O43581-5 -1.000 -1.000 -1.000 -1.000 -1.000 1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P04075 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
P04080 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 1.000 -1.000 ... -1.000 1.000 -1.000 1.000 NaN NaN -1.000 -1.000 NaN 2
P33151 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P62258 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 1.000 1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
Q9NYQ8 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 -1.000 1.000 ... -1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
Q9Y281;Q9Y281-3 1.000 1.000 -1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2

16 rows × 25 columns

Hide code cell source

# Box plot of the per-feature correlations for the TOP_N_ORDER models on the
# test data, restricted to features with enough observations.
kwargs = dict(rot=90,
              flierprops=dict(markersize=1),
              ylabel=f'correlation per {FEAT_NAME_DISPLAY}')
ax = (corr_per_feat_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs)
      )
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                       horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_feat.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
# Excel dump next to the figure (same stem, '.xlsx' suffix).
dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    # summary without the features that have too few observations
    corr_per_feat_test.loc[~too_few_obs].describe().to_excel(
        w, sheet_name='summary')
    # all features, including those with too few observations
    corr_per_feat_test.to_excel(w, sheet_name='correlations')
    # exactly the data shown in the box plot
    corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf
_images/15354ff0c709af219c11f215b520b1d195c63479759a17e4734a33a5c6dacf85.png

Hide code cell source

# Number of non-missing test values per feature, as a Series named 'count'.
feat_count_test = (data.test_y
                   .stack()
                   .groupby(FEAT_NAME)
                   .count()
                   .rename('count'))
feat_count_test.head()
protein groups
A0A024QZX5;A0A087X1N8;P35237                                                     10
A0A024R0T9;K7ER74;P02655                                                          8
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8    6
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                           8
A0A075B6H7                                                                        4
Name: count, dtype: int64

Hide code cell source

# Smallest lower whisker over the TOP_N_ORDER models. It is computed on the
# same features as the box plot (at least three observations), so the
# threshold matches the whiskers shown there.
treshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER]).min()
# Flag features where at least one top model falls below the threshold.
# Features with too few observations are skipped: they are not plotted and
# a correlation based on two points is always +1 or -1.
mask = ((corr_per_feat_test[TOP_N_ORDER] < treshold).any(axis=1)
        & ~too_few_obs)


def highlight_min(s, color, tolerence=0.00001):
    """Build CSS styles that highlight the minimum of a Series.

    Parameters
    ----------
    s : pandas.Series
        Values to inspect.
    color : str
        CSS colour used as background for the highlighted entries.
    tolerence : float, optional
        Entries whose absolute distance to the minimum is strictly below
        this value are highlighted.

    Returns
    -------
    numpy.ndarray
        One entry per value of ``s``: ``"background-color: <color>;"`` for
        highlighted entries, ``None`` otherwise.
    """
    distance_to_min = (s - s.min()).abs()
    is_minimum = distance_to_min < tolerence
    css = f"background-color: {color};"
    return np.where(is_minimum, css, None)


# Rows selected by `mask`, with the per-feature test count joined on and
# used as sort key.
view = corr_per_feat_test.join(feat_count_test).loc[mask].sort_values('count')

if view.empty:
    print("None found")
else:
    # Row-wise (axis=1) highlighting of the minimum, limited to the columns
    # of `corr_per_feat_test` (i.e. excluding the joined 'count' column).
    styled_view = view.style.apply(
        highlight_min,
        color='yellow',
        axis=1,
        subset=corr_per_feat_test.columns,
    )
    display(styled_view)
  BPCA VAE DAE CF TRKNN RF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs count
protein groups                                                    
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000000 1.000000 1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 nan nan -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
A0A0A0MS09;P01880;P01880-2 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 1.000000 1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan 1.000000 -1.000000 nan 2 2
A0A0C4DH29 -1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan -1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
E7EQ64;P07477 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 nan nan -1.000000 -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 -1.000000 nan 2 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000000 1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 -1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 -1.000000 nan 2 2
O43581-2;O43581-3;O43581-5 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 1.000000 nan 2 2
F8WDW9;Q96AP7 -1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
Q9Y281;Q9Y281-3 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 nan nan 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
O95497 0.491301 0.006032 -0.019442 0.727802 -0.187796 0.437617 -0.278320 -0.038249 -0.972465 0.974255 nan nan -0.924475 0.900622 -0.728224 -0.107445 0.055343 0.908126 0.598429 nan nan 0.163889 -0.914582 nan 3 3
A0A087X117;A0A0G2JN29;J3KN36;P69849;Q15155;Q5JPE7;Q5JPE7-2 0.678810 -0.335242 0.074387 -0.260795 0.676860 0.405337 0.312055 0.660508 -0.865326 0.808609 nan nan -0.988011 0.737155 -0.484680 -0.563403 -0.264378 -0.965384 -0.840834 nan nan -0.178456 0.909074 nan 3 3
P04040 0.955425 -0.974819 0.025449 0.808358 0.364706 -0.249266 0.860313 0.995399 0.584629 0.685429 nan nan 0.736559 -0.946844 0.910187 0.961838 0.461600 0.889522 -0.140074 nan nan -0.755848 0.778883 nan 3 3
A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;P01892;P10316 -0.405690 -0.446550 -0.293776 -0.515214 -0.426425 -0.170837 0.225780 0.141568 -0.230425 0.240677 nan nan 0.029026 0.327288 0.969574 0.997890 -0.928519 -0.651641 0.775695 nan nan 0.599349 0.988049 nan 3 3
Q0P6D2 0.488693 -0.439842 0.701665 0.621431 -0.977189 -0.850888 -0.914209 -0.991429 0.990772 -0.817525 nan nan -0.910486 -0.194424 -0.815144 -0.873162 -0.723498 -0.856054 -0.787328 nan nan 0.198784 -0.816625 nan 3 3
P67936 -0.781265 -0.893621 -0.976195 0.634995 -0.280233 -0.998752 -0.930367 0.668141 -0.999486 -0.951571 nan nan -0.952164 0.933330 -0.989773 -0.899019 0.926267 0.990700 0.820784 nan nan -0.364239 -0.919047 nan 3 3
P04430 0.283036 0.990430 -0.730800 0.646756 0.721758 0.889461 0.986075 0.557924 0.865988 0.496168 nan nan -0.601575 -0.232889 0.997846 0.975279 0.310983 0.123528 0.134784 nan nan -0.324236 0.831769 nan 3 3
Q15223;Q15223-2;Q15223-3 0.989596 0.419068 0.270341 -0.561875 0.177474 0.085388 0.779421 0.476563 0.352579 -0.993723 nan nan 0.613817 0.902436 -0.593128 0.320433 -0.108319 0.556287 0.652682 nan nan 0.802926 -0.921276 nan 3 3
Q5FWE3;Q5FWE3-3 -0.595898 0.201862 0.385469 -0.907185 0.940836 0.534106 -0.515612 -0.450286 0.997780 -0.317397 nan nan 0.852964 -0.309584 0.879697 0.674983 0.886388 0.966111 0.626489 nan nan -0.999926 0.995547 nan 3 3
Q96PX8 0.556202 0.922348 0.814688 -0.826645 0.999202 0.997958 0.858514 0.967634 -0.991603 -0.887272 nan nan 0.981687 0.302885 -0.896770 -0.879121 -0.986526 -0.561677 -0.920466 nan nan -0.998875 -0.998875 nan 3 3
Q96KR4;Q96KR4-3 -0.811499 -0.580649 0.980409 0.235041 0.188634 -0.661888 0.962865 0.998946 -0.996456 -0.998544 nan nan -0.499851 -0.950477 -0.969111 -0.794681 -0.428497 0.464970 0.664937 nan nan 0.770952 0.620557 nan 3 3
Q9BRA2 -0.997854 -0.935210 -0.927972 -0.690861 -0.976681 -0.999896 0.355335 -0.969137 -0.999909 0.328330 nan nan -0.376394 0.385922 -0.888126 -0.946135 0.977436 -0.251266 -0.283746 nan nan 0.231242 0.893058 nan 3 3
P14138 -0.783274 0.548192 -0.070290 -0.703583 0.277987 -0.048544 0.474087 -0.039227 0.184293 -0.975804 nan nan -0.284116 -0.078875 -0.680464 -0.988691 0.993598 -0.256388 0.945599 nan nan 0.330904 0.330904 nan 3 3
Q9UI40;Q9UI40-2 0.154643 0.987826 -0.688708 0.358041 0.705987 -0.853234 0.043055 -0.368219 0.609727 -0.080993 nan nan 0.216445 0.766181 0.418780 0.168840 -0.664907 0.994516 0.992209 nan nan 0.588145 0.963302 nan 3 3
Q99538 0.593821 -0.382252 -0.414148 -0.122187 0.377925 0.080309 -0.288458 -0.152372 0.223491 -0.280701 nan nan 0.914120 0.362325 0.708292 -0.029421 0.755783 0.753316 0.703783 nan nan -0.421840 -0.113954 nan 4 4
P01912;Q5Y7A7 0.312242 -0.119839 -0.821944 -0.033886 -0.737679 -0.268736 -0.983163 -0.923553 0.367817 0.598949 nan nan -0.036948 -0.538071 0.393426 0.622172 -0.772553 0.128832 -0.064211 nan nan -0.520030 -0.464856 nan 4 4
E9PN95;P11684 -0.044137 0.605194 -0.345949 0.053934 -0.416898 0.539608 -0.241660 0.598536 0.553236 0.989063 nan nan 0.572649 0.439690 -0.972121 -0.214485 -0.738027 -0.729323 -0.837687 nan nan 0.624754 0.488347 nan 4 4
P01704 0.467663 -0.416432 0.168174 0.536644 -0.028085 0.988339 -0.769039 0.534897 0.229086 -0.463727 nan nan 0.151957 0.726697 0.353124 -0.603360 0.785460 -0.375894 -0.049201 nan nan 0.964419 -0.791674 nan 4 4
A0A075B7B8 0.995309 0.900409 0.864174 0.948440 -0.584344 0.591563 0.645944 0.628331 0.493563 0.290645 nan nan -0.198260 0.001422 0.596184 0.426337 -0.732647 -0.818663 -0.800930 nan nan -0.419682 0.603615 nan 4 4
A8MXB9;J3KQJ1;Q8NBJ7 -0.526368 -0.447143 -0.458650 -0.655122 -0.288690 -0.423512 0.719047 0.882512 0.575047 0.781017 nan nan 0.983037 0.783286 -0.315140 -0.447225 -0.927725 0.092247 0.811351 nan nan -0.625728 -0.380936 nan 4 4
A0A0G2JRN3 -0.879258 -0.829447 -0.526409 -0.815601 -0.190835 -0.492901 0.676931 0.355859 -0.978184 -0.922357 nan nan -0.966410 0.735846 -0.890772 -0.522710 -0.732201 -0.786540 -0.904598 nan nan 0.786883 0.273736 nan 4 4
Q8WXD2 -0.194738 -0.543811 -0.483046 -0.713191 0.030615 -0.326551 -0.644254 -0.896598 -0.044385 -0.527932 nan nan -0.031366 0.492905 -0.879923 -0.696602 0.746786 0.367920 0.034106 nan nan -0.304569 0.801129 nan 4 4
P55058 0.331515 -0.608719 0.113612 0.374788 -0.453671 -0.340616 -0.730945 -0.687075 0.561214 0.511726 nan nan -0.850960 0.486280 -0.860013 0.937309 -0.487709 0.109603 -0.185087 nan nan -0.314778 -0.708824 nan 4 4
P78310;P78310-2;P78310-5;P78310-6;P78310-7 -0.287802 0.800178 0.569200 0.670388 0.187632 0.841824 0.236820 0.177240 0.323788 -0.925309 nan nan -0.002267 -0.822582 0.370874 -0.992637 -0.643111 -0.740181 0.023997 nan nan -0.197104 0.065686 nan 4 4
P31150 -0.695110 -0.159017 -0.774124 -0.192626 0.079765 -0.557999 -0.302639 -0.258552 0.871434 0.374254 nan nan -0.383378 -0.428704 -0.744603 -0.826531 0.604429 0.679134 0.469222 nan nan 0.018162 0.997740 nan 4 4
P69905 0.965583 -0.239220 -0.988720 0.928952 0.990263 0.394696 -0.734862 -0.884057 0.787808 0.652423 nan nan 0.995940 0.548652 -0.089591 0.819375 0.712199 -0.613292 -0.943708 nan nan 0.599413 0.563706 nan 4 4
P48745 -0.215191 -0.598565 -0.476158 -0.080149 0.281906 0.015847 -0.708241 -0.711413 0.592691 0.072594 nan nan -0.966228 0.302475 -0.502678 -0.007247 -0.827384 -0.811427 0.148708 nan nan -0.816749 -0.563041 nan 4 4
P21810 0.488982 -0.676759 -0.704459 0.063799 0.558018 0.428499 0.725338 0.497175 0.976315 0.606417 nan nan 0.507237 0.896792 0.086433 0.181221 0.011772 -0.013604 -0.630774 nan nan -0.032190 -0.515024 nan 4 4
Q9NS85 0.209240 -0.438335 -0.574161 0.767141 -0.748054 0.298505 0.174286 0.827546 -0.293415 -0.490454 nan nan -0.811087 0.046101 0.688432 0.684565 0.678043 -0.573150 -0.201913 nan nan 0.028761 0.700517 nan 4 4
Q9BXJ0 -0.083789 0.663480 0.766845 -0.209229 0.933855 0.130750 -0.933744 -0.936561 0.858297 -0.922274 nan nan 0.867440 0.385463 -0.771777 -0.792200 0.535398 -0.072542 0.222381 nan nan -0.736944 -0.736944 nan 4 4
Q8IUK5;Q8IUK5-2;Q8IUK5-3 0.958859 0.820163 0.830442 -0.405278 0.738376 0.582097 0.355171 0.619281 0.646793 0.857888 nan nan 0.853405 0.729878 -0.302518 -0.348384 -0.262974 0.310391 0.504708 nan nan 0.375566 0.798411 nan 4 4
A0A087WSY4 0.341097 -0.600145 -0.584888 0.553195 -0.455325 0.665257 0.666119 0.804721 -0.612366 -0.162759 nan nan 0.149034 -0.996092 0.994065 0.280256 0.420555 -0.553907 -0.866920 nan nan -0.767064 0.652348 nan 4 4
Q9ULP0-3;Q9ULP0-6 -0.433605 0.258770 0.041203 0.005970 0.280988 0.490640 -0.127518 -0.217068 -0.824736 0.569381 nan nan 0.047124 0.633039 -0.147969 0.088032 0.736751 0.597287 0.504754 nan nan 0.651242 0.678026 nan 4 4
B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 -0.586205 -0.667163 -0.514340 -0.692606 -0.468526 -0.699705 -0.950792 -0.978604 -0.206013 0.640277 nan nan -0.926715 0.685896 -0.611749 -0.688762 -0.027437 -0.359254 -0.327054 nan nan -0.630160 0.512953 nan 4 4
Q5BIV9 -0.499694 -0.179659 -0.633642 -0.432250 -0.073578 0.024311 -0.595082 -0.855339 -0.481080 -0.438393 nan nan -0.128645 0.976007 -0.778869 -0.513219 0.534874 -0.261649 -0.398743 nan nan -0.127227 -0.239878 nan 5 5
Q8NFZ4 -0.225202 -0.301404 0.080765 -0.281291 0.001304 -0.077608 -0.011690 0.057673 -0.422722 -0.787582 nan nan 0.200058 0.190495 0.685266 0.396679 -0.025852 -0.363897 -0.441232 nan nan -0.741786 0.576960 nan 5 5
Q8N428 -0.711604 -0.573305 -0.795503 -0.747728 -0.372718 -0.685640 -0.815146 -0.973013 0.485147 -0.775526 nan nan 0.896715 -0.899799 -0.145359 0.398169 0.071729 0.571660 0.557237 nan nan -0.592174 0.633450 nan 5 5
Q6ZVL6 -0.405009 0.156725 0.605517 0.081346 0.541507 0.645357 -0.626545 -0.421210 0.745658 0.380994 nan nan 0.809662 -0.190445 0.343913 -0.237720 0.487557 0.617457 0.593542 nan nan -0.305862 -0.650631 nan 5 5
Q9NZC2;Q9NZC2-2;Q9NZC2-3 -0.532191 -0.430352 -0.696358 -0.535665 -0.549588 -0.415690 0.118561 -0.020609 -0.710573 -0.652656 nan nan -0.534201 0.599099 0.889235 -0.779762 0.101340 -0.454867 -0.657463 nan nan 0.820294 0.820294 nan 5 5
P02533 0.968900 -0.614408 -0.634572 0.818353 0.859288 -0.375385 0.106114 0.085868 0.760846 0.490142 nan nan 0.864174 -0.185620 0.303053 0.460298 0.152549 0.446403 0.732666 nan nan 0.207036 -0.545616 nan 5 5
P01834 0.145967 -0.267701 -0.155320 0.097708 -0.196562 -0.310765 0.157569 0.333278 0.249827 0.136816 nan nan -0.132585 0.174505 -0.169066 -0.492225 0.747588 0.283761 0.194347 nan nan -0.157407 -0.157407 nan 5 5
H3BRV9;P61970 0.339675 0.261760 0.307452 -0.284999 0.291001 0.381024 -0.022061 -0.094914 0.046162 0.065582 nan nan 0.456388 0.046716 -0.959485 -0.426972 0.885503 0.672857 0.644364 nan nan -0.736409 -0.210767 nan 5 5
O75882;O75882-2;O75882-3 0.808756 0.874293 0.899195 0.302756 -0.195404 0.247892 0.647639 0.664223 0.069134 -0.175885 nan nan 0.085212 -0.516730 -0.812939 0.161950 0.644485 -0.782643 0.570415 nan nan -0.899663 0.811337 nan 5 5
P10124 -0.597027 -0.352082 -0.299123 -0.627915 -0.186782 0.347086 0.571509 -0.252024 0.436283 -0.628340 nan nan -0.573988 0.120248 -0.483333 -0.479793 0.588589 -0.261585 -0.472450 nan nan 0.414290 0.578378 nan 5 5
P01282;P01282-2 -0.216410 0.511015 0.483339 0.690699 0.209445 0.431524 0.888676 -0.090521 -0.394861 -0.216389 nan nan -0.782966 0.280301 0.261410 0.709596 -0.207622 0.425854 0.774907 nan nan -0.467714 0.087763 nan 5 5
P01024 0.625152 0.788486 0.703325 -0.209389 0.912857 0.494906 0.382613 0.434509 0.873362 0.101833 nan nan -0.374828 -0.142975 -0.327068 -0.418929 -0.684168 -0.215456 0.385086 nan nan 0.026244 0.424774 nan 5 5
P00441 -0.466488 -0.476209 -0.625219 -0.263857 -0.418119 -0.309013 0.047755 -0.045985 -0.047843 -0.228959 nan nan -0.221865 -0.486946 -0.478446 -0.697790 -0.040567 -0.145801 -0.327803 nan nan 0.030159 -0.379242 nan 5 5
D6R956;P09936 0.296679 -0.074515 0.109385 0.197141 -0.551658 -0.142232 -0.259386 -0.664319 0.151812 0.024035 nan nan 0.313797 -0.096810 -0.119808 -0.402161 -0.279851 -0.212790 0.141617 nan nan -0.514141 0.033000 nan 5 5
O15031 -0.384689 0.762128 0.652996 0.166502 0.545446 0.511480 0.392387 0.288900 0.278546 0.565508 nan nan 0.144879 0.267337 -0.264862 -0.348062 0.090350 -0.121316 0.322847 nan nan -0.296688 -0.296688 nan 5 5
O75339 0.036295 -0.441068 -0.236469 -0.118799 0.082569 -0.424093 -0.383440 -0.134371 -0.662474 0.284821 nan nan -0.707550 0.175596 -0.475700 -0.809598 -0.223164 -0.206109 -0.513042 nan nan 0.211736 0.029954 nan 5 5
B1AJZ9;B1AJZ9-4;H0YE38;Q5JYW6 -0.213396 -0.216534 -0.092470 0.214623 -0.655514 -0.001862 -0.355725 -0.358839 -0.320693 0.330414 nan nan -0.420475 0.430841 -0.515651 -0.659479 0.764882 0.428130 -0.050070 nan nan 0.163593 0.301970 nan 5 5
G3V2U7;P07311 0.478113 -0.485911 -0.320549 0.751172 0.277132 0.359633 0.126790 -0.075889 -0.045458 0.298977 nan nan -0.201011 -0.132411 -0.434263 -0.349618 0.497854 0.286177 0.519642 nan nan -0.412639 -0.436945 nan 5 5
Q10469 -0.098233 -0.793383 -0.395595 -0.592442 -0.291141 -0.658679 -0.671391 -0.600598 0.649427 0.606429 nan nan -0.185669 -0.313284 -0.517750 -0.162715 -0.400094 -0.550749 -0.707623 nan nan -0.084422 0.700076 nan 5 5
Q13508;Q13508-2;Q13508-3 -0.588905 0.863755 0.259252 -0.248896 0.317771 0.712649 0.367007 -0.137197 -0.196508 0.072987 nan nan 0.595601 -0.852565 -0.921969 -0.902558 -0.656859 -0.647509 -0.483765 nan nan -0.816043 0.878974 nan 5 5
P35052 0.702134 0.375035 0.755442 -0.561350 0.549258 0.642162 0.762479 0.349263 -0.226479 0.720987 nan nan 0.394751 -0.131589 -0.626621 -0.692949 0.822326 -0.614645 -0.133282 nan nan 0.436106 -0.605529 nan 5 5
Q9NZP8 0.485369 0.452440 0.252116 0.279322 -0.596706 -0.329113 0.391275 0.135336 -0.921105 0.672486 nan nan -0.849012 -0.556002 0.108741 -0.011939 -0.444240 0.306441 0.161372 nan nan 0.307325 0.307325 nan 5 5
A0A0G2JRQ6 -0.226956 -0.406151 -0.281120 -0.276238 -0.249237 -0.275689 -0.267241 -0.104896 0.030670 -0.906417 nan nan -0.206273 0.411912 -0.360860 0.551245 0.064595 0.307142 0.029626 nan nan -0.589769 -0.589769 nan 5 5
A0A0C4DGV8;Q13214;Q13214-2 0.215023 -0.099604 -0.309103 0.518665 -0.952860 -0.798575 -0.317076 -0.315165 -0.648660 -0.694447 nan nan -0.483616 0.107943 0.951239 0.908215 -0.940086 -0.051800 -0.193120 nan nan -0.698798 -0.910792 nan 5 5
Q9UHI8 0.073471 0.077558 -0.094514 -0.410856 -0.445122 -0.201874 0.316329 -0.061541 0.216956 -0.119815 nan nan -0.208262 0.033820 0.051369 0.334440 -0.543958 0.118070 -0.008399 nan nan -0.151806 -0.746015 nan 5 5
P05451 0.097378 -0.563360 0.339232 -0.023629 -0.057327 -0.292363 -0.360316 -0.684154 -0.327039 0.306881 nan nan -0.336078 -0.749436 -0.538331 -0.470228 -0.204209 -0.412800 -0.155690 nan nan -0.749296 -0.302038 nan 6 6
M0R009 -0.339372 -0.143983 -0.276861 -0.357138 -0.584432 -0.432676 -0.356792 -0.247140 -0.037546 -0.001198 nan nan -0.732570 0.516390 -0.580618 0.320436 0.122398 -0.750481 -0.677517 nan nan 0.635403 -0.182952 nan 6 6
H3BRQ4;K4DIB9;P50238 -0.093569 -0.257584 -0.451479 -0.426628 -0.057295 -0.510532 -0.083371 -0.277897 0.428824 0.210272 nan nan 0.321101 -0.342142 -0.052992 0.460109 0.774445 0.387204 -0.226158 nan nan 0.012052 -0.720358 nan 6 6
P01036;P01037 0.014913 -0.290998 -0.089759 -0.436258 -0.482398 -0.356114 -0.220449 -0.046435 -0.115861 -0.146825 nan nan 0.512862 0.620557 -0.389652 -0.354472 0.294608 -0.362058 -0.793423 nan nan 0.673582 0.579024 nan 6 6
P10644;P10644-2 -0.252574 0.043000 -0.738111 0.131153 -0.097608 0.075879 -0.336136 -0.255617 0.448682 -0.000995 nan nan 0.107935 -0.802931 0.383896 -0.528502 -0.075836 0.577309 0.370361 nan nan -0.154795 -0.502540 nan 6 6
P29401;P29401-2 0.085483 0.016919 0.077307 -0.228074 -0.368660 0.590332 0.223946 0.365861 0.001412 -0.392136 nan nan 0.108221 -0.659968 0.187563 0.224033 -0.484776 -0.281768 0.052490 nan nan 0.300495 0.397517 nan 6 6
B1AHL2;P23142-4 -0.158215 0.565754 0.497049 -0.358912 0.335561 0.614565 0.406105 0.299639 0.061705 0.284261 nan nan -0.183055 -0.611232 -0.075900 -0.165674 0.531431 0.742655 0.716256 nan nan -0.302453 -0.153443 nan 6 6
E9PKE3;P11142 -0.225336 -0.528109 -0.322927 0.185920 0.346178 0.017122 -0.515610 -0.674428 0.038561 -0.291303 nan nan -0.729284 -0.775801 -0.288577 0.003161 -0.409208 -0.507155 0.195049 nan nan 0.281878 0.415848 nan 6 6
Q9BT88 -0.231560 -0.338617 0.320368 -0.257206 -0.505983 -0.236452 0.248942 0.470496 -0.357877 -0.223776 nan nan 0.096659 -0.485983 -0.078098 0.471098 0.745549 0.626185 0.560263 nan nan 0.342112 0.354892 nan 7 7
Q9BX67 0.701146 -0.268281 -0.080032 0.293328 0.063862 -0.306395 -0.048843 -0.246240 0.445479 0.340506 nan nan 0.033937 0.840299 -0.269549 -0.431929 0.374572 0.232872 -0.174785 nan nan -0.194531 -0.561094 nan 7 7
Q8IWU5;Q8IWU5-2 -0.218403 -0.154552 -0.137773 -0.062886 -0.483524 -0.053507 -0.071915 -0.451814 -0.684735 0.392763 nan nan -0.070980 -0.124379 0.334826 0.479673 -0.868262 -0.096058 -0.260937 nan nan 0.632888 -0.013472 nan 7 7
A0A1B0GUU9;P01871;P01871-2 -0.016782 0.284607 -0.189730 0.347381 0.513541 0.437864 -0.215636 -0.389016 0.225661 0.071553 nan nan 0.639000 0.378442 0.696541 -0.263863 0.015056 -0.175805 -0.243377 nan nan 0.683104 -0.101031 nan 7 7
O00339;O00339-2;O00339-3 0.132164 0.357336 -0.069538 -0.339497 0.452370 0.297166 0.191099 0.055085 0.059785 0.194724 nan nan 0.060109 -0.247123 0.501661 0.236744 0.320229 0.613125 0.410843 nan nan -0.497438 0.393346 nan 7 7
C9JKT8;Q9UEW3;Q9UEW3-2 -0.293733 -0.135493 -0.408090 -0.337902 -0.671856 -0.217604 0.252685 0.407657 0.516418 -0.293680 nan nan -0.461590 0.082109 -0.356302 -0.545248 -0.609281 -0.645563 -0.602800 nan nan -0.058753 -0.279069 nan 7 7
P80108 -0.376648 -0.597474 0.060614 -0.028931 -0.172435 0.004105 -0.184693 -0.137669 0.293716 -0.492363 nan nan 0.087362 -0.717288 -0.600824 -0.578633 -0.689465 -0.443645 -0.006545 nan nan 0.119112 0.119112 nan 7 7
Q96S96 -0.324600 -0.240355 0.061306 -0.305130 -0.462902 -0.518224 -0.359991 -0.296281 -0.305383 0.059876 nan nan -0.582492 -0.470132 -0.157328 -0.153988 -0.191805 -0.086735 -0.334456 nan nan -0.682257 0.760044 nan 7 7
P19835;X6R868 -0.578277 -0.399227 -0.478640 -0.396950 -0.428965 -0.259946 -0.359418 -0.458749 -0.930907 0.408941 nan nan -0.762851 0.436766 -0.321335 -0.213048 -0.659159 -0.001008 0.345971 nan nan -0.780480 0.407170 nan 7 7
Q15847 0.233620 0.474755 -0.219589 0.316562 0.197438 0.399315 0.333432 0.489641 0.633836 0.360723 nan nan 0.634402 0.461452 0.060445 -0.141118 -0.201241 -0.056074 0.290123 nan nan 0.438014 -0.117445 nan 7 7
P35443 -0.174408 0.465246 0.389144 0.373444 -0.252097 -0.446747 0.404203 0.582749 0.068469 -0.211928 nan nan 0.314822 0.014860 -0.017793 -0.130271 -0.007412 0.039112 0.057461 nan nan 0.022830 0.275796 nan 7 7
P05556;P05556-2;P05556-3;P05556-4;P05556-5 -0.472444 -0.391648 -0.058888 -0.147754 -0.424667 -0.248815 -0.339333 -0.164482 -0.016348 0.328409 nan nan -0.091371 0.044472 0.247949 0.184702 0.351070 -0.061795 -0.369386 nan nan 0.003355 0.029215 nan 8 8
Q13790 -0.521996 -0.007459 0.456437 -0.031162 -0.198715 0.248193 -0.462789 0.222006 -0.215508 -0.483551 nan nan 0.029153 -0.842012 0.174426 -0.331592 0.533707 0.061342 0.038016 nan nan 0.078235 -0.389478 nan 8 8
P17677;P17677-2 -0.225142 0.256284 -0.091668 0.371661 -0.046967 0.160251 -0.060209 -0.272913 -0.312226 0.255004 nan nan -0.234374 0.091583 0.302693 0.676453 0.122154 -0.380524 0.022267 nan nan 0.169275 -0.057105 nan 8 8
Q9HBT6 -0.262534 0.020080 0.055121 -0.361772 0.204430 0.077743 0.099041 -0.130810 0.445839 -0.463106 nan nan 0.711355 0.359710 -0.045300 -0.091698 -0.689669 -0.213494 -0.289302 nan nan 0.106877 -0.188251 nan 8 8
P05160 0.074302 0.242570 0.201341 -0.424184 0.104137 0.133626 0.074495 -0.078294 0.262649 0.267963 nan nan 0.417342 0.604326 0.488189 0.260562 0.469878 -0.888594 -0.483718 nan nan 0.410691 0.612747 nan 8 8
B7Z5R6;Q14596;Q14596-2 0.170383 -0.138044 -0.173794 -0.455690 -0.034715 -0.284539 0.158769 0.376342 0.835850 -0.173276 nan nan 0.070890 0.167096 -0.097908 0.041231 -0.352945 0.037118 -0.147185 nan nan 0.285089 -0.165235 nan 8 8
A0A0G2JQD2;A0A0G2JQM0;A0A0G2JRN4;P30711 -0.278053 -0.004585 -0.121915 -0.333077 -0.254147 -0.279410 -0.176005 -0.274112 0.078554 -0.628982 nan nan -0.142576 -0.159017 -0.484875 -0.287631 0.580197 -0.051869 -0.326113 nan nan 0.739107 -0.015886 nan 8 8
Q5JRA6;Q5JRA6-2 -0.221290 -0.252514 -0.354845 -0.272221 -0.172243 -0.562583 0.450879 0.021417 -0.649397 -0.669281 nan nan -0.367023 0.538789 -0.544434 -0.362956 -0.358425 -0.396093 -0.497716 nan nan 0.085440 0.745223 nan 8 8
Q5SRI9 -0.274015 0.544164 -0.031676 -0.297146 -0.087155 -0.006899 -0.584189 -0.348800 0.733544 -0.259834 nan nan 0.162788 -0.502975 0.592005 0.136408 -0.410653 -0.088515 0.145306 nan nan 0.192704 -0.380808 nan 8 8
F8W9L4;I6LM06;P22607;P22607-2;P22607-3;P22607-4 -0.068033 -0.092526 0.369567 -0.196246 -0.151430 -0.052598 0.002257 0.171469 -0.213931 -0.361605 nan nan -0.081476 -0.079870 0.379222 0.573945 0.261162 0.538350 0.521283 nan nan 0.179061 0.086046 nan 8 8
Q9Y653;Q9Y653-2;Q9Y653-3 -0.130134 -0.642717 -0.721810 -0.073556 -0.417296 -0.396910 -0.604021 -0.672811 -0.191744 0.165965 nan nan -0.729686 0.103094 0.286967 0.024990 -0.308340 -0.282547 -0.567510 nan nan 0.406484 -0.099690 nan 8 8
Q9P232 -0.178684 -0.184960 -0.313992 -0.353787 -0.387374 -0.112035 -0.587127 -0.640844 -0.695382 -0.457551 nan nan -0.400998 -0.454553 0.256287 0.419846 0.042732 0.477733 -0.040308 nan nan -0.766501 -0.447343 nan 8 8
P18206;P18206-2 0.275209 0.405787 0.162550 0.222269 -0.189763 0.415991 -0.393250 -0.430189 0.404893 0.478030 nan nan -0.101734 -0.156234 0.168785 0.035300 -0.269676 -0.121760 -0.401545 nan nan -0.668040 0.105408 nan 8 8
P50395 0.013608 -0.184709 0.536306 0.036988 0.174813 0.072422 0.327166 0.193138 0.152363 0.368766 nan nan 0.016074 -0.298785 0.147720 0.026398 0.221468 0.045322 -0.487677 nan nan -0.756427 -0.524876 nan 9 9
P40121;P40121-2 -0.202385 0.336346 0.162169 -0.454180 0.354102 0.495110 0.424692 0.529660 0.206394 0.067427 nan nan -0.529026 -0.268866 0.396670 0.185199 0.093960 -0.359970 0.095978 nan nan -0.176933 -0.108032 nan 9 9
P10745 -0.236641 0.107034 -0.007321 0.386269 0.023283 -0.056744 0.468128 0.489925 -0.073139 0.328449 nan nan 0.136523 0.214013 0.597717 0.544391 0.049692 0.161125 -0.018697 nan nan 0.688639 -0.246050 nan 9 9
Q14019 -0.344741 0.053059 0.296340 -0.049153 0.165256 -0.131657 -0.725812 -0.693759 0.525948 -0.022084 nan nan 0.367858 0.139047 0.064769 0.401079 -0.204196 0.256334 -0.114108 nan nan -0.513253 -0.468898 nan 9 9
B1ALD9;Q15063;Q15063-3;Q15063-5 -0.101006 0.218129 -0.400116 -0.154466 -0.287475 -0.018221 -0.240490 -0.113355 -0.369493 0.319294 nan nan 0.425439 -0.554097 0.080040 0.086550 0.297180 0.619472 0.601499 nan nan 0.053837 -0.648727 nan 9 9
Q8TER0;Q8TER0-5 0.145885 0.304643 0.324222 -0.307674 -0.007321 -0.244573 0.237426 0.022913 0.301689 -0.095614 nan nan 0.184665 -0.273812 -0.061344 -0.079136 -0.155782 0.333547 0.139298 nan nan 0.242612 -0.073210 nan 9 9
B1AJR6;B1AJR9;B1AJS0;O14522 0.530251 0.011783 -0.350286 -0.382916 0.093327 -0.145017 -0.445185 -0.353217 0.644977 0.465073 nan nan -0.049114 0.076219 -0.304727 -0.305222 0.318337 0.183477 -0.002163 nan nan 0.348627 -0.076654 nan 9 9
E9PGA6;Q9BXJ4;Q9BXJ4-2;Q9BXJ4-3 0.124919 0.354347 0.561465 -0.255115 -0.190596 0.357955 0.189015 0.273937 -0.120507 0.106537 nan nan 0.095878 -0.704846 0.608922 0.429551 0.470608 0.323193 -0.060018 nan nan 0.096747 0.648033 nan 9 9
P08493;P08493-2 0.038081 -0.241962 -0.337063 0.017602 -0.232150 -0.353290 -0.635377 -0.703795 -0.351766 0.150211 nan nan 0.024427 0.540411 -0.406176 -0.246588 -0.031453 0.360308 -0.175179 nan nan -0.397982 -0.397982 nan 9 9
O43529 0.020461 0.354569 -0.235335 -0.235473 0.174158 0.015109 -0.013005 -0.117824 0.195129 0.128859 nan nan -0.187278 -0.156499 0.069547 0.283319 0.245811 0.566172 0.723000 nan nan -0.238252 -0.423734 nan 9 9
A0A087X1V2 -0.070690 0.401167 0.366961 0.190520 -0.243238 -0.473401 0.057184 0.003097 0.088347 -0.016904 nan nan 0.227076 0.360872 -0.181107 -0.345283 -0.317063 0.241761 -0.121066 nan nan 0.039602 -0.005971 nan 9 9
A0A0C4DH24 -0.413803 -0.367901 -0.467625 0.049570 -0.451837 -0.424799 -0.168096 -0.181333 -0.444924 0.152891 nan nan -0.499855 -0.194774 0.293136 0.105732 -0.240371 0.189404 0.600492 nan nan -0.601267 0.147397 nan 9 9
Q9Y6C2 0.340254 0.003971 0.269055 -0.107374 -0.272410 -0.320560 0.348677 0.614777 -0.561969 0.551982 nan nan 0.042545 0.617908 0.079803 0.126374 -0.474401 -0.186103 0.067290 nan nan 0.017971 -0.127935 nan 9 9
A0A1W2PQB1;H0Y755;M9MML0;P08637 -0.090323 0.538660 0.464256 0.565336 -0.253373 0.307366 0.460288 0.690784 0.595184 -0.139070 nan nan 0.331327 -0.091707 -0.081580 -0.266497 0.100051 0.540145 0.018180 nan nan -0.617375 0.448386 nan 10 10
Q6UWH4;Q6UWH4-2 -0.588718 -0.373212 -0.649532 -0.731985 0.098882 -0.408146 -0.397136 0.151063 -0.079928 -0.371243 nan nan -0.426708 -0.256199 -0.578150 -0.418792 -0.031383 0.143652 -0.182510 nan nan -0.126561 -0.059018 nan 10 10
P08670 -0.314275 0.000790 -0.704777 -0.477269 -0.179390 -0.339291 -0.501234 -0.180491 -0.411473 0.025687 nan nan 0.206552 -0.236680 0.551632 0.469208 0.504694 0.402038 0.622039 nan nan 0.028298 0.291405 nan 10 10
O76070 -0.432052 0.097098 0.075383 -0.054455 -0.215355 0.052178 -0.204646 0.162530 0.469000 0.054400 nan nan -0.278608 0.005612 -0.212317 -0.086282 -0.020171 -0.128095 0.035117 nan nan -0.474129 0.337800 nan 10 10
P55774 0.103249 0.014032 -0.020924 0.046070 -0.308916 -0.117385 -0.215141 -0.218578 -0.211760 -0.158074 nan nan -0.181675 -0.184327 -0.183622 -0.001697 0.172315 -0.506454 -0.186076 nan nan -0.584551 -0.551401 nan 11 11
F8WD41;Q15166 -0.202856 0.050659 -0.117144 0.334256 0.391304 0.321933 -0.359926 -0.291802 0.180810 -0.065243 nan nan 0.029724 -0.038424 -0.066574 0.265087 0.511526 0.363056 0.422421 nan nan 0.207901 0.110472 nan 11 11
P01742 0.375201 -0.133795 0.158134 0.273729 -0.267428 -0.314677 0.026400 0.271311 -0.587962 -0.094306 nan nan -0.393033 0.027556 0.676899 0.724786 0.094095 0.594179 0.084517 nan nan -0.194256 0.578374 nan 11 11
Q9NQS3;Q9NQS3-2;Q9NQS3-3 -0.424592 -0.174870 -0.233725 -0.431886 -0.139485 -0.058324 -0.111027 -0.140219 0.069189 -0.172248 nan nan 0.202371 0.094076 0.035925 0.120460 -0.014317 0.092650 0.013402 nan nan 0.156133 -0.174368 nan 11 11
Q9NZ08;Q9NZ08-2 0.032883 0.011742 -0.616331 0.352689 0.433371 0.201463 0.299292 0.489257 0.572066 0.260534 nan nan 0.348860 -0.322144 -0.357765 -0.168136 -0.080049 0.521799 0.576676 nan nan 0.202766 0.753444 nan 12 12
Q9BUJ0 -0.037313 -0.241301 -0.109426 0.237188 -0.314651 -0.201934 -0.098921 0.027051 -0.265561 -0.119432 nan nan -0.111523 0.254821 -0.202398 -0.237412 0.093479 -0.107921 -0.058248 nan nan 0.049800 -0.265784 nan 12 12
Q96RW7;Q96RW7-2 -0.274538 0.040786 -0.426774 -0.009779 -0.301343 -0.188272 -0.398478 -0.352505 0.219858 -0.008263 nan nan 0.625088 -0.322137 0.502826 0.258426 -0.144703 -0.017822 0.221795 nan nan 0.378646 0.234101 nan 12 12
A1L4H1 0.186316 -0.354492 -0.303958 -0.073844 0.309188 0.028659 0.072585 -0.116746 -0.221594 0.008499 nan nan -0.326665 -0.222907 0.300334 -0.324173 -0.272284 0.289843 -0.072788 nan nan -0.031547 -0.042797 nan 12 12
P29966 0.767339 -0.258546 0.774187 0.477068 0.078527 0.219639 0.720862 0.172931 0.306123 0.322428 nan nan 0.323159 0.241110 0.159322 0.397631 0.324776 0.358982 0.375901 nan nan 0.350638 0.249010 nan 13 13
E9PC84;P24821;P24821-4 -0.143665 0.364621 0.343768 -0.348903 0.196921 0.268681 -0.152219 -0.260515 0.109813 0.115043 nan nan -0.056667 -0.147116 -0.459416 -0.294693 -0.197709 -0.100782 0.509278 nan nan -0.175112 -0.288219 nan 13 13
B4DYV8;Q8WZ75;Q8WZ75-2;Q8WZ75-3 0.270149 0.464524 0.326030 0.540666 -0.317889 0.237184 0.130729 0.093670 -0.515780 0.120208 nan nan -0.440024 -0.289846 0.078349 0.291298 -0.199041 0.133283 0.084889 nan nan 0.085688 -0.033982 nan 14 14
Q6PCB0 -0.010818 -0.265970 -0.186697 -0.115342 -0.187322 -0.197913 -0.287276 -0.323757 -0.316441 0.132796 nan nan -0.011942 0.119986 -0.530040 -0.329471 0.194657 -0.188112 -0.156246 nan nan -0.023465 -0.115061 nan 14 14
P58401 0.064731 0.212851 0.267818 -0.382707 0.283526 0.288007 0.036887 -0.244870 0.294747 -0.400913 nan nan 0.448144 0.117253 -0.344344 -0.446814 -0.230384 -0.629203 -0.364833 nan nan 0.566115 0.385599 nan 14 14
O43852;O43852-3;O43852-5 0.193891 -0.059541 -0.128161 0.085625 -0.218009 -0.100403 0.295117 0.141397 0.293714 -0.062552 nan nan -0.155637 0.320761 0.614990 0.394764 -0.129205 0.574898 0.528409 nan nan -0.110063 -0.377160 nan 15 15
Q6ZMP0;Q6ZMP0-2 -0.257877 0.189618 -0.117451 -0.556357 0.132715 -0.033329 0.066672 -0.028928 -0.151054 -0.094501 nan nan -0.193571 -0.024305 0.414953 -0.432438 -0.249664 -0.152902 0.037846 nan nan 0.481989 0.031743 nan 16 16
P11597;P11597-2 0.091263 -0.501992 -0.563194 -0.331391 -0.244961 -0.503978 -0.071420 -0.029940 0.225605 -0.022021 nan nan -0.443310 0.053947 -0.211026 0.052557 0.090677 0.330721 0.413472 nan nan 0.400242 -0.067354 nan 16 16
Q96AQ6;Q96AQ6-2 0.074929 -0.134084 0.407666 -0.027920 -0.286627 0.103219 0.266461 0.377825 -0.047226 -0.184508 nan nan -0.159338 -0.486787 -0.158511 0.136220 0.006758 -0.189966 -0.267794 nan nan 0.554486 -0.006715 nan 16 16
A6XMH3;P01236;Q5I0G2 0.143374 -0.311822 -0.228508 -0.333240 -0.263686 -0.457319 0.249922 0.236575 -0.184831 0.126854 nan nan -0.456084 -0.222762 0.183538 0.124415 -0.016717 -0.746360 -0.586571 nan nan 0.415223 0.098237 nan 17 17
Q13231;Q13231-3 0.028238 0.145173 0.253618 0.011859 -0.248124 -0.063473 -0.144117 -0.108428 0.011950 0.128017 nan nan 0.134116 -0.064587 -0.284982 -0.276036 -0.219837 0.135271 0.134225 nan nan -0.001439 0.440431 nan 19 19

Error plot#

Hide code cell source

# Compute the metrics of the TOP_N_ORDER models against the 'observed'
# column and arrange them as a table with one column per model.
metrics = pimmslearn.models.Metrics()
test_metrics = pd.DataFrame(
    metrics.add_metrics(
        pred_test[['observed', *TOP_N_ORDER]],
        key='test data',
    )
)[TOP_N_ORDER]
test_metrics
Selected as truth to compare to: observed
BPCA VAE DAE CF TRKNN
MSE 0.455 0.481 0.478 0.472 0.500
MAE 0.432 0.437 0.436 0.458 0.458
N 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000
prop 1.000 1.000 1.000 1.000 1.000

Hide code cell source

# Take the first distinct value of the 'N' row as an integer
# (the code does not check that all models share a single N).
n_in_comparison = test_metrics.loc['N'].unique()[0]
n_in_comparison = int(n_in_comparison)
n_in_comparison
12600

Hide code cell source

# One-row frame holding the selected METRIC per model; the row is labelled
# with the name of `feature_names`.
_to_plot = (test_metrics
            .loc[METRIC]
            .to_frame()
            .T
            .set_axis([feature_names.name], axis=0))
_to_plot
BPCA VAE DAE CF TRKNN
protein groups 0.432 0.437 0.436 0.458 0.458

Hide code cell source

# Build a short annotation per model from its "latent_dim" and
# "hidden_layers" configuration entries.
try:
    text = (model_configs[["latent_dim", "hidden_layers"]]
            .apply(build_text, axis=1))
except KeyError:
    # Raised when the two configuration columns cannot be selected.
    logger.warning("No PIMMS models in comparsion. Using empty text")
    # NOTE(review): this fallback is indexed by `model_configs.columns`, while
    # the branch above yields one entry per row of `model_configs` — confirm
    # whether `.index` was intended. Entries that do not align with the
    # columns of `_to_plot` are blanked by the `fillna('')` below.
    text = pd.Series('', index=model_configs.columns)

# Add the annotations as an extra row; entries without text become ''.
_to_plot.loc["text"] = text
_to_plot = _to_plot.fillna('')
_to_plot
BPCA VAE DAE CF TRKNN
protein groups 0.432 0.437 0.436 0.458 0.458
text LD: 10 HL: 64 LD: 10 HL: 64 LD: 50

Hide code cell source

# Bar plot of the selected metric per model on the test data.
fig, ax = plt.subplots(figsize=(4, 2))  # size of the plot can be adjusted
ax = _to_plot.loc[[feature_names.name]].plot.bar(
    ax=ax,
    color=COLORS_TO_USE,
    width=.7,
    rot=0,
    ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY}\n({n_in_comparison:,} intensities)",
    # optional title:
    # title=f'performance on test data (based on {n_in_comparison:,} measurements)',
)
# Annotate the bars with their height and with the model description text.
ax = pimmslearn.plotting.add_height_to_barplot(ax, size=7)
ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=7)
ax.set_xticklabels([])  # no tick labels on the x-axis
fname = args.out_figures / f'2_{group}_performance_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_performance_test.pdf
_images/4c8ec3c15b3689860d048375b7588ee5663168b3462d1a16033752954adb4308.png

Hide code cell source

# Dump the plotted values in long format (one row per model) as a CSV next to
# the figure and register the file in `dumps`.
_csv_path = fname.with_suffix('.csv')
dumps[fname.stem] = _csv_path
_to_plot_long = (
    _to_plot.T
    .rename({feature_names.name: 'metric_value'}, axis=1)
    .assign(**{'data level': feature_names.name})
    .set_index('data level', append=True)
)
_to_plot_long.to_csv(_csv_path)

Plot error by median feature intensity#

Hide code cell source

# Plot the test-set error of the top-N models binned by the per-feature median
# of the training data, then save the figure and the underlying errors.
pimmslearn.plotting.make_large_descriptors(7)
fig, ax = plt.subplots(figsize=(8, 2))

_pred_cols = [TARGET_COL] + TOP_N_ORDER
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred=pred_test[_pred_cols],
    feat_medians=data.train_X.median(),
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC,
    palette=COLORS_TO_USE,
    ax=ax,
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
pimmslearn.plotting.make_large_descriptors(6)
fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

# Keep the errors returned by the plotting helper as CSV beside the figure.
_csv_path = fname.with_suffix('.csv')
dumps[fname.stem] = _csv_path
errors_binned.to_csv(_csv_path)
errors_binned
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: UserWarning: The palette list has more values (24) than needed (5), which may not be intended.
  sns.barplot(
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_000 A0A075B6P5;P01615 BPCA 0.467 19 912 19\n(N=912)
1 Sample_000 A0A075B6P5;P01615 VAE 0.300 19 912 19\n(N=912)
2 Sample_000 A0A075B6P5;P01615 DAE 0.106 19 912 19\n(N=912)
3 Sample_000 A0A075B6P5;P01615 CF 0.301 19 912 19\n(N=912)
4 Sample_000 A0A075B6P5;P01615 TRKNN 0.422 19 912 19\n(N=912)
... ... ... ... ... ... ... ...
62,995 Sample_209 Q9UGM5;Q9UGM5-2 BPCA 0.476 16 1,913 16\n(N=1,913)
62,996 Sample_209 Q9UGM5;Q9UGM5-2 VAE 0.400 16 1,913 16\n(N=1,913)
62,997 Sample_209 Q9UGM5;Q9UGM5-2 DAE 0.427 16 1,913 16\n(N=1,913)
62,998 Sample_209 Q9UGM5;Q9UGM5-2 CF 0.181 16 1,913 16\n(N=1,913)
62,999 Sample_209 Q9UGM5;Q9UGM5-2 TRKNN 0.442 16 1,913 16\n(N=1,913)

63000 rows × 7 columns

_images/a45244949d8c4b3ab4e388b03ce0a71f93c59c92a5e525028174e9db3b0d4939.png

Hide code cell source

# ! only used for reporting
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC
)
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=63) BPCA 0.619 0.504 0.744
1 11\n(N=63) CF 0.652 0.533 0.781
2 11\n(N=63) DAE 0.634 0.504 0.780
3 11\n(N=63) TRKNN 0.589 0.482 0.698
4 11\n(N=63) VAE 0.611 0.482 0.742
... ... ... ... ... ...
85 29\n(N=12) BPCA 0.133 0.065 0.212
86 29\n(N=12) CF 0.207 0.129 0.296
87 29\n(N=12) DAE 0.154 0.100 0.227
88 29\n(N=12) TRKNN 0.230 0.158 0.317
89 29\n(N=12) VAE 0.155 0.084 0.238

90 rows × 5 columns

Hide code cell source

# Errors of the first model in ORDER_MODELS, indexed by the bin label (last
# column of `errors_binned`) and sorted ascending by METRIC.
_bin_col = errors_binned.columns[-1]
_errors_by_model = errors_binned.set_index(['model', _bin_col])
_errors_by_model.loc[ORDER_MODELS[0]].sort_values(by=METRIC)
Sample ID protein groups MAE bin n_obs
intensity binned by median of protein groups
18\n(N=846) Sample_142 P09972 0.000 18 846
15\n(N=2,557) Sample_021 A0A0A0MT66 0.000 15 2,557
14\n(N=2,074) Sample_058 Q16853;Q16853-2 0.000 14 2,074
16\n(N=1,913) Sample_015 B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 0.000 16 1,913
15\n(N=2,557) Sample_079 A6NCT7;Q07092;Q07092-2 0.000 15 2,557
... ... ... ... ... ...
14\n(N=2,074) Sample_011 P11597;P11597-2 5.771 14 2,074
14\n(N=2,074) Sample_184 F8WD41;Q15166 6.195 14 2,074
17\n(N=1,393) Sample_108 P27824;P27824-2 6.482 17 1,393
14\n(N=2,074) Sample_091 F8WD41;Q15166 6.823 14 2,074
14\n(N=2,074) Sample_115 P17050 7.635 14 2,074

12600 rows × 5 columns

Custom model selection#

Hide code cell source

# Repeat the test-set performance summary for the custom model selection.
if SEL_MODELS:
    metrics = pimmslearn.models.Metrics()
    test_metrics = metrics.add_metrics(
        pred_test[['observed', *SEL_MODELS]], key='test data')
    test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS]
    # Bare expressions are not rendered inside an `if` block, so the table is
    # displayed explicitly.
    display(test_metrics)

    # First distinct value of the 'N' row is used as the number of intensities.
    n_in_comparison = int(test_metrics.loc['N'].unique()[0])

    # One-row frame with the selected METRIC per model.
    _to_plot = test_metrics.loc[METRIC].to_frame().T
    _to_plot.index = [feature_names.name]

    try:
        text = model_configs[["latent_dim", "hidden_layers"]].apply(
            build_text,
            axis=1)
    except KeyError:
        logger.warning("No PIMMS models in comparsion. Using empty text")
        # Index the fallback by model name so it aligns with `_to_plot`.
        text = pd.Series('', index=_to_plot.columns)

    # Models without a description end up with an empty string.
    _to_plot.loc["text"] = text
    _to_plot = _to_plot.fillna('')
    display(_to_plot)

    fig, ax = plt.subplots(figsize=(4, 2))
    ax = _to_plot.loc[[feature_names.name]].plot.bar(
        rot=0,
        ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)",
        color=pimmslearn.plotting.defaults.assign_colors(
            [k.upper() for k in SEL_MODELS]),
        ax=ax,
        width=.7)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    ax = pimmslearn.plotting.add_height_to_barplot(ax, size=5)
    ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5)
    ax.set_xticklabels([])

    fname = args.out_figures / f'2_{group}_performance_test_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(fig, name=fname)

    # Dump the plotted values in long format (one row per model) as CSV.
    dumps[fname.stem] = fname.with_suffix('.csv')
    _to_plot_long = _to_plot.T
    _to_plot_long = _to_plot_long.rename(
        {feature_names.name: 'metric_value'}, axis=1)
    _to_plot_long['data level'] = feature_names.name
    _to_plot_long = _to_plot_long.set_index('data level', append=True)
    _to_plot_long.to_csv(fname.with_suffix('.csv'))

Hide code cell source

# custom selection
# Plot the test-set error of the selected models binned by the per-feature
# median of the training data, then save figure, CSV and reporting table.
if SEL_MODELS:
    pimmslearn.plotting.make_large_descriptors(7)
    fig, ax = plt.subplots(figsize=(8, 2))

    ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
        pred=pred_test[
            [TARGET_COL] + SEL_MODELS
        ],
        feat_medians=data.train_X.median(),
        ax=ax,
        metric_name=METRIC,
        feat_name=FEAT_NAME_DISPLAY,
        palette=pimmslearn.plotting.defaults.assign_colors(
            [k.upper() for k in SEL_MODELS])
    )
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(ax.get_figure(), name=fname)
    # pyplot.show() takes no figure argument (only keyword `block`); a
    # positional figure is misread by the inline backend and is not part of
    # the documented signature.
    plt.show()

    dumps[fname.stem] = fname.with_suffix('.csv')
    errors_binned.to_csv(fname.with_suffix('.csv'))
    pimmslearn.plotting.make_large_descriptors(6)

    # ! only used for reporting
    plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
        errors=errors_binned,
        feat_name=FEAT_NAME_DISPLAY,
        metric_name=METRIC
    )
    plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
    # Explicit display is required: bare expressions are not rendered inside `if`.
    display(plotted)

Error by integer part of the intensity#

  • number of observations in parentheses.

Hide code cell source

# Plot the test-set error of the top-N models with pimmslearn's
# plot_errors_binned helper, then save and register the figure.
fig, ax = plt.subplots(figsize=(8, 2))
_pred_cols = [TARGET_COL] + TOP_N_ORDER
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_binned(
    pred_test[_pred_cols],
    ax=ax,
    metric_name=METRIC,
    palette=TOP_N_COLOR_PALETTE,
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))

fname = args.out_figures / f'2_{group}_test_errors_binned_by_int.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:50: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  ax = sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf
_images/50da5fccb91da7b4a3d4c8628f89e206b73c83da69c1174e475582d008513aac.png

Hide code cell source

# Store the binned errors as CSV beside the figure and preview the first rows.
_csv_path = fname.with_suffix('.csv')
dumps[fname.stem] = _csv_path
errors_binned.to_csv(_csv_path)
errors_binned.head()
Sample ID protein groups model MAE intensity bin
0 Sample_143 P02768 BPCA 0.065 30\n(N=2)
1 Sample_143 P02768 VAE 0.300 30\n(N=2)
2 Sample_143 P02768 DAE 0.244 30\n(N=2)
3 Sample_143 P02768 CF 0.445 30\n(N=2)
4 Sample_143 P02768 TRKNN 0.574 30\n(N=2)

Figures dumped to disk#

Hide code cell source

# Mapping of figure file stem -> path of every figure registered above.
figures
{'2_1_fake_na_val_test_splits': Path('runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png'),
 '2_1_pred_corr_val_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf'),
 '2_1_errors_binned_by_feat_median_val': Path('runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf'),
 '2_1_intensity_binned_top_4_models_test': Path('runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf'),
 '2_1_pred_corr_test_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf'),
 '2_1_pred_corr_test_per_feat': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf'),
 '2_1_performance_test': Path('runs/alzheimer_study/figures/2_1_performance_test.pdf'),
 '2_1_test_errors_binned_by_feat_medians': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf'),
 '2_1_test_errors_binned_by_int': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf')}

Hide code cell source

# A bare `dumps` is not the last expression of the cell and would not be
# rendered; display the mapping of dumped data files explicitly.
display(dumps)
print("done")
done