Compare models#

  1. Load available configurations

  2. Load validation predictions

    • calculate absolute error

    • rank models by MAE from smallest (best) to largest (worst) and select the top N for plotting (N as specified, default 5; a schematic sketch follows this list)

    • correlation per sample, correlation per feature, and overall correlation

    • MAE plots

  3. Load test data predictions

    • as for validation data

    • top N based on validation data
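A schematic sketch of the ranking used in steps 2 and 3 (the cells below do this with pimmslearn helpers; the sketch only assumes a long-format DataFrame with an 'observed' column and one prediction column per model):

import pandas as pd


def rank_models_by_mae(pred: pd.DataFrame,
                       target_col: str = 'observed',
                       top_n: int = 5) -> list:
    """Order models by mean absolute error on simulated missing values, best first."""
    errors = pred.drop(columns=target_col).sub(pred[target_col], axis=0)
    order = errors.abs().mean().sort_values().index.to_list()
    return order[:top_n]

# e.g. top_models = rank_models_by_mae(pred_val, top_n=5)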

Hide code cell source

import logging
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from IPython.display import display

import pimmslearn
import pimmslearn.imputation
import pimmslearn.models
import pimmslearn.nb
from pimmslearn.analyzers import compare_predictions
from pimmslearn.io import datasplits
from pimmslearn.models.collect_dumps import collect, select_content

pd.options.display.max_rows = 30
pd.options.display.min_rows = 10
pd.options.display.max_colwidth = 100

plt.rcParams.update({'figure.figsize': (4, 2)})
pimmslearn.plotting.make_large_descriptors(7)

logger = pimmslearn.logging.setup_nb_logger()
logging.getLogger('fontTools').setLevel(logging.WARNING)


def load_config_file(fname: Path, first_split='config_') -> dict:
    with open(fname) as f:
        loaded = yaml.safe_load(f)
    key = f"{select_content(fname.stem, first_split=first_split)}"
    return key, loaded


def build_text(s):
    ret = ''
    if not np.isnan(s["latent_dim"]):
        ret += f'LD: {int(s["latent_dim"])} '
    try:
        if len(s["hidden_layers"]):
            t = ",".join(str(x) for x in s["hidden_layers"])
            ret += f"HL: {t}"
    except TypeError:
        # nan
        pass
    return ret
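As an illustration (hypothetical config rows, assuming the cell above has been executed), `build_text` produces annotation strings of the kind used for the bar plot annotations:

import numpy as np
import pandas as pd

vae_like = pd.Series({'latent_dim': 10.0, 'hidden_layers': [64]})
median_like = pd.Series({'latent_dim': np.nan, 'hidden_layers': np.nan})
print(build_text(vae_like))     # -> 'LD: 10 HL: 64'
print(build_text(median_like))  # -> '' (nothing to annotate)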

Hide code cell source

# catch passed parameters
args = None
args = dict(globals()).keys()

Papermill script parameters:

# files and folders
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
folder_data: str = ''  # specify data directory if needed
file_format: str = 'csv'  # file format of the data splits; default is csv
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
models: str = 'Median,CF,DAE,VAE'  # picked models to compare (comma separated)
sel_models: str = ''  # user defined comparison (comma separated)
# Restrict plotting to top N methods for imputation based on error of validation data, maximum 10
plot_to_n: int = 5
feat_name_display: str = None  # display name for feature name in plural (e.g. 'protein groups')
save_agg_pred: bool = False  # save aggregated predictions of validation and test data
# Parameters
fn_rawfile_metadata = "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv"
folder_experiment = "runs/alzheimer_study"
models = "Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO"
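The values under `# Parameters` above are injected at execution time. As a hedged illustration of how such a run can be launched via papermill's Python API (notebook and output file names are hypothetical; the surrounding workflow may invoke the notebook differently):

import papermill as pm

pm.execute_notebook(
    '01_2_performance_plots.ipynb',                       # input notebook (hypothetical name)
    'runs/alzheimer_study/01_2_performance_plots.ipynb',  # executed copy
    parameters=dict(
        folder_experiment='runs/alzheimer_study',
        fn_rawfile_metadata='https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
        models='Median,CF,DAE,VAE,KNN',
    ),
)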

Some argument transformations

Hide code cell source

args = pimmslearn.nb.get_params(args, globals=globals())
args
root - INFO     Removed from global namespace: folder_experiment
root - INFO     Removed from global namespace: folder_data
root - INFO     Removed from global namespace: file_format
root - INFO     Removed from global namespace: fn_rawfile_metadata
root - INFO     Removed from global namespace: models
root - INFO     Removed from global namespace: sel_models
root - INFO     Removed from global namespace: plot_to_n
root - INFO     Removed from global namespace: feat_name_display
root - INFO     Removed from global namespace: save_agg_pred
{'folder_experiment': 'runs/alzheimer_study',
 'folder_data': '',
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'sel_models': '',
 'plot_to_n': 5,
 'feat_name_display': None,
 'save_agg_pred': False}

Hide code cell source

args = pimmslearn.nb.args_from_dict(args)
args
{'data': Path('runs/alzheimer_study/data'),
 'feat_name_display': None,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'folder_data': '',
 'folder_experiment': Path('runs/alzheimer_study'),
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'out_figures': Path('runs/alzheimer_study/figures'),
 'out_folder': Path('runs/alzheimer_study'),
 'out_metrics': Path('runs/alzheimer_study'),
 'out_models': Path('runs/alzheimer_study'),
 'out_preds': Path('runs/alzheimer_study/preds'),
 'plot_to_n': 5,
 'save_agg_pred': False,
 'sel_models': ''}

Hide code cell source

figures = {}
dumps = {}

Hide code cell source

TARGET_COL = 'observed'
METRIC = 'MAE'
MIN_FREQ = None
MODELS_PASSED = args.models.split(',')
MODELS = MODELS_PASSED.copy()
FEAT_NAME_DISPLAY = args.feat_name_display
SEL_MODELS = None
if args.sel_models:
    SEL_MODELS = args.sel_models.split(',')

Hide code cell source

# list(sns.color_palette().as_hex()) # string representation of colors
if args.plot_to_n > 10:
    logger.warning("Set maximum number of models to plot to 10")
    args.overwrite_entry('plot_to_n', 10)

Hide code cell source

data = datasplits.DataSplits.from_folder(
    args.data, file_format=args.file_format)
pimmslearn.io.datasplits - INFO     Loaded 'train_X' from file: runs/alzheimer_study/data/train_X.csv
pimmslearn.io.datasplits - INFO     Loaded 'val_y' from file: runs/alzheimer_study/data/val_y.csv
pimmslearn.io.datasplits - INFO     Loaded 'test_y' from file: runs/alzheimer_study/data/test_y.csv

Hide code cell source

fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)

pimmslearn.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],
                                     title='Validation split', size=1, xlabel='')
pimmslearn.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],
                                     title='Test split', size=1, xlabel='')
fig.suptitle("Simulated missing values per sample", size=8)
# hide axis and use only for common x label
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)
plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')
group = 1
fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png
[Figure: simulated missing values per sample, validation and test splits]

Data completeness across the entire dataset#

Hide code cell source

# load frequency of training features
# (the dump format needs to preserve the index name)
freq_feat = pimmslearn.io.datasplits.load_freq(args.data, file='freq_features.json')
freq_feat.head()  # training data
A0A024QZX5;A0A087X1N8;P35237                                                     197
A0A024R0T9;K7ER74;P02655                                                         208
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8   185
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                          208
A0A075B6H7                                                                        97
Name: freq, dtype: int64

Hide code cell source

prop = freq_feat / len(data.train_X.index.levels[0])
prop.sort_values().to_frame().plot(
    xlabel=f'{data.val_y.index.names[-1]}',
    ylabel='Proportion of identification in samples')
<Axes: xlabel='protein groups', ylabel='Proportion of identification in samples'>
[Figure: proportion of samples in which each protein group is identified]

View training data in wide format

Hide code cell source

data.to_wide_format()
data.train_X
protein groups A0A024QZX5;A0A087X1N8;P35237 A0A024R0T9;K7ER74;P02655 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 A0A075B6H7 A0A075B6H9 A0A075B6I0 A0A075B6I1 A0A075B6I6 A0A075B6I9 ... Q9Y653;Q9Y653-2;Q9Y653-3 Q9Y696 Q9Y6C2 Q9Y6N6 Q9Y6N7;Q9Y6N7-2;Q9Y6N7-4 Q9Y6R7 Q9Y6X5 Q9Y6Y8;Q9Y6Y8-2 Q9Y6Y9 S4R3U6
Sample ID
Sample_000 15.912 16.852 15.570 16.481 17.301 20.246 16.764 17.584 16.988 20.054 ... 16.012 15.178 NaN 15.050 16.842 NaN NaN 19.563 NaN 12.805
Sample_001 NaN 16.874 15.519 16.387 NaN 19.941 18.786 17.144 NaN 19.067 ... 15.528 15.576 NaN 14.833 16.597 20.299 15.556 19.386 13.970 12.442
Sample_002 16.111 NaN 15.935 16.416 18.175 19.251 16.832 15.671 17.012 18.569 ... 15.229 14.728 13.757 15.118 17.440 19.598 15.735 20.447 12.636 12.505
Sample_003 16.107 17.032 15.802 16.979 15.963 19.628 17.852 18.877 14.182 18.985 ... 15.495 14.590 14.682 15.140 17.356 19.429 NaN 20.216 NaN 12.445
Sample_004 15.603 15.331 15.375 16.679 NaN 20.450 18.682 17.081 14.140 19.686 ... 14.757 NaN NaN 15.256 17.075 19.582 15.328 NaN 13.145 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 15.682 16.886 14.910 16.482 NaN 17.705 17.039 NaN 16.413 19.102 ... NaN 15.684 14.236 15.415 17.551 17.922 16.340 19.928 12.929 NaN
Sample_206 15.798 17.554 15.600 15.938 NaN 18.154 18.152 16.503 16.860 18.538 ... 15.422 16.106 NaN 15.345 17.084 18.708 NaN 19.433 NaN NaN
Sample_207 15.739 NaN 15.469 16.898 NaN 18.636 17.950 16.321 16.401 18.849 ... 15.808 16.098 14.403 15.715 NaN 18.725 16.138 19.599 13.637 11.174
Sample_208 15.477 16.779 14.995 16.132 NaN 14.908 NaN NaN 16.119 18.368 ... 15.157 16.712 NaN 14.640 16.533 19.411 15.807 19.545 NaN NaN
Sample_209 NaN 17.261 15.175 16.235 NaN 17.893 17.744 16.371 15.780 18.806 ... 15.237 15.652 15.211 14.205 16.749 19.275 15.732 19.577 11.042 11.791

210 rows × 1421 columns

Number of samples and features:

Hide code cell source

N_SAMPLES, M_FEAT = data.train_X.shape
print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}")
N samples: 210, M features: 1421

Collect outputs in an Excel file:

Hide code cell source

fname = args.folder_experiment / '01_2_performance_summary.xlsx'
dumps[fname.stem] = fname
writer = pd.ExcelWriter(fname)
print(f"Saving to: {fname}")
Saving to: runs/alzheimer_study/01_2_performance_summary.xlsx

Model specifications#

  • used for bar plot annotations

Hide code cell source

# model_key could be used as key from config file
# ? load only specified configs?
# ? case: no config file available?
all_configs = collect(
    paths=(fname for fname in args.out_models.iterdir()
           if fname.suffix == '.yaml'
           and 'model_config' in fname.name),
    load_fn=load_config_file
)
model_configs = pd.DataFrame(all_configs).set_index('id')
model_configs.T.to_excel(writer, sheet_name='model_params')
model_configs.T
id Median CF KNN KNN5 VAE DAE
M 1421 1421 1421 1421 1421 1421
data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data
file_format csv csv csv csv csv csv
fn_rawfile_metadata https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv
folder_experiment runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
meta_cat_col NaN NaN NaN NaN NaN NaN
meta_date_col NaN NaN NaN NaN NaN NaN
model Median CF KNN KNN VAE DAE
model_key Median CF KNN KNN5 VAE DAE
n_params 1421 83283 1 1 277998 184983
out_figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures
out_folder runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_metrics runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_models runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds
pred_test_Median runs/alzheimer_study/preds/pred_test_Median.csv NaN NaN NaN NaN NaN
pred_val_Median runs/alzheimer_study/preds/pred_val_Median.csv NaN NaN NaN NaN NaN
sample_idx_position 0 0 0 0 0 0
save_pred_real_na True True True True True True
batch_size NaN 1,024.000 64.000 64.000 64.000 64.000
cuda NaN False True True False False
epoch_trained NaN 14.000 NaN NaN 133.000 78.000
epochs_max NaN 100.000 50.000 50.000 300.000 300.000
folder_data NaN
latent_dim NaN 50.000 NaN NaN 10.000 10.000
patience NaN 1.000 NaN NaN 50.000 25.000
force_train NaN NaN True True NaN NaN
neighbors NaN NaN 3.000 5.000 NaN NaN
hidden_layers NaN NaN NaN NaN [64] [64]

Set feature name and sample index name (columns are features, rows are samples)

Hide code cell source

# index name
freq_feat.index.name = data.train_X.columns.name
# sample index name
sample_index_name = data.train_X.index.name

Load predictions on validation and test data split#

Validation data#

  • set top N models to plot based on validation data split

Hide code cell source

pred_val = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='val',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL])
SAMPLE_ID, FEAT_NAME = pred_val.index.names
if not FEAT_NAME_DISPLAY:
    FEAT_NAME_DISPLAY = FEAT_NAME
pred_val[MODELS]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 15.752 15.406 15.730 15.809 15.427 15.449 15.469 16.800 NaN 58.276 ... 7.068 11.317 2,513.638 13.354 15.062 15.476 15.752 17.206 15.700 0
Sample_050 Q9Y287 17.221 16.554 16.781 16.745 17.776 17.314 16.453 17.288 NaN 16.993 ... 7.068 13.053 19.829 14.536 14.579 16.870 17.221 17.807 16.738 0
Sample_107 Q8N475;Q8N475-2 14.846 14.013 14.507 14.855 14.150 14.355 13.110 17.187 NaN -78.084 ... 7.068 11.832 2,582.130 12.372 13.287 14.672 14.846 17.434 13.776 0
Sample_199 P06307 18.973 19.567 19.164 18.962 19.247 19.385 19.639 16.711 NaN 102.283 ... 7.068 12.766 2,483.120 12.309 17.477 19.103 18.973 17.111 19.015 0
Sample_067 Q5VUB5 14.726 16.089 15.268 14.887 15.232 15.040 15.465 16.743 NaN -36.470 ... 7.068 11.715 2,569.564 11.997 12.216 14.779 14.726 17.031 14.699 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.918 23.107 23.005 22.788 22.884 22.899 22.994 17.042 NaN 104.484 ... 7.068 11.273 2,634.108 14.133 22.276 22.788 22.918 17.330 22.872 0
Sample_002 A0A0A0MT36 15.877 15.516 15.694 15.909 16.857 16.142 15.882 16.792 NaN -18.408 ... 7.068 13.566 2,448.503 12.647 13.552 16.020 15.877 16.879 15.671 0
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 16.278 15.117 15.847 15.770 15.840 15.574 15.406 17.032 NaN -27.128 ... 7.068 12.344 2,487.550 11.850 14.976 15.538 16.278 17.215 15.574 0
Sample_182 Q8NFT8 13.995 12.802 13.751 13.472 13.685 13.480 14.322 16.764 NaN -12.434 ... 7.068 13.018 2,426.191 11.593 12.188 13.446 13.995 17.125 14.518 0
Sample_123 Q16853;Q16853-2 14.849 14.356 14.545 14.590 14.612 14.627 14.582 16.686 NaN 78.799 ... 7.068 14.303 2,461.806 13.110 13.865 14.638 14.849 16.981 14.485 0

12600 rows × 24 columns

Compute the per-model errors (prediction minus observed value); the absolute errors are summarized below.

Hide code cell source

errors_val = (pred_val
              .drop(TARGET_COL, axis=1)
              .sub(pred_val[TARGET_COL], axis=0)
              [MODELS])
errors_val  # over all samples and all features
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 1.122 0.775 1.100 1.178 0.797 0.819 0.839 2.169 NaN 43.645 ... -7.562 -3.314 2,499.008 -1.276 0.431 0.845 1.122 2.575 1.070 -14.630
Sample_050 Q9Y287 1.466 0.800 1.026 0.990 2.021 1.559 0.698 1.533 NaN 1.238 ... -8.687 -2.702 4.074 -1.219 -1.176 1.115 1.466 2.052 0.983 -15.755
Sample_107 Q8N475;Q8N475-2 -0.183 -1.016 -0.522 -0.175 -0.880 -0.674 -1.919 2.157 NaN -93.113 ... -7.961 -3.198 2,567.100 -2.657 -1.743 -0.358 -0.183 2.405 -1.253 -15.029
Sample_199 P06307 -0.403 0.191 -0.211 -0.414 -0.129 0.009 0.263 -2.665 NaN 82.907 ... -12.308 -6.610 2,463.744 -7.067 -1.898 -0.272 -0.403 -2.265 -0.360 -19.376
Sample_067 Q5VUB5 -0.583 0.780 -0.041 -0.422 -0.077 -0.269 0.156 1.434 NaN -51.779 ... -8.241 -3.594 2,554.255 -3.311 -3.092 -0.530 -0.583 1.723 -0.610 -15.309
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 0.096 0.285 0.183 -0.034 0.062 0.077 0.171 -5.781 NaN 81.662 ... -15.754 -11.549 2,611.285 -8.689 -0.547 -0.034 0.096 -5.493 0.050 -22.822
Sample_002 A0A0A0MT36 -2.288 -2.649 -2.471 -2.256 -1.308 -2.023 -2.283 -1.373 NaN -36.573 ... -11.097 -4.599 2,430.338 -5.518 -4.613 -2.145 -2.288 -1.286 -2.494 -18.165
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 0.753 -0.408 0.322 0.245 0.314 0.049 -0.120 1.507 NaN -42.653 ... -8.457 -3.181 2,472.025 -3.675 -0.549 0.013 0.753 1.690 0.049 -15.525
Sample_182 Q8NFT8 -0.383 -1.576 -0.627 -0.907 -0.694 -0.899 -0.057 2.385 NaN -26.813 ... -7.311 -1.361 2,411.812 -2.786 -2.191 -0.933 -0.383 2.746 0.139 -14.379
Sample_123 Q16853;Q16853-2 0.345 -0.148 0.041 0.085 0.108 0.123 0.077 2.181 NaN 64.295 ... -7.436 -0.201 2,447.302 -1.394 -0.639 0.134 0.345 2.477 -0.019 -14.504

12600 rows × 24 columns

Select top N for plotting and set colors#

Hide code cell source

ORDER_MODELS = (errors_val
                .abs()
                .mean()
                .sort_values()
                .index
                .to_list())
ORDER_MODELS
['BPCA',
 'VAE',
 'DAE',
 'TRKNN',
 'RF',
 'KNN5',
 'CF',
 'KNN',
 'KNN_IMPUTE',
 'IRM',
 'ROWMEDIAN',
 'Median',
 'LLS',
 'QRILC',
 'COLMEDIAN',
 'SVDMETHOD',
 'PI',
 'MINDET',
 'MINPROB',
 'MINIMUM',
 'ZERO',
 'IMPSEQROB',
 'MLE',
 'IMPSEQ']

Hide code cell source

pred_val = pred_val[[TARGET_COL] + ORDER_MODELS]
if args.save_agg_pred:
    fname = args.folder_experiment / '01_2_agg_pred_val.csv'
    dumps[fname.stem] = fname
    pred_val.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_val
observed BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 14.630 15.469 15.809 15.730 15.700 15.476 15.449 15.406 15.427 15.937 ... 16.800 17.206 13.354 11.916 11.317 7.068 0 58.276 2,513.638 NaN
Sample_050 Q9Y287 15.755 16.453 16.745 16.781 16.738 16.870 17.314 16.554 17.776 16.961 ... 17.288 17.807 14.536 12.900 13.053 7.068 0 16.993 19.829 NaN
Sample_107 Q8N475;Q8N475-2 15.029 13.110 14.855 14.507 13.776 14.672 14.355 14.013 14.150 15.437 ... 17.187 17.434 12.372 12.313 11.832 7.068 0 -78.084 2,582.130 NaN
Sample_199 P06307 19.376 19.639 18.962 19.164 19.015 19.103 19.385 19.567 19.247 18.861 ... 16.711 17.111 12.309 12.285 12.766 7.068 0 102.283 2,483.120 NaN
Sample_067 Q5VUB5 15.309 15.465 14.887 15.268 14.699 14.779 15.040 16.089 15.232 15.079 ... 16.743 17.031 11.997 11.827 11.715 7.068 0 -36.470 2,569.564 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.822 22.994 22.788 23.005 22.872 22.788 22.899 23.107 22.884 22.837 ... 17.042 17.330 14.133 12.161 11.273 7.068 0 104.484 2,634.108 NaN
Sample_002 A0A0A0MT36 18.165 15.882 15.909 15.694 15.671 16.020 16.142 15.516 16.857 15.446 ... 16.792 16.879 12.647 12.586 13.566 7.068 0 -18.408 2,448.503 NaN
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 15.525 15.406 15.770 15.847 15.574 15.538 15.574 15.117 15.840 15.995 ... 17.032 17.215 11.850 12.352 12.344 7.068 0 -27.128 2,487.550 NaN
Sample_182 Q8NFT8 14.379 14.322 13.472 13.751 14.518 13.446 13.480 12.802 13.685 14.675 ... 16.764 17.125 11.593 12.504 13.018 7.068 0 -12.434 2,426.191 NaN
Sample_123 Q16853;Q16853-2 14.504 14.582 14.590 14.545 14.485 14.638 14.627 14.356 14.612 14.824 ... 16.686 16.981 13.110 12.689 14.303 7.068 0 78.799 2,461.806 NaN

12600 rows × 25 columns

Hide code cell source

mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS]
mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f')
mae_stats_ordered_val.T
count mean std min 25% 50% 75% max
BPCA 12,600.000 0.422 0.501 0.000 0.119 0.269 0.534 9.370
VAE 12,600.000 0.431 0.521 0.000 0.122 0.273 0.538 8.939
DAE 12,600.000 0.438 0.522 0.000 0.123 0.280 0.549 9.290
TRKNN 12,600.000 0.450 0.516 0.000 0.132 0.295 0.569 7.975
RF 12,600.000 0.460 0.532 0.000 0.134 0.302 0.581 8.430
KNN5 12,600.000 0.467 0.546 0.000 0.135 0.305 0.594 10.231
CF 12,600.000 0.467 0.503 0.000 0.148 0.327 0.605 6.683
KNN 12,600.000 0.481 0.565 0.000 0.138 0.310 0.618 10.502
KNN_IMPUTE 12,600.000 0.554 0.668 0.000 0.164 0.359 0.692 7.550
IRM 12,600.000 0.588 0.637 0.000 0.176 0.396 0.767 7.953
ROWMEDIAN 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
Median 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
LLS 12,600.000 1.329 54.974 0.000 0.151 0.343 0.662 4,842.571
QRILC 12,600.000 1.651 1.301 0.000 0.828 1.350 2.084 15.546
COLMEDIAN 12,600.000 2.210 1.634 0.000 0.947 1.972 3.094 12.944
SVDMETHOD 12,600.000 2.309 1.635 0.000 1.027 2.091 3.251 12.624
PI 12,600.000 3.817 2.648 0.000 1.782 3.357 5.356 17.563
MINDET 12,600.000 4.108 2.650 0.001 2.089 3.678 5.665 17.920
MINPROB 12,600.000 4.127 2.689 0.000 2.077 3.715 5.723 18.711
MINIMUM 12,600.000 9.272 2.717 0.373 7.327 8.890 10.863 22.773
ZERO 12,600.000 16.340 2.717 6.695 14.395 15.958 17.931 29.841
IMPSEQROB 12,600.000 333.478 793.700 0.002 12.282 33.864 87.298 2,869.299
MLE 12,600.000 2,172.384 865.925 0.009 2,435.415 2,495.362 2,552.718 2,873.681
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Some models have fixed colors, others are assigned randomly

Note

  1. The order of “new” models is important for the color assignment.

  2. User-defined model keys for the same model with two configurations will yield different colors.

Hide code cell source

COLORS_TO_USE = pimmslearn.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS))
pimmslearn.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE)
pimmslearn.plotting.defaults - INFO     Reused some colors!
[Color swatches in order: BPCA, VAE, DAE, TRKNN, RF, KNN5, CF, KNN, KNN_IMPUTE, IRM, ROWMEDIAN, Median, LLS, QRILC, COLMEDIAN, SVDMETHOD, PI, MINDET, MINPROB, MINIMUM, ZERO, IMPSEQROB, MLE, IMPSEQ]

Hide code cell source

TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n]
TOP_N_COLOR_PALETTE = {model: color for model,
                       color in zip(TOP_N_ORDER, COLORS_TO_USE)}
TOP_N_ORDER
['BPCA', 'VAE', 'DAE', 'TRKNN', 'RF']

Correlation per sample#

Hide code cell source

corr_per_sample_val = (pred_val
                       .groupby(sample_index_name)
                       .apply(
                           lambda df: df.corr().loc[TARGET_COL]
                       )[ORDER_MODELS])

min_corr = int(corr_per_sample_val.min().min() * 10) / 10
kwargs = dict(ylim=(min_corr, 1), rot=90,
              #     boxprops=dict(linewidth=1.5),
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model pred. per sample on validation data',
              ylabel='correlation per sample')
ax = corr_per_sample_val[TOP_N_ORDER].plot.box(**kwargs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.xlsx'
dumps[fname.stem] = fname
with pd.ExcelWriter(fname) as w:
    corr_per_sample_val.describe().to_excel(w, sheet_name='summary')
    corr_per_sample_val.to_excel(w, sheet_name='correlations')
    corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf
[Figure: box plot of per-sample correlations on the validation split, top N models]

Identify samples which fall below the lower whisker for any of the plotted models

Hide code cell source

threshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_val[TOP_N_ORDER]).min()
mask = (corr_per_sample_val[TOP_N_ORDER] < threshold).any(axis=1)
corr_per_sample_val.loc[mask].style.highlight_min(
    axis=1) if mask.sum() else 'Nothing to display'
observed BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID                                                
Sample_010 0.948707 0.934150 0.942435 0.944645 0.922210 0.940594 0.939263 0.946115 0.932235 0.932518 0.869102 0.869102 0.913668 0.833271 nan 0.065802 0.133658 nan -0.154396 nan nan 0.589151 nan nan
Sample_018 0.968582 0.938361 0.916764 0.953573 0.923735 0.925531 0.962830 0.938822 0.939383 0.952858 0.908717 0.908717 0.936909 0.900491 nan 0.161907 0.031425 nan -0.102648 nan nan 0.375658 nan nan
Sample_033 0.960627 0.949817 0.961759 0.954568 0.928322 0.936031 0.951607 0.928838 0.744119 0.927209 0.814959 0.814959 0.949288 0.629308 nan 0.227900 -0.101158 nan 0.200408 nan nan 0.049356 nan nan
Sample_054 0.932254 0.908866 0.930156 0.910271 0.927239 0.936915 0.932995 0.925876 0.905679 0.913765 0.915748 0.915748 0.929264 0.782515 nan 0.190649 0.152989 nan -0.061943 nan nan 0.836204 nan nan
Sample_068 0.957961 0.941328 0.929300 0.931400 0.946551 0.930031 0.949686 0.938301 0.943600 0.919638 0.946119 0.946119 0.670285 0.945672 nan -0.075698 -0.038962 nan 0.152574 nan nan 0.441609 nan nan
Sample_071 0.887866 0.907313 0.902549 0.888162 0.894359 0.901240 0.898973 0.895286 0.880453 0.865003 0.885806 0.885806 0.899799 0.821113 nan 0.178303 -0.240911 nan 0.117724 nan nan 0.364226 nan nan
Sample_073 0.930349 0.910853 0.922530 0.919876 0.905153 0.933555 0.925866 0.950641 0.916774 0.901773 0.900178 0.900178 0.909057 0.902783 nan -0.017963 0.055493 nan -0.040629 nan nan 0.356937 nan nan
Sample_095 0.940942 0.914113 0.921194 0.927289 0.919147 0.924950 0.938496 0.930902 0.909714 0.913905 0.878167 0.878167 0.917350 0.850322 nan -0.120269 0.077372 nan -0.115895 nan nan 0.419195 nan nan
Sample_133 0.919483 0.927641 0.940180 0.928251 0.922336 0.903483 0.922197 0.903370 0.885348 0.878925 0.899233 0.899233 0.881238 0.848223 nan 0.219841 0.065599 nan -0.006491 nan nan 0.409126 nan nan
Sample_139 0.927681 0.939980 0.932944 0.957367 0.931584 0.912868 0.904991 0.901552 0.878475 0.891290 0.907333 0.907333 0.928867 0.675676 nan 0.156894 -0.212564 nan -0.039948 nan nan 0.554137 nan nan
Sample_150 0.950334 0.905675 0.902603 0.945063 0.914321 0.885565 0.953595 0.868275 0.930981 0.907850 0.892997 0.892997 0.940619 0.815802 nan 0.166841 -0.195396 nan -0.248278 nan nan 0.335988 nan nan
Sample_171 0.924707 0.908186 0.925427 0.916959 0.910873 0.902581 0.946038 0.906699 0.884571 0.881019 0.875433 0.875433 -0.090619 0.907328 nan -0.004823 0.154828 nan 0.119043 nan nan 0.302006 nan nan
Sample_173 0.916627 0.945366 0.957861 0.932711 0.955475 0.939783 0.948231 0.940274 0.918589 0.916299 0.925428 0.925428 0.926916 0.956769 nan 0.059663 0.124518 nan 0.006621 nan nan 0.334436 nan nan
Sample_174 0.970316 0.885336 0.875343 0.967356 0.893672 0.854645 0.964825 0.846532 0.920737 0.920759 0.887409 0.887409 0.972096 0.759893 nan 0.306279 0.005729 nan -0.059628 nan nan 0.357612 nan nan
Sample_181 0.964199 0.949148 0.966324 0.945214 0.914694 0.940761 0.929175 0.936048 0.909140 0.899569 0.861266 0.861266 0.893600 0.653508 nan -0.013702 0.192492 nan -0.019062 nan nan 0.530054 -0.103146 nan
Sample_198 0.914339 0.944572 0.944199 0.932612 0.946262 0.955742 0.937625 0.947627 0.936142 0.946119 0.956493 0.956493 0.924497 0.922929 nan 0.097862 -0.046744 nan 0.022802 nan nan 0.481999 nan nan
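The threshold above comes from `pimmslearn.pandas.get_lower_whiskers`. Assuming it follows the usual Tukey box-plot convention, a minimal pandas sketch of the Q1 − 1.5·IQR bound per model (an assumption, not the library's source) looks like this:

def lower_whisker_bound(df):
    # Tukey rule: values below Q1 - 1.5 * IQR fall outside the lower whisker
    q1 = df.quantile(0.25)
    iqr = df.quantile(0.75) - q1
    return q1 - 1.5 * iqr

# e.g. lower_whisker_bound(corr_per_sample_val[TOP_N_ORDER]).min()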

Error plot#

Hide code cell source

c_error_min = 4.5
mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1)
errors_val.loc[mask].sort_index(level=1).head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_012 A0A024QZX5;A0A087X1N8;P35237 -0.324 -0.213 -0.357 -0.301 -0.246 -0.416 -0.140 0.856 NaN -53.608 ... -8.881 -4.770 0.632 -3.175 -1.542 -0.312 -0.324 1.241 -0.318 -15.949
Sample_017 A0A024QZX5;A0A087X1N8;P35237 0.347 0.769 0.409 0.418 -0.093 -0.022 0.251 1.658 NaN 26.724 ... -8.211 -2.466 1.228 -3.351 -0.379 0.309 0.347 2.214 0.305 -15.279
Sample_050 A0A024QZX5;A0A087X1N8;P35237 0.544 0.581 0.299 0.140 0.024 -0.102 0.178 2.207 NaN 0.348 ... -8.013 -1.840 3.294 -2.772 -0.497 0.169 0.544 2.691 0.238 -15.081
Sample_102 A0A024QZX5;A0A087X1N8;P35237 -0.029 -0.047 -0.079 -0.130 0.030 0.067 -0.107 0.942 NaN 19.277 ... -8.586 -4.440 1.609 -2.686 -0.840 -0.072 -0.029 1.168 -0.065 -15.654
Sample_109 A0A024QZX5;A0A087X1N8;P35237 0.343 -0.343 -0.036 0.063 -0.179 -0.004 -0.263 1.518 NaN -28.795 ... -8.215 -3.566 -2.077 -3.957 -0.501 0.104 0.343 1.968 -0.012 -15.283

5 rows × 24 columns

Hide code cell source

errors_val = errors_val.abs().groupby(
    freq_feat.index.name).mean()  # absolute error
errors_val = errors_val.join(freq_feat)
errors_val = errors_val.sort_values(by=freq_feat.name, ascending=True)
errors_val.head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
Q9Y281;Q9Y281-3 0.415 0.224 0.233 0.359 0.325 0.285 0.266 4.078 NaN 10.519 ... 0.548 2,473.194 1.144 1.147 0.396 0.415 4.472 0.307 12.573 52
K7EPJ5;O60291;O60291-2;O60291-3;O60291-4 0.331 0.564 0.411 0.393 0.281 0.385 0.387 3.029 NaN 13.344 ... 1.075 2,525.930 1.658 1.702 0.282 0.331 3.452 0.257 13.857 52
B1AJQ6;Q86Y82 1.082 0.758 0.013 0.186 0.482 1.226 0.674 3.367 NaN 5.026 ... 0.647 2,441.128 1.173 4.555 1.198 1.082 3.926 0.900 13.380 52
P69892 0.872 1.623 1.817 1.128 1.734 1.286 1.360 1.980 NaN 18.023 ... 2.714 2,532.051 2.110 5.750 1.348 0.872 2.320 0.966 14.768 53
A2RU67 0.689 0.528 0.344 0.416 0.503 0.462 0.539 4.495 NaN 15.116 ... 0.916 1,998.072 1.345 0.879 0.443 0.689 4.870 0.462 12.437 53

5 rows × 25 columns

Hide code cell source

errors_val.describe()[ORDER_MODELS].T  # mean of means
count mean std min 25% 50% 75% max
BPCA 1,419.000 0.408 0.306 0.017 0.222 0.320 0.494 4.195
VAE 1,419.000 0.421 0.322 0.001 0.226 0.331 0.492 3.618
DAE 1,419.000 0.426 0.329 0.013 0.227 0.329 0.502 3.537
TRKNN 1,419.000 0.437 0.309 0.000 0.241 0.349 0.526 3.647
RF 1,419.000 0.448 0.320 0.010 0.250 0.363 0.527 3.526
KNN5 1,419.000 0.455 0.322 0.039 0.256 0.369 0.540 3.634
CF 1,419.000 0.455 0.297 0.026 0.260 0.378 0.533 3.498
KNN 1,419.000 0.468 0.333 0.012 0.267 0.375 0.549 3.693
KNN_IMPUTE 1,419.000 0.531 0.378 0.063 0.296 0.424 0.636 3.430
IRM 1,419.000 0.555 0.372 0.030 0.311 0.449 0.674 3.476
ROWMEDIAN 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
Median 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
LLS 1,419.000 1.088 19.029 0.023 0.279 0.408 0.596 706.018
QRILC 1,419.000 1.619 0.913 0.269 1.040 1.377 1.887 7.703
COLMEDIAN 1,419.000 2.071 1.509 0.038 0.916 1.738 2.812 12.631
SVDMETHOD 1,419.000 2.136 1.467 0.149 0.976 1.893 2.905 12.211
PI 1,419.000 4.121 2.443 0.442 2.275 3.724 5.485 16.760
MINDET 1,419.000 4.438 2.493 0.374 2.622 4.032 5.828 17.100
MINPROB 1,419.000 4.452 2.491 0.363 2.621 4.019 5.853 17.270
MINIMUM 1,419.000 9.620 2.542 3.842 7.854 9.231 11.051 22.371
ZERO 1,419.000 16.688 2.542 10.910 14.922 16.299 18.119 29.439
IMPSEQROB 1,419.000 443.657 892.834 0.830 23.477 43.842 100.325 2,633.136
MLE 1,419.000 2,171.007 331.079 1.453 1,992.846 2,214.845 2,487.619 2,683.431
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Hide code cell source

c_avg_error = 2
mask = (errors_val[TOP_N_ORDER] >= c_avg_error).any(axis=1)
errors_val.loc[mask]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
O60512 2.233 1.868 2.197 1.820 2.293 2.209 1.989 5.769 NaN 8.471 ... 1.529 2,128.612 2.343 3.910 1.905 2.233 6.098 2.559 10.910 58
P18206;P18206-2 2.427 2.204 1.617 1.947 1.744 1.637 1.297 3.821 NaN 7.976 ... 1.587 2,518.839 1.349 2.933 2.104 2.427 4.085 1.581 12.898 97
Q99538 2.502 2.571 2.387 2.304 2.711 2.517 2.464 2.615 NaN 8.517 ... 2.264 1,769.534 2.844 2.476 2.498 2.502 2.767 2.399 14.984 107
P02100 2.192 1.608 2.366 2.614 2.283 2.509 1.033 1.996 NaN 14.829 ... 4.101 2,512.438 3.655 7.404 2.626 2.192 2.106 1.856 16.373 127
A0A0G2JRN3 3.053 3.498 3.537 3.618 3.693 3.634 4.195 3.998 NaN 71.992 ... 7.717 1,986.331 6.703 7.703 3.526 3.053 3.976 3.647 19.496 128
P01817 2.254 2.069 2.014 2.088 1.963 2.117 2.385 2.736 NaN 10.059 ... 2.611 2,369.250 2.341 3.402 2.146 2.254 3.104 2.039 14.053 133
Q15375;Q15375-4 4.171 1.498 1.428 1.617 1.608 1.331 1.981 3.754 NaN 16.223 ... 6.741 2,285.221 5.848 5.295 1.830 4.171 3.566 2.065 19.101 163
P68871 2.331 1.349 2.207 2.480 1.616 1.638 0.571 1.720 NaN 23.608 ... 4.102 2,237.073 3.825 5.207 2.015 2.331 2.014 0.854 16.378 168
P69905 2.793 1.460 2.649 2.731 2.936 2.820 1.032 2.807 NaN 94.049 ... 5.850 1,992.771 5.812 6.138 2.823 2.793 2.626 1.016 18.200 190
P35527 2.216 1.408 2.544 2.428 2.064 2.156 1.295 2.273 NaN 96.343 ... 4.575 2,335.097 4.233 4.739 2.103 2.216 2.403 1.169 17.045 195
P15509;P15509-2;P15509-3;P15509-5;P15509-7;P15509-8 2.252 1.179 2.425 1.836 1.218 1.374 1.336 3.397 NaN 48.350 ... 6.157 1,276.662 6.332 4.436 1.754 2.252 3.146 2.437 18.354 201

11 rows × 25 columns

Error by integer part of the intensity (binned by feature median)#

  • number of observations in parentheses.
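Features are grouped by the integer part of their median training intensity. A conceptual sketch of this binning (an assumption about the idea behind `plot_errors_by_median`, not its implementation, which is used in the next cell):

import numpy as np

abs_err_val = (pred_val[TOP_N_ORDER]
               .sub(pred_val[TARGET_COL], axis=0)
               .abs())
feat_bin = np.floor(data.train_X.median()).astype(int).rename('bin')
mean_err_per_bin = (abs_err_val
                    .join(feat_bin, on=FEAT_NAME)
                    .groupby('bin')
                    .mean())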

Hide code cell source

fig, ax = plt.subplots(figsize=(8, 3))
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred_val[
        [TARGET_COL] + TOP_N_ORDER
    ],
    feat_medians=data.train_X.median(),
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    palette=TOP_N_COLOR_PALETTE,
    metric_name=METRIC,)
ax.set_ylabel(f"Average error ({METRIC})")
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
fname = args.out_figures / f'2_{group}_errors_binned_by_feat_median_val.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf
[Figure: average error (MAE) by feature median intensity bin, validation split]

Hide code cell source

# ! only used for reporting
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC
)
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=94) BPCA 0.715 0.601 0.849
1 11\n(N=94) DAE 0.696 0.580 0.827
2 11\n(N=94) RF 0.646 0.529 0.781
3 11\n(N=94) TRKNN 0.655 0.545 0.784
4 11\n(N=94) VAE 0.616 0.497 0.764
... ... ... ... ... ...
85 29\n(N=5) BPCA 0.175 0.062 0.288
86 29\n(N=5) DAE 0.231 0.072 0.389
87 29\n(N=5) RF 0.085 0.035 0.179
88 29\n(N=5) TRKNN 0.193 0.128 0.257
89 29\n(N=5) VAE 0.152 0.064 0.245

90 rows × 5 columns

Hide code cell source

dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
errors_binned.head()
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_158 Q9UN70;Q9UN70-2 BPCA 0.839 15 2,398 15\n(N=2,398)
1 Sample_158 Q9UN70;Q9UN70-2 VAE 1.178 15 2,398 15\n(N=2,398)
2 Sample_158 Q9UN70;Q9UN70-2 DAE 1.100 15 2,398 15\n(N=2,398)
3 Sample_158 Q9UN70;Q9UN70-2 TRKNN 1.070 15 2,398 15\n(N=2,398)
4 Sample_158 Q9UN70;Q9UN70-2 RF 0.845 15 2,398 15\n(N=2,398)

Test data#

Hide code cell source

pred_test = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='test',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL])
pred_test = pred_test[[TARGET_COL] + ORDER_MODELS]
pred_test = pred_test.join(freq_feat, on=freq_feat.index.name)
if args.save_agg_pred:
    fname = args.folder_experiment / '01_2_agg_pred_test.csv'
    dumps[fname.stem] = fname
    pred_test.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_test
observed BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_000 A0A075B6P5;P01615 17.016 17.483 17.000 17.225 17.438 17.717 17.207 18.016 17.190 18.269 ... 17.496 11.729 12.970 12.210 7.068 0 229.376 2,505.226 NaN 210
A0A087X089;Q16627;Q16627-2 18.280 17.769 18.048 18.062 17.930 17.710 18.146 17.892 18.293 17.797 ... 17.695 12.439 12.970 13.601 7.068 0 -20.319 2,505.226 NaN 210
A0A0B4J2B5;S4R460 21.735 22.459 22.254 22.220 22.397 22.228 21.959 22.508 21.835 22.205 ... 17.493 12.746 12.970 13.388 7.068 0 -10.898 2,505.226 NaN 210
A0A140T971;O95865;Q5SRR8;Q5SSV3 14.603 15.285 15.171 15.214 15.399 15.293 15.143 15.431 15.172 15.557 ... 17.087 12.558 12.970 12.612 7.068 0 -2.819 2,505.226 NaN 145
A0A140TA33;A0A140TA41;A0A140TA52;P22105;P22105-3;P22105-4 16.143 16.583 16.614 16.600 16.775 16.550 16.743 16.925 16.625 16.646 ... 17.508 12.619 12.970 12.884 7.068 0 -42.837 2,505.226 NaN 210
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_209 Q96ID5 16.074 15.866 15.919 16.125 16.122 16.036 15.981 15.924 15.909 15.925 ... 17.133 12.853 12.435 13.470 7.068 0 20.373 17.260 NaN 194
Q9H492;Q9H492-2 13.173 13.249 13.252 13.851 13.273 13.335 13.432 13.539 13.669 13.594 ... 17.109 12.317 12.435 11.842 7.068 0 14.713 19.076 NaN 111
Q9HC57 14.207 13.756 14.405 13.592 14.589 14.401 14.131 13.887 13.962 14.391 ... 17.157 12.082 12.435 13.412 7.068 0 21.445 19.649 NaN 128
Q9NPH3;Q9NPH3-2;Q9NPH3-5 14.962 15.096 15.150 14.903 15.099 15.005 15.123 15.487 15.094 15.117 ... 17.257 12.725 12.435 12.369 7.068 0 35.578 16.125 NaN 199
Q9UGM5;Q9UGM5-2 16.871 16.395 16.542 16.285 16.429 16.619 16.378 16.871 16.255 17.054 ... 17.133 11.720 12.435 11.018 7.068 0 82.601 13.608 NaN 209

12600 rows × 26 columns

Write MAE summaries for all models to the Excel file:

Hide code cell source

errors_test_mae = pimmslearn.pandas.calc_errors.get_absolute_error(
    pred_test
)
mae_stats_ordered_test = errors_test_mae.describe()[ORDER_MODELS]
mae_stats_ordered_test
BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE IRM ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
count 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 ... 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 0.000
mean 0.432 0.436 0.442 0.458 0.465 0.469 0.472 0.482 0.558 0.587 ... 2.223 2.330 3.815 4.109 4.124 9.271 16.339 334.546 2,186.302 NaN
std 0.518 0.539 0.534 0.539 0.547 0.546 0.521 0.562 0.679 0.647 ... 1.662 1.653 2.664 2.667 2.700 2.741 2.741 793.494 853.899 NaN
min 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 0.003 0.001 0.141 7.209 0.001 0.001 NaN
25% 0.121 0.121 0.122 0.132 0.134 0.138 0.145 0.140 0.163 0.175 ... 0.961 1.044 1.804 2.132 2.125 7.344 14.412 12.192 2,436.455 NaN
50% 0.280 0.276 0.282 0.299 0.301 0.307 0.324 0.316 0.364 0.394 ... 1.954 2.098 3.320 3.635 3.655 8.867 15.935 34.192 2,496.971 NaN
75% 0.546 0.545 0.559 0.584 0.595 0.596 0.609 0.612 0.703 0.762 ... 3.119 3.286 5.332 5.610 5.652 10.842 17.910 91.928 2,555.017 NaN
max 7.635 9.435 8.487 9.111 8.588 8.577 7.247 8.171 9.005 7.829 ... 13.272 13.022 18.025 18.317 18.408 23.072 30.140 2,869.824 2,873.005 NaN

8 rows × 24 columns

Hide code cell source

mae_stats_ordered_test.to_excel(writer, sheet_name='mae_stats_ordered_test', float_format='%.5f')

Hide code cell source

cp_mean_perf = pd.concat([
    mae_stats_ordered_val.loc['mean'],
    mae_stats_ordered_test.loc['mean'],
],
    axis=1,
    keys=['val', 'test']
).sort_values(by='val')
cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f')
cp_mean_perf
val test
BPCA 0.422 0.432
VAE 0.431 0.436
DAE 0.438 0.442
TRKNN 0.450 0.458
RF 0.460 0.465
KNN5 0.467 0.469
CF 0.467 0.472
KNN 0.481 0.482
KNN_IMPUTE 0.554 0.558
IRM 0.588 0.587
ROWMEDIAN 0.598 0.602
Median 0.598 0.602
LLS 1.329 0.874
QRILC 1.651 1.633
COLMEDIAN 2.210 2.223
SVDMETHOD 2.309 2.330
PI 3.817 3.815
MINDET 4.108 4.109
MINPROB 4.127 4.124
MINIMUM 9.272 9.271
ZERO 16.340 16.339
IMPSEQROB 333.478 334.546
MLE 2,172.384 2,186.302
IMPSEQ NaN NaN

Hide code cell source

writer.close()

Intensity distribution as histogram#

Plot the predictions of the top 4 models against the observed intensities in the test data

Hide code cell source

min_max = pimmslearn.plotting.data.min_max(pred_test[TARGET_COL])
top_n = 4
fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True)

for model, color, ax in zip(
        ORDER_MODELS[:top_n],
        COLORS_TO_USE[:top_n],
        axes):

    ax, bins = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[TARGET_COL],
        color='grey',
        min_max=min_max,
        ax=ax
    )
    ax, _ = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[model],
        color=color,
        min_max=min_max,
        ax=ax,
        alpha=0.5,
    )
    _ = [(l_.set_rotation(90))
         for l_ in ax.get_xticklabels()]
    ax.legend()

axes[0].set_ylabel('Number of observations')

fname = args.out_figures / f'2_{group}_intensity_binned_top_{top_n}_models_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf
[Figure: intensity histograms of observed vs. predicted test data, top 4 models]

Hide code cell source

counts_per_bin = pimmslearn.pandas.get_counts_per_bin(df=pred_test,
                                                bins=bins,
                                                columns=[TARGET_COL, *ORDER_MODELS[:top_n]])

counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
(the same warning is emitted once per counted column)
observed BPCA VAE DAE TRKNN
bin
(7, 8] 2 0 0 0 0
(8, 9] 7 0 0 0 0
(9, 10] 18 2 1 4 1
(10, 11] 69 29 23 36 13
(11, 12] 217 165 138 157 113
(12, 13] 634 531 531 479 479
(13, 14] 1,394 1,248 1,268 1,235 1,224
(14, 15] 2,042 2,033 2,096 2,106 2,118
(15, 16] 2,054 2,359 2,367 2,354 2,429
(16, 17] 1,787 1,867 1,850 1,884 1,842
(17, 18] 1,333 1,363 1,342 1,374 1,401
(18, 19] 965 956 933 940 923
(19, 20] 792 789 818 791 800
(20, 21] 536 528 514 514 533
(21, 22] 320 322 308 321 323
(22, 23] 182 176 181 174 171
(23, 24] 102 92 91 92 92
(24, 25] 45 38 41 42 37
(25, 26] 50 57 54 56 59
(26, 27] 25 20 20 16 17
(27, 28] 3 2 1 2 2
(28, 29] 8 11 11 11 11
(29, 30] 13 11 12 12 12
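The FutureWarning above comes from grouping the `pd.cut` categories with pandas' default `observed=False`. A minimal sketch (hypothetical data, not the `pimmslearn.pandas.get_counts_per_bin` source) that passes the keyword explicitly and therefore stays silent:

import numpy as np
import pandas as pd

# hypothetical intensities standing in for one column of pred_test
values = pd.Series(np.random.default_rng(0).normal(16, 2, 1_000), name='intensity')
bins = range(7, 31)
counts = (pd.cut(values, bins=bins)
          .to_frame()
          .groupby('intensity', observed=False)  # explicit keyword silences the warning
          .size())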

Correlation per sample#

Hide code cell source

corr_per_sample_test = (pred_test
                        .groupby(sample_index_name)
                        .apply(lambda df: df.corr().loc[TARGET_COL])
                        [ORDER_MODELS])
corr_per_sample_test = corr_per_sample_test.join(
    pred_test
    .groupby(sample_index_name)[TARGET_COL]
    .count()
    .rename('n_obs')
)
too_few_obs = corr_per_sample_test['n_obs'] < 3
corr_per_sample_test.loc[~too_few_obs].describe()
BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 ... 210.000 210.000 0.000 210.000 0.000 0.000 210.000 30.000 0.000 210.000
mean 0.969 0.967 0.968 0.966 0.965 0.965 0.967 0.963 0.948 0.952 ... 0.076 0.009 NaN 0.008 NaN NaN 0.371 -0.018 NaN 60.000
std 0.017 0.019 0.017 0.019 0.018 0.018 0.016 0.019 0.035 0.022 ... 0.192 0.133 NaN 0.130 NaN NaN 0.139 0.151 NaN 9.810
min 0.878 0.850 0.911 0.858 0.872 0.870 0.906 0.888 0.722 0.865 ... -0.402 -0.351 NaN -0.354 NaN NaN 0.021 -0.287 NaN 31.000
25% 0.962 0.962 0.960 0.960 0.959 0.956 0.962 0.953 0.938 0.943 ... -0.059 -0.080 NaN -0.061 NaN NaN 0.288 -0.118 NaN 53.000
50% 0.973 0.971 0.971 0.970 0.970 0.970 0.971 0.968 0.958 0.956 ... 0.067 0.005 NaN 0.007 NaN NaN 0.368 -0.042 NaN 60.000
75% 0.981 0.980 0.980 0.979 0.977 0.979 0.977 0.978 0.969 0.966 ... 0.200 0.099 NaN 0.085 NaN NaN 0.448 0.062 NaN 67.000
max 0.994 0.994 0.992 0.992 0.991 0.992 0.993 0.990 0.987 0.988 ... 0.546 0.451 NaN 0.367 NaN NaN 0.889 0.393 NaN 86.000

8 rows × 25 columns

Hide code cell source

# ! add minimum
kwargs = dict(ylim=(0.7, 1), rot=90,
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model predictions per sample on test data',
              ylabel='correlation per sample')
ax = (corr_per_sample_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs))
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    corr_per_sample_test.describe().to_excel(w, sheet_name='summary')
    corr_per_sample_test.to_excel(w, sheet_name='correlations')
    corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf
[Figure: box plot of per-sample correlations on the test split, top N models]

Identify samples which fall below the lower whisker for any of the plotted models

Hide code cell source

threshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_test[TOP_N_ORDER]).min()
mask = (corr_per_sample_test[TOP_N_ORDER] < threshold).any(axis=1)
corr_per_sample_test.loc[mask].style.highlight_min(
    axis=1) if mask.sum() else 'Nothing to display'
  BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
Sample ID                                                  
Sample_035 0.946885 0.936694 0.933902 0.954952 0.927356 0.958440 0.949157 0.936034 0.951218 0.940977 0.924673 0.924673 0.959100 0.837433 nan 0.257382 -0.067806 nan 0.013245 nan nan 0.699718 nan nan 57
Sample_043 0.949633 0.850380 0.939264 0.858409 0.871622 0.870351 0.909424 0.888093 0.847971 0.882989 0.814366 0.814366 0.828364 0.747920 nan -0.060303 0.028315 nan -0.018491 nan nan 0.444295 nan nan 57
Sample_047 0.939299 0.938965 0.945415 0.916738 0.943330 0.950780 0.946309 0.945377 0.874472 0.900719 0.896683 0.896683 0.009710 0.896444 nan -0.077002 0.116478 nan 0.034894 nan nan 0.524493 nan nan 46
Sample_075 0.936817 0.936659 0.927665 0.942922 0.945101 0.947937 0.951762 0.921178 0.951033 0.929515 0.940811 0.940811 0.962745 0.786656 nan 0.075613 -0.048656 nan 0.197165 nan nan 0.633450 nan nan 58
Sample_080 0.922142 0.901736 0.919533 0.911596 0.902379 0.912309 0.915941 0.921951 0.902040 0.883475 0.893836 0.893836 0.880736 0.871008 nan 0.093012 0.082319 nan -0.111358 nan nan 0.383774 nan nan 64
Sample_091 0.878328 0.914147 0.931318 0.931055 0.911315 0.918275 0.921545 0.903784 0.920915 0.864672 0.903019 0.903019 0.910180 0.803095 nan -0.095511 0.005837 nan 0.064610 nan nan 0.341572 nan nan 60
Sample_108 0.929388 0.948149 0.928883 0.940255 0.946996 0.951487 0.943803 0.946047 0.866107 0.915455 0.939810 0.939810 0.929216 0.799005 nan -0.044493 0.210078 nan -0.187856 nan nan 0.407365 nan nan 68
Sample_109 0.937615 0.929595 0.911407 0.924847 0.924946 0.898780 0.940592 0.893626 0.841761 0.879726 0.890426 0.890426 0.931145 0.715112 nan -0.059637 0.049204 nan -0.050712 nan nan 0.337532 -0.040683 nan 59
Sample_111 0.978525 0.911553 0.944741 0.974002 0.921311 0.958219 0.966623 0.933775 0.923850 0.935239 0.857016 0.857016 0.962568 0.751855 nan -0.130888 -0.156891 nan -0.068413 nan nan 0.451452 nan nan 54
Sample_115 0.891712 0.906082 0.913761 0.915296 0.922587 0.928234 0.906340 0.918842 0.853242 0.874847 0.881285 0.881285 0.901459 0.806567 nan 0.094778 -0.151482 nan -0.220034 nan nan 0.320851 nan nan 63
Sample_134 0.933622 0.920815 0.949210 0.907465 0.935493 0.952746 0.932140 0.935936 0.905001 0.915387 0.865397 0.865397 0.881808 0.898931 nan 0.389792 -0.067654 nan -0.085666 nan nan 0.344741 nan nan 66
Sample_138 0.957581 0.921846 0.934612 0.953928 0.928186 0.927573 0.951431 0.936390 0.943933 0.936371 0.921359 0.921359 0.963983 0.865835 nan 0.001445 0.046128 nan 0.100226 nan nan 0.523470 nan nan 46
Sample_148 0.975203 0.936953 0.921283 0.979465 0.956317 0.946668 0.967196 0.926864 0.929094 0.955283 0.935395 0.935395 0.984939 0.861317 nan 0.037085 -0.194248 nan 0.035463 nan nan 0.362124 nan nan 62
Sample_151 0.947829 0.919439 0.926731 0.919188 0.930063 0.937720 0.924564 0.937262 0.934510 0.915733 0.904552 0.904552 0.917004 0.856063 nan -0.189751 -0.150272 nan -0.047265 nan nan 0.302307 nan nan 70
Sample_152 0.922635 0.923706 0.932690 0.926056 0.925567 0.932482 0.943238 0.931084 0.918127 0.917052 0.909410 0.909410 0.877491 0.897524 nan 0.098949 0.096217 nan -0.061480 nan nan 0.336118 nan nan 64
Sample_162 0.929186 0.942583 0.931385 0.933190 0.950278 0.937839 0.927674 0.949772 0.956867 0.940055 0.937255 0.937255 0.933909 0.928758 nan 0.516397 0.101290 nan -0.224773 nan nan 0.294184 0.087903 nan 51
Sample_167 0.952090 0.928407 0.942830 0.931476 0.936620 0.939793 0.942442 0.936802 0.922116 0.930438 0.905413 0.905413 0.923164 0.907804 nan 0.221299 -0.016542 nan 0.224084 nan nan 0.235179 nan nan 65
Sample_171 0.948100 0.921832 0.911514 0.901446 0.902910 0.919215 0.907038 0.909432 0.845442 0.899387 0.863135 0.863135 0.898770 0.779281 nan -0.061550 -0.214416 nan 0.015564 nan nan 0.344922 nan nan 40
Sample_181 0.912274 0.924417 0.920240 0.920976 0.921442 0.929397 0.913684 0.913043 0.869468 0.929034 0.896030 0.896030 0.899227 0.854767 nan -0.243627 -0.006608 nan 0.124659 nan nan 0.419029 0.117814 nan 60
Sample_185 0.949315 0.944890 0.943669 0.929238 0.945453 0.936803 0.947703 0.930556 0.924211 0.929391 0.922411 0.922411 0.899905 0.920831 nan -0.264227 -0.027163 nan -0.072097 nan nan 0.576069 nan nan 69
Sample_199 0.928280 0.934778 0.937839 0.930234 0.928331 0.917037 0.929491 0.925542 0.912243 0.918083 0.910943 0.910943 0.937794 0.893909 nan -0.086879 -0.205229 nan 0.125423 nan nan 0.289504 nan nan 45

Hide code cell source

feature_names = pred_test.index.levels[-1]
N_SAMPLES = len(pred_test.index.levels[0])  # number of samples
M = len(feature_names)
pred_test.loc[pd.IndexSlice[:, feature_names[random.randint(0, M - 1)]], :]
observed BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_000 M0QXF7;M0QYN0;Q969H8 17.454 17.244 17.310 17.264 17.490 17.171 17.538 17.514 17.729 17.216 ... 17.486 12.375 12.970 12.137 7.068 0 2,505.226 2,505.226 NaN 208
Sample_041 M0QXF7;M0QYN0;Q969H8 17.183 16.705 17.027 16.926 17.005 16.982 16.960 16.979 16.967 16.781 ... 17.663 11.984 12.627 12.855 7.068 0 2,676.041 2,676.041 NaN 208
Sample_178 M0QXF7;M0QYN0;Q969H8 16.903 16.991 16.867 17.088 16.825 16.620 17.107 16.691 17.061 16.667 ... 17.046 12.903 12.049 11.551 7.068 0 2,434.685 2,434.685 NaN 208

3 rows × 26 columns

Hide code cell source

options = random.sample(sorted(set(feature_names)), 1)
pred_test.loc[pd.IndexSlice[:, options[0]], :]
observed BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_068 Q07507 15.687 16.058 15.564 15.938 16.113 15.783 15.585 15.681 15.499 15.381 ... 17.085 12.417 11.865 11.529 7.068 0 8.993 2,647.739 NaN 166
Sample_088 Q07507 15.502 15.693 15.671 15.835 15.696 15.812 15.847 15.617 15.656 15.706 ... 17.065 11.197 11.537 12.554 7.068 0 -30.841 2,609.972 NaN 166
Sample_123 Q07507 15.217 15.592 15.397 15.403 15.337 15.482 15.355 15.717 15.349 15.654 ... 16.999 13.281 12.689 13.571 7.068 0 72.176 2,461.806 NaN 166
Sample_168 Q07507 15.715 15.306 15.609 15.758 15.722 15.872 15.775 15.723 15.658 15.758 ... 17.213 12.376 12.095 12.574 7.068 0 -4.089 2,514.391 NaN 166

4 rows × 26 columns

Correlation per feature#

Hide code cell source

corr_per_feat_test = pred_test.groupby(FEAT_NAME).apply(
    lambda df: df.corr().loc[TARGET_COL])[ORDER_MODELS]
corr_per_feat_test = corr_per_feat_test.join(pred_test.groupby(FEAT_NAME)[
    TARGET_COL].count().rename('n_obs'))

too_few_obs = corr_per_feat_test['n_obs'] < 3
corr_per_feat_test.loc[~too_few_obs].describe()
BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 ... 1,396.000 1,396.000 1,396.000 1,396.000 0.000 0.000 1,396.000 1,396.000 0.000 1,396.000
mean 0.636 0.632 0.630 0.607 0.590 0.564 0.596 0.538 0.445 0.470 ... -0.008 0.014 0.086 0.044 NaN NaN -0.009 -0.016 NaN 8.999
std 0.337 0.336 0.344 0.342 0.347 0.359 0.342 0.369 0.422 0.388 ... 0.447 0.415 0.441 0.412 NaN NaN 0.411 0.433 NaN 3.913
min -0.998 -1.000 -0.899 -0.977 -0.973 -0.983 -0.966 -0.991 -1.000 -0.999 ... -0.999 -0.991 -0.999 -1.000 NaN NaN -1.000 -0.999 NaN 3.000
25% 0.506 0.507 0.480 0.455 0.445 0.392 0.438 0.359 0.221 0.266 ... -0.341 -0.288 -0.242 -0.249 NaN NaN -0.285 -0.313 NaN 6.000
50% 0.746 0.734 0.745 0.706 0.688 0.677 0.695 0.634 0.554 0.554 ... -0.005 0.018 0.118 0.070 NaN NaN 0.004 -0.016 NaN 8.000
75% 0.880 0.873 0.881 0.859 0.846 0.833 0.848 0.809 0.770 0.768 ... 0.331 0.314 0.425 0.350 NaN NaN 0.268 0.294 NaN 11.000
max 0.999 1.000 0.999 1.000 0.999 0.999 1.000 1.000 1.000 0.998 ... 0.999 1.000 0.999 0.998 NaN NaN 0.992 0.998 NaN 32.000

8 rows × 25 columns
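
The per-feature correlations above group the long prediction table by protein group and correlate every model column with the observed values. A toy sketch of the same groupby-apply pattern (hypothetical feature and model names):

import pandas as pd

toy = pd.DataFrame(
    {"observed": [10.0, 11.0, 12.0, 20.0, 21.0, 19.0],
     "model_1": [10.2, 10.9, 12.1, 19.5, 21.5, 19.2]},
    index=pd.Index(["A", "A", "A", "B", "B", "B"], name="protein groups"))

# one row per protein group: correlation of each column with 'observed'
corr_per_feat = toy.groupby(level="protein groups").apply(
    lambda df: df.corr().loc["observed"])
print(corr_per_feat)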

Hide code cell source

corr_per_feat_test.loc[too_few_obs].dropna(thresh=3, axis=0)
BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
protein groups
A0A0A0MS09;P01880;P01880-2 1.000 -1.000 1.000 1.000 1.000 -1.000 1.000 -1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000 1.000 1.000 -1.000 -1.000 -1.000 -1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 1.000 NaN NaN 1.000 1.000 NaN 2
A0A0C4DH29 -1.000 1.000 -1.000 -1.000 -1.000 1.000 1.000 -1.000 1.000 1.000 ... 1.000 -1.000 -1.000 1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000 1.000 1.000 1.000 1.000 1.000 -1.000 1.000 -1.000 -1.000 ... 1.000 -1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000 -1.000 1.000 -1.000 1.000 -1.000 -1.000 1.000 -1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
D6RF35 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
E7EQ64;P07477 1.000 1.000 1.000 1.000 -1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN 1.000 -1.000 NaN 2
F8WDW9;Q96AP7 -1.000 -1.000 1.000 -1.000 1.000 -1.000 1.000 -1.000 -1.000 -1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
J3KRP0 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 -1.000 1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
O43581-2;O43581-3;O43581-5 -1.000 -1.000 -1.000 -1.000 1.000 -1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P04075 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
P04080 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 1.000 -1.000 ... -1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
P33151 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P62258 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 1.000 1.000 1.000 NaN NaN -1.000 1.000 NaN 2
Q9NYQ8 1.000 1.000 1.000 1.000 -1.000 -1.000 -1.000 -1.000 -1.000 1.000 ... -1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
Q9Y281;Q9Y281-3 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 1.000 1.000 1.000 ... -1.000 1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2

16 rows × 25 columns

Hide code cell source

kwargs = dict(rot=90,
              flierprops=dict(markersize=1),
              ylabel=f'correlation per {FEAT_NAME_DISPLAY}')
ax = (corr_per_feat_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs)
      )
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                       horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_feat.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    corr_per_feat_test.loc[~too_few_obs].describe().to_excel(
        w, sheet_name='summary')
    corr_per_feat_test.to_excel(w, sheet_name='correlations')
    corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf
[Figure: box plots of per-protein-group correlation between predictions and observed test values for the top N models]

Hide code cell source

feat_count_test = data.test_y.stack().groupby(FEAT_NAME).count()
feat_count_test.name = 'count'
feat_count_test.head()
protein groups
A0A024QZX5;A0A087X1N8;P35237                                                     10
A0A024R0T9;K7ER74;P02655                                                          8
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8    6
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                           8
A0A075B6H7                                                                        4
Name: count, dtype: int64
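
data.test_y appears to be the wide sample-by-feature test matrix: stacking it yields one entry per observed intensity, so counting per protein group gives the number of test observations per feature. A toy sketch of the stack-then-count step (toy names):

import pandas as pd

wide = pd.DataFrame({"P1": [1.0, None, 3.0], "P2": [None, None, 5.0]},
                    index=["Sample_000", "Sample_001", "Sample_002"])
wide.columns.name = "protein groups"

counts = wide.stack().groupby("protein groups").count()
print(counts)  # P1: 2 observed values, P2: 1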

Hide code cell source

threshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_feat_test[TOP_N_ORDER]).min()
mask = (corr_per_feat_test[TOP_N_ORDER] < threshold).any(axis=1)


def highlight_min(s, color, tolerance=0.00001):
    return np.where((s - s.min()).abs() < tolerance, f"background-color: {color};", None)


view = (corr_per_feat_test
        .join(feat_count_test)
        .loc[mask]
        .sort_values('count'))

if not view.empty:
    display(view
            .style.
            apply(highlight_min, color='yellow', axis=1,
                  subset=corr_per_feat_test.columns)
            )
else:
    print("None found")
  BPCA VAE DAE TRKNN RF KNN5 CF KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs count
protein groups                                                    
A0A0A0MS09;P01880;P01880-2 1.000000 -1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan 1.000000 -1.000000 nan 2 2
A0A0C4DH29 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 1.000000 nan nan -1.000000 -1.000000 nan 2 2
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 1.000000 1.000000 nan nan -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 1.000000 nan nan 1.000000 1.000000 nan 2 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000000 -1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
O43581-2;O43581-3;O43581-5 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 1.000000 nan 2 2
F8WDW9;Q96AP7 -1.000000 -1.000000 1.000000 -1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
E7EQ64;P07477 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 nan nan -1.000000 -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 -1.000000 nan 2 2
Q9NYQ8 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 nan nan 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
O95497 0.491301 -0.408791 -0.623604 -0.187796 -0.103226 -0.278320 -0.474745 -0.038249 -0.972465 0.974252 nan nan -0.924475 -0.996219 -0.728224 -0.107445 0.993302 0.908126 0.849159 nan nan 0.163889 -0.914582 nan 3 3
A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;P01892;P10316 -0.405690 -0.299025 -0.353892 -0.426425 0.317683 0.225780 -0.636327 0.141568 -0.230425 0.240705 nan nan 0.029026 0.528379 0.969574 0.997890 -0.857778 -0.651641 0.959874 nan nan 0.599349 0.988049 nan 3 3
A0A087X117;A0A0G2JN29;J3KN36;P69849;Q15155;Q5JPE7;Q5JPE7-2 0.678810 -0.345800 0.019325 0.676860 0.644938 0.312055 0.949084 0.660508 -0.865326 0.808609 nan nan -0.988011 0.063747 -0.484680 -0.563403 0.920083 -0.965384 -0.999844 nan nan -0.178456 0.909074 nan 3 3
Q96KR4;Q96KR4-3 -0.811499 0.511330 -0.893553 0.188634 0.384829 0.962865 0.427827 0.998946 -0.996456 -0.998544 nan nan -0.499851 -0.365745 -0.969111 -0.794681 0.992729 0.464970 -0.938329 nan nan 0.770952 0.620557 nan 3 3
Q0P6D2 0.488693 0.999137 -0.523867 -0.977189 -0.973042 -0.914209 0.323275 -0.991429 0.990772 -0.817525 nan nan -0.910486 -0.378046 -0.815144 -0.873162 0.451279 -0.856054 0.088313 nan nan 0.198784 -0.816625 nan 3 3
P14138 -0.783274 0.353916 0.297632 0.277987 0.561580 0.474087 -0.788160 -0.039227 0.184293 -0.975804 nan nan -0.284116 -0.439838 -0.680464 -0.988691 -0.058765 -0.256388 -0.884610 nan nan 0.330904 0.330904 nan 3 3
Q5FWE3;Q5FWE3-3 -0.595898 0.900974 0.899276 0.940836 -0.850171 -0.515612 0.420976 -0.450286 0.997780 -0.317398 nan nan 0.852964 -0.485075 0.879697 0.674983 -0.498986 0.966111 0.986679 nan nan -0.999926 0.995547 nan 3 3
Q15223;Q15223-2;Q15223-3 0.989596 -0.199683 0.428444 0.177474 0.403288 0.779421 0.273193 0.476563 0.352579 -0.993723 nan nan 0.613817 -0.951453 -0.593128 0.320433 -0.496977 0.556287 0.925110 nan nan 0.802926 -0.921276 nan 3 3
P67936 -0.781265 -0.963644 -0.072392 -0.280233 -0.428370 -0.930367 -0.965717 0.668141 -0.999486 -0.951572 nan nan -0.952164 -0.995699 -0.989773 -0.899019 0.837250 0.990700 -0.437753 nan nan -0.364239 -0.919047 nan 3 3
Q9BRA2 -0.997854 -0.999826 -0.796300 -0.976681 -0.578426 0.355335 -0.640091 -0.969137 -0.999909 0.328330 nan nan -0.376394 -0.951592 -0.888126 -0.946135 -0.817740 -0.251266 -0.618440 nan nan 0.231242 0.893058 nan 3 3
P04040 0.955425 -0.817788 -0.651690 0.364706 -0.357571 0.860313 0.706907 0.995399 0.584629 0.685429 nan nan 0.736559 -0.942664 0.910187 0.961838 -0.690971 0.889522 0.686372 nan nan -0.755848 0.778883 nan 3 3
Q9UI40;Q9UI40-2 0.154643 0.831778 -0.355295 0.705987 0.742499 0.043055 0.691403 -0.368219 0.609727 -0.080993 nan nan 0.216445 0.483812 0.418780 0.168840 0.730947 0.994516 -0.961060 nan nan 0.588145 0.963302 nan 3 3
P21810 0.488982 -0.261304 -0.437732 0.558018 0.895563 0.725338 0.190133 0.497175 0.976315 0.606417 nan nan 0.507237 0.975390 0.086433 0.181221 0.525005 -0.013604 -0.957964 nan nan -0.032190 -0.515024 nan 4 4
P01912;Q5Y7A7 0.312242 0.385963 -0.617942 -0.737679 0.336045 -0.983163 0.530255 -0.923553 0.367817 0.598949 nan nan -0.036948 -0.079982 0.393426 0.622172 0.083071 0.128832 0.724282 nan nan -0.520030 -0.464856 nan 4 4
P48745 -0.215191 0.683301 -0.490411 0.281906 -0.630930 -0.708241 0.043992 -0.711413 0.592691 0.072595 nan nan -0.966228 -0.179875 -0.502678 -0.007247 -0.666807 -0.811427 -0.580122 nan nan -0.816749 -0.563041 nan 4 4
P15291;P15291-2 0.596370 -0.235246 0.985230 0.570304 -0.331058 0.348129 -0.003310 0.275317 0.125792 0.961734 nan nan 0.578856 -0.599561 0.496488 -0.072229 0.014830 -0.272047 -0.038077 nan nan 0.584357 -0.113144 nan 4 4
E9PN95;P11684 -0.044137 0.539586 0.373076 -0.416898 0.092771 -0.241660 0.183922 0.598536 0.553236 0.989064 nan nan 0.572649 0.682816 -0.972121 -0.214485 0.864893 -0.729323 -0.986130 nan nan 0.624754 0.488347 nan 4 4
P01704 0.467663 -0.094260 0.182829 -0.028085 -0.403467 -0.769039 0.330325 0.534897 0.229086 -0.463726 nan nan 0.151957 0.817778 0.353124 -0.603360 -0.868895 -0.375894 0.556847 nan nan 0.964419 -0.791674 nan 4 4
A0A087WSY4 0.341097 -0.090745 -0.850622 -0.455325 0.087264 0.666119 0.139244 0.804721 -0.612366 -0.162758 nan nan 0.149034 -0.153547 0.994065 0.280256 0.105481 -0.553907 -0.802278 nan nan -0.767064 0.652348 nan 4 4
B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 -0.586205 -0.639539 -0.184916 -0.468526 -0.339314 -0.950792 -0.671857 -0.978604 -0.206013 0.640277 nan nan -0.926715 0.253163 -0.611749 -0.688762 -0.360095 -0.359254 0.022091 nan nan -0.630160 0.512953 nan 4 4
P78310;P78310-2;P78310-5;P78310-6;P78310-7 -0.287802 0.585634 0.708781 0.187632 0.155664 0.236820 0.138196 0.177240 0.323788 -0.925310 nan nan -0.002267 -0.809940 0.370874 -0.992637 0.096872 -0.740181 0.225409 nan nan -0.197104 0.065686 nan 4 4
P69905 0.965583 -0.438234 -0.836464 0.990263 -0.329547 -0.734862 0.928251 -0.884057 0.787808 0.652423 nan nan 0.995940 0.901529 -0.089591 0.819375 -0.866314 -0.613292 0.751098 nan nan 0.599413 0.563706 nan 4 4
P62805 0.288891 0.176837 0.306050 0.216616 -0.196019 0.997199 0.144702 0.999754 0.762963 0.304920 nan nan 0.058312 0.169315 0.970248 -0.577842 0.753887 0.967646 0.595564 nan nan -0.353942 -0.929071 nan 4 4
Q8WXD2 -0.194738 -0.327466 -0.501355 0.030615 -0.032132 -0.644254 -0.107541 -0.896598 -0.044385 -0.527932 nan nan -0.031366 0.169239 -0.879923 -0.696602 0.420495 0.367920 0.523029 nan nan -0.304569 0.801129 nan 4 4
P31150 -0.695110 -0.312927 -0.777454 0.079765 -0.349764 -0.302639 -0.919334 -0.258552 0.871434 0.374253 nan nan -0.383378 0.322925 -0.744603 -0.826531 0.317684 0.679134 -0.106456 nan nan 0.018162 0.997740 nan 4 4
P55058 0.331515 -0.632377 -0.015757 -0.453671 -0.267412 -0.730945 0.512525 -0.687075 0.561214 0.511731 nan nan -0.850960 -0.898916 -0.860013 0.937309 0.166547 0.109603 0.685787 nan nan -0.314778 -0.708824 nan 4 4
Q99538 0.593821 -0.275201 -0.451008 0.377925 -0.527607 -0.288458 -0.102663 -0.152372 0.223491 -0.280701 nan nan 0.914120 -0.194519 0.708292 -0.029421 0.518933 0.753316 0.491944 nan nan -0.421840 -0.113954 nan 4 4
Q9NS85 0.209240 -0.124772 -0.211847 -0.748054 -0.517249 0.174286 0.922441 0.827546 -0.293415 -0.490454 nan nan -0.811087 0.396028 0.688432 0.684565 0.500513 -0.573150 0.696927 nan nan 0.028761 0.700517 nan 4 4
A8MXB9;J3KQJ1;Q8NBJ7 -0.526368 -0.127019 -0.708682 -0.288690 -0.712488 0.719047 -0.528825 0.882512 0.575047 0.781017 nan nan 0.983037 -0.767503 -0.315140 -0.447225 0.792325 0.092247 -0.218425 nan nan -0.625728 -0.380936 nan 4 4
A0A075B7B8 0.995309 0.887667 0.963291 -0.584344 0.835407 0.645944 0.834050 0.628331 0.493563 0.290645 nan nan -0.198260 0.197362 0.596184 0.426337 0.660357 -0.818663 -0.952064 nan nan -0.419682 0.603615 nan 4 4
A0A0G2JRN3 -0.879258 -0.483820 0.407109 -0.190835 -0.821424 0.676931 -0.334543 0.355859 -0.978184 -0.922357 nan nan -0.966410 0.804803 -0.890772 -0.522710 -0.980154 -0.786540 -0.525030 nan nan 0.786883 0.273736 nan 4 4
Q9ULP0-3;Q9ULP0-6 -0.433605 -0.040819 0.019276 0.280988 0.363213 -0.127518 -0.027651 -0.217068 -0.824736 0.569382 nan nan 0.047124 0.209891 -0.147969 0.088032 0.739418 0.597287 -0.248417 nan nan 0.651242 0.678026 nan 4 4
P10124 -0.597027 -0.292157 -0.183093 -0.186782 -0.250682 0.571509 -0.247751 -0.252024 0.436283 -0.628340 nan nan -0.573988 -0.985971 -0.483333 -0.479793 -0.204656 -0.261585 -0.817668 nan nan 0.414290 0.578378 nan 5 5
Q13508;Q13508-2;Q13508-3 -0.588905 0.585571 -0.012161 0.317771 0.874966 0.367007 0.875101 -0.137197 -0.196508 0.072987 nan nan 0.595601 -0.148856 -0.921969 -0.902558 -0.822271 -0.647509 -0.125325 nan nan -0.816043 0.878974 nan 5 5
Q5BIV9 -0.499694 -0.497641 -0.439066 -0.073578 0.196112 -0.595082 -0.508927 -0.855339 -0.481080 -0.438393 nan nan -0.128645 -0.289227 -0.778869 -0.513219 -0.146415 -0.261649 0.232945 nan nan -0.127227 -0.239878 nan 5 5
P00441 -0.466488 -0.294375 -0.448908 -0.418119 -0.159710 0.047755 -0.615701 -0.045985 -0.047843 -0.228959 nan nan -0.221865 -0.845945 -0.478446 -0.697790 0.656874 -0.145801 -0.527482 nan nan 0.030159 -0.379242 nan 5 5
P01282;P01282-2 -0.216410 0.630950 0.498602 0.209445 0.221254 0.888676 -0.187627 -0.090521 -0.394861 -0.216388 nan nan -0.782966 -0.070473 0.261410 0.709596 -0.596828 0.425854 0.919430 nan nan -0.467714 0.087763 nan 5 5
O75339 0.036295 -0.346935 -0.169974 0.082569 -0.523435 -0.383440 -0.131520 -0.134371 -0.662474 0.284821 nan nan -0.707550 0.152225 -0.475700 -0.809598 0.017363 -0.206109 0.523349 nan nan 0.211736 0.029954 nan 5 5
A0A0G2JRQ6 -0.226956 -0.759670 -0.502194 -0.249237 -0.234283 -0.267241 -0.594549 -0.104896 0.030670 -0.906417 nan nan -0.206273 0.600598 -0.360860 0.551245 0.151696 0.307142 -0.805609 nan nan -0.589769 -0.589769 nan 5 5
G3V2U7;P07311 0.478113 -0.238618 -0.686224 0.277132 -0.007988 0.126790 -0.235804 -0.075889 -0.045458 0.298977 nan nan -0.201011 -0.455432 -0.434263 -0.349618 0.323939 0.286177 -0.047089 nan nan -0.412639 -0.436945 nan 5 5
P01834 0.145967 -0.411931 -0.406536 -0.196562 -0.612906 0.157569 0.314805 0.333278 0.249827 0.136816 nan nan -0.132585 0.503725 -0.169066 -0.492225 -0.044416 0.283761 0.783489 nan nan -0.157407 -0.157407 nan 5 5
P02533 0.968900 -0.669619 -0.898847 0.859288 -0.433821 0.106114 0.667975 0.085868 0.760846 0.490144 nan nan 0.864174 0.283604 0.303053 0.460298 -0.114587 0.446403 0.060099 nan nan 0.207036 -0.545616 nan 5 5
O15031 -0.384689 0.839600 0.575780 0.545446 0.286972 0.392387 0.728316 0.288900 0.278546 0.565509 nan nan 0.144879 -0.620584 -0.264862 -0.348062 -0.104415 -0.121316 0.243429 nan nan -0.296688 -0.296688 nan 5 5
A0A0C4DGV8;Q13214;Q13214-2 0.215023 0.141294 -0.730594 -0.952860 -0.817245 -0.317076 0.067910 -0.315165 -0.648660 -0.694448 nan nan -0.483616 0.800076 0.951239 0.908215 -0.452530 -0.051800 -0.172222 nan nan -0.698798 -0.910792 nan 5 5
A0A087WTA8;P08123 0.096399 0.311864 0.440189 0.343521 -0.215563 0.040533 -0.831089 0.508009 -0.588204 0.186415 nan nan -0.351979 0.713450 -0.044345 0.288494 0.197417 -0.338414 -0.646585 nan nan 0.534275 0.099873 nan 5 5
D6R956;P09936 0.296679 0.043760 -0.017206 -0.551658 0.597571 -0.259386 0.285553 -0.664319 0.151812 0.024035 nan nan 0.313797 -0.627608 -0.119808 -0.402161 -0.341539 -0.212790 -0.519916 nan nan -0.514141 0.033000 nan 5 5
B1AJZ9;B1AJZ9-4;H0YE38;Q5JYW6 -0.213396 0.107639 -0.225299 -0.655514 -0.465770 -0.355725 -0.403188 -0.358839 -0.320693 0.330414 nan nan -0.420475 0.109067 -0.515651 -0.659479 -0.192616 0.428130 0.082893 nan nan 0.163593 0.301970 nan 5 5
Q9UHI8 0.073471 -0.102686 0.014193 -0.445122 -0.148405 0.316329 0.126096 -0.061541 0.216956 -0.119815 nan nan -0.208262 0.520078 0.051369 0.334440 0.087100 0.118070 -0.250934 nan nan -0.151806 -0.746015 nan 5 5
Q9NZP8 0.485369 -0.183202 0.557411 -0.596706 0.356651 0.391275 0.431668 0.135336 -0.921105 0.672485 nan nan -0.849012 0.392650 0.108741 -0.011939 0.644961 0.306441 -0.367893 nan nan 0.307325 0.307325 nan 5 5
Q9NZC2;Q9NZC2-2;Q9NZC2-3 -0.532191 -0.570799 -0.493292 -0.549588 -0.297234 0.118561 -0.846149 -0.020609 -0.710573 -0.652656 nan nan -0.534201 0.064670 0.889235 -0.779762 0.588627 -0.454867 -0.255885 nan nan 0.820294 0.820294 nan 5 5
Q8N428 -0.711604 -0.753251 -0.384073 -0.372718 -0.445067 -0.815146 -0.682438 -0.973013 0.485147 -0.775526 nan nan 0.896715 -0.678569 -0.145359 0.398169 -0.484174 0.571660 0.510003 nan nan -0.592174 0.633450 nan 5 5
Q6ZVL6 -0.405009 0.298309 0.558766 0.541507 0.054537 -0.626545 0.188454 -0.421210 0.745658 0.380994 nan nan 0.809662 0.498901 0.343913 -0.237720 0.526273 0.617457 0.285531 nan nan -0.305862 -0.650631 nan 5 5
Q10469 -0.098233 -0.534922 -0.719899 -0.291141 -0.527896 -0.671391 -0.395838 -0.600598 0.649427 0.606420 nan nan -0.185669 0.080247 -0.517750 -0.162715 0.489727 -0.550749 -0.257022 nan nan -0.084422 0.700076 nan 5 5
Q8NFZ4 -0.225202 0.128062 0.059731 0.001304 -0.377481 -0.011690 -0.061573 0.057673 -0.422722 -0.787582 nan nan 0.200058 0.031356 0.685266 0.396679 0.892019 -0.363897 0.027415 nan nan -0.741786 0.576960 nan 5 5
O75882;O75882-2;O75882-3 0.808756 0.597213 0.728211 -0.195404 0.034871 0.647639 0.949369 0.664223 0.069134 -0.175888 nan nan 0.085212 0.010297 -0.812939 0.161950 -0.706524 -0.782643 0.318492 nan nan -0.899663 0.811337 nan 5 5
E9PKE3;P11142 -0.225336 -0.225165 -0.272864 0.346178 0.430965 -0.515610 0.103645 -0.674428 0.038561 -0.291304 nan nan -0.729284 -0.877792 -0.288577 0.003161 -0.327598 -0.507155 -0.311310 nan nan 0.281878 0.415848 nan 6 6
H3BRQ4;K4DIB9;P50238 -0.093569 -0.184898 -0.264632 -0.057295 -0.388212 -0.083371 -0.172636 -0.277897 0.428824 0.210272 nan nan 0.321101 -0.590159 -0.052992 0.460109 0.474277 0.387204 0.397906 nan nan 0.012052 -0.720358 nan 6 6
P01036;P01037 0.014913 -0.284069 -0.295049 -0.482398 -0.326443 -0.220449 -0.272399 -0.046435 -0.115861 -0.146823 nan nan 0.512862 -0.075426 -0.389652 -0.354472 0.279976 -0.362058 -0.565774 nan nan 0.673582 0.579024 nan 6 6
M0R1Q1 0.396686 0.419092 0.263690 0.325560 -0.179042 -0.039238 0.142109 -0.085327 -0.106837 0.714349 nan nan -0.163523 -0.831651 -0.244713 0.064760 0.097193 -0.433376 -0.409335 nan nan 0.641507 -0.168034 nan 6 6
M0R009 -0.339372 -0.602374 -0.246287 -0.584432 -0.719774 -0.356792 -0.056454 -0.247140 -0.037546 -0.001198 nan nan -0.732570 0.147271 -0.580618 0.320436 -0.451889 -0.750481 -0.771040 nan nan 0.635403 -0.182952 nan 6 6
P29401;P29401-2 0.085483 0.129807 -0.150347 -0.368660 0.531401 0.223946 -0.360097 0.365861 0.001412 -0.392136 nan nan 0.108221 0.255466 0.187563 0.224033 0.405335 -0.281768 0.086132 nan nan 0.300495 0.397517 nan 6 6
P05451 0.097378 -0.441128 -0.294145 -0.057327 -0.769861 -0.360316 -0.719673 -0.684154 -0.327039 0.306881 nan nan -0.336078 0.043237 -0.538331 -0.470228 -0.425665 -0.412800 -0.026200 nan nan -0.749296 -0.302038 nan 6 6
P10644;P10644-2 -0.252574 0.065703 -0.590319 -0.097608 -0.042255 -0.336136 0.122475 -0.255617 0.448682 -0.000994 nan nan 0.107935 0.714397 0.383896 -0.528502 0.152861 0.577309 0.480850 nan nan -0.154795 -0.502540 nan 6 6
P80108 -0.376648 -0.400943 -0.443181 -0.172435 -0.361780 -0.184693 -0.192249 -0.137669 0.293716 -0.492363 nan nan 0.087362 -0.279293 -0.600824 -0.578633 -0.695436 -0.443645 0.276221 nan nan 0.119112 0.119112 nan 7 7
Q15782;Q15782-6 0.541282 0.760955 0.453398 0.544543 -0.245504 -0.457200 0.561364 -0.759973 0.508678 0.748062 nan nan 0.831187 0.613225 -0.069165 -0.032695 0.616707 0.921498 0.574544 nan nan -0.520388 0.414014 nan 7 7
Q96S96 -0.324600 -0.068711 -0.079901 -0.462902 -0.337798 -0.359991 -0.148849 -0.296281 -0.305383 0.059875 nan nan -0.582492 -0.731655 -0.157328 -0.153988 -0.707385 -0.086735 0.235913 nan nan -0.682257 0.760044 nan 7 7
Q8IWU5;Q8IWU5-2 -0.218403 -0.293641 0.175722 -0.483524 -0.540851 -0.071915 -0.343877 -0.451814 -0.684735 0.392762 nan nan -0.070980 -0.247075 0.334826 0.479673 -0.666186 -0.096058 0.011527 nan nan 0.632888 -0.013472 nan 7 7
P19835;X6R868 -0.578277 -0.449392 -0.702783 -0.428965 -0.655040 -0.359418 -0.859368 -0.458749 -0.930907 0.408941 nan nan -0.762851 -0.543660 -0.321335 -0.213048 0.036173 -0.001008 -0.403300 nan nan -0.780480 0.407170 nan 7 7
P35443 -0.174408 0.012854 0.526426 -0.252097 0.290786 0.404203 0.597353 0.582749 0.068469 -0.211928 nan nan 0.314822 -0.233013 -0.017793 -0.130271 -0.765372 0.039112 0.003036 nan nan 0.022830 0.275796 nan 7 7
P16083;Q5TD07 0.032550 0.330372 0.143689 0.487772 -0.384301 0.339314 0.365007 0.599143 -0.622392 -0.468872 nan nan 0.220317 -0.045870 -0.775371 -0.704979 -0.250107 0.186306 -0.010250 nan nan -0.530460 -0.697128 nan 7 7
P51884 0.582757 -0.288561 0.013636 0.531542 0.148666 -0.331174 0.647818 -0.291611 -0.063590 -0.760356 nan nan 0.361296 -0.421648 0.243111 0.498870 0.109291 0.197945 0.445695 nan nan -0.230796 -0.230796 nan 7 7
C9JKT8;Q9UEW3;Q9UEW3-2 -0.293733 0.185454 -0.081072 -0.671856 -0.319385 0.252685 -0.351222 0.407657 0.516418 -0.293680 nan nan -0.461590 0.140443 -0.356302 -0.545248 -0.381322 -0.645563 -0.824795 nan nan -0.058753 -0.279069 nan 7 7
B0QYF8;P02144 0.080182 0.393220 0.387582 0.219902 -0.251514 -0.125582 0.634180 0.307024 -0.272763 -0.369672 nan nan -0.353914 0.484599 -0.932969 -0.882776 0.179168 0.090686 0.045923 nan nan -0.770465 -0.286177 nan 7 7
O75173;O75173-2;Q5VTW1 0.540898 0.302016 0.319705 -0.164242 0.172791 0.674482 0.405285 0.754451 -0.057066 -0.080403 nan nan 0.127738 0.465212 -0.114357 -0.317503 -0.189888 -0.576178 0.216410 nan nan 0.167095 0.137148 nan 7 7
P56817;P56817-2 0.511872 0.140449 0.206446 -0.093333 -0.267017 0.176506 0.248143 0.310725 0.174447 0.466366 nan nan 0.382517 -0.583295 0.378440 0.324176 -0.073491 0.457098 0.390217 nan nan 0.149203 -0.237699 nan 7 7
Q9BX67 0.701146 -0.263727 -0.036035 0.063862 0.042128 -0.048843 0.679110 -0.246240 0.445479 0.340506 nan nan 0.033937 0.199213 -0.269549 -0.431929 -0.340610 0.232872 -0.038976 nan nan -0.194531 -0.561094 nan 7 7
Q9BT88 -0.231560 -0.318093 -0.439269 -0.505983 -0.556064 0.248942 -0.044628 0.470496 -0.357877 -0.223776 nan nan 0.096659 0.124113 -0.078098 0.471098 -0.009569 0.626185 0.474545 nan nan 0.342112 0.354892 nan 7 7
K7ES70;P55083;P55083-2 0.053310 -0.068479 0.077745 -0.070135 -0.216039 0.203235 0.151685 0.059892 -0.126223 0.098984 nan nan 0.114059 0.017831 -0.093409 0.299230 -0.796695 0.019436 -0.278499 nan nan 0.371669 0.274381 nan 8 8
K7EKE8;Q92692;Q92692-2 0.199577 -0.116588 0.102888 0.035418 -0.390527 -0.300452 0.046691 -0.213314 -0.055248 0.203956 nan nan -0.185280 0.023561 -0.443701 -0.533869 0.334938 -0.385227 -0.493122 nan nan 0.621417 0.523851 nan 8 8
P12273 0.309929 -0.106217 -0.539555 0.125788 -0.221281 -0.170046 0.620734 -0.219819 -0.188677 0.272716 nan nan -0.741473 -0.379759 -0.153439 -0.054931 0.019636 -0.291898 -0.305074 nan nan -0.347341 -0.122712 nan 8 8
P17677;P17677-2 -0.225142 0.291270 -0.109052 -0.046967 0.264213 -0.060209 0.597347 -0.272913 -0.312226 0.255004 nan nan -0.234374 -0.239127 0.302693 0.676453 -0.131942 -0.380524 0.302604 nan nan 0.169275 -0.057105 nan 8 8
P18206;P18206-2 0.275209 0.460561 0.274986 -0.189763 0.012492 -0.393250 0.222341 -0.430189 0.404893 0.478030 nan nan -0.101734 -0.717748 0.168785 0.035300 0.367803 -0.121760 0.553271 nan nan -0.668040 0.105408 nan 8 8
P13645 0.963941 -0.296929 0.398515 0.918731 0.482631 0.601159 0.890981 0.675528 0.851338 0.977696 nan nan -0.343877 0.227459 -0.001342 -0.531893 -0.314453 -0.406771 -0.063214 nan nan -0.874671 -0.050935 nan 8 8
Q5SRI9 -0.274015 0.552476 0.366659 -0.087155 0.563963 -0.584189 -0.207766 -0.348800 0.733544 -0.259834 nan nan 0.162788 0.328694 0.592005 0.136408 0.044433 -0.088515 0.375309 nan nan 0.192704 -0.380808 nan 8 8
Q5JRA6;Q5JRA6-2 -0.221290 0.169740 -0.081361 -0.172243 -0.059984 0.450879 -0.327147 0.021417 -0.649397 -0.669281 nan nan -0.367023 0.332314 -0.544434 -0.362956 0.080021 -0.396093 -0.724821 nan nan 0.085440 0.745223 nan 8 8
Q13790 -0.521996 0.502512 0.298571 -0.198715 0.261663 -0.462789 0.293426 0.222006 -0.215508 -0.483552 nan nan 0.029153 0.017264 0.174426 -0.331592 -0.388649 0.061342 0.044322 nan nan 0.078235 -0.389478 nan 8 8
P05556;P05556-2;P05556-3;P05556-4;P05556-5 -0.472444 -0.030604 -0.314654 -0.424667 -0.361837 -0.339333 -0.231172 -0.164482 -0.016348 0.328409 nan nan -0.091371 -0.018265 0.247949 0.184702 0.353914 -0.061795 0.139908 nan nan 0.003355 0.029215 nan 8 8
Q9HBT6 -0.262534 0.054967 0.215041 0.204430 -0.015586 0.099041 -0.589051 -0.130810 0.445839 -0.463107 nan nan 0.711355 0.403674 -0.045300 -0.091698 -0.095759 -0.213494 0.115791 nan nan 0.106877 -0.188251 nan 8 8
Q9P232 -0.178684 -0.245937 -0.044191 -0.387374 0.008159 -0.587127 -0.360846 -0.640844 -0.695382 -0.457551 nan nan -0.400998 0.281978 0.256287 0.419846 -0.737696 0.477733 0.657573 nan nan -0.766501 -0.447343 nan 8 8
Q9UM22 0.095872 0.704424 0.250795 0.360971 -0.310580 0.210947 0.415552 0.579654 0.376433 0.702658 nan nan 0.705769 0.604392 0.396264 0.202060 -0.690243 -0.182529 -0.210782 nan nan 0.544593 0.023512 nan 8 8
Q9Y653;Q9Y653-2;Q9Y653-3 -0.130134 -0.637618 -0.532727 -0.417296 -0.181330 -0.604021 -0.048771 -0.672811 -0.191744 0.165959 nan nan -0.729686 -0.014206 0.286967 0.024990 0.365352 -0.282547 -0.078242 nan nan 0.406484 -0.099690 nan 8 8
O15204;O15204-2 0.584330 0.918572 -0.360237 0.230570 -0.399612 -0.376917 0.793931 -0.381210 0.401527 0.683591 nan nan 0.073044 0.597812 -0.632177 -0.384078 0.434960 0.554693 0.405536 nan nan -0.369992 0.163446 nan 8 8
A0A0G2JQD2;A0A0G2JQM0;A0A0G2JRN4;P30711 -0.278053 -0.059338 -0.299129 -0.254147 -0.029111 -0.176005 0.550803 -0.274112 0.078554 -0.628984 nan nan -0.142576 0.494811 -0.484875 -0.287631 -0.284062 -0.051869 -0.432316 nan nan 0.739107 -0.015886 nan 8 8
Q14019 -0.344741 0.321704 0.114891 0.165256 0.097457 -0.725812 0.289589 -0.693759 0.525948 -0.022084 nan nan 0.367858 0.001572 0.064769 0.401079 -0.007457 0.256334 0.104904 nan nan -0.513253 -0.468898 nan 9 9
Q06481;Q06481-2 0.409385 0.124210 0.247080 -0.068025 -0.234771 0.173026 0.435174 0.400801 -0.430178 0.529760 nan nan -0.014359 -0.114514 -0.102763 0.386024 0.496540 -0.337000 -0.416108 nan nan 0.213431 -0.282300 nan 9 9
A0A087X1V2 -0.070690 -0.038396 0.191590 -0.243238 -0.013846 0.057184 -0.147707 0.003097 0.088347 -0.016908 nan nan 0.227076 0.449367 -0.181107 -0.345283 0.314176 0.241761 -0.164150 nan nan 0.039602 -0.005971 nan 9 9
B1ALD9;Q15063;Q15063-3;Q15063-5 -0.101006 0.276383 -0.386316 -0.287475 -0.065944 -0.240490 0.234487 -0.113355 -0.369493 0.319297 nan nan 0.425439 -0.040114 0.080040 0.086550 0.196306 0.619472 0.468683 nan nan 0.053837 -0.648727 nan 9 9
A0A0A0MRJ6;F6S8N6;H7BY58;P22061;P22061-2 0.302851 -0.238476 -0.011681 -0.170927 -0.163722 0.109176 0.427046 0.179939 -0.309833 0.501859 nan nan -0.040253 -0.149094 -0.578080 -0.848097 -0.751595 0.184935 0.197383 nan nan -0.241246 -0.541386 nan 9 9
A0A0C4DH24 -0.413803 -0.478112 -0.242685 -0.451837 -0.314515 -0.168096 -0.519206 -0.181333 -0.444924 0.152891 nan nan -0.499855 0.095468 0.293136 0.105732 0.247235 0.189404 -0.079999 nan nan -0.601267 0.147397 nan 9 9
B1AJQ6;Q86Y82 0.036729 -0.338502 0.123148 0.003516 0.094614 0.200578 -0.556243 0.321117 0.511875 0.436191 nan nan 0.363410 0.429255 0.081571 0.042638 -0.069041 0.006824 0.069755 nan nan -0.332403 -0.058363 nan 9 9
E9PL83;P35318 0.357204 -0.209888 0.375894 -0.015352 0.455210 -0.054645 0.517585 -0.457266 0.123687 -0.539022 nan nan 0.167702 -0.169518 -0.347613 -0.174028 0.106301 -0.082180 0.188665 nan nan -0.339713 -0.744234 nan 9 9
E9PGA6;Q9BXJ4;Q9BXJ4-2;Q9BXJ4-3 0.124919 0.277052 0.094868 -0.190596 -0.105489 0.189015 0.543206 0.273937 -0.120507 0.106538 nan nan 0.095878 -0.115276 0.608922 0.429551 0.079898 0.323193 0.203013 nan nan 0.096747 0.648033 nan 9 9
O43529 0.020461 -0.013220 -0.272887 0.174158 -0.195494 -0.013005 0.179683 -0.117824 0.195129 0.128859 nan nan -0.187278 -0.392751 0.069547 0.283319 0.424633 0.566172 0.382211 nan nan -0.238252 -0.423734 nan 9 9
Q9Y6C2 0.340254 -0.037013 0.260312 -0.272410 -0.647096 0.348677 0.109492 0.614777 -0.561969 0.551990 nan nan 0.042545 -0.011904 0.079803 0.126374 -0.189944 -0.186103 -0.063065 nan nan 0.017971 -0.127935 nan 9 9
Q9H8J5 0.212741 -0.277422 0.339434 0.179407 0.015454 -0.105348 0.037673 0.219253 -0.539968 0.309827 nan nan -0.131488 -0.426695 -0.288969 -0.356958 -0.383166 -0.502489 -0.521808 nan nan 0.079233 0.320108 nan 9 9
P08493;P08493-2 0.038081 -0.215232 -0.452859 -0.232150 -0.144641 -0.635377 0.764655 -0.703795 -0.351766 0.150211 nan nan 0.024427 -0.017811 -0.406176 -0.246588 0.611507 0.360308 -0.049740 nan nan -0.397982 -0.397982 nan 9 9
B1AJR6;B1AJR9;B1AJS0;O14522 0.530251 -0.077189 -0.237757 0.093327 -0.129644 -0.445185 0.142851 -0.353217 0.644977 0.465073 nan nan -0.049114 0.076749 -0.304727 -0.305222 -0.516284 0.183477 0.759872 nan nan 0.348627 -0.076654 nan 9 9
P10745 -0.236641 -0.055035 0.132291 0.023283 0.249140 0.468128 -0.248778 0.489925 -0.073139 0.328450 nan nan 0.136523 -0.209085 0.597717 0.544391 -0.135434 0.161125 0.021381 nan nan 0.688639 -0.246050 nan 9 9
P50395 0.013608 -0.499149 -0.001964 0.174813 -0.400909 0.327166 -0.111803 0.193138 0.152363 0.368766 nan nan 0.016074 -0.611504 0.147720 0.026398 -0.385451 0.045322 0.277061 nan nan -0.756427 -0.524876 nan 9 9
P40121;P40121-2 -0.202385 0.174906 0.179679 0.354102 0.530344 0.424692 -0.418731 0.529660 0.206394 0.067427 nan nan -0.529026 0.106692 0.396670 0.185199 0.143594 -0.359970 0.082299 nan nan -0.176933 -0.108032 nan 9 9
P23468-2 0.493788 0.564488 0.626671 -0.171365 0.545512 0.596700 0.258028 0.308434 0.116222 0.669847 nan nan 0.594490 -0.511545 0.315955 0.244091 -0.088072 -0.216794 -0.092626 nan nan 0.102505 0.055662 nan 10 10
O76070 -0.432052 0.274759 0.152764 -0.215355 0.275401 -0.204646 -0.227776 0.162530 0.469000 0.054400 nan nan -0.278608 0.691494 -0.212317 -0.086282 -0.337393 -0.128095 0.178774 nan nan -0.474129 0.337800 nan 10 10
A0A0C4DH38 0.206311 0.269573 0.171244 0.156814 -0.401812 0.806180 0.250020 0.814056 -0.279316 0.003565 nan nan 0.582856 0.536127 0.313090 0.516658 0.237863 -0.222864 -0.444024 nan nan 0.106138 -0.497263 nan 10 10
A0A1W2PQB1;H0Y755;M9MML0;P08637 -0.090323 0.634542 0.399653 -0.253373 0.752411 0.460288 0.087874 0.690784 0.595184 -0.139070 nan nan 0.331327 0.082747 -0.081580 -0.266497 0.252034 0.540145 0.237650 nan nan -0.617375 0.448386 nan 10 10
Q6UWH4;Q6UWH4-2 -0.588718 -0.149646 -0.470844 0.098882 -0.472780 -0.397136 -0.791764 0.151063 -0.079928 -0.371243 nan nan -0.426708 -0.384665 -0.578150 -0.418792 -0.052021 0.143652 -0.090828 nan nan -0.126561 -0.059018 nan 10 10
P08670 -0.314275 -0.552437 -0.082178 -0.179390 -0.209779 -0.501234 -0.304603 -0.180491 -0.411473 0.025687 nan nan 0.206552 0.478899 0.551632 0.469208 -0.068671 0.402038 -0.106562 nan nan 0.028298 0.291405 nan 10 10
P05362 0.260185 0.111855 -0.012399 0.353519 -0.241539 0.322340 -0.598690 0.655755 -0.578210 0.231633 nan nan -0.670783 -0.036818 0.104107 -0.653851 -0.496616 -0.447509 -0.134554 nan nan 0.390289 0.574095 nan 10 10
F8WD41;Q15166 -0.202856 0.429966 0.105490 0.391304 0.324974 -0.359926 0.046618 -0.291802 0.180810 -0.065244 nan nan 0.029724 -0.322490 -0.066574 0.265087 -0.203570 0.363056 -0.080742 nan nan 0.207901 0.110472 nan 11 11
P01742 0.375201 -0.153611 0.326738 -0.267428 -0.351380 0.026400 0.306093 0.271311 -0.587962 -0.094306 nan nan -0.393033 -0.205356 0.676899 0.724786 0.203480 0.594179 0.503616 nan nan -0.194256 0.578374 nan 11 11
Q9NQS3;Q9NQS3-2;Q9NQS3-3 -0.424592 -0.063953 -0.060579 -0.139485 -0.169269 -0.111027 -0.224083 -0.140219 0.069189 -0.172248 nan nan 0.202371 -0.574758 0.035925 0.120460 0.604995 0.092650 -0.361905 nan nan 0.156133 -0.174368 nan 11 11
P55774 0.103249 -0.158828 -0.123305 -0.308916 0.082870 -0.215141 0.280781 -0.218578 -0.211760 -0.158074 nan nan -0.181675 -0.083477 -0.183622 -0.001697 0.730929 -0.506454 -0.172455 nan nan -0.584551 -0.551401 nan 11 11
Q9UFP1 0.121246 -0.300301 0.131683 -0.094306 0.031877 0.028588 0.090304 0.208374 0.056005 -0.173611 nan nan -0.123046 0.489002 -0.266922 -0.405138 -0.086840 0.076316 0.313306 nan nan 0.159000 0.376368 nan 12 12
Q9BUJ0 -0.037313 -0.237783 -0.145656 -0.314651 -0.102085 -0.098921 -0.171046 0.027051 -0.265561 -0.119433 nan nan -0.111523 -0.073993 -0.202398 -0.237412 -0.249121 -0.107921 0.386495 nan nan 0.049800 -0.265784 nan 12 12
F8W703;Q9HCK4;Q9HCK4-2;Q9HCK4-3;R4GMM8 0.167222 0.535578 0.520488 -0.168381 0.289721 0.103854 -0.057334 0.077320 -0.352388 0.444449 nan nan -0.181184 -0.001253 0.819450 0.322500 0.307470 0.546251 0.282076 nan nan -0.614758 -0.839847 nan 12 12
Q96RW7;Q96RW7-2 -0.274538 -0.152896 -0.128715 -0.301343 -0.083378 -0.398478 -0.052788 -0.352505 0.219858 -0.008263 nan nan 0.625088 0.300343 0.502826 0.258426 -0.328800 -0.017822 -0.198446 nan nan 0.378646 0.234101 nan 12 12
A1L4H1 0.186316 -0.268853 -0.453403 0.309188 -0.229280 0.072585 0.199842 -0.116746 -0.221594 0.008497 nan nan -0.326665 -0.397964 0.300334 -0.324173 0.006907 0.289843 0.178257 nan nan -0.031547 -0.042797 nan 12 12
A0A087WSV8;V9HW75 0.506714 0.303788 0.389117 0.169682 -0.165770 0.112915 0.415837 0.190487 -0.229917 0.192111 nan nan 0.245198 -0.035197 -0.062070 -0.112864 -0.024355 -0.168605 -0.071594 nan nan 0.472271 -0.420030 nan 12 12
P53634 0.321988 -0.070735 0.253750 -0.130860 -0.230908 -0.101786 0.163428 -0.152078 0.263806 0.352835 nan nan 0.180917 -0.081091 0.333731 0.177133 0.232156 -0.016131 0.045658 nan nan -0.353595 0.259023 nan 12 12
B4DYV8;Q8WZ75;Q8WZ75-2;Q8WZ75-3 0.270149 0.432661 0.114235 -0.317889 0.264029 0.130729 0.341275 0.093670 -0.515780 0.120207 nan nan -0.440024 -0.023383 0.078349 0.291298 0.147052 0.133283 -0.408509 nan nan 0.085688 -0.033982 nan 14 14
P26447 0.217122 0.396173 0.429465 0.396644 -0.308501 -0.003897 0.217616 -0.234569 0.245939 0.395222 nan nan 0.433258 0.507762 0.126961 0.053396 -0.313181 0.341588 0.028673 nan nan -0.220886 -0.189247 nan 14 14
Q8IWT1 -0.047436 -0.056599 -0.180531 -0.004676 0.195685 0.063009 -0.312294 -0.010575 0.380227 -0.308109 nan nan 0.002011 0.162880 -0.218865 -0.342029 -0.006976 -0.114925 -0.302952 nan nan 0.606799 -0.274417 nan 14 14
Q6PCB0 -0.010818 -0.297155 -0.160731 -0.187322 -0.445445 -0.287276 -0.157095 -0.323757 -0.316441 0.132796 nan nan -0.011942 0.261268 -0.530040 -0.329471 0.070670 -0.188112 0.226026 nan nan -0.023465 -0.115061 nan 14 14
O43852;O43852-3;O43852-5 0.193891 0.198643 -0.129793 -0.218009 0.040860 0.295117 0.060138 0.141397 0.293714 -0.062552 nan nan -0.155637 0.108296 0.614990 0.394764 0.262998 0.574898 0.595861 nan nan -0.110063 -0.377160 nan 15 15
P11597;P11597-2 0.091263 -0.288931 -0.304143 -0.244961 -0.119263 -0.071420 -0.088106 -0.029940 0.225605 -0.022021 nan nan -0.443310 0.029185 -0.211026 0.052557 0.261912 0.330721 0.108906 nan nan 0.400242 -0.067354 nan 16 16
Q96AQ6;Q96AQ6-2 0.074929 -0.055779 0.088331 -0.286627 0.227730 0.266461 -0.083947 0.377825 -0.047226 -0.184507 nan nan -0.159338 0.204428 -0.158511 0.136220 -0.083160 -0.189966 -0.048210 nan nan 0.554486 -0.006715 nan 16 16
Q6ZMP0;Q6ZMP0-2 -0.257877 0.243712 -0.016268 0.132715 -0.178596 0.066672 -0.115560 -0.028928 -0.151054 -0.094501 nan nan -0.193571 0.316554 0.414953 -0.432438 -0.469487 -0.152902 -0.146005 nan nan 0.481989 0.031743 nan 16 16
A6XMH3;P01236;Q5I0G2 0.143374 -0.234068 -0.300993 -0.263686 -0.220400 0.249922 -0.209505 0.236575 -0.184831 0.126854 nan nan -0.456084 -0.215451 0.183538 0.124415 0.035354 -0.746360 -0.438234 nan nan 0.415223 0.098237 nan 17 17
Q13231;Q13231-3 0.028238 0.160841 0.445237 -0.248124 -0.176016 -0.144117 0.357706 -0.108428 0.011950 0.128017 nan nan 0.134116 0.142108 -0.284982 -0.276036 -0.002785 0.135271 0.335335 nan nan -0.001439 0.440431 nan 19 19
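
The mask above keeps protein groups whose correlation falls below the lower box-plot whisker of any top-N model. Assuming pimmslearn.pandas.get_lower_whiskers follows the usual Tukey convention, that bound is Q1 - 1.5 * IQR per column; a minimal pandas sketch of that assumption (illustration only):

import pandas as pd

def lower_whiskers(df: pd.DataFrame) -> pd.Series:
    # Tukey lower whisker per column: first quartile minus 1.5 times the IQR
    q1, q3 = df.quantile(0.25), df.quantile(0.75)
    return q1 - 1.5 * (q3 - q1)

toy = pd.DataFrame({"model_1": [0.1, 0.5, 0.7, 0.8, 0.9]})
print(lower_whiskers(toy))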

Error plot#

Hide code cell source

metrics = pimmslearn.models.Metrics()
test_metrics = metrics.add_metrics(
    pred_test[['observed', *TOP_N_ORDER]], key='test data')
test_metrics = pd.DataFrame(test_metrics)[TOP_N_ORDER]
test_metrics
Selected as truth to compare to: observed
BPCA VAE DAE TRKNN RF
MSE 0.455 0.481 0.480 0.500 0.516
MAE 0.432 0.436 0.442 0.458 0.465
N 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000
prop 1.000 1.000 1.000 1.000 1.000
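
pimmslearn.models.Metrics handles the bookkeeping; conceptually the MAE and MSE rows are just errors of each model column against the observed column. A self-contained toy sketch (not the pimmslearn implementation):

import pandas as pd

toy = pd.DataFrame({"observed": [10.0, 12.0, 11.0],
                    "model_a": [10.5, 11.5, 11.2],
                    "model_b": [9.0, 13.0, 10.0]})
errors = toy.drop(columns="observed").sub(toy["observed"], axis=0)
print(pd.DataFrame({"MAE": errors.abs().mean(),
                    "MSE": (errors ** 2).mean()}).T)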

Hide code cell source

n_in_comparison = int(test_metrics.loc['N'].unique()[0])
n_in_comparison
12600

Hide code cell source

_to_plot = test_metrics.loc[METRIC].to_frame().T
_to_plot.index = [feature_names.name]
_to_plot
BPCA VAE DAE TRKNN RF
protein groups 0.432 0.436 0.442 0.458 0.465

Hide code cell source

try:
    text = model_configs[["latent_dim", "hidden_layers"]].apply(
        build_text,
        axis=1)
except KeyError:
    logger.warning("No PIMMS models in comparsion. Using empty text")
    text = pd.Series('', index=model_configs.columns)

_to_plot.loc["text"] = text
_to_plot = _to_plot.fillna('')
_to_plot
BPCA VAE DAE TRKNN RF
protein groups 0.432 0.436 0.442 0.458 0.465
text LD: 10 HL: 64 LD: 10 HL: 64

Hide code cell source

fig, ax = plt.subplots(figsize=(4, 2))  # size of the plot can be adjusted
ax = _to_plot.loc[[feature_names.name]].plot.bar(
    rot=0,
    ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY}\n({n_in_comparison:,} intensities)",
    # title=f'performance on test data (based on {n_in_comparison:,} measurements)',
    color=COLORS_TO_USE,
    ax=ax,
    width=.7)
ax = pimmslearn.plotting.add_height_to_barplot(ax, size=7)
ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=7)
ax.set_xticklabels([])
fname = args.out_figures / f'2_{group}_performance_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_performance_test.pdf
[Figure: bar plot of test-data MAE per model, annotated with latent dimension and hidden-layer sizes]

Hide code cell source

dumps[fname.stem] = fname.with_suffix('.csv')
_to_plot_long = _to_plot.T
_to_plot_long = _to_plot_long.rename(
    {feature_names.name: 'metric_value'}, axis=1)
_to_plot_long['data level'] = feature_names.name
_to_plot_long = _to_plot_long.set_index('data level', append=True)
_to_plot_long.to_csv(fname.with_suffix('.csv'))

Plot error by median feature intensity#

Hide code cell source

pimmslearn.plotting.make_large_descriptors(7)
fig, ax = plt.subplots(figsize=(8, 2))

ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred=pred_test[
        [TARGET_COL] + TOP_N_ORDER
    ],
    feat_medians=data.train_X.median(),
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC,
    palette=COLORS_TO_USE
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
pimmslearn.plotting.make_large_descriptors(6)
fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
errors_binned
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: UserWarning: The palette list has more values (24) than needed (5), which may not be intended.
  sns.barplot(
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_000 A0A075B6P5;P01615 BPCA 0.467 19 912 19\n(N=912)
1 Sample_000 A0A075B6P5;P01615 VAE 0.016 19 912 19\n(N=912)
2 Sample_000 A0A075B6P5;P01615 DAE 0.209 19 912 19\n(N=912)
3 Sample_000 A0A075B6P5;P01615 TRKNN 0.422 19 912 19\n(N=912)
4 Sample_000 A0A075B6P5;P01615 RF 0.701 19 912 19\n(N=912)
... ... ... ... ... ... ... ...
62,995 Sample_209 Q9UGM5;Q9UGM5-2 BPCA 0.476 16 1,913 16\n(N=1,913)
62,996 Sample_209 Q9UGM5;Q9UGM5-2 VAE 0.329 16 1,913 16\n(N=1,913)
62,997 Sample_209 Q9UGM5;Q9UGM5-2 DAE 0.586 16 1,913 16\n(N=1,913)
62,998 Sample_209 Q9UGM5;Q9UGM5-2 TRKNN 0.442 16 1,913 16\n(N=1,913)
62,999 Sample_209 Q9UGM5;Q9UGM5-2 RF 0.252 16 1,913 16\n(N=1,913)

63000 rows × 7 columns

[Figure: test-data MAE binned by median protein group intensity for the top N models]
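
Conceptually, the plot bins each protein group by the integer value of its training-data median intensity and averages the absolute error per bin and model; bin labels such as "19\n(N=912)" show the bin and the number of intensities in it. A toy sketch of that grouping (illustration only; the exact binning rule lives in pimmslearn):

import pandas as pd

abs_err = pd.Series([0.2, 0.3, 0.3, 0.4])
feat_median = pd.Series([15.4, 15.6, 22.1, 21.9])  # per-feature training medians

binned = pd.DataFrame({"MAE": abs_err, "bin": feat_median.round().astype(int)})
print(binned.groupby("bin")["MAE"].mean())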

Hide code cell source

# ! only used for reporting
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC
)
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=63) BPCA 0.619 0.504 0.744
1 11\n(N=63) DAE 0.667 0.527 0.804
2 11\n(N=63) RF 0.580 0.459 0.709
3 11\n(N=63) TRKNN 0.589 0.482 0.698
4 11\n(N=63) VAE 0.567 0.446 0.687
... ... ... ... ... ...
85 29\n(N=12) BPCA 0.133 0.065 0.212
86 29\n(N=12) DAE 0.174 0.106 0.248
87 29\n(N=12) RF 0.214 0.129 0.306
88 29\n(N=12) TRKNN 0.230 0.158 0.317
89 29\n(N=12) VAE 0.164 0.092 0.253

90 rows × 5 columns

Hide code cell source

(errors_binned
 .set_index(
     ['model', errors_binned.columns[-1]]
 )
 .loc[ORDER_MODELS[0]]
 .sort_values(by=METRIC))
Sample ID protein groups MAE bin n_obs
intensity binned by median of protein groups
18\n(N=846) Sample_142 P09972 0.000 18 846
15\n(N=2,557) Sample_021 A0A0A0MT66 0.000 15 2,557
14\n(N=2,074) Sample_058 Q16853;Q16853-2 0.000 14 2,074
16\n(N=1,913) Sample_015 B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 0.000 16 1,913
15\n(N=2,557) Sample_079 A6NCT7;Q07092;Q07092-2 0.000 15 2,557
... ... ... ... ... ...
14\n(N=2,074) Sample_011 P11597;P11597-2 5.771 14 2,074
14\n(N=2,074) Sample_184 F8WD41;Q15166 6.195 14 2,074
17\n(N=1,393) Sample_108 P27824;P27824-2 6.482 17 1,393
14\n(N=2,074) Sample_091 F8WD41;Q15166 6.823 14 2,074
14\n(N=2,074) Sample_115 P17050 7.635 14 2,074

12600 rows × 5 columns

Custom model selection#

Hide code cell source

if SEL_MODELS:
    metrics = pimmslearn.models.Metrics()
    test_metrics = metrics.add_metrics(
        pred_test[['observed', *SEL_MODELS]], key='test data')
    test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS]
    test_metrics

    n_in_comparison = int(test_metrics.loc['N'].unique()[0])
    n_in_comparison

    _to_plot = test_metrics.loc[METRIC].to_frame().T
    _to_plot.index = [feature_names.name]
    _to_plot

    try:
        text = model_configs[["latent_dim", "hidden_layers"]].apply(
            build_text,
            axis=1)
    except KeyError:
        logger.warning("No PIMMS models in comparsion. Using empty text")
        text = pd.Series('', index=model_configs.columns)

    _to_plot.loc["text"] = text
    _to_plot = _to_plot.fillna('')
    _to_plot

    fig, ax = plt.subplots(figsize=(4, 2))
    ax = _to_plot.loc[[feature_names.name]].plot.bar(
        rot=0,
        ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)",
        # title=f'performance on test data (based on {n_in_comparison:,} measurements)',
        color=pimmslearn.plotting.defaults.assign_colors(
            list(k.upper() for k in SEL_MODELS)),
        ax=ax,
        width=.7)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    ax = pimmslearn.plotting.add_height_to_barplot(ax, size=5)
    ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5)
    ax.set_xticklabels([])

    fname = args.out_figures / f'2_{group}_performance_test_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(fig, name=fname)

    dumps[fname.stem] = fname.with_suffix('.csv')
    _to_plot_long = _to_plot.T
    _to_plot_long = _to_plot_long.rename(
        {feature_names.name: 'metric_value'}, axis=1)
    _to_plot_long['data level'] = feature_names.name
    _to_plot_long = _to_plot_long.set_index('data level', append=True)
    _to_plot_long.to_csv(fname.with_suffix('.csv'))

Hide code cell source

# custom selection
if SEL_MODELS:
    pimmslearn.plotting.make_large_descriptors(7)
    fig, ax = plt.subplots(figsize=(8, 2))

    ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
        pred=pred_test[
            [TARGET_COL] + SEL_MODELS
        ],
        feat_medians=data.train_X.median(),
        ax=ax,
        metric_name=METRIC,
        feat_name=FEAT_NAME_DISPLAY,
        palette=pimmslearn.plotting.defaults.assign_colors(
            list(k.upper() for k in SEL_MODELS))
    )
    # ax.set_ylim(0, 1.5)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    # for text in ax.legend().get_texts():
    #     text.set_fontsize(6)
    fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(ax.get_figure(), name=fname)
    plt.show(fig)

    dumps[fname.stem] = fname.with_suffix('.csv')
    errors_binned.to_csv(fname.with_suffix('.csv'))
    pimmslearn.plotting.make_large_descriptors(6)
    # ax.xaxis.set_tick_params(rotation=0) # horizontal

    # ! only used for reporting
    plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
        errors=errors_binned,
        feat_name=FEAT_NAME_DISPLAY,
        metric_name=METRIC
    )
    plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
    display(plotted)

Error binned by integer intensity value#

  • number of observations in parentheses.

Hide code cell source

fig, ax = plt.subplots(figsize=(8, 2))
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_binned(
    pred_test[
        [TARGET_COL] + TOP_N_ORDER
    ],
    ax=ax,
    palette=TOP_N_COLOR_PALETTE,
    metric_name=METRIC,
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
fname = args.out_figures / f'2_{group}_test_errors_binned_by_int.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:50: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  ax = sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf
[Figure: test-data MAE binned by integer intensity value for the top N models]

Hide code cell source

dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
errors_binned.head()
Sample ID protein groups model MAE intensity bin
0 Sample_143 P02768 BPCA 0.065 30\n(N=2)
1 Sample_143 P02768 VAE 0.326 30\n(N=2)
2 Sample_143 P02768 DAE 0.325 30\n(N=2)
3 Sample_143 P02768 TRKNN 0.574 30\n(N=2)
4 Sample_143 P02768 RF 0.437 30\n(N=2)
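
This is the same idea as the median-binned plot above, except each error is grouped by the integer value of the observed intensity itself rather than by the feature's training median; labels such as "30\n(N=2)" again carry the bin value and the number of observations. A toy sketch under that reading (illustration only):

import pandas as pd

observed = pd.Series([30.1, 29.8, 15.3, 15.9])
abs_err = pd.Series([0.07, 0.33, 0.20, 0.45])

binned = pd.DataFrame({"MAE": abs_err, "bin": observed.round().astype(int)})
print(binned.groupby("bin")["MAE"].agg(["mean", "count"]))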

Figures dumped to disk#

Hide code cell source

figures
{'2_1_fake_na_val_test_splits': Path('runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png'),
 '2_1_pred_corr_val_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf'),
 '2_1_errors_binned_by_feat_median_val': Path('runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf'),
 '2_1_intensity_binned_top_4_models_test': Path('runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf'),
 '2_1_pred_corr_test_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf'),
 '2_1_pred_corr_test_per_feat': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf'),
 '2_1_performance_test': Path('runs/alzheimer_study/figures/2_1_performance_test.pdf'),
 '2_1_test_errors_binned_by_feat_medians': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf'),
 '2_1_test_errors_binned_by_int': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf')}

Hide code cell source

dumps
print("done")
done