Compare models#

  1. Load available configurations

  2. Load validation predictions

    • calculate absolute error

    • select top N for plotting by MAE from smallest (best) to largest (worst) (top N as specified, default 5)

    • correlation per sample, correlation per feature, correlation overall

    • MAE plots

  3. Load test data predictions

    • as for validation data

    • top N based on validation data

Hide code cell source

import logging
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from IPython.display import display

import pimmslearn
import pimmslearn.imputation
import pimmslearn.models
import pimmslearn.nb
from pimmslearn.analyzers import compare_predictions
from pimmslearn.io import datasplits
from pimmslearn.models.collect_dumps import collect, select_content

# pandas display options used for the tables rendered in this notebook
pd.options.display.max_rows = 30
pd.options.display.min_rows = 10
pd.options.display.max_colwidth = 100

# default figure size (width, height) for all matplotlib figures created below
plt.rcParams.update({'figure.figsize': (4, 2)})
# NOTE(review): presumably sets the font sizes of plot descriptors to 7 - confirm
pimmslearn.plotting.make_large_descriptors(7)

logger = pimmslearn.logging.setup_nb_logger()
# only show messages of level WARNING and above from the 'fontTools' logger
logging.getLogger('fontTools').setLevel(logging.WARNING)


def load_config_file(fname: Path, first_split='config_') -> tuple[str, dict]:
    """Load a YAML configuration file and derive its key from the file name.

    Parameters
    ----------
    fname : Path
        Path to the YAML file to load.
    first_split : str, optional
        Passed on to `select_content` together with the file stem to
        derive the key, by default 'config_'.

    Returns
    -------
    tuple[str, dict]
        The key (string form of what `select_content` returns for the file
        stem) and the object parsed from the YAML file.
    """
    with open(fname) as f:
        loaded = yaml.safe_load(f)
    key = str(select_content(fname.stem, first_split=first_split))
    return key, loaded


def build_text(s):
    """Build a short annotation text from a model configuration.

    `s` is a mapping-like object with the entries "latent_dim" and
    "hidden_layers". The latent dimension is added as 'LD: <int> ' unless it
    is NaN; the hidden layers are added as 'HL: <a,b,...>' if they form a
    non-empty sized collection (a NaN entry is skipped).
    """
    parts = []
    latent_dim = s["latent_dim"]
    if not np.isnan(latent_dim):
        parts.append(f'LD: {int(latent_dim)} ')
    try:
        layers = s["hidden_layers"]
        if len(layers) > 0:
            parts.append("HL: " + ",".join(map(str, layers)))
    except TypeError:
        # entry has no length (e.g. NaN): nothing to add
        pass
    return ''.join(parts)

Hide code cell source

# catch passed parameters
# `args` is bound first so that the name itself is already part of the
# snapshot of global names taken on the next line. The snapshot is handed to
# pimmslearn.nb.get_params further below.
args = None
args = dict(globals()).keys()

Papermill script parameters:

# Default values of the notebook parameters. The assignments below the
# `# Parameters` marker rebind three of them for this run.
# files and folders
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
folder_data: str = ''  # specify data directory if needed
file_format: str = 'csv'  # change default to pickled files
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
models: str = 'Median,CF,DAE,VAE'  # picked models to compare (comma separated)
sel_models: str = ''  # user defined comparison (comma separated)
# Restrict plotting to top N methods for imputation based on error of validation data, maximum 10
plot_to_n: int = 5
feat_name_display: str = None  # display name for feature name in plural (e.g. 'protein groups')
save_agg_pred: bool = False  # save aggregated predictions of validation and test data
# Parameters
# (values for this run: override the defaults defined above)
fn_rawfile_metadata = "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv"
folder_experiment = "runs/alzheimer_study"
models = "Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO"

Some argument transformations

Hide code cell source

# Gather the parameter variables; the previously taken snapshot of global
# names and the current globals are passed in. Judging from the logged
# output, the parameters are removed from the global namespace and returned
# as a dict.
args = pimmslearn.nb.get_params(args, globals=globals())
args
root - INFO     Removed from global namespace: folder_experiment
root - INFO     Removed from global namespace: folder_data
root - INFO     Removed from global namespace: file_format
root - INFO     Removed from global namespace: fn_rawfile_metadata
root - INFO     Removed from global namespace: models
root - INFO     Removed from global namespace: sel_models
root - INFO     Removed from global namespace: plot_to_n
root - INFO     Removed from global namespace: feat_name_display
root - INFO     Removed from global namespace: save_agg_pred
{'folder_experiment': 'runs/alzheimer_study',
 'folder_data': '',
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'sel_models': '',
 'plot_to_n': 5,
 'feat_name_display': None,
 'save_agg_pred': False}

Hide code cell source

# Convert the parameter dict into the args object used below with attribute
# access (e.g. args.models). Judging from the displayed result, this also adds
# derived paths such as `data`, `out_figures` and `out_preds`.
args = pimmslearn.nb.args_from_dict(args)
args
{'data': Path('runs/alzheimer_study/data'),
 'feat_name_display': None,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'folder_data': '',
 'folder_experiment': Path('runs/alzheimer_study'),
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'out_figures': Path('runs/alzheimer_study/figures'),
 'out_folder': Path('runs/alzheimer_study'),
 'out_metrics': Path('runs/alzheimer_study'),
 'out_models': Path('runs/alzheimer_study'),
 'out_preds': Path('runs/alzheimer_study/preds'),
 'plot_to_n': 5,
 'save_agg_pred': False,
 'sel_models': ''}

Hide code cell source

# Registries of files written by this notebook: file stem -> file path
figures = {}  # saved figures
dumps = {}  # saved tables (e.g. Excel, csv)

Hide code cell source

# Notebook-wide constants derived from the parameters
TARGET_COL = 'observed'
METRIC = 'MAE'
MIN_FREQ = None
FEAT_NAME_DISPLAY = args.feat_name_display
# model keys as passed (comma separated string) and a working copy of them
MODELS_PASSED = args.models.split(',')
MODELS = list(MODELS_PASSED)
# optional user defined selection of models; None if nothing was specified
SEL_MODELS = args.sel_models.split(',') if args.sel_models else None

Hide code cell source

# Cap the number of models to plot at 10.
# NOTE(review): presumably because the default seaborn palette
# (list(sns.color_palette().as_hex())) provides ten colors - confirm
if args.plot_to_n > 10:
    logger.warning("Set maximum of models to 10 (maximum)")
    args.overwrite_entry('plot_to_n', 10)

Hide code cell source

# Load the data splits from the data folder of the experiment. According to
# the log output, train_X, val_y and test_y are read.
data = datasplits.DataSplits.from_folder(
    args.data, file_format=args.file_format)
pimmslearn.io.datasplits - INFO     Loaded 'train_X' from file: runs/alzheimer_study/data/train_X.csv
pimmslearn.io.datasplits - INFO     Loaded 'val_y' from file: runs/alzheimer_study/data/val_y.csv
pimmslearn.io.datasplits - INFO     Loaded 'test_y' from file: runs/alzheimer_study/data/test_y.csv

Hide code cell source

# Two panels sharing both axes: validation split (left), test split (right)
fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)

# the splits are unstacked before plotting; the panels get no own x label
pimmslearn.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],
                                     title='Validation split', size=1, xlabel='')
pimmslearn.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],
                                     title='Test split', size=1, xlabel='')
fig.suptitle("Simulated missing values per sample", size=8)
# hide axis and use only for common x label
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)
# the name of the last index level of val_y is used as feature name
plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')
# `group` is part of the file names of this and of later figures
group = 1
fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png
_images/93e0d29a12b03e8af2899dbb522653b6ae8fbb651be546a2927a1813a23e72b3.png

data completeness across entire data#

Hide code cell source

# load frequency of training features...
# needs to be pickle -> index.name needed
# NOTE(review): the comment above mentions pickle, but a JSON file is loaded
# here; the index name is set in a later cell from data.train_X.columns.name.
freq_feat = pimmslearn.io.datasplits.load_freq(args.data, file='freq_features.json')
freq_feat.head()  # training data
A0A024QZX5;A0A087X1N8;P35237                                                     197
A0A024R0T9;K7ER74;P02655                                                         208
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8   185
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                          208
A0A075B6H7                                                                        97
Name: freq, dtype: int64

Hide code cell source

# Feature frequency divided by the number of entries in the first index level
# of the training data (presumably the samples - confirm), plotted in
# ascending order.
prop = freq_feat / len(data.train_X.index.levels[0])
prop.sort_values().to_frame().plot(
    xlabel=f'{data.val_y.index.names[-1]}',
    ylabel='Proportion of identification in samples')
<Axes: xlabel='protein groups', ylabel='Proportion of identification in samples'>
_images/e0f465e3c6d5d36aa8b3e60537ee3f18584687e65343e7e1ce4e7aeb6c0e5f01.png

View training data in wide format

Hide code cell source

# The return value is not used: the splits held by `data` are converted to
# wide format (see the displayed table: rows are samples, columns features).
data.to_wide_format()
data.train_X
protein groups A0A024QZX5;A0A087X1N8;P35237 A0A024R0T9;K7ER74;P02655 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 A0A075B6H7 A0A075B6H9 A0A075B6I0 A0A075B6I1 A0A075B6I6 A0A075B6I9 ... Q9Y653;Q9Y653-2;Q9Y653-3 Q9Y696 Q9Y6C2 Q9Y6N6 Q9Y6N7;Q9Y6N7-2;Q9Y6N7-4 Q9Y6R7 Q9Y6X5 Q9Y6Y8;Q9Y6Y8-2 Q9Y6Y9 S4R3U6
Sample ID
Sample_000 15.912 16.852 15.570 16.481 17.301 20.246 16.764 17.584 16.988 20.054 ... 16.012 15.178 NaN 15.050 16.842 NaN NaN 19.563 NaN 12.805
Sample_001 NaN 16.874 15.519 16.387 NaN 19.941 18.786 17.144 NaN 19.067 ... 15.528 15.576 NaN 14.833 16.597 20.299 15.556 19.386 13.970 12.442
Sample_002 16.111 NaN 15.935 16.416 18.175 19.251 16.832 15.671 17.012 18.569 ... 15.229 14.728 13.757 15.118 17.440 19.598 15.735 20.447 12.636 12.505
Sample_003 16.107 17.032 15.802 16.979 15.963 19.628 17.852 18.877 14.182 18.985 ... 15.495 14.590 14.682 15.140 17.356 19.429 NaN 20.216 NaN 12.445
Sample_004 15.603 15.331 15.375 16.679 NaN 20.450 18.682 17.081 14.140 19.686 ... 14.757 NaN NaN 15.256 17.075 19.582 15.328 NaN 13.145 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 15.682 16.886 14.910 16.482 NaN 17.705 17.039 NaN 16.413 19.102 ... NaN 15.684 14.236 15.415 17.551 17.922 16.340 19.928 12.929 NaN
Sample_206 15.798 17.554 15.600 15.938 NaN 18.154 18.152 16.503 16.860 18.538 ... 15.422 16.106 NaN 15.345 17.084 18.708 NaN 19.433 NaN NaN
Sample_207 15.739 NaN 15.469 16.898 NaN 18.636 17.950 16.321 16.401 18.849 ... 15.808 16.098 14.403 15.715 NaN 18.725 16.138 19.599 13.637 11.174
Sample_208 15.477 16.779 14.995 16.132 NaN 14.908 NaN NaN 16.119 18.368 ... 15.157 16.712 NaN 14.640 16.533 19.411 15.807 19.545 NaN NaN
Sample_209 NaN 17.261 15.175 16.235 NaN 17.893 17.744 16.371 15.780 18.806 ... 15.237 15.652 15.211 14.205 16.749 19.275 15.732 19.577 11.042 11.791

210 rows × 1421 columns

Number of samples and features:

Hide code cell source

# rows are samples, columns are features (wide format)
N_SAMPLES, M_FEAT = data.train_X.shape
print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}")
N samples: 210, M features: 1421

Collect outputs in excel file:

Hide code cell source

# Excel file collecting the summary tables written in the cells below
fname = args.folder_experiment / '01_2_performance_summary.xlsx'
dumps[fname.stem] = fname
# NOTE(review): the writer is not closed in this part of the notebook - make
# sure writer.close() is called once all sheets have been written.
writer = pd.ExcelWriter(fname)
print(f"Saving to: {fname}")
Saving to: runs/alzheimer_study/01_2_performance_summary.xlsx

Model specifications#

  • used for bar plot annotations

Hide code cell source

# model_key could be used as key from config file
# ? load only specified configs?
# ? case: no config file available?
# Collect all YAML files in the model output folder whose name contains
# 'model_config'; each one is loaded with load_config_file.
all_configs = collect(
    paths=(fname for fname in args.out_models.iterdir()
           if fname.suffix == '.yaml'
           and 'model_config' in fname.name),
    load_fn=load_config_file
)
# one row per configuration, indexed by the 'id' column; written and displayed
# transposed (one column per model)
model_configs = pd.DataFrame(all_configs).set_index('id')
model_configs.T.to_excel(writer, sheet_name='model_params')
model_configs.T
id VAE KNN5 DAE KNN Median CF
M 1421 1421 1421 1421 1421 1421
batch_size 64.000 64.000 64.000 64.000 NaN 1,024.000
cuda False True False True NaN False
data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data
epoch_trained 174.000 NaN 133.000 NaN NaN 17.000
epochs_max 300.000 50.000 300.000 50.000 NaN 100.000
file_format csv csv csv csv csv csv
fn_rawfile_metadata https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv
folder_data NaN
folder_experiment runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
hidden_layers [64] NaN [64] NaN NaN NaN
latent_dim 10.000 NaN 10.000 NaN NaN 50.000
meta_cat_col NaN NaN NaN NaN NaN NaN
meta_date_col NaN NaN NaN NaN NaN NaN
model VAE KNN DAE KNN Median CF
model_key VAE KNN5 DAE KNN Median CF
n_params 277998 1 184983 1 1421 83283
out_figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures
out_folder runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_metrics runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_models runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds
patience 50.000 NaN 25.000 NaN NaN 1.000
sample_idx_position 0 0 0 0 0 0
save_pred_real_na True True True True True True
force_train NaN True NaN True NaN NaN
neighbors NaN 5.000 NaN 3.000 NaN NaN
pred_test_Median NaN NaN NaN NaN runs/alzheimer_study/preds/pred_test_Median.csv NaN
pred_val_Median NaN NaN NaN NaN runs/alzheimer_study/preds/pred_val_Median.csv NaN

Set Feature name (columns are features, rows are samples)

Hide code cell source

# feature index name: taken from the columns of the wide training data
freq_feat.index.name = data.train_X.columns.name
# sample index name: taken from the row index of the wide training data
sample_index_name = data.train_X.index.name

Load predictions on validation and test data split#

Validation data#

  • set top N models to plot based on validation data split

Hide code cell source

# Load the predictions on the validation split for all passed model keys,
# together with the shared column of observed values (TARGET_COL).
pred_val = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='val',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL])
# the index has two levels: sample identifier and feature name
SAMPLE_ID, FEAT_NAME = pred_val.index.names
# fall back to the index level name if no display name was specified
if not FEAT_NAME_DISPLAY:
    FEAT_NAME_DISPLAY = FEAT_NAME
pred_val[MODELS]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 15.752 15.653 15.686 15.667 15.427 15.449 15.469 16.800 NaN 58.276 ... 7.068 10.929 2,513.638 11.373 14.631 15.597 15.752 17.206 15.700 0
Sample_050 Q9Y287 17.221 16.477 16.760 16.818 17.776 17.314 16.453 17.288 NaN 16.993 ... 7.068 12.593 19.829 12.117 16.151 16.846 17.221 17.807 16.738 0
Sample_107 Q8N475;Q8N475-2 14.846 14.060 14.591 14.404 14.150 14.355 13.110 17.187 NaN -78.084 ... 7.068 11.907 2,582.130 12.354 13.316 14.311 14.846 17.434 13.776 0
Sample_199 P06307 18.973 19.088 18.837 19.018 19.247 19.385 19.639 16.711 NaN 102.283 ... 7.068 12.106 2,483.120 12.446 17.005 18.988 18.973 17.111 19.015 0
Sample_067 Q5VUB5 14.726 14.916 15.094 15.004 15.232 15.040 15.465 16.743 NaN -36.470 ... 7.068 12.167 2,569.564 13.566 12.305 15.002 14.726 17.031 14.699 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.918 22.951 22.872 22.818 22.884 22.899 22.994 17.042 NaN 104.484 ... 7.068 12.722 2,634.108 14.010 22.019 22.851 22.918 17.330 22.872 0
Sample_002 A0A0A0MT36 15.877 16.263 16.226 15.892 16.857 16.142 15.882 16.792 NaN -18.408 ... 7.068 12.925 2,448.503 12.662 13.329 15.823 15.877 16.879 15.671 0
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 16.278 15.158 15.621 15.830 15.840 15.574 15.406 17.032 NaN -27.128 ... 7.068 12.760 2,487.550 13.417 14.969 15.851 16.278 17.215 15.574 0
Sample_182 Q8NFT8 13.995 14.092 13.961 13.239 13.685 13.480 14.322 16.764 NaN -12.434 ... 7.068 13.018 2,426.191 11.854 12.500 13.996 13.995 17.125 14.518 0
Sample_123 Q16853;Q16853-2 14.849 14.647 14.513 14.487 14.612 14.627 14.582 16.686 NaN 78.799 ... 7.068 13.310 2,461.806 13.023 13.923 14.757 14.849 16.981 14.485 0

12600 rows × 24 columns

Compute the errors (prediction minus observed value); their absolute values are described further below

Hide code cell source

# Signed error per sample and feature: model prediction minus observed value,
# with the columns in the order given by MODELS.
errors_val = (pred_val
              .drop(TARGET_COL, axis=1)
              .sub(pred_val[TARGET_COL], axis=0)
              [MODELS])
errors_val  # over all samples and all features
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 1.122 1.023 1.055 1.037 0.797 0.819 0.839 2.169 NaN 43.645 ... -7.562 -3.701 2,499.008 -3.257 0.001 0.966 1.122 2.575 1.070 -14.630
Sample_050 Q9Y287 1.466 0.722 1.005 1.063 2.021 1.559 0.698 1.533 NaN 1.238 ... -8.687 -3.162 4.074 -3.638 0.396 1.091 1.466 2.052 0.983 -15.755
Sample_107 Q8N475;Q8N475-2 -0.183 -0.969 -0.438 -0.626 -0.880 -0.674 -1.919 2.157 NaN -93.113 ... -7.961 -3.123 2,567.100 -2.676 -1.714 -0.719 -0.183 2.405 -1.253 -15.029
Sample_199 P06307 -0.403 -0.288 -0.539 -0.358 -0.129 0.009 0.263 -2.665 NaN 82.907 ... -12.308 -7.270 2,463.744 -6.930 -2.371 -0.388 -0.403 -2.265 -0.360 -19.376
Sample_067 Q5VUB5 -0.583 -0.393 -0.214 -0.305 -0.077 -0.269 0.156 1.434 NaN -51.779 ... -8.241 -3.142 2,554.255 -1.743 -3.004 -0.307 -0.583 1.723 -0.610 -15.309
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 0.096 0.128 0.050 -0.005 0.062 0.077 0.171 -5.781 NaN 81.662 ... -15.754 -10.101 2,611.285 -8.813 -0.804 0.028 0.096 -5.493 0.050 -22.822
Sample_002 A0A0A0MT36 -2.288 -1.902 -1.939 -2.273 -1.308 -2.023 -2.283 -1.373 NaN -36.573 ... -11.097 -5.240 2,430.338 -5.503 -4.836 -2.342 -2.288 -1.286 -2.494 -18.165
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 0.753 -0.367 0.096 0.305 0.314 0.049 -0.120 1.507 NaN -42.653 ... -8.457 -2.766 2,472.025 -2.109 -0.556 0.325 0.753 1.690 0.049 -15.525
Sample_182 Q8NFT8 -0.383 -0.287 -0.417 -1.139 -0.694 -0.899 -0.057 2.385 NaN -26.813 ... -7.311 -1.361 2,411.812 -2.525 -1.879 -0.382 -0.383 2.746 0.139 -14.379
Sample_123 Q16853;Q16853-2 0.345 0.142 0.009 -0.017 0.108 0.123 0.077 2.181 NaN 64.295 ... -7.436 -1.194 2,447.302 -1.481 -0.581 0.253 0.345 2.477 -0.019 -14.504

12600 rows × 24 columns

Select top N for plotting and set colors#

Hide code cell source

# Model keys ordered by mean absolute error on the validation data, from
# smallest to largest. As the displayed result shows, a model without any
# prediction (mean is NaN) ends up last.
ORDER_MODELS = (errors_val
                .abs()
                .mean()
                .sort_values()
                .index
                .to_list())
ORDER_MODELS
['BPCA',
 'DAE',
 'VAE',
 'TRKNN',
 'RF',
 'CF',
 'KNN5',
 'KNN',
 'KNN_IMPUTE',
 'IRM',
 'ROWMEDIAN',
 'Median',
 'LLS',
 'QRILC',
 'COLMEDIAN',
 'SVDMETHOD',
 'PI',
 'MINDET',
 'MINPROB',
 'MINIMUM',
 'ZERO',
 'IMPSEQROB',
 'MLE',
 'IMPSEQ']

Hide code cell source

# Reorder columns: observed values first, then the models by increasing MAE
pred_val = pred_val[[TARGET_COL] + ORDER_MODELS]
# optionally dump the aggregated validation predictions to a csv file
if args.save_agg_pred:
    fname = args.folder_experiment / '01_2_agg_pred_val.csv'
    dumps[fname.stem] = fname
    pred_val.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_val
observed BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 14.630 15.469 15.686 15.667 15.700 15.597 15.653 15.449 15.427 15.937 ... 16.800 17.206 11.373 11.916 10.929 7.068 0 58.276 2,513.638 NaN
Sample_050 Q9Y287 15.755 16.453 16.760 16.818 16.738 16.846 16.477 17.314 17.776 16.961 ... 17.288 17.807 12.117 12.900 12.593 7.068 0 16.993 19.829 NaN
Sample_107 Q8N475;Q8N475-2 15.029 13.110 14.591 14.404 13.776 14.311 14.060 14.355 14.150 15.437 ... 17.187 17.434 12.354 12.313 11.907 7.068 0 -78.084 2,582.130 NaN
Sample_199 P06307 19.376 19.639 18.837 19.018 19.015 18.988 19.088 19.385 19.247 18.861 ... 16.711 17.111 12.446 12.285 12.106 7.068 0 102.283 2,483.120 NaN
Sample_067 Q5VUB5 15.309 15.465 15.094 15.004 14.699 15.002 14.916 15.040 15.232 15.079 ... 16.743 17.031 13.566 11.827 12.167 7.068 0 -36.470 2,569.564 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.822 22.994 22.872 22.818 22.872 22.851 22.951 22.899 22.884 22.837 ... 17.042 17.330 14.010 12.161 12.722 7.068 0 104.484 2,634.108 NaN
Sample_002 A0A0A0MT36 18.165 15.882 16.226 15.892 15.671 15.823 16.263 16.142 16.857 15.446 ... 16.792 16.879 12.662 12.586 12.925 7.068 0 -18.408 2,448.503 NaN
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 15.525 15.406 15.621 15.830 15.574 15.851 15.158 15.574 15.840 15.995 ... 17.032 17.215 13.417 12.352 12.760 7.068 0 -27.128 2,487.550 NaN
Sample_182 Q8NFT8 14.379 14.322 13.961 13.239 14.518 13.996 14.092 13.480 13.685 14.675 ... 16.764 17.125 11.854 12.504 13.018 7.068 0 -12.434 2,426.191 NaN
Sample_123 Q16853;Q16853-2 14.504 14.582 14.513 14.487 14.485 14.757 14.647 14.627 14.612 14.824 ... 16.686 16.981 13.023 12.689 13.310 7.068 0 78.799 2,461.806 NaN

12600 rows × 25 columns

Hide code cell source

# Summary statistics of the absolute errors per model, columns ordered by MAE;
# the `mean` row is the MAE over all validation entries.
mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS]
mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f')
mae_stats_ordered_val.T
count mean std min 25% 50% 75% max
BPCA 12,600.000 0.422 0.501 0.000 0.119 0.269 0.534 9.370
DAE 12,600.000 0.433 0.520 0.000 0.120 0.275 0.546 9.724
VAE 12,600.000 0.434 0.522 0.000 0.120 0.277 0.547 9.466
TRKNN 12,600.000 0.450 0.516 0.000 0.132 0.295 0.569 7.975
RF 12,600.000 0.461 0.531 0.000 0.136 0.306 0.582 8.743
CF 12,600.000 0.465 0.506 0.000 0.144 0.317 0.606 6.973
KNN5 12,600.000 0.467 0.546 0.000 0.135 0.305 0.594 10.231
KNN 12,600.000 0.481 0.565 0.000 0.138 0.310 0.618 10.502
KNN_IMPUTE 12,600.000 0.554 0.668 0.000 0.164 0.359 0.692 7.550
IRM 12,600.000 0.588 0.637 0.000 0.176 0.396 0.767 7.953
ROWMEDIAN 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
Median 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
LLS 12,600.000 1.329 54.974 0.000 0.151 0.343 0.662 4,842.571
QRILC 12,600.000 1.651 1.283 0.000 0.840 1.360 2.097 17.300
COLMEDIAN 12,600.000 2.210 1.634 0.000 0.947 1.972 3.094 12.944
SVDMETHOD 12,600.000 2.309 1.635 0.000 1.027 2.091 3.251 12.624
PI 12,600.000 3.808 2.640 0.000 1.764 3.331 5.368 18.188
MINDET 12,600.000 4.108 2.650 0.001 2.089 3.678 5.665 17.920
MINPROB 12,600.000 4.136 2.691 0.000 2.112 3.696 5.728 18.690
MINIMUM 12,600.000 9.272 2.717 0.373 7.327 8.890 10.863 22.773
ZERO 12,600.000 16.340 2.717 6.695 14.395 15.958 17.931 29.841
IMPSEQROB 12,600.000 333.478 793.700 0.002 12.282 33.864 87.298 2,869.299
MLE 12,600.000 2,172.384 865.925 0.009 2,435.415 2,495.362 2,552.718 2,873.681
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Some models have fixed colors; others are assigned randomly.

Note

  1. The order of “new” models is important for the color assignment.

  2. User-defined model keys for the same model with two configurations will yield different colors.

Hide code cell source

# Colors are assigned based on the upper-cased model keys, in the order of
# ORDER_MODELS; the visualizer is displayed with the original keys.
COLORS_TO_USE = pimmslearn.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS))
pimmslearn.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE)
pimmslearn.plotting.defaults - INFO     Reused some colors!
BPCADAEVAETRKNNRFCFKNN5KNNKNN_IMPUTEIRMROWMEDIANMedianLLSQRILCCOLMEDIANSVDMETHODPIMINDETMINPROBMINIMUMZEROIMPSEQROBMLEIMPSEQ

Hide code cell source

# Best N models on the validation data, each paired by position with its color
TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n]
TOP_N_COLOR_PALETTE = dict(zip(TOP_N_ORDER, COLORS_TO_USE))
TOP_N_ORDER
['BPCA', 'DAE', 'VAE', 'TRKNN', 'RF']

Correlation per sample#

Hide code cell source

# Correlation between the observed values and each model's predictions,
# computed separately for each sample.
corr_per_sample_val = (pred_val
                       .groupby(sample_index_name)
                       .apply(
                           lambda df: df.corr().loc[TARGET_COL]
                       )[ORDER_MODELS])

# Lower y-limit: smallest correlation over all models, rounded down to one
# decimal. np.floor is used as int() truncates toward zero, which would round
# a negative minimum up and place the limit above it.
min_corr = np.floor(corr_per_sample_val.min().min() * 10) / 10
kwargs = dict(ylim=(min_corr, 1), rot=90,
              flierprops=dict(markersize=3),
              ylabel='correlation per sample')
# box plot for the top N models only
ax = corr_per_sample_val[TOP_N_ORDER].plot.box(**kwargs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

# Dump the summary statistics, all correlations and the plotted subset to an
# Excel file of its own (closed by the context manager).
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.xlsx'
dumps[fname.stem] = fname
with pd.ExcelWriter(fname) as w:
    corr_per_sample_val.describe().to_excel(w, sheet_name='summary')
    corr_per_sample_val.to_excel(w, sheet_name='correlations')
    corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf
_images/3182a84b5ec971475e373585e3a1486a9d9c72414e3e9df9a68fb2b2602f3419.png

Identify samples whose correlation for at least one of the top N models is below the smallest lower whisker of these models

Hide code cell source

# Threshold: smallest lower whisker among the top N models.
# NOTE(review): `treshold` is a typo of `threshold`; the name is kept as it
# may be used in later cells.
treshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_val[TOP_N_ORDER]).min()
# samples for which any of the top N models falls below the threshold
mask = (corr_per_sample_val[TOP_N_ORDER] < treshold).any(axis=1)
# show these samples for all models, highlighting the minimum of each row
corr_per_sample_val.loc[mask].style.highlight_min(
    axis=1) if mask.sum() else 'Nothing to display'
observed BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID                                                
Sample_010 0.948707 0.944329 0.930396 0.944645 0.923494 0.948871 0.940594 0.946115 0.932235 0.932518 0.869102 0.869102 0.913668 0.840016 nan 0.065802 0.010311 nan 0.039081 nan nan 0.589151 nan nan
Sample_018 0.968582 0.920419 0.935811 0.953573 0.936248 0.958661 0.925531 0.938822 0.939383 0.952858 0.908717 0.908717 0.936909 0.860301 nan 0.161907 0.105573 nan -0.239214 nan nan 0.375658 nan nan
Sample_054 0.932254 0.915454 0.931375 0.910271 0.928487 0.904603 0.936915 0.925876 0.905679 0.913765 0.915748 0.915748 0.929264 0.810828 nan 0.190649 -0.143750 nan -0.007705 nan nan 0.836204 nan nan
Sample_063 0.970046 0.938321 0.925406 0.961257 0.953568 0.953186 0.941369 0.931853 0.928428 0.926100 0.929356 0.929356 0.947981 0.918440 nan 0.294161 0.219797 nan 0.251749 nan nan 0.430496 nan nan
Sample_071 0.887866 0.904644 0.907861 0.888162 0.896374 0.894716 0.901240 0.895286 0.880453 0.865003 0.885806 0.885806 0.899799 0.823719 nan 0.178303 0.189551 nan -0.080796 nan nan 0.364226 nan nan
Sample_073 0.930349 0.921960 0.911967 0.919876 0.912943 0.934973 0.933555 0.950641 0.916774 0.901773 0.900178 0.900178 0.909057 0.901154 nan -0.017963 -0.050031 nan -0.013284 nan nan 0.356937 nan nan
Sample_095 0.940942 0.919449 0.916004 0.927289 0.913928 0.925577 0.924950 0.930902 0.909714 0.913905 0.878167 0.878167 0.917350 0.754702 nan -0.120269 0.174790 nan -0.100521 nan nan 0.419195 nan nan
Sample_133 0.919483 0.934633 0.927364 0.928251 0.921791 0.935246 0.903483 0.903370 0.885348 0.878925 0.899233 0.899233 0.881238 0.842334 nan 0.219841 -0.038762 nan -0.008116 nan nan 0.409126 nan nan
Sample_139 0.927681 0.938710 0.934712 0.957367 0.921729 0.934218 0.912868 0.901552 0.878475 0.891290 0.907333 0.907333 0.928867 0.746243 nan 0.156894 -0.020579 nan 0.070321 nan nan 0.554137 nan nan
Sample_150 0.950334 0.904473 0.898225 0.945063 0.902496 0.957295 0.885565 0.868275 0.930981 0.907850 0.892997 0.892997 0.940619 0.786811 nan 0.166841 0.064295 nan 0.099713 nan nan 0.335988 nan nan
Sample_171 0.924707 0.905284 0.907095 0.916959 0.878049 0.922139 0.902581 0.906699 0.884571 0.881019 0.875433 0.875433 -0.090619 0.890794 nan -0.004823 0.127037 nan 0.023533 nan nan 0.302006 nan nan
Sample_173 0.916627 0.952568 0.953927 0.932711 0.968162 0.941034 0.939783 0.940274 0.918589 0.916299 0.925428 0.925428 0.926916 0.976661 nan 0.059663 0.155837 nan 0.042770 nan nan 0.334436 nan nan
Sample_174 0.970316 0.867911 0.875624 0.967356 0.888588 0.960599 0.854645 0.846532 0.920737 0.920759 0.887409 0.887409 0.972096 0.737074 nan 0.306279 0.025125 nan 0.030095 nan nan 0.357612 nan nan
Sample_198 0.914339 0.950391 0.941888 0.932612 0.944412 0.932354 0.955742 0.947627 0.936142 0.946119 0.956493 0.956493 0.924497 0.948712 nan 0.097862 -0.080979 nan -0.121697 nan nan 0.481999 nan nan

Error plot#

Hide code cell source

# Entries (sample, feature) for which at least one model has an absolute
# error above c_error_min; the first rows sorted by the second index level
# (the feature) are displayed.
c_error_min = 4.5
mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1)
errors_val.loc[mask].sort_index(level=1).head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_012 A0A024QZX5;A0A087X1N8;P35237 -0.324 -0.612 -0.414 -0.358 -0.246 -0.416 -0.140 0.856 NaN -53.608 ... -8.881 -3.052 0.632 -4.155 -1.651 -0.335 -0.324 1.241 -0.318 -15.949
Sample_017 A0A024QZX5;A0A087X1N8;P35237 0.347 0.483 0.386 0.268 -0.093 -0.022 0.251 1.658 NaN 26.724 ... -8.211 -2.425 1.228 -2.753 -0.554 0.368 0.347 2.214 0.305 -15.279
Sample_050 A0A024QZX5;A0A087X1N8;P35237 0.544 0.144 0.121 0.122 0.024 -0.102 0.178 2.207 NaN 0.348 ... -8.013 -2.354 3.294 -1.380 -0.584 0.179 0.544 2.691 0.238 -15.081
Sample_102 A0A024QZX5;A0A087X1N8;P35237 -0.029 -0.105 -0.123 -0.067 0.030 0.067 -0.107 0.942 NaN 19.277 ... -8.586 -3.950 1.609 -2.595 -1.070 -0.061 -0.029 1.168 -0.065 -15.654
Sample_109 A0A024QZX5;A0A087X1N8;P35237 0.343 -0.189 -0.038 0.030 -0.179 -0.004 -0.263 1.518 NaN -28.795 ... -8.215 -4.636 -2.077 -2.671 -0.739 0.057 0.343 1.968 -0.012 -15.283

5 rows × 24 columns

Hide code cell source

# From here on `errors_val` is rebound: it no longer holds the signed error
# per entry, but the mean absolute error per feature.
errors_val = errors_val.abs().groupby(
    freq_feat.index.name).mean()  # absolute error
# add the feature frequency as a column and sort by it, rarest features first
errors_val = errors_val.join(freq_feat)
errors_val = errors_val.sort_values(by=freq_feat.name, ascending=True)
errors_val.head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
Q9Y281;Q9Y281-3 0.415 0.298 0.265 0.341 0.325 0.285 0.266 4.078 NaN 10.519 ... 0.921 2,473.194 0.699 1.204 0.346 0.415 4.472 0.307 12.573 52
K7EPJ5;O60291;O60291-2;O60291-3;O60291-4 0.331 0.325 0.363 0.339 0.281 0.385 0.387 3.029 NaN 13.344 ... 1.283 2,525.930 0.862 1.415 0.295 0.331 3.452 0.257 13.857 52
B1AJQ6;Q86Y82 1.082 0.366 0.485 0.738 0.482 1.226 0.674 3.367 NaN 5.026 ... 2.147 2,441.128 0.193 7.594 0.967 1.082 3.926 0.900 13.380 52
P69892 0.872 1.613 1.377 1.396 1.734 1.286 1.360 1.980 NaN 18.023 ... 3.022 2,532.051 2.542 5.889 0.933 0.872 2.320 0.966 14.768 53
A2RU67 0.689 0.845 0.520 0.480 0.503 0.462 0.539 4.495 NaN 15.116 ... 1.120 1,998.072 0.883 1.345 0.544 0.689 4.870 0.462 12.437 53

5 rows × 25 columns

Hide code cell source

# summary over features of the per-feature mean absolute errors
errors_val.describe()[ORDER_MODELS].T  # mean of means
count mean std min 25% 50% 75% max
BPCA 1,419.000 0.408 0.306 0.017 0.222 0.320 0.494 4.195
DAE 1,419.000 0.420 0.325 0.007 0.225 0.327 0.497 3.474
VAE 1,419.000 0.422 0.316 0.012 0.233 0.332 0.498 3.231
TRKNN 1,419.000 0.437 0.309 0.000 0.241 0.349 0.526 3.647
RF 1,419.000 0.449 0.322 0.061 0.255 0.358 0.529 3.394
CF 1,419.000 0.452 0.304 0.018 0.260 0.375 0.543 3.264
KNN5 1,419.000 0.455 0.322 0.039 0.256 0.369 0.540 3.634
KNN 1,419.000 0.468 0.333 0.012 0.267 0.375 0.549 3.693
KNN_IMPUTE 1,419.000 0.531 0.378 0.063 0.296 0.424 0.636 3.430
IRM 1,419.000 0.555 0.372 0.030 0.311 0.449 0.674 3.476
ROWMEDIAN 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
Median 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
LLS 1,419.000 1.088 19.029 0.023 0.279 0.408 0.596 706.018
QRILC 1,419.000 1.624 0.907 0.178 1.039 1.375 1.901 7.734
COLMEDIAN 1,419.000 2.071 1.509 0.038 0.916 1.738 2.812 12.631
SVDMETHOD 1,419.000 2.136 1.467 0.149 0.976 1.893 2.905 12.211
PI 1,419.000 4.119 2.443 0.193 2.269 3.640 5.528 17.251
MINDET 1,419.000 4.438 2.493 0.374 2.622 4.032 5.828 17.100
MINPROB 1,419.000 4.454 2.488 0.486 2.601 4.023 5.828 16.963
MINIMUM 1,419.000 9.620 2.542 3.842 7.854 9.231 11.051 22.371
ZERO 1,419.000 16.688 2.542 10.910 14.922 16.299 18.119 29.439
IMPSEQROB 1,419.000 443.657 892.834 0.830 23.477 43.842 100.325 2,633.136
MLE 1,419.000 2,171.007 331.079 1.453 1,992.846 2,214.845 2,487.619 2,683.431
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Hide code cell source

# Show the rows where at least one of the top N models reaches the cut-off.
c_avg_error = 2
mask = errors_val[TOP_N_ORDER].ge(c_avg_error).any(axis=1)
errors_val[mask]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
O60512 2.233 1.968 1.951 1.974 2.293 2.209 1.989 5.769 NaN 8.471 ... 1.705 2,128.612 2.098 2.841 2.095 2.233 6.098 2.559 10.910 58
P18206;P18206-2 2.427 1.806 1.420 1.946 1.744 1.637 1.297 3.821 NaN 7.976 ... 1.127 2,518.839 1.673 1.787 2.240 2.427 4.085 1.581 12.898 97
Q99538 2.502 2.476 2.507 2.257 2.711 2.517 2.464 2.615 NaN 8.517 ... 2.513 1,769.534 2.529 2.536 2.524 2.502 2.767 2.399 14.984 107
P02100 2.192 1.653 2.597 2.554 2.283 2.509 1.033 1.996 NaN 14.829 ... 4.528 2,512.438 3.704 6.539 2.458 2.192 2.106 1.856 16.373 127
A0A0G2JRN3 3.053 3.264 3.474 3.231 3.693 3.634 4.195 3.998 NaN 71.992 ... 7.070 1,986.331 6.858 5.751 3.394 3.053 3.976 3.647 19.496 128
P01817 2.254 2.132 2.139 2.052 1.963 2.117 2.385 2.736 NaN 10.059 ... 2.655 2,369.250 2.608 3.227 2.041 2.254 3.104 2.039 14.053 133
Q15375;Q15375-4 4.171 1.615 1.719 1.721 1.608 1.331 1.981 3.754 NaN 16.223 ... 6.563 2,285.221 6.683 4.574 1.887 4.171 3.566 2.065 19.101 163
P68871 2.331 1.251 2.175 2.226 1.616 1.638 0.571 1.720 NaN 23.608 ... 4.173 2,237.073 3.966 3.754 2.104 2.331 2.014 0.854 16.378 168
P69905 2.793 1.575 2.792 2.901 2.936 2.820 1.032 2.807 NaN 94.049 ... 5.990 1,992.771 5.565 6.164 2.982 2.793 2.626 1.016 18.200 190
P35527 2.216 1.218 2.497 2.316 2.064 2.156 1.295 2.273 NaN 96.343 ... 5.009 2,335.097 4.125 4.692 2.146 2.216 2.403 1.169 17.045 195
P15509;P15509-2;P15509-3;P15509-5;P15509-7;P15509-8 2.252 1.594 2.233 2.003 1.218 1.374 1.336 3.397 NaN 48.350 ... 5.653 1,276.662 5.963 4.135 1.840 2.252 3.146 2.437 18.354 201

11 rows × 25 columns

Error by integer part of the feature's median intensity#

  • number of observations in parentheses.

Hide code cell source

# Validation errors of the top N models, binned by the per-feature median
# computed on the training data.
fig, ax = plt.subplots(figsize=(8, 3))
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred_val[[TARGET_COL, *TOP_N_ORDER]],
    feat_medians=data.train_X.median(),
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    palette=TOP_N_COLOR_PALETTE,
    metric_name=METRIC,
)
ax.set_ylabel(f"Average error ({METRIC})")
# one legend column per plotted model
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
fname = args.out_figures / f'2_{group}_errors_binned_by_feat_median_val.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:99: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(data=errors,
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf
_images/1b0a3150f579a4cc7e343b1c0ea0371a9efbdb5b153ad64223a62f71daa0309a.png

Hide code cell source

# ! only used for reporting
# Collect the aggregated values behind the binned error plot.
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC
)
# stored next to the figure: same file stem, Excel suffix, index not written
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=94) BPCA 0.715 0.601 0.849
1 11\n(N=94) DAE 0.687 0.567 0.813
2 11\n(N=94) RF 0.614 0.504 0.728
3 11\n(N=94) TRKNN 0.655 0.545 0.784
4 11\n(N=94) VAE 0.591 0.479 0.721
... ... ... ... ... ...
85 29\n(N=5) BPCA 0.175 0.062 0.288
86 29\n(N=5) DAE 0.172 0.092 0.265
87 29\n(N=5) RF 0.188 0.123 0.259
88 29\n(N=5) TRKNN 0.193 0.128 0.257
89 29\n(N=5) VAE 0.159 0.069 0.263

90 rows × 5 columns

Hide code cell source

# Dump the binned errors next to the figure (same file stem, '.csv' suffix)
# and register the file in `dumps`. Only the last expression of a cell is
# displayed, so a single `head()` at the end is sufficient.
dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
errors_binned.head()
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_158 Q9UN70;Q9UN70-2 BPCA 0.839 15 2,398 15\n(N=2,398)
1 Sample_158 Q9UN70;Q9UN70-2 DAE 1.055 15 2,398 15\n(N=2,398)
2 Sample_158 Q9UN70;Q9UN70-2 VAE 1.037 15 2,398 15\n(N=2,398)
3 Sample_158 Q9UN70;Q9UN70-2 TRKNN 1.070 15 2,398 15\n(N=2,398)
4 Sample_158 Q9UN70;Q9UN70-2 RF 0.966 15 2,398 15\n(N=2,398)

test data#

Hide code cell source

# Load the test split predictions of all passed models, keep the target and
# the model columns (in ORDER_MODELS order) and attach `freq_feat`.
pred_test = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='test',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL],
)
pred_test = pred_test[[TARGET_COL, *ORDER_MODELS]].join(
    freq_feat, on=freq_feat.index.name)
if args.save_agg_pred:
    # optional dump of the aggregated predictions into the experiment folder
    fname = args.folder_experiment / '01_2_agg_pred_test.csv'
    dumps[fname.stem] = fname
    pred_test.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_test
observed BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_000 A0A075B6P5;P01615 17.016 17.483 17.051 17.275 17.438 17.801 17.612 17.207 17.190 18.269 ... 17.496 14.415 12.970 12.377 7.068 0 229.376 2,505.226 NaN 210
A0A087X089;Q16627;Q16627-2 18.280 17.769 17.982 17.850 17.930 17.891 17.963 18.146 18.293 17.797 ... 17.695 13.103 12.970 12.358 7.068 0 -20.319 2,505.226 NaN 210
A0A0B4J2B5;S4R460 21.735 22.459 22.380 22.300 22.397 22.388 22.574 21.959 21.835 22.205 ... 17.493 13.677 12.970 12.510 7.068 0 -10.898 2,505.226 NaN 210
A0A140T971;O95865;Q5SRR8;Q5SSV3 14.603 15.285 14.978 15.302 15.399 15.196 15.219 15.143 15.172 15.557 ... 17.087 13.618 12.970 13.105 7.068 0 -2.819 2,505.226 NaN 145
A0A140TA33;A0A140TA41;A0A140TA52;P22105;P22105-3;P22105-4 16.143 16.583 16.577 16.674 16.775 16.611 16.540 16.743 16.625 16.646 ... 17.508 12.280 12.970 13.037 7.068 0 -42.837 2,505.226 NaN 210
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_209 Q96ID5 16.074 15.866 15.935 16.083 16.122 16.001 16.190 15.981 15.909 15.925 ... 17.133 14.434 12.435 12.341 7.068 0 20.373 17.260 NaN 194
Q9H492;Q9H492-2 13.173 13.249 13.489 13.592 13.273 13.301 13.659 13.432 13.669 13.594 ... 17.109 12.727 12.435 11.744 7.068 0 14.713 19.076 NaN 111
Q9HC57 14.207 13.756 13.757 14.310 14.589 14.740 14.216 14.131 13.962 14.391 ... 17.157 12.494 12.435 11.253 7.068 0 21.445 19.649 NaN 128
Q9NPH3;Q9NPH3-2;Q9NPH3-5 14.962 15.096 15.254 14.952 15.099 14.995 15.142 15.123 15.094 15.117 ... 17.257 12.096 12.435 12.668 7.068 0 35.578 16.125 NaN 199
Q9UGM5;Q9UGM5-2 16.871 16.395 16.364 16.360 16.429 16.590 16.523 16.378 16.255 17.054 ... 17.133 13.199 12.435 12.794 7.068 0 82.601 13.608 NaN 209

12600 rows × 26 columns

Write summary statistics of the errors (including the averages) for all models to Excel, using the writer opened earlier in the notebook.

Hide code cell source

# Absolute errors on the test predictions, summarised per model
# (columns in ORDER_MODELS order).
errors_test_mae = pimmslearn.pandas.calc_errors.get_absolute_error(
    pred_test
)
mae_stats_ordered_test = errors_test_mae.describe()[ORDER_MODELS]
mae_stats_ordered_test
BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
count 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 ... 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 0.000
mean 0.432 0.438 0.439 0.458 0.465 0.468 0.469 0.482 0.558 0.587 ... 2.223 2.330 3.804 4.109 4.127 9.271 16.339 334.546 2,186.302 NaN
std 0.518 0.533 0.540 0.539 0.546 0.524 0.546 0.562 0.679 0.647 ... 1.662 1.653 2.674 2.667 2.711 2.741 2.741 793.494 853.899 NaN
min 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 0.003 0.001 0.141 7.209 0.001 0.001 NaN
25% 0.121 0.122 0.120 0.132 0.134 0.143 0.138 0.140 0.163 0.175 ... 0.961 1.044 1.783 2.132 2.108 7.344 14.412 12.192 2,436.455 NaN
50% 0.280 0.276 0.278 0.299 0.305 0.322 0.307 0.316 0.364 0.394 ... 1.954 2.098 3.305 3.635 3.673 8.867 15.935 34.192 2,496.971 NaN
75% 0.546 0.556 0.555 0.584 0.591 0.601 0.596 0.612 0.703 0.762 ... 3.119 3.286 5.361 5.610 5.665 10.842 17.910 91.928 2,555.017 NaN
max 7.635 8.606 9.528 9.111 8.597 7.130 8.577 8.171 9.005 7.829 ... 13.272 13.022 19.081 18.317 19.298 23.072 30.140 2,869.824 2,873.005 NaN

8 rows × 24 columns

Hide code cell source

# add the test error statistics as a separate sheet (5 decimal places)
mae_stats_ordered_test.to_excel(writer, sheet_name='mae_stats_ordered_test', float_format='%.5f')

Hide code cell source

# 'mean' row of the validation and test error statistics side by side,
# ordered by the validation column.
mean_val = mae_stats_ordered_val.loc['mean']
mean_test = mae_stats_ordered_test.loc['mean']
cp_mean_perf = pd.concat([mean_val, mean_test], axis=1, keys=['val', 'test'])
cp_mean_perf = cp_mean_perf.sort_values(by='val')
cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f')
cp_mean_perf
val test
BPCA 0.422 0.432
DAE 0.433 0.438
VAE 0.434 0.439
TRKNN 0.450 0.458
RF 0.461 0.465
CF 0.465 0.468
KNN5 0.467 0.469
KNN 0.481 0.482
KNN_IMPUTE 0.554 0.558
IRM 0.588 0.587
ROWMEDIAN 0.598 0.602
Median 0.598 0.602
LLS 1.329 0.874
QRILC 1.651 1.630
COLMEDIAN 2.210 2.223
SVDMETHOD 2.309 2.330
PI 3.808 3.804
MINDET 4.108 4.109
MINPROB 4.136 4.127
MINIMUM 9.272 9.271
ZERO 16.340 16.339
IMPSEQROB 333.478 334.546
MLE 2,172.384 2,186.302
IMPSEQ NaN NaN

Hide code cell source

# all sheets were added; close the writer (presumably a pd.ExcelWriter -- confirm)
writer.close()

Intensity distribution as histogram#

Plot the predictions of the top 4 models for the intensities in the test data.

Hide code cell source

# One panel per top model: the model's predictions (coloured, half
# transparent) drawn over the observed test intensities (grey).
min_max = pimmslearn.plotting.data.min_max(pred_test[TARGET_COL])
top_n = 4
fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True)

for model, color, ax in zip(
        ORDER_MODELS[:top_n],
        COLORS_TO_USE[:top_n],
        axes):

    # `bins` stays bound after the loop and is reused to count the
    # observations per bin
    ax, bins = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[TARGET_COL],
        color='grey',
        min_max=min_max,
        ax=ax
    )
    ax, _ = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[model],
        color=color,
        min_max=min_max,
        ax=ax,
        alpha=0.5,
    )
    # plain loop: the rotation is a side effect, no list needs to be built
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(90)
    ax.legend()

axes[0].set_ylabel('Number of observations')

fname = args.out_figures / f'2_{group}_intensity_binned_top_{top_n}_models_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf
_images/85ee20313e8f2d787aa5408920ab2de79189ea177508be38232172f83d14dd61.png

Hide code cell source

# Number of values per histogram bin for the target and the plotted models,
# stored next to the figure (same file stem, Excel suffix).
hist_columns = [TARGET_COL, *ORDER_MODELS[:top_n]]
counts_per_bin = pimmslearn.pandas.get_counts_per_bin(
    df=pred_test,
    bins=bins,
    columns=hist_columns,
)

counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
observed BPCA DAE VAE TRKNN
bin
(7, 8] 2 0 0 0 0
(8, 9] 7 0 0 0 0
(9, 10] 18 2 0 0 1
(10, 11] 69 29 26 15 13
(11, 12] 217 165 156 119 113
(12, 13] 634 531 541 501 479
(13, 14] 1,394 1,248 1,215 1,228 1,224
(14, 15] 2,042 2,033 2,090 2,121 2,118
(15, 16] 2,054 2,359 2,350 2,420 2,429
(16, 17] 1,787 1,867 1,846 1,873 1,842
(17, 18] 1,333 1,363 1,385 1,346 1,401
(18, 19] 965 956 938 927 923
(19, 20] 792 789 800 821 800
(20, 21] 536 528 527 509 533
(21, 22] 320 322 314 312 323
(22, 23] 182 176 185 180 171
(23, 24] 102 92 86 90 92
(24, 25] 45 38 43 39 37
(25, 26] 50 57 56 58 59
(26, 27] 25 20 17 18 17
(27, 28] 3 2 2 0 2
(28, 29] 8 11 10 12 11
(29, 30] 13 11 12 11 12

Correlation per sample#

Hide code cell source

# Correlation of each model column with the target column, computed
# separately for each group of `sample_index_name`.
corr_per_sample_test = (pred_test
                        .groupby(sample_index_name)
                        .apply(lambda df: df.corr().loc[TARGET_COL])
                        [ORDER_MODELS])
# add the number of non-missing target values per group as 'n_obs'
corr_per_sample_test = corr_per_sample_test.join(
    pred_test
    .groupby(sample_index_name)[TARGET_COL]
    .count()
    .rename('n_obs')
)
# groups with fewer than 3 observations are left out of the summary
too_few_obs = corr_per_sample_test['n_obs'] < 3
corr_per_sample_test.loc[~too_few_obs].describe()
BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 ... 210.000 210.000 0.000 210.000 0.000 0.000 210.000 30.000 0.000 210.000
mean 0.969 0.968 0.967 0.966 0.966 0.966 0.965 0.963 0.948 0.952 ... 0.076 -0.001 NaN 0.001 NaN NaN 0.371 -0.018 NaN 60.000
std 0.017 0.018 0.019 0.019 0.018 0.017 0.018 0.019 0.035 0.022 ... 0.192 0.134 NaN 0.135 NaN NaN 0.139 0.151 NaN 9.810
min 0.878 0.905 0.847 0.858 0.872 0.893 0.870 0.888 0.722 0.865 ... -0.402 -0.388 NaN -0.315 NaN NaN 0.021 -0.287 NaN 31.000
25% 0.962 0.962 0.962 0.960 0.959 0.960 0.956 0.953 0.938 0.943 ... -0.059 -0.085 NaN -0.072 NaN NaN 0.288 -0.118 NaN 53.000
50% 0.973 0.972 0.971 0.970 0.969 0.970 0.970 0.968 0.958 0.956 ... 0.067 0.001 NaN -0.000 NaN NaN 0.368 -0.042 NaN 60.000
75% 0.981 0.980 0.980 0.979 0.978 0.978 0.979 0.978 0.969 0.966 ... 0.200 0.098 NaN 0.075 NaN NaN 0.448 0.062 NaN 67.000
max 0.994 0.993 0.994 0.992 0.992 0.994 0.992 0.990 0.987 0.988 ... 0.546 0.314 NaN 0.487 NaN NaN 0.889 0.393 NaN 86.000

8 rows × 25 columns

Hide code cell source

# Box plot of the per-sample correlations of the top N models, and dump of
# the underlying values to an Excel file next to the figure.
# ! add minimum
kwargs = dict(ylim=(0.7, 1), rot=90,
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model predictions per sample on test data',
              ylabel='correlation per sample')
ax = (corr_per_sample_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs))
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                       horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    # summary restricted to samples with enough observations, in line with
    # the plotted data and the per-feature summary sheet
    corr_per_sample_test.loc[~too_few_obs].describe().to_excel(
        w, sheet_name='summary')
    corr_per_sample_test.to_excel(w, sheet_name='correlations')
    corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf
_images/e88cc53aac56a564a27882d698badbdbbdacfb9198562ecf00aa96b3ff2c2f4d.png

Identify samples whose correlation is below the lowest lower whisker (across the top N models) for at least one of these models.

Hide code cell source

# Samples with a correlation below the smallest lower whisker for at least
# one of the top N models; the lowest value per row is highlighted.
top_n_corr_per_sample = corr_per_sample_test[TOP_N_ORDER]
treshold = pimmslearn.pandas.get_lower_whiskers(top_n_corr_per_sample).min()
mask = top_n_corr_per_sample.lt(treshold).any(axis=1)
(corr_per_sample_test.loc[mask].style.highlight_min(axis=1)
 if mask.any() else 'Nothing to display')
  BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
Sample ID                                                  
Sample_015 0.958593 0.910744 0.944713 0.967788 0.928044 0.936299 0.954757 0.964680 0.932223 0.908295 0.882874 0.882874 0.923304 0.938062 nan 0.082354 -0.038744 nan 0.131829 nan nan 0.127719 -0.097389 nan 38
Sample_043 0.949633 0.942351 0.846692 0.858409 0.872425 0.916899 0.870351 0.888093 0.847971 0.882989 0.814366 0.814366 0.828364 0.769910 nan -0.060303 0.009113 nan -0.062515 nan nan 0.444295 nan nan 57
Sample_047 0.939299 0.946556 0.936990 0.916738 0.934731 0.925031 0.950780 0.945377 0.874472 0.900719 0.896683 0.896683 0.009710 0.908690 nan -0.077002 0.092931 nan 0.065600 nan nan 0.524493 nan nan 46
Sample_069 0.936850 0.929852 0.935547 0.944619 0.927697 0.943977 0.944406 0.937711 0.904430 0.950822 0.914668 0.914668 0.918364 0.792805 nan 0.044260 -0.030656 nan 0.208589 nan nan 0.271641 nan nan 68
Sample_080 0.922142 0.920085 0.916809 0.911596 0.915391 0.917945 0.912309 0.921951 0.902040 0.883475 0.893836 0.893836 0.880736 0.833866 nan 0.093012 0.198434 nan -0.011553 nan nan 0.383774 nan nan 64
Sample_091 0.878328 0.919289 0.907804 0.931055 0.910469 0.893144 0.918275 0.903784 0.920915 0.864672 0.903019 0.903019 0.910180 0.779463 nan -0.095511 0.092115 nan -0.008017 nan nan 0.341572 nan nan 60
Sample_108 0.929388 0.914144 0.942561 0.940255 0.945892 0.940582 0.951487 0.946047 0.866107 0.915455 0.939810 0.939810 0.929216 0.833782 nan -0.044493 0.173619 nan -0.092451 nan nan 0.407365 nan nan 68
Sample_109 0.937615 0.930815 0.916769 0.924847 0.928346 0.948924 0.898780 0.893626 0.841761 0.879726 0.890426 0.890426 0.931145 0.785881 nan -0.059637 0.110260 nan 0.116488 nan nan 0.337532 -0.040683 nan 59
Sample_111 0.978525 0.965868 0.908651 0.974002 0.933229 0.963899 0.958219 0.933775 0.923850 0.935239 0.857016 0.857016 0.962568 0.671483 nan -0.130888 0.138566 nan 0.161468 nan nan 0.451452 nan nan 54
Sample_115 0.891712 0.909542 0.911853 0.915296 0.923962 0.903012 0.928234 0.918842 0.853242 0.874847 0.881285 0.881285 0.901459 0.847466 nan 0.094778 -0.085421 nan -0.029497 nan nan 0.320851 nan nan 63
Sample_134 0.933622 0.934306 0.930187 0.907465 0.933804 0.923113 0.952746 0.935936 0.905001 0.915387 0.865397 0.865397 0.881808 0.878941 nan 0.389792 -0.068477 nan -0.071644 nan nan 0.344741 nan nan 66
Sample_138 0.957581 0.905076 0.915126 0.953928 0.925701 0.967977 0.927573 0.936390 0.943933 0.936371 0.921359 0.921359 0.963983 0.825719 nan 0.001445 -0.013959 nan -0.025224 nan nan 0.523470 nan nan 46
Sample_148 0.975203 0.926010 0.949715 0.979465 0.946442 0.971604 0.946668 0.926864 0.929094 0.955283 0.935395 0.935395 0.984939 0.859725 nan 0.037085 -0.164023 nan 0.143599 nan nan 0.362124 nan nan 62
Sample_151 0.947829 0.917531 0.923905 0.919188 0.941400 0.923964 0.937720 0.937262 0.934510 0.915733 0.904552 0.904552 0.917004 0.865561 nan -0.189751 -0.043852 nan 0.103193 nan nan 0.302307 nan nan 70
Sample_152 0.922635 0.920418 0.919858 0.926056 0.929744 0.926967 0.932482 0.931084 0.918127 0.917052 0.909410 0.909410 0.877491 0.897752 nan 0.098949 -0.009049 nan 0.029168 nan nan 0.336118 nan nan 64
Sample_162 0.929186 0.930251 0.943147 0.933190 0.945112 0.925274 0.937839 0.949772 0.956867 0.940055 0.937255 0.937255 0.933909 0.930896 nan 0.516397 0.011234 nan 0.050215 nan nan 0.294184 0.087903 nan 51
Sample_167 0.952090 0.938031 0.930949 0.931476 0.933981 0.957702 0.939793 0.936802 0.922116 0.930438 0.905413 0.905413 0.923164 0.911191 nan 0.221299 -0.180808 nan -0.026498 nan nan 0.235179 nan nan 65
Sample_171 0.948100 0.918579 0.928820 0.901446 0.898896 0.915130 0.919215 0.909432 0.845442 0.899387 0.863135 0.863135 0.898770 0.800105 nan -0.061550 0.032891 nan -0.194996 nan nan 0.344922 nan nan 40
Sample_181 0.912274 0.928770 0.928218 0.920976 0.919121 0.929868 0.929397 0.913043 0.869468 0.929034 0.896030 0.896030 0.899227 0.862711 nan -0.243627 -0.166853 nan 0.198834 nan nan 0.419029 0.117814 nan 60
Sample_185 0.949315 0.943508 0.946241 0.929238 0.938571 0.949606 0.936803 0.930556 0.924211 0.929391 0.922411 0.922411 0.899905 0.912469 nan -0.264227 0.059373 nan -0.171097 nan nan 0.576069 nan nan 69
Sample_199 0.928280 0.933976 0.930147 0.930234 0.921014 0.927713 0.917037 0.925542 0.912243 0.918083 0.910943 0.910943 0.937794 0.793546 nan -0.086879 -0.215487 nan 0.171522 nan nan 0.289504 nan nan 45
Sample_200 0.934067 0.923322 0.936132 0.933368 0.932999 0.898005 0.934918 0.918361 0.722446 0.926169 0.891269 0.891269 0.916117 0.736398 nan 0.034387 0.211896 nan 0.115865 nan nan 0.535540 -0.109406 nan 40

Hide code cell source

# Show the test predictions of all samples for one randomly drawn feature.
feature_names = pred_test.index.levels[-1]  # values of the last index level
# NOTE(review): N_SAMPLES is bound to the complete index, not to a count, and
# is not used in this cell -- confirm whether it is needed elsewhere.
N_SAMPLES = pred_test.index
M = len(feature_names)
pred_test.loc[pd.IndexSlice[:, feature_names[random.randint(0, M - 1)]], :]
observed BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_081 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 15.925 15.527 15.462 15.381 15.308 15.444 15.361 15.474 15.404 15.273 ... 16.939 11.687 12.067 12.197 7.068 0 17.649 2,631.820 NaN 185
Sample_105 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 14.786 15.293 15.192 15.541 15.589 15.404 15.071 15.368 15.248 15.183 ... 17.648 13.345 11.896 13.062 7.068 0 57.949 2,614.181 NaN 185
Sample_129 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 15.779 15.226 15.494 15.453 15.455 15.375 14.879 15.387 15.618 15.726 ... 16.973 12.061 11.762 11.779 7.068 0 13.645 2,544.946 NaN 185
Sample_135 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 15.096 15.046 15.062 15.025 15.153 15.083 14.904 14.914 14.917 15.287 ... 17.130 12.869 11.871 12.982 7.068 0 23.874 2,585.491 NaN 185
Sample_171 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 15.537 15.802 16.121 15.858 15.534 15.517 15.490 15.572 15.645 15.467 ... 17.621 13.897 12.295 11.098 7.068 0 19.780 2,512.177 NaN 185
Sample_186 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 15.520 15.270 15.408 15.415 15.413 15.540 15.294 14.919 14.897 15.644 ... 17.042 12.164 12.660 12.628 7.068 0 36.322 2,452.355 NaN 185

6 rows × 26 columns

Hide code cell source

# Draw one feature name at random (from the sorted unique names) and show
# its test predictions across samples.
options = random.sample(sorted(set(feature_names)), 1)
pred_test.loc[pd.IndexSlice[:, options[0]], :]
observed BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_001 P08185 20.146 20.026 19.982 20.108 20.242 20.297 20.262 20.023 20.080 20.756 ... 17.251 12.547 12.700 13.268 7.068 0 85.190 18.038 NaN 210
Sample_011 P08185 20.628 20.090 20.039 20.002 19.985 20.054 20.424 20.263 20.249 20.393 ... 17.324 12.814 12.852 12.365 7.068 0 20.544 2,452.287 NaN 210
Sample_014 P08185 20.384 20.440 20.496 20.521 20.348 20.535 20.455 20.650 20.634 20.610 ... 17.802 13.315 13.106 11.830 7.068 0 -10.966 2,561.230 NaN 210
Sample_021 P08185 19.986 20.112 20.076 20.125 20.115 20.097 20.193 20.098 20.194 20.392 ... 17.312 13.219 12.891 12.855 7.068 0 -29.200 2,495.603 NaN 210
Sample_035 P08185 19.815 20.026 20.022 20.056 20.078 20.190 20.136 20.126 20.298 20.418 ... 17.257 14.508 12.813 13.702 7.068 0 20.303 2,467.202 NaN 210
Sample_076 P08185 20.737 20.428 20.336 20.323 20.534 20.586 20.526 20.303 20.283 20.701 ... 16.950 13.357 11.189 11.982 7.068 0 82.442 2,577.638 NaN 210
Sample_093 P08185 20.876 20.975 21.216 21.129 21.258 21.057 21.298 20.979 20.938 20.965 ... 17.088 12.194 11.686 11.457 7.068 0 30.694 2,625.935 NaN 210
Sample_098 P08185 20.195 20.235 20.283 20.035 20.417 20.521 20.177 20.358 20.099 20.495 ... 17.072 11.699 11.808 12.191 7.068 0 97.682 2,502.971 NaN 210
Sample_100 P08185 20.838 21.065 20.974 20.935 21.114 20.889 21.297 20.701 20.714 20.789 ... 16.914 11.242 11.658 11.089 7.068 0 -24.506 2,542.193 NaN 210
Sample_158 P08185 21.174 20.983 20.944 20.829 20.944 20.951 21.232 21.197 21.131 20.805 ... 17.181 11.424 11.916 11.622 7.068 0 114.476 2,513.638 NaN 210
Sample_207 P08185 20.789 20.537 20.497 20.499 20.679 20.498 20.875 20.764 20.836 20.734 ... 17.151 12.714 12.307 11.422 7.068 0 -31.780 2,466.870 NaN 210
Sample_209 P08185 20.944 20.685 20.665 20.553 20.838 20.804 20.691 20.693 20.684 20.818 ... 17.184 12.251 12.435 11.476 7.068 0 2.677 18.308 NaN 210

12 rows × 26 columns

Correlation per feature#

Hide code cell source

# Correlation of each model column with the target column, computed
# separately for each group of FEAT_NAME.
corr_per_feat_test = pred_test.groupby(FEAT_NAME).apply(
    lambda df: df.corr().loc[TARGET_COL])[ORDER_MODELS]
# add the number of non-missing target values per group as 'n_obs'
corr_per_feat_test = corr_per_feat_test.join(pred_test.groupby(FEAT_NAME)[
    TARGET_COL].count().rename('n_obs'))

# groups with fewer than 3 observations are left out of the summary
too_few_obs = corr_per_feat_test['n_obs'] < 3
corr_per_feat_test.loc[~too_few_obs].describe()
BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 ... 1,396.000 1,396.000 1,396.000 1,396.000 0.000 0.000 1,396.000 1,396.000 0.000 1,396.000
mean 0.636 0.632 0.622 0.607 0.584 0.604 0.564 0.538 0.445 0.470 ... -0.008 0.010 0.086 0.047 NaN NaN -0.009 -0.016 NaN 8.999
std 0.337 0.349 0.343 0.342 0.354 0.336 0.359 0.369 0.422 0.388 ... 0.447 0.413 0.441 0.412 NaN NaN 0.411 0.433 NaN 3.913
min -0.998 -0.942 -1.000 -0.977 -0.991 -0.901 -0.983 -0.991 -1.000 -0.999 ... -0.999 -1.000 -0.999 -1.000 NaN NaN -1.000 -0.999 NaN 3.000
25% 0.506 0.503 0.490 0.455 0.421 0.448 0.392 0.359 0.221 0.266 ... -0.341 -0.282 -0.242 -0.224 NaN NaN -0.285 -0.313 NaN 6.000
50% 0.746 0.750 0.737 0.706 0.692 0.700 0.677 0.634 0.554 0.554 ... -0.005 0.019 0.118 0.045 NaN NaN 0.004 -0.016 NaN 8.000
75% 0.880 0.878 0.868 0.859 0.849 0.852 0.833 0.809 0.770 0.768 ... 0.331 0.292 0.425 0.351 NaN NaN 0.268 0.294 NaN 11.000
max 0.999 0.998 1.000 1.000 1.000 0.999 0.999 1.000 1.000 0.998 ... 0.999 0.999 0.999 0.999 NaN NaN 0.992 0.998 NaN 32.000

8 rows × 25 columns

Hide code cell source

# Features excluded for having too few observations; only rows with at least
# 3 non-missing entries are shown.
corr_per_feat_test.loc[too_few_obs].dropna(thresh=3, axis=0)
BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
protein groups
A0A0A0MS09;P01880;P01880-2 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000 1.000 1.000 -1.000 1.000 1.000 -1.000 1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
A0A0C4DH29 -1.000 1.000 1.000 -1.000 -1.000 -1.000 1.000 -1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000 -1.000 -1.000 1.000 1.000 -1.000 1.000 1.000 -1.000 -1.000 ... 1.000 -1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000 -1.000 1.000 -1.000 -1.000 1.000 -1.000 1.000 -1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
D6RF35 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
E7EQ64;P07477 1.000 1.000 1.000 1.000 -1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN 1.000 -1.000 NaN 2
F8WDW9;Q96AP7 -1.000 1.000 1.000 -1.000 1.000 -1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
J3KRP0 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 1.000 1.000 1.000 NaN NaN 1.000 1.000 NaN 2
O43581-2;O43581-3;O43581-5 -1.000 -1.000 -1.000 -1.000 1.000 -1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P04075 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
P04080 1.000 1.000 1.000 1.000 -1.000 1.000 1.000 -1.000 1.000 -1.000 ... -1.000 1.000 -1.000 1.000 NaN NaN -1.000 -1.000 NaN 2
P33151 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P62258 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 1.000 1.000 1.000 NaN NaN -1.000 1.000 NaN 2
Q9NYQ8 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 -1.000 1.000 ... -1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
Q9Y281;Q9Y281-3 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2

16 rows × 25 columns

Hide code cell source

# Box plot of the per-feature correlations of the top N models, and dump of
# the underlying values to an Excel file next to the figure.
kwargs = {
    'rot': 90,
    'flierprops': {'markersize': 1},
    'ylabel': f'correlation per {FEAT_NAME_DISPLAY}',
}
corr_per_feat_plotted = corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER]
ax = corr_per_feat_plotted.plot.box(**kwargs)
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                       horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_feat.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    # summary restricted to features with enough observations
    summary = corr_per_feat_test.loc[~too_few_obs].describe()
    summary.to_excel(w, sheet_name='summary')
    corr_per_feat_test.to_excel(w, sheet_name='correlations')
    corr_per_feat_plotted.to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf
_images/98950b54d1a89c94947e9b71571a088e971dea4d1246bad186916e3239447b4b.png

Hide code cell source

# Number of stacked test values per FEAT_NAME group, as a Series named 'count'.
feat_count_test = (data.test_y
                   .stack()
                   .groupby(FEAT_NAME)
                   .count()
                   .rename('count'))
feat_count_test.head()
protein groups
A0A024QZX5;A0A087X1N8;P35237                                                     10
A0A024R0T9;K7ER74;P02655                                                          8
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8    6
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                           8
A0A075B6H7                                                                        4
Name: count, dtype: int64

Hide code cell source

# Flag features whose correlation for at least one of the top N models lies
# below the minimum of the lower whiskers (presumably one whisker per model).
# NOTE(review): features with too few observations (`too_few_obs`) are not
# excluded here, unlike in the box plot -- confirm this is intended.
treshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_feat_test[TOP_N_ORDER]).min()
mask = (corr_per_feat_test[TOP_N_ORDER] < treshold).any(axis=1)


def highlight_min(s, color, tolerence=0.00001):
    """Return CSS styles which highlight the minimum of a Series.

    Meant to be passed to ``Styler.apply``.

    Parameters
    ----------
    s : pandas.Series
        Values of one row or column.
    color : str
        Background color to use for the highlighted entries.
    tolerence : float, optional
        Entries closer than this to the minimum are highlighted as well
        (parameter name kept as is for backward compatibility).

    Returns
    -------
    numpy.ndarray
        ``"background-color: <color>;"`` where the value is within the
        tolerance of the minimum, ``None`` everywhere else (missing values
        never compare as close and therefore get ``None``).
    """
    css = f"background-color: {color};"
    is_minimum = (s - s.min()).abs() < tolerence
    return np.where(is_minimum, css, None)


# Selected features with their correlations and test counts, ordered by the
# number of test observations.
view = corr_per_feat_test.join(feat_count_test).loc[mask].sort_values('count')

if view.empty:
    print("None found")
else:
    # highlight the row-wise minimum over the columns of corr_per_feat_test
    display(
        view.style.apply(
            highlight_min,
            color='yellow',
            axis=1,
            subset=corr_per_feat_test.columns,
        )
    )
  BPCA DAE VAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs count
protein groups                                                    
A0A0C4DH29 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 nan nan -1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000000 -1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan 1.000000 -1.000000 nan 2 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000000 -1.000000 1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 -1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
E7EQ64;P07477 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 nan nan -1.000000 -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 -1.000000 nan 2 2
F8WDW9;Q96AP7 -1.000000 1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
P04080 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 -1.000000 nan nan -1.000000 1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 -1.000000 nan 2 2
O43581-2;O43581-3;O43581-5 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 1.000000 nan 2 2
O95497 0.491301 -0.200703 -0.041366 -0.187796 -0.606285 0.666073 -0.278320 -0.038249 -0.972465 0.974252 nan nan -0.924475 -0.486012 -0.728224 -0.107445 -0.982782 0.908126 0.999428 nan nan 0.163889 -0.914582 nan 3 3
P04040 0.955425 0.621147 -0.999505 0.364706 0.799875 0.414166 0.860313 0.995399 0.584629 0.685429 nan nan 0.736559 0.554141 0.910187 0.961838 -0.485745 0.889522 0.926272 nan nan -0.755848 0.778883 nan 3 3
A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;P01892;P10316 -0.405690 -0.647248 -0.543920 -0.426425 0.749121 -0.291881 0.225780 0.141568 -0.230425 0.240706 nan nan 0.029026 -0.576786 0.969574 0.997890 -0.978619 -0.651641 -0.153944 nan nan 0.599349 0.988049 nan 3 3
P14138 -0.783274 -0.366421 0.358911 0.277987 0.170679 -0.804787 0.474087 -0.039227 0.184293 -0.975804 nan nan -0.284116 0.587444 -0.680464 -0.988691 -0.558094 -0.256388 0.856104 nan nan 0.330904 0.330904 nan 3 3
A0A087X117;A0A0G2JN29;J3KN36;P69849;Q15155;Q5JPE7;Q5JPE7-2 0.678810 -0.527548 -0.333694 0.676860 0.696461 0.701446 0.312055 0.660508 -0.865326 0.808608 nan nan -0.988011 -0.975921 -0.484680 -0.563403 0.416763 -0.965384 -0.996648 nan nan -0.178456 0.909074 nan 3 3
Q5FWE3;Q5FWE3-3 -0.595898 -0.796036 0.612054 0.940836 0.328855 0.010845 -0.515612 -0.450286 0.997780 -0.317399 nan nan 0.852964 0.790685 0.879697 0.674983 0.970757 0.966111 0.984187 nan nan -0.999926 0.995547 nan 3 3
Q0P6D2 0.488693 -0.610145 -0.146197 -0.977189 -0.623303 0.543846 -0.914209 -0.991429 0.990772 -0.817525 nan nan -0.910486 -0.776215 -0.815144 -0.873162 0.714626 -0.856054 -0.999616 nan nan 0.198784 -0.816625 nan 3 3
P67936 -0.781265 0.272540 -0.759971 -0.280233 -0.776073 -0.022321 -0.930367 0.668141 -0.999486 -0.951572 nan nan -0.952164 0.863659 -0.989773 -0.899019 -0.637075 0.990700 0.998489 nan nan -0.364239 -0.919047 nan 3 3
Q9BRA2 -0.997854 -0.942232 -0.973725 -0.976681 -0.784071 -0.878858 0.355335 -0.969137 -0.999909 0.328330 nan nan -0.376394 -0.999682 -0.888126 -0.946135 -0.979374 -0.251266 -0.903448 nan nan 0.231242 0.893058 nan 3 3
Q96KR4;Q96KR4-3 -0.811499 -0.197558 -0.452866 0.188634 -0.990702 -0.538430 0.962865 0.998946 -0.996456 -0.998544 nan nan -0.499851 0.598757 -0.969111 -0.794681 -0.148142 0.464970 0.057462 nan nan 0.770952 0.620557 nan 3 3
Q9UI40;Q9UI40-2 0.154643 0.785058 0.162090 0.705987 -0.439641 0.886146 0.043055 -0.368219 0.609727 -0.080993 nan nan 0.216445 -0.809808 0.418780 0.168840 -0.672647 0.994516 0.450875 nan nan 0.588145 0.963302 nan 3 3
A0A075B7B8 0.995309 0.954715 0.841441 -0.584344 0.971685 0.992029 0.645944 0.628331 0.493563 0.290645 nan nan -0.198260 -0.009117 0.596184 0.426337 -0.685340 -0.818663 -0.599950 nan nan -0.419682 0.603615 nan 4 4
A0A087WSY4 0.341097 -0.763232 -0.574571 -0.455325 0.453828 0.148264 0.666119 0.804721 -0.612366 -0.162758 nan nan 0.149034 0.110117 0.994065 0.280256 -0.141634 -0.553907 -0.267603 nan nan -0.767064 0.652348 nan 4 4
Q9ULP0-3;Q9ULP0-6 -0.433605 -0.025330 0.191857 0.280988 0.391351 -0.064157 -0.127518 -0.217068 -0.824736 0.569383 nan nan 0.047124 0.862531 -0.147969 0.088032 0.515680 0.597287 0.726550 nan nan 0.651242 0.678026 nan 4 4
Q9NRB3 0.117442 -0.230034 0.102544 0.279822 -0.415625 0.087905 0.754915 0.318615 0.038608 0.329407 nan nan 0.838394 -0.735280 0.017737 0.158711 -0.882577 0.847087 -0.061035 nan nan -0.205822 0.304178 nan 4 4
P69905 0.965583 -0.851069 -0.374948 0.990263 -0.744286 0.972703 -0.734862 -0.884057 0.787808 0.652423 nan nan 0.995940 0.541462 -0.089591 0.819375 -0.839311 -0.613292 -0.812870 nan nan 0.599413 0.563706 nan 4 4
P21810 0.488982 -0.761744 -0.486579 0.558018 -0.695354 -0.893059 0.725338 0.497175 0.976315 0.606417 nan nan 0.507237 0.035458 0.086433 0.181221 0.933104 -0.013604 0.030647 nan nan -0.032190 -0.515024 nan 4 4
P55058 0.331515 -0.629823 -0.579912 -0.453671 -0.538307 0.565953 -0.730945 -0.687075 0.561214 0.511725 nan nan -0.850960 0.014163 -0.860013 0.937309 0.921167 0.109603 0.709260 nan nan -0.314778 -0.708824 nan 4 4
P01912;Q5Y7A7 0.312242 -0.406923 0.173518 -0.737679 0.943808 -0.155871 -0.983163 -0.923553 0.367817 0.598949 nan nan -0.036948 -0.459363 0.393426 0.622172 0.219166 0.128832 -0.802865 nan nan -0.520030 -0.464856 nan 4 4
B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 -0.586205 -0.451342 -0.752758 -0.468526 -0.781136 -0.419255 -0.950792 -0.978604 -0.206013 0.640276 nan nan -0.926715 -0.713910 -0.611749 -0.688762 0.209302 -0.359254 -0.113108 nan nan -0.630160 0.512953 nan 4 4
Q99538 0.593821 0.154559 -0.265760 0.377925 -0.047666 0.336065 -0.288458 -0.152372 0.223491 -0.280701 nan nan 0.914120 0.233821 0.708292 -0.029421 0.741731 0.753316 0.886699 nan nan -0.421840 -0.113954 nan 4 4
A8MXB9;J3KQJ1;Q8NBJ7 -0.526368 -0.592854 -0.161994 -0.288690 0.904824 -0.214112 0.719047 0.882512 0.575047 0.781018 nan nan 0.983037 0.775224 -0.315140 -0.447225 -0.813824 0.092247 -0.201595 nan nan -0.625728 -0.380936 nan 4 4
E9PN95;P11684 -0.044137 0.358713 0.505875 -0.416898 0.316623 0.102882 -0.241660 0.598536 0.553236 0.989063 nan nan 0.572649 -0.095691 -0.972121 -0.214485 0.574750 -0.729323 -0.227479 nan nan 0.624754 0.488347 nan 4 4
A0A0G2JRN3 -0.879258 -0.636153 -0.858115 -0.190835 -0.952830 -0.901498 0.676931 0.355859 -0.978184 -0.922357 nan nan -0.966410 0.805412 -0.890772 -0.522710 -0.923275 -0.786540 0.643754 nan nan 0.786883 0.273736 nan 4 4
P31150 -0.695110 -0.764001 -0.349630 0.079765 0.058685 -0.467482 -0.302639 -0.258552 0.871434 0.374253 nan nan -0.383378 -0.596147 -0.744603 -0.826531 0.721494 0.679134 -0.114232 nan nan 0.018162 0.997740 nan 4 4
P48745 -0.215191 -0.559369 -0.262030 0.281906 -0.476138 -0.101059 -0.708241 -0.711413 0.592691 0.072594 nan nan -0.966228 0.345911 -0.502678 -0.007247 -0.741987 -0.811427 -0.934651 nan nan -0.816749 -0.563041 nan 4 4
P78310;P78310-2;P78310-5;P78310-6;P78310-7 -0.287802 0.729643 0.775453 0.187632 0.163827 0.779200 0.236820 0.177240 0.323788 -0.925310 nan nan -0.002267 -0.034253 0.370874 -0.992637 -0.241782 -0.740181 -0.924721 nan nan -0.197104 0.065686 nan 4 4
Q8WXD2 -0.194738 -0.605442 -0.291770 0.030615 -0.518988 -0.331197 -0.644254 -0.896598 -0.044385 -0.527932 nan nan -0.031366 -0.399413 -0.879923 -0.696602 0.202507 0.367920 0.466050 nan nan -0.304569 0.801129 nan 4 4
Q9NS85 0.209240 0.374908 -0.149727 -0.748054 -0.432896 -0.728562 0.174286 0.827546 -0.293415 -0.490454 nan nan -0.811087 -0.059568 0.688432 0.684565 -0.985387 -0.573150 -0.322718 nan nan 0.028761 0.700517 nan 4 4
Q8TEA8 0.312460 0.444209 0.322861 0.078415 -0.281820 0.220046 -0.199719 -0.061944 0.642442 0.704937 nan nan 0.711175 0.557112 0.925803 0.920805 -0.342622 0.711193 0.523039 nan nan -0.971775 0.075778 nan 4 4
Q13508;Q13508-2;Q13508-3 -0.588905 0.053475 0.826805 0.317771 0.413427 0.132018 0.367007 -0.137197 -0.196508 0.072987 nan nan 0.595601 0.030517 -0.921969 -0.902558 -0.813208 -0.647509 -0.275654 nan nan -0.816043 0.878974 nan 5 5
Q10469 -0.098233 -0.457639 -0.557534 -0.291141 -0.872703 -0.300285 -0.671391 -0.600598 0.649427 0.606430 nan nan -0.185669 0.455183 -0.517750 -0.162715 0.009418 -0.550749 -0.224459 nan nan -0.084422 0.700076 nan 5 5
Q6ZVL6 -0.405009 0.583156 0.269060 0.541507 -0.071618 0.296924 -0.626545 -0.421210 0.745658 0.380995 nan nan 0.809662 -0.477660 0.343913 -0.237720 0.002701 0.617457 0.547283 nan nan -0.305862 -0.650631 nan 5 5
Q9NZC2;Q9NZC2-2;Q9NZC2-3 -0.532191 -0.706473 -0.485905 -0.549588 -0.389522 -0.461760 0.118561 -0.020609 -0.710573 -0.652656 nan nan -0.534201 -0.706058 0.889235 -0.779762 0.693069 -0.454867 -0.765599 nan nan 0.820294 0.820294 nan 5 5
Q9NZP8 0.485369 0.747239 0.310518 -0.596706 -0.544308 0.101310 0.391275 0.135336 -0.921105 0.672486 nan nan -0.849012 0.612408 0.108741 -0.011939 -0.045177 0.306441 -0.080377 nan nan 0.307325 0.307325 nan 5 5
Q9UHI8 0.073471 0.180793 0.181872 -0.445122 -0.278777 0.128984 0.316329 -0.061541 0.216956 -0.119815 nan nan -0.208262 -0.414518 0.051369 0.334440 0.075901 0.118070 0.157268 nan nan -0.151806 -0.746015 nan 5 5
Q8N428 -0.711604 -0.918970 -0.628009 -0.372718 -0.511920 -0.796349 -0.815146 -0.973013 0.485147 -0.775526 nan nan 0.896715 0.089730 -0.145359 0.398169 -0.043980 0.571660 0.339849 nan nan -0.592174 0.633450 nan 5 5
Q8NFZ4 -0.225202 -0.052699 -0.210822 0.001304 -0.245552 -0.567061 -0.011690 0.057673 -0.422722 -0.787582 nan nan 0.200058 0.291280 0.685266 0.396679 -0.135909 -0.363897 -0.948068 nan nan -0.741786 0.576960 nan 5 5
Q5BIV9 -0.499694 -0.458876 -0.284685 -0.073578 -0.085458 -0.448560 -0.595082 -0.855339 -0.481080 -0.438393 nan nan -0.128645 0.077697 -0.778869 -0.513219 0.422128 -0.261649 -0.225768 nan nan -0.127227 -0.239878 nan 5 5
P42262;P42262-2;P42262-3 0.134997 0.198913 0.517102 0.060401 -0.225840 0.444351 0.205425 0.139807 -0.211874 -0.032356 nan nan -0.429564 -0.370133 0.427599 0.545915 -0.862261 -0.124827 0.484905 nan nan 0.087841 0.050468 nan 5 5
P00441 -0.466488 -0.478895 -0.203688 -0.418119 -0.493223 -0.899398 0.047755 -0.045985 -0.047843 -0.228959 nan nan -0.221865 0.523969 -0.478446 -0.697790 0.388847 -0.145801 -0.023925 nan nan 0.030159 -0.379242 nan 5 5
O15031 -0.384689 0.370156 0.932788 0.545446 0.539778 -0.497014 0.392387 0.288900 0.278546 0.565502 nan nan 0.144879 -0.730060 -0.264862 -0.348062 -0.586222 -0.121316 -0.856673 nan nan -0.296688 -0.296688 nan 5 5
D6R956;P09936 0.296679 0.082040 -0.121536 -0.551658 0.426659 0.508404 -0.259386 -0.664319 0.151812 0.024035 nan nan 0.313797 0.988775 -0.119808 -0.402161 -0.421491 -0.212790 -0.381253 nan nan -0.514141 0.033000 nan 5 5
O75339 0.036295 -0.334104 -0.371637 0.082569 -0.118791 0.107463 -0.383440 -0.134371 -0.662474 0.284821 nan nan -0.707550 -0.840662 -0.475700 -0.809598 0.606567 -0.206109 0.237399 nan nan 0.211736 0.029954 nan 5 5
B1AJZ9;B1AJZ9-4;H0YE38;Q5JYW6 -0.213396 -0.337185 -0.357482 -0.655514 0.175020 0.199747 -0.355725 -0.358839 -0.320693 0.330414 nan nan -0.420475 0.293436 -0.515651 -0.659479 0.262498 0.428130 0.261002 nan nan 0.163593 0.301970 nan 5 5
A0A0C4DGV8;Q13214;Q13214-2 0.215023 0.094880 -0.709383 -0.952860 -0.544567 0.810912 -0.317076 -0.315165 -0.648660 -0.694453 nan nan -0.483616 -0.695880 0.951239 0.908215 -0.480898 -0.051800 -0.756235 nan nan -0.698798 -0.910792 nan 5 5
P10124 -0.597027 -0.708992 -0.103501 -0.186782 -0.109721 -0.758455 0.571509 -0.252024 0.436283 -0.628340 nan nan -0.573988 0.704764 -0.483333 -0.479793 -0.397109 -0.261585 -0.062417 nan nan 0.414290 0.578378 nan 5 5
O95428;O95428-5;O95428-6 0.217023 0.857456 0.772269 0.844925 -0.308261 0.724032 -0.014259 0.198640 0.518770 -0.114105 nan nan 0.703897 -0.400925 -0.976420 -0.560374 -0.669866 0.386368 -0.554906 nan nan 0.320271 0.023365 nan 5 5
G3V2U7;P07311 0.478113 -0.111151 -0.247010 0.277132 0.411177 0.509682 0.126790 -0.075889 -0.045458 0.298978 nan nan -0.201011 -0.592220 -0.434263 -0.349618 -0.195583 0.286177 0.604054 nan nan -0.412639 -0.436945 nan 5 5
P01834 0.145967 -0.172807 -0.429955 -0.196562 -0.672276 0.581034 0.157569 0.333278 0.249827 0.136816 nan nan -0.132585 0.162382 -0.169066 -0.492225 0.622181 0.283761 -0.006954 nan nan -0.157407 -0.157407 nan 5 5
A0A0G2JRQ6 -0.226956 -0.582814 -0.615069 -0.249237 -0.445903 -0.567952 -0.267241 -0.104896 0.030670 -0.906417 nan nan -0.206273 -0.103165 -0.360860 0.551245 0.146337 0.307142 0.204349 nan nan -0.589769 -0.589769 nan 5 5
A0A0C4DH73;P01611 0.243542 -0.372306 0.286388 0.773609 0.639977 0.368465 0.796573 0.344724 0.055662 0.108815 nan nan -0.232335 0.861960 -0.561802 0.517434 0.224014 -0.303031 -0.800644 nan nan -0.232278 -0.232290 nan 5 5
P02533 0.968900 -0.886665 -0.578086 0.859288 -0.521344 0.926599 0.106114 0.085868 0.760846 0.490144 nan nan 0.864174 -0.747286 0.303053 0.460298 -0.580837 0.446403 -0.043552 nan nan 0.207036 -0.545616 nan 5 5
P10644;P10644-2 -0.252574 -0.396850 0.191217 -0.097608 0.327042 0.151096 -0.336136 -0.255617 0.448682 -0.000996 nan nan 0.107935 0.081133 0.383896 -0.528502 -0.746578 0.577309 0.258474 nan nan -0.154795 -0.502540 nan 6 6
P01036;P01037 0.014913 -0.304544 -0.357757 -0.482398 -0.194009 -0.127587 -0.220449 -0.046435 -0.115861 -0.146824 nan nan 0.512862 0.177547 -0.389652 -0.354472 -0.427517 -0.362058 -0.069787 nan nan 0.673582 0.579024 nan 6 6
H3BRQ4;K4DIB9;P50238 -0.093569 -0.280029 -0.448037 -0.057295 -0.412742 -0.602581 -0.083371 -0.277897 0.428824 0.210271 nan nan 0.321101 0.573172 -0.052992 0.460109 -0.380802 0.387204 0.217265 nan nan 0.012052 -0.720358 nan 6 6
Q9UKM7 0.670563 0.709636 -0.231501 0.346869 0.084664 0.825416 0.783222 0.816914 -0.357151 0.624441 nan nan 0.514508 -0.776170 0.449623 0.260096 0.522803 0.870641 0.579020 nan nan -0.038345 -0.304687 nan 6 6
M0R009 -0.339372 -0.101949 -0.536443 -0.584432 -0.138788 -0.128776 -0.356792 -0.247140 -0.037546 -0.001198 nan nan -0.732570 0.418898 -0.580618 0.320436 -0.622980 -0.750481 -0.205751 nan nan 0.635403 -0.182952 nan 6 6
P05451 0.097378 -0.215320 -0.605480 -0.057327 -0.504795 -0.651822 -0.360316 -0.684154 -0.327039 0.306881 nan nan -0.336078 0.247303 -0.538331 -0.470228 -0.462987 -0.412800 -0.446230 nan nan -0.749296 -0.302038 nan 6 6
E9PKE3;P11142 -0.225336 -0.317896 -0.161284 0.346178 0.560079 -0.032004 -0.515610 -0.674428 0.038561 -0.291303 nan nan -0.729284 0.710477 -0.288577 0.003161 -0.657004 -0.507155 -0.176644 nan nan 0.281878 0.415848 nan 6 6
P29401;P29401-2 0.085483 -0.231864 -0.144746 -0.368660 0.023887 0.419549 0.223946 0.365861 0.001412 -0.392136 nan nan 0.108221 0.926914 0.187563 0.224033 -0.460343 -0.281768 0.266682 nan nan 0.300495 0.397517 nan 6 6
Q96S96 -0.324600 0.301961 -0.132681 -0.462902 -0.449058 0.214330 -0.359991 -0.296281 -0.305383 0.059876 nan nan -0.582492 0.479312 -0.157328 -0.153988 -0.688889 -0.086735 -0.062041 nan nan -0.682257 0.760044 nan 7 7
P35443 -0.174408 0.810888 0.320413 -0.252097 0.074517 0.272457 0.404203 0.582749 0.068469 -0.211928 nan nan 0.314822 0.410214 -0.017793 -0.130271 -0.052621 0.039112 -0.034617 nan nan 0.022830 0.275796 nan 7 7
P19835;X6R868 -0.578277 -0.677171 -0.582379 -0.428965 -0.222323 -0.418818 -0.359418 -0.458749 -0.930907 0.408941 nan nan -0.762851 -0.114175 -0.321335 -0.213048 0.158614 -0.001008 0.129263 nan nan -0.780480 0.407170 nan 7 7
C9JKT8;Q9UEW3;Q9UEW3-2 -0.293733 -0.200441 -0.157959 -0.671856 -0.366331 -0.064957 0.252685 0.407657 0.516418 -0.293680 nan nan -0.461590 -0.487179 -0.356302 -0.545248 0.144636 -0.645563 -0.823381 nan nan -0.058753 -0.279069 nan 7 7
Q8IWU5;Q8IWU5-2 -0.218403 -0.030256 0.029297 -0.483524 -0.579805 0.583682 -0.071915 -0.451814 -0.684735 0.392763 nan nan -0.070980 0.099606 0.334826 0.479673 0.165475 -0.096058 0.333029 nan nan 0.632888 -0.013472 nan 7 7
Q9BT88 -0.231560 -0.056324 -0.346425 -0.505983 -0.637818 0.117591 0.248942 0.470496 -0.357877 -0.223776 nan nan 0.096659 -0.441272 -0.078098 0.471098 -0.037161 0.626185 0.854452 nan nan 0.342112 0.354892 nan 7 7
Q5VSG8 0.643434 0.834220 -0.364777 0.006904 -0.131047 0.669369 0.216016 0.672377 -0.081632 0.554620 nan nan -0.342809 -0.136635 0.068333 0.338473 0.606930 0.019817 -0.021636 nan nan 0.814893 -0.335813 nan 7 7
P80108 -0.376648 -0.392078 -0.492384 -0.172435 -0.594320 -0.048533 -0.184693 -0.137669 0.293716 -0.492363 nan nan 0.087362 0.500325 -0.600824 -0.578633 -0.456934 -0.443645 -0.334655 nan nan 0.119112 0.119112 nan 7 7
P35555 0.338198 -0.132109 0.018685 0.222641 -0.239691 0.319731 0.085386 -0.235536 0.008502 0.384573 nan nan -0.315052 -0.191980 0.352677 0.335931 0.448441 0.337658 0.332443 nan nan -0.284628 -0.267129 nan 7 7
P51884 0.582757 -0.222141 0.089606 0.531542 -0.241902 0.741616 -0.331174 -0.291611 -0.063590 -0.760356 nan nan 0.361296 0.870266 0.243111 0.498870 -0.547386 0.197945 0.119629 nan nan -0.230796 -0.230796 nan 7 7
Q9P232 -0.178684 -0.468542 -0.116384 -0.387374 -0.224862 -0.539496 -0.587127 -0.640844 -0.695382 -0.457551 nan nan -0.400998 -0.060443 0.256287 0.419846 -0.083285 0.477733 -0.315372 nan nan -0.766501 -0.447343 nan 8 8
Q5JRA6;Q5JRA6-2 -0.221290 -0.367185 -0.204721 -0.172243 0.098999 -0.034860 0.450879 0.021417 -0.649397 -0.669281 nan nan -0.367023 -0.175767 -0.544434 -0.362956 0.106508 -0.396093 -0.407414 nan nan 0.085440 0.745223 nan 8 8
Q5SRI9 -0.274015 -0.146891 0.361648 -0.087155 0.565725 -0.183637 -0.584189 -0.348800 0.733544 -0.259835 nan nan 0.162788 0.635937 0.592005 0.136408 0.310704 -0.088515 0.023438 nan nan 0.192704 -0.380808 nan 8 8
P12273 0.309929 -0.432105 0.189683 0.125788 -0.610911 0.496765 -0.170046 -0.219819 -0.188677 0.272716 nan nan -0.741473 -0.084614 -0.153439 -0.054931 -0.395294 -0.291898 -0.123473 nan nan -0.347341 -0.122712 nan 8 8
P17677;P17677-2 -0.225142 0.071159 0.236599 -0.046967 -0.168203 0.163885 -0.060209 -0.272913 -0.312226 0.255003 nan nan -0.234374 -0.399265 0.302693 0.676453 0.399573 -0.380524 -0.618895 nan nan 0.169275 -0.057105 nan 8 8
P05556;P05556-2;P05556-3;P05556-4;P05556-5 -0.472444 -0.360306 -0.372721 -0.424667 -0.329727 -0.126084 -0.339333 -0.164482 -0.016348 0.328409 nan nan -0.091371 -0.345106 0.247949 0.184702 -0.038247 -0.061795 -0.122392 nan nan 0.003355 0.029215 nan 8 8
A0A0G2JQD2;A0A0G2JQM0;A0A0G2JRN4;P30711 -0.278053 -0.264740 -0.161204 -0.254147 -0.300375 0.070724 -0.176005 -0.274112 0.078554 -0.628982 nan nan -0.142576 -0.149309 -0.484875 -0.287631 0.011238 -0.051869 -0.421709 nan nan 0.739107 -0.015886 nan 8 8
O95841 0.500995 0.605715 -0.524331 -0.046983 -0.248361 0.603550 0.726898 0.515862 0.107641 -0.307110 nan nan 0.898836 0.695888 -0.099352 0.066238 0.098580 0.491465 -0.171810 nan nan 0.138001 0.018682 nan 8 8
K7EKE8;Q92692;Q92692-2 0.199577 -0.261837 0.200838 0.035418 -0.309902 0.412545 -0.300452 -0.213314 -0.055248 0.203956 nan nan -0.185280 0.311770 -0.443701 -0.533869 -0.141147 -0.385227 -0.522518 nan nan 0.621417 0.523851 nan 8 8
Q9HBT6 -0.262534 0.036659 -0.035827 0.204430 -0.203846 -0.084788 0.099041 -0.130810 0.445839 -0.463107 nan nan 0.711355 0.248565 -0.045300 -0.091698 -0.683572 -0.213494 -0.163545 nan nan 0.106877 -0.188251 nan 8 8
Q9Y653;Q9Y653-2;Q9Y653-3 -0.130134 -0.732237 -0.479137 -0.417296 -0.634158 -0.104019 -0.604021 -0.672811 -0.191744 0.165959 nan nan -0.729686 0.405321 0.286967 0.024990 -0.701411 -0.282547 -0.342304 nan nan 0.406484 -0.099690 nan 8 8
Q13790 -0.521996 0.510737 0.069379 -0.198715 0.108776 0.247643 -0.462789 0.222006 -0.215508 -0.483551 nan nan 0.029153 -0.162340 0.174426 -0.331592 0.756248 0.061342 0.460098 nan nan 0.078235 -0.389478 nan 8 8
P20742 0.272179 0.060691 -0.201165 -0.050237 -0.449373 0.661995 0.042236 -0.224989 0.307233 0.709950 nan nan -0.253728 0.478036 -0.013379 0.034007 -0.555995 0.302084 0.040556 nan nan -0.589304 -0.738412 nan 8 8
A0A087X1V2 -0.070690 0.681782 0.098641 -0.243238 -0.401361 -0.171375 0.057184 0.003097 0.088347 -0.016903 nan nan 0.227076 0.135352 -0.181107 -0.345283 0.335080 0.241761 0.433429 nan nan 0.039602 -0.005971 nan 9 9
A0A0C4DH24 -0.413803 -0.454863 -0.324616 -0.451837 -0.311522 -0.423826 -0.168096 -0.181333 -0.444924 0.152891 nan nan -0.499855 -0.130669 0.293136 0.105732 0.559535 0.189404 -0.041792 nan nan -0.601267 0.147397 nan 9 9
P10745 -0.236641 0.080636 0.191974 0.023283 0.129997 0.002952 0.468128 0.489925 -0.073139 0.328449 nan nan 0.136523 -0.629891 0.597717 0.544391 0.180758 0.161125 -0.203471 nan nan 0.688639 -0.246050 nan 9 9
P08493;P08493-2 0.038081 -0.172212 -0.244163 -0.232150 -0.346266 0.314626 -0.635377 -0.703795 -0.351766 0.150211 nan nan 0.024427 0.548798 -0.406176 -0.246588 0.277692 0.360308 0.540192 nan nan -0.397982 -0.397982 nan 9 9
Q9Y6C2 0.340254 0.663371 0.228032 -0.272410 0.017834 0.066819 0.348677 0.614777 -0.561969 0.551984 nan nan 0.042545 0.020829 0.079803 0.126374 0.035681 -0.186103 -0.382913 nan nan 0.017971 -0.127935 nan 9 9
P50395 0.013608 -0.003113 -0.158719 0.174813 -0.514519 -0.355464 0.327166 0.193138 0.152363 0.368767 nan nan 0.016074 -0.137782 0.147720 0.026398 0.042367 0.045322 0.175788 nan nan -0.756427 -0.524876 nan 9 9
B1AJQ6;Q86Y82 0.036729 0.098219 -0.348749 0.003516 0.424109 -0.115359 0.200578 0.321117 0.511875 0.436190 nan nan 0.363410 0.766044 0.081571 0.042638 -0.751987 0.006824 0.044892 nan nan -0.332403 -0.058363 nan 9 9
B1ALD9;Q15063;Q15063-3;Q15063-5 -0.101006 -0.284982 -0.239486 -0.287475 0.427507 0.113073 -0.240490 -0.113355 -0.369493 0.319290 nan nan 0.425439 -0.098348 0.080040 0.086550 0.290249 0.619472 0.166813 nan nan 0.053837 -0.648727 nan 9 9
B1AJR6;B1AJR9;B1AJS0;O14522 0.530251 -0.491270 -0.251360 0.093327 0.069821 0.061145 -0.445185 -0.353217 0.644977 0.465073 nan nan -0.049114 -0.158982 -0.304727 -0.305222 -0.711988 0.183477 -0.337743 nan nan 0.348627 -0.076654 nan 9 9
O43529 0.020461 -0.233067 0.323616 0.174158 -0.112700 0.108546 -0.013005 -0.117824 0.195129 0.128859 nan nan -0.187278 -0.298086 0.069547 0.283319 0.250738 0.566172 0.493952 nan nan -0.238252 -0.423734 nan 9 9
P40121;P40121-2 -0.202385 -0.247832 0.217522 0.354102 0.459498 -0.238511 0.424692 0.529660 0.206394 0.067428 nan nan -0.529026 0.023147 0.396670 0.185199 -0.004286 -0.359970 -0.224362 nan nan -0.176933 -0.108032 nan 9 9
Q14019 -0.344741 0.258659 0.070937 0.165256 0.202345 -0.498165 -0.725812 -0.693759 0.525948 -0.022084 nan nan 0.367858 0.209953 0.064769 0.401079 -0.696091 0.256334 0.307072 nan nan -0.513253 -0.468898 nan 9 9
E9PL83;P35318 0.357204 0.242888 0.087374 -0.015352 -0.276522 0.434096 -0.054645 -0.457266 0.123687 -0.539022 nan nan 0.167702 -0.263504 -0.347613 -0.174028 -0.079415 -0.082180 -0.217734 nan nan -0.339713 -0.744234 nan 9 9
A0A1W2PQB1;H0Y755;M9MML0;P08637 -0.090323 0.271130 0.598667 -0.253373 0.660014 0.411724 0.460288 0.690784 0.595184 -0.139070 nan nan 0.331327 0.333899 -0.081580 -0.266497 0.412163 0.540145 0.067791 nan nan -0.617375 0.448386 nan 10 10
Q6UWH4;Q6UWH4-2 -0.588718 -0.526603 -0.180183 0.098882 -0.376072 -0.603408 -0.397136 0.151063 -0.079928 -0.371246 nan nan -0.426708 0.291259 -0.578150 -0.418792 -0.606772 0.143652 -0.116874 nan nan -0.126561 -0.059018 nan 10 10
P08670 -0.314275 -0.197812 -0.243993 -0.179390 -0.439239 -0.491445 -0.501234 -0.180491 -0.411473 0.025687 nan nan 0.206552 -0.012673 0.551632 0.469208 0.463497 0.402038 -0.590306 nan nan 0.028298 0.291405 nan 10 10
P00915 0.933351 -0.242240 0.139929 0.883700 0.071421 0.847485 0.215571 0.399271 0.618757 0.705641 nan nan 0.933484 0.155516 0.081495 0.012603 0.182911 0.414364 0.373182 nan nan 0.359490 0.118343 nan 10 10
O76070 -0.432052 0.160705 -0.241133 -0.215355 -0.533848 -0.152224 -0.204646 0.162530 0.469000 0.054400 nan nan -0.278608 0.227182 -0.212317 -0.086282 -0.345334 -0.128095 0.089324 nan nan -0.474129 0.337800 nan 10 10
P01742 0.375201 0.217353 -0.091938 -0.267428 -0.163794 0.396648 0.026400 0.271311 -0.587962 -0.094307 nan nan -0.393033 0.389373 0.676899 0.724786 0.438959 0.594179 0.460979 nan nan -0.194256 0.578374 nan 11 11
P55774 0.103249 -0.170476 0.036400 -0.308916 -0.140466 -0.166995 -0.215141 -0.218578 -0.211760 -0.158074 nan nan -0.181675 0.638468 -0.183622 -0.001697 -0.419327 -0.506454 -0.102026 nan nan -0.584551 -0.551401 nan 11 11
Q9NQS3;Q9NQS3-2;Q9NQS3-3 -0.424592 -0.066023 -0.172660 -0.139485 -0.011962 -0.135858 -0.111027 -0.140219 0.069189 -0.172248 nan nan 0.202371 0.533205 0.035925 0.120460 0.256628 0.092650 0.043700 nan nan 0.156133 -0.174368 nan 11 11
Q9UFP1 0.121246 -0.028452 -0.255185 -0.094306 -0.091207 -0.071982 0.028588 0.208374 0.056005 -0.173611 nan nan -0.123046 -0.156490 -0.266922 -0.405138 -0.384532 0.076316 0.322239 nan nan 0.159000 0.376368 nan 12 12
P53634 0.321988 0.144592 0.027653 -0.130860 -0.396023 0.358270 -0.101786 -0.152078 0.263806 0.352837 nan nan 0.180917 0.127724 0.333731 0.177133 0.141936 -0.016131 0.125836 nan nan -0.353595 0.259023 nan 12 12
A1L4H1 0.186316 -0.576874 -0.318546 0.309188 -0.204797 0.127365 0.072585 -0.116746 -0.221594 0.008497 nan nan -0.326665 -0.110696 0.300334 -0.324173 -0.635177 0.289843 0.385581 nan nan -0.031547 -0.042797 nan 12 12
Q9BUJ0 -0.037313 -0.048522 -0.282450 -0.314651 -0.262387 0.264989 -0.098921 0.027051 -0.265561 -0.119433 nan nan -0.111523 -0.502699 -0.202398 -0.237412 -0.236467 -0.107921 -0.194041 nan nan 0.049800 -0.265784 nan 12 12
Q96RW7;Q96RW7-2 -0.274538 -0.063626 0.093539 -0.301343 -0.039283 0.043721 -0.398478 -0.352505 0.219858 -0.008263 nan nan 0.625088 -0.526270 0.502826 0.258426 0.009098 -0.017822 -0.181509 nan nan 0.378646 0.234101 nan 12 12
B4DYV8;Q8WZ75;Q8WZ75-2;Q8WZ75-3 0.270149 0.286361 0.227855 -0.317889 -0.342710 0.017805 0.130729 0.093670 -0.515780 0.120208 nan nan -0.440024 -0.076757 0.078349 0.291298 -0.111882 0.133283 0.007844 nan nan 0.085688 -0.033982 nan 14 14
Q6PCB0 -0.010818 -0.288217 -0.260301 -0.187322 -0.305239 -0.058480 -0.287276 -0.323757 -0.316441 0.132796 nan nan -0.011942 0.452630 -0.530040 -0.329471 -0.178216 -0.188112 0.425936 nan nan -0.023465 -0.115061 nan 14 14
Q6ZMP0;Q6ZMP0-2 -0.257877 -0.048427 0.205765 0.132715 0.053591 -0.205764 0.066672 -0.028928 -0.151054 -0.094501 nan nan -0.193571 0.148188 0.414953 -0.432438 0.240058 -0.152902 -0.023359 nan nan 0.481989 0.031743 nan 16 16
P11597;P11597-2 0.091263 -0.339740 -0.438873 -0.244961 -0.116961 -0.393768 -0.071420 -0.029940 0.225605 -0.022021 nan nan -0.443310 0.153158 -0.211026 0.052557 -0.173663 0.330721 0.231594 nan nan 0.400242 -0.067354 nan 16 16
Q96AQ6;Q96AQ6-2 0.074929 0.086990 -0.040908 -0.286627 -0.197754 0.267614 0.266461 0.377825 -0.047226 -0.184507 nan nan -0.159338 -0.139034 -0.158511 0.136220 -0.277628 -0.189966 -0.087533 nan nan 0.554486 -0.006715 nan 16 16
A6XMH3;P01236;Q5I0G2 0.143374 -0.576863 -0.394840 -0.263686 -0.067927 -0.233042 0.249922 0.236575 -0.184831 0.126855 nan nan -0.456084 0.069668 0.183538 0.124415 -0.079360 -0.746360 -0.298466 nan nan 0.415223 0.098237 nan 17 17
Q13231;Q13231-3 0.028238 0.205037 0.186973 -0.248124 0.158891 0.284702 -0.144117 -0.108428 0.011950 0.128016 nan nan 0.134116 -0.081188 -0.284982 -0.276036 0.018791 0.135271 0.211449 nan nan -0.001439 0.440431 nan 19 19

Error plot#

Hide code cell source

# Metrics on the test split for the top-N models, columns in top-N order
metrics = pimmslearn.models.Metrics()
test_metrics = pd.DataFrame(
    metrics.add_metrics(
        pred_test[['observed', *TOP_N_ORDER]],
        key='test data',
    )
)[TOP_N_ORDER]
test_metrics
Selected as truth to compare to: observed
BPCA DAE VAE TRKNN RF
MSE 0.455 0.476 0.485 0.500 0.515
MAE 0.432 0.438 0.439 0.458 0.465
N 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000
prop 1.000 1.000 1.000 1.000 1.000

Hide code cell source

# Number of compared values: first distinct entry of the 'N' row
# (NOTE: further distinct values, if any, are ignored).
n_in_comparison = int(test_metrics.loc['N'].drop_duplicates().iloc[0])
n_in_comparison
12600

Hide code cell source

# One-row frame holding the selected metric per model; the row is labelled
# with the name of the feature index.
_to_plot = (
    test_metrics
    .loc[METRIC]
    .to_frame()
    .T
    .set_axis([feature_names.name], axis=0)
)
_to_plot
BPCA DAE VAE TRKNN RF
protein groups 0.432 0.438 0.439 0.458 0.465

Hide code cell source

# Annotation text per model (latent dimension and hidden layers), built from
# the model configurations with `build_text`.
try:
    text = model_configs[["latent_dim", "hidden_layers"]].apply(
        build_text,
        axis=1)
except KeyError:
    logger.warning("No PIMMS models in comparsion. Using empty text")
    # Use the same index as the try-branch (one entry per row of
    # model_configs). Indexing by `model_configs.columns` only worked because
    # the misaligned labels ended up as NaN and were filled below.
    text = pd.Series('', index=model_configs.index)

# Row assignment aligns `text` on the model columns of `_to_plot`; models
# without an entry get NaN, which is replaced by an empty string.
_to_plot.loc["text"] = text
_to_plot = _to_plot.fillna('')
_to_plot
BPCA DAE VAE TRKNN RF
protein groups 0.432 0.438 0.439 0.458 0.465
text LD: 10 HL: 64 LD: 10 HL: 64

Hide code cell source

# Bar plot of the selected metric on the test split, one bar per model
fig, ax = plt.subplots(figsize=(4, 2))  # size of the plot can be adjusted
ax = (
    _to_plot
    .loc[[feature_names.name]]
    .plot
    .bar(
        ax=ax,
        rot=0,
        width=.7,
        color=COLORS_TO_USE,
        ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY}\n({n_in_comparison:,} intensities)",
    )
)
# plotting helpers add annotations to the bars (heights and the "text" row)
ax = pimmslearn.plotting.add_height_to_barplot(ax, size=7)
ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=7)
ax.set_xticklabels([])
fname = args.out_figures / f'2_{group}_performance_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_performance_test.pdf
_images/86847498cc48a54a14d4f932d9c9de1711a587e193de872adb9b41ae7bb57c1c.png

Hide code cell source

# Long format of the plotted data (one row per model), saved as csv next to
# the figure; the data level is appended to the row index.
dumps[fname.stem] = fname.with_suffix('.csv')
_to_plot_long = (
    _to_plot.T
    .rename(columns={feature_names.name: 'metric_value'})
    .assign(**{'data level': feature_names.name})
    .set_index('data level', append=True)
)
_to_plot_long.to_csv(fname.with_suffix('.csv'))

Plot error by median feature intensity#

Hide code cell source

# Errors on the test split for the top-N models, binned by the feature medians
# of the training data. The figure is saved and the binned errors are dumped
# as csv next to it.
pimmslearn.plotting.make_large_descriptors(7)
fig, ax = plt.subplots(figsize=(8, 2))

ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred=pred_test[
        [TARGET_COL] + TOP_N_ORDER
    ],
    feat_medians=data.train_X.median(),
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC,
    # Pass only as many colors as models are plotted: with the full list
    # seaborn warns that the palette has more values than needed and then
    # uses the leading colors anyway.
    palette=COLORS_TO_USE[:len(TOP_N_ORDER)]
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
# font sizes were enlarged for this figure only
pimmslearn.plotting.make_large_descriptors(6)
fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
errors_binned
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:99: UserWarning: The palette list has more values (24) than needed (5), which may not be intended.
  sns.barplot(data=errors,
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:99: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(data=errors,
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_000 A0A075B6P5;P01615 BPCA 0.467 19 912 19\n(N=912)
1 Sample_000 A0A075B6P5;P01615 DAE 0.035 19 912 19\n(N=912)
2 Sample_000 A0A075B6P5;P01615 VAE 0.259 19 912 19\n(N=912)
3 Sample_000 A0A075B6P5;P01615 TRKNN 0.422 19 912 19\n(N=912)
4 Sample_000 A0A075B6P5;P01615 RF 0.785 19 912 19\n(N=912)
... ... ... ... ... ... ... ...
62,995 Sample_209 Q9UGM5;Q9UGM5-2 BPCA 0.476 16 1,913 16\n(N=1,913)
62,996 Sample_209 Q9UGM5;Q9UGM5-2 DAE 0.507 16 1,913 16\n(N=1,913)
62,997 Sample_209 Q9UGM5;Q9UGM5-2 VAE 0.511 16 1,913 16\n(N=1,913)
62,998 Sample_209 Q9UGM5;Q9UGM5-2 TRKNN 0.442 16 1,913 16\n(N=1,913)
62,999 Sample_209 Q9UGM5;Q9UGM5-2 RF 0.281 16 1,913 16\n(N=1,913)

63000 rows × 7 columns

_images/6b87621ec182fa980c490ee8e47d681c3d4b77fc5de252fe7bae7171d19eaba5.png

Hide code cell source

# ! only used for reporting
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC
)
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=63) BPCA 0.619 0.504 0.744
1 11\n(N=63) DAE 0.647 0.526 0.772
2 11\n(N=63) RF 0.572 0.444 0.713
3 11\n(N=63) TRKNN 0.589 0.482 0.698
4 11\n(N=63) VAE 0.592 0.478 0.714
... ... ... ... ... ...
85 29\n(N=12) BPCA 0.133 0.065 0.212
86 29\n(N=12) DAE 0.149 0.097 0.210
87 29\n(N=12) RF 0.261 0.184 0.346
88 29\n(N=12) TRKNN 0.230 0.158 0.317
89 29\n(N=12) VAE 0.178 0.097 0.268

90 rows × 5 columns

Hide code cell source

# Errors of the first model in ORDER_MODELS, indexed by the bin label (the
# last column of `errors_binned`) and sorted ascending by the metric.
(
    errors_binned
    .set_index(['model', errors_binned.columns[-1]])
    .loc[ORDER_MODELS[0]]
    .sort_values(METRIC)
)
Sample ID protein groups MAE bin n_obs
intensity binned by median of protein groups
18\n(N=846) Sample_142 P09972 0.000 18 846
15\n(N=2,557) Sample_021 A0A0A0MT66 0.000 15 2,557
14\n(N=2,074) Sample_058 Q16853;Q16853-2 0.000 14 2,074
16\n(N=1,913) Sample_015 B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 0.000 16 1,913
15\n(N=2,557) Sample_079 A6NCT7;Q07092;Q07092-2 0.000 15 2,557
... ... ... ... ... ...
14\n(N=2,074) Sample_011 P11597;P11597-2 5.771 14 2,074
14\n(N=2,074) Sample_184 F8WD41;Q15166 6.195 14 2,074
17\n(N=1,393) Sample_108 P27824;P27824-2 6.482 17 1,393
14\n(N=2,074) Sample_091 F8WD41;Q15166 6.823 14 2,074
14\n(N=2,074) Sample_115 P17050 7.635 14 2,074

12600 rows × 5 columns

Custom model selection#

Hide code cell source

# Performance on the test data for the custom model selection; the whole cell
# is skipped when SEL_MODELS is empty.
if SEL_MODELS:
    metrics = pimmslearn.models.Metrics()
    test_metrics = metrics.add_metrics(
        pred_test[['observed', *SEL_MODELS]], key='test data')
    test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS]
    # A bare expression inside an `if` body is never rendered by the
    # notebook, so intermediate results are displayed explicitly.
    display(test_metrics)

    # NOTE(review): only the first unique value of row 'N' is used -
    # presumably all selected models were scored on the same intensities.
    n_in_comparison = int(test_metrics.loc['N'].unique()[0])
    display(n_in_comparison)

    # single-row frame: metric per model, labelled with the feature level name
    _to_plot = test_metrics.loc[METRIC].to_frame().T
    _to_plot.index = [feature_names.name]
    display(_to_plot)

    # Annotation per model built from latent dimension and hidden layers.
    try:
        text = model_configs[["latent_dim", "hidden_layers"]].apply(
            build_text,
            axis=1)
    except KeyError:
        logger.warning("No PIMMS models in comparsion. Using empty text")
        # The text is aligned against the model columns of `_to_plot`, so the
        # empty fallback has to be indexed by these columns as well.
        text = pd.Series('', index=_to_plot.columns)

    _to_plot.loc["text"] = text
    _to_plot = _to_plot.fillna('')
    display(_to_plot)

    fig, ax = plt.subplots(figsize=(4, 2))
    ax = _to_plot.loc[[feature_names.name]].plot.bar(
        rot=0,
        ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)",
        # title=f'performance on test data (based on {n_in_comparison:,} measurements)',
        color=pimmslearn.plotting.defaults.assign_colors(
            [k.upper() for k in SEL_MODELS]),
        ax=ax,
        width=.7)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    ax = pimmslearn.plotting.add_height_to_barplot(ax, size=5)
    ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5)
    ax.set_xticklabels([])

    fname = args.out_figures / f'2_{group}_performance_test_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(fig, name=fname)

    # long format dump of the plotted values next to the figure
    dumps[fname.stem] = fname.with_suffix('.csv')
    _to_plot_long = _to_plot.T
    _to_plot_long = _to_plot_long.rename(
        {feature_names.name: 'metric_value'}, axis=1)
    _to_plot_long['data level'] = feature_names.name
    _to_plot_long = _to_plot_long.set_index('data level', append=True)
    _to_plot_long.to_csv(fname.with_suffix('.csv'))

Hide code cell source

# custom selection: test errors binned by the median of each feature in the
# training data, restricted to SEL_MODELS (skipped when it is empty)
if SEL_MODELS:
    pimmslearn.plotting.make_large_descriptors(7)
    fig, ax = plt.subplots(figsize=(8, 2))

    ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
        pred=pred_test[
            [TARGET_COL] + SEL_MODELS
        ],
        feat_medians=data.train_X.median(),
        ax=ax,
        metric_name=METRIC,
        feat_name=FEAT_NAME_DISPLAY,
        palette=pimmslearn.plotting.defaults.assign_colors(
            [k.upper() for k in SEL_MODELS])
    )
    # ax.set_ylim(0, 1.5)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    # for text in ax.legend().get_texts():
    #     text.set_fontsize(6)
    fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(ax.get_figure(), name=fname)
    # `pyplot.show` takes no figure argument (only keyword `block`); a
    # positional figure raises a TypeError on regular backends.
    plt.show()

    # keep the data behind the figure as CSV next to it
    dumps[fname.stem] = fname.with_suffix('.csv')
    errors_binned.to_csv(fname.with_suffix('.csv'))
    pimmslearn.plotting.make_large_descriptors(6)
    # ax.xaxis.set_tick_params(rotation=0) # horizontal

    # ! only used for reporting
    plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
        errors=errors_binned,
        feat_name=FEAT_NAME_DISPLAY,
        metric_name=METRIC
    )
    plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
    display(plotted)

Error by non-decimal number of intensity#

  • The number of observations in each bin is given in parentheses.

Hide code cell source

# Plot the test errors of the top-N models per intensity bin, then register
# and save the figure.
fig, ax = plt.subplots(figsize=(8, 2))
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_binned(
    pred_test[[TARGET_COL, *TOP_N_ORDER]],
    metric_name=METRIC,
    palette=TOP_N_COLOR_PALETTE,
    ax=ax,
)
ax.legend(ncols=len(TOP_N_ORDER), loc='best')
fname = args.out_figures / f'2_{group}_test_errors_binned_by_int.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:45: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  ax = sns.barplot(data=errors_binned, ax=ax,
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf
_images/4bf2f5e1031324afa75f518d6e21c5fc11f71cbdf4afe2bda976a5cf2d510039.png

Hide code cell source

# Store the binned errors as CSV next to the figure and preview the first rows.
dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(dumps[fname.stem])
errors_binned.head(5)
Sample ID protein groups model MAE intensity bin
0 Sample_143 P02768 BPCA 0.065 30\n(N=2)
1 Sample_143 P02768 DAE 0.358 30\n(N=2)
2 Sample_143 P02768 VAE 0.414 30\n(N=2)
3 Sample_143 P02768 TRKNN 0.574 30\n(N=2)
4 Sample_143 P02768 RF 0.483 30\n(N=2)

Figures dumped to disk#

Hide code cell source

# Mapping of figure name (file stem) to the path each figure was saved to.
figures
{'2_1_fake_na_val_test_splits': Path('runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png'),
 '2_1_pred_corr_val_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf'),
 '2_1_errors_binned_by_feat_median_val': Path('runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf'),
 '2_1_intensity_binned_top_4_models_test': Path('runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf'),
 '2_1_pred_corr_test_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf'),
 '2_1_pred_corr_test_per_feat': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf'),
 '2_1_performance_test': Path('runs/alzheimer_study/figures/2_1_performance_test.pdf'),
 '2_1_test_errors_binned_by_feat_medians': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf'),
 '2_1_test_errors_binned_by_int': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf')}

Hide code cell source

# Mapping of dump name (file stem) to the path each data file was written to.
# Only the last expression of a cell is rendered, so `dumps` has to be
# displayed explicitly because `print` follows it.
display(dumps)
print("done")
done