Compare models#

  1. Load available configurations

  2. Load validation predictions

    • calculate absolute error

    • select the top N models for plotting, ranked by MAE from smallest (best) to largest (worst); N is user-specified and defaults to 5

    • correlation per sample, correlation per feat, correlation overall

    • MAE plots

  3. Load test data predictions

    • as for validation data

    • top N based on validation data

Hide code cell source

import logging
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from IPython.display import display

import pimmslearn
import pimmslearn.imputation
import pimmslearn.models
import pimmslearn.nb
from pimmslearn.analyzers import compare_predictions
from pimmslearn.io import datasplits
from pimmslearn.models.collect_dumps import collect, select_content

# pandas display options: limit printed rows and column width for notebook output
pd.options.display.max_rows = 30
pd.options.display.min_rows = 10
pd.options.display.max_colwidth = 100

# small default figure size for all matplotlib plots in this notebook
plt.rcParams.update({'figure.figsize': (4, 2)})
pimmslearn.plotting.make_large_descriptors(7)  # set font size for plot descriptors

logger = pimmslearn.logging.setup_nb_logger()
# silence noisy font-discovery DEBUG/INFO messages from matplotlib's fontTools
logging.getLogger('fontTools').setLevel(logging.WARNING)


def load_config_file(fname: Path, first_split='config_') -> "tuple[str, dict]":
    """Load a model configuration YAML file and derive its model key.

    Parameters
    ----------
    fname : Path
        Path to a ``model_config_*.yaml`` file.
    first_split : str, optional
        Split marker handed to ``select_content`` to extract the model key
        from the file stem (e.g. ``model_config_VAE`` -> ``VAE``).

    Returns
    -------
    tuple[str, dict]
        The model key and the parsed YAML content.
        (Fix: the original annotation claimed ``-> dict`` although a
        two-tuple is returned.)
    """
    with open(fname) as f:
        loaded = yaml.safe_load(f)
    # str(...) replaces the redundant f-string wrapper; keeps key a plain str
    key = str(select_content(fname.stem, first_split=first_split))
    return key, loaded


def build_text(s):
    """Build a short annotation string from a model-config row.

    Includes the latent dimension (``LD``) when present and the hidden
    layer sizes (``HL``) when the entry is a sequence; NaN entries are
    skipped.
    """
    parts = []
    latent = s["latent_dim"]
    if not np.isnan(latent):
        parts.append(f'LD: {int(latent)} ')
    hidden = s["hidden_layers"]
    try:
        if len(hidden):
            joined = ",".join(str(layer) for layer in hidden)
            parts.append(f"HL: {joined}")
    except TypeError:
        # NaN (float) has no len() -> no hidden layers configured
        pass
    return ''.join(parts)

Hide code cell source

# catch passed parameters
# catch passed parameters
# Snapshot the names currently in the global namespace; pimmslearn.nb.get_params
# later uses this to identify which globals were injected as parameters.
args = None
args = dict(globals()).keys()

Papermill script parameters:

# files and folders
# Datasplit folder with data for experiment
# Papermill parameter cell: defaults below, injected overrides after '# Parameters'.
# files and folders
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
folder_data: str = ''  # specify data directory if needed
file_format: str = 'csv'  # change default to pickled files
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
models: str = 'Median,CF,DAE,VAE'  # picked models to compare (comma separated)
sel_models: str = ''  # user defined comparison (comma separated)
# Restrict plotting to top N methods for imputation based on error of validation data, maximum 10
plot_to_n: int = 5
feat_name_display: str = None  # display name for feature name in plural (e.g. 'protein groups')
save_agg_pred: bool = False  # save aggregated predictions of validation and test data
# Parameters
fn_rawfile_metadata = "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv"
folder_experiment = "runs/alzheimer_study"
models = "Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO"

Some argument transformations

Hide code cell source

# Collect parameters added to globals() after the earlier snapshot into a dict
# (removing them from the global namespace); bare `args` displays the result.
args = pimmslearn.nb.get_params(args, globals=globals())
args
root - INFO     Removed from global namespace: folder_experiment
root - INFO     Removed from global namespace: folder_data
root - INFO     Removed from global namespace: file_format
root - INFO     Removed from global namespace: fn_rawfile_metadata
root - INFO     Removed from global namespace: models
root - INFO     Removed from global namespace: sel_models
root - INFO     Removed from global namespace: plot_to_n
root - INFO     Removed from global namespace: feat_name_display
root - INFO     Removed from global namespace: save_agg_pred
{'folder_experiment': 'runs/alzheimer_study',
 'folder_data': '',
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'sel_models': '',
 'plot_to_n': 5,
 'feat_name_display': None,
 'save_agg_pred': False}

Hide code cell source

# Convert the parameter dict into a namespace with derived output paths
# (data, figures, models, predictions folders); bare `args` displays it.
args = pimmslearn.nb.args_from_dict(args)
args
{'data': Path('runs/alzheimer_study/data'),
 'feat_name_display': None,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'folder_data': '',
 'folder_experiment': Path('runs/alzheimer_study'),
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'out_figures': Path('runs/alzheimer_study/figures'),
 'out_folder': Path('runs/alzheimer_study'),
 'out_metrics': Path('runs/alzheimer_study'),
 'out_models': Path('runs/alzheimer_study'),
 'out_preds': Path('runs/alzheimer_study/preds'),
 'plot_to_n': 5,
 'save_agg_pred': False,
 'sel_models': ''}

Hide code cell source

# registries of produced artifacts: file stem -> path
figures = {}  # saved figure files
dumps = {}    # saved data/excel dumps

Hide code cell source

TARGET_COL = 'observed'  # column with the observed (true) intensity values
METRIC = 'MAE'  # metric used for ranking models
MIN_FREQ = None  # optional minimum feature frequency filter (unused here)
MODELS_PASSED = args.models.split(',')  # model keys as passed via parameters
MODELS = MODELS_PASSED.copy()  # working copy (may be reordered later)
FEAT_NAME_DISPLAY = args.feat_name_display  # plural display name for features
SEL_MODELS = None  # optional user-defined subset for a second comparison
if args.sel_models:
    SEL_MODELS = args.sel_models.split(',')

Hide code cell source

# list(sns.color_palette().as_hex()) # string representation of colors
if args.plot_to_n > 10:
    logger.warning("Set maximum of models to 10 (maximum)")
    args.overwrite_entry('plot_to_n', 10)

Hide code cell source

# Load train/validation/test splits (train_X, val_y, test_y) from the data folder.
data = datasplits.DataSplits.from_folder(
    args.data, file_format=args.file_format)
pimmslearn.io.datasplits - INFO     Loaded 'train_X' from file: runs/alzheimer_study/data/train_X.csv
pimmslearn.io.datasplits - INFO     Loaded 'val_y' from file: runs/alzheimer_study/data/val_y.csv
pimmslearn.io.datasplits - INFO     Loaded 'test_y' from file: runs/alzheimer_study/data/test_y.csv

Hide code cell source

# Plot counts of simulated missing values per sample for both evaluation splits.
fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)

pimmslearn.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],
                                     title='Validation split', size=1, xlabel='')
pimmslearn.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],
                                     title='Test split', size=1, xlabel='')
fig.suptitle("Simulated missing values per sample", size=8)
# hide axis and use only for common x label
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)
plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')
group = 1  # figure name prefix group for this notebook's outputs
fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'
figures[fname.stem] = fname  # register figure path for later reference
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png
_images/922ed83236f6cfda3316fa943be9844c2e9729f1a7b4a82fc375cb33e5542ee2.png

data completeness across entire data#

Hide code cell source

# load frequency of training features...
# needs to be pickle -> index.name needed
freq_feat = pimmslearn.io.datasplits.load_freq(args.data, file='freq_features.json')
freq_feat.head()  # training data
A0A024QZX5;A0A087X1N8;P35237                                                     197
A0A024R0T9;K7ER74;P02655                                                         208
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8   185
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                          208
A0A075B6H7                                                                        97
Name: freq, dtype: int64

Hide code cell source

# Proportion of samples in which each feature was identified
# (feature count divided by the number of samples in the training split).
prop = freq_feat / len(data.train_X.index.levels[0])
prop.sort_values().to_frame().plot(
    xlabel=f'{data.val_y.index.names[-1]}',
    ylabel='Proportion of identification in samples')
<Axes: xlabel='protein groups', ylabel='Proportion of identification in samples'>
_images/0c818105324a3e5d48d0a6073e9dce2019222361f72da3a20867a93a550de8f3.png

View training data in wide format

Hide code cell source

# Pivot all splits to wide format (rows: samples, columns: features); display train_X.
data.to_wide_format()
data.train_X
protein groups A0A024QZX5;A0A087X1N8;P35237 A0A024R0T9;K7ER74;P02655 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 A0A075B6H7 A0A075B6H9 A0A075B6I0 A0A075B6I1 A0A075B6I6 A0A075B6I9 ... Q9Y653;Q9Y653-2;Q9Y653-3 Q9Y696 Q9Y6C2 Q9Y6N6 Q9Y6N7;Q9Y6N7-2;Q9Y6N7-4 Q9Y6R7 Q9Y6X5 Q9Y6Y8;Q9Y6Y8-2 Q9Y6Y9 S4R3U6
Sample ID
Sample_000 15.912 16.852 15.570 16.481 17.301 20.246 16.764 17.584 16.988 20.054 ... 16.012 15.178 NaN 15.050 16.842 NaN NaN 19.563 NaN 12.805
Sample_001 NaN 16.874 15.519 16.387 NaN 19.941 18.786 17.144 NaN 19.067 ... 15.528 15.576 NaN 14.833 16.597 20.299 15.556 19.386 13.970 12.442
Sample_002 16.111 NaN 15.935 16.416 18.175 19.251 16.832 15.671 17.012 18.569 ... 15.229 14.728 13.757 15.118 17.440 19.598 15.735 20.447 12.636 12.505
Sample_003 16.107 17.032 15.802 16.979 15.963 19.628 17.852 18.877 14.182 18.985 ... 15.495 14.590 14.682 15.140 17.356 19.429 NaN 20.216 NaN 12.445
Sample_004 15.603 15.331 15.375 16.679 NaN 20.450 18.682 17.081 14.140 19.686 ... 14.757 NaN NaN 15.256 17.075 19.582 15.328 NaN 13.145 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 15.682 16.886 14.910 16.482 NaN 17.705 17.039 NaN 16.413 19.102 ... NaN 15.684 14.236 15.415 17.551 17.922 16.340 19.928 12.929 NaN
Sample_206 15.798 17.554 15.600 15.938 NaN 18.154 18.152 16.503 16.860 18.538 ... 15.422 16.106 NaN 15.345 17.084 18.708 NaN 19.433 NaN NaN
Sample_207 15.739 NaN 15.469 16.898 NaN 18.636 17.950 16.321 16.401 18.849 ... 15.808 16.098 14.403 15.715 NaN 18.725 16.138 19.599 13.637 11.174
Sample_208 15.477 16.779 14.995 16.132 NaN 14.908 NaN NaN 16.119 18.368 ... 15.157 16.712 NaN 14.640 16.533 19.411 15.807 19.545 NaN NaN
Sample_209 NaN 17.261 15.175 16.235 NaN 17.893 17.744 16.371 15.780 18.806 ... 15.237 15.652 15.211 14.205 16.749 19.275 15.732 19.577 11.042 11.791

210 rows × 1421 columns

Number of samples and features:

Hide code cell source

# Number of samples (rows) and features (columns) in the wide training data.
N_SAMPLES, M_FEAT = data.train_X.shape
print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}")
N samples: 210, M features: 1421

Collect outputs in excel file:

Hide code cell source

# Excel file collecting all performance summary sheets of this notebook.
# NOTE(review): the writer stays open here; presumably it is closed/saved
# further down in the notebook — confirm outside this view.
fname = args.folder_experiment / '01_2_performance_summary.xlsx'
dumps[fname.stem] = fname
writer = pd.ExcelWriter(fname)
print(f"Saving to: {fname}")
Saving to: runs/alzheimer_study/01_2_performance_summary.xlsx

Model specifications#

  • used for bar plot annotations

Hide code cell source

# model_key could be used as key from config file
# ? load only specified configs?
# ? case: no config file available?
# model_key could be used as key from config file
# ? load only specified configs?
# ? case: no config file available?
# Collect every 'model_config*.yaml' in the models folder into a list of dicts.
all_configs = collect(
    paths=(fname for fname in args.out_models.iterdir()
           if fname.suffix == '.yaml'
           and 'model_config' in fname.name),
    load_fn=load_config_file
)
# One row per model (indexed by model id); transpose for display/export.
model_configs = pd.DataFrame(all_configs).set_index('id')
model_configs.T.to_excel(writer, sheet_name='model_params')
model_configs.T
id CF VAE KNN5 KNN DAE Median
M 1421 1421 1421 1421 1421 1421
batch_size 1,024.000 64.000 64.000 64.000 64.000 NaN
cuda False False True True False NaN
data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data
epoch_trained 13.000 142.000 NaN NaN 118.000 NaN
epochs_max 100.000 300.000 50.000 50.000 300.000 NaN
file_format csv csv csv csv csv csv
fn_rawfile_metadata https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv
folder_data NaN
folder_experiment runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
latent_dim 50.000 10.000 NaN NaN 10.000 NaN
model CF VAE KNN KNN DAE Median
model_key CF VAE KNN5 KNN DAE Median
n_params 83283 277998 1 1 184983 1421
out_figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures
out_folder runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_metrics runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_models runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds
patience 1.000 50.000 NaN NaN 25.000 NaN
sample_idx_position 0 0 0 0 0 0
save_pred_real_na True True True True True True
hidden_layers NaN [64] NaN NaN [64] NaN
meta_cat_col NaN NaN NaN NaN NaN NaN
meta_date_col NaN NaN NaN NaN NaN NaN
force_train NaN NaN True True NaN NaN
neighbors NaN NaN 5.000 3.000 NaN NaN
pred_test_Median NaN NaN NaN NaN NaN runs/alzheimer_study/preds/pred_test_Median.csv
pred_val_Median NaN NaN NaN NaN NaN runs/alzheimer_study/preds/pred_val_Median.csv

Set Feature name (columns are features, rows are samples)

Hide code cell source

# index name
# index name: align the feature-frequency index name with the feature axis name
freq_feat.index.name = data.train_X.columns.name
# sample index name
sample_index_name = data.train_X.index.name

Load predictions on validation and test data split#

Validation data#

  • set top N models to plot based on validation data split

Hide code cell source

# Load all models' predictions on the validation split into one DataFrame;
# TARGET_COL ('observed') is the shared ground-truth column.
pred_val = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='val',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL])
SAMPLE_ID, FEAT_NAME = pred_val.index.names
if not FEAT_NAME_DISPLAY:
    # fall back to the index level name when no display name was provided
    FEAT_NAME_DISPLAY = FEAT_NAME
pred_val[MODELS]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 15.752 15.784 15.599 15.728 15.427 15.449 15.469 16.800 NaN 58.276 ... 7.068 12.315 2,513.638 12.012 15.001 15.618 15.752 17.206 15.700 0
Sample_050 Q9Y287 17.221 16.474 16.924 16.880 17.776 17.314 16.453 17.288 NaN 16.993 ... 7.068 11.560 19.829 13.043 15.995 16.980 17.221 17.807 16.738 0
Sample_107 Q8N475;Q8N475-2 14.846 13.949 14.335 14.722 14.150 14.355 13.110 17.187 NaN -78.084 ... 7.068 12.731 2,582.130 12.844 13.515 14.380 14.846 17.434 13.776 0
Sample_199 P06307 18.973 19.860 19.050 19.024 19.247 19.385 19.639 16.711 NaN 102.283 ... 7.068 12.144 2,483.120 12.189 16.883 19.252 18.973 17.111 19.015 0
Sample_067 Q5VUB5 14.726 15.420 14.976 15.009 15.232 15.040 15.465 16.743 NaN -36.470 ... 7.068 11.606 2,569.564 12.239 12.405 14.716 14.726 17.031 14.699 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.918 23.039 22.964 22.927 22.884 22.899 22.994 17.042 NaN 104.484 ... 7.068 12.559 2,634.108 14.469 22.109 22.933 22.918 17.330 22.872 0
Sample_002 A0A0A0MT36 15.877 16.229 15.849 16.110 16.857 16.142 15.882 16.792 NaN -18.408 ... 7.068 12.953 2,448.503 12.545 13.671 16.043 15.877 16.879 15.671 0
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 16.278 15.264 15.764 15.718 15.840 15.574 15.406 17.032 NaN -27.128 ... 7.068 12.369 2,487.550 12.296 14.717 15.668 16.278 17.215 15.574 0
Sample_182 Q8NFT8 13.995 14.600 13.944 13.400 13.685 13.480 14.322 16.764 NaN -12.434 ... 7.068 12.973 2,426.191 12.358 11.397 13.277 13.995 17.125 14.518 0
Sample_123 Q16853;Q16853-2 14.849 14.734 14.422 14.481 14.612 14.627 14.582 16.686 NaN 78.799 ... 7.068 12.758 2,461.806 11.921 13.617 14.719 14.849 16.981 14.485 0

12600 rows × 24 columns

Describe absolute error

Hide code cell source

# Signed error per prediction: model prediction minus observed value,
# for every model column (rows: (sample, feature) pairs).
errors_val = (pred_val
              .drop(TARGET_COL, axis=1)
              .sub(pred_val[TARGET_COL], axis=0)
              [MODELS])
errors_val  # over all samples and all features
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 1.122 1.154 0.969 1.097 0.797 0.819 0.839 2.169 NaN 43.645 ... -7.562 -2.315 2,499.008 -2.619 0.371 0.988 1.122 2.575 1.070 -14.630
Sample_050 Q9Y287 1.466 0.719 1.169 1.125 2.021 1.559 0.698 1.533 NaN 1.238 ... -8.687 -4.195 4.074 -2.711 0.240 1.225 1.466 2.052 0.983 -15.755
Sample_107 Q8N475;Q8N475-2 -0.183 -1.081 -0.694 -0.307 -0.880 -0.674 -1.919 2.157 NaN -93.113 ... -7.961 -2.299 2,567.100 -2.185 -1.514 -0.650 -0.183 2.405 -1.253 -15.029
Sample_199 P06307 -0.403 0.484 -0.325 -0.352 -0.129 0.009 0.263 -2.665 NaN 82.907 ... -12.308 -7.232 2,463.744 -7.187 -2.492 -0.124 -0.403 -2.265 -0.360 -19.376
Sample_067 Q5VUB5 -0.583 0.111 -0.333 -0.300 -0.077 -0.269 0.156 1.434 NaN -51.779 ... -8.241 -3.703 2,554.255 -3.069 -2.903 -0.593 -0.583 1.723 -0.610 -15.309
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 0.096 0.217 0.142 0.105 0.062 0.077 0.171 -5.781 NaN 81.662 ... -15.754 -10.263 2,611.285 -8.353 -0.714 0.111 0.096 -5.493 0.050 -22.822
Sample_002 A0A0A0MT36 -2.288 -1.936 -2.316 -2.055 -1.308 -2.023 -2.283 -1.373 NaN -36.573 ... -11.097 -5.212 2,430.338 -5.621 -4.494 -2.122 -2.288 -1.286 -2.494 -18.165
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 0.753 -0.261 0.239 0.193 0.314 0.049 -0.120 1.507 NaN -42.653 ... -8.457 -3.157 2,472.025 -3.229 -0.808 0.142 0.753 1.690 0.049 -15.525
Sample_182 Q8NFT8 -0.383 0.222 -0.435 -0.978 -0.694 -0.899 -0.057 2.385 NaN -26.813 ... -7.311 -1.406 2,411.812 -2.021 -2.981 -1.101 -0.383 2.746 0.139 -14.379
Sample_123 Q16853;Q16853-2 0.345 0.230 -0.083 -0.023 0.108 0.123 0.077 2.181 NaN 64.295 ... -7.436 -1.747 2,447.302 -2.583 -0.887 0.215 0.345 2.477 -0.019 -14.504

12600 rows × 24 columns

Select top N for plotting and set colors#

Hide code cell source

# Rank models by overall MAE on the validation split (smallest = best first).
ORDER_MODELS = (errors_val
                .abs()
                .mean()
                .sort_values()
                .index
                .to_list())
ORDER_MODELS
['BPCA',
 'VAE',
 'DAE',
 'TRKNN',
 'RF',
 'CF',
 'KNN5',
 'KNN',
 'KNN_IMPUTE',
 'IRM',
 'ROWMEDIAN',
 'Median',
 'LLS',
 'QRILC',
 'COLMEDIAN',
 'SVDMETHOD',
 'PI',
 'MINDET',
 'MINPROB',
 'MINIMUM',
 'ZERO',
 'IMPSEQROB',
 'MLE',
 'IMPSEQ']

Hide code cell source

# Reorder columns by model ranking; optionally dump aggregated predictions.
pred_val = pred_val[[TARGET_COL] + ORDER_MODELS]
if args.save_agg_pred:
    fname = args.folder_experiment / '01_2_agg_pred_val.csv'
    dumps[fname.stem] = fname
    pred_val.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_val
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 14.630 15.469 15.728 15.599 15.700 15.618 15.784 15.449 15.427 15.937 ... 16.800 17.206 12.012 11.916 12.315 7.068 0 58.276 2,513.638 NaN
Sample_050 Q9Y287 15.755 16.453 16.880 16.924 16.738 16.980 16.474 17.314 17.776 16.961 ... 17.288 17.807 13.043 12.900 11.560 7.068 0 16.993 19.829 NaN
Sample_107 Q8N475;Q8N475-2 15.029 13.110 14.722 14.335 13.776 14.380 13.949 14.355 14.150 15.437 ... 17.187 17.434 12.844 12.313 12.731 7.068 0 -78.084 2,582.130 NaN
Sample_199 P06307 19.376 19.639 19.024 19.050 19.015 19.252 19.860 19.385 19.247 18.861 ... 16.711 17.111 12.189 12.285 12.144 7.068 0 102.283 2,483.120 NaN
Sample_067 Q5VUB5 15.309 15.465 15.009 14.976 14.699 14.716 15.420 15.040 15.232 15.079 ... 16.743 17.031 12.239 11.827 11.606 7.068 0 -36.470 2,569.564 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.822 22.994 22.927 22.964 22.872 22.933 23.039 22.899 22.884 22.837 ... 17.042 17.330 14.469 12.161 12.559 7.068 0 104.484 2,634.108 NaN
Sample_002 A0A0A0MT36 18.165 15.882 16.110 15.849 15.671 16.043 16.229 16.142 16.857 15.446 ... 16.792 16.879 12.545 12.586 12.953 7.068 0 -18.408 2,448.503 NaN
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 15.525 15.406 15.718 15.764 15.574 15.668 15.264 15.574 15.840 15.995 ... 17.032 17.215 12.296 12.352 12.369 7.068 0 -27.128 2,487.550 NaN
Sample_182 Q8NFT8 14.379 14.322 13.400 13.944 14.518 13.277 14.600 13.480 13.685 14.675 ... 16.764 17.125 12.358 12.504 12.973 7.068 0 -12.434 2,426.191 NaN
Sample_123 Q16853;Q16853-2 14.504 14.582 14.481 14.422 14.485 14.719 14.734 14.627 14.612 14.824 ... 16.686 16.981 11.921 12.689 12.758 7.068 0 78.799 2,461.806 NaN

12600 rows × 25 columns

Hide code cell source

# Summary statistics of the absolute validation errors, columns in ranked order.
mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS]
mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f')
mae_stats_ordered_val.T
count mean std min 25% 50% 75% max
BPCA 12,600.000 0.422 0.501 0.000 0.119 0.269 0.534 9.370
VAE 12,600.000 0.430 0.519 0.000 0.118 0.275 0.542 9.295
DAE 12,600.000 0.435 0.523 0.000 0.121 0.275 0.547 9.534
TRKNN 12,600.000 0.450 0.516 0.000 0.132 0.295 0.569 7.975
RF 12,600.000 0.462 0.532 0.000 0.135 0.306 0.588 8.982
CF 12,600.000 0.466 0.507 0.000 0.145 0.319 0.609 5.716
KNN5 12,600.000 0.467 0.546 0.000 0.135 0.305 0.594 10.231
KNN 12,600.000 0.481 0.565 0.000 0.138 0.310 0.618 10.502
KNN_IMPUTE 12,600.000 0.554 0.668 0.000 0.164 0.359 0.692 7.550
IRM 12,600.000 0.588 0.637 0.000 0.176 0.396 0.767 7.953
ROWMEDIAN 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
Median 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
LLS 12,600.000 1.329 54.974 0.000 0.151 0.343 0.662 4,842.571
QRILC 12,600.000 1.640 1.272 0.000 0.828 1.345 2.082 15.305
COLMEDIAN 12,600.000 2.210 1.634 0.000 0.947 1.972 3.094 12.944
SVDMETHOD 12,600.000 2.309 1.635 0.000 1.027 2.091 3.251 12.624
PI 12,600.000 3.812 2.646 0.000 1.774 3.356 5.370 16.736
MINDET 12,600.000 4.108 2.650 0.001 2.089 3.678 5.665 17.920
MINPROB 12,600.000 4.133 2.692 0.001 2.106 3.713 5.736 17.937
MINIMUM 12,600.000 9.272 2.717 0.373 7.327 8.890 10.863 22.773
ZERO 12,600.000 16.340 2.717 6.695 14.395 15.958 17.931 29.841
IMPSEQROB 12,600.000 333.478 793.700 0.002 12.282 33.864 87.298 2,869.299
MLE 12,600.000 2,172.384 865.925 0.009 2,435.415 2,495.362 2,552.718 2,873.681
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Some models have fixed colors; others are assigned randomly

Note

  1. The order of “new” models is important for the color assignment.

  2. User defined model keys for the same model with two configuration will yield different colors.

Hide code cell source

# Assign plot colors per model (keys upper-cased for lookup) and preview them.
COLORS_TO_USE = pimmslearn.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS))
pimmslearn.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE)
pimmslearn.plotting.defaults - INFO     Reused some colors!
BPCAVAEDAETRKNNRFCFKNN5KNNKNN_IMPUTEIRMROWMEDIANMedianLLSQRILCCOLMEDIANSVDMETHODPIMINDETMINPROBMINIMUMZEROIMPSEQROBMLEIMPSEQ

Hide code cell source

# Best plot_to_n models (by validation MAE) and their color palette.
TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n]
TOP_N_COLOR_PALETTE = {model: color for model,
                       color in zip(TOP_N_ORDER, COLORS_TO_USE)}
TOP_N_ORDER
['BPCA', 'VAE', 'DAE', 'TRKNN', 'RF']

Correlation per sample#

Hide code cell source

# Per-sample Pearson correlation between observed values and each model's
# predictions on the validation split (one row per sample, one column per model).
corr_per_sample_val = (pred_val
                       .groupby(sample_index_name)
                       .apply(
                           lambda df: df.corr().loc[TARGET_COL]
                       )[ORDER_MODELS])

# Lower y-limit rounded down to one decimal of the worst correlation.
min_corr = int(corr_per_sample_val.min().min() * 10) / 10
kwargs = dict(ylim=(min_corr, 1), rot=90,
              #     boxprops=dict(linewidth=1.5),
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model pred. per sample on validation data',
              ylabel='correlation per sample')
ax = corr_per_sample_val[TOP_N_ORDER].plot.box(**kwargs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

# Export summary, full correlations, and the plotted subset to one Excel file.
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.xlsx'
dumps[fname.stem] = fname
with pd.ExcelWriter(fname) as w:
    corr_per_sample_val.describe().to_excel(w, sheet_name='summary')
    corr_per_sample_val.to_excel(w, sheet_name='correlations')
    corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf
_images/21a0df5f6c20693fa09760977a57be039cba4d661f8cf003e25a6021d814d066.png

Identify samples which fall below the lower whisker for at least one of the top models

Hide code cell source

# Samples whose correlation falls below the lowest box-plot whisker of any
# top-N model are highlighted (row minimum marked); fixes the misspelled
# local variable name 'treshold' -> 'threshold'.
threshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_val[TOP_N_ORDER]).min()
mask = (corr_per_sample_val[TOP_N_ORDER] < threshold).any(axis=1)
corr_per_sample_val.loc[mask].style.highlight_min(
    axis=1) if mask.sum() else 'Nothing to display'
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID                                                
Sample_018 0.968582 0.943402 0.925223 0.953573 0.914824 0.956620 0.925531 0.938822 0.939383 0.952858 0.908717 0.908717 0.936909 0.897323 nan 0.161907 0.073926 nan -0.183010 nan nan 0.375658 nan nan
Sample_054 0.932254 0.933119 0.927597 0.910271 0.917592 0.902884 0.936915 0.925876 0.905679 0.913765 0.915748 0.915748 0.929264 0.845077 nan 0.190649 -0.092560 nan 0.140313 nan nan 0.836204 nan nan
Sample_071 0.887866 0.904119 0.900719 0.888162 0.894288 0.900513 0.901240 0.895286 0.880453 0.865003 0.885806 0.885806 0.899799 0.871241 nan 0.178303 -0.063628 nan 0.090107 nan nan 0.364226 nan nan
Sample_073 0.930349 0.917367 0.925892 0.919876 0.907405 0.915267 0.933555 0.950641 0.916774 0.901773 0.900178 0.900178 0.909057 0.877483 nan -0.017963 0.070423 nan -0.065587 nan nan 0.356937 nan nan
Sample_095 0.940942 0.920757 0.915858 0.927289 0.923759 0.935673 0.924950 0.930902 0.909714 0.913905 0.878167 0.878167 0.917350 0.752341 nan -0.120269 0.006888 nan 0.036943 nan nan 0.419195 nan nan
Sample_133 0.919483 0.931032 0.943091 0.928251 0.922222 0.909428 0.903483 0.903370 0.885348 0.878925 0.899233 0.899233 0.881238 0.827577 nan 0.219841 0.143079 nan -0.148442 nan nan 0.409126 nan nan
Sample_139 0.927681 0.932534 0.932437 0.957367 0.916318 0.904036 0.912868 0.901552 0.878475 0.891290 0.907333 0.907333 0.928867 0.853311 nan 0.156894 0.184833 nan -0.045617 nan nan 0.554137 nan nan
Sample_150 0.950334 0.904851 0.892668 0.945063 0.920318 0.948159 0.885565 0.868275 0.930981 0.907849 0.892997 0.892997 0.940619 0.796418 nan 0.166841 0.053029 nan -0.201771 nan nan 0.335988 nan nan
Sample_171 0.924707 0.902190 0.923772 0.916959 0.900950 0.917056 0.902581 0.906699 0.884571 0.881019 0.875433 0.875433 -0.090619 0.894430 nan -0.004823 -0.030088 nan 0.138185 nan nan 0.302006 nan nan
Sample_173 0.916627 0.946196 0.958204 0.932711 0.947399 0.959869 0.939783 0.940274 0.918589 0.916299 0.925428 0.925428 0.926916 0.972932 nan 0.059663 0.047900 nan -0.039212 nan nan 0.334436 nan nan
Sample_174 0.970316 0.878662 0.872150 0.967356 0.883676 0.956711 0.854645 0.846532 0.920737 0.920759 0.887409 0.887409 0.972096 0.810778 nan 0.306279 0.142238 nan 0.107394 nan nan 0.357612 nan nan
Sample_181 0.964199 0.949908 0.948941 0.945214 0.897419 0.964297 0.940761 0.936048 0.909140 0.899569 0.861266 0.861266 0.893600 0.636031 nan -0.013702 -0.199989 nan -0.004579 nan nan 0.530054 -0.103146 nan
Sample_198 0.914339 0.952824 0.953688 0.932612 0.952520 0.943925 0.955742 0.947627 0.936142 0.946119 0.956493 0.956493 0.924497 0.935035 nan 0.097862 -0.111432 nan -0.043340 nan nan 0.481999 nan nan

Error plot#

Hide code cell source

# Show (sample, feature) pairs where any model's absolute error exceeds 4.5.
c_error_min = 4.5
mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1)
errors_val.loc[mask].sort_index(level=1).head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_012 A0A024QZX5;A0A087X1N8;P35237 -0.324 -0.125 -0.276 -0.317 -0.246 -0.416 -0.140 0.856 NaN -53.608 ... -8.881 -4.247 0.632 -3.020 -1.199 -0.302 -0.324 1.241 -0.318 -15.949
Sample_017 A0A024QZX5;A0A087X1N8;P35237 0.347 0.516 0.352 0.255 -0.093 -0.022 0.251 1.658 NaN 26.724 ... -8.211 -2.676 1.228 -2.276 -0.580 0.245 0.347 2.214 0.305 -15.279
Sample_050 A0A024QZX5;A0A087X1N8;P35237 0.544 -0.058 0.150 0.190 0.024 -0.102 0.178 2.207 NaN 0.348 ... -8.013 -2.686 3.294 -2.168 -0.114 0.129 0.544 2.691 0.238 -15.081
Sample_102 A0A024QZX5;A0A087X1N8;P35237 -0.029 -0.134 -0.120 -0.084 0.030 0.067 -0.107 0.942 NaN 19.277 ... -8.586 -5.087 1.609 -2.899 -0.701 -0.099 -0.029 1.168 -0.065 -15.654
Sample_109 A0A024QZX5;A0A087X1N8;P35237 0.343 -0.177 -0.050 0.069 -0.179 -0.004 -0.263 1.518 NaN -28.795 ... -8.215 -2.765 -2.077 -2.560 -0.538 -0.128 0.343 1.968 -0.012 -15.283

5 rows × 24 columns

Hide code cell source

# Aggregate to per-feature mean absolute error and attach each feature's
# identification frequency; note this rebinds errors_val (rows become features).
errors_val = errors_val.abs().groupby(
    freq_feat.index.name).mean()  # absolute error
errors_val = errors_val.join(freq_feat)
errors_val = errors_val.sort_values(by=freq_feat.name, ascending=True)
errors_val.head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
Q9Y281;Q9Y281-3 0.415 0.216 0.260 0.344 0.325 0.285 0.266 4.078 NaN 10.519 ... 0.525 2,473.194 0.660 1.602 0.352 0.415 4.472 0.307 12.573 52
K7EPJ5;O60291;O60291-2;O60291-3;O60291-4 0.331 0.405 0.461 0.378 0.281 0.385 0.387 3.029 NaN 13.344 ... 1.364 2,525.930 0.842 2.462 0.356 0.331 3.452 0.257 13.857 52
B1AJQ6;Q86Y82 1.082 0.832 0.530 0.296 0.482 1.226 0.674 3.367 NaN 5.026 ... 1.064 2,441.128 0.219 5.397 0.699 1.082 3.926 0.900 13.380 52
P69892 0.872 1.457 1.905 1.001 1.734 1.286 1.360 1.980 NaN 18.023 ... 2.873 2,532.051 1.534 7.127 1.286 0.872 2.320 0.966 14.768 53
A2RU67 0.689 0.418 0.442 0.502 0.503 0.462 0.539 4.495 NaN 15.116 ... 1.079 1,998.072 0.669 1.608 0.507 0.689 4.870 0.462 12.437 53

5 rows × 25 columns

Hide code cell source

errors_val.describe()[ORDER_MODELS].T  # mean of means: summary of per-feature MAE per model
count mean std min 25% 50% 75% max
BPCA 1,419.000 0.408 0.306 0.017 0.222 0.320 0.494 4.195
VAE 1,419.000 0.419 0.316 0.043 0.228 0.331 0.487 3.651
DAE 1,419.000 0.423 0.328 0.057 0.227 0.330 0.500 3.561
TRKNN 1,419.000 0.437 0.309 0.000 0.241 0.349 0.526 3.647
RF 1,419.000 0.450 0.324 0.029 0.250 0.359 0.534 3.878
CF 1,419.000 0.455 0.301 0.041 0.267 0.376 0.548 3.221
KNN5 1,419.000 0.455 0.322 0.039 0.256 0.369 0.540 3.634
KNN 1,419.000 0.468 0.333 0.012 0.267 0.375 0.549 3.693
KNN_IMPUTE 1,419.000 0.531 0.378 0.063 0.296 0.424 0.636 3.430
IRM 1,419.000 0.555 0.372 0.030 0.311 0.449 0.674 3.476
ROWMEDIAN 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
Median 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
LLS 1,419.000 1.088 19.029 0.023 0.279 0.408 0.596 706.018
QRILC 1,419.000 1.607 0.894 0.141 1.022 1.359 1.903 7.838
COLMEDIAN 1,419.000 2.071 1.509 0.038 0.916 1.738 2.812 12.631
SVDMETHOD 1,419.000 2.136 1.467 0.149 0.976 1.893 2.905 12.211
PI 1,419.000 4.107 2.445 0.219 2.264 3.654 5.524 16.085
MINDET 1,419.000 4.438 2.493 0.374 2.622 4.032 5.828 17.100
MINPROB 1,419.000 4.455 2.493 0.424 2.623 4.001 5.874 16.956
MINIMUM 1,419.000 9.620 2.542 3.842 7.854 9.231 11.051 22.371
ZERO 1,419.000 16.688 2.542 10.910 14.922 16.299 18.119 29.439
IMPSEQROB 1,419.000 443.657 892.834 0.830 23.477 43.842 100.325 2,633.136
MLE 1,419.000 2,171.007 331.079 1.453 1,992.846 2,214.845 2,487.619 2,683.431
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Hide code cell source

# Inspect features whose average error reaches 2 for any of the top-N models.
c_avg_error = 2
high_error = errors_val[TOP_N_ORDER].ge(c_avg_error).any(axis=1)
errors_val.loc[high_error]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
O60512 2.233 1.954 2.265 1.771 2.293 2.209 1.989 5.769 NaN 8.471 ... 1.555 2,128.612 2.264 4.578 1.949 2.233 6.098 2.559 10.910 58
Q99538 2.502 2.607 2.535 2.363 2.711 2.517 2.464 2.615 NaN 8.517 ... 2.401 1,769.534 2.164 2.583 2.624 2.502 2.767 2.399 14.984 107
P02100 2.192 1.729 2.844 2.390 2.283 2.509 1.033 1.996 NaN 14.829 ... 4.558 2,512.438 4.311 5.130 2.656 2.192 2.106 1.856 16.373 127
A0A0G2JRN3 3.053 3.221 3.561 3.651 3.693 3.634 4.195 3.998 NaN 71.992 ... 7.086 1,986.331 7.159 7.838 3.878 3.053 3.976 3.647 19.496 128
P01817 2.254 2.327 2.037 1.958 1.963 2.117 2.385 2.736 NaN 10.059 ... 2.680 2,369.250 2.165 3.385 2.041 2.254 3.104 2.039 14.053 133
Q15375;Q15375-4 4.171 1.591 1.651 1.576 1.608 1.331 1.981 3.754 NaN 16.223 ... 6.872 2,285.221 6.470 4.640 1.986 4.171 3.566 2.065 19.101 163
P68871 2.331 1.250 2.445 2.361 1.616 1.638 0.571 1.720 NaN 23.608 ... 4.208 2,237.073 3.759 4.146 2.252 2.331 2.014 0.854 16.378 168
P69905 2.793 1.621 2.771 2.862 2.936 2.820 1.032 2.807 NaN 94.049 ... 6.130 1,992.771 5.671 6.119 2.448 2.793 2.626 1.016 18.200 190
P35527 2.216 1.201 2.310 2.235 2.064 2.156 1.295 2.273 NaN 96.343 ... 5.128 2,335.097 4.405 5.129 1.896 2.216 2.403 1.169 17.045 195
P15509;P15509-2;P15509-3;P15509-5;P15509-7;P15509-8 2.252 1.480 1.772 1.497 1.218 1.374 1.336 3.397 NaN 48.350 ... 6.331 1,276.662 6.404 4.076 1.879 2.252 3.146 2.437 18.354 201

10 rows × 25 columns

Error by integer-binned median intensity#

  • number of observations in parentheses.

Hide code cell source

# Plot the average validation error of the top-N models, binned by each
# feature's median intensity in the training data.
fig, ax = plt.subplots(figsize=(8, 3))
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred_val[
        [TARGET_COL] + TOP_N_ORDER
    ],
    feat_medians=data.train_X.median(),  # bin features by their training-data median
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    palette=TOP_N_COLOR_PALETTE,
    metric_name=METRIC,)
ax.set_ylabel(f"Average error ({METRIC})")
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
# Register the figure for the summary dump and save it to disk.
fname = args.out_figures / f'2_{group}_errors_binned_by_feat_median_val.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf
_images/c415844e8ef9432a8c9b61238633a80c6c0c38f78a6f31f0a9c54ed206993a82.png

Hide code cell source

# ! only used for reporting
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC
)
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=94) BPCA 0.715 0.601 0.849
1 11\n(N=94) DAE 0.703 0.581 0.838
2 11\n(N=94) RF 0.670 0.549 0.798
3 11\n(N=94) TRKNN 0.655 0.545 0.784
4 11\n(N=94) VAE 0.579 0.463 0.713
... ... ... ... ... ...
85 29\n(N=5) BPCA 0.175 0.062 0.288
86 29\n(N=5) DAE 0.111 0.025 0.222
87 29\n(N=5) RF 0.109 0.046 0.175
88 29\n(N=5) TRKNN 0.193 0.128 0.257
89 29\n(N=5) VAE 0.169 0.070 0.308

90 rows × 5 columns

Hide code cell source

# Persist the long-format binned errors next to the figure, register the dump,
# and preview the first rows. The previous leading `errors_binned.head()`
# expression had no effect (its value was discarded mid-cell) and has been
# removed; the CSV path is now computed once.
fname_csv = fname.with_suffix('.csv')
dumps[fname.stem] = fname_csv
errors_binned.to_csv(fname_csv)
errors_binned.head()
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_158 Q9UN70;Q9UN70-2 BPCA 0.839 15 2,398 15\n(N=2,398)
1 Sample_158 Q9UN70;Q9UN70-2 VAE 1.097 15 2,398 15\n(N=2,398)
2 Sample_158 Q9UN70;Q9UN70-2 DAE 0.969 15 2,398 15\n(N=2,398)
3 Sample_158 Q9UN70;Q9UN70-2 TRKNN 1.070 15 2,398 15\n(N=2,398)
4 Sample_158 Q9UN70;Q9UN70-2 RF 0.988 15 2,398 15\n(N=2,398)

test data#

Hide code cell source

# Load the test-split predictions of all passed models into one wide table:
# one column per model plus the observed target values.
pred_test = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='test',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL])
# Restrict/reorder columns to the global model ranking and add the per-feature
# frequency (how often a feature was observed).
pred_test = pred_test[[TARGET_COL] + ORDER_MODELS]
pred_test = pred_test.join(freq_feat, on=freq_feat.index.name)
if args.save_agg_pred:
    # Optionally dump the aggregated prediction table for later reuse.
    fname = args.folder_experiment / '01_2_agg_pred_test.csv'
    dumps[fname.stem] = fname
    pred_test.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_test
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_000 A0A075B6P5;P01615 17.016 17.483 17.206 17.028 17.438 17.457 17.464 17.207 17.190 18.269 ... 17.496 13.937 12.970 13.409 7.068 0 229.376 2,505.226 NaN 210
A0A087X089;Q16627;Q16627-2 18.280 17.769 17.919 17.984 17.930 17.675 17.653 18.146 18.293 17.797 ... 17.695 12.522 12.970 13.196 7.068 0 -20.319 2,505.226 NaN 210
A0A0B4J2B5;S4R460 21.735 22.459 22.218 22.286 22.397 22.161 22.756 21.959 21.835 22.205 ... 17.493 13.706 12.970 12.821 7.068 0 -10.898 2,505.226 NaN 210
A0A140T971;O95865;Q5SRR8;Q5SSV3 14.603 15.285 15.279 15.247 15.399 15.207 15.196 15.143 15.172 15.557 ... 17.087 12.825 12.970 12.875 7.068 0 -2.819 2,505.226 NaN 145
A0A140TA33;A0A140TA41;A0A140TA52;P22105;P22105-3;P22105-4 16.143 16.583 16.684 16.779 16.775 16.509 16.338 16.743 16.625 16.646 ... 17.508 13.540 12.970 12.788 7.068 0 -42.837 2,505.226 NaN 210
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_209 Q96ID5 16.074 15.866 16.093 16.021 16.122 16.007 16.065 15.981 15.909 15.925 ... 17.133 13.663 12.435 12.472 7.068 0 20.373 17.260 NaN 194
Q9H492;Q9H492-2 13.173 13.249 13.051 13.360 13.273 13.388 13.830 13.432 13.669 13.594 ... 17.109 14.755 12.435 12.746 7.068 0 14.713 19.076 NaN 111
Q9HC57 14.207 13.756 14.030 13.733 14.589 14.657 14.331 14.131 13.962 14.391 ... 17.157 12.503 12.435 14.108 7.068 0 21.445 19.649 NaN 128
Q9NPH3;Q9NPH3-2;Q9NPH3-5 14.962 15.096 15.202 15.218 15.099 15.162 15.276 15.123 15.094 15.117 ... 17.257 13.025 12.435 12.676 7.068 0 35.578 16.125 NaN 199
Q9UGM5;Q9UGM5-2 16.871 16.395 16.396 16.372 16.429 16.422 16.355 16.378 16.255 17.054 ... 17.133 12.163 12.435 13.010 7.068 0 82.601 13.608 NaN 209

12600 rows × 26 columns

Write the per-model averages (validation and test) to the Excel workbook opened earlier.

Hide code cell source

# Absolute error per (sample, feature) on the test split, summarised per model
# in the ranked column order.
errors_test_mae = pimmslearn.pandas.calc_errors.get_absolute_error(
    pred_test
)
mae_stats_ordered_test = errors_test_mae.describe()[ORDER_MODELS]
mae_stats_ordered_test
BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
count 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 ... 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 0.000
mean 0.432 0.438 0.437 0.458 0.465 0.470 0.469 0.482 0.558 0.587 ... 2.223 2.330 3.815 4.109 4.113 9.271 16.339 334.546 2,186.302 NaN
std 0.518 0.539 0.534 0.539 0.544 0.519 0.546 0.562 0.679 0.647 ... 1.662 1.653 2.677 2.667 2.711 2.741 2.741 793.494 853.899 NaN
min 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 0.003 0.003 0.141 7.209 0.001 0.001 NaN
25% 0.121 0.122 0.120 0.132 0.136 0.146 0.138 0.140 0.163 0.175 ... 0.961 1.044 1.739 2.132 2.072 7.344 14.412 12.192 2,436.455 NaN
50% 0.280 0.277 0.278 0.299 0.304 0.323 0.307 0.316 0.364 0.394 ... 1.954 2.098 3.332 3.635 3.656 8.867 15.935 34.192 2,496.971 NaN
75% 0.546 0.548 0.544 0.584 0.592 0.609 0.596 0.612 0.703 0.762 ... 3.119 3.286 5.361 5.610 5.637 10.842 17.910 91.928 2,555.017 NaN
max 7.635 9.213 9.370 9.111 8.284 6.445 8.577 8.171 9.005 7.829 ... 13.272 13.022 18.314 18.317 19.472 23.072 30.140 2,869.824 2,873.005 NaN

8 rows × 24 columns

Hide code cell source

# Append the test-split MAE summary as its own sheet to the shared workbook.
mae_stats_ordered_test.to_excel(writer, sheet_name='mae_stats_ordered_test', float_format='%.5f')

Hide code cell source

# Side-by-side mean error per model on validation and test data, sorted so the
# best validation model comes first; also written to the shared workbook.
mean_val = mae_stats_ordered_val.loc['mean']
mean_test = mae_stats_ordered_test.loc['mean']
cp_mean_perf = (
    pd.concat([mean_val, mean_test], axis=1, keys=['val', 'test'])
    .sort_values(by='val')
)
cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f')
cp_mean_perf
val test
BPCA 0.422 0.432
VAE 0.430 0.438
DAE 0.435 0.437
TRKNN 0.450 0.458
RF 0.462 0.465
CF 0.466 0.470
KNN5 0.467 0.469
KNN 0.481 0.482
KNN_IMPUTE 0.554 0.558
IRM 0.588 0.587
ROWMEDIAN 0.598 0.602
Median 0.598 0.602
LLS 1.329 0.874
QRILC 1.640 1.628
COLMEDIAN 2.210 2.223
SVDMETHOD 2.309 2.330
PI 3.812 3.815
MINDET 4.108 4.109
MINPROB 4.133 4.113
MINIMUM 9.272 9.271
ZERO 16.340 16.339
IMPSEQROB 333.478 334.546
MLE 2,172.384 2,186.302
IMPSEQ NaN NaN

Hide code cell source

# Flush and close the shared Excel writer; no further sheets are written.
writer.close()

Intensity distribution as histogram#

Plot the top 4 models' predictions against the observed intensity distribution in the test data

Hide code cell source

# Overlay each top model's predicted-intensity histogram (colored, transparent)
# on the observed test-intensity histogram (grey), one panel per model.
min_max = pimmslearn.plotting.data.min_max(pred_test[TARGET_COL])
top_n = 4
fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True)

for model, color, ax in zip(
        ORDER_MODELS[:top_n],
        COLORS_TO_USE[:top_n],
        axes):

    # Observed intensities as the common grey background; `bins` is reused
    # below for tabulating counts per bin.
    ax, bins = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[TARGET_COL],
        color='grey',
        min_max=min_max,
        ax=ax
    )
    # Model predictions, semi-transparent so both distributions stay visible.
    ax, _ = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[model],
        color=color,
        min_max=min_max,
        ax=ax,
        alpha=0.5,
    )
    # Rotate the x tick labels so the bin edges remain legible.
    _ = [(l_.set_rotation(90))
         for l_ in ax.get_xticklabels()]
    ax.legend()

axes[0].set_ylabel('Number of observations')

# Register the figure for the summary dump and save it to disk.
fname = args.out_figures / f'2_{group}_intensity_binned_top_{top_n}_models_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf
_images/db20270e54d85354ef454dac11b76ba4596a338e9167ae24b1aecbe993c5affe.png

Hide code cell source

# Tabulate histogram counts per intensity bin for the observed values and the
# top-N models; `bins` carries over from the histogram plot above.
counts_per_bin = pimmslearn.pandas.get_counts_per_bin(df=pred_test,
                                                bins=bins,
                                                columns=[TARGET_COL, *ORDER_MODELS[:top_n]])

counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = pd.cut(df[col], bins=bins).to_frame().groupby(col).size()
observed BPCA VAE DAE TRKNN
bin
(7, 8] 2 0 0 0 0
(8, 9] 7 0 0 1 0
(9, 10] 18 2 0 1 1
(10, 11] 69 29 22 29 13
(11, 12] 217 165 119 148 113
(12, 13] 634 531 502 500 479
(13, 14] 1,394 1,248 1,247 1,227 1,224
(14, 15] 2,042 2,033 2,114 2,127 2,118
(15, 16] 2,054 2,359 2,397 2,323 2,429
(16, 17] 1,787 1,867 1,863 1,893 1,842
(17, 18] 1,333 1,363 1,359 1,380 1,401
(18, 19] 965 956 923 922 923
(19, 20] 792 789 835 809 800
(20, 21] 536 528 497 517 533
(21, 22] 320 322 317 307 323
(22, 23] 182 176 181 191 171
(23, 24] 102 92 87 85 92
(24, 25] 45 38 38 40 37
(25, 26] 50 57 58 57 59
(26, 27] 25 20 17 20 17
(27, 28] 3 2 1 0 2
(28, 29] 8 11 12 11 11
(29, 30] 13 11 11 11 12

Correlation per sample#

Hide code cell source

# Correlation between observed and predicted intensities within each sample,
# one column per model, plus the number of observations per sample.
corr_per_sample_test = (pred_test
                        .groupby(sample_index_name)
                        .apply(lambda df: df.corr().loc[TARGET_COL])
                        [ORDER_MODELS])
corr_per_sample_test = corr_per_sample_test.join(
    pred_test
    .groupby(sample_index_name)[TARGET_COL]
    .count()
    .rename('n_obs')
)
# Correlations based on fewer than 3 observations are not meaningful.
too_few_obs = corr_per_sample_test['n_obs'] < 3
corr_per_sample_test.loc[~too_few_obs].describe()
BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 ... 210.000 210.000 0.000 210.000 0.000 0.000 210.000 30.000 0.000 210.000
mean 0.969 0.967 0.968 0.966 0.966 0.967 0.965 0.963 0.948 0.952 ... 0.076 -0.001 NaN -0.003 NaN NaN 0.371 -0.018 NaN 60.000
std 0.017 0.019 0.017 0.019 0.018 0.016 0.018 0.019 0.035 0.022 ... 0.192 0.143 NaN 0.137 NaN NaN 0.139 0.151 NaN 9.810
min 0.878 0.859 0.908 0.858 0.874 0.903 0.870 0.888 0.722 0.865 ... -0.402 -0.335 NaN -0.374 NaN NaN 0.021 -0.287 NaN 31.000
25% 0.962 0.963 0.961 0.960 0.960 0.961 0.956 0.953 0.938 0.943 ... -0.059 -0.097 NaN -0.088 NaN NaN 0.288 -0.118 NaN 53.000
50% 0.973 0.971 0.972 0.970 0.970 0.971 0.970 0.968 0.958 0.956 ... 0.067 -0.006 NaN -0.004 NaN NaN 0.368 -0.042 NaN 60.000
75% 0.981 0.980 0.980 0.979 0.978 0.978 0.979 0.978 0.969 0.966 ... 0.200 0.086 NaN 0.083 NaN NaN 0.448 0.062 NaN 67.000
max 0.994 0.993 0.993 0.992 0.991 0.992 0.992 0.990 0.987 0.988 ... 0.546 0.397 NaN 0.411 NaN NaN 0.889 0.393 NaN 86.000

8 rows × 25 columns

Hide code cell source

# ! add minimum
# ! add minimum
# Box plot of per-sample correlations for the top-N models; the y-axis is
# clipped to (0.7, 1) so the bulk of the distribution stays readable.
kwargs = dict(ylim=(0.7, 1), rot=90,
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model predictions per sample on test data',
              ylabel='correlation per sample')
ax = (corr_per_sample_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs))
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

# Ship the underlying numbers next to the figure as an Excel workbook.
dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    corr_per_sample_test.describe().to_excel(w, sheet_name='summary')
    corr_per_sample_test.to_excel(w, sheet_name='correlations')
    corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf
_images/7922b9e4259e137988d6a0f442d6a2c8199e31c4103f4403f2596802f2f14faa.png

Identify samples whose correlation falls below the lower whisker for any of the top models.

Hide code cell source

# Samples whose correlation for any top-N model falls below the smallest
# lower whisker are displayed with their row-minimum highlighted.
threshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_test[TOP_N_ORDER]).min()
below_whisker = corr_per_sample_test[TOP_N_ORDER].lt(threshold).any(axis=1)
if below_whisker.sum():
    out = corr_per_sample_test.loc[below_whisker].style.highlight_min(axis=1)
else:
    out = 'Nothing to display'
out
  BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
Sample ID                                                  
Sample_015 0.958593 0.924426 0.933751 0.967788 0.947196 0.954625 0.954757 0.964680 0.932223 0.908295 0.882874 0.882874 0.923304 0.917249 nan 0.082354 0.283189 nan -0.166397 nan nan 0.127719 -0.097389 nan 38
Sample_035 0.946885 0.940404 0.929235 0.954952 0.943541 0.950288 0.958440 0.936034 0.951218 0.940977 0.924673 0.924673 0.959100 0.848982 nan 0.257382 -0.162555 nan -0.058128 nan nan 0.699718 nan nan 57
Sample_043 0.949633 0.858586 0.946119 0.858409 0.874193 0.916034 0.870351 0.888093 0.847971 0.882989 0.814366 0.814366 0.828364 0.743820 nan -0.060303 -0.094694 nan 0.027931 nan nan 0.444295 nan nan 57
Sample_047 0.939299 0.935523 0.939772 0.916738 0.937730 0.934825 0.950780 0.945377 0.874472 0.900719 0.896683 0.896683 0.009710 0.917471 nan -0.077002 0.288712 nan -0.198367 nan nan 0.524493 nan nan 46
Sample_069 0.936850 0.939782 0.941523 0.944619 0.922562 0.948004 0.944406 0.937711 0.904430 0.950822 0.914668 0.914668 0.918364 0.702691 nan 0.044260 -0.046101 nan -0.013715 nan nan 0.271641 nan nan 68
Sample_080 0.922142 0.903468 0.924316 0.911596 0.907961 0.921630 0.912309 0.921951 0.902040 0.883475 0.893836 0.893836 0.880736 0.858145 nan 0.093012 0.179780 nan -0.168011 nan nan 0.383774 nan nan 64
Sample_091 0.878328 0.908531 0.929634 0.931055 0.892895 0.903029 0.918275 0.903784 0.920915 0.864672 0.903019 0.903019 0.910180 0.864228 nan -0.095511 -0.139221 nan -0.271150 nan nan 0.341572 nan nan 60
Sample_108 0.929388 0.952325 0.935099 0.940255 0.950187 0.945715 0.951487 0.946047 0.866107 0.915455 0.939810 0.939810 0.929216 0.872660 nan -0.044493 -0.205408 nan 0.024208 nan nan 0.407365 nan nan 68
Sample_109 0.937615 0.926942 0.918476 0.924847 0.932272 0.934963 0.898780 0.893626 0.841761 0.879726 0.890426 0.890426 0.931145 0.766556 nan -0.059637 0.064319 nan -0.063192 nan nan 0.337532 -0.040683 nan 59
Sample_111 0.978525 0.924839 0.955242 0.974002 0.922145 0.969441 0.958219 0.933775 0.923850 0.935239 0.857016 0.857016 0.962568 0.465618 nan -0.130888 0.116533 nan 0.069062 nan nan 0.451452 nan nan 54
Sample_115 0.891712 0.908184 0.915636 0.915296 0.921725 0.923359 0.928234 0.918842 0.853242 0.874847 0.881285 0.881285 0.901459 0.840022 nan 0.094778 -0.114045 nan 0.212302 nan nan 0.320851 nan nan 63
Sample_134 0.933622 0.928976 0.938864 0.907465 0.916421 0.945027 0.952746 0.935936 0.905001 0.915387 0.865397 0.865397 0.881808 0.877223 nan 0.389792 0.030858 nan 0.203116 nan nan 0.344741 nan nan 66
Sample_138 0.957581 0.917653 0.939472 0.953928 0.938101 0.948569 0.927573 0.936390 0.943933 0.936371 0.921359 0.921359 0.963983 0.872310 nan 0.001445 0.034147 nan 0.126516 nan nan 0.523470 nan nan 46
Sample_148 0.975203 0.938897 0.908491 0.979465 0.950146 0.962219 0.946668 0.926864 0.929094 0.955283 0.935395 0.935395 0.984939 0.812267 nan 0.037085 -0.033325 nan -0.020796 nan nan 0.362124 nan nan 62
Sample_151 0.947829 0.922078 0.915358 0.919188 0.935494 0.948035 0.937720 0.937262 0.934510 0.915733 0.904552 0.904552 0.917004 0.812885 nan -0.189751 -0.017551 nan -0.015127 nan nan 0.302307 nan nan 70
Sample_152 0.922635 0.926545 0.922803 0.926056 0.922356 0.922182 0.932482 0.931084 0.918127 0.917052 0.909410 0.909410 0.877491 0.928775 nan 0.098949 -0.026448 nan 0.083479 nan nan 0.336118 nan nan 64
Sample_162 0.929186 0.946370 0.939790 0.933190 0.942582 0.935863 0.937839 0.949772 0.956867 0.940055 0.937255 0.937255 0.933909 0.921417 nan 0.516397 -0.102285 nan 0.085090 nan nan 0.294184 0.087903 nan 51
Sample_167 0.952090 0.931831 0.937914 0.931476 0.934976 0.947787 0.939793 0.936802 0.922116 0.930438 0.905413 0.905413 0.923164 0.903636 nan 0.221299 0.009705 nan -0.009801 nan nan 0.235179 nan nan 65
Sample_171 0.948100 0.926793 0.919661 0.901446 0.913652 0.926500 0.919215 0.909432 0.845442 0.899387 0.863135 0.863135 0.898770 0.824624 nan -0.061550 0.066253 nan 0.188184 nan nan 0.344922 nan nan 40
Sample_181 0.912274 0.931778 0.927525 0.920976 0.924586 0.909406 0.929397 0.913043 0.869468 0.929033 0.896030 0.896030 0.899227 0.849220 nan -0.243627 -0.097130 nan -0.077920 nan nan 0.419029 0.117814 nan 60
Sample_185 0.949315 0.944668 0.941018 0.929238 0.939542 0.947021 0.936803 0.930556 0.924211 0.929391 0.922411 0.922411 0.899905 0.897834 nan -0.264227 0.076503 nan -0.094571 nan nan 0.576069 nan nan 69
Sample_199 0.928280 0.930841 0.930659 0.930234 0.929986 0.929567 0.917037 0.925542 0.912243 0.918083 0.910943 0.910943 0.937794 0.783540 nan -0.086879 0.156780 nan 0.130244 nan nan 0.289504 nan nan 45
Sample_200 0.934067 0.938717 0.929792 0.933368 0.934338 0.931138 0.934918 0.918361 0.722446 0.926169 0.891269 0.891269 0.916117 0.769894 nan 0.034387 -0.246020 nan -0.175349 nan nan 0.535540 -0.109406 nan 40

Hide code cell source

# Inspect the predictions of one randomly chosen feature across all samples.
feature_names = pred_test.index.levels[-1]
N_SAMPLES = pred_test.index  # NOTE(review): stores the full MultiIndex, not a count — the name suggests len(...) was intended; confirm downstream use
M = len(feature_names)
pred_test.loc[pd.IndexSlice[:, feature_names[random.randint(0, M - 1)]], :]
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_006 Q5VSG8 14.625 15.774 15.817 15.701 16.155 15.711 15.555 15.838 15.710 15.797 ... 17.264 12.767 12.622 12.285 7.068 0 -44.660 2,508.999 NaN 153
Sample_027 Q5VSG8 16.405 15.714 15.955 15.985 15.827 15.996 15.817 15.578 15.833 16.124 ... 17.278 13.173 12.785 11.820 7.068 0 67.515 2,555.064 NaN 153
Sample_042 Q5VSG8 15.418 15.537 15.894 16.019 15.994 15.719 15.298 15.668 15.639 15.401 ... 17.424 12.212 12.863 13.274 7.068 0 -58.300 2,556.320 NaN 153
Sample_097 Q5VSG8 15.193 15.393 15.291 15.033 14.807 15.542 14.844 15.186 14.743 15.394 ... 16.975 11.997 12.186 11.989 7.068 0 -11.039 2,528.647 NaN 153
Sample_128 Q5VSG8 16.527 15.437 15.454 16.404 15.506 15.665 16.129 15.453 15.553 15.654 ... 17.162 11.772 12.211 13.346 7.068 0 68.326 2,492.974 NaN 153
Sample_152 Q5VSG8 18.415 16.205 15.195 16.141 15.541 15.619 16.215 15.714 16.209 15.337 ... 17.328 12.667 12.470 12.005 7.068 0 65.801 2,512.772 NaN 153
Sample_195 Q5VSG8 14.977 14.754 15.341 15.293 15.115 15.192 14.941 15.357 15.119 15.401 ... 17.092 11.579 12.219 12.833 7.068 0 -44.354 2,568.559 NaN 153

7 rows × 26 columns

Hide code cell source

# Draw a single feature uniformly at random (set -> sorted for a stable
# ordering before sampling) and show all of its test predictions.
picked_feature, = random.sample(sorted(set(feature_names)), 1)
pred_test.loc[pd.IndexSlice[:, picked_feature], :]
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_022 O00462 17.502 17.377 17.530 17.527 17.417 17.512 17.321 17.681 17.579 17.434 ... 17.284 13.664 12.758 13.219 7.068 0 -35.529 2,542.359 NaN 210
Sample_071 O00462 17.006 17.490 17.062 17.132 16.784 17.032 17.285 17.053 17.042 17.098 ... 17.067 13.060 12.212 12.083 7.068 0 -159.102 2,443.160 NaN 210
Sample_084 O00462 16.994 17.305 16.951 16.983 16.970 17.025 17.081 16.878 16.880 17.095 ... 17.196 12.531 11.629 12.116 7.068 0 21.877 2,616.241 NaN 210
Sample_113 O00462 17.479 17.293 17.475 17.281 17.189 17.226 17.445 17.361 17.533 17.288 ... 17.166 11.741 11.637 12.326 7.068 0 73.522 2,668.638 NaN 210
Sample_128 O00462 16.766 17.025 16.967 16.976 16.974 16.874 17.022 17.037 17.079 16.857 ... 17.162 12.189 12.211 12.080 7.068 0 -40.448 2,492.974 NaN 210
Sample_144 O00462 16.674 16.863 16.855 16.836 16.964 16.869 17.198 16.987 16.958 16.980 ... 17.157 12.218 12.831 12.449 7.068 0 24.163 2,508.753 NaN 210
Sample_152 O00462 17.139 17.127 17.100 17.143 17.109 16.860 17.011 16.937 16.830 17.214 ... 17.333 12.657 12.470 13.031 7.068 0 30.638 2,512.772 NaN 210
Sample_176 O00462 17.052 17.000 17.107 17.182 17.118 16.987 17.146 17.069 17.047 17.093 ... 17.151 14.126 11.913 11.415 7.068 0 45.816 2,532.476 NaN 210

8 rows × 26 columns

Correlation per feature#

Hide code cell source

# Correlation between observed and predicted intensities within each feature,
# one column per model, plus the observation count per feature.
corr_per_feat_test = pred_test.groupby(FEAT_NAME).apply(
    lambda df: df.corr().loc[TARGET_COL])[ORDER_MODELS]
corr_per_feat_test = corr_per_feat_test.join(pred_test.groupby(FEAT_NAME)[
    TARGET_COL].count().rename('n_obs'))

# Correlations based on fewer than 3 observations are not meaningful.
too_few_obs = corr_per_feat_test['n_obs'] < 3
corr_per_feat_test.loc[~too_few_obs].describe()
BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 ... 1,396.000 1,396.000 1,396.000 1,396.000 0.000 0.000 1,396.000 1,396.000 0.000 1,396.000
mean 0.636 0.631 0.637 0.607 0.586 0.603 0.564 0.538 0.445 0.470 ... -0.008 0.013 0.086 0.036 NaN NaN -0.009 -0.016 NaN 8.999
std 0.337 0.335 0.344 0.342 0.355 0.331 0.359 0.369 0.422 0.388 ... 0.447 0.401 0.441 0.415 NaN NaN 0.411 0.433 NaN 3.913
min -0.998 -0.984 -0.998 -0.977 -0.993 -0.999 -0.983 -0.991 -1.000 -0.999 ... -0.999 -0.988 -0.999 -0.993 NaN NaN -1.000 -0.999 NaN 3.000
25% 0.506 0.517 0.507 0.455 0.417 0.448 0.392 0.359 0.221 0.266 ... -0.341 -0.259 -0.242 -0.246 NaN NaN -0.285 -0.313 NaN 6.000
50% 0.746 0.737 0.745 0.706 0.696 0.702 0.677 0.634 0.554 0.554 ... -0.005 0.011 0.118 0.060 NaN NaN 0.004 -0.016 NaN 8.000
75% 0.880 0.870 0.882 0.859 0.847 0.849 0.833 0.809 0.770 0.768 ... 0.331 0.279 0.425 0.338 NaN NaN 0.268 0.294 NaN 11.000
max 0.999 1.000 0.999 1.000 1.000 1.000 0.999 1.000 1.000 0.998 ... 0.999 0.999 0.999 0.998 NaN NaN 0.992 0.998 NaN 32.000

8 rows × 25 columns

Hide code cell source

# Among the excluded low-observation features, keep rows with at least 3
# non-NaN correlation values for inspection.
corr_per_feat_test.loc[too_few_obs].dropna(thresh=3, axis=0)
BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
protein groups
A0A0A0MS09;P01880;P01880-2 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000 1.000 1.000 -1.000 1.000 -1.000 -1.000 1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
A0A0C4DH29 -1.000 -1.000 -1.000 -1.000 1.000 1.000 1.000 -1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000 1.000 -1.000 1.000 -1.000 1.000 1.000 1.000 -1.000 -1.000 ... 1.000 -1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000 1.000 1.000 -1.000 1.000 -1.000 -1.000 1.000 -1.000 1.000 ... 1.000 1.000 -1.000 1.000 NaN NaN -1.000 -1.000 NaN 2
D6RF35 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
E7EQ64;P07477 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 -1.000 NaN 2
F8WDW9;Q96AP7 -1.000 1.000 1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
J3KRP0 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 1.000 1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
O43581-2;O43581-3;O43581-5 -1.000 -1.000 -1.000 -1.000 1.000 1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 1.000 -1.000 1.000 NaN NaN -1.000 1.000 NaN 2
P04075 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 -1.000 1.000 -1.000 NaN NaN 1.000 -1.000 NaN 2
P04080 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 1.000 -1.000 ... -1.000 1.000 -1.000 1.000 NaN NaN -1.000 -1.000 NaN 2
P33151 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P62258 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 1.000 1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
Q9NYQ8 1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 -1.000 -1.000 1.000 ... -1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
Q9Y281;Q9Y281-3 1.000 1.000 1.000 1.000 -1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2

16 rows × 25 columns

Hide code cell source

# Box plot of per-feature correlations for the top-N models, saved as a figure
# with the underlying values exported to Excel.
kwargs = dict(rot=90,
              flierprops=dict(markersize=1),
              ylabel=f'correlation per {FEAT_NAME_DISPLAY}')
ax = (corr_per_feat_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs)
      )
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                       horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_feat.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
# Ship the underlying numbers next to the figure as an Excel workbook.
dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    corr_per_feat_test.loc[~too_few_obs].describe().to_excel(
        w, sheet_name='summary')
    corr_per_feat_test.to_excel(w, sheet_name='correlations')
    corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf
_images/46a0fe20dc00740fb03e5e6d22f778d93472525a899b962fddd2785af89e6866.png

Hide code cell source

# Number of test observations per feature (protein group).
feat_count_test = (data.test_y
                   .stack()
                   .groupby(FEAT_NAME)
                   .count()
                   .rename('count'))
feat_count_test.head()
protein groups
A0A024QZX5;A0A087X1N8;P35237                                                     10
A0A024R0T9;K7ER74;P02655                                                          8
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8    6
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                           8
A0A075B6H7                                                                        4
Name: count, dtype: int64

Hide code cell source

# Smallest lower whisker across the top-N models; features whose correlation
# for any top model falls below it are flagged for display.
# NOTE(review): 'treshold' is a typo for 'threshold' (name kept as-is).
treshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_feat_test[TOP_N_ORDER]).min()
mask = (corr_per_feat_test[TOP_N_ORDER] < treshold).any(axis=1)


def highlight_min(s, color, tolerence=0.00001):
    """Return a per-cell CSS style, coloring values within `tolerence` of the row minimum."""
    deviation = (s - s.min()).abs()
    return np.where(deviation < tolerence, f"background-color: {color};", None)


# Show the flagged features (poor correlation for at least one top-N model),
# joined with their observation counts and sorted by count; highlight the
# per-row minimum correlation in yellow.
view = corr_per_feat_test.join(feat_count_test).loc[mask].sort_values('count')

if view.empty:
    print("None found")
else:
    styled = view.style.apply(highlight_min,
                              color='yellow',
                              axis=1,
                              subset=corr_per_feat_test.columns)
    display(styled)
  BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs count
protein groups                                                    
A0A0C4DH29 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000000 1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 1.000000 1.000000 1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000000 1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 -1.000000 nan 2 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000000 1.000000 -1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan 1.000000 -1.000000 nan 2 2
F8WDW9;Q96AP7 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
O43581-2;O43581-3;O43581-5 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 1.000000 nan 2 2
Q9Y281;Q9Y281-3 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 nan nan 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;P01892;P10316 -0.405690 -0.429011 0.009868 -0.426425 -0.021073 -0.707564 0.225780 0.141568 -0.230425 0.240703 nan nan 0.029026 0.284018 0.969574 0.997890 -0.785999 -0.651641 0.330650 nan nan 0.599349 0.988049 nan 3 3
O95497 0.491301 0.945814 -0.669746 -0.187796 0.395362 0.986729 -0.278320 -0.038249 -0.972465 0.974255 nan nan -0.924475 0.990793 -0.728224 -0.107445 -0.185769 0.908126 0.756230 nan nan 0.163889 -0.914582 nan 3 3
P04040 0.955425 0.996135 0.141471 0.364706 -0.769346 0.801973 0.860313 0.995399 0.584629 0.685429 nan nan 0.736559 0.580919 0.910187 0.961838 0.885077 0.889522 0.993779 nan nan -0.755848 0.778883 nan 3 3
Q9UI40;Q9UI40-2 0.154643 -0.543671 0.454324 0.705987 0.532638 -0.593498 0.043055 -0.368219 0.609727 -0.080993 nan nan 0.216445 -0.779695 0.418780 0.168840 0.874164 0.994516 0.543796 nan nan 0.588145 0.963302 nan 3 3
Q9BRA2 -0.997854 -0.905541 -0.868644 -0.976681 -0.992520 -0.893310 0.355335 -0.969137 -0.999909 0.328330 nan nan -0.376394 -0.714867 -0.888126 -0.946135 -0.184643 -0.251266 -0.993260 nan nan 0.231242 0.893058 nan 3 3
P67936 -0.781265 -0.984325 -0.998131 -0.280233 -0.831201 -0.758616 -0.930367 0.668141 -0.999486 -0.951572 nan nan -0.952164 -0.520411 -0.989773 -0.899019 -0.466071 0.990700 -0.848113 nan nan -0.364239 -0.919047 nan 3 3
Q5FWE3;Q5FWE3-3 -0.595898 0.509470 -0.964173 0.940836 0.957155 0.928147 -0.515612 -0.450286 0.997780 -0.317399 nan nan 0.852964 -0.419630 0.879697 0.674983 0.926040 0.966111 0.997252 nan nan -0.999926 0.995547 nan 3 3
P14138 -0.783274 0.463007 -0.707466 0.277987 0.500272 -0.390157 0.474087 -0.039227 0.184293 -0.975804 nan nan -0.284116 -0.844495 -0.680464 -0.988691 -0.903851 -0.256388 -0.204135 nan nan 0.330904 0.330904 nan 3 3
A0A087X117;A0A0G2JN29;J3KN36;P69849;Q15155;Q5JPE7;Q5JPE7-2 0.678810 -0.221945 -0.350382 0.676860 -0.339560 -0.240795 0.312055 0.660508 -0.865326 0.808609 nan nan -0.988011 -0.345563 -0.484680 -0.563403 -0.603396 -0.965384 -0.814245 nan nan -0.178456 0.909074 nan 3 3
Q0P6D2 0.488693 -0.791180 -0.680823 -0.977189 -0.991322 0.559590 -0.914209 -0.991429 0.990772 -0.817525 nan nan -0.910486 -0.239399 -0.815144 -0.873162 -0.955235 -0.856054 -0.923324 nan nan 0.198784 -0.816625 nan 3 3
Q96KR4;Q96KR4-3 -0.811499 0.974236 -0.516714 0.188634 0.913578 -0.998885 0.962865 0.998946 -0.996456 -0.998544 nan nan -0.499851 0.666499 -0.969111 -0.794681 -0.944704 0.464970 0.126469 nan nan 0.770952 0.620557 nan 3 3
P01912;Q5Y7A7 0.312242 0.824886 -0.961164 -0.737679 -0.382274 0.153907 -0.983163 -0.923553 0.367817 0.598949 nan nan -0.036948 -0.607883 0.393426 0.622172 0.045562 0.128832 -0.300240 nan nan -0.520030 -0.464856 nan 4 4
P55058 0.331515 -0.202663 -0.199568 -0.453671 -0.199872 -0.165490 -0.730945 -0.687075 0.561214 0.511729 nan nan -0.850960 0.576320 -0.860013 0.937309 0.858934 0.109603 -0.678461 nan nan -0.314778 -0.708824 nan 4 4
Q99538 0.593821 -0.229839 -0.248473 0.377925 -0.422897 -0.273797 -0.288458 -0.152372 0.223491 -0.280701 nan nan 0.914120 0.192772 0.708292 -0.029421 0.492204 0.753316 0.211428 nan nan -0.421840 -0.113954 nan 4 4
Q9ULP0-3;Q9ULP0-6 -0.433605 0.159449 -0.257946 0.280988 0.670847 -0.069102 -0.127518 -0.217068 -0.824736 0.569381 nan nan 0.047124 0.269339 -0.147969 0.088032 -0.745951 0.597287 0.214852 nan nan 0.651242 0.678026 nan 4 4
P78310;P78310-2;P78310-5;P78310-6;P78310-7 -0.287802 0.616232 0.881076 0.187632 0.736129 -0.276865 0.236820 0.177240 0.323788 -0.925309 nan nan -0.002267 0.316710 0.370874 -0.992637 -0.686869 -0.740181 -0.296970 nan nan -0.197104 0.065686 nan 4 4
P48745 -0.215191 -0.023907 -0.093750 0.281906 -0.759900 -0.254192 -0.708241 -0.711413 0.592691 0.072595 nan nan -0.966228 0.945186 -0.502678 -0.007247 0.277121 -0.811427 -0.468625 nan nan -0.816749 -0.563041 nan 4 4
P01704 0.467663 -0.252897 0.975575 -0.028085 -0.142199 0.805287 -0.769039 0.534897 0.229086 -0.463726 nan nan 0.151957 -0.150628 0.353124 -0.603360 0.583247 -0.375894 0.110619 nan nan 0.964419 -0.791674 nan 4 4
A0A075B7B8 0.995309 0.791104 0.955650 -0.584344 0.923487 0.947905 0.645944 0.628331 0.493563 0.290645 nan nan -0.198260 0.837792 0.596184 0.426337 -0.234803 -0.818663 0.286601 nan nan -0.419682 0.603615 nan 4 4
A0A0G2JRN3 -0.879258 -0.541208 -0.205140 -0.190835 -0.764746 -0.025441 0.676931 0.355859 -0.978184 -0.922357 nan nan -0.966410 -0.230728 -0.890772 -0.522710 -0.741120 -0.786540 -0.927348 nan nan 0.786883 0.273736 nan 4 4
H0Y4H1 0.812529 0.349285 0.908593 0.493598 -0.949148 0.990536 -0.065041 -0.210497 0.198249 0.844542 nan nan 0.965678 0.378429 0.871470 0.919630 0.554656 0.705379 0.295721 nan nan -0.335699 0.311475 nan 4 4
B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 -0.586205 -0.289257 -0.381057 -0.468526 -0.883514 -0.367728 -0.950792 -0.978604 -0.206013 0.640276 nan nan -0.926715 -0.822871 -0.611749 -0.688762 0.749394 -0.359254 -0.116363 nan nan -0.630160 0.512953 nan 4 4
E9PN95;P11684 -0.044137 0.608411 0.585073 -0.416898 0.255763 0.082417 -0.241660 0.598536 0.553236 0.989064 nan nan 0.572649 0.714753 -0.972121 -0.214485 -0.433037 -0.729323 -0.988176 nan nan 0.624754 0.488347 nan 4 4
A8MXB9;J3KQJ1;Q8NBJ7 -0.526368 -0.394205 -0.563214 -0.288690 0.555064 0.343212 0.719047 0.882512 0.575047 0.781018 nan nan 0.983037 0.526537 -0.315140 -0.447225 -0.973521 0.092247 0.596012 nan nan -0.625728 -0.380936 nan 4 4
A0A087WSY4 0.341097 -0.148785 -0.467895 -0.455325 -0.817958 -0.323573 0.666119 0.804721 -0.612366 -0.162759 nan nan 0.149034 -0.672419 0.994065 0.280256 0.408032 -0.553907 -0.096528 nan nan -0.767064 0.652348 nan 4 4
P21810 0.488982 -0.760950 -0.316445 0.558018 -0.766228 0.074589 0.725338 0.497175 0.976315 0.606417 nan nan 0.507237 -0.319133 0.086433 0.181221 0.017309 -0.013604 -0.306770 nan nan -0.032190 -0.515024 nan 4 4
P31150 -0.695110 -0.265330 -0.579256 0.079765 -0.040502 0.610818 -0.302639 -0.258552 0.871434 0.374253 nan nan -0.383378 -0.375825 -0.744603 -0.826531 0.394727 0.679134 0.232574 nan nan 0.018162 0.997740 nan 4 4
P69905 0.965583 -0.524527 -0.702585 0.990263 -0.144855 0.834823 -0.734862 -0.884057 0.787808 0.652423 nan nan 0.995940 0.586054 -0.089591 0.819375 -0.973068 -0.613292 -0.305953 nan nan 0.599413 0.563706 nan 4 4
Q8TEA8 0.312460 0.257921 0.530713 0.078415 -0.411483 0.534498 -0.199719 -0.061944 0.642442 0.704937 nan nan 0.711175 -0.090526 0.925803 0.920805 -0.696605 0.711193 0.577306 nan nan -0.971775 0.075778 nan 4 4
Q8WXD2 -0.194738 -0.430617 -0.483562 0.030615 -0.208978 -0.033587 -0.644254 -0.896598 -0.044385 -0.527932 nan nan -0.031366 -0.476616 -0.879923 -0.696602 -0.020692 0.367920 -0.886265 nan nan -0.304569 0.801129 nan 4 4
Q9NS85 0.209240 0.000918 -0.244667 -0.748054 -0.625789 0.082955 0.174286 0.827546 -0.293415 -0.490455 nan nan -0.811087 -0.794271 0.688432 0.684565 -0.061817 -0.573150 0.551908 nan nan 0.028761 0.700517 nan 4 4
B1AJZ9;B1AJZ9-4;H0YE38;Q5JYW6 -0.213396 -0.298024 -0.246042 -0.655514 -0.761989 -0.182860 -0.355725 -0.358839 -0.320693 0.330414 nan nan -0.420475 0.681571 -0.515651 -0.659479 -0.340889 0.428130 -0.230083 nan nan 0.163593 0.301970 nan 5 5
G3V2U7;P07311 0.478113 -0.432644 -0.224121 0.277132 -0.608351 -0.376993 0.126790 -0.075889 -0.045458 0.298977 nan nan -0.201011 0.428160 -0.434263 -0.349618 -0.593266 0.286177 0.455633 nan nan -0.412639 -0.436945 nan 5 5
Q8N428 -0.711604 -0.487544 -0.690390 -0.372718 -0.853613 0.263717 -0.815146 -0.973013 0.485147 -0.775526 nan nan 0.896715 -0.582837 -0.145359 0.398169 -0.627400 0.571660 0.667949 nan nan -0.592174 0.633450 nan 5 5
Q9NZP8 0.485369 0.069047 0.405518 -0.596706 -0.357075 0.584042 0.391275 0.135336 -0.921105 0.672486 nan nan -0.849012 0.692534 0.108741 -0.011939 -0.139423 0.306441 0.537510 nan nan 0.307325 0.307325 nan 5 5
Q9NZC2;Q9NZC2-2;Q9NZC2-3 -0.532191 -0.554614 -0.664409 -0.549588 -0.485273 -0.726817 0.118561 -0.020609 -0.710573 -0.652656 nan nan -0.534201 0.698304 0.889235 -0.779762 -0.567587 -0.454867 -0.093283 nan nan 0.820294 0.820294 nan 5 5
Q9UHI8 0.073471 -0.166905 -0.142836 -0.445122 0.195510 -0.288544 0.316329 -0.061541 0.216956 -0.119815 nan nan -0.208262 -0.221055 0.051369 0.334440 0.361738 0.118070 -0.273584 nan nan -0.151806 -0.746015 nan 5 5
Q10469 -0.098233 -0.535297 0.016150 -0.291141 -0.839787 -0.512776 -0.671391 -0.600598 0.649427 0.606434 nan nan -0.185669 0.683510 -0.517750 -0.162715 -0.053241 -0.550749 -0.468569 nan nan -0.084422 0.700076 nan 5 5
Q5BIV9 -0.499694 -0.239637 -0.453539 -0.073578 -0.039045 -0.306471 -0.595082 -0.855339 -0.481080 -0.438393 nan nan -0.128645 0.059139 -0.778869 -0.513219 -0.241407 -0.261649 -0.600959 nan nan -0.127227 -0.239878 nan 5 5
Q6ZVL6 -0.405009 0.439637 0.438265 0.541507 0.683371 0.270387 -0.626545 -0.421210 0.745658 0.380994 nan nan 0.809662 -0.107972 0.343913 -0.237720 0.180088 0.617457 0.706048 nan nan -0.305862 -0.650631 nan 5 5
Q13508;Q13508-2;Q13508-3 -0.588905 0.855331 0.474934 0.317771 0.879714 -0.667387 0.367007 -0.137197 -0.196508 0.072987 nan nan 0.595601 0.970782 -0.921969 -0.902558 -0.662699 -0.647509 -0.738314 nan nan -0.816043 0.878974 nan 5 5
O15031 -0.384689 0.702700 0.520531 0.545446 0.592297 0.737380 0.392387 0.288900 0.278546 0.565505 nan nan 0.144879 0.464377 -0.264862 -0.348062 0.203328 -0.121316 -0.744921 nan nan -0.296688 -0.296688 nan 5 5
O75339 0.036295 -0.423461 -0.248377 0.082569 -0.491145 -0.163106 -0.383440 -0.134371 -0.662474 0.284820 nan nan -0.707550 0.315849 -0.475700 -0.809598 0.817582 -0.206109 -0.342625 nan nan 0.211736 0.029954 nan 5 5
P00441 -0.466488 -0.438271 -0.197884 -0.418119 -0.376758 -0.451248 0.047755 -0.045985 -0.047843 -0.228959 nan nan -0.221865 -0.418922 -0.478446 -0.697790 0.404520 -0.145801 -0.300732 nan nan 0.030159 -0.379242 nan 5 5
A0A0G2JRQ6 -0.226956 -0.654177 -0.615743 -0.249237 -0.236313 -0.201171 -0.267241 -0.104896 0.030670 -0.906418 nan nan -0.206273 0.026396 -0.360860 0.551245 0.403397 0.307142 0.517793 nan nan -0.589769 -0.589769 nan 5 5
A0A0C4DGV8;Q13214;Q13214-2 0.215023 -0.127094 0.183221 -0.952860 0.408146 0.179961 -0.317076 -0.315165 -0.648660 -0.694452 nan nan -0.483616 0.126421 0.951239 0.908215 0.351071 -0.051800 0.278987 nan nan -0.698798 -0.910792 nan 5 5
D6R956;P09936 0.296679 0.132572 0.115514 -0.551658 0.110367 0.602972 -0.259386 -0.664319 0.151812 0.024035 nan nan 0.313797 0.743695 -0.119808 -0.402161 -0.706868 -0.212790 0.124786 nan nan -0.514141 0.033000 nan 5 5
P02533 0.968900 -0.938376 -0.491814 0.859288 0.469956 0.348511 0.106114 0.085868 0.760846 0.490145 nan nan 0.864174 0.839709 0.303053 0.460298 0.540528 0.446403 0.984374 nan nan 0.207036 -0.545616 nan 5 5
P01834 0.145967 -0.482318 -0.291923 -0.196562 0.016119 0.260299 0.157569 0.333278 0.249827 0.136816 nan nan -0.132585 0.043390 -0.169066 -0.492225 -0.507740 0.283761 0.350251 nan nan -0.157407 -0.157407 nan 5 5
P10124 -0.597027 -0.294143 -0.268502 -0.186782 -0.059547 -0.519127 0.571509 -0.252024 0.436283 -0.628340 nan nan -0.573988 0.089761 -0.483333 -0.479793 0.775322 -0.261585 -0.575851 nan nan 0.414290 0.578378 nan 5 5
P05451 0.097378 -0.509742 -0.446000 -0.057327 -0.475283 -0.161314 -0.360316 -0.684154 -0.327039 0.306881 nan nan -0.336078 -0.266095 -0.538331 -0.470228 0.438245 -0.412800 0.191825 nan nan -0.749296 -0.302038 nan 6 6
H3BRQ4;K4DIB9;P50238 -0.093569 -0.239921 -0.452521 -0.057295 -0.946437 -0.479498 -0.083371 -0.277897 0.428824 0.210272 nan nan 0.321101 -0.295369 -0.052992 0.460109 0.003475 0.387204 0.546740 nan nan 0.012052 -0.720358 nan 6 6
M0R009 -0.339372 -0.209087 -0.422875 -0.584432 -0.071531 -0.330579 -0.356792 -0.247140 -0.037546 -0.001198 nan nan -0.732570 -0.720117 -0.580618 0.320436 -0.122446 -0.750481 -0.748465 nan nan 0.635403 -0.182952 nan 6 6
P29401;P29401-2 0.085483 0.062777 -0.186081 -0.368660 -0.056067 -0.255608 0.223946 0.365861 0.001412 -0.392136 nan nan 0.108221 0.982623 0.187563 0.224033 0.212184 -0.281768 0.292417 nan nan 0.300495 0.397517 nan 6 6
P01036;P01037 0.014913 -0.312309 -0.131511 -0.482398 -0.115642 0.024068 -0.220449 -0.046435 -0.115861 -0.146825 nan nan 0.512862 -0.256948 -0.389652 -0.354472 -0.213116 -0.362058 -0.710594 nan nan 0.673582 0.579024 nan 6 6
P10644;P10644-2 -0.252574 0.031079 0.076065 -0.097608 -0.140461 0.049724 -0.336136 -0.255617 0.448682 -0.000997 nan nan 0.107935 0.221059 0.383896 -0.528502 -0.191882 0.577309 0.257838 nan nan -0.154795 -0.502540 nan 6 6
E9PKE3;P11142 -0.225336 -0.131346 -0.559804 0.346178 0.244284 -0.532078 -0.515610 -0.674428 0.038561 -0.291303 nan nan -0.729284 0.037040 -0.288577 0.003161 -0.315863 -0.507155 -0.364106 nan nan 0.281878 0.415848 nan 6 6
Q9BT88 -0.231560 -0.337563 0.195083 -0.505983 -0.500323 0.219319 0.248942 0.470496 -0.357877 -0.223776 nan nan 0.096659 0.026127 -0.078098 0.471098 0.792265 0.626185 0.554085 nan nan 0.342112 0.354892 nan 7 7
Q9BX67 0.701146 -0.300622 0.180066 0.063862 -0.199904 0.912907 -0.048843 -0.246240 0.445479 0.340506 nan nan 0.033937 0.442579 -0.269549 -0.431929 -0.532661 0.232872 -0.130604 nan nan -0.194531 -0.561094 nan 7 7
Q8IWU5;Q8IWU5-2 -0.218403 -0.335499 0.264065 -0.483524 -0.508145 0.686806 -0.071915 -0.451814 -0.684735 0.392763 nan nan -0.070980 0.595402 0.334826 0.479673 0.022466 -0.096058 -0.180395 nan nan 0.632888 -0.013472 nan 7 7
Q96S96 -0.324600 0.364627 0.295605 -0.462902 -0.422620 0.534054 -0.359991 -0.296281 -0.305383 0.059876 nan nan -0.582492 0.387452 -0.157328 -0.153988 0.803221 -0.086735 0.297549 nan nan -0.682257 0.760044 nan 7 7
P80108 -0.376648 -0.481125 -0.244665 -0.172435 -0.570973 -0.299678 -0.184693 -0.137669 0.293716 -0.492364 nan nan 0.087362 -0.839785 -0.600824 -0.578633 0.159178 -0.443645 -0.034583 nan nan 0.119112 0.119112 nan 7 7
P19835;X6R868 -0.578277 -0.544594 -0.720762 -0.428965 -0.416798 -0.550914 -0.359418 -0.458749 -0.930907 0.408941 nan nan -0.762851 -0.599931 -0.321335 -0.213048 0.370871 -0.001008 -0.136041 nan nan -0.780480 0.407170 nan 7 7
Q5VSG8 0.643434 -0.357802 0.621396 0.006904 0.264873 0.786558 0.216016 0.672377 -0.081632 0.554619 nan nan -0.342809 0.517234 0.068333 0.338473 0.299636 0.019817 -0.222090 nan nan 0.814893 -0.335813 nan 7 7
P35443 -0.174408 0.213802 0.600113 -0.252097 -0.087769 0.733066 0.404203 0.582749 0.068469 -0.211928 nan nan 0.314822 -0.322885 -0.017793 -0.130271 -0.797452 0.039112 0.101271 nan nan 0.022830 0.275796 nan 7 7
C9JKT8;Q9UEW3;Q9UEW3-2 -0.293733 -0.119412 0.148771 -0.671856 -0.228615 0.046175 0.252685 0.407657 0.516418 -0.293680 nan nan -0.461590 0.025811 -0.356302 -0.545248 -0.194787 -0.645563 -0.567836 nan nan -0.058753 -0.279069 nan 7 7
K7EPJ5;O60291;O60291-2;O60291-3;O60291-4 0.329534 0.036946 0.120074 0.338197 -0.338033 0.160313 0.445305 0.333772 0.254331 0.510493 nan nan -0.168375 -0.063677 -0.417249 -0.282897 -0.237020 -0.236863 -0.657632 nan nan -0.087903 -0.225342 nan 7 7
Q5SRI9 -0.274015 0.236269 0.120605 -0.087155 0.033891 0.212136 -0.584189 -0.348800 0.733544 -0.259835 nan nan 0.162788 0.049301 0.592005 0.136408 -0.101261 -0.088515 0.296959 nan nan 0.192704 -0.380808 nan 8 8
Q13790 -0.521996 0.374844 0.338857 -0.198715 -0.153678 0.166330 -0.462789 0.222006 -0.215508 -0.483551 nan nan 0.029153 -0.316209 0.174426 -0.331592 -0.551342 0.061342 0.082957 nan nan 0.078235 -0.389478 nan 8 8
Q9HBT6 -0.262534 0.113213 0.126150 0.204430 0.422328 -0.054979 0.099041 -0.130810 0.445839 -0.463107 nan nan 0.711355 -0.293042 -0.045300 -0.091698 -0.060012 -0.213494 -0.233973 nan nan 0.106877 -0.188251 nan 8 8
Q8IUX7 0.488978 0.781380 0.938364 0.266292 -0.396147 0.409147 0.026749 0.245281 -0.109344 0.333081 nan nan 0.470310 -0.285050 0.553343 0.501729 -0.064960 0.022986 -0.012999 nan nan 0.577999 -0.601119 nan 8 8
Q9Y653;Q9Y653-2;Q9Y653-3 -0.130134 -0.471959 -0.548024 -0.417296 -0.452784 -0.179090 -0.604021 -0.672811 -0.191744 0.165960 nan nan -0.729686 0.492473 0.286967 0.024990 -0.292701 -0.282547 0.160846 nan nan 0.406484 -0.099690 nan 8 8
Q9P232 -0.178684 -0.020185 -0.309093 -0.387374 -0.279732 -0.051121 -0.587127 -0.640844 -0.695382 -0.457551 nan nan -0.400998 0.220458 0.256287 0.419846 -0.657994 0.477733 0.595364 nan nan -0.766501 -0.447343 nan 8 8
P12273 0.309929 -0.007882 -0.253345 0.125788 0.223111 0.330019 -0.170046 -0.219819 -0.188677 0.272716 nan nan -0.741473 -0.439237 -0.153439 -0.054931 -0.141287 -0.291898 -0.217592 nan nan -0.347341 -0.122712 nan 8 8
O95841 0.500995 0.901884 0.506152 -0.046983 -0.240101 0.393514 0.726898 0.515862 0.107641 -0.307109 nan nan 0.898836 -0.060935 -0.099352 0.066238 -0.243407 0.491465 0.593293 nan nan 0.138001 0.018682 nan 8 8
Q5JRA6;Q5JRA6-2 -0.221290 0.672508 -0.113676 -0.172243 -0.620867 -0.118956 0.450879 0.021417 -0.649397 -0.669281 nan nan -0.367023 0.094328 -0.544434 -0.362956 0.312089 -0.396093 -0.361723 nan nan 0.085440 0.745223 nan 8 8
K7ES70;P55083;P55083-2 0.053310 0.036132 0.050995 -0.070135 -0.338392 0.196651 0.203235 0.059892 -0.126223 0.098983 nan nan 0.114059 0.476088 -0.093409 0.299230 0.113570 0.019436 0.351699 nan nan 0.371669 0.274381 nan 8 8
P05556;P05556-2;P05556-3;P05556-4;P05556-5 -0.472444 -0.169280 0.005685 -0.424667 -0.130859 -0.086953 -0.339333 -0.164482 -0.016348 0.328409 nan nan -0.091371 0.115200 0.247949 0.184702 -0.201535 -0.061795 -0.427230 nan nan 0.003355 0.029215 nan 8 8
A0A0G2JQD2;A0A0G2JQM0;A0A0G2JRN4;P30711 -0.278053 0.009955 -0.400460 -0.254147 -0.145893 -0.229834 -0.176005 -0.274112 0.078554 -0.628983 nan nan -0.142576 -0.079816 -0.484875 -0.287631 0.258942 -0.051869 0.020356 nan nan 0.739107 -0.015886 nan 8 8
B7Z5R6;Q14596;Q14596-2 0.170383 -0.326110 0.145769 -0.034715 -0.106378 -0.016393 0.158769 0.376342 0.835850 -0.173276 nan nan 0.070890 0.046177 -0.097908 0.041231 -0.095141 0.037118 -0.022477 nan nan 0.285089 -0.165235 nan 8 8
O15204;O15204-2 0.584330 0.849663 0.955571 0.230570 -0.242515 0.889410 -0.376917 -0.381210 0.401527 0.683591 nan nan 0.073044 -0.096982 -0.632177 -0.384078 0.328267 0.554693 -0.134213 nan nan -0.369992 0.163446 nan 8 8
P18206;P18206-2 0.275209 0.265898 0.432344 -0.189763 -0.266701 0.646511 -0.393250 -0.430189 0.404893 0.478030 nan nan -0.101734 -0.653787 0.168785 0.035300 -0.741670 -0.121760 -0.079742 nan nan -0.668040 0.105408 nan 8 8
B1AJR6;B1AJR9;B1AJS0;O14522 0.530251 -0.122237 -0.693445 0.093327 0.013068 -0.107320 -0.445185 -0.353217 0.644977 0.465073 nan nan -0.049114 -0.543198 -0.304727 -0.305222 -0.295832 0.183477 -0.553465 nan nan 0.348627 -0.076654 nan 9 9
A0A0C4DH24 -0.413803 -0.322359 -0.585423 -0.451837 -0.438503 -0.372484 -0.168096 -0.181333 -0.444924 0.152891 nan nan -0.499855 0.360202 0.293136 0.105732 0.324225 0.189404 0.480190 nan nan -0.601267 0.147397 nan 9 9
B1ALD9;Q15063;Q15063-3;Q15063-5 -0.101006 0.245695 -0.285092 -0.287475 0.151918 -0.133438 -0.240490 -0.113355 -0.369493 0.319295 nan nan 0.425439 -0.385897 0.080040 0.086550 0.589715 0.619472 -0.001508 nan nan 0.053837 -0.648727 nan 9 9
O43529 0.020461 0.289049 -0.343930 0.174158 0.030217 -0.091672 -0.013005 -0.117824 0.195129 0.128859 nan nan -0.187278 -0.355928 0.069547 0.283319 -0.091890 0.566172 -0.045165 nan nan -0.238252 -0.423734 nan 9 9
P10745 -0.236641 0.087747 0.229596 0.023283 -0.341177 0.350413 0.468128 0.489925 -0.073139 0.328449 nan nan 0.136523 -0.153853 0.597717 0.544391 -0.085925 0.161125 -0.066866 nan nan 0.688639 -0.246050 nan 9 9
A0A087X1V2 -0.070690 0.470731 0.574659 -0.243238 0.341055 0.035347 0.057184 0.003097 0.088347 -0.016905 nan nan 0.227076 0.448399 -0.181107 -0.345283 0.404375 0.241761 0.281900 nan nan 0.039602 -0.005971 nan 9 9
P08493;P08493-2 0.038081 -0.252940 -0.298695 -0.232150 -0.305122 0.167804 -0.635377 -0.703795 -0.351766 0.150211 nan nan 0.024427 -0.477192 -0.406176 -0.246588 -0.629097 0.360308 0.230852 nan nan -0.397982 -0.397982 nan 9 9
P50395 0.013608 -0.105694 0.303679 0.174813 -0.281000 -0.586062 0.327166 0.193138 0.152363 0.368766 nan nan 0.016074 -0.289078 0.147720 0.026398 0.192630 0.045322 0.119366 nan nan -0.756427 -0.524876 nan 9 9
Q9Y6C2 0.340254 0.305470 0.230686 -0.272410 0.246479 -0.132753 0.348677 0.614777 -0.561969 0.551991 nan nan 0.042545 0.009107 0.079803 0.126374 0.138402 -0.186103 0.595286 nan nan 0.017971 -0.127935 nan 9 9
Q14019 -0.344741 -0.003441 -0.122263 0.165256 0.366310 0.084577 -0.725812 -0.693759 0.525948 -0.022084 nan nan 0.367858 0.387061 0.064769 0.401079 -0.262616 0.256334 0.717644 nan nan -0.513253 -0.468898 nan 9 9
B1AJQ6;Q86Y82 0.036729 -0.269560 0.025041 0.003516 0.202369 -0.677791 0.200578 0.321117 0.511875 0.436190 nan nan 0.363410 -0.500231 0.081571 0.042638 0.194860 0.006824 -0.554849 nan nan -0.332403 -0.058363 nan 9 9
A0A1W2PQB1;H0Y755;M9MML0;P08637 -0.090323 0.654956 0.243391 -0.253373 0.606684 0.593555 0.460288 0.690784 0.595184 -0.139070 nan nan 0.331327 -0.730755 -0.081580 -0.266497 0.620620 0.540145 0.048839 nan nan -0.617375 0.448386 nan 10 10
O76070 -0.432052 -0.021578 0.053427 -0.215355 0.740689 -0.528293 -0.204646 0.162530 0.469000 0.054400 nan nan -0.278608 0.194958 -0.212317 -0.086282 -0.316203 -0.128095 -0.316757 nan nan -0.474129 0.337800 nan 10 10
P08670 -0.314275 -0.493394 -0.644232 -0.179390 0.305070 -0.555841 -0.501234 -0.180491 -0.411473 0.025687 nan nan 0.206552 0.312147 0.551632 0.469208 -0.209100 0.402038 0.299838 nan nan 0.028298 0.291405 nan 10 10
P00915 0.933351 0.171496 -0.423666 0.883700 -0.039080 0.946159 0.215571 0.399271 0.618757 0.705641 nan nan 0.933484 0.070995 0.081495 0.012603 0.135959 0.414364 0.496996 nan nan 0.359490 0.118343 nan 10 10
Q6UWH4;Q6UWH4-2 -0.588718 -0.387878 -0.480295 0.098882 -0.708704 -0.419446 -0.397136 0.151063 -0.079928 -0.371245 nan nan -0.426708 0.512708 -0.578150 -0.418792 0.059608 0.143652 -0.359636 nan nan -0.126561 -0.059018 nan 10 10
Q9NQS3;Q9NQS3-2;Q9NQS3-3 -0.424592 -0.012592 -0.162010 -0.139485 0.027785 -0.012216 -0.111027 -0.140219 0.069189 -0.172248 nan nan 0.202371 0.062999 0.035925 0.120460 0.150434 0.092650 0.300169 nan nan 0.156133 -0.174368 nan 11 11
P01742 0.375201 -0.115479 0.390960 -0.267428 -0.290895 0.157755 0.026400 0.271311 -0.587962 -0.094306 nan nan -0.393033 0.283397 0.676899 0.724786 -0.028253 0.594179 0.604933 nan nan -0.194256 0.578374 nan 11 11
P55774 0.103249 -0.114137 -0.269257 -0.308916 -0.042578 0.211110 -0.215141 -0.218578 -0.211760 -0.158074 nan nan -0.181675 0.090941 -0.183622 -0.001697 -0.564199 -0.506454 -0.141548 nan nan -0.584551 -0.551401 nan 11 11
P09172 0.179702 0.244764 -0.254351 0.271932 0.326035 0.116834 -0.255731 -0.201813 0.101990 0.246039 nan nan 0.011346 0.371446 -0.145854 0.238185 -0.083932 -0.148892 -0.287918 nan nan -0.112661 -0.563368 nan 12 12
A1L4H1 0.186316 -0.291064 -0.197674 0.309188 -0.228749 -0.201133 0.072585 -0.116746 -0.221594 0.008497 nan nan -0.326665 -0.099248 0.300334 -0.324173 0.233784 0.289843 0.763518 nan nan -0.031547 -0.042797 nan 12 12
Q9UFP1 0.121246 -0.409320 0.003399 -0.094306 -0.137641 -0.059138 0.028588 0.208374 0.056005 -0.173611 nan nan -0.123046 -0.065698 -0.266922 -0.405138 0.131999 0.076316 0.235877 nan nan 0.159000 0.376368 nan 12 12
Q96RW7;Q96RW7-2 -0.274538 -0.253965 -0.160067 -0.301343 -0.293977 0.086292 -0.398478 -0.352505 0.219858 -0.008263 nan nan 0.625088 0.195838 0.502826 0.258426 -0.086622 -0.017822 -0.347225 nan nan 0.378646 0.234101 nan 12 12
P53634 0.321988 0.017886 0.097579 -0.130860 -0.302778 0.268760 -0.101786 -0.152078 0.263806 0.352836 nan nan 0.180917 0.000572 0.333731 0.177133 0.047049 -0.016131 0.197959 nan nan -0.353595 0.259023 nan 12 12
Q9BUJ0 -0.037313 -0.142301 0.048086 -0.314651 -0.021587 -0.121874 -0.098921 0.027051 -0.265561 -0.119432 nan nan -0.111523 0.089643 -0.202398 -0.237412 0.552726 -0.107921 -0.352559 nan nan 0.049800 -0.265784 nan 12 12
Q9NZ08;Q9NZ08-2 0.032883 0.073821 -0.412193 0.433371 0.328127 -0.174515 0.299292 0.489257 0.572066 0.260532 nan nan 0.348860 0.025822 -0.357765 -0.168136 0.138144 0.521799 0.196087 nan nan 0.202766 0.753444 nan 12 12
P29966 0.767339 -0.635417 0.610070 0.078527 0.398689 0.409266 0.720862 0.172931 0.306123 0.322426 nan nan 0.323159 -0.454287 0.159322 0.397631 -0.046339 0.358982 0.220865 nan nan 0.350638 0.249010 nan 13 13
Q6PCB0 -0.010818 -0.181178 -0.065827 -0.187322 -0.236035 0.040856 -0.287276 -0.323757 -0.316441 0.132796 nan nan -0.011942 0.028386 -0.530040 -0.329471 -0.123151 -0.188112 0.278353 nan nan -0.023465 -0.115061 nan 14 14
B4DYV8;Q8WZ75;Q8WZ75-2;Q8WZ75-3 0.270149 0.185096 0.295313 -0.317889 -0.066418 0.269665 0.130729 0.093670 -0.515780 0.120208 nan nan -0.440024 0.199183 0.078349 0.291298 0.203131 0.133283 -0.068728 nan nan 0.085688 -0.033982 nan 14 14
O43852;O43852-3;O43852-5 0.193891 -0.265474 -0.111542 -0.218009 -0.263394 0.102351 0.295117 0.141397 0.293714 -0.062552 nan nan -0.155637 -0.293284 0.614990 0.394764 -0.144037 0.574898 0.754347 nan nan -0.110063 -0.377160 nan 15 15
P11597;P11597-2 0.091263 -0.346277 -0.256525 -0.244961 -0.276513 -0.072548 -0.071420 -0.029940 0.225605 -0.022021 nan nan -0.443310 0.016309 -0.211026 0.052557 0.409642 0.330721 0.111146 nan nan 0.400242 -0.067354 nan 16 16
Q96AQ6;Q96AQ6-2 0.074929 -0.102326 0.162998 -0.286627 -0.194049 0.027999 0.266461 0.377825 -0.047226 -0.184508 nan nan -0.159338 -0.314913 -0.158511 0.136220 -0.239093 -0.189966 -0.047163 nan nan 0.554486 -0.006715 nan 16 16
Q6ZMP0;Q6ZMP0-2 -0.257877 0.250811 -0.109976 0.132715 0.064003 0.338560 0.066672 -0.028928 -0.151054 -0.094501 nan nan -0.193571 0.130420 0.414953 -0.432438 -0.396096 -0.152902 -0.041783 nan nan 0.481989 0.031743 nan 16 16
A6XMH3;P01236;Q5I0G2 0.143374 -0.202718 -0.491132 -0.263686 -0.254333 0.089571 0.249922 0.236575 -0.184831 0.126855 nan nan -0.456084 -0.130336 0.183538 0.124415 -0.395066 -0.746360 -0.276474 nan nan 0.415223 0.098237 nan 17 17
Q13231;Q13231-3 0.028238 0.012765 0.139751 -0.248124 -0.009779 0.056620 -0.144117 -0.108428 0.011950 0.128016 nan nan 0.134116 -0.164687 -0.284982 -0.276036 -0.115379 0.135271 -0.118556 nan nan -0.001439 0.440431 nan 19 19

Error plot#

Hide code cell source

# Overall performance metrics on the test split ('observed' column is the
# ground truth), reduced to the top-N models in their display order.
metrics = pimmslearn.models.Metrics()
raw_metrics = metrics.add_metrics(pred_test[['observed', *TOP_N_ORDER]],
                                  key='test data')
test_metrics = pd.DataFrame(raw_metrics)[TOP_N_ORDER]
test_metrics
Selected as truth to compare to: observed
BPCA VAE DAE TRKNN RF
MSE 0.455 0.482 0.476 0.500 0.511
MAE 0.432 0.438 0.437 0.458 0.465
N 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000
prop 1.000 1.000 1.000 1.000 1.000

Hide code cell source

# All models were scored on the same number of test intensities; keep that
# single count for labelling the plots below.
unique_counts = test_metrics.loc['N'].unique()
n_in_comparison = int(unique_counts[0])
n_in_comparison
12600

Hide code cell source

# One-row frame with the selected metric per model, re-indexed by the data
# level (e.g. 'protein groups') for the bar plot below.
_to_plot = test_metrics.loc[METRIC].to_frame().T
_to_plot = _to_plot.set_axis([feature_names.name], axis=0)
_to_plot
BPCA VAE DAE TRKNN RF
protein groups 0.432 0.438 0.437 0.458 0.465

Hide code cell source

# Build a short architecture label (latent dim., hidden layers) per PIMMS
# model to annotate the bars; non-PIMMS comparisons get empty labels.
try:
    text = model_configs[["latent_dim", "hidden_layers"]].apply(
        build_text,
        axis=1)
except KeyError:
    # fix: warning message previously misspelled "comparison" as "comparsion"
    logger.warning("No PIMMS models in comparison. Using empty text")
    text = pd.Series('', index=model_configs.columns)

_to_plot.loc["text"] = text
_to_plot = _to_plot.fillna('')
_to_plot
BPCA VAE DAE TRKNN RF
protein groups 0.432 0.438 0.437 0.458 0.465
text LD: 10 HL: 64 LD: 10 HL: 64

Hide code cell source

# Bar plot of the test-set metric per model, annotated with bar heights and
# the model-architecture text prepared above.
fig, ax = plt.subplots(figsize=(4, 2))  # size of the plot can be adjusted
ylabel = (f"{METRIC} for {FEAT_NAME_DISPLAY}"
          f"\n({n_in_comparison:,} intensities)")
ax = _to_plot.loc[[feature_names.name]].plot.bar(
    rot=0,
    ylabel=ylabel,
    color=COLORS_TO_USE,
    ax=ax,
    width=.7)
ax = pimmslearn.plotting.add_height_to_barplot(ax, size=7)
ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=7)
ax.set_xticklabels([])
fname = args.out_figures / f'2_{group}_performance_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_performance_test.pdf
_images/d69c88f63ca2f130c544ad464fb3df70826da24acc40c9479ae320185f6bfb66.png

Hide code cell source

# Persist the plotted metric values in long format next to the figure.
dumps[fname.stem] = fname.with_suffix('.csv')
_to_plot_long = (_to_plot
                 .T
                 .rename({feature_names.name: 'metric_value'}, axis=1))
_to_plot_long['data level'] = feature_names.name
_to_plot_long = _to_plot_long.set_index('data level', append=True)
_to_plot_long.to_csv(fname.with_suffix('.csv'))

Plot error by median feature intensity#

Hide code cell source

# Errors binned by each feature's median training intensity: shows whether
# imputation quality depends on overall feature abundance.
pimmslearn.plotting.make_large_descriptors(7)
fig, ax = plt.subplots(figsize=(8, 2))

cols = [TARGET_COL] + TOP_N_ORDER
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred=pred_test[cols],
    feat_medians=data.train_X.median(),
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC,
    palette=COLORS_TO_USE
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
pimmslearn.plotting.make_large_descriptors(6)  # shrink fonts again for saving
fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
errors_binned
errors_binned
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: UserWarning: The palette list has more values (24) than needed (5), which may not be intended.
  sns.barplot(
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:105: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_000 A0A075B6P5;P01615 BPCA 0.467 19 912 19\n(N=912)
1 Sample_000 A0A075B6P5;P01615 VAE 0.190 19 912 19\n(N=912)
2 Sample_000 A0A075B6P5;P01615 DAE 0.012 19 912 19\n(N=912)
3 Sample_000 A0A075B6P5;P01615 TRKNN 0.422 19 912 19\n(N=912)
4 Sample_000 A0A075B6P5;P01615 RF 0.441 19 912 19\n(N=912)
... ... ... ... ... ... ... ...
62,995 Sample_209 Q9UGM5;Q9UGM5-2 BPCA 0.476 16 1,913 16\n(N=1,913)
62,996 Sample_209 Q9UGM5;Q9UGM5-2 VAE 0.474 16 1,913 16\n(N=1,913)
62,997 Sample_209 Q9UGM5;Q9UGM5-2 DAE 0.499 16 1,913 16\n(N=1,913)
62,998 Sample_209 Q9UGM5;Q9UGM5-2 TRKNN 0.442 16 1,913 16\n(N=1,913)
62,999 Sample_209 Q9UGM5;Q9UGM5-2 RF 0.449 16 1,913 16\n(N=1,913)

63000 rows × 7 columns

_images/b5dee89dde439e457be35ed1d737fb1534bbd221cd7e41f90694763743dc493c.png

Hide code cell source

# ! only used for reporting: per-bin mean error with confidence intervals
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned, feat_name=FEAT_NAME_DISPLAY, metric_name=METRIC)
excel_path = fname.with_suffix('.xlsx')
plotted.to_excel(excel_path, index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=63) BPCA 0.619 0.504 0.744
1 11\n(N=63) DAE 0.667 0.536 0.810
2 11\n(N=63) RF 0.598 0.470 0.737
3 11\n(N=63) TRKNN 0.589 0.482 0.698
4 11\n(N=63) VAE 0.579 0.464 0.698
... ... ... ... ... ...
85 29\n(N=12) BPCA 0.133 0.065 0.212
86 29\n(N=12) DAE 0.132 0.083 0.185
87 29\n(N=12) RF 0.282 0.190 0.387
88 29\n(N=12) TRKNN 0.230 0.158 0.317
89 29\n(N=12) VAE 0.212 0.122 0.316

90 rows × 5 columns

Hide code cell source

# Show the errors of the best-ranked model (first entry of ORDER_MODELS),
# indexed by intensity bin and sorted from smallest to largest error.
_bin_col = errors_binned.columns[-1]
(errors_binned
 .set_index(['model', _bin_col])
 .loc[ORDER_MODELS[0]]
 .sort_values(by=METRIC))
Sample ID protein groups MAE bin n_obs
intensity binned by median of protein groups
18\n(N=846) Sample_142 P09972 0.000 18 846
15\n(N=2,557) Sample_021 A0A0A0MT66 0.000 15 2,557
14\n(N=2,074) Sample_058 Q16853;Q16853-2 0.000 14 2,074
16\n(N=1,913) Sample_015 B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 0.000 16 1,913
15\n(N=2,557) Sample_079 A6NCT7;Q07092;Q07092-2 0.000 15 2,557
... ... ... ... ... ...
14\n(N=2,074) Sample_011 P11597;P11597-2 5.771 14 2,074
14\n(N=2,074) Sample_184 F8WD41;Q15166 6.195 14 2,074
17\n(N=1,393) Sample_108 P27824;P27824-2 6.482 17 1,393
14\n(N=2,074) Sample_091 F8WD41;Q15166 6.823 14 2,074
14\n(N=2,074) Sample_115 P17050 7.635 14 2,074

12600 rows × 5 columns

Custom model selection#

Hide code cell source

if SEL_MODELS:
    # Repeat the test-data performance comparison, restricted to the custom
    # model selection passed in via SEL_MODELS.
    metrics = pimmslearn.models.Metrics()
    test_metrics = metrics.add_metrics(
        pred_test[['observed', *SEL_MODELS]], key='test data')
    test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS]
    test_metrics

    # number of intensities every model was evaluated on (should be unique)
    n_in_comparison = int(test_metrics.loc['N'].unique()[0])
    n_in_comparison

    # one-row frame: selected metric per model, indexed by data level
    _to_plot = test_metrics.loc[METRIC].to_frame().T
    _to_plot.index = [feature_names.name]
    _to_plot

    try:
        # annotate PIMMS models with their architecture (latent dim, hidden layers)
        text = model_configs[["latent_dim", "hidden_layers"]].apply(
            build_text,
            axis=1)
    except KeyError:
        # fix: typo "comparsion" -> "comparison" in the warning message
        logger.warning("No PIMMS models in comparison. Using empty text")
        # NOTE(review): uses model_configs.columns as the index here —
        # presumably model_configs is oriented with models as columns in this
        # branch; confirm against where model_configs is built.
        text = pd.Series('', index=model_configs.columns)

    _to_plot.loc["text"] = text
    _to_plot = _to_plot.fillna('')
    _to_plot

    fig, ax = plt.subplots(figsize=(4, 2))
    ax = _to_plot.loc[[feature_names.name]].plot.bar(
        rot=0,
        ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)",
        # title=f'performance on test data (based on {n_in_comparison:,} measurements)',
        color=pimmslearn.plotting.defaults.assign_colors(
            list(k.upper() for k in SEL_MODELS)),
        ax=ax,
        width=.7)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    # bar heights plus architecture annotation on top of each bar
    ax = pimmslearn.plotting.add_height_to_barplot(ax, size=5)
    ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5)
    ax.set_xticklabels([])

    fname = args.out_figures / f'2_{group}_performance_test_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(fig, name=fname)

    # persist the plotted metrics in long format next to the figure
    dumps[fname.stem] = fname.with_suffix('.csv')
    _to_plot_long = _to_plot.T
    _to_plot_long = _to_plot_long.rename(
        {feature_names.name: 'metric_value'}, axis=1)
    _to_plot_long['data level'] = feature_names.name
    _to_plot_long = _to_plot_long.set_index('data level', append=True)
    _to_plot_long.to_csv(fname.with_suffix('.csv'))

Hide code cell source

# custom selection
if SEL_MODELS:
    pimmslearn.plotting.make_large_descriptors(7)
    fig, ax = plt.subplots(figsize=(8, 2))

    ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
        pred=pred_test[
            [TARGET_COL] + SEL_MODELS
        ],
        feat_medians=data.train_X.median(),
        ax=ax,
        metric_name=METRIC,
        feat_name=FEAT_NAME_DISPLAY,
        palette=pimmslearn.plotting.defaults.assign_colors(
            list(k.upper() for k in SEL_MODELS))
    )
    # ax.set_ylim(0, 1.5)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    # for text in ax.legend().get_texts():
    #     text.set_fontsize(6)
    fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(ax.get_figure(), name=fname)
    plt.show(fig)

    dumps[fname.stem] = fname.with_suffix('.csv')
    errors_binned.to_csv(fname.with_suffix('.csv'))
    pimmslearn.plotting.make_large_descriptors(6)
    # ax.xaxis.set_tick_params(rotation=0) # horizontal

    # ! only used for reporting
    plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
        errors=errors_binned,
        feat_name=FEAT_NAME_DISPLAY,
        metric_name=METRIC
    )
    plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
    display(plotted)

Error by intensity value, binned to integers#

  • number of observations in parentheses.

Hide code cell source

# Plot test-set errors binned by the integer part of the observed intensity
# for the top-N models; save the figure and register it for the report.
fig, ax = plt.subplots(figsize=(8, 2))
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_binned(
    pred_test[
        [TARGET_COL] + TOP_N_ORDER
    ],
    ax=ax,
    palette=TOP_N_COLOR_PALETTE,
    metric_name=METRIC,
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
fname = args.out_figures / f'2_{group}_test_errors_binned_by_int.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:50: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  ax = sns.barplot(
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf
_images/ce892a23d31761bc6a6c167cfc61a0c8421ad2fb70ebf6dcbb0807c3ac7ecdee.png

Hide code cell source

# Dump the binned errors alongside the figure and preview the first rows.
csv_path = fname.with_suffix('.csv')
dumps[fname.stem] = csv_path
errors_binned.to_csv(csv_path)
errors_binned.head()
Sample ID protein groups model MAE intensity bin
0 Sample_143 P02768 BPCA 0.065 30\n(N=2)
1 Sample_143 P02768 VAE 0.303 30\n(N=2)
2 Sample_143 P02768 DAE 0.205 30\n(N=2)
3 Sample_143 P02768 TRKNN 0.574 30\n(N=2)
4 Sample_143 P02768 RF 0.667 30\n(N=2)

Figures dumped to disk#

Hide code cell source

# mapping of figure stem -> saved path, collected over the notebook
figures
{'2_1_fake_na_val_test_splits': Path('runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png'),
 '2_1_pred_corr_val_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf'),
 '2_1_errors_binned_by_feat_median_val': Path('runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf'),
 '2_1_intensity_binned_top_4_models_test': Path('runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf'),
 '2_1_pred_corr_test_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf'),
 '2_1_pred_corr_test_per_feat': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf'),
 '2_1_performance_test': Path('runs/alzheimer_study/figures/2_1_performance_test.pdf'),
 '2_1_test_errors_binned_by_feat_medians': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf'),
 '2_1_test_errors_binned_by_int': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf')}

Hide code cell source

# mapping of dump stem -> saved CSV path, collected over the notebook
dumps
print("done")
done