Compare models#

  1. Load available configurations

  2. Load validation predictions

    • calculate absolute error

    • select top N for plotting by MAE from smallest (best) to largest (worst) (top N as specified, default 5)

    • correlation per sample, correlation per feature, correlation overall

    • MAE plots

  3. Load test data predictions

    • as for validation data

    • top N based on validation data

Hide code cell source

import logging
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from IPython.display import display

import pimmslearn
import pimmslearn.imputation
import pimmslearn.models
import pimmslearn.nb
from pimmslearn.analyzers import compare_predictions
from pimmslearn.io import datasplits
from pimmslearn.models.collect_dumps import collect, select_content

# Pandas display options for inspecting the wide result tables interactively.
pd.options.display.max_rows = 30
pd.options.display.min_rows = 10
pd.options.display.max_colwidth = 100

# Compact default figure size; enlarge font descriptors to 7pt for readability.
plt.rcParams.update({'figure.figsize': (4, 2)})
pimmslearn.plotting.make_large_descriptors(7)

# Notebook logger; silence fontTools' verbose output triggered by plotting.
logger = pimmslearn.logging.setup_nb_logger()
logging.getLogger('fontTools').setLevel(logging.WARNING)


def load_config_file(fname: Path, first_split='config_') -> 'tuple[str, dict]':
    """Load one model configuration YAML file and derive its model key.

    Parameters
    ----------
    fname : Path
        Path to a ``model_config_*.yaml`` file.
    first_split : str, optional
        Prefix handed to ``select_content`` to extract the model key from
        the file stem, by default ``'config_'``.

    Returns
    -------
    tuple[str, dict]
        The model key and the parsed YAML mapping (the original annotation
        claimed ``dict``, but a two-tuple is what ``collect`` consumes).
    """
    with open(fname) as f:
        loaded = yaml.safe_load(f)
    # str() keeps the string coercion the previous f-string wrapper performed.
    key = str(select_content(fname.stem, first_split=first_split))
    return key, loaded


def build_text(s):
    """Format a model config row's latent dim and hidden layers as a label.

    Returns e.g. ``'LD: 10 HL: 64,32'``; missing (NaN) entries are skipped.
    """
    parts = []
    latent = s["latent_dim"]
    if not np.isnan(latent):
        parts.append(f'LD: {int(latent)} ')
    try:
        layers = s["hidden_layers"]
        if len(layers):
            joined = ",".join(str(x) for x in layers)
            parts.append(f"HL: {joined}")
    except TypeError:
        # hidden_layers is NaN (not iterable) -> nothing to append
        pass
    return ''.join(parts)

Hide code cell source

# catch passed parameters
# Snapshot the current global names; pimmslearn.nb.get_params later diffs
# against this set to identify the papermill-injected parameters.
args = None
args = dict(globals()).keys()

Papermill script parameters:

# Papermill default parameters (overridden by the injected "Parameters" cell below).
# files and folders
# Datasplit folder with data for experiment
folder_experiment: str = 'runs/example'
folder_data: str = ''  # specify data directory if needed
file_format: str = 'csv'  # change default to pickled files
# Machine parsed metadata from rawfile workflow
fn_rawfile_metadata: str = 'data/dev_datasets/HeLa_6070/files_selected_metadata_N50.csv'
models: str = 'Median,CF,DAE,VAE'  # picked models to compare (comma separated)
sel_models: str = ''  # user defined comparison (comma separated)
# Restrict plotting to top N methods for imputation based on error of validation data, maximum 10
plot_to_n: int = 5
# NOTE(review): annotated `str` but defaults to None — effectively Optional[str]
feat_name_display: str = None  # display name for feature name in plural (e.g. 'protein groups')
save_agg_pred: bool = False  # save aggregated predictions of validation and test data
# Parameters
# Papermill-injected values for this run (Alzheimer study, all imputation methods).
fn_rawfile_metadata = "https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv"
folder_experiment = "runs/alzheimer_study"
models = "Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO"

Some argument transformations

Hide code cell source

# Resolve parameters: keep only names added to globals after the snapshot above.
args = pimmslearn.nb.get_params(args, globals=globals())
args
root - INFO     Removed from global namespace: folder_experiment
root - INFO     Removed from global namespace: folder_data
root - INFO     Removed from global namespace: file_format
root - INFO     Removed from global namespace: fn_rawfile_metadata
root - INFO     Removed from global namespace: models
root - INFO     Removed from global namespace: sel_models
root - INFO     Removed from global namespace: plot_to_n
root - INFO     Removed from global namespace: feat_name_display
root - INFO     Removed from global namespace: save_agg_pred
{'folder_experiment': 'runs/alzheimer_study',
 'folder_data': '',
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'sel_models': '',
 'plot_to_n': 5,
 'feat_name_display': None,
 'save_agg_pred': False}

Hide code cell source

# Build an argument namespace with derived output folders (figures, preds, ...).
args = pimmslearn.nb.args_from_dict(args)
args
{'data': Path('runs/alzheimer_study/data'),
 'feat_name_display': None,
 'file_format': 'csv',
 'fn_rawfile_metadata': 'https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv',
 'folder_data': '',
 'folder_experiment': Path('runs/alzheimer_study'),
 'models': 'Median,CF,DAE,VAE,KNN,KNN5,BPCA,COLMEDIAN,IMPSEQ,IMPSEQROB,IRM,KNN_IMPUTE,LLS,MINDET,MINIMUM,MINPROB,MLE,PI,QRILC,RF,ROWMEDIAN,SVDMETHOD,TRKNN,ZERO',
 'out_figures': Path('runs/alzheimer_study/figures'),
 'out_folder': Path('runs/alzheimer_study'),
 'out_metrics': Path('runs/alzheimer_study'),
 'out_models': Path('runs/alzheimer_study'),
 'out_preds': Path('runs/alzheimer_study/preds'),
 'plot_to_n': 5,
 'save_agg_pred': False,
 'sel_models': ''}

Hide code cell source

# Registries of produced artifacts, keyed by file stem: figures and data dumps.
figures = {}
dumps = {}

Hide code cell source

# Column holding the ground-truth (simulated missing) values
TARGET_COL = 'observed'
# Error metric used to rank models
METRIC = 'MAE'
MIN_FREQ = None  # no minimum feature-frequency filter applied
# Requested models: comma-separated string -> list
MODELS_PASSED = args.models.split(',')
MODELS = list(MODELS_PASSED)
FEAT_NAME_DISPLAY = args.feat_name_display
# Optional user-defined model subset for a second comparison
SEL_MODELS = args.sel_models.split(',') if args.sel_models else None

Hide code cell source

# list(sns.color_palette().as_hex()) # string representation of colors
# Cap the number of models shown in plots at 10 (readability / palette size).
if args.plot_to_n > 10:
    logger.warning("Set maximum of models to 10 (maximum)")
    args.overwrite_entry('plot_to_n', 10)

Hide code cell source

# Load train/val/test splits (long format) from the experiment's data folder.
data = datasplits.DataSplits.from_folder(
    args.data, file_format=args.file_format)
pimmslearn.io.datasplits - INFO     Loaded 'train_X' from file: runs/alzheimer_study/data/train_X.csv
pimmslearn.io.datasplits - INFO     Loaded 'val_y' from file: runs/alzheimer_study/data/val_y.csv
pimmslearn.io.datasplits - INFO     Loaded 'test_y' from file: runs/alzheimer_study/data/test_y.csv

Hide code cell source

# Side-by-side counts of simulated missing values per sample for both splits.
fig, axes = plt.subplots(1, 2, sharey=True, sharex=True)

pimmslearn.plotting.data.plot_observations(data.val_y.unstack(), ax=axes[0],
                                     title='Validation split', size=1, xlabel='')
pimmslearn.plotting.data.plot_observations(data.test_y.unstack(), ax=axes[1],
                                     title='Test split', size=1, xlabel='')
fig.suptitle("Simulated missing values per sample", size=8)
# hide axis and use only for common x label
fig.add_subplot(111, frameon=False)
plt.tick_params(labelcolor='none', which='both', top=False, bottom=False, left=False, right=False)
plt.xlabel(f'Samples ordered by identified {data.val_y.index.names[-1]}')
# Figure-group prefix used for all figure filenames in this notebook.
group = 1
fname = args.out_figures / f'2_{group}_fake_na_val_test_splits.png'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png
_images/93e0d29a12b03e8af2899dbb522653b6ae8fbb651be546a2927a1813a23e72b3.png

data completeness across entire data#

Hide code cell source

# load frequency of training features...
# needs to be pickle -> index.name needed
# Per-feature identification counts in the training data (Series named 'freq').
freq_feat = pimmslearn.io.datasplits.load_freq(args.data, file='freq_features.json')
freq_feat.head()  # training data
A0A024QZX5;A0A087X1N8;P35237                                                     197
A0A024R0T9;K7ER74;P02655                                                         208
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8   185
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                          208
A0A075B6H7                                                                        97
Name: freq, dtype: int64

Hide code cell source

# Proportion of samples in which each feature was identified; train_X is
# long-format here, so level 0 of its MultiIndex enumerates the samples.
prop = freq_feat / len(data.train_X.index.levels[0])
prop.sort_values().to_frame().plot(
    xlabel=f'{data.val_y.index.names[-1]}',
    ylabel='Proportion of identification in samples')
<Axes: xlabel='protein groups', ylabel='Proportion of identification in samples'>
_images/e0f465e3c6d5d36aa8b3e60537ee3f18584687e65343e7e1ce4e7aeb6c0e5f01.png

View training data in wide format

Hide code cell source

# Pivot all splits from long to wide (samples x features) format in place.
data.to_wide_format()
data.train_X
protein groups A0A024QZX5;A0A087X1N8;P35237 A0A024R0T9;K7ER74;P02655 A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 A0A075B6H7 A0A075B6H9 A0A075B6I0 A0A075B6I1 A0A075B6I6 A0A075B6I9 ... Q9Y653;Q9Y653-2;Q9Y653-3 Q9Y696 Q9Y6C2 Q9Y6N6 Q9Y6N7;Q9Y6N7-2;Q9Y6N7-4 Q9Y6R7 Q9Y6X5 Q9Y6Y8;Q9Y6Y8-2 Q9Y6Y9 S4R3U6
Sample ID
Sample_000 15.912 16.852 15.570 16.481 17.301 20.246 16.764 17.584 16.988 20.054 ... 16.012 15.178 NaN 15.050 16.842 NaN NaN 19.563 NaN 12.805
Sample_001 NaN 16.874 15.519 16.387 NaN 19.941 18.786 17.144 NaN 19.067 ... 15.528 15.576 NaN 14.833 16.597 20.299 15.556 19.386 13.970 12.442
Sample_002 16.111 NaN 15.935 16.416 18.175 19.251 16.832 15.671 17.012 18.569 ... 15.229 14.728 13.757 15.118 17.440 19.598 15.735 20.447 12.636 12.505
Sample_003 16.107 17.032 15.802 16.979 15.963 19.628 17.852 18.877 14.182 18.985 ... 15.495 14.590 14.682 15.140 17.356 19.429 NaN 20.216 NaN 12.445
Sample_004 15.603 15.331 15.375 16.679 NaN 20.450 18.682 17.081 14.140 19.686 ... 14.757 NaN NaN 15.256 17.075 19.582 15.328 NaN 13.145 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_205 15.682 16.886 14.910 16.482 NaN 17.705 17.039 NaN 16.413 19.102 ... NaN 15.684 14.236 15.415 17.551 17.922 16.340 19.928 12.929 NaN
Sample_206 15.798 17.554 15.600 15.938 NaN 18.154 18.152 16.503 16.860 18.538 ... 15.422 16.106 NaN 15.345 17.084 18.708 NaN 19.433 NaN NaN
Sample_207 15.739 NaN 15.469 16.898 NaN 18.636 17.950 16.321 16.401 18.849 ... 15.808 16.098 14.403 15.715 NaN 18.725 16.138 19.599 13.637 11.174
Sample_208 15.477 16.779 14.995 16.132 NaN 14.908 NaN NaN 16.119 18.368 ... 15.157 16.712 NaN 14.640 16.533 19.411 15.807 19.545 NaN NaN
Sample_209 NaN 17.261 15.175 16.235 NaN 17.893 17.744 16.371 15.780 18.806 ... 15.237 15.652 15.211 14.205 16.749 19.275 15.732 19.577 11.042 11.791

210 rows × 1421 columns

Number of samples and features:

Hide code cell source

# Wide-format shape: rows are samples, columns are features.
N_SAMPLES, M_FEAT = data.train_X.shape
print(f"N samples: {N_SAMPLES:,d}, M features: {M_FEAT}")
N samples: 210, M features: 1421

Collect outputs in excel file:

Hide code cell source

# Workbook collecting the performance summary sheets written throughout the run.
fname = args.folder_experiment / '01_2_performance_summary.xlsx'
dumps[fname.stem] = fname
# NOTE(review): writer stays open across several cells and receives sheets
# incrementally; presumably closed/saved further down — confirm.
writer = pd.ExcelWriter(fname)
print(f"Saving to: {fname}")
Saving to: runs/alzheimer_study/01_2_performance_summary.xlsx

Model specifications#

  • used for bar plot annotations

Hide code cell source

# model_key could be used as key from config file
# ? load only specified configs?
# ? case: no config file available?
# Collect every dumped model configuration ('model_config_*.yaml') into one
# DataFrame (one row per model, indexed by 'id') and persist it to the workbook.
all_configs = collect(
    paths=(fname for fname in args.out_models.iterdir()
           if fname.suffix == '.yaml'
           and 'model_config' in fname.name),
    load_fn=load_config_file
)
model_configs = pd.DataFrame(all_configs).set_index('id')
model_configs.T.to_excel(writer, sheet_name='model_params')
model_configs.T
id VAE KNN5 DAE KNN Median CF
M 1421 1421 1421 1421 1421 1421
batch_size 64.000 64.000 64.000 64.000 NaN 1,024.000
cuda False True False True NaN False
data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data runs/alzheimer_study/data
epoch_trained 107.000 NaN 179.000 NaN NaN 13.000
epochs_max 300.000 50.000 300.000 50.000 NaN 100.000
file_format csv csv csv csv csv csv
fn_rawfile_metadata https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv https://raw.githubusercontent.com/RasmussenLab/njab/HEAD/docs/tutorial/data/alzheimer/meta.csv
folder_data NaN
folder_experiment runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
hidden_layers [64] NaN [64] NaN NaN NaN
latent_dim 10.000 NaN 10.000 NaN NaN 50.000
meta_cat_col NaN NaN NaN NaN NaN NaN
meta_date_col NaN NaN NaN NaN NaN NaN
model VAE KNN DAE KNN Median CF
model_key VAE KNN5 DAE KNN Median CF
n_params 277998 1 184983 1 1421 83283
out_figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures runs/alzheimer_study/figures
out_folder runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_metrics runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_models runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study runs/alzheimer_study
out_preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds runs/alzheimer_study/preds
patience 50.000 NaN 25.000 NaN NaN 1.000
sample_idx_position 0 0 0 0 0 0
save_pred_real_na True True True True True True
force_train NaN True NaN True NaN NaN
neighbors NaN 5.000 NaN 3.000 NaN NaN
pred_test_Median NaN NaN NaN NaN runs/alzheimer_study/preds/pred_test_Median.csv NaN
pred_val_Median NaN NaN NaN NaN runs/alzheimer_study/preds/pred_val_Median.csv NaN

Set Feature name (columns are features, rows are samples)

Hide code cell source

# index name
# Align the frequency Series' index name with the data's feature axis name.
freq_feat.index.name = data.train_X.columns.name
# sample index name
sample_index_name = data.train_X.index.name

Load predictions on validation and test data split#

Validation data#

  • set top N models to plot based on validation data split

Hide code cell source

# Load all models' validation predictions into one wide frame; the observed
# (target) column is shared across the per-model prediction files.
pred_val = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='val',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL])
SAMPLE_ID, FEAT_NAME = pred_val.index.names
# Fall back to the index level name when no display name was provided.
if not FEAT_NAME_DISPLAY:
    FEAT_NAME_DISPLAY = FEAT_NAME
pred_val[MODELS]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 15.752 15.383 15.590 15.716 15.427 15.449 15.469 16.800 NaN 58.276 ... 7.068 11.759 2,513.638 13.686 14.684 15.652 15.752 17.206 15.700 0
Sample_050 Q9Y287 17.221 16.713 16.721 16.778 17.776 17.314 16.453 17.288 NaN 16.993 ... 7.068 11.661 19.829 12.756 15.985 16.844 17.221 17.807 16.738 0
Sample_107 Q8N475;Q8N475-2 14.846 14.252 14.021 14.275 14.150 14.355 13.110 17.187 NaN -78.084 ... 7.068 11.758 2,582.130 12.748 13.426 14.501 14.846 17.434 13.776 0
Sample_199 P06307 18.973 19.205 18.697 19.127 19.247 19.385 19.639 16.711 NaN 102.283 ... 7.068 12.699 2,483.120 12.973 17.354 19.224 18.973 17.111 19.015 0
Sample_067 Q5VUB5 14.726 14.855 15.179 14.968 15.232 15.040 15.465 16.743 NaN -36.470 ... 7.068 11.790 2,569.564 13.254 12.685 14.943 14.726 17.031 14.699 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.918 23.224 23.040 22.869 22.884 22.899 22.994 17.042 NaN 104.484 ... 7.068 12.253 2,634.108 12.200 22.290 22.864 22.918 17.330 22.872 0
Sample_002 A0A0A0MT36 15.877 15.392 15.755 16.131 16.857 16.142 15.882 16.792 NaN -18.408 ... 7.068 12.826 2,448.503 11.867 13.208 15.572 15.877 16.879 15.671 0
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 16.278 15.616 15.822 15.699 15.840 15.574 15.406 17.032 NaN -27.128 ... 7.068 11.813 2,487.550 14.168 14.056 15.662 16.278 17.215 15.574 0
Sample_182 Q8NFT8 13.995 13.650 13.557 13.283 13.685 13.480 14.322 16.764 NaN -12.434 ... 7.068 11.421 2,426.191 11.618 10.753 13.788 13.995 17.125 14.518 0
Sample_123 Q16853;Q16853-2 14.849 14.488 14.539 14.566 14.612 14.627 14.582 16.686 NaN 78.799 ... 7.068 11.676 2,461.806 11.338 14.021 14.730 14.849 16.981 14.485 0

12600 rows × 24 columns

Describe absolute error

Hide code cell source

# Signed residuals (prediction - observed) per model; despite the heading,
# the absolute value is only taken later where MAE is computed.
errors_val = (pred_val
              .drop(TARGET_COL, axis=1)
              .sub(pred_val[TARGET_COL], axis=0)
              [MODELS])
errors_val  # over all samples and all features
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 1.122 0.753 0.959 1.085 0.797 0.819 0.839 2.169 NaN 43.645 ... -7.562 -2.871 2,499.008 -0.945 0.054 1.021 1.122 2.575 1.070 -14.630
Sample_050 Q9Y287 1.466 0.958 0.966 1.023 2.021 1.559 0.698 1.533 NaN 1.238 ... -8.687 -4.094 4.074 -2.999 0.230 1.089 1.466 2.052 0.983 -15.755
Sample_107 Q8N475;Q8N475-2 -0.183 -0.777 -1.009 -0.754 -0.880 -0.674 -1.919 2.157 NaN -93.113 ... -7.961 -3.272 2,567.100 -2.281 -1.603 -0.529 -0.183 2.405 -1.253 -15.029
Sample_199 P06307 -0.403 -0.171 -0.679 -0.249 -0.129 0.009 0.263 -2.665 NaN 82.907 ... -12.308 -6.676 2,463.744 -6.403 -2.021 -0.151 -0.403 -2.265 -0.360 -19.376
Sample_067 Q5VUB5 -0.583 -0.454 -0.130 -0.341 -0.077 -0.269 0.156 1.434 NaN -51.779 ... -8.241 -3.519 2,554.255 -2.055 -2.624 -0.365 -0.583 1.723 -0.610 -15.309
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 0.096 0.402 0.218 0.047 0.062 0.077 0.171 -5.781 NaN 81.662 ... -15.754 -10.569 2,611.285 -10.622 -0.532 0.042 0.096 -5.493 0.050 -22.822
Sample_002 A0A0A0MT36 -2.288 -2.773 -2.410 -2.034 -1.308 -2.023 -2.283 -1.373 NaN -36.573 ... -11.097 -5.339 2,430.338 -6.298 -4.957 -2.593 -2.288 -1.286 -2.494 -18.165
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 0.753 0.091 0.297 0.173 0.314 0.049 -0.120 1.507 NaN -42.653 ... -8.457 -3.712 2,472.025 -1.357 -1.470 0.137 0.753 1.690 0.049 -15.525
Sample_182 Q8NFT8 -0.383 -0.728 -0.822 -1.096 -0.694 -0.899 -0.057 2.385 NaN -26.813 ... -7.311 -2.958 2,411.812 -2.761 -3.625 -0.590 -0.383 2.746 0.139 -14.379
Sample_123 Q16853;Q16853-2 0.345 -0.017 0.035 0.062 0.108 0.123 0.077 2.181 NaN 64.295 ... -7.436 -2.828 2,447.302 -3.166 -0.483 0.226 0.345 2.477 -0.019 -14.504

12600 rows × 24 columns

Select top N for plotting and set colors#

Hide code cell source

# Rank models by validation MAE, best (smallest mean absolute error) first.
_mae_per_model = errors_val.abs().mean().sort_values()
ORDER_MODELS = list(_mae_per_model.index)
ORDER_MODELS
['BPCA',
 'VAE',
 'DAE',
 'TRKNN',
 'RF',
 'CF',
 'KNN5',
 'KNN',
 'KNN_IMPUTE',
 'IRM',
 'ROWMEDIAN',
 'Median',
 'LLS',
 'QRILC',
 'COLMEDIAN',
 'SVDMETHOD',
 'PI',
 'MINDET',
 'MINPROB',
 'MINIMUM',
 'ZERO',
 'IMPSEQROB',
 'MLE',
 'IMPSEQ']

Hide code cell source

# Reorder columns: observed first, then models ranked by validation MAE.
pred_val = pred_val[[TARGET_COL] + ORDER_MODELS]
if args.save_agg_pred:
    fname = args.folder_experiment / '01_2_agg_pred_val.csv'
    dumps[fname.stem] = fname
    pred_val.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_val
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID protein groups
Sample_158 Q9UN70;Q9UN70-2 14.630 15.469 15.716 15.590 15.700 15.652 15.383 15.449 15.427 15.937 ... 16.800 17.206 13.686 11.916 11.759 7.068 0 58.276 2,513.638 NaN
Sample_050 Q9Y287 15.755 16.453 16.778 16.721 16.738 16.844 16.713 17.314 17.776 16.961 ... 17.288 17.807 12.756 12.900 11.661 7.068 0 16.993 19.829 NaN
Sample_107 Q8N475;Q8N475-2 15.029 13.110 14.275 14.021 13.776 14.501 14.252 14.355 14.150 15.437 ... 17.187 17.434 12.748 12.313 11.758 7.068 0 -78.084 2,582.130 NaN
Sample_199 P06307 19.376 19.639 19.127 18.697 19.015 19.224 19.205 19.385 19.247 18.861 ... 16.711 17.111 12.973 12.285 12.699 7.068 0 102.283 2,483.120 NaN
Sample_067 Q5VUB5 15.309 15.465 14.968 15.179 14.699 14.943 14.855 15.040 15.232 15.079 ... 16.743 17.031 13.254 11.827 11.790 7.068 0 -36.470 2,569.564 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_111 F6SYF8;Q9UBP4 22.822 22.994 22.869 23.040 22.872 22.864 23.224 22.899 22.884 22.837 ... 17.042 17.330 12.200 12.161 12.253 7.068 0 104.484 2,634.108 NaN
Sample_002 A0A0A0MT36 18.165 15.882 16.131 15.755 15.671 15.572 15.392 16.142 16.857 15.446 ... 16.792 16.879 11.867 12.586 12.826 7.068 0 -18.408 2,448.503 NaN
Sample_049 Q8WY21;Q8WY21-2;Q8WY21-3;Q8WY21-4 15.525 15.406 15.699 15.822 15.574 15.662 15.616 15.574 15.840 15.995 ... 17.032 17.215 14.168 12.352 11.813 7.068 0 -27.128 2,487.550 NaN
Sample_182 Q8NFT8 14.379 14.322 13.283 13.557 14.518 13.788 13.650 13.480 13.685 14.675 ... 16.764 17.125 11.618 12.504 11.421 7.068 0 -12.434 2,426.191 NaN
Sample_123 Q16853;Q16853-2 14.504 14.582 14.566 14.539 14.485 14.730 14.488 14.627 14.612 14.824 ... 16.686 16.981 11.338 12.689 11.676 7.068 0 78.799 2,461.806 NaN

12600 rows × 25 columns

Hide code cell source

# Distribution statistics of the absolute validation errors per model,
# ordered by MAE; persisted to the performance summary workbook.
mae_stats_ordered_val = errors_val.abs().describe()[ORDER_MODELS]
mae_stats_ordered_val.to_excel(writer, sheet_name='mae_stats_ordered_val', float_format='%.5f')
mae_stats_ordered_val.T
count mean std min 25% 50% 75% max
BPCA 12,600.000 0.422 0.501 0.000 0.119 0.269 0.534 9.370
VAE 12,600.000 0.430 0.522 0.000 0.120 0.273 0.545 9.931
DAE 12,600.000 0.431 0.521 0.000 0.120 0.270 0.541 9.387
TRKNN 12,600.000 0.450 0.516 0.000 0.132 0.295 0.569 7.975
RF 12,600.000 0.461 0.531 0.000 0.134 0.304 0.587 8.970
CF 12,600.000 0.463 0.511 0.000 0.141 0.313 0.596 6.038
KNN5 12,600.000 0.467 0.546 0.000 0.135 0.305 0.594 10.231
KNN 12,600.000 0.481 0.565 0.000 0.138 0.310 0.618 10.502
KNN_IMPUTE 12,600.000 0.554 0.668 0.000 0.164 0.359 0.692 7.550
IRM 12,600.000 0.588 0.637 0.000 0.176 0.396 0.767 7.953
ROWMEDIAN 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
Median 12,600.000 0.598 0.639 0.000 0.189 0.419 0.778 9.014
LLS 12,600.000 1.329 54.974 0.000 0.151 0.343 0.662 4,842.571
QRILC 12,600.000 1.639 1.262 0.000 0.831 1.354 2.065 15.188
COLMEDIAN 12,600.000 2.210 1.634 0.000 0.947 1.972 3.094 12.944
SVDMETHOD 12,600.000 2.309 1.635 0.000 1.027 2.091 3.251 12.624
PI 12,600.000 3.801 2.643 0.000 1.763 3.365 5.342 18.083
MINDET 12,600.000 4.108 2.650 0.001 2.089 3.678 5.665 17.920
MINPROB 12,600.000 4.126 2.687 0.000 2.081 3.715 5.712 18.426
MINIMUM 12,600.000 9.272 2.717 0.373 7.327 8.890 10.863 22.773
ZERO 12,600.000 16.340 2.717 6.695 14.395 15.958 17.931 29.841
IMPSEQROB 12,600.000 333.478 793.700 0.002 12.282 33.864 87.298 2,869.299
MLE 12,600.000 2,172.384 865.925 0.009 2,435.415 2,495.362 2,552.718 2,873.681
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Some models have fixed colors; the remaining ones are assigned colors at random

Note

  1. The order of “new” models is important for the color assignment.

  2. User defined model keys for the same model with two configuration will yield different colors.

Hide code cell source

# Assign colors by upper-cased model key; per the note above, the order of
# previously unseen models influences which colors they receive.
COLORS_TO_USE = pimmslearn.plotting.defaults.assign_colors(list(k.upper() for k in ORDER_MODELS))
pimmslearn.plotting.defaults.ModelColorVisualizer(ORDER_MODELS, COLORS_TO_USE)
pimmslearn.plotting.defaults - INFO     Reused some colors!
BPCAVAEDAETRKNNRFCFKNN5KNNKNN_IMPUTEIRMROWMEDIANMedianLLSQRILCCOLMEDIANSVDMETHODPIMINDETMINPROBMINIMUMZEROIMPSEQROBMLEIMPSEQ

Hide code cell source

# Best N models (by validation MAE) to show in the detail plots, paired with
# the colors assigned above.
TOP_N_ORDER = ORDER_MODELS[:args.plot_to_n]
TOP_N_COLOR_PALETTE = dict(zip(TOP_N_ORDER, COLORS_TO_USE))
TOP_N_ORDER
['BPCA', 'VAE', 'DAE', 'TRKNN', 'RF']

Correlation per sample#

Hide code cell source

# Per-sample correlation between observed values and each model's predictions
# (row TARGET_COL of the per-sample correlation matrix), one column per model.
corr_per_sample_val = (pred_val
                       .groupby(sample_index_name)
                       .apply(
                           lambda df: df.corr().loc[TARGET_COL]
                       )[ORDER_MODELS])

# Floor the smallest correlation to one decimal for the boxplot y-limit.
min_corr = int(corr_per_sample_val.min().min() * 10) / 10
kwargs = dict(ylim=(min_corr, 1), rot=90,
              #     boxprops=dict(linewidth=1.5),
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model pred. per sample on validation data',
              ylabel='correlation per sample')
ax = corr_per_sample_val[TOP_N_ORDER].plot.box(**kwargs)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

# Dump summary, all correlations, and the plotted subset to one workbook.
fname = args.out_figures / f'2_{group}_pred_corr_val_per_sample.xlsx'
dumps[fname.stem] = fname
with pd.ExcelWriter(fname) as w:
    corr_per_sample_val.describe().to_excel(w, sheet_name='summary')
    corr_per_sample_val.to_excel(w, sheet_name='correlations')
    corr_per_sample_val[TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf
_images/1c647728907dba96dbf958f8118b79118290b59630d83f71d767f4fb3d4ba8a6.png

Identify samples whose correlation falls below the lower whisker of the boxplot for any of the top models

Hide code cell source

# Smallest lower-whisker value across the top-N models' per-sample correlations.
# ('treshold' spelling kept: it is a module-level name possibly used elsewhere.)
treshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_val[TOP_N_ORDER]).min()
# Samples where any top model falls below that threshold; highlight the worst.
mask = (corr_per_sample_val[TOP_N_ORDER] < treshold).any(axis=1)
corr_per_sample_val.loc[mask].style.highlight_min(
    axis=1) if mask.sum() else 'Nothing to display'
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
Sample ID                                                
Sample_010 0.948707 0.932085 0.950477 0.944645 0.911789 0.942505 0.940594 0.946115 0.932235 0.932518 0.869102 0.869102 0.913668 0.833183 nan 0.065802 0.190694 nan 0.099473 nan nan 0.589151 nan nan
Sample_018 0.968582 0.940162 0.914499 0.953573 0.926834 0.958784 0.925531 0.938822 0.939383 0.952858 0.908717 0.908717 0.936909 0.870506 nan 0.161907 -0.040653 nan -0.212436 nan nan 0.375658 nan nan
Sample_033 0.960627 0.953893 0.963185 0.954568 0.923301 0.940668 0.936031 0.928838 0.744119 0.927209 0.814959 0.814959 0.949288 0.719028 nan 0.227900 0.280716 nan 0.035866 nan nan 0.049356 nan nan
Sample_054 0.932254 0.931470 0.929426 0.910271 0.932313 0.923676 0.936915 0.925876 0.905679 0.913765 0.915748 0.915748 0.929264 0.848070 nan 0.190649 -0.042870 nan 0.100141 nan nan 0.836204 nan nan
Sample_071 0.887866 0.910603 0.903808 0.888162 0.896781 0.896309 0.901240 0.895286 0.880453 0.865003 0.885806 0.885806 0.899799 0.875124 nan 0.178303 -0.163426 nan -0.068125 nan nan 0.364226 nan nan
Sample_073 0.930349 0.921061 0.920234 0.919876 0.910779 0.923969 0.933555 0.950641 0.916774 0.901773 0.900178 0.900178 0.909057 0.896334 nan -0.017963 -0.149313 nan 0.164556 nan nan 0.356937 nan nan
Sample_095 0.940942 0.924516 0.923287 0.927289 0.927013 0.930142 0.924950 0.930902 0.909714 0.913905 0.878167 0.878167 0.917350 0.819837 nan -0.120269 0.084754 nan -0.175657 nan nan 0.419195 nan nan
Sample_110 0.955556 0.930827 0.927977 0.959306 0.952825 0.958437 0.949869 0.945613 0.857312 0.924310 0.933976 0.933976 0.940449 0.758540 nan 0.159307 -0.029723 nan -0.005979 nan nan 0.522425 nan nan
Sample_122 0.951246 0.950207 0.921719 0.963923 0.955631 0.960414 0.963846 0.964760 0.920077 0.937975 0.943311 0.943311 0.956290 0.876090 nan -0.062304 -0.097474 nan 0.164724 nan nan 0.439603 nan nan
Sample_133 0.919483 0.927948 0.929465 0.928251 0.930633 0.911731 0.903483 0.903370 0.885348 0.878925 0.899233 0.899233 0.881238 0.847028 nan 0.219841 0.029113 nan 0.196451 nan nan 0.409126 nan nan
Sample_139 0.927681 0.939145 0.926105 0.957367 0.933952 0.931575 0.912868 0.901552 0.878475 0.891290 0.907333 0.907333 0.928867 0.805487 nan 0.156894 -0.044552 nan 0.053371 nan nan 0.554137 nan nan
Sample_150 0.950334 0.889533 0.900840 0.945063 0.927958 0.936687 0.885565 0.868275 0.930981 0.907849 0.892997 0.892997 0.940619 0.779208 nan 0.166841 0.066741 nan -0.121078 nan nan 0.335988 nan nan
Sample_171 0.924707 0.916230 0.913235 0.916959 0.870878 0.935293 0.902581 0.906699 0.884571 0.881019 0.875433 0.875433 -0.090619 0.888167 nan -0.004823 -0.164133 nan 0.002303 nan nan 0.302006 nan nan
Sample_173 0.916627 0.941246 0.951035 0.932711 0.939035 0.959491 0.939783 0.940274 0.918589 0.916299 0.925428 0.925428 0.926916 0.959278 nan 0.059663 0.287064 nan -0.177585 nan nan 0.334436 nan nan
Sample_174 0.970316 0.866285 0.872082 0.967356 0.883479 0.941546 0.854645 0.846532 0.920737 0.920759 0.887409 0.887409 0.972096 0.795628 nan 0.306279 0.005442 nan -0.012891 nan nan 0.357612 nan nan
Sample_181 0.964199 0.927904 0.956800 0.945214 0.922909 0.942731 0.940761 0.936048 0.909140 0.899569 0.861266 0.861266 0.893600 0.606519 nan -0.013702 0.143908 nan -0.066681 nan nan 0.530054 -0.103146 nan
Sample_198 0.914339 0.944860 0.939021 0.932612 0.943595 0.936386 0.955742 0.947627 0.936142 0.946119 0.956493 0.956493 0.924497 0.931405 nan 0.097862 -0.131256 nan -0.113805 nan nan 0.481999 nan nan

Error plot#

Hide code cell source

# Inspect rows where any model's absolute error exceeds the cutoff.
c_error_min = 4.5
mask = (errors_val[MODELS].abs() > c_error_min).any(axis=1)
errors_val.loc[mask].sort_index(level=1).head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINIMUM MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO
Sample ID protein groups
Sample_012 A0A024QZX5;A0A087X1N8;P35237 -0.324 -0.161 -0.263 -0.279 -0.246 -0.416 -0.140 0.856 NaN -53.608 ... -8.881 -3.987 0.632 -2.230 -1.588 -0.293 -0.324 1.241 -0.318 -15.949
Sample_017 A0A024QZX5;A0A087X1N8;P35237 0.347 0.287 0.409 0.332 -0.093 -0.022 0.251 1.658 NaN 26.724 ... -8.211 -3.184 1.228 -2.086 -1.311 0.255 0.347 2.214 0.305 -15.279
Sample_050 A0A024QZX5;A0A087X1N8;P35237 0.544 0.146 0.086 0.092 0.024 -0.102 0.178 2.207 NaN 0.348 ... -8.013 -2.146 3.294 -2.553 -0.212 -0.085 0.544 2.691 0.238 -15.081
Sample_102 A0A024QZX5;A0A087X1N8;P35237 -0.029 -0.084 -0.152 -0.096 0.030 0.067 -0.107 0.942 NaN 19.277 ... -8.586 -3.708 1.609 -2.349 -0.897 -0.138 -0.029 1.168 -0.065 -15.654
Sample_109 A0A024QZX5;A0A087X1N8;P35237 0.343 -0.348 -0.096 -0.109 -0.179 -0.004 -0.263 1.518 NaN -28.795 ... -8.215 -3.515 -2.077 -2.497 -0.340 -0.021 0.343 1.968 -0.012 -15.283

5 rows × 24 columns

Hide code cell source

# Mean absolute error per feature, annotated with each feature's training
# frequency and sorted from rarest to most frequent feature.
errors_val = (errors_val
              .abs()
              .groupby(freq_feat.index.name)
              .mean()
              .join(freq_feat)
              .sort_values(by=freq_feat.name, ascending=True))
errors_val.head()
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
Q9Y281;Q9Y281-3 0.415 0.283 0.258 0.363 0.325 0.285 0.266 4.078 NaN 10.519 ... 0.546 2,473.194 0.570 1.274 0.376 0.415 4.472 0.307 12.573 52
K7EPJ5;O60291;O60291-2;O60291-3;O60291-4 0.331 0.568 0.375 0.391 0.281 0.385 0.387 3.029 NaN 13.344 ... 1.243 2,525.930 0.866 2.143 0.287 0.331 3.452 0.257 13.857 52
B1AJQ6;Q86Y82 1.082 0.902 0.887 0.825 0.482 1.226 0.674 3.367 NaN 5.026 ... 1.830 2,441.128 0.048 0.599 1.169 1.082 3.926 0.900 13.380 52
P69892 0.872 1.471 1.728 1.318 1.734 1.286 1.360 1.980 NaN 18.023 ... 2.506 2,532.051 2.568 7.668 1.411 0.872 2.320 0.966 14.768 53
A2RU67 0.689 0.545 0.480 0.496 0.503 0.462 0.539 4.495 NaN 15.116 ... 1.108 1,998.072 1.220 1.398 0.571 0.689 4.870 0.462 12.437 53

5 rows × 25 columns

Hide code cell source

# Summary statistics of the per-feature MAEs per model ("mean of means").
errors_val.describe()[ORDER_MODELS].T  # mean of means
count mean std min 25% 50% 75% max
BPCA 1,419.000 0.408 0.306 0.017 0.222 0.320 0.494 4.195
VAE 1,419.000 0.419 0.321 0.014 0.229 0.333 0.490 3.708
DAE 1,419.000 0.420 0.325 0.028 0.227 0.325 0.496 3.152
TRKNN 1,419.000 0.437 0.309 0.000 0.241 0.349 0.526 3.647
RF 1,419.000 0.448 0.318 0.051 0.253 0.361 0.538 3.415
CF 1,419.000 0.450 0.303 0.028 0.260 0.368 0.532 3.126
KNN5 1,419.000 0.455 0.322 0.039 0.256 0.369 0.540 3.634
KNN 1,419.000 0.468 0.333 0.012 0.267 0.375 0.549 3.693
KNN_IMPUTE 1,419.000 0.531 0.378 0.063 0.296 0.424 0.636 3.430
IRM 1,419.000 0.555 0.372 0.030 0.311 0.449 0.674 3.476
ROWMEDIAN 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
Median 1,419.000 0.580 0.359 0.094 0.351 0.487 0.691 4.171
LLS 1,419.000 1.088 19.029 0.023 0.279 0.408 0.596 706.018
QRILC 1,419.000 1.611 0.868 0.319 1.041 1.369 1.885 7.668
COLMEDIAN 1,419.000 2.071 1.509 0.038 0.916 1.738 2.812 12.631
SVDMETHOD 1,419.000 2.136 1.467 0.149 0.976 1.893 2.905 12.211
PI 1,419.000 4.109 2.451 0.048 2.268 3.644 5.458 16.798
MINDET 1,419.000 4.438 2.493 0.374 2.622 4.032 5.828 17.100
MINPROB 1,419.000 4.449 2.486 0.546 2.606 3.992 5.875 17.656
MINIMUM 1,419.000 9.620 2.542 3.842 7.854 9.231 11.051 22.371
ZERO 1,419.000 16.688 2.542 10.910 14.922 16.299 18.119 29.439
IMPSEQROB 1,419.000 443.657 892.834 0.830 23.477 43.842 100.325 2,633.136
MLE 1,419.000 2,171.007 331.079 1.453 1,992.846 2,214.845 2,487.619 2,683.431
IMPSEQ 0.000 NaN NaN NaN NaN NaN NaN NaN

Hide code cell source

# Show features where any top-N model has an average error of at least
# ``c_avg_error`` intensity units.
c_avg_error = 2
mask = errors_val[TOP_N_ORDER].ge(c_avg_error).any(axis=1)
errors_val.loc[mask]
Median CF DAE VAE KNN KNN5 BPCA COLMEDIAN IMPSEQ IMPSEQROB ... MINPROB MLE PI QRILC RF ROWMEDIAN SVDMETHOD TRKNN ZERO freq
protein groups
O60512 2.233 1.986 1.768 2.074 2.293 2.209 1.989 5.769 NaN 8.471 ... 1.702 2,128.612 2.319 2.207 1.973 2.233 6.098 2.559 10.910 58
P18206;P18206-2 2.427 1.766 1.253 2.012 1.744 1.637 1.297 3.821 NaN 7.976 ... 1.222 2,518.839 1.669 2.172 1.897 2.427 4.085 1.581 12.898 97
Q99538 2.502 2.483 2.333 2.263 2.711 2.517 2.464 2.615 NaN 8.517 ... 2.567 1,769.534 2.483 4.063 2.446 2.502 2.767 2.399 14.984 107
P02100 2.192 1.990 2.150 2.630 2.283 2.509 1.033 1.996 NaN 14.829 ... 4.380 2,512.438 4.020 4.354 2.592 2.192 2.106 1.856 16.373 127
A0A0G2JRN3 3.053 3.126 3.152 3.708 3.693 3.634 4.195 3.998 NaN 71.992 ... 7.041 1,986.331 6.761 5.694 3.415 3.053 3.976 3.647 19.496 128
P01817 2.254 2.064 2.079 1.994 1.963 2.117 2.385 2.736 NaN 10.059 ... 2.499 2,369.250 2.240 3.265 1.992 2.254 3.104 2.039 14.053 133
Q15375;Q15375-4 4.171 1.588 1.429 1.503 1.608 1.331 1.981 3.754 NaN 16.223 ... 7.039 2,285.221 6.277 4.848 1.555 4.171 3.566 2.065 19.101 163
P68871 2.331 1.564 2.585 2.265 1.616 1.638 0.571 1.720 NaN 23.608 ... 3.741 2,237.073 3.728 3.706 2.141 2.331 2.014 0.854 16.378 168
Q7Z794 1.867 0.691 2.014 1.571 1.461 1.326 0.294 1.853 NaN 99.033 ... 3.885 1,890.640 3.995 4.456 1.619 1.867 1.961 0.495 16.337 174
P69905 2.793 2.023 2.858 2.935 2.936 2.820 1.032 2.807 NaN 94.049 ... 5.686 1,992.771 5.286 6.889 2.658 2.793 2.626 1.016 18.200 190
P35527 2.216 1.402 2.413 2.421 2.064 2.156 1.295 2.273 NaN 96.343 ... 4.936 2,335.097 4.475 5.020 1.993 2.216 2.403 1.169 17.045 195
P15509;P15509-2;P15509-3;P15509-5;P15509-7;P15509-8 2.252 1.073 1.864 1.683 1.218 1.374 1.336 3.397 NaN 48.350 ... 5.795 1,276.662 5.784 4.272 1.701 2.252 3.146 2.437 18.354 201

12 rows × 25 columns

Error by non-decimal number of intensity#

  • number of observations in parentheses.

Hide code cell source

# Plot average error per bin of median feature intensity for the top-N models
# on the validation split, then save the figure and register its path.
fig, ax = plt.subplots(figsize=(8, 3))
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred_val[
        [TARGET_COL] + TOP_N_ORDER
    ],
    feat_medians=data.train_X.median(),  # features binned by training-split median
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    palette=TOP_N_COLOR_PALETTE,
    metric_name=METRIC,)
ax.set_ylabel(f"Average error ({METRIC})")
ax.legend(loc='best', ncols=len(TOP_N_ORDER))  # one legend column per model
fname = args.out_figures / f'2_{group}_errors_binned_by_feat_median_val.pdf'
figures[fname.stem] = fname  # register for later collection of figures
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:99: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(data=errors,
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf
_images/b0df35b343c48e31d5ee10b248c19566947b4e6da7cc87eb52033806735b75b0.png

Hide code cell source

# ! only used for reporting
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC
)
plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=94) BPCA 0.715 0.601 0.849
1 11\n(N=94) DAE 0.601 0.489 0.731
2 11\n(N=94) RF 0.634 0.528 0.759
3 11\n(N=94) TRKNN 0.655 0.545 0.784
4 11\n(N=94) VAE 0.627 0.503 0.772
... ... ... ... ... ...
85 29\n(N=5) BPCA 0.175 0.062 0.288
86 29\n(N=5) DAE 0.100 0.021 0.190
87 29\n(N=5) RF 0.127 0.047 0.224
88 29\n(N=5) TRKNN 0.193 0.128 0.257
89 29\n(N=5) VAE 0.141 0.034 0.256

90 rows × 5 columns

Hide code cell source

# Persist the binned-error table next to the figure and register the dump.
# (A duplicate leading `errors_binned.head()` was removed: it was dead code,
# since only a cell's last expression is displayed in a notebook.)
fname_csv = fname.with_suffix('.csv')
dumps[fname.stem] = fname_csv
errors_binned.to_csv(fname_csv)
errors_binned.head()
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_158 Q9UN70;Q9UN70-2 BPCA 0.839 15 2,398 15\n(N=2,398)
1 Sample_158 Q9UN70;Q9UN70-2 VAE 1.085 15 2,398 15\n(N=2,398)
2 Sample_158 Q9UN70;Q9UN70-2 DAE 0.959 15 2,398 15\n(N=2,398)
3 Sample_158 Q9UN70;Q9UN70-2 TRKNN 1.070 15 2,398 15\n(N=2,398)
4 Sample_158 Q9UN70;Q9UN70-2 RF 1.021 15 2,398 15\n(N=2,398)

test data#

Hide code cell source

# Load test-split predictions for all passed models, order the model columns,
# attach the per-feature frequency, and optionally dump the aggregate to CSV.
pred_test = compare_predictions.load_split_prediction_by_modelkey(
    experiment_folder=args.folder_experiment,
    split='test',
    model_keys=MODELS_PASSED,
    shared_columns=[TARGET_COL])
columns_ordered = [TARGET_COL] + ORDER_MODELS
pred_test = (pred_test[columns_ordered]
             .join(freq_feat, on=freq_feat.index.name))
if args.save_agg_pred:
    fname = args.folder_experiment / '01_2_agg_pred_test.csv'
    dumps[fname.stem] = fname
    pred_test.to_csv(fname)
    logger.info(f"Saved aggregated predictions to: {fname}")
pred_test
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_000 A0A075B6P5;P01615 17.016 17.483 17.433 17.333 17.438 17.642 17.441 17.207 17.190 18.269 ... 17.496 13.554 12.970 13.367 7.068 0 229.376 2,505.226 NaN 210
A0A087X089;Q16627;Q16627-2 18.280 17.769 17.838 18.181 17.930 17.895 18.203 18.146 18.293 17.797 ... 17.695 12.376 12.970 12.442 7.068 0 -20.319 2,505.226 NaN 210
A0A0B4J2B5;S4R460 21.735 22.459 22.173 22.503 22.397 22.232 22.858 21.959 21.835 22.205 ... 17.493 13.902 12.970 12.377 7.068 0 -10.898 2,505.226 NaN 210
A0A140T971;O95865;Q5SRR8;Q5SSV3 14.603 15.285 15.235 14.975 15.399 15.293 15.228 15.143 15.172 15.557 ... 17.087 14.251 12.970 12.552 7.068 0 -2.819 2,505.226 NaN 145
A0A140TA33;A0A140TA41;A0A140TA52;P22105;P22105-3;P22105-4 16.143 16.583 16.620 16.502 16.775 16.614 16.780 16.743 16.625 16.646 ... 17.508 12.690 12.970 13.175 7.068 0 -42.837 2,505.226 NaN 210
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Sample_209 Q96ID5 16.074 15.866 16.063 15.762 16.122 15.982 15.866 15.981 15.909 15.925 ... 17.133 14.038 12.435 11.614 7.068 0 20.373 17.260 NaN 194
Q9H492;Q9H492-2 13.173 13.249 13.373 13.337 13.273 13.231 12.962 13.432 13.669 13.594 ... 17.109 12.775 12.435 11.412 7.068 0 14.713 19.076 NaN 111
Q9HC57 14.207 13.756 14.448 13.647 14.589 14.283 13.856 14.131 13.962 14.391 ... 17.157 12.205 12.435 11.609 7.068 0 21.445 19.649 NaN 128
Q9NPH3;Q9NPH3-2;Q9NPH3-5 14.962 15.096 15.174 15.302 15.099 14.991 14.927 15.123 15.094 15.117 ... 17.257 13.761 12.435 12.863 7.068 0 35.578 16.125 NaN 199
Q9UGM5;Q9UGM5-2 16.871 16.395 16.583 16.437 16.429 16.578 16.636 16.378 16.255 17.054 ... 17.133 13.377 12.435 11.713 7.068 0 82.601 13.608 NaN 209

12600 rows × 26 columns

Write the average errors for all models to Excel (using the summary statistics computed above).

Hide code cell source

# Absolute errors on the test split, summarized per model in the shared order.
errors_test_mae = pimmslearn.pandas.calc_errors.get_absolute_error(pred_test)
mae_stats_ordered_test = errors_test_mae.describe()[ORDER_MODELS]
mae_stats_ordered_test
BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ
count 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 ... 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000 0.000
mean 0.432 0.435 0.435 0.458 0.465 0.468 0.469 0.482 0.558 0.587 ... 2.223 2.330 3.818 4.109 4.131 9.271 16.339 334.546 2,186.302 NaN
std 0.518 0.536 0.534 0.539 0.545 0.528 0.546 0.562 0.679 0.647 ... 1.662 1.653 2.669 2.667 2.714 2.741 2.741 793.494 853.899 NaN
min 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 ... 0.000 0.000 0.000 0.003 0.002 0.141 7.209 0.001 0.001 NaN
25% 0.121 0.121 0.122 0.132 0.136 0.141 0.138 0.140 0.163 0.175 ... 0.961 1.044 1.794 2.132 2.118 7.344 14.412 12.192 2,436.455 NaN
50% 0.280 0.274 0.275 0.299 0.305 0.313 0.307 0.316 0.364 0.394 ... 1.954 2.098 3.323 3.635 3.646 8.867 15.935 34.192 2,496.971 NaN
75% 0.546 0.545 0.538 0.584 0.588 0.602 0.596 0.612 0.703 0.762 ... 3.119 3.286 5.329 5.610 5.653 10.842 17.910 91.928 2,555.017 NaN
max 7.635 9.000 8.389 9.111 7.977 7.018 8.577 8.171 9.005 7.829 ... 13.272 13.022 18.545 18.317 18.031 23.072 30.140 2,869.824 2,873.005 NaN

8 rows × 24 columns

Hide code cell source

# Persist the test-split MAE summary statistics to the shared Excel writer.
mae_stats_ordered_test.to_excel(writer, sheet_name='mae_stats_ordered_test', float_format='%.5f')

Hide code cell source

# Mean MAE per model on validation vs. test side by side, sorted so the best
# validation performer comes first, then exported to the shared Excel writer.
cp_mean_perf = pd.concat(
    [mae_stats_ordered_val.loc['mean'], mae_stats_ordered_test.loc['mean']],
    axis=1,
    keys=['val', 'test'],
).sort_values(by='val')
cp_mean_perf.to_excel(writer, sheet_name='cp_mean_perf', float_format='%.5f')
cp_mean_perf
val test
BPCA 0.422 0.432
VAE 0.430 0.435
DAE 0.431 0.435
TRKNN 0.450 0.458
RF 0.461 0.465
CF 0.463 0.468
KNN5 0.467 0.469
KNN 0.481 0.482
KNN_IMPUTE 0.554 0.558
IRM 0.588 0.587
ROWMEDIAN 0.598 0.602
Median 0.598 0.602
LLS 1.329 0.874
QRILC 1.639 1.629
COLMEDIAN 2.210 2.223
SVDMETHOD 2.309 2.330
PI 3.801 3.818
MINDET 4.108 4.109
MINPROB 4.126 4.131
MINIMUM 9.272 9.271
ZERO 16.340 16.339
IMPSEQROB 333.478 334.546
MLE 2,172.384 2,186.302
IMPSEQ NaN NaN

Hide code cell source

# Flush and close the shared Excel writer (finalizes all sheets written above).
writer.close()

Intensity distribution as histogram#

Plot top 4 models predictions for intensities in test data

Hide code cell source

# Overlay histograms: observed test intensities (grey) vs. the predictions of
# each of the top-4 models, one panel per model sharing the y-axis and x-range.
min_max = pimmslearn.plotting.data.min_max(pred_test[TARGET_COL])
top_n = 4
fig, axes = plt.subplots(ncols=top_n, figsize=(8, 2), sharey=True)

for model, color, ax in zip(
        ORDER_MODELS[:top_n],
        COLORS_TO_USE[:top_n],
        axes):

    # observed intensities as grey background histogram; keep bins for reuse
    ax, bins = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[TARGET_COL],
        color='grey',
        min_max=min_max,
        ax=ax
    )
    # model predictions overlaid semi-transparently in the model's color
    ax, _ = pimmslearn.plotting.data.plot_histogram_intensities(
        pred_test[model],
        color=color,
        min_max=min_max,
        ax=ax,
        alpha=0.5,
    )
    _ = [(l_.set_rotation(90))
         for l_ in ax.get_xticklabels()]
    ax.legend()

axes[0].set_ylabel('Number of observations')

fname = args.out_figures / f'2_{group}_intensity_binned_top_{top_n}_models_test.pdf'
figures[fname.stem] = fname  # register for later collection of figures
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf
_images/cd02f5871f95060303f0de92d09823d702df3c9e783b8a1f2156b09eab72fb58.png

Hide code cell source

# Tabulate histogram counts per intensity bin for the observed values and the
# top-n models (reusing the bins from the plot above) and save them to Excel.
count_columns = [TARGET_COL, *ORDER_MODELS[:top_n]]
counts_per_bin = pimmslearn.pandas.get_counts_per_bin(
    df=pred_test,
    bins=bins,
    columns=count_columns,
)
counts_per_bin.to_excel(fname.with_suffix('.xlsx'))
counts_per_bin
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/pandas/__init__.py:320: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  _series = (pd.cut(df[col], bins=bins).to_frame().groupby(col).size())
observed BPCA VAE DAE TRKNN
bin
(7, 8] 2 0 0 1 0
(8, 9] 7 0 0 0 0
(9, 10] 18 2 0 2 1
(10, 11] 69 29 20 44 13
(11, 12] 217 165 143 157 113
(12, 13] 634 531 520 527 479
(13, 14] 1,394 1,248 1,233 1,246 1,224
(14, 15] 2,042 2,033 2,112 2,088 2,118
(15, 16] 2,054 2,359 2,371 2,321 2,429
(16, 17] 1,787 1,867 1,866 1,856 1,842
(17, 18] 1,333 1,363 1,362 1,370 1,401
(18, 19] 965 956 926 928 923
(19, 20] 792 789 808 805 800
(20, 21] 536 528 513 524 533
(21, 22] 320 322 317 320 323
(22, 23] 182 176 183 187 171
(23, 24] 102 92 87 86 92
(24, 25] 45 38 40 37 37
(25, 26] 50 57 57 59 59
(26, 27] 25 20 17 17 17
(27, 28] 3 2 2 2 2
(28, 29] 8 11 12 10 11
(29, 30] 13 11 10 12 12

Correlation per sample#

Hide code cell source

# Correlation between observed and predicted intensities per sample on the
# test split, plus the number of observations each correlation is based on.
def _corr_with_target(df):
    # correlation of every model column with the observed target column
    return df.corr().loc[TARGET_COL]


grouped = pred_test.groupby(sample_index_name)
corr_per_sample_test = grouped.apply(_corr_with_target)[ORDER_MODELS]
n_obs = grouped[TARGET_COL].count().rename('n_obs')
corr_per_sample_test = corr_per_sample_test.join(n_obs)
too_few_obs = corr_per_sample_test['n_obs'] < 3  # correlations need >= 3 points
corr_per_sample_test.loc[~too_few_obs].describe()
BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 210.000 ... 210.000 210.000 0.000 210.000 0.000 0.000 210.000 30.000 0.000 210.000
mean 0.969 0.968 0.968 0.966 0.966 0.967 0.965 0.963 0.948 0.952 ... 0.076 0.018 NaN -0.012 NaN NaN 0.371 -0.018 NaN 60.000
std 0.017 0.019 0.018 0.019 0.018 0.016 0.018 0.019 0.035 0.022 ... 0.192 0.129 NaN 0.130 NaN NaN 0.139 0.151 NaN 9.810
min 0.878 0.867 0.895 0.858 0.883 0.906 0.870 0.888 0.722 0.865 ... -0.402 -0.343 NaN -0.411 NaN NaN 0.021 -0.287 NaN 31.000
25% 0.962 0.963 0.962 0.960 0.960 0.959 0.956 0.953 0.938 0.943 ... -0.059 -0.062 NaN -0.090 NaN NaN 0.288 -0.118 NaN 53.000
50% 0.973 0.972 0.972 0.970 0.970 0.970 0.970 0.968 0.958 0.956 ... 0.067 0.017 NaN -0.037 NaN NaN 0.368 -0.042 NaN 60.000
75% 0.981 0.980 0.980 0.979 0.978 0.978 0.979 0.978 0.969 0.966 ... 0.200 0.098 NaN 0.084 NaN NaN 0.448 0.062 NaN 67.000
max 0.994 0.993 0.993 0.992 0.991 0.992 0.992 0.990 0.987 0.988 ... 0.546 0.367 NaN 0.350 NaN NaN 0.889 0.393 NaN 86.000

8 rows × 25 columns

Hide code cell source

# ! add minimum
kwargs = dict(ylim=(0.7, 1), rot=90,
              flierprops=dict(markersize=3),
              # title='Corr. betw. fake NA and model predictions per sample on test data',
              ylabel='correlation per sample')
ax = (corr_per_sample_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs))
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_sample.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    corr_per_sample_test.describe().to_excel(w, sheet_name='summary')
    corr_per_sample_test.to_excel(w, sheet_name='correlations')
    corr_per_sample_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf
_images/7f097952823e0a80e9437a8971f900f134649a30c3d7269f9593baf161499e6c.png

Identify samples whose correlation falls below the lower whisker for any of the top models.

Hide code cell source

# Flag samples falling below the lowest lower whisker among the top-N models
# and highlight the worst model per sample (or report that none were found).
cutoff = pimmslearn.pandas.get_lower_whiskers(
    corr_per_sample_test[TOP_N_ORDER]).min()
mask = corr_per_sample_test[TOP_N_ORDER].lt(cutoff).any(axis=1)
corr_per_sample_test.loc[mask].style.highlight_min(
    axis=1) if mask.sum() else 'Nothing to display'
  BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
Sample ID                                                  
Sample_015 0.958593 0.937618 0.921389 0.967788 0.953806 0.917739 0.954757 0.964680 0.932223 0.908295 0.882874 0.882874 0.923304 0.910070 nan 0.082354 -0.151438 nan -0.093017 nan nan 0.127719 -0.097389 nan 38
Sample_043 0.949633 0.867230 0.953250 0.858409 0.882720 0.905663 0.870351 0.888093 0.847971 0.882989 0.814366 0.814366 0.828364 0.808134 nan -0.060303 0.003601 nan -0.049556 nan nan 0.444295 nan nan 57
Sample_047 0.939299 0.932544 0.930098 0.916738 0.940995 0.943990 0.950780 0.945377 0.874472 0.900719 0.896683 0.896683 0.009710 0.906192 nan -0.077002 0.126782 nan -0.083391 nan nan 0.524493 nan nan 46
Sample_069 0.936850 0.929466 0.932177 0.944619 0.931202 0.954038 0.944406 0.937711 0.904430 0.950822 0.914668 0.914668 0.918364 0.653776 nan 0.044260 -0.032979 nan -0.040101 nan nan 0.271641 nan nan 68
Sample_080 0.922142 0.906427 0.922185 0.911596 0.911876 0.916779 0.912309 0.921951 0.902040 0.883475 0.893836 0.893836 0.880736 0.862179 nan 0.093012 0.072802 nan -0.040950 nan nan 0.383774 nan nan 64
Sample_091 0.878328 0.923110 0.918669 0.931055 0.913977 0.926303 0.918275 0.903784 0.920915 0.864672 0.903019 0.903019 0.910180 0.874000 nan -0.095511 0.268358 nan 0.029990 nan nan 0.341572 nan nan 60
Sample_108 0.929388 0.940292 0.938784 0.940255 0.947972 0.939157 0.951487 0.946047 0.866107 0.915455 0.939810 0.939810 0.929216 0.840381 nan -0.044493 0.038497 nan 0.204969 nan nan 0.407365 nan nan 68
Sample_109 0.937615 0.912988 0.924793 0.924847 0.926993 0.936394 0.898780 0.893626 0.841761 0.879726 0.890426 0.890426 0.931145 0.764772 nan -0.059637 0.003214 nan 0.136202 nan nan 0.337532 -0.040683 nan 59
Sample_111 0.978525 0.927049 0.966298 0.974002 0.917384 0.954914 0.958219 0.933775 0.923850 0.935239 0.857016 0.857016 0.962568 0.654922 nan -0.130888 -0.147590 nan -0.051757 nan nan 0.451452 nan nan 54
Sample_112 0.942505 0.949500 0.921262 0.947678 0.954022 0.948841 0.962984 0.962546 0.956576 0.944488 0.948732 0.948732 0.947997 0.864077 nan 0.166292 -0.181761 nan 0.037965 nan nan 0.504405 nan nan 57
Sample_115 0.891712 0.903910 0.905763 0.915296 0.925591 0.916985 0.928234 0.918842 0.853242 0.874847 0.881285 0.881285 0.901459 0.826139 nan 0.094778 0.216459 nan -0.030165 nan nan 0.320851 nan nan 63
Sample_134 0.933622 0.929198 0.942344 0.907465 0.928847 0.919268 0.952746 0.935936 0.905001 0.915387 0.865397 0.865397 0.881808 0.854481 nan 0.389792 -0.074113 nan 0.065290 nan nan 0.344741 nan nan 66
Sample_138 0.957581 0.914848 0.918003 0.953928 0.926452 0.941341 0.927573 0.936390 0.943933 0.936371 0.921359 0.921359 0.963983 0.845361 nan 0.001445 -0.015754 nan -0.057036 nan nan 0.523470 nan nan 46
Sample_148 0.975203 0.947171 0.916733 0.979465 0.941726 0.959538 0.946668 0.926864 0.929094 0.955283 0.935395 0.935395 0.984939 0.822427 nan 0.037085 -0.247200 nan -0.044230 nan nan 0.362124 nan nan 62
Sample_151 0.947829 0.920808 0.895495 0.919188 0.940457 0.936458 0.937720 0.937262 0.934510 0.915733 0.904552 0.904552 0.917004 0.821635 nan -0.189751 0.098263 nan -0.029724 nan nan 0.302307 nan nan 70
Sample_152 0.922635 0.921174 0.935681 0.926056 0.922512 0.935343 0.932482 0.931084 0.918127 0.917053 0.909410 0.909410 0.877491 0.918902 nan 0.098949 -0.018854 nan -0.076763 nan nan 0.336118 nan nan 64
Sample_162 0.929186 0.935185 0.942772 0.933190 0.943105 0.930114 0.937839 0.949772 0.956867 0.940055 0.937255 0.937255 0.933909 0.941678 nan 0.516397 -0.128314 nan 0.202032 nan nan 0.294184 0.087903 nan 51
Sample_167 0.952090 0.925487 0.946902 0.931476 0.927315 0.942503 0.939793 0.936802 0.922116 0.930438 0.905413 0.905413 0.923164 0.919505 nan 0.221299 0.056884 nan 0.288283 nan nan 0.235179 nan nan 65
Sample_171 0.948100 0.932670 0.916246 0.901446 0.898880 0.916956 0.919215 0.909432 0.845442 0.899387 0.863135 0.863135 0.898770 0.782911 nan -0.061550 0.271708 nan -0.047514 nan nan 0.344922 nan nan 40
Sample_181 0.912274 0.926415 0.925700 0.920976 0.926227 0.919975 0.929397 0.913043 0.869468 0.929033 0.896030 0.896030 0.899227 0.859657 nan -0.243627 -0.092701 nan -0.028086 nan nan 0.419029 0.117814 nan 60
Sample_184 0.944725 0.942549 0.925183 0.946711 0.938028 0.936981 0.930853 0.927579 0.934207 0.906590 0.924270 0.924270 0.921680 0.937838 nan 0.133736 0.063509 nan 0.247131 nan nan 0.517095 nan nan 60
Sample_185 0.949315 0.946521 0.944631 0.929238 0.936943 0.943437 0.936803 0.930556 0.924211 0.929391 0.922411 0.922411 0.899905 0.914843 nan -0.264227 -0.174395 nan 0.038387 nan nan 0.576069 nan nan 69
Sample_199 0.928280 0.929706 0.931374 0.930234 0.934139 0.944196 0.917037 0.925542 0.912243 0.918083 0.910943 0.910943 0.937794 0.795992 nan -0.086879 -0.062476 nan 0.190242 nan nan 0.289504 nan nan 45
Sample_200 0.934067 0.933034 0.907110 0.933368 0.936354 0.940015 0.934918 0.918361 0.722446 0.926169 0.891269 0.891269 0.916117 0.777468 nan 0.034387 0.353894 nan -0.285509 nan nan 0.535540 -0.109406 nan 40

Hide code cell source

# Inspect predictions for one randomly drawn feature across all test samples.
feature_names = pred_test.index.levels[-1]  # feature level of the (sample, feature) MultiIndex
# NOTE(review): despite its name, N_SAMPLES holds the full MultiIndex, not a
# sample count — it appears unused afterwards; confirm before relying on it.
N_SAMPLES = pred_test.index
M = len(feature_names)
pred_test.loc[pd.IndexSlice[:, feature_names[random.randint(0, M - 1)]], :]
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_039 Q9NUQ9 12.609 12.654 12.752 12.464 12.826 12.686 12.427 12.972 12.812 12.772 ... 18.443 12.829 12.379 12.690 7.068 0 34.505 2,591.044 NaN 123
Sample_057 Q9NUQ9 12.401 12.470 12.862 12.955 13.121 12.849 12.582 12.982 13.086 12.381 ... 17.657 13.197 12.440 11.670 7.068 0 12.121 2,580.387 NaN 123
Sample_063 Q9NUQ9 12.974 13.398 13.290 13.459 13.512 12.946 13.236 13.345 13.211 13.520 ... 17.001 13.122 11.903 11.579 7.068 0 59.205 2,580.996 NaN 123
Sample_067 Q9NUQ9 13.447 12.777 12.653 12.695 12.811 12.749 12.697 12.921 12.818 13.041 ... 17.023 10.723 11.827 11.557 7.068 0 16.445 2,569.564 NaN 123
Sample_095 Q9NUQ9 12.595 12.385 12.701 12.472 13.047 12.655 12.900 12.878 12.746 12.494 ... 17.183 12.428 11.709 11.703 7.068 0 32.012 2,629.327 NaN 123
Sample_096 Q9NUQ9 13.211 13.112 13.148 13.078 13.232 12.988 13.376 13.155 13.221 13.030 ... 17.227 12.048 12.245 12.645 7.068 0 23.989 2,549.026 NaN 123
Sample_114 Q9NUQ9 12.759 12.772 13.067 13.077 13.433 12.852 12.741 12.935 12.800 12.411 ... 16.992 12.143 11.786 11.818 7.068 0 34.968 16.613 NaN 123
Sample_115 Q9NUQ9 13.562 12.745 12.936 12.905 13.340 12.830 12.575 12.816 12.540 12.329 ... 17.365 11.411 12.275 12.518 7.068 0 21.211 2,634.625 NaN 123
Sample_120 Q9NUQ9 11.726 12.612 12.429 12.574 12.740 12.775 12.985 12.826 12.916 13.606 ... 17.073 12.940 11.384 11.682 7.068 0 -17.482 19.725 NaN 123
Sample_123 Q9NUQ9 12.985 12.538 12.538 12.546 12.533 12.545 12.612 12.796 12.698 13.538 ... 16.971 12.780 12.689 12.162 7.068 0 -18.930 2,461.806 NaN 123
Sample_156 Q9NUQ9 12.980 12.514 12.412 11.888 12.427 12.569 12.594 12.364 11.966 12.950 ... 17.747 11.767 12.487 12.971 7.068 0 -12.483 18.196 NaN 123
Sample_158 Q9NUQ9 12.879 12.551 12.176 12.101 12.358 12.560 12.033 12.163 11.944 13.040 ... 17.290 12.038 11.916 11.224 7.068 0 -24.677 2,513.638 NaN 123
Sample_172 Q9NUQ9 12.201 12.746 12.995 12.813 12.878 12.905 12.887 12.801 12.725 13.071 ... 17.137 12.762 12.508 12.466 7.068 0 7.124 2,420.415 NaN 123

13 rows × 26 columns

Hide code cell source

# Draw one feature uniformly at random (sorted for reproducibility under a
# fixed seed) and show its predictions across all test samples.
options = random.sample(sorted(set(feature_names)), k=1)
selected = options[0]
pred_test.loc[pd.IndexSlice[:, selected], :]
observed BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ freq
Sample ID protein groups
Sample_008 Q99574 18.325 18.478 18.481 18.429 18.471 18.361 18.326 18.591 18.402 18.568 ... 17.233 13.776 12.288 12.500 7.068 0 40.625 2,555.081 NaN 210
Sample_043 Q99574 18.938 18.746 19.015 18.691 18.845 18.500 18.665 18.825 18.798 18.807 ... 17.481 12.901 12.441 12.802 7.068 0 -110.773 2,455.484 NaN 210
Sample_069 Q99574 18.337 18.762 18.597 18.522 18.684 18.777 18.681 18.692 18.839 18.667 ... 17.010 13.375 11.631 11.801 7.068 0 159.875 2,567.211 NaN 210
Sample_081 Q99574 18.259 18.357 18.377 18.612 18.274 18.451 18.595 18.337 18.414 18.233 ... 17.001 11.802 12.067 12.814 7.068 0 60.279 2,631.820 NaN 210
Sample_085 Q99574 18.614 18.557 18.492 18.527 18.530 18.454 18.548 18.485 18.402 18.371 ... 17.221 12.622 12.550 11.729 7.068 0 82.411 2,494.406 NaN 210
Sample_177 Q99574 18.080 17.796 17.830 17.855 17.706 17.956 17.954 18.264 18.108 17.775 ... 17.324 11.648 12.140 12.119 7.068 0 64.433 16.051 NaN 210
Sample_187 Q99574 18.751 18.530 18.382 18.438 18.577 18.599 18.646 18.607 18.487 18.355 ... 16.953 11.724 12.471 13.513 7.068 0 68.128 2,508.616 NaN 210
Sample_195 Q99574 18.390 18.308 18.213 18.302 18.162 18.378 18.495 18.431 18.533 18.052 ... 17.117 10.845 12.219 12.055 7.068 0 -39.306 2,568.559 NaN 210

8 rows × 26 columns

Correlation per feature#

Hide code cell source

# Correlation between observed and predicted intensities per feature on the
# test split, plus the number of observations each correlation is based on.
corr_per_feat_test = (
    pred_test
    .groupby(FEAT_NAME)
    .apply(lambda df: df.corr().loc[TARGET_COL])
    [ORDER_MODELS]
)
n_obs_per_feat = (pred_test
                  .groupby(FEAT_NAME)[TARGET_COL]
                  .count()
                  .rename('n_obs'))
corr_per_feat_test = corr_per_feat_test.join(n_obs_per_feat)

too_few_obs = corr_per_feat_test['n_obs'] < 3  # correlations need >= 3 points
corr_per_feat_test.loc[~too_few_obs].describe()
BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
count 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 1,396.000 ... 1,396.000 1,396.000 1,396.000 1,396.000 0.000 0.000 1,396.000 1,396.000 0.000 1,396.000
mean 0.636 0.636 0.639 0.607 0.591 0.601 0.564 0.538 0.445 0.470 ... -0.008 0.013 0.086 0.030 NaN NaN -0.009 -0.016 NaN 8.999
std 0.337 0.331 0.346 0.342 0.344 0.346 0.359 0.369 0.422 0.388 ... 0.447 0.395 0.441 0.420 NaN NaN 0.411 0.433 NaN 3.913
min -0.998 -1.000 -0.988 -0.977 -0.996 -1.000 -0.983 -0.991 -1.000 -0.999 ... -0.999 -0.999 -0.999 -0.999 NaN NaN -1.000 -0.999 NaN 3.000
25% 0.506 0.493 0.504 0.455 0.435 0.450 0.392 0.359 0.221 0.266 ... -0.341 -0.265 -0.242 -0.273 NaN NaN -0.285 -0.313 NaN 6.000
50% 0.746 0.746 0.757 0.706 0.690 0.702 0.677 0.634 0.554 0.554 ... -0.005 0.027 0.118 0.050 NaN NaN 0.004 -0.016 NaN 8.000
75% 0.880 0.876 0.885 0.859 0.848 0.851 0.833 0.809 0.770 0.768 ... 0.331 0.291 0.425 0.341 NaN NaN 0.268 0.294 NaN 11.000
max 0.999 0.999 0.999 1.000 0.999 1.000 0.999 1.000 1.000 0.998 ... 0.999 1.000 0.999 0.997 NaN NaN 0.992 0.998 NaN 32.000

8 rows × 25 columns

Hide code cell source

# Features with too few (<3) observations that still have at least 3 non-NaN
# correlation values — with 2 points these are all ±1 by construction.
corr_per_feat_test.loc[too_few_obs].dropna(thresh=3, axis=0)
BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ... SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs
protein groups
A0A0A0MS09;P01880;P01880-2 1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 -1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000 1.000 1.000 -1.000 1.000 1.000 -1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
A0A0C4DH29 -1.000 -1.000 1.000 -1.000 -1.000 -1.000 1.000 -1.000 1.000 1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 ... 1.000 1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000 1.000 -1.000 -1.000 -1.000 1.000 -1.000 1.000 -1.000 1.000 ... 1.000 1.000 -1.000 1.000 NaN NaN -1.000 -1.000 NaN 2
D6RF35 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
E7EQ64;P07477 1.000 1.000 1.000 1.000 1.000 -1.000 1.000 1.000 1.000 -1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 -1.000 NaN 2
F8WDW9;Q96AP7 -1.000 1.000 1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2
J3KRP0 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 ... 1.000 -1.000 1.000 1.000 NaN NaN 1.000 1.000 NaN 2
O43581-2;O43581-3;O43581-5 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 ... 1.000 1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P04075 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 -1.000 1.000 1.000 NaN NaN 1.000 -1.000 NaN 2
P04080 1.000 1.000 1.000 1.000 1.000 1.000 1.000 -1.000 1.000 -1.000 ... -1.000 1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
P33151 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
P62258 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 ... 1.000 -1.000 1.000 -1.000 NaN NaN -1.000 1.000 NaN 2
Q9NYQ8 1.000 1.000 1.000 1.000 1.000 -1.000 -1.000 -1.000 -1.000 1.000 ... -1.000 -1.000 -1.000 -1.000 NaN NaN -1.000 -1.000 NaN 2
Q9Y281;Q9Y281-3 1.000 1.000 1.000 1.000 -1.000 1.000 1.000 1.000 1.000 1.000 ... -1.000 -1.000 -1.000 -1.000 NaN NaN 1.000 1.000 NaN 2

16 rows × 25 columns

Hide code cell source

# Box plot of per-feature correlations for the top-N models on the test split;
# the underlying data is additionally exported to a multi-sheet Excel file.
kwargs = dict(rot=90,
              flierprops=dict(markersize=1),
              ylabel=f'correlation per {FEAT_NAME_DISPLAY}')
ax = (corr_per_feat_test
      .loc[~too_few_obs, TOP_N_ORDER]
      .plot
      .box(**kwargs)
      )
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                       horizontalalignment='right')
fname = args.out_figures / f'2_{group}_pred_corr_test_per_feat.pdf'
figures[fname.stem] = fname  # register for later collection of figures
pimmslearn.savefig(ax.get_figure(), name=fname)
dumps[fname.stem] = fname.with_suffix('.xlsx')
with pd.ExcelWriter(fname.with_suffix('.xlsx')) as w:
    corr_per_feat_test.loc[~too_few_obs].describe().to_excel(
        w, sheet_name='summary')
    corr_per_feat_test.to_excel(w, sheet_name='correlations')
    corr_per_feat_test.loc[~too_few_obs, TOP_N_ORDER].to_excel(w, sheet_name='correlations_plotted')
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf
_images/df812a5bfd138d0b23c22a412194249c208ef0a9f39503be104db033929cda7f.png

Hide code cell source

# Number of test observations per feature.
feat_count_test = (data.test_y
                   .stack()
                   .groupby(FEAT_NAME)
                   .count()
                   .rename('count'))
feat_count_test.head()
protein groups
A0A024QZX5;A0A087X1N8;P35237                                                     10
A0A024R0T9;K7ER74;P02655                                                          8
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8    6
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503                                           8
A0A075B6H7                                                                        4
Name: count, dtype: int64

Hide code cell source

# Lowest lower whisker across the top-N models on the per-feature correlations;
# flag features where any top-N model falls below it.
treshold = pimmslearn.pandas.get_lower_whiskers(
    corr_per_feat_test[TOP_N_ORDER]).min()
mask = corr_per_feat_test[TOP_N_ORDER].lt(treshold).any(axis=1)


def highlight_min(s, color, tolerence=0.00001):
    """Return per-cell CSS styles marking the minimum value(s) of *s*.

    Values within ``tolerence`` of the row minimum get a background color;
    all other cells get ``None`` (no styling). Intended for
    ``DataFrame.style.apply`` with ``axis=1``.
    """
    is_minimum = (s - s.min()).abs() < tolerence
    return np.where(is_minimum, f"background-color: {color};", None)


# Inspect flagged features together with their test-data observation count,
# highlighting each row's minimum correlation in yellow.
view = (corr_per_feat_test
        .join(feat_count_test)
        .loc[mask]
        .sort_values('count'))

if view.empty:
    print("None found")
else:
    styled = view.style.apply(highlight_min,
                              color='yellow',
                              axis=1,
                              subset=corr_per_feat_test.columns)
    display(styled)
  BPCA VAE DAE TRKNN RF CF KNN5 KNN KNN_IMPUTE IRM ROWMEDIAN Median LLS QRILC COLMEDIAN SVDMETHOD PI MINDET MINPROB MINIMUM ZERO IMPSEQROB MLE IMPSEQ n_obs count
protein groups                                                    
A0A0C4DGV4;E9PLX3;O43504;R4GMU8 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 nan nan -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
A0A0C4DH29 -1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 1.000000 nan nan -1.000000 -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan -1.000000 -1.000000 nan 2 2
A0A0G2JLL6;A0A1B0GTE9;A0A1B0GTP1;Q7Z6L0;Q7Z6L0-2;Q7Z6L0-3 -1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 nan nan 1.000000 -1.000000 nan 2 2
A6H8L4;E7EUI5;P78536;P78536-2 1.000000 1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 1.000000 -1.000000 1.000000 1.000000 -1.000000 1.000000 nan nan -1.000000 -1.000000 nan 2 2
O43581-2;O43581-3;O43581-5 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 nan nan -1.000000 1.000000 nan 2 2
F8WDW9;Q96AP7 -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan -1.000000 1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
Q9Y281;Q9Y281-3 1.000000 1.000000 1.000000 1.000000 -1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 nan nan 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 nan nan 1.000000 1.000000 nan 2 2
A0A140T913;A0A140T933;A0A140T955;A0A140T9I0;A0A140T9X5;A0A1W2PPQ2;A0A1W2PRT9;P01892;P10316 -0.405690 -0.062043 -0.490330 -0.426425 0.092603 -0.476703 0.225780 0.141568 -0.230425 0.240663 nan nan 0.029026 0.086114 0.969574 0.997890 0.968003 -0.651641 -0.777583 nan nan 0.599349 0.988049 nan 3 3
P04040 0.955425 -1.000000 0.443953 0.364706 -0.971166 0.990722 0.860313 0.995399 0.584629 0.685429 nan nan 0.736559 0.981542 0.910187 0.961838 0.572406 0.889522 0.815375 nan nan -0.755848 0.778883 nan 3 3
P14138 -0.783274 0.376760 0.065222 0.277987 0.302618 -0.446927 0.474087 -0.039227 0.184293 -0.975804 nan nan -0.284116 -0.554726 -0.680464 -0.988691 -0.593193 -0.256388 -0.999295 nan nan 0.330904 0.330904 nan 3 3
Q9UI40;Q9UI40-2 0.154643 0.404282 -0.294619 0.705987 0.830019 -0.825973 0.043055 -0.368219 0.609727 -0.080993 nan nan 0.216445 0.532646 0.418780 0.168840 -0.481757 0.994516 -0.900111 nan nan 0.588145 0.963302 nan 3 3
Q96KR4;Q96KR4-3 -0.811499 0.919851 0.793383 0.188634 0.878230 -0.723372 0.962865 0.998946 -0.996456 -0.998544 nan nan -0.499851 -0.178609 -0.969111 -0.794681 0.840774 0.464970 0.087257 nan nan 0.770952 0.620557 nan 3 3
Q0P6D2 0.488693 0.352175 -0.263507 -0.977189 0.996261 -0.054426 -0.914209 -0.991429 0.990772 -0.817525 nan nan -0.910486 -0.674420 -0.815144 -0.873162 0.999677 -0.856054 -0.592952 nan nan 0.198784 -0.816625 nan 3 3
Q5FWE3;Q5FWE3-3 -0.595898 0.999243 -0.682544 0.940836 0.911864 -0.482131 -0.515612 -0.450286 0.997780 -0.317398 nan nan 0.852964 0.964312 0.879697 0.674983 0.443092 0.966111 0.525274 nan nan -0.999926 0.995547 nan 3 3
P67936 -0.781265 -0.792906 -0.821615 -0.280233 -0.890067 -0.999984 -0.930367 0.668141 -0.999486 -0.951571 nan nan -0.952164 0.835390 -0.989773 -0.899019 -0.361000 0.990700 0.664054 nan nan -0.364239 -0.919047 nan 3 3
O95497 0.491301 0.744993 -0.514379 -0.187796 0.327413 0.998902 -0.278320 -0.038249 -0.972465 0.974251 nan nan -0.924475 -0.574656 -0.728224 -0.107445 0.745515 0.908126 0.202523 nan nan 0.163889 -0.914582 nan 3 3
Q9BRA2 -0.997854 -0.993325 -0.987554 -0.976681 -0.996000 -0.999988 0.355335 -0.969137 -0.999909 0.328330 nan nan -0.376394 -0.725629 -0.888126 -0.946135 0.492942 -0.251266 -0.465952 nan nan 0.231242 0.893058 nan 3 3
P01912;Q5Y7A7 0.312242 0.004953 -0.248472 -0.737679 -0.779228 -0.178065 -0.983163 -0.923553 0.367817 0.598949 nan nan -0.036948 -0.731234 0.393426 0.622172 -0.106994 0.128832 0.786953 nan nan -0.520030 -0.464856 nan 4 4
P55058 0.331515 -0.259197 0.142781 -0.453671 -0.465064 -0.106582 -0.730945 -0.687075 0.561214 0.511726 nan nan -0.850960 -0.120859 -0.860013 0.937309 0.480305 0.109603 -0.021131 nan nan -0.314778 -0.708824 nan 4 4
P62805 0.288891 0.569826 0.165904 0.216616 -0.334162 0.506509 0.997199 0.999754 0.762963 0.304920 nan nan 0.058312 0.632499 0.970248 -0.577842 0.281787 0.967646 -0.239094 nan nan -0.353942 -0.929071 nan 4 4
Q99538 0.593821 -0.377183 -0.259214 0.377925 -0.121101 0.005697 -0.288458 -0.152372 0.223491 -0.280701 nan nan 0.914120 0.791648 0.708292 -0.029421 0.763081 0.753316 0.977585 nan nan -0.421840 -0.113954 nan 4 4
Q9NS85 0.209240 0.044743 -0.609833 -0.748054 0.104189 0.032552 0.174286 0.827546 -0.293415 -0.490454 nan nan -0.811087 0.239718 0.688432 0.684565 -0.997283 -0.573150 -0.689927 nan nan 0.028761 0.700517 nan 4 4
P78310;P78310-2;P78310-5;P78310-6;P78310-7 -0.287802 0.778894 0.255770 0.187632 0.662668 0.240633 0.236820 0.177240 0.323788 -0.925310 nan nan -0.002267 -0.358065 0.370874 -0.992637 0.118440 -0.740181 -0.868731 nan nan -0.197104 0.065686 nan 4 4
P69905 0.965583 -0.282550 -0.809649 0.990263 -0.330305 0.728356 -0.734862 -0.884057 0.787808 0.652423 nan nan 0.995940 -0.325685 -0.089591 0.819375 -0.978391 -0.613292 -0.715531 nan nan 0.599413 0.563706 nan 4 4
P31150 -0.695110 -0.422605 -0.714789 0.079765 0.162752 -0.454927 -0.302639 -0.258552 0.871434 0.374252 nan nan -0.383378 0.215939 -0.744603 -0.826531 0.450994 0.679134 0.655331 nan nan 0.018162 0.997740 nan 4 4
P48745 -0.215191 -0.194973 -0.667737 0.281906 0.115038 0.185120 -0.708241 -0.711413 0.592691 0.072594 nan nan -0.966228 0.930269 -0.502678 -0.007247 -0.527648 -0.811427 0.715371 nan nan -0.816749 -0.563041 nan 4 4
A8MXB9;J3KQJ1;Q8NBJ7 -0.526368 -0.208997 -0.282198 -0.288690 -0.216620 0.874813 0.719047 0.882512 0.575047 0.781018 nan nan 0.983037 -0.223269 -0.315140 -0.447225 0.214012 0.092247 -0.175089 nan nan -0.625728 -0.380936 nan 4 4
E9PN95;P11684 -0.044137 0.449996 -0.100617 -0.416898 -0.424486 -0.077324 -0.241660 0.598536 0.553236 0.989063 nan nan 0.572649 -0.823704 -0.972121 -0.214485 -0.355864 -0.729323 -0.818502 nan nan 0.624754 0.488347 nan 4 4
B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 -0.586205 -0.463431 -0.345807 -0.468526 -0.814033 -0.963640 -0.950792 -0.978604 -0.206013 0.640276 nan nan -0.926715 -0.777904 -0.611749 -0.688762 -0.770874 -0.359254 0.645595 nan nan -0.630160 0.512953 nan 4 4
A0A0G2JRN3 -0.879258 -0.732660 -0.796395 -0.190835 -0.696223 -0.828121 0.676931 0.355859 -0.978184 -0.922357 nan nan -0.966410 -0.701873 -0.890772 -0.522710 -0.171052 -0.786540 -0.846050 nan nan 0.786883 0.273736 nan 4 4
A0A087WSY4 0.341097 -0.495793 -0.432664 -0.455325 -0.695080 0.965204 0.666119 0.804721 -0.612366 -0.162758 nan nan 0.149034 0.728889 0.994065 0.280256 0.500163 -0.553907 0.634408 nan nan -0.767064 0.652348 nan 4 4
A0A075B7B8 0.995309 0.839044 0.950888 -0.584344 0.860207 0.929987 0.645944 0.628331 0.493563 0.290645 nan nan -0.198260 0.033966 0.596184 0.426337 0.967717 -0.818663 0.022315 nan nan -0.419682 0.603615 nan 4 4
P21810 0.488982 -0.672805 -0.853613 0.558018 0.495740 0.828170 0.725338 0.497175 0.976315 0.606417 nan nan 0.507237 -0.739167 0.086433 0.181221 0.816590 -0.013604 -0.408979 nan nan -0.032190 -0.515024 nan 4 4
Q9ULP0-3;Q9ULP0-6 -0.433605 0.334129 -0.045651 0.280988 0.573984 -0.498291 -0.127518 -0.217068 -0.824736 0.569382 nan nan 0.047124 -0.263898 -0.147969 0.088032 0.399408 0.597287 0.685924 nan nan 0.651242 0.678026 nan 4 4
Q8WXD2 -0.194738 -0.440013 -0.574341 0.030615 -0.545260 0.074052 -0.644254 -0.896598 -0.044385 -0.527932 nan nan -0.031366 -0.153494 -0.879923 -0.696602 0.272664 0.367920 -0.244138 nan nan -0.304569 0.801129 nan 4 4
Q8TEA8 0.312460 0.437158 0.697841 0.078415 -0.441708 -0.022802 -0.199719 -0.061944 0.642442 0.704937 nan nan 0.711175 0.801664 0.925803 0.920805 -0.901736 0.711193 -0.168416 nan nan -0.971775 0.075778 nan 4 4
Q13508;Q13508-2;Q13508-3 -0.588905 0.763474 -0.070096 0.317771 0.098808 -0.226479 0.367007 -0.137197 -0.196508 0.072987 nan nan 0.595601 -0.595333 -0.921969 -0.902558 0.180312 -0.647509 -0.574788 nan nan -0.816043 0.878974 nan 5 5
O15031 -0.384689 0.856968 0.370022 0.545446 0.106883 0.237051 0.392387 0.288900 0.278546 0.565510 nan nan 0.144879 -0.297758 -0.264862 -0.348062 -0.325409 -0.121316 0.106873 nan nan -0.296688 -0.296688 nan 5 5
B1AJZ9;B1AJZ9-4;H0YE38;Q5JYW6 -0.213396 -0.438445 -0.244748 -0.655514 -0.519094 -0.422950 -0.355725 -0.358839 -0.320693 0.330414 nan nan -0.420475 -0.058731 -0.515651 -0.659479 -0.119928 0.428130 0.153945 nan nan 0.163593 0.301970 nan 5 5
P01282;P01282-2 -0.216410 0.475808 0.461065 0.209445 0.022609 0.366572 0.888676 -0.090521 -0.394861 -0.216388 nan nan -0.782966 0.438149 0.261410 0.709596 0.058571 0.425854 -0.217154 nan nan -0.467714 0.087763 nan 5 5
Q6ZVL6 -0.405009 0.445645 0.862284 0.541507 0.552490 0.504941 -0.626545 -0.421210 0.745658 0.380995 nan nan 0.809662 -0.411550 0.343913 -0.237720 0.051514 0.617457 -0.824449 nan nan -0.305862 -0.650631 nan 5 5
Q9NZP8 0.485369 0.322863 0.693691 -0.596706 -0.367970 -0.084800 0.391275 0.135336 -0.921105 0.672486 nan nan -0.849012 -0.750451 0.108741 -0.011939 -0.772455 0.306441 0.696724 nan nan 0.307325 0.307325 nan 5 5
Q9NZC2;Q9NZC2-2;Q9NZC2-3 -0.532191 -0.513242 -0.604946 -0.549588 -0.401198 -0.662847 0.118561 -0.020609 -0.710573 -0.652656 nan nan -0.534201 -0.807445 0.889235 -0.779762 -0.253580 -0.454867 0.167148 nan nan 0.820294 0.820294 nan 5 5
Q9UHI8 0.073471 0.073801 -0.321787 -0.445122 -0.164245 0.373219 0.316329 -0.061541 0.216956 -0.119815 nan nan -0.208262 -0.381837 0.051369 0.334440 -0.003566 0.118070 0.309127 nan nan -0.151806 -0.746015 nan 5 5
Q8NFZ4 -0.225202 -0.032998 -0.190638 0.001304 -0.035023 -0.108452 -0.011690 0.057673 -0.422722 -0.787582 nan nan 0.200058 0.390180 0.685266 0.396679 0.781159 -0.363897 0.710319 nan nan -0.741786 0.576960 nan 5 5
Q10469 -0.098233 -0.512506 -0.026397 -0.291141 -0.778464 -0.893706 -0.671391 -0.600598 0.649427 0.606425 nan nan -0.185669 0.835898 -0.517750 -0.162715 0.454173 -0.550749 -0.355687 nan nan -0.084422 0.700076 nan 5 5
Q8N428 -0.711604 -0.497552 -0.571224 -0.372718 -0.308817 -0.358377 -0.815146 -0.973013 0.485147 -0.775527 nan nan 0.896715 0.648522 -0.145359 0.398169 -0.384566 0.571660 0.472995 nan nan -0.592174 0.633450 nan 5 5
Q5BIV9 -0.499694 -0.415760 -0.479889 -0.073578 -0.359034 -0.218733 -0.595082 -0.855339 -0.481080 -0.438393 nan nan -0.128645 0.066645 -0.778869 -0.513219 -0.622280 -0.261649 0.722304 nan nan -0.127227 -0.239878 nan 5 5
G3V2U7;P07311 0.478113 0.003535 -0.343718 0.277132 0.649848 0.120493 0.126790 -0.075889 -0.045458 0.298977 nan nan -0.201011 -0.011195 -0.434263 -0.349618 0.356663 0.286177 0.396439 nan nan -0.412639 -0.436945 nan 5 5
D6R956;P09936 0.296679 0.090444 0.045792 -0.551658 0.200743 0.302998 -0.259386 -0.664319 0.151812 0.024034 nan nan 0.313797 -0.043274 -0.119808 -0.402161 0.081771 -0.212790 0.185069 nan nan -0.514141 0.033000 nan 5 5
O75882;O75882-2;O75882-3 0.808756 0.712469 0.886667 -0.195404 0.497908 0.637487 0.647639 0.664223 0.069134 -0.175883 nan nan 0.085212 0.547646 -0.812939 0.161950 -0.404929 -0.782643 0.409907 nan nan -0.899663 0.811337 nan 5 5
A0A0C4DGV8;Q13214;Q13214-2 0.215023 -0.139196 0.182417 -0.952860 -0.241747 -0.453714 -0.317076 -0.315165 -0.648660 -0.694450 nan nan -0.483616 0.459069 0.951239 0.908215 0.698724 -0.051800 -0.149314 nan nan -0.698798 -0.910792 nan 5 5
P02533 0.968900 -0.818348 -0.801116 0.859288 -0.643727 0.483212 0.106114 0.085868 0.760846 0.490144 nan nan 0.864174 -0.510344 0.303053 0.460298 -0.163285 0.446403 0.700237 nan nan 0.207036 -0.545616 nan 5 5
P01834 0.145967 -0.416404 0.094685 -0.196562 -0.381406 0.049624 0.157569 0.333278 0.249827 0.136816 nan nan -0.132585 -0.234699 -0.169066 -0.492225 -0.059553 0.283761 0.545630 nan nan -0.157407 -0.157407 nan 5 5
P10124 -0.597027 -0.134200 -0.551194 -0.186782 -0.081019 -0.488675 0.571509 -0.252024 0.436283 -0.628340 nan nan -0.573988 0.436908 -0.483333 -0.479793 -0.917595 -0.261585 -0.302343 nan nan 0.414290 0.578378 nan 5 5
P00441 -0.466488 -0.522756 -0.614594 -0.418119 -0.740794 0.140093 0.047755 -0.045985 -0.047843 -0.228959 nan nan -0.221865 -0.184480 -0.478446 -0.697790 0.883768 -0.145801 0.671860 nan nan 0.030159 -0.379242 nan 5 5
F6RFD5;P60981;P60981-2 0.287552 0.872019 -0.329986 0.937927 0.104630 0.718946 0.762560 0.707064 0.944373 0.956543 nan nan -0.675579 0.393213 -0.092708 0.038986 0.778668 -0.548061 -0.093397 nan nan 0.299492 -0.089459 nan 5 5
O75339 0.036295 -0.503534 0.036136 0.082569 0.167913 -0.170608 -0.383440 -0.134371 -0.662474 0.284821 nan nan -0.707550 0.867696 -0.475700 -0.809598 0.398020 -0.206109 -0.417054 nan nan 0.211736 0.029954 nan 5 5
A0A0U1RR20;Q92954;Q92954-2;Q92954-3;Q92954-4;Q92954-5;Q92954-6 0.704180 -0.237857 0.312086 -0.095983 0.183850 -0.192332 -0.313959 -0.361898 0.788368 -0.256391 nan nan -0.060601 -0.389703 -0.579355 -0.021789 -0.612540 0.515599 0.527792 nan nan 0.721920 0.696770 nan 5 5
A0A0A0MRJ7;P12259 0.464082 0.726200 0.864747 0.746693 -0.473077 0.372992 0.242437 -0.199638 0.496836 0.591475 nan nan 0.663739 0.092654 0.283453 0.620421 -0.227013 -0.485466 0.613707 nan nan 0.461867 0.467662 nan 5 5
A0A0G2JRQ6 -0.226956 -0.370165 -0.666387 -0.249237 -0.451688 -0.323145 -0.267241 -0.104896 0.030670 -0.906416 nan nan -0.206273 -0.173262 -0.360860 0.551245 0.294703 0.307142 0.749650 nan nan -0.589769 -0.589769 nan 5 5
A0A0C4DH73;P01611 0.243542 0.326469 -0.756045 0.773609 -0.182104 0.920013 0.796573 0.344724 0.055662 0.108815 nan nan -0.232335 -0.071668 -0.561802 0.517434 -0.751469 -0.303031 -0.468274 nan nan -0.232278 -0.232290 nan 5 5
P10644;P10644-2 -0.252574 -0.092126 -0.510747 -0.097608 0.024151 0.127005 -0.336136 -0.255617 0.448682 -0.000994 nan nan 0.107935 0.200325 0.383896 -0.528502 -0.424133 0.577309 0.352704 nan nan -0.154795 -0.502540 nan 6 6
P05451 0.097378 -0.420879 -0.031407 -0.057327 -0.470193 -0.175585 -0.360316 -0.684154 -0.327039 0.306881 nan nan -0.336078 0.541580 -0.538331 -0.470228 -0.818292 -0.412800 -0.367715 nan nan -0.749296 -0.302038 nan 6 6
P01036;P01037 0.014913 -0.220650 -0.358501 -0.482398 -0.416728 -0.281198 -0.220449 -0.046435 -0.115861 -0.146824 nan nan 0.512862 -0.703563 -0.389652 -0.354472 -0.194348 -0.362058 0.396727 nan nan 0.673582 0.579024 nan 6 6
M0R009 -0.339372 -0.504553 -0.664157 -0.584432 -0.134024 -0.223670 -0.356792 -0.247140 -0.037546 -0.001198 nan nan -0.732570 0.161838 -0.580618 0.320436 0.268253 -0.750481 -0.468628 nan nan 0.635403 -0.182952 nan 6 6
E9PKE3;P11142 -0.225336 -0.437683 -0.630598 0.346178 0.593565 -0.086838 -0.515610 -0.674428 0.038561 -0.291303 nan nan -0.729284 0.226290 -0.288577 0.003161 -0.886771 -0.507155 0.892377 nan nan 0.281878 0.415848 nan 6 6
H3BRQ4;K4DIB9;P50238 -0.093569 -0.529152 -0.319716 -0.057295 -0.565397 -0.375668 -0.083371 -0.277897 0.428824 0.210272 nan nan 0.321101 0.200467 -0.052992 0.460109 -0.480837 0.387204 0.086397 nan nan 0.012052 -0.720358 nan 6 6
P29401;P29401-2 0.085483 -0.200973 0.212002 -0.368660 0.660436 -0.141815 0.223946 0.365861 0.001412 -0.392136 nan nan 0.108221 0.245511 0.187563 0.224033 0.248882 -0.281768 -0.482136 nan nan 0.300495 0.397517 nan 6 6
Q14257;Q14257-2 0.000015 0.075092 -0.175280 -0.049120 -0.421265 -0.380051 0.603578 0.512839 -0.438692 0.185626 nan nan -0.256550 0.564002 -0.167794 -0.168088 0.212974 -0.024909 0.039951 nan nan 0.319924 0.402311 nan 6 6
K7EPJ5;O60291;O60291-2;O60291-3;O60291-4 0.329534 0.101826 0.290041 0.338197 -0.387161 0.426722 0.445305 0.333772 0.254331 0.510494 nan nan -0.168375 0.607847 -0.417249 -0.282897 0.177570 -0.236863 0.038720 nan nan -0.087903 -0.225342 nan 7 7
C9JKT8;Q9UEW3;Q9UEW3-2 -0.293733 0.069145 -0.063052 -0.671856 -0.378833 0.019333 0.252685 0.407657 0.516418 -0.293680 nan nan -0.461590 -0.638187 -0.356302 -0.545248 -0.145144 -0.645563 -0.429328 nan nan -0.058753 -0.279069 nan 7 7
Q96S96 -0.324600 -0.132875 -0.210561 -0.462902 -0.516341 -0.527365 -0.359991 -0.296281 -0.305383 0.059875 nan nan -0.582492 -0.310602 -0.157328 -0.153988 -0.482796 -0.086735 0.206471 nan nan -0.682257 0.760044 nan 7 7
Q8IWU5;Q8IWU5-2 -0.218403 -0.178363 0.085210 -0.483524 -0.132098 0.433160 -0.071915 -0.451814 -0.684735 0.392763 nan nan -0.070980 -0.627498 0.334826 0.479673 0.539537 -0.096058 -0.078636 nan nan 0.632888 -0.013472 nan 7 7
Q9BT88 -0.231560 0.048813 -0.051092 -0.505983 -0.080088 0.026886 0.248942 0.470496 -0.357877 -0.223776 nan nan 0.096659 0.595168 -0.078098 0.471098 -0.517910 0.626185 0.096215 nan nan 0.342112 0.354892 nan 7 7
Q9BX67 0.701146 0.127659 -0.311447 0.063862 0.010888 0.175240 -0.048843 -0.246240 0.445479 0.340506 nan nan 0.033937 0.025333 -0.269549 -0.431929 -0.510652 0.232872 -0.374229 nan nan -0.194531 -0.561094 nan 7 7
P35443 -0.174408 0.309094 0.386450 -0.252097 -0.221136 0.345428 0.404203 0.582749 0.068469 -0.211928 nan nan 0.314822 0.487216 -0.017793 -0.130271 -0.502363 0.039112 -0.003224 nan nan 0.022830 0.275796 nan 7 7
P19835;X6R868 -0.578277 -0.579345 -0.588968 -0.428965 -0.550078 -0.163356 -0.359418 -0.458749 -0.930907 0.408941 nan nan -0.762851 0.232094 -0.321335 -0.213048 -0.430952 -0.001008 -0.609234 nan nan -0.780480 0.407170 nan 7 7
P80108 -0.376648 -0.360619 -0.262691 -0.172435 -0.130943 -0.408797 -0.184693 -0.137669 0.293716 -0.492363 nan nan 0.087362 0.673120 -0.600824 -0.578633 -0.452592 -0.443645 -0.528759 nan nan 0.119112 0.119112 nan 7 7
Q9P232 -0.178684 -0.113816 -0.288711 -0.387374 -0.303743 -0.105540 -0.587127 -0.640844 -0.695382 -0.457551 nan nan -0.400998 0.357736 0.256287 0.419846 -0.449833 0.477733 0.861359 nan nan -0.766501 -0.447343 nan 8 8
Q5JRA6;Q5JRA6-2 -0.221290 0.266692 -0.460617 -0.172243 0.083797 -0.339874 0.450879 0.021417 -0.649397 -0.669281 nan nan -0.367023 -0.237276 -0.544434 -0.362956 0.413593 -0.396093 -0.556269 nan nan 0.085440 0.745223 nan 8 8
P17677;P17677-2 -0.225142 0.178726 -0.158270 -0.046967 -0.132861 0.177455 -0.060209 -0.272913 -0.312226 0.255005 nan nan -0.234374 0.195361 0.302693 0.676453 -0.489936 -0.380524 -0.537706 nan nan 0.169275 -0.057105 nan 8 8
P18206;P18206-2 0.275209 0.276279 0.279062 -0.189763 0.423534 0.218512 -0.393250 -0.430189 0.404893 0.478030 nan nan -0.101734 -0.075565 0.168785 0.035300 -0.354340 -0.121760 0.187624 nan nan -0.668040 0.105408 nan 8 8
P20742 0.272179 -0.279888 0.233306 -0.050237 -0.212627 0.595828 0.042236 -0.224989 0.307233 0.709951 nan nan -0.253728 0.059307 -0.013379 0.034007 0.151578 0.302084 0.175770 nan nan -0.589304 -0.738412 nan 8 8
A0A0G2JQD2;A0A0G2JQM0;A0A0G2JRN4;P30711 -0.278053 0.026230 -0.090845 -0.254147 -0.276933 -0.067652 -0.176005 -0.274112 0.078554 -0.628983 nan nan -0.142576 0.069472 -0.484875 -0.287631 0.457282 -0.051869 -0.384560 nan nan 0.739107 -0.015886 nan 8 8
P05556;P05556-2;P05556-3;P05556-4;P05556-5 -0.472444 -0.167499 -0.168632 -0.424667 -0.407688 -0.081046 -0.339333 -0.164482 -0.016348 0.328409 nan nan -0.091371 -0.022663 0.247949 0.184702 0.014803 -0.061795 0.211570 nan nan 0.003355 0.029215 nan 8 8
Q9Y653;Q9Y653-2;Q9Y653-3 -0.130134 -0.612567 -0.502569 -0.417296 -0.709442 -0.432946 -0.604021 -0.672811 -0.191744 0.165963 nan nan -0.729686 -0.209520 0.286967 0.024990 -0.180826 -0.282547 0.037286 nan nan 0.406484 -0.099690 nan 8 8
Q9HBT6 -0.262534 0.221576 0.103837 0.204430 0.622433 0.187341 0.099041 -0.130810 0.445839 -0.463106 nan nan 0.711355 -0.386375 -0.045300 -0.091698 0.134155 -0.213494 0.300313 nan nan 0.106877 -0.188251 nan 8 8
Q13790 -0.521996 0.464373 0.466962 -0.198715 -0.158093 -0.335648 -0.462789 0.222006 -0.215508 -0.483553 nan nan 0.029153 0.326662 0.174426 -0.331592 -0.520082 0.061342 -0.245186 nan nan 0.078235 -0.389478 nan 8 8
Q5SRI9 -0.274015 0.360157 0.349130 -0.087155 0.056862 -0.128198 -0.584189 -0.348800 0.733544 -0.259834 nan nan 0.162788 -0.222384 0.592005 0.136408 -0.154616 -0.088515 -0.437021 nan nan 0.192704 -0.380808 nan 8 8
A0A0C4DH24 -0.413803 -0.342470 -0.215487 -0.451837 -0.356647 -0.345938 -0.168096 -0.181333 -0.444924 0.152891 nan nan -0.499855 0.520926 0.293136 0.105732 -0.017588 0.189404 0.076900 nan nan -0.601267 0.147397 nan 9 9
A0A087X1V2 -0.070690 0.278264 0.311792 -0.243238 -0.186011 0.611378 0.057184 0.003097 0.088347 -0.016904 nan nan 0.227076 -0.258078 -0.181107 -0.345283 0.180509 0.241761 0.177299 nan nan 0.039602 -0.005971 nan 9 9
B1AJR6;B1AJR9;B1AJS0;O14522 0.530251 -0.006428 -0.202694 0.093327 0.208754 -0.245885 -0.445185 -0.353217 0.644977 0.465073 nan nan -0.049114 0.431654 -0.304727 -0.305222 -0.367348 0.183477 0.413495 nan nan 0.348627 -0.076654 nan 9 9
A0A0A0MRJ6;F6S8N6;H7BY58;P22061;P22061-2 0.302851 -0.209110 0.001491 -0.170927 -0.080518 0.058591 0.109176 0.179939 -0.309833 0.501859 nan nan -0.040253 -0.338414 -0.578080 -0.848097 0.122965 0.184935 -0.137151 nan nan -0.241246 -0.541386 nan 9 9
E7EQB2;E7ER44;P02788;P02788-2 0.255680 0.166358 0.097317 0.195892 -0.257214 0.384095 0.005787 0.359566 -0.323275 0.079027 nan nan -0.156194 0.687927 0.368657 0.365413 -0.219374 -0.214494 -0.077001 nan nan 0.572123 0.107378 nan 9 9
P08493;P08493-2 0.038081 -0.365314 -0.149088 -0.232150 -0.149889 0.410243 -0.635377 -0.703795 -0.351766 0.150211 nan nan 0.024427 -0.732845 -0.406176 -0.246588 0.248326 0.360308 0.036249 nan nan -0.397982 -0.397982 nan 9 9
Q9Y6C2 0.340254 -0.063698 0.476543 -0.272410 -0.282619 0.490419 0.348677 0.614777 -0.561969 0.551986 nan nan 0.042545 0.503046 0.079803 0.126374 -0.421373 -0.186103 -0.551184 nan nan 0.017971 -0.127935 nan 9 9
P40121;P40121-2 -0.202385 0.366429 0.147497 0.354102 0.267371 0.271809 0.424692 0.529660 0.206394 0.067427 nan nan -0.529026 -0.073815 0.396670 0.185199 0.452133 -0.359970 -0.137408 nan nan -0.176933 -0.108032 nan 9 9
P50395 0.013608 -0.113989 -0.127114 0.174813 -0.522429 -0.668212 0.327166 0.193138 0.152363 0.368766 nan nan 0.016074 0.149917 0.147720 0.026398 0.314399 0.045322 0.088129 nan nan -0.756427 -0.524876 nan 9 9
B1ALD9;Q15063;Q15063-3;Q15063-5 -0.101006 -0.206638 -0.057735 -0.287475 -0.318185 -0.055343 -0.240490 -0.113355 -0.369493 0.319285 nan nan 0.425439 -0.021086 0.080040 0.086550 0.290365 0.619472 0.401747 nan nan 0.053837 -0.648727 nan 9 9
P10745 -0.236641 -0.003766 0.263363 0.023283 -0.213456 -0.335154 0.468128 0.489925 -0.073139 0.328449 nan nan 0.136523 0.118299 0.597717 0.544391 0.443008 0.161125 0.211604 nan nan 0.688639 -0.246050 nan 9 9
E9PGA6;Q9BXJ4;Q9BXJ4-2;Q9BXJ4-3 0.124919 0.220024 0.438780 -0.190596 0.022614 -0.131321 0.189015 0.273937 -0.120507 0.106539 nan nan 0.095878 -0.121064 0.608922 0.429551 -0.464017 0.323193 0.253418 nan nan 0.096747 0.648033 nan 9 9
Q14019 -0.344741 0.103428 0.081974 0.165256 -0.116056 0.201472 -0.725812 -0.693759 0.525948 -0.022084 nan nan 0.367858 0.113498 0.064769 0.401079 0.109434 0.256334 -0.129078 nan nan -0.513253 -0.468898 nan 9 9
Q6UWH4;Q6UWH4-2 -0.588718 -0.226077 -0.734227 0.098882 -0.426622 -0.499675 -0.397136 0.151063 -0.079928 -0.371245 nan nan -0.426708 -0.140997 -0.578150 -0.418792 0.080968 0.143652 0.733599 nan nan -0.126561 -0.059018 nan 10 10
A0A1W2PQB1;H0Y755;M9MML0;P08637 -0.090323 0.560602 0.576538 -0.253373 0.297178 0.372386 0.460288 0.690784 0.595184 -0.139070 nan nan 0.331327 0.395590 -0.081580 -0.266497 -0.031056 0.540145 0.473262 nan nan -0.617375 0.448386 nan 10 10
O76070 -0.432052 -0.185319 -0.263040 -0.215355 -0.529241 -0.437830 -0.204646 0.162530 0.469000 0.054400 nan nan -0.278608 -0.244682 -0.212317 -0.086282 0.426709 -0.128095 0.192339 nan nan -0.474129 0.337800 nan 10 10
P08670 -0.314275 -0.344201 -0.405174 -0.179390 -0.211443 -0.453995 -0.501234 -0.180491 -0.411473 0.025687 nan nan 0.206552 0.176763 0.551632 0.469208 -0.316052 0.402038 -0.136206 nan nan 0.028298 0.291405 nan 10 10
P00915 0.933351 -0.107314 -0.200301 0.883700 0.471143 0.858997 0.215571 0.399271 0.618757 0.705641 nan nan 0.933484 0.185897 0.081495 0.012603 -0.463694 0.414364 0.256349 nan nan 0.359490 0.118343 nan 10 10
Q9NQS3;Q9NQS3-2;Q9NQS3-3 -0.424592 0.014125 -0.200662 -0.139485 0.026032 -0.324225 -0.111027 -0.140219 0.069189 -0.172248 nan nan 0.202371 -0.073466 0.035925 0.120460 -0.279258 0.092650 0.098420 nan nan 0.156133 -0.174368 nan 11 11
F8WD41;Q15166 -0.202856 0.248253 -0.033970 0.391304 0.296698 -0.115738 -0.359926 -0.291802 0.180810 -0.065244 nan nan 0.029724 -0.324188 -0.066574 0.265087 0.543120 0.363056 0.385556 nan nan 0.207901 0.110472 nan 11 11
P01742 0.375201 0.102261 0.231270 -0.267428 -0.411875 0.180121 0.026400 0.271311 -0.587962 -0.094307 nan nan -0.393033 -0.416621 0.676899 0.724786 0.169061 0.594179 0.034490 nan nan -0.194256 0.578374 nan 11 11
P55774 0.103249 -0.025625 0.019666 -0.308916 -0.040293 0.027328 -0.215141 -0.218578 -0.211760 -0.158074 nan nan -0.181675 -0.106143 -0.183622 -0.001697 0.200956 -0.506454 -0.554162 nan nan -0.584551 -0.551401 nan 11 11
Q9UFP1 0.121246 -0.270576 0.119135 -0.094306 -0.109641 -0.212699 0.028588 0.208374 0.056005 -0.173611 nan nan -0.123046 0.620369 -0.266922 -0.405138 -0.562924 0.076316 -0.058400 nan nan 0.159000 0.376368 nan 12 12
Q96RW7;Q96RW7-2 -0.274538 0.099474 -0.297262 -0.301343 0.060619 -0.089934 -0.398478 -0.352505 0.219858 -0.008263 nan nan 0.625088 -0.562674 0.502826 0.258426 0.174427 -0.017822 0.348426 nan nan 0.378646 0.234101 nan 12 12
Q9BUJ0 -0.037313 -0.415798 0.162340 -0.314651 -0.097394 0.064061 -0.098921 0.027051 -0.265561 -0.119433 nan nan -0.111523 -0.500348 -0.202398 -0.237412 -0.262258 -0.107921 -0.456513 nan nan 0.049800 -0.265784 nan 12 12
Q9NPH3;Q9NPH3-2;Q9NPH3-5 0.151630 0.295972 0.192087 -0.026036 -0.192285 0.386922 -0.230539 -0.455875 -0.432298 -0.248365 nan nan -0.211883 -0.337277 -0.108300 -0.175836 0.289718 0.015789 0.185993 nan nan 0.415593 0.041149 nan 12 12
A1L4H1 0.186316 -0.347841 0.120191 0.309188 -0.067660 -0.156773 0.072585 -0.116746 -0.221594 0.008498 nan nan -0.326665 0.577579 0.300334 -0.324173 -0.368786 0.289843 0.494000 nan nan -0.031547 -0.042797 nan 12 12
Q9NZ08;Q9NZ08-2 0.032883 0.041364 -0.323727 0.433371 0.017643 -0.185123 0.299292 0.489257 0.572066 0.260532 nan nan 0.348860 -0.181623 -0.357765 -0.168136 0.461927 0.521799 0.211498 nan nan 0.202766 0.753444 nan 12 12
P29966 0.767339 -0.387350 0.447993 0.078527 -0.032150 0.543698 0.720862 0.172931 0.306123 0.322425 nan nan 0.323159 0.343889 0.159322 0.397631 -0.109396 0.358982 0.295261 nan nan 0.350638 0.249010 nan 13 13
B4DYV8;Q8WZ75;Q8WZ75-2;Q8WZ75-3 0.270149 0.459992 0.519129 -0.317889 0.046917 0.379618 0.130729 0.093670 -0.515780 0.120207 nan nan -0.440024 0.029137 0.078349 0.291298 -0.002071 0.133283 0.299691 nan nan 0.085688 -0.033982 nan 14 14
Q6PCB0 -0.010818 -0.199445 -0.067678 -0.187322 -0.196364 -0.255827 -0.287276 -0.323757 -0.316441 0.132796 nan nan -0.011942 -0.086716 -0.530040 -0.329471 0.563574 -0.188112 0.172064 nan nan -0.023465 -0.115061 nan 14 14
O43852;O43852-3;O43852-5 0.193891 -0.167247 -0.064301 -0.218009 -0.011591 -0.029262 0.295117 0.141397 0.293714 -0.062552 nan nan -0.155637 0.338758 0.614990 0.394764 0.162358 0.574898 0.225381 nan nan -0.110063 -0.377160 nan 15 15
Q6ZMP0;Q6ZMP0-2 -0.257877 0.176879 -0.154587 0.132715 0.358425 -0.264652 0.066672 -0.028928 -0.151054 -0.094502 nan nan -0.193571 0.160711 0.414953 -0.432438 -0.047291 -0.152902 -0.032008 nan nan 0.481989 0.031743 nan 16 16
P11597;P11597-2 0.091263 -0.310808 -0.345883 -0.244961 -0.386252 -0.106661 -0.071420 -0.029940 0.225605 -0.022021 nan nan -0.443310 0.033269 -0.211026 0.052557 0.587557 0.330721 0.075937 nan nan 0.400242 -0.067354 nan 16 16
Q96AQ6;Q96AQ6-2 0.074929 -0.019283 0.173339 -0.286627 -0.091433 0.082604 0.266461 0.377825 -0.047226 -0.184507 nan nan -0.159338 0.183841 -0.158511 0.136220 -0.514370 -0.189966 0.091875 nan nan 0.554486 -0.006715 nan 16 16
A6XMH3;P01236;Q5I0G2 0.143374 -0.262062 -0.535140 -0.263686 -0.401925 -0.352063 0.249922 0.236575 -0.184831 0.126855 nan nan -0.456084 0.078152 0.183538 0.124415 -0.253925 -0.746360 -0.591013 nan nan 0.415223 0.098237 nan 17 17
Q13231;Q13231-3 0.028238 0.126926 0.281138 -0.248124 0.165819 0.012068 -0.144117 -0.108428 0.011950 0.128016 nan nan 0.134116 0.022742 -0.284982 -0.276036 -0.099596 0.135271 -0.133702 nan nan -0.001439 0.440431 nan 19 19

Error plot#

Hide code cell source

# Compute performance metrics (MSE, MAE, N, prop) for each top-N model on
# the test predictions, using the 'observed' column as ground truth.
metrics = pimmslearn.models.Metrics()
cols = ['observed', *TOP_N_ORDER]
test_metrics = metrics.add_metrics(pred_test[cols], key='test data')
test_metrics = pd.DataFrame(test_metrics)[TOP_N_ORDER]
test_metrics
Selected as truth to compare to: observed
BPCA VAE DAE TRKNN RF
MSE 0.455 0.477 0.474 0.500 0.512
MAE 0.432 0.435 0.435 0.458 0.465
N 12,600.000 12,600.000 12,600.000 12,600.000 12,600.000
prop 1.000 1.000 1.000 1.000 1.000

Hide code cell source

# All models are compared on the same number of test intensities, so the
# 'N' row should hold a single unique value.
unique_counts = test_metrics.loc['N'].unique()
n_in_comparison = int(unique_counts[0])
n_in_comparison
12600

Hide code cell source

# Single-row frame of the selected metric, indexed by the data level name
# (e.g. 'protein groups') for plotting.
_to_plot = (test_metrics
            .loc[METRIC]
            .to_frame(name=feature_names.name)
            .T)
_to_plot
BPCA VAE DAE TRKNN RF
protein groups 0.432 0.435 0.435 0.458 0.465

Hide code cell source

# Build a short architecture description (latent dim., hidden layers) per
# model to annotate the performance bar plot. Models without these config
# keys (non-PIMMS models) fall back to empty annotations.
try:
    text = model_configs[["latent_dim", "hidden_layers"]].apply(
        build_text,
        axis=1)
except KeyError:
    # fix: corrected typo 'comparsion' -> 'comparison' in the log message
    logger.warning("No PIMMS models in comparison. Using empty text")
    text = pd.Series('', index=model_configs.columns)

_to_plot.loc["text"] = text
_to_plot = _to_plot.fillna('')
_to_plot
BPCA VAE DAE TRKNN RF
protein groups 0.432 0.435 0.435 0.458 0.465
text LD: 10 HL: 64 LD: 10 HL: 64

Hide code cell source

# Bar plot of test-set performance per model, annotated with the bar
# heights and the model architecture text.
fig, ax = plt.subplots(figsize=(4, 2))  # adjust figure size here if needed
ylabel = (f"{METRIC} for {FEAT_NAME_DISPLAY}"
          f"\n({n_in_comparison:,} intensities)")
ax = _to_plot.loc[[feature_names.name]].plot.bar(
    ax=ax,
    rot=0,
    width=.7,
    color=COLORS_TO_USE,
    ylabel=ylabel,
)
ax = pimmslearn.plotting.add_height_to_barplot(ax, size=7)
ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=7)
ax.set_xticklabels([])
fname = args.out_figures / f'2_{group}_performance_test.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_performance_test.pdf
_images/c223fdf1685adac3480d39a7601c737e2f8eba6b89d0df894a6c26f1af1963b9.png

Hide code cell source

# Persist the plotted metric values in long format alongside the figure.
dumps[fname.stem] = fname.with_suffix('.csv')
_to_plot_long = (_to_plot
                 .T
                 .rename({feature_names.name: 'metric_value'}, axis=1))
_to_plot_long['data level'] = feature_names.name
_to_plot_long = _to_plot_long.set_index('data level', append=True)
_to_plot_long.to_csv(fname.with_suffix('.csv'))

Plot error by median feature intensity#

Hide code cell source

# Plot each top-N model's error binned by the median training-data
# intensity of the feature, then persist the figure and underlying data.
pimmslearn.plotting.make_large_descriptors(7)
fig, ax = plt.subplots(figsize=(8, 2))

cols = [TARGET_COL, *TOP_N_ORDER]
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
    pred=pred_test[cols],
    feat_medians=data.train_X.median(),
    ax=ax,
    feat_name=FEAT_NAME_DISPLAY,
    metric_name=METRIC,
    palette=COLORS_TO_USE,
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
pimmslearn.plotting.make_large_descriptors(6)
fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)

dumps[fname.stem] = fname.with_suffix('.csv')
errors_binned.to_csv(fname.with_suffix('.csv'))
errors_binned
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:99: UserWarning: The palette list has more values (24) than needed (5), which may not be intended.
  sns.barplot(data=errors,
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:99: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  sns.barplot(data=errors,
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf
Sample ID protein groups model MAE bin n_obs intensity binned by median of protein groups
0 Sample_000 A0A075B6P5;P01615 BPCA 0.467 19 912 19\n(N=912)
1 Sample_000 A0A075B6P5;P01615 VAE 0.417 19 912 19\n(N=912)
2 Sample_000 A0A075B6P5;P01615 DAE 0.317 19 912 19\n(N=912)
3 Sample_000 A0A075B6P5;P01615 TRKNN 0.422 19 912 19\n(N=912)
4 Sample_000 A0A075B6P5;P01615 RF 0.626 19 912 19\n(N=912)
... ... ... ... ... ... ... ...
62,995 Sample_209 Q9UGM5;Q9UGM5-2 BPCA 0.476 16 1,913 16\n(N=1,913)
62,996 Sample_209 Q9UGM5;Q9UGM5-2 VAE 0.288 16 1,913 16\n(N=1,913)
62,997 Sample_209 Q9UGM5;Q9UGM5-2 DAE 0.434 16 1,913 16\n(N=1,913)
62,998 Sample_209 Q9UGM5;Q9UGM5-2 TRKNN 0.442 16 1,913 16\n(N=1,913)
62,999 Sample_209 Q9UGM5;Q9UGM5-2 RF 0.293 16 1,913 16\n(N=1,913)

63000 rows × 7 columns

_images/28f6b2147f42e8fe5aa31801715aad322a035b72732ef65e0b030b74c3a06c94.png

Hide code cell source

# ! only used for reporting
# ! only used for reporting
# Extract the per-bin means and confidence intervals behind the plot
# and write them to an Excel sheet next to the figure.
plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
    errors=errors_binned,
    metric_name=METRIC,
    feat_name=FEAT_NAME_DISPLAY,
)
excel_path = fname.with_suffix('.xlsx')
plotted.to_excel(excel_path, index=False)
plotted
bin model mean ci_low ci_high
0 11\n(N=63) BPCA 0.619 0.504 0.744
1 11\n(N=63) DAE 0.656 0.514 0.822
2 11\n(N=63) RF 0.595 0.471 0.731
3 11\n(N=63) TRKNN 0.589 0.482 0.698
4 11\n(N=63) VAE 0.561 0.454 0.677
... ... ... ... ... ...
85 29\n(N=12) BPCA 0.133 0.065 0.212
86 29\n(N=12) DAE 0.175 0.115 0.251
87 29\n(N=12) RF 0.241 0.156 0.331
88 29\n(N=12) TRKNN 0.230 0.158 0.317
89 29\n(N=12) VAE 0.184 0.120 0.252

90 rows × 5 columns

Hide code cell source

# Show the binned errors of the best-ranked model, sorted by the metric
# (last column of errors_binned holds the intensity-bin label).
best_model = ORDER_MODELS[0]
bin_col = errors_binned.columns[-1]
(errors_binned
 .set_index(['model', bin_col])
 .loc[best_model]
 .sort_values(by=METRIC))
Sample ID protein groups MAE bin n_obs
intensity binned by median of protein groups
18\n(N=846) Sample_142 P09972 0.000 18 846
15\n(N=2,557) Sample_021 A0A0A0MT66 0.000 15 2,557
14\n(N=2,074) Sample_058 Q16853;Q16853-2 0.000 14 2,074
16\n(N=1,913) Sample_015 B7Z2R4;C9JR67;O43556;O43556-3;O43556-4 0.000 16 1,913
15\n(N=2,557) Sample_079 A6NCT7;Q07092;Q07092-2 0.000 15 2,557
... ... ... ... ... ...
14\n(N=2,074) Sample_011 P11597;P11597-2 5.771 14 2,074
14\n(N=2,074) Sample_184 F8WD41;Q15166 6.195 14 2,074
17\n(N=1,393) Sample_108 P27824;P27824-2 6.482 17 1,393
14\n(N=2,074) Sample_091 F8WD41;Q15166 6.823 14 2,074
14\n(N=2,074) Sample_115 P17050 7.635 14 2,074

12600 rows × 5 columns

Custom model selection#

Hide code cell source

# Repeat the test-set performance bar plot for a custom model selection.
if SEL_MODELS:
    metrics = pimmslearn.models.Metrics()
    test_metrics = metrics.add_metrics(
        pred_test[['observed', *SEL_MODELS]], key='test data')
    test_metrics = pd.DataFrame(test_metrics)[SEL_MODELS]
    test_metrics

    # All models are evaluated on the same intensities -> single N.
    n_in_comparison = int(test_metrics.loc['N'].unique()[0])
    n_in_comparison

    _to_plot = test_metrics.loc[METRIC].to_frame().T
    _to_plot.index = [feature_names.name]
    _to_plot

    # Per-model annotation text (latent dim / hidden layers) for PIMMS models.
    try:
        text = model_configs[["latent_dim", "hidden_layers"]].apply(
            build_text,
            axis=1)
    except KeyError:
        logger.warning("No PIMMS models in comparison. Using empty text")
        # NOTE(review): index is model_configs.columns here, while `text` is
        # otherwise indexed by model name — confirm this fallback aligns with
        # _to_plot's columns.
        text = pd.Series('', index=model_configs.columns)

    _to_plot.loc["text"] = text
    _to_plot = _to_plot.fillna('')
    _to_plot

    fig, ax = plt.subplots(figsize=(4, 2))
    ax = _to_plot.loc[[feature_names.name]].plot.bar(
        rot=0,
        ylabel=f"{METRIC} for {FEAT_NAME_DISPLAY} ({n_in_comparison:,} intensities)",
        # title=f'performance on test data (based on {n_in_comparison:,} measurements)',
        color=pimmslearn.plotting.defaults.assign_colors(
            list(k.upper() for k in SEL_MODELS)),
        ax=ax,
        width=.7)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    ax = pimmslearn.plotting.add_height_to_barplot(ax, size=5)
    ax = pimmslearn.plotting.add_text_to_barplot(ax, _to_plot.loc["text"], size=5)
    ax.set_xticklabels([])

    fname = args.out_figures / f'2_{group}_performance_test_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(fig, name=fname)

    # Persist the plotted metric values in long format next to the figure.
    dumps[fname.stem] = fname.with_suffix('.csv')
    _to_plot_long = _to_plot.T
    _to_plot_long = _to_plot_long.rename(
        {feature_names.name: 'metric_value'}, axis=1)
    _to_plot_long['data level'] = feature_names.name
    _to_plot_long = _to_plot_long.set_index('data level', append=True)
    _to_plot_long.to_csv(fname.with_suffix('.csv'))

Hide code cell source

# custom selection
if SEL_MODELS:
    pimmslearn.plotting.make_large_descriptors(7)
    fig, ax = plt.subplots(figsize=(8, 2))

    ax, errors_binned = pimmslearn.plotting.errors.plot_errors_by_median(
        pred=pred_test[
            [TARGET_COL] + SEL_MODELS
        ],
        feat_medians=data.train_X.median(),
        ax=ax,
        metric_name=METRIC,
        feat_name=FEAT_NAME_DISPLAY,
        palette=pimmslearn.plotting.defaults.assign_colors(
            list(k.upper() for k in SEL_MODELS))
    )
    # ax.set_ylim(0, 1.5)
    ax.legend(loc='best', ncols=len(SEL_MODELS))
    # for text in ax.legend().get_texts():
    #     text.set_fontsize(6)
    fname = args.out_figures / f'2_{group}_test_errors_binned_by_feat_medians_sel.pdf'
    figures[fname.stem] = fname
    pimmslearn.savefig(ax.get_figure(), name=fname)
    plt.show(fig)

    dumps[fname.stem] = fname.with_suffix('.csv')
    errors_binned.to_csv(fname.with_suffix('.csv'))
    pimmslearn.plotting.make_large_descriptors(6)
    # ax.xaxis.set_tick_params(rotation=0) # horizontal

    # ! only used for reporting
    plotted = pimmslearn.plotting.errors.get_data_for_errors_by_median(
        errors=errors_binned,
        feat_name=FEAT_NAME_DISPLAY,
        metric_name=METRIC
    )
    plotted.to_excel(fname.with_suffix('.xlsx'), index=False)
    display(plotted)

Error by integer (non-decimal) intensity value#

  • number of observations in parentheses.

Hide code cell source

# Plot test errors binned by integer intensity value for the top-N models.
fig, ax = plt.subplots(figsize=(8, 2))
selected_cols = [TARGET_COL] + TOP_N_ORDER
ax, errors_binned = pimmslearn.plotting.errors.plot_errors_binned(
    pred_test[selected_cols],
    ax=ax,
    metric_name=METRIC,
    palette=TOP_N_COLOR_PALETTE,
)
ax.legend(loc='best', ncols=len(TOP_N_ORDER))
fname = args.out_figures / f'2_{group}_test_errors_binned_by_int.pdf'
figures[fname.stem] = fname
pimmslearn.savefig(ax.get_figure(), name=fname)
/home/runner/work/pimms/pimms/project/.snakemake/conda/924ec7e362d761ecf0807b9074d79999_/lib/python3.12/site-packages/pimmslearn/plotting/errors.py:45: FutureWarning: 

The `errwidth` parameter is deprecated. And will be removed in v0.15.0. Pass `err_kws={'linewidth': 1.2}` instead.

  ax = sns.barplot(data=errors_binned, ax=ax,
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf
_images/9cab34bacd5d34f5e6d728fc4e65bbbe0bd259a8ba9fb4780d2b66f3a84da3f6.png

Hide code cell source

# Dump the binned errors next to the figure and show a preview.
csv_path = fname.with_suffix('.csv')
dumps[fname.stem] = csv_path
errors_binned.to_csv(csv_path)
errors_binned.head()
Sample ID protein groups model MAE intensity bin
0 Sample_143 P02768 BPCA 0.065 30\n(N=2)
1 Sample_143 P02768 VAE 0.250 30\n(N=2)
2 Sample_143 P02768 DAE 0.443 30\n(N=2)
3 Sample_143 P02768 TRKNN 0.574 30\n(N=2)
4 Sample_143 P02768 RF 0.457 30\n(N=2)

Figures dumped to disk#

Hide code cell source

# Mapping of figure stem -> saved path, collected throughout the notebook.
figures
{'2_1_fake_na_val_test_splits': Path('runs/alzheimer_study/figures/2_1_fake_na_val_test_splits.png'),
 '2_1_pred_corr_val_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_val_per_sample.pdf'),
 '2_1_errors_binned_by_feat_median_val': Path('runs/alzheimer_study/figures/2_1_errors_binned_by_feat_median_val.pdf'),
 '2_1_intensity_binned_top_4_models_test': Path('runs/alzheimer_study/figures/2_1_intensity_binned_top_4_models_test.pdf'),
 '2_1_pred_corr_test_per_sample': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_sample.pdf'),
 '2_1_pred_corr_test_per_feat': Path('runs/alzheimer_study/figures/2_1_pred_corr_test_per_feat.pdf'),
 '2_1_performance_test': Path('runs/alzheimer_study/figures/2_1_performance_test.pdf'),
 '2_1_test_errors_binned_by_feat_medians': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_feat_medians.pdf'),
 '2_1_test_errors_binned_by_int': Path('runs/alzheimer_study/figures/2_1_test_errors_binned_by_int.pdf')}

Hide code cell source

# Mapping of dump stem -> saved CSV path; final marker signals completion.
dumps
print("done")
done