Compare outcomes from differential analysis based on different imputation methods#

  • load scores based on 10_1_ald_diff_analysis

Hide code cell source

import logging
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import display

import pimmslearn
import pimmslearn.databases.diseases

logger = pimmslearn.logging.setup_nb_logger()

plt.rcParams['figure.figsize'] = (2, 2)
fontsize = 5
pimmslearn.plotting.make_large_descriptors(fontsize)
logging.getLogger('fontTools').setLevel(logging.ERROR)

# catch passed parameters
args = None
args = dict(globals()).keys()

Parameters#

Default and set parameters for the notebook.

folder_experiment = 'runs/appl_ald_data/plasma/proteinGroups'

target = 'kleiner'
model_key = 'VAE'
baseline = 'RSN'
out_folder = 'diff_analysis'
selected_statistics = ['p-unc', '-Log10 pvalue', 'qvalue', 'rejected']

disease_ontology = 5082  # code from https://disease-ontology.org/
# split diseases notebook? Query gene names for proteins in file from uniprot?
annotaitons_gene_col = 'PG.Genes'
# Parameters
disease_ontology = 10652
folder_experiment = "runs/alzheimer_study"
target = "AD"
baseline = "PI"
model_key = "QRILC"
out_folder = "diff_analysis"
annotaitons_gene_col = "None"

Add set parameters to configuration

Hide code cell source

params = pimmslearn.nb.get_params(args, globals=globals())
args = pimmslearn.nb.Config()
args.folder_experiment = Path(params["folder_experiment"])
args = pimmslearn.nb.add_default_paths(args,
                                 out_root=(
                                     args.folder_experiment
                                     / params["out_folder"]
                                     / params["target"]
                                     / f"{params['baseline']}_vs_{params['model_key']}"))
args.update_from_dict(params)
args.scores_folder = scores_folder = (args.folder_experiment
                                      / params["out_folder"]
                                      / params["target"]
                                      / 'scores')
args.freq_features_observed = args.folder_experiment / 'freq_features_observed.csv'
args
root - INFO     Removed from global namespace: folder_experiment
root - INFO     Removed from global namespace: target
root - INFO     Removed from global namespace: model_key
root - INFO     Removed from global namespace: baseline
root - INFO     Removed from global namespace: out_folder
root - INFO     Removed from global namespace: selected_statistics
root - INFO     Removed from global namespace: disease_ontology
root - INFO     Removed from global namespace: annotaitons_gene_col
root - INFO     Already set attribute: folder_experiment has value runs/alzheimer_study
root - INFO     Already set attribute: out_folder has value diff_analysis
{'annotaitons_gene_col': 'None',
 'baseline': 'PI',
 'data': PosixPath('runs/alzheimer_study/data'),
 'disease_ontology': 10652,
 'folder_experiment': PosixPath('runs/alzheimer_study'),
 'freq_features_observed': PosixPath('runs/alzheimer_study/freq_features_observed.csv'),
 'model_key': 'QRILC',
 'out_figures': PosixPath('runs/alzheimer_study/figures'),
 'out_folder': PosixPath('runs/alzheimer_study/diff_analysis/AD/PI_vs_QRILC'),
 'out_metrics': PosixPath('runs/alzheimer_study'),
 'out_models': PosixPath('runs/alzheimer_study'),
 'out_preds': PosixPath('runs/alzheimer_study/preds'),
 'scores_folder': PosixPath('runs/alzheimer_study/diff_analysis/AD/scores'),
 'selected_statistics': ['p-unc', '-Log10 pvalue', 'qvalue', 'rejected'],
 'target': 'AD'}

Excel file for exports#

files_out = dict()
writer_args = dict(float_format='%.3f')

fname = args.out_folder / 'diff_analysis_compare_methods.xlsx'
files_out[fname.name] = fname
writer = pd.ExcelWriter(fname)
logger.info("Writing to excel file: %s", fname)
root - INFO     Writing to excel file: runs/alzheimer_study/diff_analysis/AD/PI_vs_QRILC/diff_analysis_compare_methods.xlsx

Load scores#

Load baseline model scores#

Show all statistics, later use selected statistics

Hide code cell source

fname = args.scores_folder / f'diff_analysis_scores_{args.baseline}.pkl'
scores_baseline = pd.read_pickle(fname)
scores_baseline
model PI
var SS DF F p-unc np2 -Log10 pvalue qvalue rejected
protein groups Source
A0A024QZX5;A0A087X1N8;P35237 AD 0.261 1 0.456 0.501 0.002 0.301 0.653 False
age 0.036 1 0.063 0.802 0.000 0.096 0.878 False
Kiel 1.631 1 2.848 0.093 0.015 1.031 0.198 False
Magdeburg 4.653 1 8.127 0.005 0.041 2.315 0.018 True
Sweden 7.129 1 12.451 0.001 0.061 3.281 0.003 True
... ... ... ... ... ... ... ... ... ...
S4R3U6 AD 0.832 1 0.895 0.345 0.005 0.462 0.508 False
age 0.017 1 0.019 0.891 0.000 0.050 0.934 False
Kiel 0.472 1 0.508 0.477 0.003 0.322 0.630 False
Magdeburg 1.695 1 1.824 0.178 0.009 0.749 0.320 False
Sweden 13.108 1 14.110 0.000 0.069 3.640 0.001 True

7105 rows × 8 columns

Load selected comparison model scores#

Hide code cell source

fname = args.scores_folder / f'diff_analysis_scores_{args.model_key}.pkl'
scores_model = pd.read_pickle(fname)
scores_model
model QRILC
var SS DF F p-unc np2 -Log10 pvalue qvalue rejected
protein groups Source
A0A024QZX5;A0A087X1N8;P35237 AD 0.640 1 4.091 0.044 0.021 1.352 0.103 False
age 0.019 1 0.124 0.725 0.001 0.140 0.815 False
Kiel 0.428 1 2.732 0.100 0.014 1.000 0.197 False
Magdeburg 0.910 1 5.816 0.017 0.030 1.774 0.047 True
Sweden 2.368 1 15.136 0.000 0.073 3.860 0.001 True
... ... ... ... ... ... ... ... ... ...
S4R3U6 AD 1.002 1 0.543 0.462 0.003 0.335 0.603 False
age 1.906 1 1.033 0.311 0.005 0.507 0.462 False
Kiel 10.737 1 5.816 0.017 0.030 1.774 0.047 True
Magdeburg 20.107 1 10.891 0.001 0.054 2.938 0.005 True
Sweden 0.008 1 0.004 0.947 0.000 0.024 0.968 False

7105 rows × 8 columns

Combined scores#

show only selected statistics for comparsion

Hide code cell source

scores = scores_model.join(scores_baseline, how='outer')[[args.baseline, args.model_key]]
scores = scores.loc[:, pd.IndexSlice[scores.columns.levels[0].to_list(),
                                     args.selected_statistics]]
scores
model PI QRILC
var p-unc -Log10 pvalue qvalue rejected p-unc -Log10 pvalue qvalue rejected
protein groups Source
A0A024QZX5;A0A087X1N8;P35237 AD 0.501 0.301 0.653 False 0.044 1.352 0.103 False
Kiel 0.093 1.031 0.198 False 0.100 1.000 0.197 False
Magdeburg 0.005 2.315 0.018 True 0.017 1.774 0.047 True
Sweden 0.001 3.281 0.003 True 0.000 3.860 0.001 True
age 0.802 0.096 0.878 False 0.725 0.140 0.815 False
... ... ... ... ... ... ... ... ... ...
S4R3U6 AD 0.345 0.462 0.508 False 0.462 0.335 0.603 False
Kiel 0.477 0.322 0.630 False 0.017 1.774 0.047 True
Magdeburg 0.178 0.749 0.320 False 0.001 2.938 0.005 True
Sweden 0.000 3.640 0.001 True 0.947 0.024 0.968 False
age 0.891 0.050 0.934 False 0.311 0.507 0.462 False

7105 rows × 8 columns

Models in comparison (name mapping)

Hide code cell source

models = pimmslearn.nb.Config.from_dict(
    pimmslearn.pandas.index_to_dict(scores.columns.get_level_values(0)))
vars(models)
{'PI': 'PI', 'QRILC': 'QRILC'}

Describe scores#

Hide code cell source

scores.describe()
model PI QRILC
var p-unc -Log10 pvalue qvalue p-unc -Log10 pvalue qvalue
count 7,105.000 7,105.000 7,105.000 7,105.000 7,105.000 7,105.000
mean 0.260 2.484 0.337 0.245 2.741 0.311
std 0.301 5.376 0.329 0.296 5.177 0.324
min 0.000 0.000 0.000 0.000 0.000 0.000
25% 0.004 0.336 0.015 0.002 0.361 0.008
50% 0.123 0.909 0.247 0.093 1.030 0.187
75% 0.462 2.431 0.615 0.436 2.691 0.581
max 1.000 145.647 1.000 0.999 86.130 0.999

One to one comparison of by feature:#

Hide code cell source

scores = scores.loc[pd.IndexSlice[:, args.target], :]
scores.to_excel(writer, 'scores', **writer_args)
scores
/tmp/ipykernel_111598/3761369923.py:2: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
  scores.to_excel(writer, 'scores', **writer_args)
model PI QRILC
var p-unc -Log10 pvalue qvalue rejected p-unc -Log10 pvalue qvalue rejected
protein groups Source
A0A024QZX5;A0A087X1N8;P35237 AD 0.501 0.301 0.653 False 0.044 1.352 0.103 False
A0A024R0T9;K7ER74;P02655 AD 0.035 1.458 0.092 False 0.032 1.489 0.080 False
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 AD 0.102 0.990 0.214 False 0.366 0.437 0.517 False
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 AD 0.620 0.208 0.747 False 0.304 0.517 0.455 False
A0A075B6H7 AD 0.063 1.202 0.146 False 0.160 0.797 0.283 False
... ... ... ... ... ... ... ... ... ...
Q9Y6R7 AD 0.175 0.756 0.317 False 0.175 0.756 0.304 False
Q9Y6X5 AD 0.057 1.241 0.136 False 0.031 1.510 0.078 False
Q9Y6Y8;Q9Y6Y8-2 AD 0.083 1.079 0.183 False 0.083 1.079 0.171 False
Q9Y6Y9 AD 0.561 0.251 0.697 False 0.829 0.082 0.891 False
S4R3U6 AD 0.345 0.462 0.508 False 0.462 0.335 0.603 False

1421 rows × 8 columns

And the descriptive statistics of the numeric values:

Hide code cell source

scores.describe()
model PI QRILC
var p-unc -Log10 pvalue qvalue p-unc -Log10 pvalue qvalue
count 1,421.000 1,421.000 1,421.000 1,421.000 1,421.000 1,421.000
mean 0.253 1.409 0.335 0.249 1.484 0.323
std 0.291 1.643 0.316 0.288 1.752 0.313
min 0.000 0.001 0.000 0.000 0.001 0.000
25% 0.012 0.363 0.040 0.010 0.354 0.030
50% 0.121 0.916 0.243 0.110 0.958 0.213
75% 0.434 1.909 0.591 0.443 2.010 0.587
max 0.997 23.415 0.998 0.998 22.323 0.998

and the boolean decision values

Hide code cell source

scores.describe(include=['bool', 'O'])
model PI QRILC
var rejected rejected
count 1421 1421
unique 2 2
top False False
freq 1025 995

Load frequencies of observed features#

Hide code cell source

freq_feat = pd.read_csv(args.freq_features_observed, index_col=0)
freq_feat.columns = pd.MultiIndex.from_tuples([('data', 'frequency'),])
freq_feat
data
frequency
protein groups
A0A024QZX5;A0A087X1N8;P35237 186
A0A024R0T9;K7ER74;P02655 195
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 174
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 196
A0A075B6H7 91
... ...
Q9Y6R7 197
Q9Y6X5 173
Q9Y6Y8;Q9Y6Y8-2 197
Q9Y6Y9 119
S4R3U6 126

1421 rows × 1 columns

Compare shared features#

Hide code cell source

scores_common = (scores
                 .dropna()
                 .reset_index(-1, drop=True)
                 ).join(
    freq_feat, how='left'
)
scores_common
PI QRILC data
p-unc -Log10 pvalue qvalue rejected p-unc -Log10 pvalue qvalue rejected frequency
protein groups
A0A024QZX5;A0A087X1N8;P35237 0.501 0.301 0.653 False 0.044 1.352 0.103 False 186
A0A024R0T9;K7ER74;P02655 0.035 1.458 0.092 False 0.032 1.489 0.080 False 195
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 0.102 0.990 0.214 False 0.366 0.437 0.517 False 174
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 0.620 0.208 0.747 False 0.304 0.517 0.455 False 196
A0A075B6H7 0.063 1.202 0.146 False 0.160 0.797 0.283 False 91
... ... ... ... ... ... ... ... ... ...
Q9Y6R7 0.175 0.756 0.317 False 0.175 0.756 0.304 False 197
Q9Y6X5 0.057 1.241 0.136 False 0.031 1.510 0.078 False 173
Q9Y6Y8;Q9Y6Y8-2 0.083 1.079 0.183 False 0.083 1.079 0.171 False 197
Q9Y6Y9 0.561 0.251 0.697 False 0.829 0.082 0.891 False 119
S4R3U6 0.345 0.462 0.508 False 0.462 0.335 0.603 False 126

1421 rows × 9 columns

Annotate decisions in Confusion Table style:#

Hide code cell source

def annotate_decision(scores, model, model_column):
    return scores[(model_column, 'rejected')].replace({False: f'{model} (no) ', True: f'{model} (yes)'})


annotations = None
for model, model_column in models.items():
    if annotations is not None:
        annotations += ' - '
        annotations += annotate_decision(scores_common,
                                         model=model, model_column=model_column)
    else:
        annotations = annotate_decision(
            scores_common, model=model, model_column=model_column)
annotations.name = 'Differential Analysis Comparison'
annotations.value_counts()
Differential Analysis Comparison
PI (no)  - QRILC (no)    964
PI (yes) - QRILC (yes)   365
PI (no)  - QRILC (yes)    61
PI (yes) - QRILC (no)     31
Name: count, dtype: int64

List different decisions between models#

Hide code cell source

mask_different = (
    (scores_common.loc[:, pd.IndexSlice[:, 'rejected']].any(axis=1))
    & ~(scores_common.loc[:, pd.IndexSlice[:, 'rejected']].all(axis=1))
)
_to_write = scores_common.loc[mask_different]
_to_write.to_excel(writer, 'differences', **writer_args)
logger.info("Writen to Excel file under sheet 'differences'.")
_to_write
/tmp/ipykernel_111598/1417621106.py:6: FutureWarning: Starting with pandas version 3.0 all arguments of to_excel except for the argument 'excel_writer' will be keyword-only.
  _to_write.to_excel(writer, 'differences', **writer_args)
root - INFO     Writen to Excel file under sheet 'differences'.
PI QRILC data
p-unc -Log10 pvalue qvalue rejected p-unc -Log10 pvalue qvalue rejected frequency
protein groups
A0A075B6I0 0.032 1.488 0.088 False 0.002 2.653 0.009 True 194
A0A087WWT2;Q9NPD7 0.033 1.478 0.089 False 0.004 2.403 0.014 True 193
A0A087X1G7;A0A0B4J1S4;O60613 0.014 1.850 0.045 True 0.023 1.635 0.062 False 184
A0A0A0MQS9;A0A0A0MTC7;Q16363;Q16363-2 0.012 1.904 0.040 True 0.249 0.603 0.394 False 92
A0A0A0MS20;A0A0A0MSZ8;A0A0G2JM38;A0A0G2JM43;A0A0G2JM57;A0A0G2JM84;A0A0G2JMH7;A0A0G2JML1;A0A0G2JNE9;A0A0G2JNL1;A0A0G2JP25;A0A0G2JP84;A0A0G2JPA9;A0A0G2JPC7;A0A0G2JPU4;A0A0G2JPX5;A0A0G2JQ10;A0A0G2JQ20;A8MUE1;C9JST2;Q8NHJ6;Q8NHJ6-2;Q8NHJ6-3 0.104 0.981 0.217 False 0.010 2.001 0.031 True 166
... ... ... ... ... ... ... ... ... ...
Q9NYX4 0.054 1.265 0.130 False 0.006 2.200 0.021 True 195
Q9P0K9 0.051 1.291 0.124 False 0.010 1.997 0.031 True 192
Q9UJ14 0.006 2.252 0.021 True 0.038 1.424 0.090 False 169
Q9ULP0-3;Q9ULP0-6 0.025 1.596 0.072 False 0.002 2.647 0.009 True 136
Q9UQ52 0.061 1.212 0.144 False 0.004 2.437 0.013 True 188

92 rows × 9 columns

Plot qvalues of both models with annotated decisions#

Prepare data for plotting (qvalues)

Hide code cell source

var = 'qvalue'
to_plot = [scores_common[v][var] for v in models.values()]
for s, k in zip(to_plot, models.keys()):
    s.name = k.replace('_', ' ')
to_plot.append(scores_common['data'])
to_plot.append(annotations)
to_plot = pd.concat(to_plot, axis=1)
to_plot
PI QRILC frequency Differential Analysis Comparison
protein groups
A0A024QZX5;A0A087X1N8;P35237 0.653 0.103 186 PI (no) - QRILC (no)
A0A024R0T9;K7ER74;P02655 0.092 0.080 195 PI (no) - QRILC (no)
A0A024R3W6;A0A024R412;O60462;O60462-2;O60462-3;O60462-4;O60462-5;Q7LBX6;X5D2Q8 0.214 0.517 174 PI (no) - QRILC (no)
A0A024R644;A0A0A0MRU5;A0A1B0GWI2;O75503 0.747 0.455 196 PI (no) - QRILC (no)
A0A075B6H7 0.146 0.283 91 PI (no) - QRILC (no)
... ... ... ... ...
Q9Y6R7 0.317 0.304 197 PI (no) - QRILC (no)
Q9Y6X5 0.136 0.078 173 PI (no) - QRILC (no)
Q9Y6Y8;Q9Y6Y8-2 0.183 0.171 197 PI (no) - QRILC (no)
Q9Y6Y9 0.697 0.891 119 PI (no) - QRILC (no)
S4R3U6 0.508 0.603 126 PI (no) - QRILC (no)

1421 rows × 4 columns

List of features with the highest difference in qvalues

Hide code cell source

# should it be possible to run not only RSN?
to_plot['diff_qvalue'] = (to_plot[str(args.baseline)] - to_plot[str(args.model_key)]).abs()
to_plot.loc[mask_different].sort_values('diff_qvalue', ascending=False)
PI QRILC frequency Differential Analysis Comparison diff_qvalue
protein groups
P36269;P36269-2;P36269-3 0.049 0.634 54 PI (yes) - QRILC (no) 0.585
Q13093 0.582 0.049 85 PI (no) - QRILC (yes) 0.534
G3V295;G3V3I1;G3V5Z7;P60900 0.544 0.013 95 PI (no) - QRILC (yes) 0.531
P37802;P37802-2;X6RJP6 0.024 0.507 110 PI (yes) - QRILC (no) 0.483
Q08174-2 0.483 0.016 194 PI (no) - QRILC (yes) 0.467
... ... ... ... ... ...
Q7Z7H5;Q7Z7H5-3 0.047 0.058 176 PI (yes) - QRILC (no) 0.011
Q6NUJ2 0.044 0.054 165 PI (yes) - QRILC (no) 0.010
K7ERG9;P00746 0.052 0.047 197 PI (no) - QRILC (yes) 0.004
P00740;P00740-2 0.053 0.048 197 PI (no) - QRILC (yes) 0.004
C9J0J0;Q96EE4 0.049 0.053 162 PI (yes) - QRILC (no) 0.004

92 rows × 5 columns

Differences plotted with created annotations#

Hide code cell source

figsize = (4, 4)
size = 5
fig, ax = plt.subplots(figsize=figsize)
x_col = to_plot.columns[0]
y_col = to_plot.columns[1]
ax = sns.scatterplot(data=to_plot,
                     x=x_col,
                     y=y_col,
                     s=size,
                     hue='Differential Analysis Comparison',
                     ax=ax)
_ = ax.legend(fontsize=fontsize,
              title_fontsize=fontsize,
              markerscale=0.4,
              title='',
              )
ax.set_xlabel(f"qvalue for {x_col}")
ax.set_ylabel(f"qvalue for {y_col}")
ax.hlines(0.05, 0, 1, color='grey', linestyles='dotted')
ax.vlines(0.05, 0, 1, color='grey', linestyles='dotted')
sns.move_legend(ax, "upper right")
files_out[f'diff_analysis_comparision_1_{args.model_key}'] = (
    args.out_folder /
    f'diff_analysis_comparision_1_{args.model_key}')
fname = files_out[f'diff_analysis_comparision_1_{args.model_key}']
pimmslearn.savefig(fig, name=fname)
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/diff_analysis/AD/PI_vs_QRILC/diff_analysis_comparision_1_QRILC
../../../_images/d8175e60b383032d3fe8b9e50803f6094de4193aabd886d309b8f2d7023a6133.png
  • also showing how many features were measured (“observed”) by size of circle

Hide code cell source

fig, ax = plt.subplots(figsize=figsize)
ax = sns.scatterplot(data=to_plot,
                     x=to_plot.columns[0],
                     y=to_plot.columns[1],
                     size='frequency',
                     s=size,
                     sizes=(5, 20),
                     hue='Differential Analysis Comparison')
_ = ax.legend(fontsize=fontsize,
              title_fontsize=fontsize,
              markerscale=0.6,
              title='',
              )
ax.set_xlabel(f"qvalue for {x_col}")
ax.set_ylabel(f"qvalue for {y_col}")
ax.hlines(0.05, 0, 1, color='grey', linestyles='dotted')
ax.vlines(0.05, 0, 1, color='grey', linestyles='dotted')
sns.move_legend(ax, "upper right")
files_out[f'diff_analysis_comparision_2_{args.model_key}'] = (
    args.out_folder / f'diff_analysis_comparision_2_{args.model_key}')
pimmslearn.savefig(
    fig, name=files_out[f'diff_analysis_comparision_2_{args.model_key}'])
pimmslearn.plotting - INFO     Saved Figures to runs/alzheimer_study/diff_analysis/AD/PI_vs_QRILC/diff_analysis_comparision_2_QRILC
../../../_images/456b491b061d3bc2746c5f4fb1031a93343dd99b0cfab682d107f226a85534a6.png

Only features contained in model#

  • this block exist due to a specific part in the ALD analysis of the paper

Hide code cell source

scores_model_only = scores.reset_index(level=-1, drop=True)
_diff = scores_model_only.index.difference(scores_common.index)
if not _diff.empty:
    scores_model_only = (scores_model_only
                         .loc[
                             _diff,
                             args.model_key]
                         .sort_values(by='qvalue', ascending=True)
                         .join(freq_feat.squeeze().rename(freq_feat.columns.droplevel()[0])
                               )
                         )
    display(scores_model_only)
else:
    scores_model_only = None
    logger.info("No features only in new comparision model.")

if not _diff.empty:
    scores_model_only.to_excel(writer, 'only_model', **writer_args)
    display(scores_model_only.rejected.value_counts())
    scores_model_only_rejected = scores_model_only.loc[scores_model_only.rejected]
    scores_model_only_rejected.to_excel(
        writer, 'only_model_rejected', **writer_args)
root - INFO     No features only in new comparision model.

DISEASES DB lookup#

Query diseases database for gene associations with specified disease ontology id.

Hide code cell source

data = pimmslearn.databases.diseases.get_disease_association(
    doid=args.disease_ontology, limit=10000)
data = pd.DataFrame.from_dict(data, orient='index').rename_axis('ENSP', axis=0)
data = data.rename(columns={'name': args.annotaitons_gene_col}).reset_index(
).set_index(args.annotaitons_gene_col)
data
pimmslearn.databases.diseases - WARNING  There are more associations available
ENSP score
None
PSEN1 ENSP00000326366 5.000
APP ENSP00000284981 5.000
PSEN2 ENSP00000355747 5.000
APOE ENSP00000252486 5.000
TREM2 ENSP00000362205 4.825
... ... ...
CEP170B ENSP00000404151 0.683
SMPDL3A ENSP00000357425 0.683
ADAMTS10 ENSP00000471851 0.683
PPP3R2 ENSP00000498330 0.683
VAT1 ENSP00000347872 0.683

10000 rows × 2 columns

Shared features#

ToDo: new script -> DISEASES DB lookup

Hide code cell source

feat_name = scores.index.names[0]  # first index level is feature name
if args.annotaitons_gene_col in scores.index.names:
    logger.info(f"Found gene annotation in scores index:  {scores.index.names}")
else:
    logger.info(f"No gene annotation in scores index:  {scores.index.names}"
                " Exiting.")
    import sys
    sys.exit(0)
root - INFO     No gene annotation in scores index:  ['protein groups', 'Source'] Exiting.
/home/runner/work/pimms/pimms/project/.snakemake/conda/43fbe714d68d8fe6f9b0c93f5652adb3_/lib/python3.12/site-packages/IPython/core/interactiveshell.py:3755: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.
  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
An exception has occurred, use %tb to see the full traceback.

SystemExit: 0

Hide code cell source

gene_to_PG = (scores.droplevel(
    list(set(scores.index.names) - {feat_name, args.annotaitons_gene_col})
)
    .index
    .to_frame()
    .reset_index(drop=True)
    .set_index(args.annotaitons_gene_col)
)
gene_to_PG.head()

Hide code cell source

disease_associations_all = data.join(
    gene_to_PG).dropna().reset_index().set_index(feat_name).join(annotations)
disease_associations_all

only by model#

Hide code cell source

idx = disease_associations_all.index.intersection(scores_model_only.index)
disease_assocications_new = disease_associations_all.loc[idx].sort_values(
    'score', ascending=False)
disease_assocications_new.head(20)

Hide code cell source

mask = disease_assocications_new.loc[idx, 'score'] >= 2.0
disease_assocications_new.loc[idx].loc[mask]

Only by model which were significant#

Hide code cell source

idx = disease_associations_all.index.intersection(
    scores_model_only_rejected.index)
disease_assocications_new_rejected = disease_associations_all.loc[idx].sort_values(
    'score', ascending=False)
disease_assocications_new_rejected.head(20)

Hide code cell source

mask = disease_assocications_new_rejected.loc[idx, 'score'] >= 2.0
disease_assocications_new_rejected.loc[idx].loc[mask]

Shared which are only significant for by model#

mask = (scores_common[(str(args.model_key), 'rejected')] & mask_different)
mask.sum()

Hide code cell source

idx = disease_associations_all.index.intersection(mask.index[mask])
disease_assocications_shared_rejected_by_model = (disease_associations_all.loc[idx].sort_values(
    'score', ascending=False))
disease_assocications_shared_rejected_by_model.head(20)

Hide code cell source

mask = disease_assocications_shared_rejected_by_model.loc[idx, 'score'] >= 2.0
disease_assocications_shared_rejected_by_model.loc[idx].loc[mask]

Only significant by RSN#

mask = (scores_common[(str(args.baseline), 'rejected')] & mask_different)
mask.sum()

Hide code cell source

idx = disease_associations_all.index.intersection(mask.index[mask])
disease_assocications_shared_rejected_by_RSN = (
    disease_associations_all
    .loc[idx]
    .sort_values('score', ascending=False))
disease_assocications_shared_rejected_by_RSN.head(20)

Hide code cell source

mask = disease_assocications_shared_rejected_by_RSN.loc[idx, 'score'] >= 2.0
disease_assocications_shared_rejected_by_RSN.loc[idx].loc[mask]

Write to excel#

Hide code cell source

disease_associations_all.to_excel(
    writer, sheet_name='disease_assoc_all', **writer_args)
disease_assocications_new.to_excel(
    writer, sheet_name='disease_assoc_new', **writer_args)
disease_assocications_new_rejected.to_excel(
    writer, sheet_name='disease_assoc_new_rejected', **writer_args)

Outputs#

Hide code cell source

writer.close()
files_out