Back to Article
EDA
Download Notebook

EDA

In [2]:
# Import data loading functions
from mhcpred.data import get_train_data, get_test_data

# Load the training and test sets through the project's mhcpred.data helpers.
# Both objects are used as pandas-style DataFrames in the cells below
# (head / value_counts / groupby), with at least `allele` and `hit` columns.
df_train = get_train_data()
df_test = get_test_data()
In [3]:
# Preview the first rows of the training data
# (columns shown in the output: peptide, allele, hit, fold)
df_train.head()
peptide allele hit fold
0 YFPLAPFNQL HLA-C*14:02 True 0
1 KESKINQVF HLA-B*44:02 True 0
2 QPHDPLVPLSA HLA-B*54:01 True 0
3 RTIADSLINSF HLA-B*57:03 True 0
4 EEKTIIKKL HLA-B*44:03 True 0
In [4]:
# Count training samples per allele, most frequent first.
# Select the column as a Series (single brackets) so the result carries a
# plain allele index rather than a one-level MultiIndex, consistent with how
# the test-set allele counts are computed.
df_train["allele"].value_counts()
allele     
HLA-A*02:01    265252
HLA-B*07:02    201038
HLA-B*57:01    184773
HLA-A*29:02    181136
HLA-B*40:02    145817
                ...  
HLA-A*69:01        12
HLA-A*02:06         6
HLA-A*26:02         6
HLA-A*26:03         6
HLA-A*25:01         6
Name: count, Length: 130, dtype: int64
In [5]:
# Get allele counts in test data
df_test["allele"].value_counts()
allele
HLA-A*02:02    77053
HLA-A*02:06    54510
HLA-A*02:11    48445
HLA-B*53:01    46991
HLA-B*15:17    45917
HLA-A*02:05    45136
HLA-B*15:03    44968
HLA-A*33:01    43333
HLA-A*66:01    41538
HLA-C*12:03    36448
HLA-C*03:03    35568
HLA-A*11:01    33424
HLA-A*30:02    33180
HLA-C*08:02    32416
HLA-A*23:01    30467
HLA-A*32:01    28036
HLA-B*40:02    23768
HLA-B*14:02    21601
HLA-B*37:01    20048
HLA-B*40:01    18908
HLA-B*45:01    18750
HLA-B*18:01    18284
HLA-B*58:01    17946
HLA-B*15:02    16702
HLA-B*15:01    16624
HLA-A*30:01    15837
HLA-C*07:02    15293
HLA-B*46:01    14015
HLA-B*38:01     9509
HLA-B*35:03     8275
HLA-A*26:01     7730
HLA-C*05:01     7033
HLA-A*25:01     6906
HLA-A*68:01     5648
HLA-B*08:01     3365
HLA-B*07:02     2469
Name: count, dtype: int64
In [6]:
# Number of positive (hit == True) samples for each allele in the training set
df_train.groupby("allele")["hit"].sum()
allele
HLA-A*01:01     7156
HLA-A*01:03        7
HLA-A*02:01    13025
HLA-A*02:03     1873
HLA-A*02:04     3155
               ...  
HLA-C*12:04        3
HLA-C*14:02     2441
HLA-C*15:02     1873
HLA-C*16:01     2970
HLA-C*17:01      602
Name: hit, Length: 130, dtype: int64
In [7]:
# Total number of positive (hit) samples in the training set
df_train.hit.sum()
197547
In [8]:
# Total number of training samples
len(df_train)
3679405
In [9]:
# Fraction of training samples that are hits (~5.37% positive rate)
df_train.hit.sum() / len(df_train)
0.05368993084479692
In [10]:
# Get positive samples per allele in test
df_test.groupby("allele").hit.sum()
allele
HLA-A*02:02    3063
HLA-A*02:05    2016
HLA-A*02:06    1975
HLA-A*02:11    2035
HLA-A*11:01    2309
HLA-A*23:01    1697
HLA-A*25:01     396
HLA-A*26:01     555
HLA-A*30:01     892
HLA-A*30:02    2415
HLA-A*32:01    1436
HLA-A*33:01    2138
HLA-A*66:01    1988
HLA-A*68:01     433
HLA-B*07:02     159
HLA-B*08:01     180
HLA-B*14:02    1056
HLA-B*15:01     769
HLA-B*15:02     637
HLA-B*15:03    1953
HLA-B*15:17    1712
HLA-B*18:01     784
HLA-B*35:03     330
HLA-B*37:01    1253
HLA-B*38:01     619
HLA-B*40:01    1268
HLA-B*40:02    1333
HLA-B*45:01     760
HLA-B*46:01     575
HLA-B*53:01    2016
HLA-B*58:01     866
HLA-C*03:03    2003
HLA-C*05:01     383
HLA-C*07:02     593
HLA-C*08:02    1546
HLA-C*12:03    1273
Name: hit, dtype: int64
In [11]:
# Fraction of test samples that are hits (~4.80% positive rate)
df_test.hit.sum() / len(df_test)
0.04800130213150049
In [12]:
# Alleles that occur in the test set but never in the training set
set(df_test["allele"].unique()).difference(df_train["allele"].unique())
{'HLA-A*02:02', 'HLA-A*02:11', 'HLA-A*33:01', 'HLA-B*53:01'}
  • Dataset Class Imbalance
    • Training Set:
      • Total samples: 3,679,405
      • Positive rate: 5.37%
    • Test Set:
      • Total samples: 946,141
      • Positive rate: 4.8%
  • Allele Distribution
    • Most frequent: HLA-A*02:01 (265,252 samples)
    • Least frequent: Multiple alleles with only 6 samples
    • Distribution: Highly imbalanced across alleles
  • Test-Only Alleles
    • HLA-A*02:02
    • HLA-A*02:11
    • HLA-A*33:01
    • HLA-B*53:01