# Import data loading functions
from mhcpred.data import get_train_data, get_test_data
# Load the training and test splits as DataFrames.
# The printed outputs in this notebook show `peptide`, `allele`, `hit` and `fold`
# columns for the training frame.
df_train = get_train_data()
df_test = get_test_data()
In [2]:
In [3]:
# Preview the first rows of the training data to check its columns
# (peptide sequence, HLA allele, boolean `hit` label, and `fold` index).
df_train.head()
| | peptide | allele | hit | fold |
---|---|---|---|---|
0 | YFPLAPFNQL | HLA-C*14:02 | True | 0 |
1 | KESKINQVF | HLA-B*44:02 | True | 0 |
2 | QPHDPLVPLSA | HLA-B*54:01 | True | 0 |
3 | RTIADSLINSF | HLA-B*57:03 | True | 0 |
4 | EEKTIIKKL | HLA-B*44:03 | True | 0 |
In [4]:
# Number of training samples per allele, sorted from most to least frequent.
# Shows how unevenly the alleles are represented in the training set.
df_train[["allele"]].value_counts()
allele
HLA-A*02:01 265252
HLA-B*07:02 201038
HLA-B*57:01 184773
HLA-A*29:02 181136
HLA-B*40:02 145817
...
HLA-A*69:01 12
HLA-A*02:06 6
HLA-A*26:02 6
HLA-A*26:03 6
HLA-A*25:01 6
Name: count, Length: 130, dtype: int64
In [5]:
# Number of test samples per allele, sorted from most to least frequent.
df_test["allele"].value_counts()
allele
HLA-A*02:02 77053
HLA-A*02:06 54510
HLA-A*02:11 48445
HLA-B*53:01 46991
HLA-B*15:17 45917
HLA-A*02:05 45136
HLA-B*15:03 44968
HLA-A*33:01 43333
HLA-A*66:01 41538
HLA-C*12:03 36448
HLA-C*03:03 35568
HLA-A*11:01 33424
HLA-A*30:02 33180
HLA-C*08:02 32416
HLA-A*23:01 30467
HLA-A*32:01 28036
HLA-B*40:02 23768
HLA-B*14:02 21601
HLA-B*37:01 20048
HLA-B*40:01 18908
HLA-B*45:01 18750
HLA-B*18:01 18284
HLA-B*58:01 17946
HLA-B*15:02 16702
HLA-B*15:01 16624
HLA-A*30:01 15837
HLA-C*07:02 15293
HLA-B*46:01 14015
HLA-B*38:01 9509
HLA-B*35:03 8275
HLA-A*26:01 7730
HLA-C*05:01 7033
HLA-A*25:01 6906
HLA-A*68:01 5648
HLA-B*08:01 3365
HLA-B*07:02 2469
Name: count, dtype: int64
In [6]:
# Positive samples per allele in the training set.
# `hit` is boolean, so summing it within each allele group counts the True rows.
df_train.groupby("allele").hit.sum()
allele
HLA-A*01:01 7156
HLA-A*01:03 7
HLA-A*02:01 13025
HLA-A*02:03 1873
HLA-A*02:04 3155
...
HLA-C*12:04 3
HLA-C*14:02 2441
HLA-C*15:02 1873
HLA-C*16:01 2970
HLA-C*17:01 602
Name: hit, Length: 130, dtype: int64
In [7]:
# Total number of positive (hit == True) samples in the training set.
df_train.hit.sum()
197547
In [8]:
# Total number of samples (rows) in the training set.
len(df_train)
3679405
In [9]:
# Fraction of training samples that are positives (~5.37% positive rate):
# the classes are heavily imbalanced towards negatives.
df_train.hit.sum() / len(df_train)
0.05368993084479692
In [10]:
# Positive samples per allele in the test set (sum of the boolean `hit` column).
df_test.groupby("allele").hit.sum()
allele
HLA-A*02:02 3063
HLA-A*02:05 2016
HLA-A*02:06 1975
HLA-A*02:11 2035
HLA-A*11:01 2309
HLA-A*23:01 1697
HLA-A*25:01 396
HLA-A*26:01 555
HLA-A*30:01 892
HLA-A*30:02 2415
HLA-A*32:01 1436
HLA-A*33:01 2138
HLA-A*66:01 1988
HLA-A*68:01 433
HLA-B*07:02 159
HLA-B*08:01 180
HLA-B*14:02 1056
HLA-B*15:01 769
HLA-B*15:02 637
HLA-B*15:03 1953
HLA-B*15:17 1712
HLA-B*18:01 784
HLA-B*35:03 330
HLA-B*37:01 1253
HLA-B*38:01 619
HLA-B*40:01 1268
HLA-B*40:02 1333
HLA-B*45:01 760
HLA-B*46:01 575
HLA-B*53:01 2016
HLA-B*58:01 866
HLA-C*03:03 2003
HLA-C*05:01 383
HLA-C*07:02 593
HLA-C*08:02 1546
HLA-C*12:03 1273
Name: hit, dtype: int64
In [11]:
# Fraction of test samples that are positives, for comparison with the training rate.
df_test.hit.sum() / len(df_test)
0.04800130213150049
In [11]:
# Find alleles that occur in the test set but never in the training set
# (set difference of the unique allele names); for these alleles the
# training data contains no examples at all.
set(df_test.allele.unique()) - set(df_train.allele.unique())
{'HLA-A*02:02', 'HLA-A*02:11', 'HLA-A*33:01', 'HLA-B*53:01'}
- Dataset Class Imbalance
  - Training Set:
    - Total samples: 3,679,405
    - Positive rate: 5.37%
  - Test Set:
    - Total samples: 946,141
    - Positive rate: 4.8%
- Allele Distribution
  - Most frequent: HLA-A*02:01 (265,252 samples)
  - Least frequent: Multiple alleles with only 6 samples
  - Distribution: Highly imbalanced across alleles
- Test-Only Alleles
  - HLA-A*02:02
  - HLA-A*02:11
  - HLA-A*33:01
  - HLA-B*53:01