I'm trying to understand why these two queries are treated differently with regards to use of the primary keys in joins.
This query with a join on icd_codes
(the SELECT
query, without the EXPLAIN
, of course) completes in 56 ms:
EXPLAIN
SELECT var.Var_ID,
var.Gene,
var.HGVSc,
pVCF_145K.PT_ID,
pVCF_145K.AD_ALT,
pVCF_145K.AD_REF,
icd_codes.ICD_NM,
icd_codes.PT_AGE
FROM public.variants_145K var
INNER JOIN public.pVCF_145K USING (Var_ID)
INNER JOIN public.icd_codes using (PT_ID)
# INNER JOIN public.demographics USING (PT_ID)
WHERE Gene IN ('SLC9A6', 'SLC9A7')
AND Canonical
AND impact = 'high'
+------+-------------+-----------+-------+------------------------------------------------------------------+---------------------------------+---------+------------------------+------+------------------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+------+-------------+-----------+-------+------------------------------------------------------------------+---------------------------------+---------+------------------------+------+------------------------------------+
| 1 | SIMPLE | var | range | PRIMARY,variants_145K_Gene_index,variants_145K_Impact_Gene_index | variants_145K_Impact_Gene_index | 125 | NULL | 280 | Using index condition; Using where |
| 1 | SIMPLE | pVCF_145K | ref | PRIMARY,pVCF_145K_PT_ID_index | PRIMARY | 326 | public.var.Var_ID | 268 | |
| 1 | SIMPLE | icd_codes | ref | PRIMARY | PRIMARY | 38 | public.pVCF_145K.PT_ID | 29 | |
+------+-------------+-----------+-------+------------------------------------------------------------------+---------------------------------+---------+------------------------+------+------------------------------------+
This query with a join on demographics
takes over 11 minutes, and I'm not sure how to interpret the difference in the explain results. Why is it resorting to using the join buffer? How can I optimize this further?
EXPLAIN
SELECT variants_145K.Var_ID,
variants_145K.Gene,
variants_145K.HGVSc,
pVCF_145K.PT_ID,
pVCF_145K.AD_ALT,
pVCF_145K.AD_REF,
demographics.Sex,
demographics.Age
FROM public.variants_145K
INNER JOIN public.pVCF_145K USING (Var_ID)
# inner join public.icd_codes using (PT_ID)
INNER JOIN public.demographics USING (PT_ID)
WHERE Gene IN ('SLC9A6', 'SLC9A7')
AND Canonical
AND impact = 'high'
+------+-------------+---------------+--------+------------------------------------------------------------------+---------------------------------+---------+-------------------------------------------------------+---------+------------------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+------+-------------+---------------+--------+------------------------------------------------------------------+---------------------------------+---------+-------------------------------------------------------+---------+------------------------------------+
| 1 | SIMPLE | variants_145K | range | PRIMARY,variants_145K_Gene_index,variants_145K_Impact_Gene_index | variants_145K_Impact_Gene_index | 125 | NULL | 280 | Using index condition; Using where |
| 1 | SIMPLE | demographics | ALL | PRIMARY | NULL | NULL | NULL | 1916393 | Using join buffer (flat, BNL join) |
| 1 | SIMPLE | pVCF_145K | eq_ref | PRIMARY,pVCF_145K_PT_ID_index | PRIMARY | 364 | public.variants_145K.Var_ID,public.demographics.PT_ID | 1 | |
+------+-------------+---------------+--------+------------------------------------------------------------------+---------------------------------+---------+-------------------------------------------------------+---------+------------------------------------+
Adding a further filter in demographics
(WHERE demographics.Platform IS NOT NULL
) as shown below reduces to 38 seconds. However, there are queries where we do not use such filters so it would be ideal if it could use the primary PT_ID
key in the joins.
+------+-------------+---------------+--------+------------------------------------------------------------------+---------------------------------+---------+-------------------------------------------------------+--------+------------------------------------------------------------------------+
| id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra |
+------+-------------+---------------+--------+------------------------------------------------------------------+---------------------------------+---------+-------------------------------------------------------+--------+------------------------------------------------------------------------+
| 1 | SIMPLE | variants_145K | range | PRIMARY,variants_145K_Gene_index,variants_145K_Impact_Gene_index | variants_145K_Impact_Gene_index | 125 | NULL | 280 | Using index condition; Using where |
| 1 | SIMPLE | demographics | range | PRIMARY,Demographics_PLATFORM_index | Demographics_PLATFORM_index | 17 | NULL | 258544 | Using index condition; Using where; Using join buffer (flat, BNL join) |
| 1 | SIMPLE | pVCF_145K | eq_ref | PRIMARY,pVCF_145K_PT_ID_index | PRIMARY | 364 | public.variants_145K.Var_ID,public.demographics.PT_ID | 1 | |
+------+-------------+---------------+--------+------------------------------------------------------------------+---------------------------------+---------+-------------------------------------------------------+--------+------------------------------------------------------------------------+
The tables:
create table public.demographics # 1,916,393 rows
(
PT_ID varchar(9) not null
primary key,
Age float(3,1) null,
Status varchar(8) not null,
Sex varchar(7) not null,
Race_1 varchar(41) not null,
Race_2 varchar(41) not null,
Ethnicity varchar(22) not null,
Smoker_flag tinyint(1) not null,
Platform char(4) null,
MyCode_Consent tinyint(1) not null,
MR_ENC_DT date null,
Birthday date null,
Deathday date null,
max_unrelated_145K tinyint unsigned null
);
create index Demographics_PLATFORM_index
on public.demographics (Platform);
create table public.icd_codes # 116,220,141 rows
(
PT_ID varchar(9) not null,
ICD_CD varchar(8) not null,
ICD_NM varchar(217) not null,
DX_DT date not null,
PT_AGE float(3,1) unsigned not null,
CODE_SYSTEM char(7) not null,
primary key (PT_ID, ICD_CD, DX_DT)
);
create table public.pVCF_145K # 10,113,244,082 rows
(
Var_ID varchar(81) not null,
PT_ID varchar(9) not null,
GT tinyint unsigned not null,
GQ smallint unsigned not null,
AD_REF smallint unsigned not null,
AD_ALT smallint unsigned not null,
DP smallint unsigned not null,
FT varchar(30) null,
primary key (Var_ID, PT_ID)
);
create index pVCF_145K_PT_ID_index
on public.pVCF_145K (PT_ID);
create table public.variants_145K # 151,314,917 rows
(
Var_ID varchar(81) not null,
Gene varchar(22) null,
Feature varchar(18) not null,
Feature_type varchar(10) null,
HIGH_INF_POS tinyint(1) null,
Consequence varchar(26) not null,
rsid varchar(34) null,
Impact varchar(8) not null,
Canonical tinyint(1) not null,
Exon smallint unsigned null,
Intron smallint unsigned null,
HGVSc varchar(323) null,
HGVSp varchar(196) null,
AA_position smallint unsigned null,
gnomAD_NFE_MAF float null,
SIFT varchar(14) null,
PolyPhen varchar(17) null,
GHS_Hom mediumint(5) unsigned null,
GHS_Het mediumint(5) unsigned null,
GHS_WT mediumint(5) unsigned null,
IDT_MAF float null,
VCR_MAF float null,
UKB_MAF float null,
Chr tinyint unsigned not null,
Pos int(9) unsigned not null,
Ref varchar(298) not null,
Alt varchar(306) not null,
primary key (Var_ID, Feature)
);
create index variants_145K_Chr_Pos_Ref_Alt_index
on public.variants_145K (Chr, Pos, Ref, Alt);
create index variants_145K_Gene_index
on public.variants_145K (Gene);
create index variants_145K_Impact_Gene_index
on public.variants_145K (Impact, Gene);
create index variants_145K_rsid_index
on public.variants_145K (rsid);
This is on MariaDB 10.5.8 (innodb)
Thank you!