This R notebook uses the full downloaded chem database (version 28) to extract interesting drug information.
This script requires the following dataset files:
chembl_28_sqlite.tar.gz
file.chembl
variable.src1src22.txt.gz
file.unichem
variable.efo.obo
file.efo
variable.These data sources are agglomerated and stored in the sqlite3 format saved as the file pointed to by the file.output
variable. A cached (temporary) sqlite3 database will also be used at the path pointed to by the file.local
variable.
path.data <- file.path('data.raw','drugs') # Path of the base data folder. Update to reflect your own path.
file.local <- 'cache/chembl-drugs.sqlite3'
file.output <- 'data/chembl-drugs.sqlite3'
tbl.selected <- 'selected'
tbl.unichem <- 'unichem'
file.unichem <- file.path(path.data, 'unichem','src1src22.txt.gz')
file.chembl <- file.path(path.data, 'chembl','chembl_28.sqlite3')
#file.mesh <- file.path(path.data, 'mesh','desc2021.xml')
file.efo <- file.path(path.data, 'efo','efo.obo')
library(ggplot2)
library(dplyr)
library(reshape2)
Attaching package: ‘dplyr’ The following objects are masked from ‘package:stats’: filter, lag The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union
mdcast <- function(df, id.vars, expand, prefixes, binarise=F) {
l_cast <- list()
get.df <- function(i) {
colname <- expand[i]
prefix <- prefixes[i]
frm <- formula(paste(paste(sep='+',id.vars),'~',colname))
dcast(frm, data=df %>% mutate(!!colname:=paste0(prefix,.[[colname]])),fun.aggregate=length,value.var=id.vars[1])
}
df.out <- get.df(1)
for(i in seq_len(length(expand)-1)+1) {
df.next <- get.df(i)
df.out <- df.out %>% merge(df.next, by=id.vars)
}
if(binarise) {
df.out <- df.out %>% mutate_at(vars(-all_of(id.vars)), . %>% as.logical %>% as.integer)
}
df.out
}
count.heads <- function(df) { list(type=gsub('_.*','',df %>% colnames) %>% as.factor) %>% as.data.frame %>% group_by(type) %>% summarize(num=n()) }
print.head.count <- function(df) { df.hc <- count.heads(df); paste(df.hc$type,df.hc$num,sep=': ', collapse=', ') }
#df.drug.atc.raw %>% mdcast(id.vars='molregno',expand=c('level1_description'), prefixes=c('l1_'), binarise=T) %>% head(9)
read.unichem <- function(file) {
con.file <- gzfile(file, 'rt')
dt.map <- read.table(con.file, header=T, sep='\t') %>% rename(chembl='From.src.1',pubchem='To.src.22') %>% (data.table::data.table)(key='chembl')
close(con.file)
dt.map
}
# Some helping functions for interfacing with the R DB API
dbFetchQuery <- function(con, ...) {
res <- DBI::dbSendQuery(con, ...)
df <- DBI::dbFetch(res)
DBI::dbClearResult(res)
df
}
dbListDatabases <- function(con) {
dbFetchQuery(con, 'PRAGMA database_list;') %>% `[[`('name')
}
dbExistsTable <- function(con, table, db) {
tbl.name <- ifelse(missing(db),'SQLITE_MASTER', paste0('`',db,'`.SQLITE_MASTER'))
sql <- paste0("select count(name) as value from ", tbl.name, " where type='table' and name=?")
dbFetchQuery(con, sql, params=table)$value != 0
}
dbCreateTable <- function(con, tbl.name, vars) {
sql <- paste0('create table ',tbl.name,' (',paste(names(vars),vars, collapse=', '),');')
DBI::dbExecute(con, sql)
}
dbWriteOutputTable <- function(con, table, df, index, drop=T,...) {
qtable <- DBI::dbQuoteIdentifier(con, table)
tbl.name <- paste('output',qtable,sep='.')
if(drop) {
DBI::dbExecute(con,paste0('drop table if exists ',tbl.name))
}
DBI::dbWriteTable(con, DBI::SQL(tbl.name), df,...)
if(!missing(index)) {
index.name = paste(table,index,'index',sep='_')
qindex <- DBI::dbQuoteIdentifier(con, index)
DBI::dbExecute(con, paste0('create index output.',index.name,' on ',qtable,'(',qindex,')'))
}
}
# This file connects a local sqlite
dbInitCache <- function(con) {
if(!'local' %in% dbListDatabases(con)) {
message('Attaching ', file.local,' as `local`')
DBI::dbExecute(con, 'attach ? as ?;', params=c(file.local, 'local'))
}
if(!'output' %in% dbListDatabases(con)) {
message('Attaching ', file.output,' as `output`')
DBI::dbExecute(con, 'attach ? as ?;', params=c(file.output, 'output'))
}
if(!dbExistsTable(con,tbl.selected, db='local')) {
tbl.name <- paste0('`local`.`',tbl.selected,'`')
message('Creating selection ')
dbCreateTable(con, tbl.name,c(molregno='INTEGER PRIMARY KEY NOT NULL'))
sql <- paste0("
insert into ",tbl.name,"
select distinct molregno from
molecule_dictionary as md
where molregno in (
select molregno from drug_indication union select molregno from drug_mechanism
) and
molecule_type='Small molecule'
order by molregno;")
DBI::dbExecute(con, sql);
}
if(!dbExistsTable(con,tbl.unichem, db='local')) {
tbl.name <- paste0('local.', DBI::dbQuoteIdentifier(con,tbl.unichem))
message('Reading data for table: ',tbl.name)
df.unichem <- read.unichem(file=file.unichem)
message('Writing to DB')
DBI::dbWriteTable(con,DBI::SQL(tbl.name), df.unichem)
message('Creating Index')
sql <- paste0('create index `local`.UniChemIndex on `',tbl.unichem,'`(chembl)')
DBI::dbExecute(con, sql)
}
}
con <- DBI::dbConnect(RSQLite::SQLite(), file.chembl,
synchronous='normal')
DBI::dbListTables(con)
dbInitCache(con)
Attaching cache/chembl-drugs.sqlite3 as `local` Attaching data/chembl-drugs.sqlite3 as `output` Creating selection Reading data for table: local.`unichem` Writing to DB Creating Index
In this work we use the pre-computed 2D-structure kernel available from the PubChem interface. For this purpose, we need to associate each drug in the Chembl database with those in the PubChem database.
This happens via the unichem
crossreferencing library.
For this you will need the downloaded unichem
df.link.raw <- dbFetchQuery(con, '
select md.molregno,md.chembl_id,md.pref_name,uc.pubchem from
local.unichem as uc join
molecule_dictionary as md on md.chembl_id = uc.chembl
where md.molregno in (select molregno from local.selected)
group by molregno
order by molregno
limit 10000
;')
message('Cross-referenced chemicals (with PubChem): ',df.link.raw %>% nrow)
df.link.raw %>% head(10)
dbWriteOutputTable(con, table='drug_xref', df.link.raw,index='molregno', drop=T)
Cross-referenced chemicals (with PubChem): 5546
molregno | chembl_id | pref_name | pubchem | |
---|---|---|---|---|
<int> | <chr> | <chr> | <int> | |
1 | 97 | CHEMBL2 | PRAZOSIN | 4893 |
2 | 115 | CHEMBL3 | NICOTINE | 89594 |
3 | 146 | CHEMBL4 | OFLOXACIN | 4583 |
4 | 147 | CHEMBL5 | NALIDIXIC ACID | 4421 |
5 | 148 | CHEMBL6246 | ELLAGIC ACID | 5281855 |
6 | 154 | CHEMBL204021 | DARAPLADIB | 9939609 |
7 | 173 | CHEMBL6 | INDOMETHACIN | 3715 |
8 | 194 | CHEMBL403 | SULBACTAM | 130313 |
9 | 207 | CHEMBL404 | TAZOBACTAM | 123630 |
10 | 210 | CHEMBL6273 | FLEROXACIN | 40466912 |
This is part of the manually curated information that is available in the Chembl database and consists of assumed or verified mechanism of action information for each drug.
The result is stored on the drug_moa
table of the output sqlite3 database file.
process.moa <- function(df.moa) {
df.moa.sel <- df.moa %>% select(
-record_id,-tid,-tax_id,-oc_id,-target_desc,-contains('comment'),-variant_id, # unimportant
-molecular_mechanism,-site_id,-target_name,-mechanism_of_action, # too fine
-disease_efficacy,-direct_interaction # only refer to 1 case
) %>%
mutate(
target=ifelse(parent_type=='PROTEIN',parent_type, target_type),
organism_class=ifelse(l2=='Mammalia',organism, l1)
) %>%
select(-l1,-l2,-l3,-organism,-parent_type,-target_type) %>%
mutate_if(is.character,as.factor)
df.mec_ids <- df.moa.sel %>% group_by(molregno) %>% summarise(mec_ids=paste(mec_id, collapse=','))
#print(df.moa.sel %>% filter() %>% as.list %>% lapply(function(x) length(unique(x))) %>% as.vector)
df.moa.act <- df.moa.sel %>% mutate(action_type=paste0('act_',gsub(' ','_',tolower(action_type)))) %>% dcast(molregno~action_type, length, value.var='target')
df.moa.tar <- df.moa.sel %>% mutate(target=paste0('tar_',gsub(' ','_',tolower(target)))) %>% dcast(molregno~target, length, value.var = 'target')
df.moa.org <- df.moa.sel %>% mutate(organism=paste0('org_',gsub(' ','_',tolower(organism_class)))) %>% dcast(molregno~organism, length, value.var = 'target')
df.moa.wide <- merge(df.mec_ids, df.moa.act, by='molregno') %>% merge(df.moa.tar, by='molregno') %>% merge(df.moa.org, by='molregno')
df.moa.wide %>% head
df.moa.wide
}
df.moa.raw <- dbFetchQuery(con, "SELECT
dm.*,tt.*,td.pref_name as target_name, td.organism,oc.* FROM
drug_mechanism as dm left join
target_dictionary as td on dm.tid=td.tid join
target_type as tt on td.target_type = tt.target_type join
organism_class as oc on oc.tax_id=td.tax_id join
molecule_dictionary as md on md.molregno = dm.molregno
where md.molecule_type = 'Small molecule'
limit 10000")
print(df.moa.raw %>% colnames)
df.moa <- df.moa.raw %>% process.moa
message('Count of header types discovered: ', print.head.count(df.moa))
df.moa %>% nrow
df.moa %>% head
dbWriteOutputTable(con, table='drug_moa', df.moa,index='molregno', drop=T)
[1] "mec_id" "record_id" "molregno" [4] "mechanism_of_action" "tid" "site_id" [7] "action_type" "direct_interaction" "molecular_mechanism" [10] "disease_efficacy" "mechanism_comment" "selectivity_comment" [13] "binding_site_comment" "variant_id" "target_type" [16] "target_desc" "parent_type" "target_name" [19] "organism" "oc_id" "tax_id" [22] "l1" "l2" "l3"
Count of header types discovered: act: 24, mec: 1, molregno: 1, org: 6, tar: 8
molregno | mec_ids | act_activator | act_agonist | act_allosteric_antagonist | act_antagonist | act_binding_agent | act_blocker | act_cross-linking_agent | act_degrader | ⋯ | tar_protein_nucleic-acid_complex | tar_small_molecule | tar_subcellular | tar_unknown | org_bacteria | org_eukaryotes | org_fungi | org_homo_sapiens | org_rattus_norvegicus | org_viruses | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | |
1 | 115 | 1044 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 146 | 1899 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
3 | 147 | 1905 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 154 | 5729 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 | 173 | 1192 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
6 | 241 | 1901 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
The Anatomical Therapeutic Chemical is a classification system for drugs based on the body part they target as well as a classification into families of the active ingredients.
It consists of a hierarchy of 5 levels, out of which we only use the first 4, which we also simplify and compact (for better presentation).
The result is stored on the drug_atc
table of the output sqlite3 database file.
df.drug.atc.raw <- dbFetchQuery(con, "SELECT
mac.molregno,ac.* FROM
molecule_dictionary as md join
molecule_atc_classification as mac on md.molregno = mac.molregno join
atc_classification as ac on mac.level5 = ac.level5
where md.molecule_type='Small molecule' and
md.molregno in (select molregno from local.selected)
order by md.molregno
limit 10000")
message('Loaded ',df.drug.atc.raw %>% nrow,' ATC entries')
df.drug.atc.raw %>% unique %>% head(5)
Loaded 3477 ATC entries
molregno | who_name | level1 | level2 | level3 | level4 | level5 | level1_description | level2_description | level3_description | level4_description | |
---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | |
1 | 97 | prazosin | C | C02 | C02C | C02CA | C02CA01 | CARDIOVASCULAR SYSTEM | ANTIHYPERTENSIVES | ANTIADRENERGIC AGENTS, PERIPHERALLY ACTING | Alpha-adrenoreceptor antagonists |
2 | 115 | nicotine | N | N07 | N07B | N07BA | N07BA01 | NERVOUS SYSTEM | OTHER NERVOUS SYSTEM DRUGS | DRUGS USED IN ADDICTIVE DISORDERS | Drugs used in nicotine dependence |
3 | 146 | ofloxacin | J | J01 | J01M | J01MA | J01MA01 | ANTIINFECTIVES FOR SYSTEMIC USE | ANTIBACTERIALS FOR SYSTEMIC USE | QUINOLONE ANTIBACTERIALS | Fluoroquinolones |
4 | 146 | ofloxacin | S | S01 | S01A | S01AE | S01AE01 | SENSORY ORGANS | OPHTHALMOLOGICALS | ANTIINFECTIVES | Fluoroquinolones |
5 | 146 | ofloxacin | S | S02 | S02A | S02AA | S02AA16 | SENSORY ORGANS | OTOLOGICALS | ANTIINFECTIVES | Antiinfectives |
# Cleans and shortens the names of the ATC categories, for succinctness and simplicity.
clean_atc <- function(x) {
x %>%
gsub('(DRUGS|TOPICAL PRODUCTS|TREATMENT|PREPARATIONS) (OF|FOR) |DRUGS USED IN ','', ., ignore.case=T) %>%
gsub('^(.*) FOR ([^ ]+) USE','\\2 \\1',., ignore.case=T) %>%
gsub('FOR TREATMENT OF ','',., ignore.case=T) %>%
gsub('\\([^)]+\\)','',.) %>%
gsub('MODULATORS OF THE ([^ ]+)','\\1 MODULATORS',., ignore.case=T) %>%
gsub('INCL.',',',.) %>%
gsub('EXCL.','excl',.) %>%
gsubfn::gsubfn('[^, ]+',list('DISORDERS'='','THERAPEUTIC'='','DRUGS'='','PREPARATIONS'='','AGENTS'='','AND'=',','PRODUCTS'='', 'THERAPY'='','ETC'='','SYSTEM'='','SUPPLEMENTS'=''),.) %>%
gsub('[- ,./]+','_',.) %>%
gsub('(^_)|(_$)','',.) %>%
gsub('_+','_',.)
}
df.atc.cln <- df.drug.atc.raw %>% select(level4, contains('description')) %>% unique %>% mutate_at(vars(contains('description')), clean_atc)
df.atc.cln %>% head
Warning message: “no DISPLAY variable so Tk is not available”
level4 | level1_description | level2_description | level3_description | level4_description | |
---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | |
1 | C02CA | CARDIOVASCULAR | ANTIHYPERTENSIVES | ANTIADRENERGIC_PERIPHERALLY_ACTING | Alpha_adrenoreceptor_antagonists |
2 | N07BA | NERVOUS | OTHER_NERVOUS | ADDICTIVE | nicotine_dependence |
3 | J01MA | SYSTEMIC_ANTIINFECTIVES | SYSTEMIC_ANTIBACTERIALS | QUINOLONE_ANTIBACTERIALS | Fluoroquinolones |
4 | S01AE | SENSORY_ORGANS | OPHTHALMOLOGICALS | ANTIINFECTIVES | Fluoroquinolones |
5 | S02AA | SENSORY_ORGANS | OTOLOGICALS | ANTIINFECTIVES | Antiinfectives |
6 | J01MB | SYSTEMIC_ANTIINFECTIVES | SYSTEMIC_ANTIBACTERIALS | QUINOLONE_ANTIBACTERIALS | Other_quinolones |
df.drug.atc <- df.drug.atc.raw %>% select(molregno, level4) %>% merge(df.atc.cln, by='level4') %>% mdcast(id.vars='molregno', expand=c('level1_description','level2_description','level3_description'), prefixes=c('L1_','L2_','L3_'),binarise=F)
message('Count of ATC descriptions per level: ', df.drug.atc %>% select(-molregno) %>% print.head.count)
message('Count of described drugs: ',df.drug.atc %>% nrow)
df.drug.atc %>% head
dbWriteOutputTable(con, table='drug_atc', df.drug.atc,index='molregno', drop=T)
Count of ATC descriptions per level: L1: 14, L2: 85, L3: 198 Count of described drugs: 2525
molregno | L1_ALIMENTARY_TRACT_METABOLISM | L1_ANTINEOPLASTIC_IMMUNOMODULATING | L1_ANTIPARASITIC_INSECTICIDES_REPELLENTS | L1_BLOOD_BLOOD_FORMING_ORGANS | L1_CARDIOVASCULAR | L1_DERMATOLOGICALS | L1_GENITO_URINARY_SEX_HORMONES | L1_MUSCULO_SKELETAL | L1_NERVOUS | ⋯ | L3_TUMOUR_DETECTION | L3_ULTRASOUND_CONTRAST_MEDIA | L3_UROLOGICALS | L3_UTEROTONICS | L3_VASODILATORS_USED_IN_CARDIAC_DISEASES | L3_VITAMIN_A_D_COMBINATIONS_OF_THE_TWO | L3_VITAMIN_B1_PLAIN_IN_COMBINATION_WITH_VITAMIN_B6_B12 | L3_VITAMIN_B12_FOLIC_ACID | L3_VITAMIN_K_OTHER_HEMOSTATICS | L3_X_RAY_CONTRAST_MEDIA_IODINATED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | |
1 | 97 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 115 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 146 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 147 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 173 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 3 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 194 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
The Experimental Factor Ontology (EFO) is an ontological categorisation of molecules of biological interest, and therefore assigns varying number of tags in varying depth levels of the selected drugs.
Similar to the ATC, we simplify and compactify a selection of the data.
To parse this information the EFO ontology must be downloaded and available in the file.efo
variable, and the ontologyIndex
R package must be installed. You will also need the popular tidyr
package.
The result is stored on the drug_efo
table of the output sqlite3 database file.
df.di.raw <- dbFetchQuery(con, "SELECT
di.* FROM
drug_indication as di join
molecule_dictionary as md on md.molregno = di.molregno
where md.molregno in (select molregno from local.selected)
order by md.molregno
limit 50000")
mesh_ids <- df.di.raw$mesh_id %>% unique
df.di.raw %>% nrow
df.di.raw %>% unique %>% head(5)
drugind_id | record_id | molregno | max_phase_for_ind | mesh_id | mesh_heading | efo_id | efo_term | |
---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | <chr> | |
1 | 69285 | 2791601 | 97 | 4 | D006973 | Hypertension | EFO:0000537 | hypertension |
2 | 69286 | 2791601 | 97 | 3 | D013313 | Stress Disorders, Post-Traumatic | EFO:0001358 | post-traumatic stress disorder |
3 | 69287 | 2791601 | 97 | 2 | D014029 | Tobacco Use Disorder | EFO:0003768 | nicotine dependence |
4 | 69288 | 2791601 | 97 | 3 | D000437 | Alcoholism | EFO:0003829 | alcohol dependence |
5 | 69289 | 2791601 | 97 | 3 | D007669 | Kidney Calculi | EFO:0003845 | kidney stone |
library(purrr)
get_oi_name <- function(names, oi) {
names %>% map(~tryCatch(ontologyIndex::get_term_property(oi, 'name', .), error=function(e)NA)) %>% as.character
}
oi.efo <- ontologyIndex::get_ontology(file.efo)
oi.efo
efo.ids <- df.di.raw$efo_id %>% unique
efo.ids <- efo.ids[-which(is.na(efo.ids))]
efo.ancest.all <- ontologyIndex::get_ancestors(oi.efo, efo.ids)
dt.name2id <- data.table::data.table(id=efo.ancest.all) %>% mutate(name=id %>% get_oi_name(oi.efo)) %>% (data.table::setkey)('name')
efo.allowed.roots <- dt.name2id[c('disease','Phenotypic abnormality','behavior','pathologic process')]$id
Ontology with 26989 terms format-version: 1.2 data-version: http://www.ebi.ac.uk/efo/releases/v3.29.1/efo.owl ontology: http://www.ebi.ac.uk/efo/efo.owl Properties: id: character name: character parents: list children: list ancestors: list obsolete: logical Roots: EFO:0000001 - experimental factor EFO:0000824 - relationship RO:0000053 - bearer_of RO:0000057 - has_participant has_part - has part RO:0000056 - participates_in RO:0002502 - depends on disease_has_feature - disease has feature located_in - located_in location_of - location_of ... 42 more
efo.subset <- function(efo_ids, mesh_headings) {
map2(efo_ids, mesh_headings, function(efo_id, mesh_heading){
if(is.na(efo_id)) {
mesh_heading
} else {
efo.ancest <- ontologyIndex::get_ancestors(oi.efo, efo_id)
efo.ancest.allowed <- ontologyIndex::intersection_with_descendants(oi.efo, roots=efo.allowed.roots, terms=efo.ancest)
if(efo.ancest.allowed[1] %in% efo.allowed.roots) {
efo_id <- rev(efo.ancest.allowed[-1])
is_efo <- grepl('^EFO:',efo_id)
if(any(is_efo)) {
efo_id <- efo_id[is_efo]
}
efo_names <- get_oi_name(efo_id, oi.efo)
if('pain' %in% efo_names) {
efo_names <- mesh_heading
}
if(length(efo_names)>1) {
efo_names <- efo_names[-1]
}
efo_terms <- rev(efo_names)[1:min(length(efo_names),4)]
} else {
efo_terms <- 'other'
}
efo_terms
}
})
}
df.di.efo <- df.di.raw %>% mutate(ok = efo.subset(efo_id,mesh_heading))
df.di.efo %>% head(10)
drugind_id | record_id | molregno | max_phase_for_ind | mesh_id | mesh_heading | efo_id | efo_term | ok | |
---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <chr> | <chr> | <chr> | <chr> | <list> | |
1 | 69285 | 2791601 | 97 | 4 | D006973 | Hypertension | EFO:0000537 | hypertension | cardiovascular disease, vascular disease |
2 | 69286 | 2791601 | 97 | 3 | D013313 | Stress Disorders, Post-Traumatic | EFO:0001358 | post-traumatic stress disorder | nervous system disease , central nervous system disease, brain disease , mental or behavioural disorder |
3 | 69287 | 2791601 | 97 | 2 | D014029 | Tobacco Use Disorder | EFO:0003768 | nicotine dependence | nervous system disease , central nervous system disease, brain disease , mental or behavioural disorder |
4 | 69288 | 2791601 | 97 | 3 | D000437 | Alcoholism | EFO:0003829 | alcohol dependence | nervous system disease , central nervous system disease, brain disease , mental or behavioural disorder |
5 | 69289 | 2791601 | 97 | 3 | D007669 | Kidney Calculi | EFO:0003845 | kidney stone | other |
6 | 69290 | 2791601 | 97 | 2 | D019969 | Amphetamine-Related Disorders | EFO:0004701 | methamphetamine dependence | nervous system disease , central nervous system disease, brain disease , mental or behavioural disorder |
7 | 69292 | 2791601 | 97 | 1 | D001750 | Urinary Bladder, Neurogenic | HP:0000011 | Neurogenic bladder | urinary system disease |
8 | 112027 | 2791601 | 97 | 1 | D001007 | Anxiety | EFO:0005230 | anxiety | anxiety |
9 | 23567 | 1343518 | 115 | 4 | D014029 | Tobacco Use Disorder | EFO:0003768 | nicotine dependence | nervous system disease , central nervous system disease, brain disease , mental or behavioural disorder |
10 | 23570 | 1343518 | 115 | 3 | D016540 | Smoking Cessation | EFO:0004319 | smoking cessation | smoking cessation |
message('Unique EFO tags: ', do.call(c,df.di.efo$ok) %>% unique %>% length)
message('Simplified/compacted/agglomerated (surviving) EFO tags: ', df.di.efo$ok %>% unique %>% length)
Unique EFO tags: 488 Simplified/compacted/agglomerated (surviving) EFO tags: 777
df.drug.efo <- df.di.efo %>% select(molregno, ok) %>% tidyr::unnest_longer(ok) %>% mutate(var=tolower(gsub(' ','_', ok))) %>%
mdcast(id.vars='molregno',expand='var', prefixes='efo_')
df.drug.efo %>% head
dbWriteOutputTable(con, table='drug_efo', df.drug.efo,index='molregno', drop=T)
molregno | efo_abdominal_symptom | efo_abnormal_abdomen_morphology | efo_abnormal_aortic_morphology | efo_abnormal_biliary_tract_morphology | efo_abnormal_bleeding | efo_abnormal_blood_glucose_concentration | efo_abnormal_blood_ion_concentration | efo_abnormal_blood_phosphate_concentration | efo_abnormal_blood_sodium_concentration | ⋯ | efo_ventral_hernia | efo_vertebral_joint_disease | efo_vesiculobullous_skin_disease | efo_viral_disease | efo_visual_impairment | efo_vitamin_deficiency | efo_weight_loss | efo_xerostomia | efo_yersinia_infectious_disease | efo_yersinia_pestis_infectious_disease | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | ⋯ | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | |
1 | 97 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 115 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 146 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 147 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 148 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 154 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ⋯ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
These are not used in this project, but provided for a future extension.
Feel free to skip this section.
The result is stored on the drug_chemprops
table of the output sqlite3 database file.
df.comp.raw <- dbFetchQuery(con, "SELECT
cp.* FROM
compound_properties as cp join
molecule_dictionary as md on md.molregno = cp.molregno
where md.molregno in (select molregno from local.selected)
order by md.molregno
limit 30000")
df.comp.raw %>% nrow
df.comp.raw %>% colnames
df.comp.raw %>% as.list %>% sapply(function(x) x %>% unique%>% length) %>% as.data.frame
df.comp.raw %>% unique %>% head(10)
dbWriteOutputTable(con, table='drug_chemprops', df.comp.raw,index='molregno', drop=T)
. | |
---|---|
<int> | |
molregno | 5932 |
mw_freebase | 3958 |
alogp | 1039 |
hba | 23 |
hbd | 17 |
psa | 2419 |
rtb | 40 |
ro3_pass | 3 |
num_ro5_violations | 6 |
cx_most_apka | 1087 |
cx_most_bpka | 988 |
cx_logp | 1135 |
cx_logd | 1225 |
molecular_species | 5 |
full_mwt | 4773 |
aromatic_rings | 10 |
heavy_atoms | 72 |
qed_weighted | 94 |
mw_monoisotopic | 4442 |
full_molformula | 5352 |
hba_lipinski | 28 |
hbd_lipinski | 21 |
num_lipinski_ro5_violations | 6 |
molregno | mw_freebase | alogp | hba | hbd | psa | rtb | ro3_pass | num_ro5_violations | cx_most_apka | ⋯ | molecular_species | full_mwt | aromatic_rings | heavy_atoms | qed_weighted | mw_monoisotopic | full_molformula | hba_lipinski | hbd_lipinski | num_lipinski_ro5_violations | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <dbl> | <dbl> | <int> | <int> | <dbl> | <int> | <chr> | <int> | <dbl> | ⋯ | <chr> | <dbl> | <int> | <int> | <dbl> | <dbl> | <chr> | <int> | <int> | <int> | |
1 | 97 | 383.41 | 1.78 | 8 | 1 | 106.95 | 4 | N | 0 | NA | ⋯ | NEUTRAL | 383.41 | 3 | 28 | 0.73 | 383.1594 | C19H21N5O4 | 9 | 2 | 0 |
2 | 115 | 162.24 | 1.85 | 2 | 0 | 16.13 | 1 | Y | 0 | NA | ⋯ | BASE | 162.24 | 1 | 12 | 0.63 | 162.1157 | C10H14N2 | 2 | 0 | 0 |
3 | 146 | 361.37 | 1.54 | 6 | 1 | 75.01 | 2 | N | 0 | 5.29 | ⋯ | ACID | 361.37 | 2 | 26 | 0.87 | 361.1438 | C18H20FN3O4 | 7 | 1 | 0 |
4 | 147 | 232.24 | 1.42 | 4 | 1 | 72.19 | 2 | N | 0 | 4.37 | ⋯ | ACID | 232.24 | 2 | 17 | 0.85 | 232.0848 | C12H12N2O3 | 5 | 1 | 0 |
5 | 148 | 302.19 | 1.31 | 8 | 4 | 141.34 | 0 | N | 0 | 5.54 | ⋯ | ACID | 302.19 | 4 | 22 | 0.22 | 302.0063 | C14H6O8 | 8 | 4 | 0 |
6 | 154 | 666.79 | 7.22 | 6 | 0 | 58.44 | 13 | N | 2 | NA | ⋯ | NEUTRAL | 666.79 | 4 | 47 | 0.09 | 666.2652 | C36H38F4N4O2S | 6 | 0 | 2 |
7 | 173 | 357.79 | 3.93 | 4 | 1 | 68.53 | 4 | N | 0 | 3.79 | ⋯ | ACID | 357.79 | 3 | 25 | 0.77 | 357.0768 | C19H16ClNO4 | 5 | 1 | 0 |
8 | 194 | 233.25 | -0.79 | 4 | 1 | 91.75 | 1 | N | 0 | 3.09 | ⋯ | ACID | 233.25 | 0 | 15 | 0.60 | 233.0358 | C8H11NO5S | 6 | 1 | 0 |
9 | 207 | 300.30 | -1.52 | 7 | 1 | 122.46 | 3 | N | 0 | 2.86 | ⋯ | ACID | 300.30 | 1 | 20 | 0.67 | 300.0528 | C10H12N4O5S | 9 | 1 | 0 |
10 | 210 | 369.34 | 1.70 | 5 | 1 | 65.78 | 4 | N | 0 | 5.32 | ⋯ | ACID | 369.34 | 2 | 26 | 0.89 | 369.1300 | C17H18F3N3O3 | 6 | 1 | 0 |
These can be used to manually compute the similarities of the drugs, for example in a SOAP kernel. In this work we use pre-computed similarities through the PubChem interface (see Cross-reference with PubChem )
Feel free to skip this section.
The result is stored on the drug_dtructs
table of the output sqlite3 database file.
df.mols.raw <- dbFetchQuery(con, "SELECT
cs.molregno,cs.canonical_smiles,cs.molfile FROM
compound_structures as cs join
molecule_dictionary as md on md.molregno = cs.molregno
where md.molregno in (select molregno from local.selected)
order by md.molregno
limit 30000")
df.mols.raw %>% nrow
df.mols.raw %>% head(1)
dbWriteOutputTable(con, table='drug_structs', df.mols.raw,index='molregno', drop=T)
molregno | canonical_smiles | molfile | |
---|---|---|---|
<int> | <chr> | <chr> | |
1 | 97 | COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC | RDKit 2D 28 31 0 0 0 0 0 0 0 0999 V2000 0.9375 -1.9792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.9292 -2.6917 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 -0.3208 -2.6750 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.2875 -3.0417 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.3042 -1.6167 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 -0.3208 -1.9625 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 3.3917 -0.5917 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 1.5542 -1.6417 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 2.7792 -0.9542 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 -0.9500 -3.0167 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 4.0125 -0.9292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -0.9250 -1.5917 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -1.5833 -2.6542 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -1.5708 -1.9417 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 4.2417 -1.5917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 1.5417 -0.9500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 2.1667 -2.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 2.1667 -0.5792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 2.7792 -1.6375 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 3.4042 0.0875 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 4.5792 -0.5125 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 4.9417 -1.5917 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 5.1542 -0.9292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 0.2667 -3.7542 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0 -2.2000 -2.9917 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 -2.1833 -1.5792 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0 -2.7958 -1.9125 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 -2.7958 -2.6542 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 2 1 2 0 3 6 1 0 4 2 1 0 5 1 1 0 6 5 2 0 7 9 1 0 8 1 1 0 9 19 1 0 10 3 1 0 11 7 1 0 12 6 1 0 13 14 1 0 14 12 2 0 15 11 1 0 16 8 1 0 17 8 1 0 18 16 1 0 19 17 1 0 20 7 2 0 21 11 2 0 22 15 1 0 23 21 1 0 24 4 1 0 25 13 1 0 26 14 1 0 27 26 1 0 28 25 1 0 3 4 2 0 9 18 1 0 10 13 2 0 22 23 2 0 M END |