Importing Required Libraries¶
First, we are importing the libraries and modules that are required for running this notebook.
In [1]:
Copied!
#import general packages
import os
import argparse
import logging
import sys
from typing import Dict, List
from dpu_utils.utils.richpath import RichPath
import pickle
import torch
from tqdm.notebook import tqdm
import pandas as pd
# Setting up local details:
# This should be the location of the checkout of the THEMAP repository:
repo_path = os.path.dirname(os.path.abspath(""))
CHECKOUT_PATH = repo_path
DATASET_PATH = os.path.join(repo_path, "datasets")
os.chdir(CHECKOUT_PATH)
sys.path.insert(0, CHECKOUT_PATH)
#import general packages
import os
import argparse
import logging
import sys
from typing import Dict, List
from dpu_utils.utils.richpath import RichPath
import pickle
import torch
from tqdm.notebook import tqdm
import pandas as pd
# Setting up local details:
# This should be the location of the checkout of the THEMAP repository:
repo_path = os.path.dirname(os.path.abspath(""))
CHECKOUT_PATH = repo_path
DATASET_PATH = os.path.join(repo_path, "datasets")
os.chdir(CHECKOUT_PATH)
sys.path.insert(0, CHECKOUT_PATH)
In [2]:
Copied!
from third_party.otdd.otdd.pytorch.datasets import MolDataset, load_molecule_data
from third_party.otdd.otdd.pytorch.distance import DatasetDistance
from third_party.otdd.otdd.pytorch.datasets import MolDataset, load_molecule_data
from third_party.otdd.otdd.pytorch.distance import DatasetDistance
ot.gpu not found - coupling computation will be in cpu
In [3]:
Copied!
# import visualization packages
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import ipywidgets as widgets
import seaborn as sns
from sklearn.manifold import TSNE
import pandas as pd
from themap.utils import compute_task_hardness_from_distance_matrix, normalize, internal_hardness, otdd_hardness
from themap.data import MoleculeDatapoint, MoleculeDataset, ProteinDataset
light_color = plt.get_cmap("plasma").colors[170]
dark_color = "black"
matplotlib.rcParams.update(
{
"pgf.texsystem": "pdflatex",
"font.family": "serif",
"font.serif": "Computer Modern Roman",
"font.size": 20,
"text.usetex": True,
"pgf.rcfonts": False,
}
)
# import visualization packages
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import ipywidgets as widgets
import seaborn as sns
from sklearn.manifold import TSNE
import pandas as pd
from themap.utils import compute_task_hardness_from_distance_matrix, normalize, internal_hardness, otdd_hardness
from themap.data import MoleculeDatapoint, MoleculeDataset, ProteinDataset
light_color = plt.get_cmap("plasma").colors[170]
dark_color = "black"
matplotlib.rcParams.update(
{
"pgf.texsystem": "pdflatex",
"font.family": "serif",
"font.serif": "Computer Modern Roman",
"font.size": 20,
"text.usetex": True,
"pgf.rcfonts": False,
}
)
Create source and target datasets (Data)¶
In [4]:
Copied!
source_dataset_path = RichPath.create(os.path.join(DATASET_PATH, "train", "CHEMBL1023359.jsonl.gz"))
target_dataset_path = RichPath.create(os.path.join(DATASET_PATH, "test", "CHEMBL2219358.jsonl.gz"))
source_dataset = MoleculeDataset.load_from_file(source_dataset_path)
target_dataset = MoleculeDataset.load_from_file(target_dataset_path)
source_dataset_path = RichPath.create(os.path.join(DATASET_PATH, "train", "CHEMBL1023359.jsonl.gz"))
target_dataset_path = RichPath.create(os.path.join(DATASET_PATH, "test", "CHEMBL2219358.jsonl.gz"))
source_dataset = MoleculeDataset.load_from_file(source_dataset_path)
target_dataset = MoleculeDataset.load_from_file(target_dataset_path)
In [5]:
Copied!
Molecule_Feaurizer = widgets.Dropdown(
options=['gin_supervised_infomax', 'gin_supervised_masking', 'gin_supervised_edgepred'],
value='gin_supervised_infomax',
description='Molecule Featurizer:',
disabled=False,
)
Molecule_Feaurizer = widgets.Dropdown(
options=['gin_supervised_infomax', 'gin_supervised_masking', 'gin_supervised_edgepred'],
value='gin_supervised_infomax',
description='Molecule Featurizer:',
disabled=False,
)
In [6]:
Copied!
Molecule_Feaurizer
Molecule_Feaurizer
Out[6]:
Dropdown(description='Molecule Featurizer:', options=('gin_supervised_infomax', 'gin_supervised_masking', 'gin…
In [7]:
Copied!
## compute and load the embeddings
molecule_feaurizer = Molecule_Feaurizer.value
source_features = source_dataset.get_dataset_embedding(molecule_feaurizer)
target_features = target_dataset.get_dataset_embedding(molecule_feaurizer)
assert source_features.shape[1] == target_features.shape[1]
## compute and load the embeddings
molecule_feaurizer = Molecule_Feaurizer.value
source_features = source_dataset.get_dataset_embedding(molecule_feaurizer)
target_features = target_dataset.get_dataset_embedding(molecule_feaurizer)
assert source_features.shape[1] == target_features.shape[1]
In [8]:
Copied!
source_dataset_otdd = MolDataset(source_dataset)
target_dataset_otdd = MolDataset(target_dataset)
source_dataset_otdd = MolDataset(source_dataset)
target_dataset_otdd = MolDataset(target_dataset)
In [9]:
Copied!
source_dataset_loader = load_molecule_data(source_dataset)
target_dataset_loader = load_molecule_data(target_dataset)
source_dataset_loader = load_molecule_data(source_dataset)
target_dataset_loader = load_molecule_data(target_dataset)
In [ ]:
Copied!
# Instantiate distance
dist = DatasetDistance(source_dataset_loader, target_dataset_loader,
inner_ot_method = 'exact',
debiased_loss = True,
p = 2, entreg = 1e-1,
device='cuda' if torch.cuda.is_available() else 'cpu')
d = dist.distance(maxsamples = 1000)
print(f'OTDD(src,tgt)={d}')
# Instantiate distance
dist = DatasetDistance(source_dataset_loader, target_dataset_loader,
inner_ot_method = 'exact',
debiased_loss = True,
p = 2, entreg = 1e-1,
device='cuda' if torch.cuda.is_available() else 'cpu')
d = dist.distance(maxsamples = 1000)
print(f'OTDD(src,tgt)={d}')
Calculate chemcial distance between target datasets with all the source datasets¶
In [ ]:
Copied!
import glob
source_datasets_path = glob.glob(os.path.join(DATASET_PATH, "train", "CHEMBL*"))
target_datasets_path = glob.glob(os.path.join(DATASET_PATH, "test", "CHEMBL*"))
chem_distances = {}
for target_path in tqdm(target_datasets_path):
chem_distance = {}
target_dataset_path = RichPath.create(target_path)
target_dataset = MoleculeDataset.load_from_file(target_dataset_path)
target_features = target_dataset.get_dataset_embedding(molecule_feaurizer)
target_dataset_otdd = MolDataset(target_dataset)
target_dataset_loader = load_molecule_data(target_dataset)
for source_path in source_datasets_path:
source_dataset_path = RichPath.create(source_path)
source_dataset = MoleculeDataset.load_from_file(source_dataset_path)
source_features = source_dataset.get_dataset_embedding(molecule_feaurizer)
source_dataset_otdd = MolDataset(source_dataset)
source_dataset_loader = load_molecule_data(source_dataset)
dist = DatasetDistance(source_dataset_loader, target_dataset_loader,
inner_ot_method = 'exact',
debiased_loss = True,
p = 2, entreg = 1e-1,
device='cuda' if torch.cuda.is_available() else 'cpu')
d = dist.distance(maxsamples = 1000)
print(f'OTDD({source_dataset.task_id},{target_dataset.task_id})= {d}')
chem_distance[source_dataset.task_id] = d.cpu().item()
chem_distances[target_dataset.task_id] = chem_distance
import glob
source_datasets_path = glob.glob(os.path.join(DATASET_PATH, "train", "CHEMBL*"))
target_datasets_path = glob.glob(os.path.join(DATASET_PATH, "test", "CHEMBL*"))
chem_distances = {}
for target_path in tqdm(target_datasets_path):
chem_distance = {}
target_dataset_path = RichPath.create(target_path)
target_dataset = MoleculeDataset.load_from_file(target_dataset_path)
target_features = target_dataset.get_dataset_embedding(molecule_feaurizer)
target_dataset_otdd = MolDataset(target_dataset)
target_dataset_loader = load_molecule_data(target_dataset)
for source_path in source_datasets_path:
source_dataset_path = RichPath.create(source_path)
source_dataset = MoleculeDataset.load_from_file(source_dataset_path)
source_features = source_dataset.get_dataset_embedding(molecule_feaurizer)
source_dataset_otdd = MolDataset(source_dataset)
source_dataset_loader = load_molecule_data(source_dataset)
dist = DatasetDistance(source_dataset_loader, target_dataset_loader,
inner_ot_method = 'exact',
debiased_loss = True,
p = 2, entreg = 1e-1,
device='cuda' if torch.cuda.is_available() else 'cpu')
d = dist.distance(maxsamples = 1000)
print(f'OTDD({source_dataset.task_id},{target_dataset.task_id})= {d}')
chem_distance[source_dataset.task_id] = d.cpu().item()
chem_distances[target_dataset.task_id] = chem_distance
In [12]:
Copied!
## Choose your target from chem_distances.keys()
your_tasks = "CHEMBL2219236"
chem_dist = chem_distances[your_tasks]
fig = plt.figure(figsize=(12, 5))
plt.bar(chem_dist.keys(), chem_dist.values())
plt.xlabel('Source datasets')
plt.ylabel('OTDD')
plt.title(f'OTDD between source datasets and target {your_tasks}')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=1.0)
plt.show()
## Choose your target from chem_distances.keys()
your_tasks = "CHEMBL2219236"
chem_dist = chem_distances[your_tasks]
fig = plt.figure(figsize=(12, 5))
plt.bar(chem_dist.keys(), chem_dist.values())
plt.xlabel('Source datasets')
plt.ylabel('OTDD')
plt.title(f'OTDD between source datasets and target {your_tasks}')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=1.0)
plt.show()
In [15]:
Copied!
chem_distances.keys()
chem_distances.keys()
Out[15]:
dict_keys(['CHEMBL2219236', 'CHEMBL1963831', 'CHEMBL2219358'])
Calculate protein distance between target datasets with all the source datasets¶
In [13]:
Copied!
Protein_Feaurizer = widgets.Dropdown(
options=['esm2_t6_8M_UR50D', 'esm2_t12_35M_UR50D', 'esm2_t30_150M_UR50D', 'esm2_t33_650M_UR50D', 'esm2_t36_3B_UR50D'],
value='esm2_t33_650M_UR50D',
description='Protein Featurizer:',
disabled=False,
)
Protein_Feaurizer = widgets.Dropdown(
options=['esm2_t6_8M_UR50D', 'esm2_t12_35M_UR50D', 'esm2_t30_150M_UR50D', 'esm2_t33_650M_UR50D', 'esm2_t36_3B_UR50D'],
value='esm2_t33_650M_UR50D',
description='Protein Featurizer:',
disabled=False,
)
In [14]:
Copied!
Protein_Feaurizer
Protein_Feaurizer
Out[14]:
Dropdown(description='Protein Featurizer:', index=3, options=('esm2_t6_8M_UR50D', 'esm2_t12_35M_UR50D', 'esm2_…
In [15]:
Copied!
source_protein = ProteinDataset.load_from_file("datasets/train/train_proteins.fasta")
target_protein = ProteinDataset.load_from_file("datasets/test/test_proteins.fasta")
source_protein = ProteinDataset.load_from_file("datasets/train/train_proteins.fasta")
target_protein = ProteinDataset.load_from_file("datasets/test/test_proteins.fasta")
In [16]:
Copied!
protein_featurizer = Protein_Feaurizer.value
source_protein_features = source_protein.get_features(protein_featurizer)
target_protein_features = target_protein.get_features(protein_featurizer)
protein_featurizer = Protein_Feaurizer.value
source_protein_features = source_protein.get_features(protein_featurizer)
target_protein_features = target_protein.get_features(protein_featurizer)
In [17]:
Copied!
from scipy.spatial.distance import cdist
dist = cdist(source_protein.features, target_protein.features)
from scipy.spatial.distance import cdist
dist = cdist(source_protein.features, target_protein.features)
In [18]:
Copied!
prot_distances = {}
for i, target_prot in enumerate(target_protein.task_id):
prot_distance = {}
for j, source_prot in enumerate(source_protein.task_id):
prot_distance[source_prot] = dist[j, i]
prot_distances[target_prot] = prot_distance
prot_distances = {}
for i, target_prot in enumerate(target_protein.task_id):
prot_distance = {}
for j, source_prot in enumerate(source_protein.task_id):
prot_distance[source_prot] = dist[j, i]
prot_distances[target_prot] = prot_distance
In [24]:
Copied!
## Choose your target from chem_distances.keys()
your_tasks = "CHEMBL2219236"
prot_dist = prot_distances[your_tasks]
fig = plt.figure(figsize=(12, 5))
plt.bar(prot_dist.keys(), prot_dist.values())
plt.xlabel('Source datasets')
plt.ylabel('Protein Distance')
plt.title(f'Protein Distance between source datasets and target {your_tasks}')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=1.0)
plt.show()
## Choose your target from chem_distances.keys()
your_tasks = "CHEMBL2219236"
prot_dist = prot_distances[your_tasks]
fig = plt.figure(figsize=(12, 5))
plt.bar(prot_dist.keys(), prot_dist.values())
plt.xlabel('Source datasets')
plt.ylabel('Protein Distance')
plt.title(f'Protein Distance between source datasets and target {your_tasks}')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=1.0)
plt.show()
Combine Two Distances¶
Now, we can answer to the following questions:
- Given a target task, what is the closest source task in terms of chemical and protein distances?
- Given the source tasks, which target task is hardest to transfer to in terms of chemical and protein space?
- Given a target task and source tasks, how to pick the k nearset source tasks fo transfer learning?
So, let's answer to this questions in the following sections.
In [42]:
Copied!
chem_df = pd.DataFrame.from_dict(chem_distances)
prot_df = pd.DataFrame.from_dict(prot_distances)
chem_df = pd.DataFrame.from_dict(chem_distances)
prot_df = pd.DataFrame.from_dict(prot_distances)
In [69]:
Copied!
## Given a target task, what is the closest source task in terms of chemical and protein distances?
your_task = "CHEMBL2219236"
chem_distance = chem_df[your_task]
prot_distance = prot_df[your_task]
normalized_chem_distance=(chem_distance-chem_distance.min())/(chem_distance.max()-chem_distance.min())
normalized_prot_distance=(prot_distance-prot_distance.min())/(prot_distance.max()-prot_distance.min())
normalized_prot_distance = normalized_prot_distance.reindex(normalized_chem_distance.index)
normalized_comb_distance = (normalized_chem_distance + normalized_prot_distance)/2
print(f'Closest source task in terms of chemical distance: {chem_distance.idxmin()}')
print(f'Closest source task in terms of protein distance: {prot_distance.idxmin()}')
print(f'Closest source task in terms of combination of chemical and protein distance: {normalized_comb_distance.idxmin()}')
## Given a target task, what is the closest source task in terms of chemical and protein distances?
your_task = "CHEMBL2219236"
chem_distance = chem_df[your_task]
prot_distance = prot_df[your_task]
normalized_chem_distance=(chem_distance-chem_distance.min())/(chem_distance.max()-chem_distance.min())
normalized_prot_distance=(prot_distance-prot_distance.min())/(prot_distance.max()-prot_distance.min())
normalized_prot_distance = normalized_prot_distance.reindex(normalized_chem_distance.index)
normalized_comb_distance = (normalized_chem_distance + normalized_prot_distance)/2
print(f'Closest source task in terms of chemical distance: {chem_distance.idxmin()}')
print(f'Closest source task in terms of protein distance: {prot_distance.idxmin()}')
print(f'Closest source task in terms of combination of chemical and protein distance: {normalized_comb_distance.idxmin()}')
Closest source task in terms of chemical distance: CHEMBL2218944 Closest source task in terms of protein distance: CHEMBL2219012 Closest source task in terms of combination of chemical and protein distance: CHEMBL2219012
In [91]:
Copied!
## Given the source tasks, which target task is hardest to transfer to in terms of chemical and protein space?
## Answering this question require to define hardness, which we consider here as the average of k-nearest source tasks.
k = 3
target_tasks = chem_df.columns
source_tasks = chem_df.index
hardness_all={}
for target_task in target_tasks:
hardness={}
chem_distance = chem_df[target_task]
prot_distance = prot_df[target_task]
chem_distance=chem_distance.sort_values()
prot_distance = prot_distance.sort_values()
hardness['EXT_CHEM'] = chem_distance[:k].sum()/k
hardness['EXT_PROT'] = prot_distance[:k].sum()/k
hardness_all[target_task] = hardness
hardness_df = pd.DataFrame.from_dict(hardness_all).T
hardness_df['all'] = (hardness_df['EXT_CHEM'] + hardness_df['EXT_PROT'])/2
print(f'Easiest target task in terms of chemical distance: {hardness_df["EXT_CHEM"].idxmin()}')
print(f'Easiest target task in terms of protein distance: {hardness_df["EXT_PROT"].idxmin()}')
print(f'Easiest target task in terms of combination of chemical and protein distance: {hardness_df["all"].idxmin()}')
## Given the source tasks, which target task is hardest to transfer to in terms of chemical and protein space?
## Answering this question require to define hardness, which we consider here as the average of k-nearest source tasks.
k = 3
target_tasks = chem_df.columns
source_tasks = chem_df.index
hardness_all={}
for target_task in target_tasks:
hardness={}
chem_distance = chem_df[target_task]
prot_distance = prot_df[target_task]
chem_distance=chem_distance.sort_values()
prot_distance = prot_distance.sort_values()
hardness['EXT_CHEM'] = chem_distance[:k].sum()/k
hardness['EXT_PROT'] = prot_distance[:k].sum()/k
hardness_all[target_task] = hardness
hardness_df = pd.DataFrame.from_dict(hardness_all).T
hardness_df['all'] = (hardness_df['EXT_CHEM'] + hardness_df['EXT_PROT'])/2
print(f'Easiest target task in terms of chemical distance: {hardness_df["EXT_CHEM"].idxmin()}')
print(f'Easiest target task in terms of protein distance: {hardness_df["EXT_PROT"].idxmin()}')
print(f'Easiest target task in terms of combination of chemical and protein distance: {hardness_df["all"].idxmin()}')
Easiest target task in terms of chemical distance: CHEMBL2219236 Easiest target task in terms of protein distance: CHEMBL2219358 Easiest target task in terms of combination of chemical and protein distance: CHEMBL2219358
In [93]:
Copied!
plt.figure(figsize=(12, 5))
plt.bar(hardness_df.index, hardness_df['all'])
plt.xlabel('Target datasets')
plt.ylabel('Hardness')
plt.title(f'Hardness of target datasets')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=1.0)
plt.show()
plt.figure(figsize=(12, 5))
plt.bar(hardness_df.index, hardness_df['all'])
plt.xlabel('Target datasets')
plt.ylabel('Hardness')
plt.title(f'Hardness of target datasets')
plt.xticks(rotation=90)
plt.grid(axis='y', linestyle='--', alpha=1.0)
plt.show()
In [103]:
Copied!
## Given a target task and source tasks, how to pick the k nearset source tasks fo transfer learning?
## Given the source tasks, which target task is hardest to transfer to in terms of chemical and protein space?
## Answering this question require to define hardness, which we consider here as the average of k-nearest source tasks.
k = 3
target_tasks = chem_df.columns
source_tasks = chem_df.index
closest_tasks={}
for target_task in target_tasks:
closest={}
chem_distance = chem_df[target_task]
prot_distance = prot_df[target_task]
chem_distance=chem_distance.sort_values()
prot_distance = prot_distance.sort_values()
closest['EXT_CHEM'] = chem_distance[:k].index.to_list()
closest['EXT_PROT'] = prot_distance[:k].index.to_list()
closest_tasks[target_task] = closest
closest_df = pd.DataFrame.from_dict(closest_tasks).T
## Given a target task and source tasks, how to pick the k nearset source tasks fo transfer learning?
## Given the source tasks, which target task is hardest to transfer to in terms of chemical and protein space?
## Answering this question require to define hardness, which we consider here as the average of k-nearest source tasks.
k = 3
target_tasks = chem_df.columns
source_tasks = chem_df.index
closest_tasks={}
for target_task in target_tasks:
closest={}
chem_distance = chem_df[target_task]
prot_distance = prot_df[target_task]
chem_distance=chem_distance.sort_values()
prot_distance = prot_distance.sort_values()
closest['EXT_CHEM'] = chem_distance[:k].index.to_list()
closest['EXT_PROT'] = prot_distance[:k].index.to_list()
closest_tasks[target_task] = closest
closest_df = pd.DataFrame.from_dict(closest_tasks).T