Examples
This section provides practical, runnable examples for common THEMAP use cases. Each example includes complete code, sample data, and explanations.
Quick Examples
1. Basic Molecular Distance
"""
Compute distance between two molecular datasets using OTDD.
"""
from themap.data import MoleculeDataset
from themap.distance import MoleculeDatasetDistance
from dpu_utils.utils.richpath import RichPath
# Read the source dataset
source_path = RichPath.create("datasets/train/CHEMBL1023359.jsonl.gz")
source = MoleculeDataset.load_from_file(source_path)

# Read the target dataset
target_path = RichPath.create("datasets/test/CHEMBL2219358.jsonl.gz")
target = MoleculeDataset.load_from_file(target_path)

# Build an OTDD calculator without a Tasks object and attach the single
# source/target pair (datasets first, then their task ids) by hand.
distance_calc = MoleculeDatasetDistance(tasks=None, molecule_method="otdd")
distance_calc.source_molecule_datasets, distance_calc.target_molecule_datasets = (
    [source],
    [target],
)
distance_calc.source_task_ids, distance_calc.target_task_ids = (
    [source.task_id],
    [target.task_id],
)

# Run the computation and show whatever get_distance() returns
result = distance_calc.get_distance()
print(f"OTDD distance: {result}")
2. Protein Similarity Analysis
"""
Analyze protein similarity using sequence embeddings.
"""
from themap.data import ProteinDataset
from themap.distance import ProteinDatasetDistance
# Read the protein sequences from the FASTA file
fasta_path = "datasets/train/train_proteins.fasta"
proteins = ProteinDataset.load_from_file(fasta_path)

# Euclidean calculator, created without a Tasks object
prot_distance = ProteinDatasetDistance(tasks=None, protein_method="euclidean")

# Self-comparison: the same dataset is used as source and as target
prot_distance.source_protein_datasets = prot_distance.target_protein_datasets = proteins

distances = prot_distance.get_distance()

# One output line per outer key of the returned mapping
print("Protein distance matrix:")
for target_id, source_distances in distances.items():
    print(f"{target_id}: {source_distances}")
3. Unified Task System
"""
Work with the unified task system for multi-modal analysis.
"""
from themap.data.tasks import Tasks
from themap.distance import TaskDistance
# Load molecule and protein data for the tasks named in the task list file
loader_options = {
    "directory": "datasets/",
    "task_list_file": "datasets/sample_tasks_list.json",
    "load_molecules": True,
    "load_proteins": True,
    "cache_dir": "cache/",
}
tasks = Tasks.from_directory(**loader_options)

# One calculator covering both modalities
task_distance = TaskDistance(
    tasks=tasks,
    molecule_method="cosine",
    protein_method="euclidean",
)

# Molecule, protein and combined distances in a single call;
# the combined value is requested as a 0.7 / 0.3 weighted average.
combination_options = {
    "combination_strategy": "weighted_average",
    "molecule_weight": 0.7,
    "protein_weight": 0.3,
}
all_distances = task_distance.compute_all_distances(**combination_options)

# Report how many entries each distance type holds
print("Distance computation complete:")
for label, key in (
    ("Molecule", "molecule"),
    ("Protein", "protein"),
    ("Combined", "combined"),
):
    print(f"{label} distances: {len(all_distances[key])} tasks")
Complete Workflow Examples
Transfer Learning Pipeline
"""
Complete transfer learning workflow with task hardness estimation.
"""
import numpy as np
import pandas as pd
from themap.data.tasks import Tasks
from themap.distance import TaskDistance
from themap.hardness import TaskHardness
def transfer_learning_pipeline(source_tasks_dir, target_tasks_dir):
    """
    Complete pipeline for transfer learning task selection.

    Loads source and target tasks, computes distances over the combined task
    set, estimates hardness for the target tasks and recommends, for every
    target task, the three closest source tasks.

    Args:
        source_tasks_dir: Directory handed to ``Tasks.from_directory`` for the
            source tasks.
        target_tasks_dir: Directory handed to ``Tasks.from_directory`` for the
            target tasks.

    Returns:
        dict with the keys:
            'transfer_matrix': DataFrame built from ``distances['combined']``
                with one row per outer key and one column per inner key.
            'hardness_scores': value returned by ``estimate_hardness``.
            'recommendations': ``{target_id: {'recommended_sources': [...],
                'distances': [...], 'estimated_hardness': score or 'N/A'}}``.
            'distances': value returned by ``compute_all_distances``.
    """
    # 1. Load source and target tasks
    print("🔄 Loading tasks...")
    source_tasks = Tasks.from_directory(
        directory=source_tasks_dir,
        load_molecules=True,
        load_proteins=True,
    )
    target_tasks = Tasks.from_directory(
        directory=target_tasks_dir,
        load_molecules=True,
        load_proteins=True,
    )
    print(f"Source tasks: {len(source_tasks)}")
    print(f"Target tasks: {len(target_tasks)}")

    # 2. Compute task distances
    print("📊 Computing task distances...")
    task_distance = TaskDistance(
        tasks=source_tasks + target_tasks,  # Combined for comparison
        molecule_method="cosine",
        protein_method="euclidean",
    )
    distances = task_distance.compute_all_distances()

    # 3. Estimate task hardness
    print("💪 Estimating task hardness...")
    hardness_estimator = TaskHardness(
        tasks=target_tasks,
        method="combined_distance",
    )
    hardness_scores = hardness_estimator.estimate_hardness(distances['combined'])

    # 4. Create transferability matrix
    print("🗺️ Creating transferability map...")
    # The distance dicts in these examples are keyed {target_id: {source_id: d}}.
    # orient='index' keeps the outer (target) keys on the rows, which is what
    # the .loc[target_task] lookup below relies on; the plain DataFrame
    # constructor would have put them on the columns instead.
    transfer_matrix = pd.DataFrame.from_dict(distances['combined'], orient='index')

    # 5. Recommend source tasks for each target
    # Distances were computed on source + target tasks together, so a row may
    # also hold the target itself and other targets; only genuine source tasks
    # are valid recommendations.
    source_ids = set(source_tasks.get_task_ids())
    recommendations = {}
    for target_task in target_tasks.get_task_ids():
        if target_task not in transfer_matrix.index:
            continue
        row = transfer_matrix.loc[target_task]
        candidate_columns = [
            column for column in row.index
            if column in source_ids and column != target_task
        ]
        # Get closest source tasks (missing distances are not ranked)
        top_sources = row[candidate_columns].dropna().nsmallest(3)
        recommendations[target_task] = {
            'recommended_sources': top_sources.index.tolist(),
            'distances': top_sources.values.tolist(),
            'estimated_hardness': hardness_scores.get(target_task, 'N/A'),
        }

    return {
        'transfer_matrix': transfer_matrix,
        'hardness_scores': hardness_scores,
        'recommendations': recommendations,
        'distances': distances,
    }
# Run pipeline
if __name__ == "__main__":
    results = transfer_learning_pipeline(
        source_tasks_dir="datasets/train/",
        target_tasks_dir="datasets/test/",
    )

    # Print recommendations
    print("\n🎯 Transfer Learning Recommendations:")
    for target, rec in results['recommendations'].items():
        hardness = rec['estimated_hardness']
        # The pipeline stores the string 'N/A' when a target has no hardness
        # score; applying ':.3f' to a string raises ValueError, so only
        # non-string values get the numeric format.
        hardness_text = hardness if isinstance(hardness, str) else f"{hardness:.3f}"

        print(f"\nTarget: {target}")
        print(f"Hardness: {hardness_text}")
        print("Best source tasks:")
        for source, dist in zip(rec['recommended_sources'], rec['distances']):
            print(f" {source}: distance={dist:.3f}")
Batch Processing Pipeline
"""
Process large numbers of datasets efficiently.
"""
import os
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from themap.distance import MoleculeDatasetDistance
from themap.data import MoleculeDataset
def process_single_comparison(source_path, target_path, method="euclidean"):
    """
    Compute one source/target dataset distance and report it as a flat record.

    Any exception raised while loading or computing is caught and turned into
    a record whose 'status' starts with 'error: ', so one bad pair does not
    abort a batch run.

    Returns:
        dict with the keys 'source', 'target', 'distance', 'method', 'status'.
        On success the ids come from the datasets' ``task_id``; on failure
        they are the ``Path(...).stem`` of the two paths and 'distance' is None.
    """
    try:
        src_dataset = MoleculeDataset.load_from_file(source_path)
        tgt_dataset = MoleculeDataset.load_from_file(target_path)

        # Calculator built without a Tasks object; the pair is attached by hand.
        calculator = MoleculeDatasetDistance(tasks=None, molecule_method=method)
        calculator.source_molecule_datasets = [src_dataset]
        calculator.target_molecule_datasets = [tgt_dataset]
        calculator.source_task_ids = [src_dataset.task_id]
        calculator.target_task_ids = [tgt_dataset.task_id]

        per_target = calculator.get_distance()
        # Only one target was supplied, so the first value is its row of
        # per-source distances.
        first_row = list(per_target.values())[0]
        record = {
            'source': src_dataset.task_id,
            'target': tgt_dataset.task_id,
            'distance': first_row[src_dataset.task_id],
            'method': method,
            'status': 'success',
        }
    except Exception as exc:
        record = {
            'source': Path(source_path).stem,
            'target': Path(target_path).stem,
            'distance': None,
            'method': method,
            'status': f'error: {str(exc)}',
        }
    return record
def batch_distance_computation(
    source_dir,
    target_dir,
    method="euclidean",
    max_workers=4,
    output_file="distance_results.csv"
):
    """
    Compute distances between all source-target dataset pairs.

    Every ``*.jsonl.gz`` file in ``source_dir`` is paired with every
    ``*.jsonl.gz`` file in ``target_dir``; each pair is handed to
    ``process_single_comparison`` in a process pool and the collected records
    are written to ``output_file`` as CSV.

    Args:
        source_dir: Directory searched (non-recursively) for source datasets.
        target_dir: Directory searched (non-recursively) for target datasets.
        method: Distance method name forwarded to ``process_single_comparison``.
        max_workers: Size of the process pool.
        output_file: Path of the CSV file to write.

    Returns:
        pandas.DataFrame with the columns 'source', 'target', 'distance',
        'method' and 'status', one row per comparison (possibly zero rows).
    """
    # Imported here because this example's own import block does not bring in
    # pandas; the function then works when the example is copied on its own.
    import pandas as pd

    # Keys of the records produced by process_single_comparison. Passing them
    # explicitly keeps the columns present even when there are no rows, so
    # callers can filter on 'status' without a KeyError.
    result_columns = ['source', 'target', 'distance', 'method', 'status']

    # Find all dataset files
    source_files = list(Path(source_dir).glob("*.jsonl.gz"))
    target_files = list(Path(target_dir).glob("*.jsonl.gz"))
    print(f"Found {len(source_files)} source and {len(target_files)} target datasets")

    # Create all combinations
    comparisons = [
        (str(source), str(target))
        for source in source_files
        for target in target_files
    ]
    print(f"Total comparisons: {len(comparisons)}")

    # Process in parallel (no pool is started when there is nothing to do)
    results = []
    if comparisons:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Submit all jobs
            future_to_comparison = {
                executor.submit(process_single_comparison, source, target, method): (source, target)
                for source, target in comparisons
            }
            # Collect results in completion order
            for completed, future in enumerate(as_completed(future_to_comparison), start=1):
                results.append(future.result())
                if completed % 10 == 0:
                    print(f"Completed: {completed}/{len(comparisons)}")

    # Save results
    df_results = pd.DataFrame(results, columns=result_columns)
    df_results.to_csv(output_file, index=False)

    succeeded = sum(1 for r in results if r['status'] == 'success')
    print(f"\n✅ Results saved to {output_file}")
    print(f"Successful computations: {succeeded}")
    print(f"Failed computations: {len(results) - succeeded}")
    return df_results
# Run batch processing
if __name__ == "__main__":
    results_df = batch_distance_computation(
        source_dir="datasets/train/",
        target_dir="datasets/test/",
        method="euclidean",
        max_workers=4,
    )

    # Summarise only the comparisons that produced a distance
    successful_results = results_df[results_df['status'] == 'success']
    if len(successful_results) > 0:
        print("\nDistance statistics:")
        # Each entry is (label to print, name of the Series method that computes it)
        for label, statistic in (
            ("Mean", "mean"),
            ("Std", "std"),
            ("Min", "min"),
            ("Max", "max"),
        ):
            value = getattr(successful_results['distance'], statistic)()
            print(f"{label} distance: {value:.3f}")
Performance Benchmarking
"""
Benchmark different distance methods for performance and accuracy.
"""
import time
import psutil
import os
from themap.distance import MoleculeDatasetDistance
from themap.data.tasks import Tasks
def benchmark_distance_methods(tasks, methods=("euclidean", "cosine", "otdd")):
    """
    Benchmark different distance computation methods.

    For every method name a ``MoleculeDatasetDistance`` is created for
    ``tasks`` and ``get_distance()`` is called. Elapsed time, the change in
    process RSS and summary statistics of the returned distances are recorded.
    A method that raises is recorded as failed and the loop continues.

    Args:
        tasks: Object passed as ``tasks=`` to ``MoleculeDatasetDistance``.
        methods: Iterable of method names passed as ``molecule_method=``.
            The default is a tuple so no mutable default is shared between calls.

    Returns:
        dict mapping each method name to a dict with the keys
        'computation_time', 'memory_usage', 'mean_distance', 'std_distance',
        'num_comparisons' and 'status' ('success' or 'failed: <message>').
    """
    # Imported here because this example's own import block does not bring in
    # NumPy; the function then works when the example is copied on its own.
    import numpy as np

    def get_memory_usage():
        """Get the current resident set size of this process in MB."""
        process = psutil.Process(os.getpid())
        return process.memory_info().rss / 1024 / 1024

    results = {}
    for method in methods:
        print(f"\n🔍 Benchmarking {method} method...")

        # Memory before
        memory_before = get_memory_usage()

        # Time the computation; perf_counter is monotonic, so the measured
        # interval is not affected by system clock adjustments.
        start_time = time.perf_counter()
        try:
            # Create distance calculator
            distance_calc = MoleculeDatasetDistance(
                tasks=tasks,
                molecule_method=method,
            )
            # Compute distances
            distances = distance_calc.get_distance()
            end_time = time.perf_counter()
            memory_after = get_memory_usage()

            # Calculate metrics
            computation_time = end_time - start_time
            memory_usage = memory_after - memory_before

            # Flatten the nested {outer: {inner: distance}} mapping
            all_distances = [
                dist
                for target_dists in (distances.values() if distances else ())
                for dist in target_dists.values()
            ]
            if distances:
                mean_distance = np.mean(all_distances)
                std_distance = np.std(all_distances)
            else:
                mean_distance = None
                std_distance = None

            results[method] = {
                'computation_time': computation_time,
                'memory_usage': memory_usage,
                'mean_distance': mean_distance,
                'std_distance': std_distance,
                'num_comparisons': len(all_distances),
                'status': 'success',
            }
            print(f" ✅ Time: {computation_time:.2f}s")
            print(f" 💾 Memory: {memory_usage:.1f}MB")
            if mean_distance is not None:
                print(f" 📏 Mean distance: {mean_distance:.3f}")
        except Exception as e:
            # Keep benchmarking the remaining methods; record how long the
            # failing attempt ran and why it failed.
            end_time = time.perf_counter()
            results[method] = {
                'computation_time': end_time - start_time,
                'memory_usage': None,
                'mean_distance': None,
                'std_distance': None,
                'num_comparisons': 0,
                'status': f'failed: {str(e)}',
            }
            print(f" ❌ Failed: {str(e)}")
    return results
def compare_methods(benchmark_results):
    """
    Compare benchmark results across methods.

    Prints one table row per method (time, memory, mean distance, status)
    and, when at least one method succeeded, a speed ranking and a memory
    ranking of the successful methods in ascending order.

    Args:
        benchmark_results: dict mapping a method name to a dict with the keys
            'computation_time', 'memory_usage', 'mean_distance' and 'status'.
            Numeric entries may be ``None`` for failed methods.

    Returns:
        None. All output goes to stdout.
    """
    def _format_or_na(value, spec):
        """Format ``value`` with ``spec``; only a missing value becomes 'N/A'."""
        # Compare against None explicitly: 0.0 is a legitimate measurement
        # (identical datasets, no memory growth) and must not be hidden.
        return "N/A" if value is None else format(value, spec)

    print("\n📊 Benchmark Comparison:")
    print("-" * 80)
    print(f"{'Method':<10} {'Time (s)':<10} {'Memory (MB)':<12} {'Mean Dist':<10} {'Status':<15}")
    print("-" * 80)
    for method, results in benchmark_results.items():
        time_str = _format_or_na(results['computation_time'], ".2f")
        memory_str = _format_or_na(results['memory_usage'], ".1f")
        dist_str = _format_or_na(results['mean_distance'], ".3f")
        print(f"{method:<10} {time_str:<10} {memory_str:<12} {dist_str:<10} {results['status']:<15}")

    # Performance ranking considers successful runs only
    successful_methods = {
        method: results for method, results in benchmark_results.items()
        if results['status'] == 'success'
    }
    if not successful_methods:
        return

    print("\n🏆 Rankings:")
    # Speed ranking (lowest time first)
    speed_ranking = sorted(
        successful_methods,
        key=lambda name: successful_methods[name]['computation_time'],
    )
    print(f"⚡ Fastest: {' > '.join(speed_ranking)}")

    # Memory ranking (lowest usage first)
    memory_ranking = sorted(
        successful_methods,
        key=lambda name: successful_methods[name]['memory_usage'],
    )
    print(f"💾 Most memory efficient: {' > '.join(memory_ranking)}")
# Run benchmark
if __name__ == "__main__":
    # Molecule data only, restricted to the tasks in the sample task list
    tasks = Tasks.from_directory(
        directory="datasets/",
        task_list_file="datasets/sample_tasks_list.json",
        load_molecules=True,
    )

    # Benchmark the three methods, then print the comparison table
    methods_to_benchmark = ["euclidean", "cosine", "otdd"]
    results = benchmark_distance_methods(tasks=tasks, methods=methods_to_benchmark)
    compare_methods(results)
Visualization Examples
Distance Matrix Heatmap
"""
Create publication-ready distance matrix visualizations.
"""
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from themap.distance import TaskDistance
def plot_distance_matrix(distance_dict, title="Task Distance Matrix", figsize=(10, 8)):
    """
    Create a heatmap visualization of task distances.

    Args:
        distance_dict: Nested mapping ``{target_id: {source_id: distance}}``.
        title: Title drawn above the heatmap.
        figsize: Figure size in inches, passed to ``plt.figure``.

    Returns:
        The matplotlib figure containing the heatmap.
    """
    # Imported here because this example's own import block does not bring in
    # pandas; the function then works when the example is copied on its own.
    import pandas as pd

    # orient="index" puts the outer (target) keys on the rows and the inner
    # (source) keys on the columns, so the heatmap agrees with the axis
    # labels below. The plain DataFrame constructor would transpose this.
    df = pd.DataFrame.from_dict(distance_dict, orient="index")

    # Create figure
    fig = plt.figure(figsize=figsize)

    # Create heatmap on the figure that was just made current
    sns.heatmap(
        df,
        annot=True,
        fmt='.3f',
        cmap='viridis',
        cbar_kws={'label': 'Distance'},
        square=True,
    )
    plt.title(title)
    plt.xlabel('Source Tasks')
    plt.ylabel('Target Tasks')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    return fig
def plot_distance_comparison(molecule_distances, protein_distances, combined_distances):
    """
    Compare different distance types in a multi-panel plot.

    Draws three heatmaps side by side: molecule, protein and combined
    distances, in that order.

    Args:
        molecule_distances: Nested mapping ``{target_id: {source_id: distance}}``.
        protein_distances: Mapping of the same form for protein distances.
        combined_distances: Mapping of the same form for combined distances.

    Returns:
        The matplotlib figure holding the three panels.
    """
    # Imported here because this example's own import block does not bring in
    # pandas; the function then works when the example is copied on its own.
    import pandas as pd

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    panels = [
        (molecule_distances, "Molecule Distances"),
        (protein_distances, "Protein Distances"),
        (combined_distances, "Combined Distances"),
    ]
    for ax, (distances, title) in zip(axes, panels):
        # Rows = outer (target) keys, columns = inner (source) keys, matching
        # the axis labels set below.
        df = pd.DataFrame.from_dict(distances, orient="index")
        sns.heatmap(
            df,
            annot=True,
            fmt='.2f',
            cmap='viridis',
            ax=ax,
            cbar_kws={'label': 'Distance'},
        )
        ax.set_title(title)
        ax.set_xlabel('Source Tasks')
        ax.set_ylabel('Target Tasks')
    plt.tight_layout()
    return fig
# Example usage
if __name__ == "__main__":
    # This example's import block only imports TaskDistance; Tasks is needed
    # as well, so import it here to keep the script runnable on its own.
    from themap.data.tasks import Tasks

    # Load tasks and compute distances
    tasks = Tasks.from_directory("datasets/", load_molecules=True, load_proteins=True)
    task_distance = TaskDistance(tasks=tasks)
    all_distances = task_distance.compute_all_distances()

    # Create visualizations
    fig1 = plot_distance_matrix(
        all_distances['molecule'],
        title="Molecular Dataset Distances",
    )
    fig2 = plot_distance_comparison(
        all_distances['molecule'],
        all_distances['protein'],
        all_distances['combined'],
    )

    # Save figures before showing them
    fig1.savefig('molecule_distances.png', dpi=300, bbox_inches='tight')
    fig2.savefig('distance_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()
Script Templates
Command-Line Interface
#!/usr/bin/env python3
"""
Command-line interface for THEMAP distance computation.
Usage:
python distance_cli.py --source datasets/train/ --target datasets/test/ --method otdd
"""
import argparse
import json
from pathlib import Path
from themap.data.tasks import Tasks
from themap.distance import TaskDistance
def main():
    """
    Parse the command line, compute task distances and save them as JSON.

    Loads source and target tasks from the given directories, computes
    distances over the combined task set with ``TaskDistance.get_distance()``
    and writes the result to the ``--output`` file, creating its parent
    directory if needed.

    Exits via argparse (status 2) when required arguments are missing or
    ``--method`` is not one of the allowed choices.
    """
    parser = argparse.ArgumentParser(description="THEMAP Distance Computation CLI")
    parser.add_argument("--source", required=True, help="Source tasks directory")
    parser.add_argument("--target", required=True, help="Target tasks directory")
    parser.add_argument("--method", default="euclidean",
                        choices=["euclidean", "cosine", "otdd"],
                        help="Distance computation method")
    parser.add_argument("--output", default="distances.json",
                        help="Output file for distances")
    parser.add_argument("--cache-dir", default="cache/",
                        help="Cache directory for features")
    # NOTE(review): the two featurizer options are parsed but not passed to
    # anything below. Wire them into the loading/distance calls once the
    # matching parameters are confirmed; they are kept so existing command
    # lines continue to parse.
    parser.add_argument("--molecule-featurizer", default="ecfp",
                        help="Molecular featurizer to use")
    parser.add_argument("--protein-featurizer", default="esm2_t33_650M_UR50D",
                        help="Protein featurizer to use")
    args = parser.parse_args()

    print(f"🚀 Computing {args.method} distances")
    print(f"📁 Source: {args.source}")
    print(f"🎯 Target: {args.target}")

    # Load tasks
    source_tasks = Tasks.from_directory(
        directory=args.source,
        load_molecules=True,
        load_proteins=True,
        cache_dir=args.cache_dir,
    )
    target_tasks = Tasks.from_directory(
        directory=args.target,
        load_molecules=True,
        load_proteins=True,
        cache_dir=args.cache_dir,
    )
    print(f"📊 Loaded {len(source_tasks)} source and {len(target_tasks)} target tasks")

    # Compute distances over the combined task set; the same method name is
    # used for molecules and proteins.
    all_tasks = source_tasks + target_tasks
    task_distance = TaskDistance(
        tasks=all_tasks,
        molecule_method=args.method,
        protein_method=args.method,
    )
    distances = task_distance.get_distance()

    # Save results
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): json.dump raises TypeError for values that are not plain
    # JSON types (e.g. NumPy scalars) — confirm what get_distance() returns.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(distances, f, indent=2)

    print(f"💾 Results saved to {output_path}")
    print(f"✅ Computed {sum(len(d) for d in distances.values())} pairwise distances")


if __name__ == "__main__":
    main()
These examples provide a solid foundation for working with THEMAP in various scenarios. Each example is self-contained and can be adapted to your specific use case. For more detailed explanations, see our tutorials.