Benchmarking lung integration#

Here we walkthrough applying the integration benchmarking metrics on the lung atlas example from the scIB paper.

import numpy as np
import scanpy as sc

from scib_metrics.benchmark import Benchmarker, BioConservation, BatchCorrection

%matplotlib inline

Load and preprocess data#

adata = sc.read(
    "data/lung_atlas.h5ad",
    backup_url="https://figshare.com/ndownloader/files/24539942",
)
adata
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="cell_ranger", batch_key="batch")
sc.tl.pca(adata, n_comps=30, use_highly_variable=True)

We subset to the highly variable genes so that each method has the same input.

adata = adata[:, adata.var.highly_variable].copy()
adata.obsm["Unintegrated"] = adata.obsm["X_pca"]

Run methods#

Here we run a few embedding-based methods. By focusing on embedding-based methods, we can substantially reduce the runtime of the benchmarking metrics.

In principle, graph-based integration methods can also be benchmarked on some of the metrics that have graph inputs. Future work can explore using graph convolutional networks to embed the graph and then using the embedding-based metrics.

Scanorama#

import scanorama

# List of adata per batch
batch_cats = adata.obs.batch.cat.categories
adata_list = [adata[adata.obs.batch == b].copy() for b in batch_cats]
scanorama.integrate_scanpy(adata_list)

adata.obsm["Scanorama"] = np.zeros((adata.shape[0], adata_list[0].obsm["X_scanorama"].shape[1]))
for i, b in enumerate(batch_cats):
    adata.obsm["Scanorama"][adata.obs.batch == b] = adata_list[i].obsm["X_scanorama"]

Liger#

import pyliger

bdata = adata.copy()
# Pyliger normalizes by library size with a size factor of 1
# So here we give it the count data
bdata.X = bdata.layers["counts"]
# List of adata per batch
adata_list = [bdata[bdata.obs.batch == b].copy() for b in batch_cats]
for i, ad in enumerate(adata_list):
    ad.uns["sample_name"] = batch_cats[i]
    # Hack to make sure each method uses the same genes
    ad.uns["var_gene_idx"] = np.arange(bdata.n_vars)


liger_data = pyliger.create_liger(adata_list, remove_missing=False, make_sparse=False)
# Hack to make sure each method uses the same genes
liger_data.var_genes = bdata.var_names
pyliger.normalize(liger_data)
pyliger.scale_not_center(liger_data)
pyliger.optimize_ALS(liger_data, k=30)
pyliger.quantile_norm(liger_data)


adata.obsm["LIGER"] = np.zeros((adata.shape[0], liger_data.adata_list[0].obsm["H_norm"].shape[1]))
for i, b in enumerate(batch_cats):
    adata.obsm["LIGER"][adata.obs.batch == b] = liger_data.adata_list[i].obsm["H_norm"]

Harmony#

from harmony import harmonize

adata.obsm["Harmony"] = harmonize(adata.obsm["X_pca"], adata.obs, batch_key="batch")

scVI#

import scvi

scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")
vae = scvi.model.SCVI(adata, gene_likelihood="nb", n_layers=2, n_latent=30)
vae.train()
adata.obsm["scVI"] = vae.get_latent_representation()

scANVI#

lvae = scvi.model.SCANVI.from_scvi_model(
    vae,
    adata=adata,
    labels_key="cell_type",
    unlabeled_category="Unknown",
)
lvae.train(max_epochs=20, n_samples_per_label=100)
adata.obsm["scANVI"] = lvae.get_latent_representation()

Perform the benchmark#

bm = Benchmarker(
    adata,
    batch_key="batch",
    label_key="cell_type",
    bio_conservation_metrics=BioConservation(),
    batch_correction_metrics=BatchCorrection(),
    embedding_obsm_keys=["Unintegrated", "Scanorama", "LIGER", "Harmony", "scVI", "scANVI"],
    n_jobs=6,
)
bm.benchmark()

Visualize the results#

bm.plot_results_table()
../_images/72bdb33d77de77ce505e24fef1d700c2b91985b1d3fcce1c2b1dd7a35c8a647f.png
<plottable.table.Table at 0x7f477571fee0>
bm.plot_results_table(min_max_scale=False)
../_images/72fe472fdb279e95aba93bd88bf06ad186ff74cc30d6ec63c47b0025da83ccc0.png
<plottable.table.Table at 0x7f47749a5930>

We can also access the underlying dataframes to print the results ourselves.

from rich import print

df = bm.get_results(min_max_scale=False)
print(df)
               Isolated labels        KMeans NMI        KMeans ARI  \
Embedding                                                            
Unintegrated          0.661278          0.659168          0.427279   
Scanorama             0.626327          0.704883          0.490931   
LIGER                 0.621488          0.570878          0.329819   
Harmony               0.554801          0.606546          0.432499   
scVI                  0.617493          0.650634          0.509873   
scANVI                0.712235          0.747422          0.587875   
Metric Type   Bio conservation  Bio conservation  Bio conservation   

              Silhouette label             cLISI  Silhouette batch  \
Embedding                                                            
Unintegrated          0.585415               1.0          0.855258   
Scanorama             0.564498           0.99961          0.932364   
LIGER                 0.542106           0.98501          0.850703   
Harmony               0.564499          0.993888          0.879939   
scVI                  0.538018          0.994553          0.895502   
scANVI                0.613172          0.999632          0.884228   
Metric Type   Bio conservation  Bio conservation  Batch correction   

                         iLISI              KBET Graph connectivity  \
Embedding                                                             
Unintegrated          0.008071          0.223882           0.764953   
Scanorama              0.06491          0.313843           0.689019   
LIGER                  0.16332          0.463284           0.694717   
Harmony               0.130195          0.432907           0.756294   
scVI                  0.124561          0.347675           0.896865   
scANVI                0.095614          0.337137           0.932888   
Metric Type   Batch correction  Batch correction   Batch correction   

                PCR comparison Batch correction Bio conservation  \
Embedding                                                          
Unintegrated                 0         0.370433         0.666628   
Scanorama             0.204786         0.440984          0.67725   
LIGER                 0.810099         0.596425          0.60986   
Harmony               0.521998         0.544267         0.630447   
scVI                  0.871396           0.6272         0.662114   
scANVI                0.734378         0.596849         0.732067   
Metric Type   Batch correction  Aggregate score  Aggregate score   

                        Total  
Embedding                      
Unintegrated          0.54815  
Scanorama            0.582743  
LIGER                0.604486  
Harmony              0.595975  
scVI                 0.648148  
scANVI                0.67798  
Metric Type   Aggregate score  
df.transpose()
Embedding Unintegrated Scanorama LIGER Harmony scVI scANVI Metric Type
Isolated labels 0.661278 0.626327 0.621488 0.554801 0.617493 0.712235 Bio conservation
KMeans NMI 0.659168 0.704883 0.570878 0.606546 0.650634 0.747422 Bio conservation
KMeans ARI 0.427279 0.490931 0.329819 0.432499 0.509873 0.587875 Bio conservation
Silhouette label 0.585415 0.564498 0.542106 0.564499 0.538018 0.613172 Bio conservation
cLISI 1.0 0.99961 0.98501 0.993888 0.994553 0.999632 Bio conservation
Silhouette batch 0.855258 0.932364 0.850703 0.879939 0.895502 0.884228 Batch correction
iLISI 0.008071 0.06491 0.16332 0.130195 0.124561 0.095614 Batch correction
KBET 0.223882 0.313843 0.463284 0.432907 0.347675 0.337137 Batch correction
Graph connectivity 0.764953 0.689019 0.694717 0.756294 0.896865 0.932888 Batch correction
PCR comparison 0 0.204786 0.810099 0.521998 0.871396 0.734378 Batch correction
Batch correction 0.370433 0.440984 0.596425 0.544267 0.6272 0.596849 Aggregate score
Bio conservation 0.666628 0.67725 0.60986 0.630447 0.662114 0.732067 Aggregate score
Total 0.54815 0.582743 0.604486 0.595975 0.648148 0.67798 Aggregate score