3. Via 2.0 Cartography on Zebrahub (Visualization)

This tutorial focuses on how to generate the Via2.0 Zebrahub Atlas View. The TI analysis is shown in Tutorial #2

import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pyVIA.core as via
from datetime import datetime

Load the data

Data has been filtered and library normalized. Data is annotated with time/stage, cluster labels, and cell type labels (coarse and fine)

print(f'{datetime.now()}\tStart reading data')
adata=sc.read_h5ad( filename='/home/user/Trajectory/Datasets/Zebrafish_Lange2023/Rawdata/Zebrahub_data_via.h5ad')
#compute PCA
adata.uns['log1p']['base'] = None #fix for small bug in scanpy #see https://github.com/theislab/single-cell-tutorial/issues/97
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
sc.tl.pca(adata, n_comps=100)
print(adata)
2024-02-06 16:48:28.584723	Start reading data
AnnData object with n_obs × n_vars = 119457 × 28187
    obs: 'n_genes', 'n_counts', 'fish', 'timepoint_cluster', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_nc', 'pct_counts_nc', 'developmental_stage', 'timepoint', 'zebrafish_anatomy_ontology_class', 'zebrafish_anatomy_ontology_id', 'parc', 'kmeans50', 'celltype_extrainfo', 'zebrafish_anatomy_ontology_class_shobi', 'adjusted_timepoint', 'bool_list', 'global annotation', 'time_coarse_cell_type', 'time_fine_cell_type', 'parc_cluster', 'parc_cluster_and_cell_type'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'nc', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'global_annotation_colors', 'hvg', 'log1p', 'pca', 'timepoint_colors', 'zebrafish_anatomy_ontology_class_colors', 'zebrafish_anatomy_ontology_id_colors'
    obsm: 'X_U', 'X_Via2_Atlas', 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts'

Extract some annotations

n_cells = adata.shape[0]
print('cells is', n_cells)
parc_cluster = [i for i in adata.obs['parc_cluster']]
parc_cluster_and_cell_type = [i for i in adata.obs['parc_cluster_and_cell_type']]
coarse_cell_type = [i for i in adata.obs['zebrafish_anatomy_ontology_class']]
fine_cell_type = [i for i in adata.obs['zebrafish_anatomy_ontology_class_shobi']] #consolidated annotaions from individual time point files
coarse_fine_cell_type = [coarse_cell_type[i] + fine_cell_type[i] for i in range(n_cells)]
time_point_numeric = [i for i in adata.obs['adjusted_timepoint']] #assigned a numeric value to each stage (e.g. 5hpf is 5, 10hpf is 10,...)
early_cell_type = [i if str(i) != 'nan' else '' for i in adata.obs['global annotation']]

hybrid_cell_time = [str(fine_cell_type[i]) + '_'+early_cell_type[i]+'_'+str(time_point_numeric[i]) for i in range(n_cells)] #labels of early stage cells which was provided for cells <24hpf also included
parc_numeric_cluster_labels = [i for i in adata.obs['parc']]
parc_cluster = [i for i in adata.obs['parc_cluster']]
cells is 119457

Find a suitable root

root_str = 'paraxial mesoderm_Somites_0'
loc_root = np.where(np.asarray(hybrid_cell_time) == root_str)[0] #'paraxial mesoderm_PSM_0' 1 cell , 'paraxial mesoderm_Somites_0' has 51 cells, 'paraxial mesoderm_Muscle_0', 'tail bud_NMPs_0'
root = [loc_root[0]]
n_pcs = 100
print(f'{datetime.now()}\troot is cell number  {root[0]} of type {fine_cell_type[root[0]]}')
2024-02-06 16:49:16.395417	root is cell number  2604 of type paraxial mesoderm

Initialize parameters

knn=30
time_series = True  # True
memory = 10
jac_std_global = 1  # 1 is used in all the umap csv files
random_seed = 0#1  # 0
knn_sequential = 15  # 5 forviagraphs #15 for embedding
t_diff_step = 2
do_gaussian_kernel_edgeweights = False
edgebundle_pruning_twice = True
edgepruning_clustering_resolution = 0.15  # 0.15 viagraph and embedding
edgebundle_pruning = 0.05  # 0.15
neighboring_terminal_states_threshold = 4  # 3
viagraph_decay = 0.6  # 0.7, 0.9
max_vis_out = 2

Set terminal states (optional)

# The parameter "user_defined_terminal_group" is being set to a list of group level annotations corresponding to the provided true_label. These were terminal fates detected in previous runs and saved for reproducability.
# This parameter can also be left to its default user_defined_terminal_group = []. the results should be similar.
user_defined_cellfates= ['0_cartilage element', '2_dermis', '49_fin',
                                          '67_pharyngeal arch', '120_cranial cartilage',
                                          '1_forebrain', '71_neuron', '89_diencephalon',
                                          '136_radial glial cell',
                                          '10_epidermal cell', '83_neuromast', 
                                          '145_enteric musculature', '55_myotome', 
                                          '46_hematopoietic cell',                                          '102_vasculature', '127_Muller cell', '105_oligodendrocyte',                            '69_liver', '90_pharynx',
                                          '95_intestine']

Initialize and Run the Via2.0 Class

# Initialize and Run the Via2.0 Class
print(f'{datetime.now()}\tRun Via2.0')

v0 = via.VIA(adata.obsm['X_pca'][:, 0:n_pcs], true_label=parc_cluster_and_cell_type, 
             labels=parc_cluster, knn=knn, memory=memory,
             user_defined_terminal_group=user_defined_cellfates,
             edgepruning_clustering_resolution=edgepruning_clustering_resolution,
             neighboring_terminal_states_threshold=neighboring_terminal_states_threshold,
             too_big_factor=0.3, resolution_parameter=1,
             root_user=root, dataset='', random_seed=random_seed,
             is_coarse=True, preserve_disconnected=False, pseudotime_threshold_TS=40, x_lazy=0.99,
             do_gaussian_kernel_edgeweights=do_gaussian_kernel_edgeweights,
             alpha_teleport=0.99, edgebundle_pruning=edgebundle_pruning, edgebundle_pruning_twice=True,
             time_series=time_series, time_series_labels=time_point_numeric, knn_sequential=knn_sequential,
             knn_sequential_reverse=knn_sequential, t_diff_step=t_diff_step,
             do_compute_embedding=False)

v0.run_VIA()
2024-02-06 16:49:16.446582	Run Via2.0
2024-02-06 16:49:16.453681	Running VIA over input data of 119457 (samples) x 100 (features)
2024-02-06 16:49:16.453681	Knngraph has 30 neighbors
2024-02-06 16:49:49.943120	Using time series information to guide knn graph construction 
2024-02-06 16:49:49.954230	Time series ordered set [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]
2024-02-06 16:50:36.547252	Shape neighbors (119457, 30) and sequential neighbors (119457, 30)
2024-02-06 16:50:36.600487	Shape augmented neighbors (119457, 60)
2024-02-06 16:50:36.651840	Actual average allowable time difference between nodes is 10.0
2024-02-06 16:50:55.131081	Finished global pruning of 30-knn graph used for clustering at level of 0.15. Kept 46.9 % of edges. 
2024-02-06 16:50:55.445815	Number of connected components used for clustergraph  is 1
2024-02-06 16:51:17.750064	Using predfined labels provided by user (this must be provided as an array)
2024-02-06 16:51:17.750064	Making cluster graph. Global cluster graph pruning level: 0.15
2024-02-06 16:51:19.093352	Graph has 1 connected components before pruning
2024-02-06 16:51:19.108996	Graph has 1 connected components after pruning
2024-02-06 16:51:19.108996	Graph has 1 connected components after reconnecting
2024-02-06 16:51:19.108996	48.0% links trimmed from local pruning relative to start
2024-02-06 16:51:19.108996	88.8% links trimmed from global pruning relative to start
2024-02-06 16:51:19.108996	Graph has 1 connected components before pruning
2024-02-06 16:51:19.108996	Graph has 3 connected components after pruning
2024-02-06 16:51:19.124624	Graph has 1 connected components after reconnecting
2024-02-06 16:51:19.124624	48.0% links trimmed from local pruning relative to start
2024-02-06 16:51:19.124624	92.1% links trimmed from global pruning relative to start
2024-02-06 16:51:19.124624	component number 0 out of  [0]
C:\Users\Kevin tsia\anaconda3\envs\via2\lib\site-packages\scipy\sparse\_index.py:103: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  self._set_intXint(row, col, x.flat[0])
2024-02-06 16:51:20.255203	The root index, 2604 provided by the user belongs to cluster number 97 and corresponds to cell type 97_paraxial mesoderm
2024-02-06 16:51:20.404829	Computing lazy-teleporting expected hitting times
2024-02-06 16:51:31.983877	 ended all multiprocesses, will retrieve and reshape
try rw2 hitting times setup
start computing walks with rw2 method
g.indptr.size, 147
C:\Users\Kevin tsia\anaconda3\envs\via2\lib\site-packages\pecanpy\graph.py:90: UserWarning: WARNING: Implicitly set node IDs to the canonical node ordering due to missing IDs field in the raw CSR npz file. This warning message can be suppressed by setting implicit_ids to True in the read_npz function call, or by setting the --implicit_ids flag in the CLI
  warnings.warn(
memory for rw2 hittings times  2. Using rw2 based pt
do scaling of pt
2024-02-06 16:51:41.963854	Terminal cluster list based on user defined cells/groups: [('0_cartilage element', 0), ('2_dermis', 2), ('49_fin', 49), ('67_pharyngeal arch', 67), ('120_cranial cartilage', 120), ('1_forebrain', 1), ('71_neuron', 71), ('89_diencephalon', 89), ('136_radial glial cell', 136), ('10_epidermal cell', 10), ('83_neuromast', 83), ('145_enteric musculature', 145), ('55_myotome', 55), ('46_hematopoietic cell', 46), ('102_vasculature', 102), ('127_Muller cell', 127), ('105_oligodendrocyte', 105), ('69_liver', 69), ('90_pharynx', 90), ('95_intestine', 95)]
2024-02-06 16:51:41.963854	Terminal clusters corresponding to unique lineages in this component are [0, 2, 49, 67, 120, 1, 71, 89, 136, 10, 83, 145, 55, 46, 102, 127, 105, 69, 90, 95] 
TESTING rw2_lineage probability at memory 10
testing rw2 lineage probability at memory 10
g.indptr.size, 147
C:\Users\Kevin tsia\anaconda3\envs\via2\lib\site-packages\pecanpy\graph.py:90: UserWarning: WARNING: Implicitly set node IDs to the canonical node ordering due to missing IDs field in the raw CSR npz file. This warning message can be suppressed by setting implicit_ids to True in the read_npz function call, or by setting the --implicit_ids flag in the CLI
  warnings.warn(
2024-02-06 16:51:48.486988	 Cluster or terminal cell fate 0 is reached 265.0 times
2024-02-06 16:51:48.683785	 Cluster or terminal cell fate 2 is reached 321.0 times
2024-02-06 16:51:48.923914	 Cluster or terminal cell fate 49 is reached 38.0 times
2024-02-06 16:51:49.118724	 Cluster or terminal cell fate 67 is reached 326.0 times
2024-02-06 16:51:49.340747	 Cluster or terminal cell fate 120 is reached 169.0 times
2024-02-06 16:51:49.568767	 Cluster or terminal cell fate 1 is reached 65.0 times
2024-02-06 16:51:49.816750	 Cluster or terminal cell fate 71 is reached 45.0 times
2024-02-06 16:51:50.007561	 Cluster or terminal cell fate 89 is reached 417.0 times
2024-02-06 16:51:50.233503	 Cluster or terminal cell fate 136 is reached 116.0 times
2024-02-06 16:51:50.461331	 Cluster or terminal cell fate 10 is reached 71.0 times
2024-02-06 16:51:50.692439	 Cluster or terminal cell fate 83 is reached 30.0 times
2024-02-06 16:51:50.925563	 Cluster or terminal cell fate 145 is reached 116.0 times
2024-02-06 16:51:51.169573	 Cluster or terminal cell fate 55 is reached 45.0 times
2024-02-06 16:51:51.405499	 Cluster or terminal cell fate 46 is reached 25.0 times
2024-02-06 16:51:51.625457	 Cluster or terminal cell fate 102 is reached 124.0 times
2024-02-06 16:51:51.854602	 Cluster or terminal cell fate 127 is reached 109.0 times
2024-02-06 16:51:52.087725	 Cluster or terminal cell fate 105 is reached 59.0 times
2024-02-06 16:51:52.314733	 Cluster or terminal cell fate 69 is reached 47.0 times
2024-02-06 16:51:52.546824	 Cluster or terminal cell fate 90 is reached 66.0 times
2024-02-06 16:51:52.788913	 Cluster or terminal cell fate 95 is reached 40.0 times
2024-02-06 16:51:53.453693	There are (20) terminal clusters corresponding to unique lineages {0: '0_pectoral fin', 2: '2_dermis', 49: '49_fin', 67: '67_pharyngeal arch', 120: '120_cranial cartilage', 1: '1_forebrain', 71: '71_neuron', 89: '89_diencephalon', 136: '136_radial glial cell', 10: '10_epidermal cell', 83: '83_neuromast', 145: '145_enteric musculature', 55: '55_myotome', 46: '46_hematopoietic cell', 102: '102_vasculature', 127: '127_Muller cell', 105: '105_oligodendrocyte', 69: '69_liver', 90: '90_pharynx', 95: '95_intestine'}
2024-02-06 16:51:53.453693	Begin projection of pseudotime and lineage likelihood
2024-02-06 16:52:02.358324	Start reading data
2024-02-06 16:52:02.358324	Correlation of Via pseudotime with developmental stage 73.33 %
2024-02-06 16:52:02.400584	Cluster graph layout based on forward biasing
2024-02-06 16:52:02.426671	Graph has 1 connected components before pruning
2024-02-06 16:52:02.432658	Graph has 58 connected components after pruning
2024-02-06 16:52:02.478882	Graph has 1 connected components after reconnecting
2024-02-06 16:52:02.479882	67.2% links trimmed from local pruning relative to start
2024-02-06 16:52:02.479882	80.4% links trimmed from global pruning relative to start
2024-02-06 16:52:02.479882	Additional Visual cluster graph pruning for edge bundling at level: 0.15
2024-02-06 16:52:02.481883	Make via clustergraph edgebundle
C:\Users\Kevin tsia\anaconda3\envs\via2\lib\site-packages\scipy\sparse\_index.py:103: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  self._set_intXint(row, col, x.flat[0])
2024-02-06 16:52:05.635522	Hammer dims: Nodes shape: (146, 2) Edges shape: (890, 3)
2024-02-06 16:52:05.853588	Time elapsed 143.6 seconds

Computing the Atlas View

Set parameter init_pos

  • ‘via’ : uses via graph to initialize embedding

  • ‘spectral’ : uses normalized Laplacian of the fuzzy 1-skeleton

  • n_cell x 2 dimensional array preset by the user

Once you have the single cell atlas embedding, you can compute the edge layout and generate the full Atlas View

compute_atlas_embedding = True
if compute_atlas_embedding:
    # STEP 3
    print(f'{datetime.now()}\tStart Atlas embedding computation')
    atlas_embedding = via.via_atlas_emb(via_object=v0, init_pos='via', min_dist=0.2)
else: atlas_embedding = embedding=adata.obsm['X_Via2_Atlas'] #use pre-saved embedding

v0.embedding = atlas_embedding #remember to pass this onto the via_object
2024-02-06 16:52:05.878707	Start Atlas embedding computation
X-input (119457, 100)
len membership and n_cells 119457 119457
n cell 119457
2024-02-06 16:52:05.878707	Computing embedding on sc-Viagraph
2024-02-06 16:52:10.671518	using via cluster graph to initialize embedding
(119457, 100)
	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs

Re-orient (optional)

If you want to apply a rotation transformation on the embedding

#re-orient/rotate. This can be skipped. Only use it if neceseary First plot out the atlas_embedding (code in next snipped) and see if you want to rotate it so the early cells are on the LHS. import math theta_deg = 180 theta_radians = math.sin(math.radians(theta_deg)) R = np.array([[math.cos(theta_radians), -math.sin(theta_radians)],[math.sin(theta_radians),math.cos(theta_radians)]]) atlas_embedding_ = np.matmul(R,atlas_embedding.T) atlas_embedding_=atlas_embedding.T v0.embedding = atlas_embedding_

Plot Atlas Embedding’

print(f'{datetime.now()}\tPlot Atlas Embedding')
f1, ax = via.plot_scatter(embedding=atlas_embedding, labels=time_point_numeric, cmap='plasma', s=5,
                          alpha=0.5, edgecolors='None',
                          title='atlas', text_labels=False)
f1.set_size_inches(10, 10)

f2, ax = via.plot_scatter(embedding=atlas_embedding, labels=coarse_cell_type, title='atlas celltype',
                          alpha=0.5, s=5)
f2.set_size_inches(10, 10)
plt.show()
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
2024-02-06 17:02:03.305009	Plot Atlas Embedding
../_images/85f8728b1469431a0d855922a789456b73d099dacc82a51d3b119b6825ce5c41.png ../_images/4244eef9926088cd51361edd0b63410a016c0f05b64edcf75bbeb193ec0ef034.png

Atlas View Edges

  1. Compute the edges and pass it onto the via_object to avoid recomputing each time you plot

print(f'{datetime.now()}\tStart Computing Atlas View')
dict_coarse = {'notochord':2.5, 'central_nervous_system':5.5,'neural_crest':4.5,'paraxial_mesoderm':0,'intermediate_mesoderm':1,'lateral_mesoderm':1.5,
                       'mesenchyme':2.0,'hematopoietic_system':0.5,'periderm':8,'endoderm':7}
decay = 0.55#0.7  # increasing decay increasing merging of edges
i_bw = 0.035#0.05  # increasing bw increases merging of edges
global_visual_pruning = 0.55
n_milestones = 300

hammerbundle_dict = via.make_edgebundle_milestone(via_object=v0, n_milestones=n_milestones, decay=decay,
                                                  initial_bandwidth=i_bw,
                                                  global_visual_pruning=global_visual_pruning,
                                                  sc_labels_numeric=time_point_numeric)
v0.hammerbundle_milestone_dict = hammerbundle_dict  # make the hmbd in one shot to be used for all plots
2024-02-06 17:02:15.718628	Start Computing Atlas View
2024-02-06 17:02:15.718628	Start finding milestones
2024-02-06 17:02:50.911185	End milestones with 300
2024-02-06 17:02:52.349099	Recompute weights
2024-02-06 17:02:52.414034	pruning milestone graph based on recomputed weights
2024-02-06 17:02:52.417055	Graph has 1 connected components before pruning
2024-02-06 17:02:52.419138	Graph has 1 connected components after pruning
2024-02-06 17:02:52.419138	Graph has 1 connected components after reconnecting
2024-02-06 17:02:52.425153	regenerate igraph on pruned edges
2024-02-06 17:02:52.757955	Setting numeric label as time_series_labels or other sequential metadata for coloring edges
2024-02-06 17:02:53.431797	Making smooth edges
print(f'{datetime.now()}\tPlot Atlas View')
# color by tissue type, we 
color_dict = {'notochord': 2.5, 'central_nervous_system': 5.5, 'neural_crest': 4.5, 'paraxial_mesoderm': 0,
               'intermediate_mesoderm': 1, 'lateral_mesoderm': 1.5,
               'mesenchyme': 2.0, 'hematopoietic_system': 0.5, 'periderm': 8, 'endoderm': 7}

#color by major cell type
headwidth_bundle = 0.2
fig, ax = via.plot_atlas_view(via_object=v0, sc_labels=hybrid_cell_time, fontsize_labels=4, text_labels=True,
                              add_sc_embedding=True, sc_scatter_alpha=0.25, sc_scatter_size=5, 
                              cmap='rainbow', linewidth_bundle=3, 
                              sc_labels=coarse_cell_type, 
                              headwidth_bundle=headwidth_bundle)
fig.set_size_inches(35, 25)

plt.show()
2024-02-06 17:03:23.164984	Plot Atlas View
inside add sc embedding second if
../_images/641c3666337e34e2d2801f0d4170a4014fd9a804fbe2f63660d68c976a453e36.png
#if you want to color by categories (e.g. tissue type)
#autogenerate color_dict 
#color_dict = {}
#set_labels = list(set(labels))
#set_labels.sort(reverse=True)
#for index, value in enumerate(set_labels): color_dict[value] = index

#color by stage
fig2, ax2 = via.plot_atlas_view(via_object=v0, sc_labels=hybrid_cell_time, fontsize_labels=2, text_labels=False,
                                add_sc_embedding=True, sc_scatter_alpha=0.25, sc_scatter_size=5, cmap='plasma',
                                sc_labels_expression=time_point_numeric,
                                linewidth_bundle=3, 
                                headwidth_bundle=headwidth_bundle)
fig2.set_size_inches(35, 25)
inside add sc embedding second if
../_images/9b10ae4020cfe4132a9deaec57b9902dbbe4effd24e74a60cfbff912224d1a4a.png