3. Via 2.0 Cartography on Zebrahub (Visualization)

This tutorial focuses on how to generate the Via2.0 Zebrahub Atlas View. The TI analysis is shown in Tutorial #2

import scanpy as sc
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pyVIA.core as via
from datetime import datetime

Load the data

Data has been filtered and library normalized. Data is annotated with time/stage, cluster labels, and cell type labels (coarse and fine)

print(f'{datetime.now()}\tStart reading data')
adata=sc.read_h5ad( filename='/home/user/Trajectory/Datasets/Zebrafish_Lange2023/Rawdata/Zebrahub_data_via.h5ad')
#compute PCA
adata.uns['log1p']['base'] = None #fix for small bug in scanpy #see https://github.com/theislab/single-cell-tutorial/issues/97
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
sc.tl.pca(adata, n_comps=100)
print(adata)

2024-02-06 16:48:28.584723	Start reading data
AnnData object with n_obs × n_vars = 119457 × 28187
    obs: 'n_genes', 'n_counts', 'fish', 'timepoint_cluster', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_nc', 'pct_counts_nc', 'developmental_stage', 'timepoint', 'zebrafish_anatomy_ontology_class', 'zebrafish_anatomy_ontology_id', 'parc', 'kmeans50', 'celltype_extrainfo', 'zebrafish_anatomy_ontology_class_shobi', 'adjusted_timepoint', 'bool_list', 'global annotation', 'time_coarse_cell_type', 'time_fine_cell_type', 'parc_cluster', 'parc_cluster_and_cell_type'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'nc', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'global_annotation_colors', 'hvg', 'log1p', 'pca', 'timepoint_colors', 'zebrafish_anatomy_ontology_class_colors', 'zebrafish_anatomy_ontology_id_colors'
    obsm: 'X_U', 'X_Via2_Atlas', 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts'

Extract some annotations

n_cells = adata.shape[0]
print('cells is', n_cells)
parc_cluster = [i for i in adata.obs['parc_cluster']]
parc_cluster_and_cell_type = [i for i in adata.obs['parc_cluster_and_cell_type']]
coarse_cell_type = [i for i in adata.obs['zebrafish_anatomy_ontology_class']]
fine_cell_type = [i for i in adata.obs['zebrafish_anatomy_ontology_class_shobi']] #consolidated annotaions from individual time point files
coarse_fine_cell_type = [coarse_cell_type[i] + fine_cell_type[i] for i in range(n_cells)]
time_point_numeric = [i for i in adata.obs['adjusted_timepoint']] #assigned a numeric value to each stage (e.g. 5hpf is 5, 10hpf is 10,...)
early_cell_type = [i if str(i) != 'nan' else '' for i in adata.obs['global annotation']]

hybrid_cell_time = [str(fine_cell_type[i]) + '_'+early_cell_type[i]+'_'+str(time_point_numeric[i]) for i in range(n_cells)] #labels of early stage cells which was provided for cells <24hpf also included
parc_numeric_cluster_labels = [i for i in adata.obs['parc']]
parc_cluster = [i for i in adata.obs['parc_cluster']]

cells is 119457

Find a suitable root

root_str = 'paraxial mesoderm_Somites_0'
loc_root = np.where(np.asarray(hybrid_cell_time) == root_str)[0] #'paraxial mesoderm_PSM_0' 1 cell , 'paraxial mesoderm_Somites_0' has 51 cells, 'paraxial mesoderm_Muscle_0', 'tail bud_NMPs_0'
root = [loc_root[0]]
n_pcs = 100
print(f'{datetime.now()}\troot is cell number  {root[0]} of type {fine_cell_type[root[0]]}')

2024-02-06 16:49:16.395417	root is cell number  2604 of type paraxial mesoderm

Initialize parameters

knn=30
time_series = True  # True
memory = 10
jac_std_global = 1  # 1 is used in all the umap csv files
random_seed = 0#1  # 0
knn_sequential = 15  # 5 forviagraphs #15 for embedding
t_diff_step = 2
do_gaussian_kernel_edgeweights = False
edgebundle_pruning_twice = True
edgepruning_clustering_resolution = 0.15  # 0.15 viagraph and embedding
edgebundle_pruning = 0.05  # 0.15
neighboring_terminal_states_threshold = 4  # 3
viagraph_decay = 0.6  # 0.7, 0.9
max_vis_out = 2

Set terminal states (optional)

# The parameter "user_defined_terminal_group" is being set to a list of group level annotations corresponding to the provided true_label. These were terminal fates detected in previous runs and saved for reproducability.
# This parameter can also be left to its default user_defined_terminal_group = []. the results should be similar.
user_defined_cellfates= ['0_cartilage element', '2_dermis', '49_fin',
                                          '67_pharyngeal arch', '120_cranial cartilage',
                                          '1_forebrain', '71_neuron', '89_diencephalon',
                                          '136_radial glial cell',
                                          '10_epidermal cell', '83_neuromast', 
                                          '145_enteric musculature', '55_myotome', 
                                          '46_hematopoietic cell',                                          '102_vasculature', '127_Muller cell', '105_oligodendrocyte',                            '69_liver', '90_pharynx',
                                          '95_intestine']

Initialize and Run the Via2.0 Class

# Initialize and Run the Via2.0 Class
print(f'{datetime.now()}\tRun Via2.0')

v0 = via.VIA(adata.obsm['X_pca'][:, 0:n_pcs], true_label=parc_cluster_and_cell_type, 
             labels=parc_cluster, knn=knn, memory=memory,
             user_defined_terminal_group=user_defined_cellfates,
             edgepruning_clustering_resolution=edgepruning_clustering_resolution,
             neighboring_terminal_states_threshold=neighboring_terminal_states_threshold,
             too_big_factor=0.3, resolution_parameter=1,
             root_user=root, dataset='', random_seed=random_seed,
             is_coarse=True, preserve_disconnected=False, pseudotime_threshold_TS=40, x_lazy=0.99,
             do_gaussian_kernel_edgeweights=do_gaussian_kernel_edgeweights,
             alpha_teleport=0.99, edgebundle_pruning=edgebundle_pruning, edgebundle_pruning_twice=True,
             time_series=time_series, time_series_labels=time_point_numeric, knn_sequential=knn_sequential,
             knn_sequential_reverse=knn_sequential, t_diff_step=t_diff_step,
             do_compute_embedding=False)

v0.run_VIA()

2024-02-06 16:49:16.446582	Run Via2.0
2024-02-06 16:49:16.453681	Running VIA over input data of 119457 (samples) x 100 (features)
2024-02-06 16:49:16.453681	Knngraph has 30 neighbors
2024-02-06 16:49:49.943120	Using time series information to guide knn graph construction 
2024-02-06 16:49:49.954230	Time series ordered set [0, 5, 10, 15, 20, 25, 30, 35, 40, 45]
2024-02-06 16:50:36.547252	Shape neighbors (119457, 30) and sequential neighbors (119457, 30)
2024-02-06 16:50:36.600487	Shape augmented neighbors (119457, 60)
2024-02-06 16:50:36.651840	Actual average allowable time difference between nodes is 10.0
2024-02-06 16:50:55.131081	Finished global pruning of 30-knn graph used for clustering at level of 0.15. Kept 46.9 % of edges. 
2024-02-06 16:50:55.445815	Number of connected components used for clustergraph  is 1
2024-02-06 16:51:17.750064	Using predfined labels provided by user (this must be provided as an array)
2024-02-06 16:51:17.750064	Making cluster graph. Global cluster graph pruning level: 0.15
2024-02-06 16:51:19.093352	Graph has 1 connected components before pruning
2024-02-06 16:51:19.108996	Graph has 1 connected components after pruning
2024-02-06 16:51:19.108996	Graph has 1 connected components after reconnecting
2024-02-06 16:51:19.108996	48.0% links trimmed from local pruning relative to start
2024-02-06 16:51:19.108996	88.8% links trimmed from global pruning relative to start
2024-02-06 16:51:19.108996	Graph has 1 connected components before pruning
2024-02-06 16:51:19.108996	Graph has 3 connected components after pruning
2024-02-06 16:51:19.124624	Graph has 1 connected components after reconnecting
2024-02-06 16:51:19.124624	48.0% links trimmed from local pruning relative to start
2024-02-06 16:51:19.124624	92.1% links trimmed from global pruning relative to start
2024-02-06 16:51:19.124624	component number 0 out of  [0]

C:\Users\Kevin tsia\anaconda3\envs\via2\lib\site-packages\scipy\sparse\_index.py:103: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  self._set_intXint(row, col, x.flat[0])

2024-02-06 16:51:20.255203	The root index, 2604 provided by the user belongs to cluster number 97 and corresponds to cell type 97_paraxial mesoderm
2024-02-06 16:51:20.404829	Computing lazy-teleporting expected hitting times
2024-02-06 16:51:31.983877	 ended all multiprocesses, will retrieve and reshape
try rw2 hitting times setup
start computing walks with rw2 method
g.indptr.size, 147

C:\Users\Kevin tsia\anaconda3\envs\via2\lib\site-packages\pecanpy\graph.py:90: UserWarning: WARNING: Implicitly set node IDs to the canonical node ordering due to missing IDs field in the raw CSR npz file. This warning message can be suppressed by setting implicit_ids to True in the read_npz function call, or by setting the --implicit_ids flag in the CLI
  warnings.warn(

memory for rw2 hittings times  2. Using rw2 based pt
do scaling of pt
2024-02-06 16:51:41.963854	Terminal cluster list based on user defined cells/groups: [('0_cartilage element', 0), ('2_dermis', 2), ('49_fin', 49), ('67_pharyngeal arch', 67), ('120_cranial cartilage', 120), ('1_forebrain', 1), ('71_neuron', 71), ('89_diencephalon', 89), ('136_radial glial cell', 136), ('10_epidermal cell', 10), ('83_neuromast', 83), ('145_enteric musculature', 145), ('55_myotome', 55), ('46_hematopoietic cell', 46), ('102_vasculature', 102), ('127_Muller cell', 127), ('105_oligodendrocyte', 105), ('69_liver', 69), ('90_pharynx', 90), ('95_intestine', 95)]
2024-02-06 16:51:41.963854	Terminal clusters corresponding to unique lineages in this component are [0, 2, 49, 67, 120, 1, 71, 89, 136, 10, 83, 145, 55, 46, 102, 127, 105, 69, 90, 95] 
TESTING rw2_lineage probability at memory 10
testing rw2 lineage probability at memory 10
g.indptr.size, 147

C:\Users\Kevin tsia\anaconda3\envs\via2\lib\site-packages\pecanpy\graph.py:90: UserWarning: WARNING: Implicitly set node IDs to the canonical node ordering due to missing IDs field in the raw CSR npz file. This warning message can be suppressed by setting implicit_ids to True in the read_npz function call, or by setting the --implicit_ids flag in the CLI
  warnings.warn(

2024-02-06 16:51:48.486988	 Cluster or terminal cell fate 0 is reached 265.0 times
2024-02-06 16:51:48.683785	 Cluster or terminal cell fate 2 is reached 321.0 times
2024-02-06 16:51:48.923914	 Cluster or terminal cell fate 49 is reached 38.0 times
2024-02-06 16:51:49.118724	 Cluster or terminal cell fate 67 is reached 326.0 times
2024-02-06 16:51:49.340747	 Cluster or terminal cell fate 120 is reached 169.0 times
2024-02-06 16:51:49.568767	 Cluster or terminal cell fate 1 is reached 65.0 times
2024-02-06 16:51:49.816750	 Cluster or terminal cell fate 71 is reached 45.0 times
2024-02-06 16:51:50.007561	 Cluster or terminal cell fate 89 is reached 417.0 times
2024-02-06 16:51:50.233503	 Cluster or terminal cell fate 136 is reached 116.0 times
2024-02-06 16:51:50.461331	 Cluster or terminal cell fate 10 is reached 71.0 times
2024-02-06 16:51:50.692439	 Cluster or terminal cell fate 83 is reached 30.0 times
2024-02-06 16:51:50.925563	 Cluster or terminal cell fate 145 is reached 116.0 times
2024-02-06 16:51:51.169573	 Cluster or terminal cell fate 55 is reached 45.0 times
2024-02-06 16:51:51.405499	 Cluster or terminal cell fate 46 is reached 25.0 times
2024-02-06 16:51:51.625457	 Cluster or terminal cell fate 102 is reached 124.0 times
2024-02-06 16:51:51.854602	 Cluster or terminal cell fate 127 is reached 109.0 times
2024-02-06 16:51:52.087725	 Cluster or terminal cell fate 105 is reached 59.0 times
2024-02-06 16:51:52.314733	 Cluster or terminal cell fate 69 is reached 47.0 times
2024-02-06 16:51:52.546824	 Cluster or terminal cell fate 90 is reached 66.0 times
2024-02-06 16:51:52.788913	 Cluster or terminal cell fate 95 is reached 40.0 times
2024-02-06 16:51:53.453693	There are (20) terminal clusters corresponding to unique lineages {0: '0_pectoral fin', 2: '2_dermis', 49: '49_fin', 67: '67_pharyngeal arch', 120: '120_cranial cartilage', 1: '1_forebrain', 71: '71_neuron', 89: '89_diencephalon', 136: '136_radial glial cell', 10: '10_epidermal cell', 83: '83_neuromast', 145: '145_enteric musculature', 55: '55_myotome', 46: '46_hematopoietic cell', 102: '102_vasculature', 127: '127_Muller cell', 105: '105_oligodendrocyte', 69: '69_liver', 90: '90_pharynx', 95: '95_intestine'}
2024-02-06 16:51:53.453693	Begin projection of pseudotime and lineage likelihood
2024-02-06 16:52:02.358324	Start reading data
2024-02-06 16:52:02.358324	Correlation of Via pseudotime with developmental stage 73.33 %
2024-02-06 16:52:02.400584	Cluster graph layout based on forward biasing
2024-02-06 16:52:02.426671	Graph has 1 connected components before pruning
2024-02-06 16:52:02.432658	Graph has 58 connected components after pruning
2024-02-06 16:52:02.478882	Graph has 1 connected components after reconnecting
2024-02-06 16:52:02.479882	67.2% links trimmed from local pruning relative to start
2024-02-06 16:52:02.479882	80.4% links trimmed from global pruning relative to start
2024-02-06 16:52:02.479882	Additional Visual cluster graph pruning for edge bundling at level: 0.15
2024-02-06 16:52:02.481883	Make via clustergraph edgebundle

C:\Users\Kevin tsia\anaconda3\envs\via2\lib\site-packages\scipy\sparse\_index.py:103: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  self._set_intXint(row, col, x.flat[0])

2024-02-06 16:52:05.635522	Hammer dims: Nodes shape: (146, 2) Edges shape: (890, 3)
2024-02-06 16:52:05.853588	Time elapsed 143.6 seconds

Computing the Atlas View

Set parameter init_pos

‘via’ : uses via graph to initialize embedding

‘spectral’ : uses normalized Laplacian of the fuzzy 1-skeleton

n_cell x 2 dimensional array preset by the user

Once you have the single cell atlas embedding, you can compute the edge layout and generate the full Atlas View

compute_atlas_embedding = True
if compute_atlas_embedding:
    # STEP 3
    print(f'{datetime.now()}\tStart Atlas embedding computation')
    atlas_embedding = via.via_atlas_emb(via_object=v0, init_pos='via', min_dist=0.2)
else: atlas_embedding = embedding=adata.obsm['X_Via2_Atlas'] #use pre-saved embedding

v0.embedding = atlas_embedding #remember to pass this onto the via_object

2024-02-06 16:52:05.878707	Start Atlas embedding computation
X-input (119457, 100)
len membership and n_cells 119457 119457
n cell 119457
2024-02-06 16:52:05.878707	Computing embedding on sc-Viagraph
2024-02-06 16:52:10.671518	using via cluster graph to initialize embedding
(119457, 100)

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs

Re-orient (optional)

If you want to apply a rotation transformation on the embedding

#re-orient/rotate. This can be skipped. Only use it if neceseary First plot out the atlas_embedding (code in next snipped) and see if you want to rotate it so the early cells are on the LHS. import math theta_deg = 180 theta_radians = math.sin(math.radians(theta_deg)) R = np.array([[math.cos(theta_radians), -math.sin(theta_radians)],[math.sin(theta_radians),math.cos(theta_radians)]]) atlas_embedding_ = np.matmul(R,atlas_embedding.T) atlas_embedding_=atlas_embedding.T v0.embedding = atlas_embedding_

Plot Atlas Embedding’

print(f'{datetime.now()}\tPlot Atlas Embedding')
f1, ax = via.plot_scatter(embedding=atlas_embedding, labels=time_point_numeric, cmap='plasma', s=5,
                          alpha=0.5, edgecolors='None',
                          title='atlas', text_labels=False)
f1.set_size_inches(10, 10)

f2, ax = via.plot_scatter(embedding=atlas_embedding, labels=coarse_cell_type, title='atlas celltype',
                          alpha=0.5, s=5)
f2.set_size_inches(10, 10)
plt.show()

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

2024-02-06 17:02:03.305009	Plot Atlas Embedding

../_images/85f8728b1469431a0d855922a789456b73d099dacc82a51d3b119b6825ce5c41.png

../_images/4244eef9926088cd51361edd0b63410a016c0f05b64edcf75bbeb193ec0ef034.png

Atlas View Edges

Compute the edges and pass it onto the via_object to avoid recomputing each time you plot

print(f'{datetime.now()}\tStart Computing Atlas View')
dict_coarse = {'notochord':2.5, 'central_nervous_system':5.5,'neural_crest':4.5,'paraxial_mesoderm':0,'intermediate_mesoderm':1,'lateral_mesoderm':1.5,
                       'mesenchyme':2.0,'hematopoietic_system':0.5,'periderm':8,'endoderm':7}
decay = 0.55#0.7  # increasing decay increasing merging of edges
i_bw = 0.035#0.05  # increasing bw increases merging of edges
global_visual_pruning = 0.55
n_milestones = 300

hammerbundle_dict = via.make_edgebundle_milestone(via_object=v0, n_milestones=n_milestones, decay=decay,
                                                  initial_bandwidth=i_bw,
                                                  global_visual_pruning=global_visual_pruning,
                                                  sc_labels_numeric=time_point_numeric)
v0.hammerbundle_milestone_dict = hammerbundle_dict  # make the hmbd in one shot to be used for all plots

2024-02-06 17:02:15.718628	Start Computing Atlas View
2024-02-06 17:02:15.718628	Start finding milestones
2024-02-06 17:02:50.911185	End milestones with 300
2024-02-06 17:02:52.349099	Recompute weights
2024-02-06 17:02:52.414034	pruning milestone graph based on recomputed weights
2024-02-06 17:02:52.417055	Graph has 1 connected components before pruning
2024-02-06 17:02:52.419138	Graph has 1 connected components after pruning
2024-02-06 17:02:52.419138	Graph has 1 connected components after reconnecting
2024-02-06 17:02:52.425153	regenerate igraph on pruned edges
2024-02-06 17:02:52.757955	Setting numeric label as time_series_labels or other sequential metadata for coloring edges
2024-02-06 17:02:53.431797	Making smooth edges

print(f'{datetime.now()}\tPlot Atlas View')
# color by tissue type, we 
color_dict = {'notochord': 2.5, 'central_nervous_system': 5.5, 'neural_crest': 4.5, 'paraxial_mesoderm': 0,
               'intermediate_mesoderm': 1, 'lateral_mesoderm': 1.5,
               'mesenchyme': 2.0, 'hematopoietic_system': 0.5, 'periderm': 8, 'endoderm': 7}

#color by major cell type
headwidth_bundle = 0.2
fig, ax = via.plot_atlas_view(via_object=v0, sc_labels=hybrid_cell_time, fontsize_labels=4, text_labels=True,
                              add_sc_embedding=True, sc_scatter_alpha=0.25, sc_scatter_size=5, 
                              cmap='rainbow', linewidth_bundle=3, 
                              sc_labels=coarse_cell_type, 
                              headwidth_bundle=headwidth_bundle)
fig.set_size_inches(35, 25)

plt.show()

2024-02-06 17:03:23.164984	Plot Atlas View
inside add sc embedding second if

../_images/641c3666337e34e2d2801f0d4170a4014fd9a804fbe2f63660d68c976a453e36.png

#if you want to color by categories (e.g. tissue type)
#autogenerate color_dict 
#color_dict = {}
#set_labels = list(set(labels))
#set_labels.sort(reverse=True)
#for index, value in enumerate(set_labels): color_dict[value] = index

#color by stage
fig2, ax2 = via.plot_atlas_view(via_object=v0, sc_labels=hybrid_cell_time, fontsize_labels=2, text_labels=False,
                                add_sc_embedding=True, sc_scatter_alpha=0.25, sc_scatter_size=5, cmap='plasma',
                                sc_labels_expression=time_point_numeric,
                                linewidth_bundle=3, 
                                headwidth_bundle=headwidth_bundle)
fig2.set_size_inches(35, 25)

inside add sc embedding second if

../_images/9b10ae4020cfe4132a9deaec57b9902dbbe4effd24e74a60cfbff912224d1a4a.png