Source code for VIA.core

import numpy as np
import pandas as pd
import scipy
from scipy import sparse
from scipy.sparse import csr_matrix, csgraph, find
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
import igraph as ig
import leidenalg
import time
from datetime import datetime
import hnswlib
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from matplotlib.path import get_path_collection_extents
import multiprocessing
from collections import Counter
from typing import Optional, Union

from pyVIA.plotting_via import *
from pyVIA.utils_via import *
from pyVIA.utils_via import _construct_knn, sequential_knn

#from plotting_via import *
#from utils_via import *
#from utils_via import _construct_knn, sequential_knn
from sklearn.preprocessing import normalize
import math

from pecanpy import pecanpy as node2vec

### to upload to pip core via2 file


def _composite_jacAffinity_distanceAffinity(sc_csr_graph: csr_matrix, projected_distances: ndarray = None):
    '''
    make a composite score for distance based affinity and  jaccard score
    :param sc_csr_graph: distances based affinities (.data is affinity)
    :param projected_distances: if using projected distances adjusted based on velocity then provide them
    :return: igraph of affinities based on Jaccard and distance-based affinities
    '''
    edges_ = list(zip(*sc_csr_graph.nonzero()))  # in the case of time-series, this is the augmented graph
    # Jaccard computation in igraph does not consider weights. However, doing the global pruning on distance based weights seems less effective than pruning on Jaccard similarities
    jac_edgeweights_list = ig.Graph(edges_, edge_attrs={'weight': sc_csr_graph.data}).similarity_jaccard(
        pairs=edges_)

    # the graph has self loops, so we need to clip these values as they have a jacc of 1, and much higher than other neighbors
    jac_edgeweights_list = np.clip(np.array(jac_edgeweights_list), a_max=np.percentile(np.array(jac_edgeweights_list),
                                                                                       95), a_min=None).tolist()

    if projected_distances is None:
        composite_edgeweights = np.multiply(sc_csr_graph.data, np.array(
            jac_edgeweights_list))  # 0<J<1 #combining distance-based affinities and jac-based affinities
    else:

        composite_edgeweights = np.multiply(projected_distances, np.array(
            jac_edgeweights_list))

    composite_igraph = ig.Graph(edges_, edge_attrs={'weight': composite_edgeweights.tolist()}).simplify(
        combine_edges='sum')  # used for clustergraph #in the case of time-series, this is the augmented graph

    return composite_igraph


def _get_loc_terminal_states(via0, X_input: ndarray):
    '''

    we need the location of terminal states from first iteration (Via0) to pass onto the second iterations of Via (Via1)
    this will allow identification of the terminal-cluster in fine-grained Via1 that best captures the terminal state from coarse Via0

    :param via0: coarse grained iteration of object class VIA
    :param X_input: via0.data. the data matrix (PCs) on which the TI is inferred
    :return:
    '''

    tsi_list = []  # find the single-cell which is nearest to the average-location of a terminal cluster in PCA space
    for tsi in via0.terminal_clusters:
        loc_i = np.where(np.asarray(via0.labels) == tsi)[0]
        val_pt = [via0.single_cell_pt_markov[i] for i in loc_i]
        th_pt = np.percentile(val_pt, 50)  # 50
        loc_i = [loc_i[i] for i in range(len(val_pt)) if val_pt[i] >= th_pt]
        temp = np.mean(X_input[loc_i], axis=0)
        labelsq, distances = via0.knn_struct.knn_query(temp, k=1)
        tsi_list.append(labelsq[0][0])
    return tsi_list

def _prob_reaching_terminal_state(terminal_state, A, root, n_simulations, q,
                                 cumstateChangeHist, cumstateChangeHist_all, seed):
    # this function is defined outside the VIA class to enable smooth parallel processing in Windows

    n = A.shape[0]

    A /= np.max(A)
    for i, r in enumerate(A):
        if np.all(r == 0):
            A[i, i] = 1
    # ensure probabilities sum to 1 along the rows
    P = A / A.sum(axis=1).reshape((n, 1))

    n_steps, count_reach_terminal_state = 2 * n, 0
    for _ in range(n_simulations):
        cur_state = root
        change_hist = np.zeros((n, n))
        change_hist[root, root] = 1

        x, terminal_state_found = 0, False

        while x < n_steps and not terminal_state_found:
            seed += 1
            np.random.seed(seed)
            next_state = np.random.choice(range(P.shape[0]), p=P[cur_state])
            if next_state == terminal_state:
                terminal_state_found = True

            change_hist[cur_state, next_state] += 1
            cur_state = next_state
            x += 1

        if terminal_state_found:
            cumstateChangeHist += np.any(change_hist > 0, axis=0)
            count_reach_terminal_state += 1
        cumstateChangeHist_all += np.any(change_hist > 0, axis=0)

    cumstateChangeHist_all[cumstateChangeHist_all == 0] = 1
    q.append([cumstateChangeHist, cumstateChangeHist_all])


def _rw2_walks(A: ndarray, root: int, memory: float = 0.8, weighted: bool = True, implicit_ids: bool = False,
              num_walks: int = 100, p_memory: float = 1.0, x_lazy: float = 0.95,
              alpha_teleport: float = 0.99):
    '''
    :param A: cluster level transition matrix (forward biased)
    :param root: root cluster index
    :param memory: 1/q * edge weight to a next-node that is not a neighbor of previous node. larger number means more memory and more introspective walk. small number <1 means more exploration
    :param p_memory: 1/p * edge weight to next node = previous node
    :param weighted: default True edge weights are provided as part of the array A
    :param walk_length: default None (walk_length = num_nodes *2)
    :return: implicit_ids
    '''
    from pecanpy.graph import AdjlstGraph
    # initialize SparseGraph object
    # g = AdjlstGraph()
    # setting up the lazy-teleporting behaviour
    n_states = A.shape[0]
    P = A / A.sum(axis=1).reshape((n_states, 1))
    # print('row normed P',P.shape, P, P.sum(axis=1))

    # bias_P is the transition probability matrix

    P = x_lazy * P + (1 - x_lazy) * np.identity(n_states)
    # print(P, P.sum(axis=1))
    P = alpha_teleport * P + ((1 - alpha_teleport) * (1 / n_states) * (np.ones((n_states, n_states))))

    from numba_progress import ProgressBar
    A_csr = csr_matrix(A)  # csr_matrix(P) #use P if you want to add in the Lazy-teleporting. I find using A directly works a bit better.
    # print(f'A_csr {A_csr}')
    # print(f'A dense {A}')
    g = node2vec.SparseOTF(p=p_memory, q=memory, workers=4, verbose=True,
                           extend=True)  # larger q, more memory and introverted search #
    # using the extend=True which is the node2vec+ implementation from Jan 2023 Renming Liu Bioinformatics paper that differentiates between weakly connected (prev, next) and strongly connected (prev, next)
    # https://pecanpy.readthedocs.io/en/latest/pecanpy.html
    g.indptr = A_csr.indptr  # ["indptr"].astype(np.uint32)
    g.indices = A_csr.indices  # ["indices"].astype(np.uint32)
    g.data = A_csr.data  # ["data"].astype(np.float32)

    if g.data is None:
        raise ValueError("Adjacency matrix data not found.")
    elif not weighted:
        print('no weights so assign 1 to all edges')
        g.data[:] = 1.0  # overwrite edge weights with constant
    num_nodes = g.indptr.size
    walk_length = num_nodes * 2
    #print(f'g.indptr.size, {num_nodes}')
    # look deeper at how the node id mapping is done: https://pecanpy.readthedocs.io/en/latest/_modules/pecanpy/graph.html#DenseGraph.read_edg
    g.set_node_ids(node_ids=None,
                   implicit_ids=implicit_ids,
                   num_nodes=int(num_nodes - 1))

    # generate random walks, which could then be used to train w2v
    # walks = g.simulate_walks(num_walks=2, walk_length=3)
    # https://pecanpy.readthedocs.io/en/latest/_modules/pecanpy/pecanpy.html#Base.simulate_walks
    g._preprocess_transition_probs()

    nodes = np.array([root], dtype=np.uint32)  # range(g.num_nodes
    # print(f'nodes {range(g.num_nodes)} and nodes {nodes}')
    start_node_idx_ary = np.concatenate([nodes] * num_walks)  # root<=> nodes
    # print(f'start_node_array {start_node_idx_ary}')
    tot_num_jobs = start_node_idx_ary.size

    random_state = g.random_state
    np.random.seed(random_state)
    np.random.shuffle(start_node_idx_ary)  # for balanced work load

    move_forward = g.get_move_forward()
    has_nbrs = g.get_has_nbrs()
    verbose = g.verbose

    # Acquire numba progress proxy for displaying the progress bar
    with ProgressBar(total=tot_num_jobs, disable=not verbose) as progress:
        walk_idx_mat = g._random_walks(
            tot_num_jobs,
            walk_length,
            random_state,
            start_node_idx_ary,
            has_nbrs,
            move_forward,
            progress,
        )
    # print(f'walk mat {walk_idx_mat}')
    # Map node index back to node ID
    walks = [g._map_walk(walk_idx_ary) for walk_idx_ary in walk_idx_mat]
    walks = np.asarray(walks).astype(int)
    # print(f'via walks and shape of n_walks x n_steps {walks.shape}') #ndarray n_walks x n_steps

    # emd = g.embed(num_walks=5, walk_length=num_nodes*2)
    # print(f'emd {emd.shape} {emd}')
    return walks


def _compute_rw2_lineageprobability(A: ndarray, root: int, memory: float = 0.8, p_memory: float = 1.0,
                                   num_walks: int = 1000, terminal_states: list = None, x_lazy: float = 0.95,
                                   alpha_teleport: float = 0.99):
    print(f'{datetime.now()}\tCalculating lineage probability at memory {memory}')
    n_states = A.shape[0]
    n_terminal_states = len(terminal_states)
    ts_index_counter = 0
    prob_mat = np.zeros((n_states, n_terminal_states))
    walks = _rw2_walks(A=A, root=root, memory=memory, num_walks=num_walks, p_memory=p_memory, x_lazy=x_lazy,
                      alpha_teleport=alpha_teleport)

    # if n_steps is None: n_steps = 2 * n_states
    n_steps = walks.shape[1] - 1
    for terminal_state in terminal_states:

        cumstateChangeHist = np.zeros((1, n_states))
        cumstateChangeHist_all = np.zeros((1, n_states))
        terminal_state_found_inAllWalks = False

        count_reached_bool = 0
        count_reach_terminal_state = 0
        for row in walks:
            change_hist = np.zeros((n_states, n_states))
            change_hist[root, root] = 1
            x = 0
            # print(f'walk {row}')
            terminal_state_found = False
            while x < n_steps and not terminal_state_found:
                cur_state = row[x]
                next_state = row[x + 1]
                if next_state == terminal_state:
                    terminal_state_found = True
                    count_reached_bool += 1

                change_hist[cur_state, next_state] += 1
                x += 1

            if terminal_state_found:
                cumstateChangeHist += np.any(change_hist > 0, axis=0)
                count_reach_terminal_state += 1
            cumstateChangeHist_all += np.any(change_hist > 0, axis=0)
        # print(f'count reached bool {count_reached_bool}')
        cumstateChangeHist_all[
            cumstateChangeHist_all == 0] = 1  # to avoid division by zero we say that the state has been visited once
        count_reached = cumstateChangeHist_all[0, terminal_state]
        # print(f'terminal state HistAll {terminal_state} is reached {count_reached} times')
        count_reached = cumstateChangeHist[0, terminal_state]
        # print(f'terminal state {terminal_state} is reached {count_reached} times')
        # print( f"{datetime.now()}\tFrom root {root},  TS {terminal_state} is reached {int(count_reached)} times.")
        np.set_printoptions(precision=3)
        prob_ = cumstateChangeHist / cumstateChangeHist_all
        # print(f'cumstatechangeHist {cumstateChangeHist}')
        # print(f'cumstatechangeHist_all {cumstateChangeHist_all}')
        # print(f'prob_ pre scaling {prob_}')
        # given a cluster is traversed (cumhistory_vec_all), how many of these traversals led to a successful path to Terminal State X
        # i.e. with what probability does traversal of a cluster lead to the desired terminal state (number of times traversed along a sucessful path)  #cumhistory_vec: number of times cluster is traversed on a successful path

        if count_reached == 0:
            prob_[:, terminal_state] = 0
            print(
                f'{datetime.now()}\t{terminal_state} cluster/state is never reached. try increase number of KNN (which will increase number of edges) or try to increase the value of jac_std_global and cluster_graph_pruning which will lower edge pruning')
        else:
            loc_1 = np.where(prob_ == 1)
            loc_1 = loc_1[1]
            prob_[0, loc_1] = 0
            # print('zerod out prob', prob_)
            temp_ = np.max(prob_)
            if temp_ == 0: temp_ = 1
            prob_ = prob_ / min(1,
                                1.1 * temp_)  # do some scaling to amplify the probabiltiies of those clusters that are not 1 to make the range of values more compact
            prob_[0, loc_1] = 1  # put back probability =1
            print(
                f'{datetime.now()}\tCluster or terminal cell fate {terminal_state} is reached {count_reached} times')  # {prob_}
        prob_mat[:, ts_index_counter] = prob_
        ts_index_counter += 1

    return prob_mat


def _compute_rw2_hittingtimes(A: ndarray, root: int, memory: float = 0.8, num_walks: int = 1300, x_lazy: float = 0.95,
                             alpha_teleport: float = 0.99):
    '''

    :param walks: ndarray walks of shape n_walks x n_steps
    :param A: ndarray of clustergraph (values are edge weights, higher means stronger edge)
    :return:
    '''
    # from scipy.sparse import csc_matrix
    # A_csr = csr_matrix(A)
    # print(f'cluster graph A', A)
    # generate ndarray of walks (n_walks x n_steps+1)
    print(f'{datetime.now()}\tstart computing walks with rw2 method')
    walks = _rw2_walks(A=A, root=root, memory=memory, num_walks=num_walks, x_lazy=x_lazy, alpha_teleport=alpha_teleport)
    num_nodes = A.shape[0]
    num_walks = walks.shape[0]
    walk_length = walks.shape[                      1] - 1  # (the first element in the walk is the root, so the walks array has n_steps + 1 element
    hitting_array = np.zeros((num_nodes, num_walks))

    walk_count = 0
    for walk_i in walks:
        walk_i_dist_cumulative = [0]
        # print(f'walk_i {walk_i}')
        for step in range(walk_length):
            edge_weight = A[walk_i[step], walk_i[step + 1]]
            # print(f'edge weight {edge_weight}')
            edge_distance = 1. / (1 + math.exp(edge_weight - 1))  # convert to distance
            walk_i_dist_cumulative.append(walk_i_dist_cumulative[-1] + edge_distance)
            # print(f'edge distance',edge_distance)
            # print(f'cumulative walk {walk_i_dist_cumulative}')
        for node_i in range(num_nodes):
            # print(f'node {node_i}')
            first_time_at_statei = np.where(walk_i == node_i)[0]

            if len(first_time_at_statei) == 0:
                hitting_array[node_i, walk_count] = walk_length + 1
            else:
                # print(f'first time hitting node {node_i} is {first_time_at_statei[0]}')
                hitting_array[node_i, walk_count] = walk_i_dist_cumulative[first_time_at_statei[0]]
        walk_count += 1
    # print(f'hitting_array {hitting_array}')

    # extract relevant hitting times
    hitting_array_final = np.zeros((1, num_nodes))
    no_times_state_reached_array = np.zeros((1, num_nodes))
    for i in range(num_nodes):
        rowtemp = hitting_array[i, :]
        no_times_state_reached_array[0, i] = np.sum(rowtemp != (walk_length + 1))

    for i in range(num_nodes):
        rowtemp = hitting_array[i, :]
        no_times_state_reached = np.sum(rowtemp != (walk_length + 1))
        if no_times_state_reached != 0:
            perc = np.percentile(rowtemp[rowtemp != walk_length + 1], 20) + 0.001  # 15 for Human and Toy
            # print('state ', i,' has perc' ,perc)
            hitting_array_final[0, i] = np.mean(rowtemp[rowtemp <= perc])
        else:
            hitting_array_final[0, i] = (walk_length + 1)

    return hitting_array_final[0]


def _simulate_markov_sub(A, num_sim, hitting_array, q, root, seed):
    # sub process that actually does the mcmcs
    n_states = A.shape[0]
    P = A / A.sum(axis=1).reshape((n_states, 1))

    hitting_array_temp = np.zeros((P.shape[0], 1)).astype('float64')
    n_steps = int(2 * n_states)

    currentState = root

    state = np.zeros((1, n_states))
    state[0, currentState] = 1
    state_root = state.copy()
    for i in range(num_sim):
        dist_list = []
        state = state_root
        currentState = root
        stateHist = state
        for x in range(n_steps):
            seed += 1
            np.random.seed(seed)
            nextState = np.random.choice(range(P.shape[0]), p=P[currentState])
            dist = A[currentState, nextState]
            dist_list.append(1. / (1 + math.exp(dist - 1)))

            state = np.zeros((1, n_states))
            state[0, nextState] = 1.0
            currentState = nextState
            # Keep track of state history
            stateHist = np.append(stateHist, state, axis=0)

        for state_i in range(P.shape[0]):
            first_time_at_statei = np.where(stateHist[:, state_i] == 1)[0]
            if len(first_time_at_statei) == 0:
                hitting_array_temp[state_i, 0] = n_steps + 1
            else:
                total_dist = 0
                for ff in range(first_time_at_statei[0]):
                    total_dist = dist_list[ff] + total_dist

                hitting_array_temp[state_i, 0] = total_dist  # first_time_at_statei[0]
        hitting_array = np.append(hitting_array, hitting_array_temp, axis=1)

    hitting_array = hitting_array[:, 1:]
    q.append(hitting_array)


def get_biased_weights(edges, weights, pt, round=1):
    # small nu means less forward biasing (0.5 is quite mild)
    # larger nu (in our case 1/nu) means more aggressive forwards biasing https://en.wikipedia.org/wiki/Generalised_logistic_function

    # using the pseudotime calculated from lazy-jumping walk. Otherwise using the refined MCMC Psuedotimes before
    # calculating lineage likelihood paths
    b = 1 if round == 1 else 20

    weights_thr, pct_thr = weights.mean(), np.percentile(pt, 80)
    loc_high_pt = set(np.where(pt > pct_thr)[0])
    for i in np.where(weights > weights_thr)[0]:
        start, end = edges[i]
        if start in loc_high_pt or end in loc_high_pt:
            weights[i] = 0.5 * weights.mean()
    weights.clip(np.percentile(weights, 10), np.percentile(weights, 90))

    bias_weight, K, c, C, nu = [], 1, 0, 1, 1
    for (s, t), w in zip(edges, weights):
        t_ab = pt[s] - pt[t]
        bias_weight.append(w * K / (C + math.exp(b * (t_ab + c))) ** nu)

    return bias_weight


def expected_num_steps(start_i, N):
    return np.dot(N, np.ones(N.shape[0]))[start_i]


def absorption_probability(N, R, absorption_state_j):
    M = np.dot(N, R)
    return M, M[:, absorption_state_j]


[docs]class VIA: ''' A class to represent the VIA analysis Parameters ---------- data: ndarray input matrix of size n_cells x n_dims. Expects the PCs or features that will be used in the TI computation. Can be e.g. adata.obsm['X_pca][:,0:20] true_label:list list of str/int that correspond to the ground truth or reference annotations. Can also be None when no labels are available labels: ndarray (nsamples, ) default is None. and PARC clusters are used for the viagraph. alternatively provide a list of clustermemberships that are integer values (not strings) to construct the viagraph using another clustering method or available annotations edgepruning_clustering_resolution_local:float default = 2 local level of pruning for PARC graph clustering stage. Range (0.1,3) higher numbers mean more edge retention. For large datasets can stick to just tuning edgepruning_clustering_resolution edgepruning_clustering_resolution: float (optional, default = 0.15, can also set as 'median') graph pruning for PARC clustering stage. Higher value keeps more edges, results in fewer clusters. Smaller value removes more edges and results in more clusters. Number of standard deviations below the network’s mean-jaccard-weighted edges. 0.1-1 provide reasonable pruning. higher value means less pruning (more edges retained). e.g. a value of 0.15 means all edges that are above mean(edgeweight)-0.15*std(edge-weights) are retained. We find both 0.15 and ‘median’ to yield good results/starting point and resulting in pruning away ~ 50-60% edges keep_all_local_dist: bool, str default value of 'auto' means that for smaller datasets local-pruning is done prior to clustering, but for large datasets local pruning is set to False for speed. can also set to be bool of True or False too_big_factor: float (optional, default=0.4). Forces clusters > 0.4*n_cells to be re-clustered resolution_parameter: float (default =1) larger value means more and smaller clusters partition_type:str (default "ModularityVP") Options small_pop: int (default 10) Via attempts to merge Clusters with a population < 10 cells with larger clusters. If you have a very small dataset (e.g. few hundred cells), then consider lowering to e.g. 5 jac_weighted_edges:bool (default = True) Use weighted edges in the PARC clustering step knn: int (optional, default = 30) number of K-Nearest Neighbors for HNSWlib KNN graph. Larger knn means more graph connectivity. Lower knn means more loosely connected clusters/cells n_iter_leiden:int random_seed: int Random seed to pass to clustering num_threads: distance:str (default 'l2') Euclidean distance 'l2' by default; other options 'ip' and 'cosine' for graph construction and similarity visual_cluster_graph_pruning: float (optional, default = 0.15) This only comes into play if the user deliberately chooses not to use the default edge-bundling method of visualizating edges (draw_piechart_graph()) and instead calls draw_piechart_graph_nobundle(). It is often set to the same value as the PARC clustering level of edgepruning_clustering_resolution. This does not impact computation of terminal states, pseudotime or lineage likelihoods. It controls the number of edges plotted for visual effect cluster_graph_pruning: float (optional, default =0.15) Pruning level of the cluster graph (does not impact number of clusters). Only impacts the connectivity of the clustergraph. Often set to the same value as the PARC clustering level of edgepruning_clustering_resolution.Reasonable range [0.1,1] To retain more connectivity in the clustergraph underlying the trajectory computations, increase the value time_smallpop: max time to be allowed handling singletons x_lazy:float (default =0.95) 1-x = probability of staying in same node (lazy). Values between 0.9-0.99 are reasonable alpha_teleport: float (default = 0.99) 1-alpha is probability of jumping. Values between 0.95-0.99 are reasonable unless prior knowledge of teleportation root_user: list, None can be a list of strings, a list of int or None (default is None) When the root_user is set as None and an RNA velocity matrix is available, a root will be automatically computed if the root_user is None and not velocity matrix is provided, then an arbitrary root is selected if the root_user is ['celltype_earlystage'] where the str corresponds to an item in true_label, then a suitable starting point will be selected corresponding to this group if the root_user is [678], where 678 is the index of the cell chosen as a start cell, then this will be the designated starting cell. It is possible to give a list of root indices and groups. [120, 699] or ['traj1_earlystage', 'traj2_earlystage'] when there are more than one trajectories preserve_disconnected: bool (default = True) If you believe there may be disconnected trajectories then set this to False dataset: str Can be set to 'group' or '' (default). this refers to the type of root label (group level root or single cell index) you are going to provide. if your true_label has a sensible group of cells for a root then you can set dataset to 'group' and make the root parameter ['labelname_root_cell_type'] if your root corresponds to one particular cell then set dataset = '' (default) embedding: ndarray (optional, default = None) embedding (e.g. precomputed tsne, umap, phate, via-umap) for plotting data. Size n_cells x 2 If an embedding is provided when running VIA, then a scatterplot colored by pseudotime, highlighting terminal fates velo_weight: float (optional, default = 0.5) #float between [0,1]. the weight assigned to directionality and connectivity derived from scRNA-velocity neighboring_terminal_states_threshold:int (default = 3). Candidates for terminal states that are neighbors of each other may be removed from the list if they have this number of more of terminal states as neighbors knn_sequential:int (default =10) number of knn in the adjacent time-point for time-series data (t_i and t_i+1) knn_sequential_reverse: int (default = 0) number of knn enforced from current to previous time point t_diff_step: int (default =1) Number of permitted temporal intervals between connected nodes. If time data is labeled as [0,25,50,75,100,..] then t_diff_step=1 corresponds to '25' and only edges within t_diff_steps are retained is_coarse:bool (default = True) If running VIA in two iterations where you wish to link the second fine-grained iteration with the initial iteration, then you set to False via_coarse: VIA (default = None) If instantiating a second iteration of VIA that needs to be linked to a previous iteration (e.g. via0), then set via_coarse to the previous via0 object df_annot: DataFrame (default None) used for the Mouse Organ data preserve_disconnected_after_pruning:bool (default = False) If you believe there are disconnected trajectories then set this to True and test your hypothesis A_velo: ndarray Cluster Graph Transition matrix based on rna velocity [n_clus x n_clus] velocity_matrix: matrix (default None) matrix of size [n_samples x n_genes]. this is the velocity matrix computed by scVelo (or similar package) and stored in adata.layers['velocity']. The genes used for computing velocity should correspond to those useing in gene_matrix Requires gene_matrix to be provided too. gene_matrix: matrix (default None) Only used if Velocity_matrix is available. matrix of size [n_samples x n_genes]. We recommend using a subset like HVGs rather than full set of genes. (need to densify input if taking from adata = adata.X.todense()) time_series:bool (default False) if the data has time-series labels then set to True time_series_labels:list (default None) list of integer values of temporal annoataions corresponding to e.g. hours (post fert), days, or sequential ordering pca_loadings: array (default None) the loadings of the pcs used to project the cells (to projected euclidean location based on velocity). n_cells x n_pcs secondary_annotations: None (default None) edgebundle_pruning:float (default=None) will by default be set to the same as the cluster_graph_pruning and influences the visualized level of pruning of edges. Typical values can be between [0,1] with higher numbers retaining more edges edgebundle_pruning_twice:bool default: False. When True, the edgebundling is applied to a further visually pruned (visual_cluster_graph_pruning) and can sometimes simplify the visualization. it does not impact the pseudotime and lineage computations piegraph_arrow_head_width: float (default = 0.1) size of arrow heads in via cluster graph piegraph_edgeweight_scalingfactor: (defaulf = 1.5) scaling factor for edge thickness in via cluster graph max_visual_outgoing_edges: int (default =2) Only allows max_visual_outgoing_edges to come out of any given node. Used in differentiation_flow() edgebundle_pruning:float (default=None) will by default be set to the same as the cluster_graph_pruning and influences the visualized level of pruning of edges. Typical values can be between [0,1] with higher numbers retaining more edges edgebundle_pruning_twice:bool default: False. When True, the edgebundling is applied to a further visually pruned (visual_cluster_graph_pruning) and can sometimes simplify the visualization for very cluttered graphs. it does not impact the pseudotime and lineage computations pseudotime_threshold_TS: int (default = 30) corresponds to the criteria for a state to be considered a candidate terminal cell fate to be 30% or later of the computed psuedotime range num_mcmc_simulations:int (default = 1300) number of random walk simulations conducted embedding_type: str (default = 'via-mds', other options are 'via-atlas' and 'via-force' do_compute_embedding: bool (default = False) If you want an embedding (n_samples x2) to be computed on the basis of the via sc graph then set this to True do_gaussian_kernel_edgeweights: bool (default = False) Type of edgeweighting on the graph edges memory: float (default = 2) higher q means more memory, more retrospective/inwards randomwalk. memory = 2 means run using the non-memory Via 1.0 mode viagraph_decay: float (default = 0.9) increasing decay causes more edges to merge memory: 1/q * edge weight to a next-node that is not a neighbor of previous node. larger number means more memory and more introspective walk. small number <1 means more exploration p_memory: 1/p * edge weight to next node = previous node. large value means more exploration graph_init_pos: matrix (or list of lists) to initialize the viagraph spatial_coords: np.ndarray of size n_cells x 2 (denoting x,y coordinates) of each spot/cell do_spatial_knn: Whether or not to do spatial mode of StaVia for graph augmentation do_spatial_layout: whether to use spatial coords for layout of the clustergraph spatial_knn:int = 15. number of knn's added based on spatial proximity indiciated by spatial_coords spatial_aux:list = [] a list of slice IDs so that only cells/spots on the same slice are considered when building the spatial_knn graph Attributes ------------ labels: array length (n_samples, ) of cluster labels ndarray pre determined cluster labels user defined. #np.asarray(pre_labels).flatten() single_cell_pt_markov: list length n_samples of pseudotime single_cell_bp: ndarray [n_lineages x n_samples] array of single cell branching probabilities towards each lineage (lineage normalized). Each column corresponds to a terminal state, in the order presented by the terminal_clusters attribute single_cell_bp_rownormed: ndarray [n_lineages x n_samples] array of single cell branching probabilities towards each lineage (cell normalized). Each column corresponds to a terminal state, in the order presented by the terminal_clusters attribute terminal_clusters: list list of clusters that are cell fates/ unique lineages cluster_bp: ndarray [n_clusters x n_terminal_states]. Lineage probability of cluster towards a particular terminal cluster state CSM: ndarray [n_cluster x n_clusters] array of cosine similarity used to weight the cluster graph transition matrix by velocity single_cell_transition_matrix: ndarray [n_samples x n_samples] terminal_clusters:list (default None) list of terminal clusters csr_full_graph: csr matrix of single-cell graph (augmented with sequential data when providing time_series information) csr_array_locally_pruned: csr matrix ig_full_graph: full_neighbor_array: user_defined_terminal_cell:list=[] user_defined_terminal_group:list=[] n_milestones: int = None Number of milestones in the via-mds computation (anything more than 10,000 can be computationally heavy and time consuming) Typically auto-determined within the via-mds function embedding: ndarray [n_cells x 2] provided by user or autocomputed with via-mds or via-umap ''' def __init__(self, data: ndarray, true_label=None, edgepruning_clustering_resolution_local: float = 1, edgepruning_clustering_resolution=0.15, labels: ndarray = None, keep_all_local_dist='auto', too_big_factor: float = 0.4, resolution_parameter: float = 1.0, partition_type: str = "ModularityVP", small_pop: int = 10, jac_weighted_edges: bool = True, knn: int = 30, n_iter_leiden: int = 5, random_seed: int = 42, num_threads=-1, distance='l2', time_smallpop=15, super_cluster_labels: bool = False, super_node_degree_list: bool = False, super_terminal_cells: bool = False, x_lazy: float = 0.99, alpha_teleport: float = 0.99, root_user=None, preserve_disconnected: bool = True, dataset: str = '', super_terminal_clusters: list = [], is_coarse=True, csr_full_graph: csr_matrix = '', csr_array_locally_pruned='', ig_full_graph='', full_neighbor_array='', full_distance_array='', embedding: ndarray = None, df_annot=None, preserve_disconnected_after_pruning: bool = False, secondary_annotations: list = None, pseudotime_threshold_TS: int = 30, cluster_graph_pruning: float = 0.15, visual_cluster_graph_pruning: float = 0.15, neighboring_terminal_states_threshold=3, num_mcmc_simulations=1300, piegraph_arrow_head_width=0.1, piegraph_edgeweight_scalingfactor=1.5, max_visual_outgoing_edges: int = 2, via_coarse=None, velocity_matrix=None, gene_matrix=None, velo_weight=0.5, edgebundle_pruning=None, A_velo=None, CSM=None, edgebundle_pruning_twice=False, pca_loadings=None, time_series=False, time_series_labels: list = None, knn_sequential: int = 10, knn_sequential_reverse: int = 0, t_diff_step: int = 1, single_cell_transition_matrix=None, embedding_type: str = 'via-mds', do_compute_embedding: bool = False, color_dict: {} = None, user_defined_terminal_cell: list = [], user_defined_terminal_group: list = [], do_gaussian_kernel_edgeweights: bool = False, RW2_mode: bool = False, working_dir_fp: str = '/home/', memory=5, viagraph_decay=0.9, p_memory=1, graph_init_pos: np.ndarray=None, spatial_coords:np.ndarray=None, do_spatial_knn:bool=False, do_spatial_layout:bool=False, spatial_knn:int = 15, spatial_aux:list = []): self.data = data self.nsamples, self.ncomp = data.shape if true_label is not None: self.true_label = true_label else: self.true_label = [1] * self.nsamples if velocity_matrix is None: velo_weight = 0 self.velo_weight = velo_weight # float between 0,1. the weight assigned to directionality and connectivity derived from scRNA-velocity if edgebundle_pruning is None: edgebundle_pruning = cluster_graph_pruning self.edgebundle_pruning = edgebundle_pruning if (root_user is None) & (velocity_matrix is None): root_user = [] dataset = '' self.root_user = root_user self.dataset = dataset self.knn_struct = None if isinstance(labels, list): labels = np.asarray(labels).flatten() self.labels = labels # np.asarray(pre_labels).flatten() where pre_labels is a list self.connected_comp_labels = None self.edgelist = None self.edgelist_unique = None # higher edgepruning_clustering_resolution_local means more edges are kept # higher edgepruning_clustering_resolution means more edges are kept if keep_all_local_dist == 'auto': # If large dataset skip local pruning to increase speed keep_all_local_dist = data.shape[0] > 50000 if resolution_parameter != 1: partition_type = "RBVP" # Reichardt and Bornholdt’s Potts model. Note that this is the same as ModularityVertexPartition when setting 𝛾 = 1 and normalising by 2m self.edgepruning_clustering_resolution_local = edgepruning_clustering_resolution_local self.edgepruning_clustering_resolution = edgepruning_clustering_resolution ##0.15 is also a recommended value performing empirically similar to 'median' self.keep_all_local_dist = keep_all_local_dist self.too_big_factor = too_big_factor ##if a cluster exceeds this share of the entire cell population, then the PARC will be run on the large cluster. at 0.4 it does not come into play self.resolution_parameter = resolution_parameter self.partition_type = partition_type self.small_pop = small_pop # smallest cluster population to be considered a community self.jac_weighted_edges = jac_weighted_edges self.knn = knn self.knn_sequential = knn_sequential # number of knn in the adjacent time-point for time-series data self.knn_sequential_reverse = knn_sequential_reverse # number of knn made between timepoint and previous time point self.n_iter_leiden = n_iter_leiden self.random_seed = random_seed # enable reproducible Leiden clustering self.num_threads = num_threads # number of threads used in KNN search/construction self.distance = distance # Euclidean distance 'l2' by default; other options 'ip' and 'cosine' self.time_smallpop = time_smallpop self.super_cluster_labels = super_cluster_labels self.super_node_degree_list = super_node_degree_list self.super_terminal_clusters = super_terminal_clusters self.full_neighbor_array = full_neighbor_array self.full_distance_array = full_distance_array self.ig_full_graph = ig_full_graph self.csr_array_locally_pruned = csr_array_locally_pruned self.csr_full_graph = csr_full_graph self.super_terminal_cells = super_terminal_cells self.x_lazy = x_lazy # 1-x = probability of staying in same node self.alpha_teleport = alpha_teleport # 1-alpha is probability of jumping self.preserve_disconnected = preserve_disconnected self.is_coarse = is_coarse # set to True for first round of VIA. if one chooses to run a second iteration of VIA that uses the terminal states from the first round, then set this to False for second iteration self.embedding = embedding self.df_annot = df_annot self.preserve_disconnected_after_pruning = preserve_disconnected_after_pruning # pruning can cause some fragmentation of small clusters, these should be reattached to the relevant component (so typically this is False as we dont want to preserve these small extra components resulting from pruning) self.secondary_annotations = secondary_annotations self.pseudotime_threshold_TS = pseudotime_threshold_TS self.cluster_graph_pruning = cluster_graph_pruning self.visual_cluster_graph_pruning = visual_cluster_graph_pruning # higher value means more edges retained. This is applied to the clustergraph before visulizing only if two rounds of edgbebundlng is done. self.neighboring_terminal_states_threshold = neighboring_terminal_states_threshold # number of neighbors of a terminal state has before it is eliminated as a TS self.num_mcmc_simulations = num_mcmc_simulations # number of mcmc simulations in second state of pseudotime computation self.piegraph_arrow_head_width = piegraph_arrow_head_width self.piegraph_edgeweight_scalingfactor = piegraph_edgeweight_scalingfactor self.max_visual_outgoing_edges = max_visual_outgoing_edges # higher value means more edges retained. This is applied to the clustergraph and is a strong threshold for number of edges shown if velocity_matrix is not None: velocity_matrix[np.isnan(velocity_matrix)] = 0 self.velocity_matrix = velocity_matrix # matrix from scVelo with velocities if gene_matrix is not None: gene_matrix[np.isnan(gene_matrix)] = 0 self.gene_matrix = gene_matrix # matrix (not numpy array) with gene expression #such as adata.X.todense() or adata.layers["Mu"] from scVelo (first moments of spliced gene counts self.A_velo = A_velo # the transition matrix weighted by the rna velocity [n_clus x n_clus] self.CSM = CSM # the cosine similarity matrix using velocity [n_clusters x n_clusters] self.edgebundle_pruning_twice = edgebundle_pruning_twice # default: False. When True, the edgebundling is applied to a further visually pruned (visual_cluster_graph_pruning) and can sometimes simplify the visualization. it does not impact the pseudotime and lineage computations self.pca_loadings = pca_loadings # the loadings of the pcs used to project the cells (to projected euclidean location based on velocity) self.time_series = time_series self.time_series_labels = time_series_labels self.t_diff_step = t_diff_step self.single_cell_transition_matrix = single_cell_transition_matrix self.embedding_type = embedding_type self.do_compute_embedding = do_compute_embedding self.color_dict = color_dict self.user_defined_terminal_cell = user_defined_terminal_cell self.user_defined_terminal_group = user_defined_terminal_group self.do_gaussian_kernel_edgeweights = do_gaussian_kernel_edgeweights self.RW2_mode = RW2_mode self.working_dir_fp = working_dir_fp self.memory = memory self.p_memory = p_memory self.viagraph_decay = viagraph_decay self.graph_init_pos = graph_init_pos self.spatial_coords = spatial_coords self.do_spatial_knn = do_spatial_knn self.do_spatial_layout = do_spatial_layout self.spatial_knn = spatial_knn self.spatial_aux = spatial_aux #list of labels that identify cells from the same tissue slice def _make_pt_augmented_adjacency_igraph(self, neighbors: ndarray, distances: ndarray, k_reverse: int = 10, k_seq: int = 0): ''' If you dont have time-series labels to adjust the construction of the single-cell graph, you can try using the pseudotime (single-cell level) to achieve the same sequencing effect. However, since the pseudo-times are derived from the original unguided structure, there may not neccessarily be a large visible impact on the graph structure :param neighbors: ndarray (n_cells x n_neighbors) original unguided knn graph structure :param distances: ndarray (n_cell x n_neighbors) :param k_reverse: (int) default = 10 :param k_seq: this is zero because terminal states that have lower pseudotimes should not be forced to look for neighbors in later pseudotimes :param knn: :return: ''' n_pt_augmented, d_pt_augmented = sequential_knn(self.data, [int(i * 10) for i in self.single_cell_pt_markov], neighbors, distances, k_seq=k_seq, k_reverse=k_reverse, num_threads=self.num_threads, distance=self.distance) adjacency_pt_augmented = self._make_csrmatrix_noselfloop(n_pt_augmented, d_pt_augmented, time_series=True, time_series_labels=[int(i * 10) for i in self.single_cell_pt_markov], t_diff_step=3) # this function has local pruning which removes neighbors that are more than t_dif apart. Since the same type of local pruning wrt t_dif is applied pre-clustergraph, we only need to call this function once in the case of time_series data ig_full_graph_pt_guided = _composite_jacAffinity_distanceAffinity(sc_csr_graph=adjacency_pt_augmented) return ig_full_graph_pt_guided, adjacency_pt_augmented def _get_terminal_clusters_user_defined_(self, user_defined_terminal_cell: list = [], user_defined_terminal_group: list = []): ''' Allow the user to optionally select group or cell index level terminal fates that override the automated cell fate detection :param user_defined_terminal_cell: list of cell indices corresponding to terminal fate cells :param user_defined_terminal_cell_group: list of group level labels corresponding to labels found in true_label, that represent cell fates :return: list of clusters to represent the cell fate clusters in the via-clustergraph ''' dict_user_defined_terminal_clusters = {} terminal_cluster_list = [] if len(user_defined_terminal_cell) > 0: for i in user_defined_terminal_cell: clus_ = self.labels[i] if clus_ not in terminal_cluster_list: terminal_cluster_list.append(clus_) else: print(f'{i} is a repeated terminal cluster') terminal_cluster_list.append(clus_) dict_user_defined_terminal_clusters[user_defined_terminal_cell] = clus_ else: for user_terminal_group in user_defined_terminal_group: # location in indices of which cells belong to this user_terminal_group index_terminal_group = np.where(np.asarray(self.true_label) == user_terminal_group)[0] potential_clusters = self.labels[index_terminal_group] clus_ = func_mode(potential_clusters.tolist()) if clus_ not in terminal_cluster_list: terminal_cluster_list.append(clus_) dict_user_defined_terminal_clusters[user_terminal_group] = clus_ print(f'{datetime.now()}\tTerminal cluster list based on user defined cells/groups:', [(key, dict_user_defined_terminal_clusters[key]) for key in dict_user_defined_terminal_clusters]) return terminal_cluster_list def _get_terminal_clusters(self, A, markov_pt, root_ai): n_ = A.shape[0] # number of states in the graph component if n_ <= 10: n_outlier_std = 3 if (n_ <= 40) & (n_ > 10): n_outlier_std = 2 if n_ >= 40: n_outlier_std = 2 # 1 pop_list = [] # print('get terminal', set(self.labels), np.where(self.labels == 0)) for i in list(set(self.labels)): pop_list.append(len(np.where(self.labels == i)[0])) # we weight the out-degree based on the population of clusters to avoid allowing small clusters to become the terminals based on population alone A_new = A.copy() for i in range(A.shape[0]): for j in range(A.shape[0]): A_new[i, j] = A[i, j] * (pop_list[i] + pop_list[j]) / (pop_list[i] * pop_list[j]) # make an igraph graph to compute the closeness g_dis = ig.Graph.Adjacency( (A_new > 0).tolist()) # need to manually add the weights as igraph treates A>0 as boolean g_dis.es['weights'] = 1 / A_new[A_new.nonzero()] # we want "distances" not weights for closeness and betweeness betweenness_score = g_dis.betweenness(weights='weights') betweenness_score_array = np.asarray(betweenness_score) betweenness_score_takeout_outlier = betweenness_score_array[betweenness_score_array < ( np.mean(betweenness_score_array) + n_outlier_std * np.std(betweenness_score_array))] betweenness_list = [i for i, score in enumerate(betweenness_score) if score < ( np.mean(betweenness_score_takeout_outlier) - 0 * np.std(betweenness_score_takeout_outlier))] closeness_score = g_dis.closeness(mode='ALL', cutoff=None, weights='weights', normalized=True) closeness_score_array = np.asarray(closeness_score) closeness_score_takeout_outlier = closeness_score_array[ closeness_score_array < (np.mean(closeness_score_array) + n_outlier_std * np.std(closeness_score_array))] closeness_list = [i for i, score in enumerate(closeness_score) if score < (np.mean(closeness_score_takeout_outlier) - 0 * np.std( closeness_score_takeout_outlier))] out_deg = A_new.sum(axis=1) out_deg = np.asarray(out_deg) outdegree_score_takeout_outlier = out_deg[out_deg < (np.mean(out_deg) + n_outlier_std * np.std(out_deg))] loc_deg = [i for i, score in enumerate(out_deg) if score < (np.mean(outdegree_score_takeout_outlier) - 0 * np.std(outdegree_score_takeout_outlier))] print(f"{datetime.now()}\tIdentifying terminal clusters corresponding to unique lineages...") print(f"{datetime.now()}\tCloseness:{closeness_list}") print(f"{datetime.now()}\tBetweenness:{betweenness_list}") print(f"{datetime.now()}\tOut Degree:{loc_deg}") markov_pt = np.asarray(markov_pt) pct = 10 if n_ <= 40 else 30 loc_pt = np.where(markov_pt >= np.percentile(markov_pt, pct))[0] terminal_clusters_1 = list(set(closeness_list) & set(betweenness_list)) terminal_clusters_2 = list(set(closeness_list) & set(loc_deg)) terminal_clusters_3 = list(set(betweenness_list) & set(loc_deg)) terminal_clusters = list(set(terminal_clusters_1) | set(terminal_clusters_2)) terminal_clusters = list(set(terminal_clusters) | set(terminal_clusters_3)) terminal_clusters = list(set(terminal_clusters) & set(loc_pt)) terminal_org = terminal_clusters.copy() for terminal_i in terminal_org: if terminal_i in terminal_clusters: removed_terminal_i = False else: removed_terminal_i = True # print('terminal state', terminal_i) count_nn = 0 ts_neigh = [] neigh_terminal = np.where(A[:, terminal_i] > 0)[0] if neigh_terminal.size > 0: for item in neigh_terminal: if item in terminal_clusters: ts_neigh.append(item) count_nn = count_nn + 1 if n_ >= 10: if item == root_ai: # if the terminal state is a neighbor of if terminal_i in terminal_clusters: terminal_clusters.remove(terminal_i) print( f"{datetime.now()}\tWe removed cluster {terminal_i} from the shortlist of terminal states") removed_terminal_i = True if count_nn >= self.neighboring_terminal_states_threshold: # 2 if removed_terminal_i == False: temp_remove = terminal_i temp_time = markov_pt[terminal_i] for to_remove_i in ts_neigh: if markov_pt[to_remove_i] < temp_time: temp_remove = to_remove_i temp_time = markov_pt[to_remove_i] terminal_clusters.remove(temp_remove) print( f"{datetime.now()}\tCluster {terminal_i} had {self.neighboring_terminal_states_threshold} or more neighboring terminal states {ts_neigh} and so we removed cluster {temp_remove}") if len(terminal_clusters) == 0: terminal_clusters = loc_deg # print('terminal_clusters', terminal_clusters) return terminal_clusters def compute_hitting_time(self, sparse_graph, root, x_lazy, alpha_teleport, number_eig=0): # 1- alpha is the probability of teleporting # 1- x_lazy is the probability of staying in current state (be lazy) # Computing lazy-teleporting expected hitting time beta_teleport = 2 * (1 - alpha_teleport) / (2 - alpha_teleport) N = sparse_graph.shape[0] Lsym = csgraph.laplacian(sparse_graph, normed=True) eigval, eigvec = scipy.sparse.linalg.eigsh(Lsym, number_eig or Lsym.shape[0] - 1) Greens, beta_norm_lap = np.zeros((N, N), float), np.zeros((N, N), float) Xu = np.zeros((N, N)) Xu[:, root] = 1 Xv_Xu = np.eye(N) - Xu for i in range(1 if alpha_teleport == 1 else 0, len(eigval)): vv = np.outer(eigvec[:, i], eigvec[:, i]) factor = beta_teleport + 2 * eigval[i] * x_lazy * (1 - beta_teleport) Greens += vv / factor beta_norm_lap += vv * factor D = np.diag(np.nan_to_num(np.array(sparse_graph.sum(axis=1)).reshape(-1) ** -.5, posinf=0)) t = D @ Greens @ D * beta_teleport hitting_matrix = np.diagonal(t)[np.newaxis, :] - t # Calculate only diagonal elements of Xv_Xu @ t return np.abs((Xv_Xu * t.T).sum(-1)), (hitting_matrix + hitting_matrix.T)[root] def simulate_branch_probability(self, terminal_state, A, root, num_sim=500): n_states = A.shape[0] ncpu = multiprocessing.cpu_count() if (ncpu == 1) | (ncpu == 2): n_jobs = 1 elif ncpu > 2: n_jobs = min(ncpu - 1, 5) # print('njobs', n_jobs) num_sim_pp = int(num_sim / n_jobs) # num of simulations per process jobs = [] manager = multiprocessing.Manager() q = manager.list() seed_list_ = list(range(n_jobs)) seed_list = [i + self.random_seed for i in seed_list_] for i in range(n_jobs): cumstateChangeHist = np.zeros((1, n_states)) cumstateChangeHist_all = np.zeros((1, n_states)) process = multiprocessing.Process(target=_prob_reaching_terminal_state, args=( terminal_state, A, root, num_sim_pp, q, cumstateChangeHist, cumstateChangeHist_all, seed_list[i])) jobs.append(process) for j in jobs: j.start() for j in jobs: j.join() cumhistory_vec = q[0][0] cumhistory_vec_all = q[0][1] count_reached = cumhistory_vec_all[0, terminal_state] for i in range(1, len(q)): cumhistory_vec = cumhistory_vec + q[i][0] cumhistory_vec_all = cumhistory_vec_all + q[i][1] count_reached = count_reached + q[i][1][0, terminal_state] print( f"{datetime.now()}\tFrom root {root}, the Terminal state {terminal_state} is reached {int(count_reached)} times.") # print('cumhistory_vec_all', cumhistory_vec_all) cumhistory_vec_all[ cumhistory_vec_all == 0] = 1 # to avoid division by zero we say that the state has been visited once # print('cumhistory_vec_all', cumhistory_vec_all) # print('cumhistory_vec', cumhistory_vec) prob_ = cumhistory_vec / cumhistory_vec_all # given a cluster is traversed (cumhistory_vec_all), how many of these traversals led to a successful path to Terminal State X # i.e. with what probability does traversal of a cluster lead to the desired terminal state (number of times traversed along a sucessful path) #cumhistory_vec: number of times cluster is traversed on a successful path np.set_printoptions(precision=3) if count_reached == 0: prob_[:, terminal_state] = 0 print('never reached state', terminal_state) else: loc_1 = np.where(prob_ == 1) loc_1 = loc_1[1] prob_[0, loc_1] = 0 # print('zerod out prob', prob_) temp_ = np.max(prob_) if temp_ == 0: temp_ = 1 prob_ = prob_ / min(1, 1.1 * temp_) # do some scaling to amplify the probabiltiies of those clusters that are not 1 to make the range of values more compact prob_[0, loc_1] = 1 # put back probability =1 return list(prob_)[0] def _simulate_markov(self, A, root): ''' Computes the MCMC based hitting times on the forward biased graph :param A: cluster graph adjacency matrix (forward biased) :param root: :return: mcmc hitting times ''' n_states = A.shape[0] P = A / A.sum(axis=1).reshape((n_states, 1)) # print('row normed P',P.shape, P, P.sum(axis=1)) x_lazy = self.x_lazy # 1-x is prob lazy alpha_teleport = self.alpha_teleport # bias_P is the transition probability matrix P = x_lazy * P + (1 - x_lazy) * np.identity(n_states) # print(P, P.sum(axis=1)) P = alpha_teleport * P + ((1 - alpha_teleport) * (1 / n_states) * (np.ones((n_states, n_states)))) # print('check prob of each row sum to one', P.sum(axis=1)) currentState = root state = np.zeros((1, n_states)) state[0, currentState] = 1 num_sim = self.num_mcmc_simulations ncpu = multiprocessing.cpu_count() if (ncpu == 1) | (ncpu == 2): n_jobs = 1 elif ncpu > 2: n_jobs = ncpu - 2 # min(ncpu - 2, 5) # print('njobs', n_jobs) num_sim_pp = int(num_sim / n_jobs) # num of simulations per process n_steps = int(2 * n_states) jobs = [] seed = self.random_seed dummy_list = [] for dummy in range(n_jobs): hitting_array = np.ones((P.shape[0], 1)) * 1000 dummy_list.append(hitting_array) with multiprocessing.Manager() as manager: q = manager.list() for i in range(2): # range(n_jobs): hitting_array = np.ones((P.shape[0], 1)) * 1000 process = multiprocessing.Process(target=_simulate_markov_sub, args=(P, num_sim_pp, hitting_array, q, root, seed)) jobs.append(process) process.start() for proc in jobs: proc.join() # for j in jobs: # j.start() # for j in jobs: # j.join() print(f"{datetime.now()}\tEnded all multiprocesses, will retrieve and reshape") hitting_array = q[0] for qi in q[1:]: hitting_array = np.append(hitting_array, qi, axis=1) # .get(), axis=1) # print('finished getting from queue', hitting_array.shape) hitting_array_final = np.zeros((1, n_states)) no_times_state_reached_array = np.zeros((1, n_states)) for i in range(n_states): rowtemp = hitting_array[i, :] no_times_state_reached_array[0, i] = np.sum(rowtemp != (n_steps + 1)) for i in range(n_states): rowtemp = hitting_array[i, :] no_times_state_reached = np.sum(rowtemp != (n_steps + 1)) if no_times_state_reached != 0: perc = np.percentile(rowtemp[rowtemp != n_steps + 1], 15) + 0.001 # 15 for Human and Toy # print('state ', i,' has perc' ,perc) hitting_array_final[0, i] = np.mean(rowtemp[rowtemp <= perc]) else: hitting_array_final[0, i] = (n_steps + 1) return hitting_array_final[0] def _compute_hitting_time_onbias(self, laplacian, inv_sqr_deg, root, x_lazy, alpha_teleport, number_eig=0): # 1- alpha is the probabilty of teleporting # 1- x_lazy is the probability of staying in current state (be lazy) beta_teleport = 2 * (1 - alpha_teleport) / (2 - alpha_teleport) N = laplacian.shape[0] print('is laplacian of biased symmetric', (laplacian.transpose() == laplacian).all()) Id = np.zeros((N, N), float) np.fill_diagonal(Id, 1) # norm_lap = scipy.sparse.csr_matrix.todense(laplacian) eig_val, eig_vec = np.linalg.eig( laplacian) # eig_vec[:,i] is eigenvector for eigenvalue eig_val[i] not eigh as this is only for symmetric. the eig vecs are not in decsending order print('eig val', eig_val.shape) if number_eig == 0: number_eig = eig_vec.shape[1] print('number of eig vec', number_eig) Greens_matrix = np.zeros((N, N), float) beta_norm_lap = np.zeros((N, N), float) Xu = np.zeros((N, N)) Xu[:, root] = 1 Id_Xv = np.zeros((N, N), int) np.fill_diagonal(Id_Xv, 1) Xv_Xu = Id_Xv - Xu start_ = 0 if alpha_teleport == 1: start_ = 1 # if there are no jumps (alph_teleport ==1), then the first term in beta-normalized Green's function will have 0 in denominator (first eigenvalue==0) for i in range(start_, number_eig): # 0 instead of 1st eg # print(i, 'th eigenvalue is', eig_val[i]) vec_i = eig_vec[:, i] factor = beta_teleport + 2 * eig_val[i] * x_lazy * (1 - beta_teleport) vec_i = np.reshape(vec_i, (-1, 1)) eigen_vec_mult = vec_i.dot(vec_i.T) Greens_matrix = Greens_matrix + ( eigen_vec_mult / factor) # Greens function is the inverse of the beta-normalized laplacian beta_norm_lap = beta_norm_lap + (eigen_vec_mult * factor) # beta-normalized laplacian temp = Greens_matrix.dot(inv_sqr_deg) temp = inv_sqr_deg.dot(temp) * beta_teleport hitting_matrix = np.zeros((N, N), float) diag_row = np.diagonal(temp) for i in range(N): hitting_matrix[i, :] = diag_row - temp[i, :] roundtrip_commute_matrix = hitting_matrix + hitting_matrix.T temp = Xv_Xu.dot(temp) final_hitting_times = np.diagonal( temp) ## number_eig x 1 vector of hitting times from root (u) to number_eig of other nodes roundtrip_times = roundtrip_commute_matrix[root, :] return abs(final_hitting_times), roundtrip_times def _project_branch_probability_sc(self, bp_array_clus, pt): # print('sum of branch probabilities at cluster level', np.sum(bp_array_clus, axis=1)) n_clus = len(list(set(self.labels))) n_cells = self.data.shape[0] knn_sc = 3 if self.data.shape[0] > 1000 else 10 neighbors, _ = self.knn_struct.knn_query(self.data, k=knn_sc) rows, cols, weights = [], [], [] for i, row in enumerate(neighbors): neighboring_clus = self.labels[row] for c in set(list(neighboring_clus)): rows.append(i) cols.append(c) weights.append(np.sum(neighboring_clus == c) / knn_sc) weights = csr_matrix((weights, (rows, cols)), shape=(n_cells, n_clus)) bp_array_sc = weights.dot(bp_array_clus) bp_array_sc /= np.max(bp_array_sc, axis=0) # divide cell probability by max value in that column so that rare lineages dont have distortedly low lineage probabilities for i, label_ts in enumerate(self.terminal_clusters): # list(set(self.terminal_clusters)) loc_i = np.where(self.labels == label_ts)[0] loc_noti = np.where(self.labels != label_ts)[0] if np.max(bp_array_sc[loc_noti, i]) > 0.8: bp_array_sc[loc_i, i] = 1.2 pt = np.asarray(pt) pt = np.reshape(pt, (n_clus, 1)) pt_sc = weights.dot(pt) pt_sc /= np.amax(pt_sc) return bp_array_sc, pt_sc.flatten()
[docs] def sc_transition_matrix(self, smooth_transition, b=10, use_sequentially_augmented=False): ''' #computes the single cell level transition directions that are later used to calculate velocity of embedding #based on changes at single cell level in genes and single cell level velocity :param smooth_transition: :param b: slope of logistic function :return: ''' # n_clus = len(list(set(self.labels))) # n_cells = self.data.shape[0] pt = self.single_cell_pt_markov * 2 / max( self.single_cell_pt_markov) # some scaling so that the input to the logistic function covers a range of sensible inputs labels = self.labels if use_sequentially_augmented: T = self.csr_array_locally_pruned_augmented # single cell edges are weighted as inverse of distance else: T = self.csr_array_locally_pruned # T = self.csr_full_graph #single cell thr_global = np.mean(T.data) - 2 * np.std(T.data) # 2* T.data[T.data < thr_global] = 0 T.setdiag(0) T.eliminate_zeros() size_T = T.size T.data = T.data.clip(np.percentile(T.data, 10), np.percentile(T.data, 90)) find_T = find(T) bias_weight, K, c, C, nu = [], 1, 0, 1, 1 bias_weight_pt, bias_weight_velo = [], [] # print('transition before biasing') # print(T) if self.CSM is not None: # if rna velocity is available, then combine the single-cell directions inferred by sc-pt and scRNA velocity to get the sc directions for i in range(size_T): start = find_T[0][i] end = find_T[1][i] weight = find_T[2][i] t_dif = pt[find_T[1][i]] - pt[find_T[0][i]] delta_gene = np.array(self.gene_matrix[end, :] - self.gene_matrix[start, :]) # change in gene expression when going from start cell to end cells # print('shape delta_gene', delta_gene.shape, delta_gene) # csm_ = 1- distance.cosine(delta_gene[0,:], self.velocity_matrix[start, :]) # print(csm_) csm_ = 0 csm_ = cosine_sim(delta_gene, self.velocity_matrix[start, :])[0] * 10 # based on sc level CSM # csm_ = self.CSM[labels[find_T[0][i]],labels[find_T[1][i]]] #based on cluster level CSM # if (csm_<0)&(t_dif<0): csm_ *=10 # if (csm_ >0) & (t_dif > 0): csm_ *= 10 # if i%100000==0: print(i,'out of', size_T, 'start', labels[start], 'end', labels[end], 'csm_', round(csm_,3), 't_diff', round(t_dif,3)) # print('shape gene_matrix', self.gene_matrix.shape) # print('start-end, csm', start, end, csm_) # bias_weight_pt.append(t_dif) # bias_weight_velo.append(csm_) bias_weight_pt.append(1 * K / ((C + math.exp(b * (-t_dif + c))) ** nu)) # bias_weight_velo.append(csm_) bias_weight_velo.append(1 * K / ((C + math.exp(b * (- csm_ + c))) ** nu)) # scale both the pt and velocity weights bias_weight_pt = 2.0 * np.asarray(bias_weight_pt) / np.mean( np.array(bias_weight_pt)) # need to normalize the pt and velo weights # print('list shape',bias_weight_pt.shape) # print('argwhere is nan') # argwhere=np.argwhere(np.isnan(bias_weight_velo)) # print(argwhere.shape) # print('bias weight velo is anywhere nan', np.isnan(bias_weight_velo).any(), np.mean(np.array(bias_weight_velo))) bias_weight_velo = np.nan_to_num(bias_weight_velo) bias_weight_velo = 2.0 * np.asarray(bias_weight_velo) / np.mean(np.array(bias_weight_velo)) # USE THIS # bias_weight_velo = np.asarray(bias_weight_velo)# # bias_weight_velo = np.clip(bias_weight_velo, 0, 1) #this approach zeros out negative edges # print('pt weight ', bias_weight_pt) bias_weight_velo = np.multiply(np.asarray(weight), bias_weight_velo) ## consider changing this weight to the weight based on projected "i" -to- current "neighbor" bias_weight_pt = np.multiply(np.asarray(weight), bias_weight_pt) # print('velo weight * weigt', bias_weight_velo) # print('pt weight *weight', bias_weight_pt) bias_weight = (1 - self.velo_weight) * bias_weight_pt + self.velo_weight * bias_weight_velo # print('bias_weight size', bias_weight.shape) T = csr_matrix((bias_weight / np.mean(np.array(bias_weight)), (np.array(find_T[0]), np.array(find_T[1]))), shape=T.shape) # print('sc transition weights T',np.isnan(T.data).any()) T.data = np.nan_to_num(T.data) # print('sc transition weights T', np.isnan(T.data).any()) else: # case where you dont have rna velocity, then the single-cell directions are only inferred using the single-cell pt b = 10 for i in range(size_T): # start = find_T[0][i] # end = find_T[1][i] weight = find_T[2][i] t_dif = pt[find_T[1][i]] - pt[find_T[0][i]] bias_weight.append(weight * K / ((C + math.exp(b * (-t_dif + c))) ** nu)) # b * (-t_dif T = csr_matrix((bias_weight / np.mean(np.array(bias_weight)), (np.array(find_T[0]), np.array(find_T[1]))), shape=T.shape) T.setdiag(0) T.eliminate_zeros() T = np.expm1(T) T = normalize(T, norm='l1', axis=1) ** smooth_transition T = T.multiply(csr_matrix(1.0 / np.abs(T).sum(1))) # rows sum to one return T
def _velocity_embedding(self, X_emb, smooth_transition, b, use_sequentially_augmented=False): ''' :param X_emb: :param smooth_transition: :return: V_emb ''' # T transition matrix at single cell level n_obs = X_emb.shape[0] V_emb = np.zeros(X_emb.shape) if self.single_cell_transition_matrix is None: self.single_cell_transition_matrix = self.sc_transition_matrix(smooth_transition, b, use_sequentially_augmented=use_sequentially_augmented) T = self.single_cell_transition_matrix else: T = self.single_cell_transition_matrix # the change in embedding distance when moving from cell i to its neighbors is given by dx for i in range(n_obs): indices = T[i].indices dX = X_emb[indices] - X_emb[i, None] # shape (n_neighbors, 2) dX /= l2_norm(dX)[:, None] # dX /= np.sqrt(dX.multiply(dX).sum(axis=1).A1)[:, None] dX[np.isnan(dX)] = 0 # zero diff in a steady-state # neighbor edge weights are used to weight the overall dX or velocity from cell i. probs = T[i].data # if probs.size ==0: print('velocity embedding probs=0 length', probs, i, self.true_label[i]) V_emb[i] = probs.dot(dX) - probs.mean() * dX.sum(0) V_emb /= 3 * quiver_autoscale(X_emb, V_emb) return V_emb def _make_csrmatrix_noselfloop(self, neighbors: np.ndarray, distances: np.ndarray, auto_: bool = True, distance_factor=.01, weights_as_inv_dist=True, min_max_scale=False, time_series=False, time_series_labels=None, t_diff_step=2) -> csr_matrix: """ Create sparse matrix from weighted knn graph. Default does local pruning and returns an affinity graph Parameters ---------- neighbors: np.ndarray of shape (n_samples, n_neighbors) Indicating neighbors of each sample. neighbors[i,j] means that sample j is a neighbor of sample i distances: np.ndarray of shape (n_samples, n_neighbors) Distances between neighboring samples corresponding `neighbors` auto_: bool, default=True If `False` and `self.keep_all_local_dist = False` perform local pruning (according to self.edgepruning_clustering_resolution_local) and remove self-loops distance_factor: float, default=0.01 Factor used in calculation of edge weights. mean(sqrt(distances))^2 / (sqrt(distances) + distance_factor) weights_as_inv_dist: bool, default = True Whether to convert the distances into weights that are proportional to inverse of the distance min_max_scale: bool, default = False This is called with True for constructing the sc-knn used for vertexclustergraph. we use the inverse of the distance but scaled from 0.5-2 such that we can combine the distance derived weights with the Jaccard similarity (0-1). The jaccard similiarity in igraph does not consider weights, so we re-introduce the impact of the distances which then play a role in the transition matrix for trajectories as well as the visualization of edge-weights In the case of the sc-KNN used for clustering, we do not re-introduce the distance based weights but leave it at the jaccard similarity as this empirically seems to work well time_series: bool, default = False. Whether or not this is a time course data set. will require time_series_labels in order to be actioned on time_series_labels: list, default = None. a list of numerical values reflecting the sampling time of each cell t_diff_step=int, default =2, number of sequential time steps permissible between edges. (e.g. edges allowed to be 2 or fewer time steps apart) Returns ------- sparse matrix representing the locally pruned weighted knn affinity graph. edge weight represents level of similarity. higher values mean stronger edge """ distances = np.sqrt(distances.astype(np.float32)) # distances =distances.astype(np.float32) # print(neighbors[0,:]) # print(distances[0,:]) neigh01 = neighbors[0, 1] # print('norm2', np.linalg.norm(self.data[0,:]-self.data[neigh01,:])) if time_series: msk = np.full_like(distances, True, dtype=np.bool_) # print('all edges', np.sum(msk)) # Remove self-loops msk &= (neighbors != np.arange(neighbors.shape[0])[:, np.newaxis]) # print('non-self edges', np.sum(msk)) ''' #doing local pruning and then adding back the augmented edges does not work so well because when the edge weights are scaled and inverted, # the sequentially added edges appear very weak and noisy compared to the fairly strong edges that remain after the local pruning from the inital round of knngraph. If you retain all edges from initial graph construction, # then the average weight of edges is exaggeratedly higher than those edge weights from the sequentially added edges, and creates a better gradient of edge weights # Local pruning based on neighbor being too far. msk where we want to keep neighbors msk = distances <= (np.mean(distances, axis=1) + self.edgepruning_clustering_resolution_local * np.std(distances, axis=1))[:, np.newaxis] # Remove self-loops msk &= (neighbors != np.arange(neighbors.shape[0])[:, np.newaxis]) last_n_columns = self.knn_sequential+1 msk[:,-last_n_columns:] = True # add back the edges belonging to knn-sequentially built part of the graph ''' # remove edges between nodes that >t_diff_step far apart in time_series_labels if t_diff_step is not None: time_series_set_order = list(sorted(list(set(time_series_labels)))) t_diff_mean = np.mean( np.array([abs(y - x) for x, y in zip(time_series_set_order[:-1], time_series_set_order[1:])])) print( f"{datetime.now()}\tActual average allowable time difference between nodes is {round(t_diff_mean * t_diff_step, 4)}") time_series_labels = np.asarray(time_series_labels) # print(colored(f"inside time_series msk")) rr = 0 count = 0 for row in neighbors: # if rr%20000==0: print(row, type(row), row[0]) rr += 1 t_row = time_series_labels[row[0]] # first neighbor is itself if t_diff_step is not None: for e_i, n_i in enumerate(row): if abs(time_series_labels[n_i] - t_row) > t_diff_mean * t_diff_step: count = count + 1 if np.sum(msk[row[0]]) > 4: msk[ row[0], e_i] = False # we want to ensure that each cell has at least 5 nn # print('links within tdiff',np.sum(msk)) elif auto_ and not self.keep_all_local_dist: # print('elif condition') # Local pruning based on neighbor being too far. msk where we want to keep neighbors msk = distances <= (np.mean(distances, axis=1) + self.edgepruning_clustering_resolution_local * np.std(distances, axis=1))[:, np.newaxis] # Remove self-loops msk &= (neighbors != np.arange(neighbors.shape[0])[:, np.newaxis]) else: msk = np.full_like(distances, True, dtype=np.bool_) # print('msk shape', msk.shape) # Remove self-loops msk &= (neighbors != np.arange(neighbors.shape[0])[:, np.newaxis]) # print('msk shape', msk.shape) # Inverting the distances outputs values in range [0-1]. This also causes many ``good'' neighbors ending up # having a weight near zero (misleading as non-neighbors have a weight of zero). Therefore we scale by the # mean distance. The inversed weights without min_max_scaling are used for the community detection graph if self.distance == 'cosine': rows = np.array([np.repeat(i, len(x)) for i, x in enumerate(neighbors)])[msk] cols = neighbors[msk] weights = distances[msk] weights.fill(1) # CHECK THIS # weights = logistic_function(weights) elif (weights_as_inv_dist == True) & (min_max_scale == False): # weights = (np.mean(distances[msk]) ** 2) / (distances[msk] + distance_factor) #larger weight is a stronger edge # weights = 1 / (distances[msk] + distance_factor) # do_exp_weight= True if self.do_gaussian_kernel_edgeweights == True: print('do_exp edge weights true in core_working') stdd = np.std(distances, axis=1) import numpy.ma as ma import umap.umap_ as u mx = ma.masked_array(distances, mask=distances == 0) minn = mx.min(axis=1) # np.min(distances,axis=1) min of nonzero entries per row new_distances = np.exp(-(distances - minn[:, None]) / stdd[:, None]) weights = new_distances[msk] sigmas, rhos = u.smooth_knn_dist( distances, k=self.knn, local_connectivity=float(1.0)) rows, cols, weights, dist_weights = u.compute_membership_strengths( neighbors, distances, sigmas, rhos) # print('local connectivity', float(distances.shape[1])) # print('k', self.knn) # print('sigmas', len(sigmas), sigmas) # print('rhos', len(rhos), rhos) result = csr_matrix((weights, (rows, cols)), shape=(len(neighbors), len(neighbors)), dtype=np.float32) result.eliminate_zeros() # print('applying symmetry in umap_') transpose = result.transpose() # print('size of symmetry only', (transpose + result).size) prod_matrix = result.multiply(transpose) result = (result + transpose - prod_matrix) # print('after fuzzy symm result.data.size, max, mean', result.data.size, max(result.data), np.mean(result.data)) return result else: weights = (np.mean(distances[msk]) ** 2) / ( distances[msk] +np.min(distances[msk]))#distance_factor) # larger weight is a stronger edge #Dec12 replacing distance_factor with np.min(distances) #print('np.min(distances) or distance_factor', np.min(distances[msk]), distance_factor) # weights = np.exp(-distances[msk]/stdd[:,None]) rows = np.array([np.repeat(i, len(x)) for i, x in enumerate(neighbors)])[msk] cols = neighbors[msk] elif min_max_scale == True: # the jaccard similarity does not consider edge weights, we will use these min-max scaled weights to make a composite weight together with Jaccard to retain the distance based info weights = distances[msk] # larger weight is a stronger edge weights = np.clip(weights, a_min=np.percentile(weights, 10), a_max=None) # the first neighbor is usually itself and hence has distance 0 # print('min clip',np.percentile(weights,10)) # print('clipped weights for current pc-dist', weights) weights = 0.5 + (weights - np.min(weights)) * (2 - 0.5) / (np.max(weights) - np.min(weights)) weights = 1 / weights # all weights between 0.5 and 2. these will later be multiplied by the Jac similarity which are between 0-1 to create a composite weight rows = np.array([np.repeat(i, len(x)) for i, x in enumerate(neighbors)])[msk] cols = neighbors[msk] # result = scipy.sparse.coo_matrix( (weights, (rows, cols)), shape=(len(neighbors), len(neighbors)) ) result = csr_matrix((weights, (rows, cols)), shape=(len(neighbors), len(neighbors)), dtype=np.float32) # result.eliminate_zeros() # transpose = result.transpose() # prod_matrix = result.multiply(transpose) # result = result + transpose# - prod_matrix return result def func_mode(self, ll): # return MODE of list # If multiple items are maximal, the function returns the first one encountered. return max(set(ll), key=ll.count) def make_JSON(self, folderpath='/home/user/JavaCode/basicgraph/', filename='VIA_JSON.js'): import networkx as nx from networkx.readwrite import json_graph from collections import defaultdict edgelist = self.edgelist_maxout weightlist = self.edgeweights_maxout min_w = min(weightlist) max_w = max(weightlist) weightlist = [(10 * (i - min_w) / (max_w - min_w)) + 1 for i in weightlist] n_groups = len(set(self.labels)) group_pop = np.zeros([n_groups, 1]) cluster_population_dict = {} for group_i in set(self.labels): loc_i = np.where(self.labels == group_i)[0] group_pop[group_i] = len(loc_i) # np.sum(loc_i) / 1000 + 1 cluster_population_dict[group_i] = len(loc_i) pop_max = cluster_population_dict[max(cluster_population_dict, key=cluster_population_dict.get)] pop_min = cluster_population_dict[min(cluster_population_dict, key=cluster_population_dict.get)] print(cluster_population_dict, pop_min, pop_max) pt_max = max(self.scaled_hitting_times) pt_min = min(self.scaled_hitting_times) scaled_hitting_times = [10 * (i - pt_min) / (pt_max - pt_min) for i in self.scaled_hitting_times] node_majority_truth_labels = [] for ci, cluster_i in enumerate(sorted(list(set(self.labels)))): cluster_i_loc = np.where(np.asarray(self.labels) == cluster_i)[0] majority_truth = str(self.func_mode(list(np.asarray(self.true_label)[cluster_i_loc]))) node_majority_truth_labels.append(majority_truth) majority_truth_labels_dict = dict(enumerate(node_majority_truth_labels)) temp = defaultdict(lambda: len(temp)) df_edges = pd.DataFrame(edgelist, columns=['source', 'target']) df_edges['weight'] = weightlist df_edges['distance'] = [150 / i for i in weightlist] G = nx.DiGraph() # directed graph for key in majority_truth_labels_dict: print('node', key, majority_truth_labels_dict[key], round(scaled_hitting_times[key], 1), cluster_population_dict[key], temp[majority_truth_labels_dict[key]]) # val denotes size in d3 by default G.add_node(key, group=majority_truth_labels_dict[key], pseudotime=scaled_hitting_times[key], val=(10 * (cluster_population_dict[key] - pop_min) / (pop_max - pop_min)) + 1, group_num=temp[majority_truth_labels_dict[key]]) for enum_i, i in enumerate(edgelist): print('edge', i, weightlist[enum_i], cluster_population_dict[i[0]], 150 / weightlist[enum_i]) if (scaled_hitting_times[i[0]] < scaled_hitting_times[i[1]]): source_node = i[0] target_node = i[1] else: source_node = i[1] target_node = i[0] # val edge controls number of emitted particles G.add_edge(u_of_edge=source_node, v_of_edge=target_node, weight=weightlist[enum_i], val=(5 * (cluster_population_dict[source_node] - pop_min) / (pop_max - pop_min)) + 1, distance=150 / weightlist[enum_i]) # Visualize the network: nx.draw_networkx(G) plt.show() import json j = json_graph.node_link_data(G) js = json.dumps(['var gData=', j], ensure_ascii=False, indent=2) with open(folderpath + filename, "w") as file: file.write(js) return def run_toobig_subPARC(self, X_data, jac_std_toobig=1, jac_weighted_edges=True): n_elements = X_data.shape[0] hnsw = _construct_knn(X_data, knn=self.knn, distance=self.distance, num_threads=self.num_threads, too_big=True) if self.knn >= 0.8 * n_elements: k = int(0.5 * n_elements) else: k = self.knn neighbor_array, distance_array = hnsw.knn_query(X_data, k=k) csr_array = self._make_csrmatrix_noselfloop(neighbor_array, distance_array) sources, targets = csr_array.nonzero() mask = np.zeros(len(sources), dtype=bool) mask |= (csr_array.data > ( np.mean(csr_array.data) + np.std(csr_array.data) * 5)) # smaller distance means stronger edge csr_array.data[mask] = 0 csr_array.eliminate_zeros() sources, targets = csr_array.nonzero() edgelist = list(zip(sources.tolist(), targets.tolist())) edgelist_copy = edgelist.copy() G = ig.Graph(edgelist, edge_attrs={'weight': csr_array.data.tolist()}) sim_list = G.similarity_jaccard(pairs=edgelist_copy) # list of jaccard weights new_edgelist = [] sim_list_array = np.asarray(sim_list) if jac_std_toobig == 'median': threshold = np.median(sim_list) else: threshold = np.mean(sim_list) - jac_std_toobig * np.std(sim_list) strong_locs = np.where(sim_list_array > threshold)[0] for ii in strong_locs: new_edgelist.append(edgelist_copy[ii]) sim_list_new = list(sim_list_array[strong_locs]) if jac_weighted_edges == True: G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist), edge_attrs={'weight': sim_list_new}) else: G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist)) G_sim.simplify(combine_edges='sum') if jac_weighted_edges == True: if self.partition_type == 'ModularityVP': partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, weights='weight', n_iterations=self.n_iter_leiden, seed=self.random_seed) # print('partition type MVP') else: partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, weights='weight', n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter=self.resolution_parameter) else: if self.partition_type == 'ModularityVP': # print('partition type MVP') partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, n_iterations=self.n_iter_leiden, seed=self.random_seed) else: print('partition type RBC') partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter=self.resolution_parameter) PARC_labels_leiden = np.asarray(partition.membership) PARC_labels_leiden = np.reshape(PARC_labels_leiden, (n_elements, 1)) small_pop_list = [] small_cluster_list = [] small_pop_exist = False dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) for cluster in set(PARC_labels_leiden): population = len(np.where(PARC_labels_leiden == cluster)[0]) if population < 5: small_pop_exist = True small_pop_list.append(list(np.where(PARC_labels_leiden == cluster)[0])) small_cluster_list.append(cluster) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell, :] group_of_old_neighbors = PARC_labels_leiden[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) available_neighbours = set(group_of_old_neighbors) - set(small_cluster_list) if len(available_neighbours) > 0: available_neighbours_list = [value for value in group_of_old_neighbors if value in list(available_neighbours)] best_group = max(available_neighbours_list, key=available_neighbours_list.count) PARC_labels_leiden[single_cell] = best_group do_while_time = time.time() while (small_pop_exist == True) & (time.time() - do_while_time < 5): small_pop_list = [] small_pop_exist = False for cluster in set(list(PARC_labels_leiden.flatten())): population = len(np.where(PARC_labels_leiden == cluster)[0]) if population < 10: small_pop_exist = True small_pop_list.append(np.where(PARC_labels_leiden == cluster)[0]) for small_cluster in small_pop_list: for single_cell in small_cluster: old_neighbors = neighbor_array[single_cell, :] group_of_old_neighbors = PARC_labels_leiden[old_neighbors] group_of_old_neighbors = list(group_of_old_neighbors.flatten()) best_group = max(set(group_of_old_neighbors), key=group_of_old_neighbors.count) PARC_labels_leiden[single_cell] = best_group dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True) self.labels = PARC_labels_leiden return PARC_labels_leiden def find_root_group(self, graph_dense, PARC_labels_leiden, root_user, true_labels, super_cluster_labels_sub, super_node_degree_list): # PARC_labels_leiden is the subset belonging to the component of the graph being considered. graph_dense is a component of the full graph # returns the cluster most likely corresponding to the root (initial) state on the cluster the graph majority_truth_labels = np.empty((len(PARC_labels_leiden), 1), dtype=object) graph_node_label = [] min_deg = 1000 super_min_deg = 1000 found_super_and_sub_root = False found_any_root = False true_labels = np.asarray(true_labels) deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0] for ci, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))): cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0] majority_truth = str(self.func_mode(list(true_labels[cluster_i_loc]))) if self.super_cluster_labels != False: super_majority_cluster = self.func_mode(list(np.asarray(super_cluster_labels_sub)[cluster_i_loc])) super_majority_cluster_loc = np.where(np.asarray(super_cluster_labels_sub) == super_majority_cluster)[0] super_majority_truth = self.func_mode(list(true_labels[super_majority_cluster_loc])) super_node_degree = super_node_degree_list[super_majority_cluster] if (str(root_user) in majority_truth) & (str(root_user) in str(super_majority_truth)): if super_node_degree < super_min_deg: found_super_and_sub_root = True root = cluster_i found_any_root = True min_deg = deg_list[ci] super_min_deg = super_node_degree print(f"{datetime.now()}\tNew root is {root}") majority_truth_labels[cluster_i_loc] = str(majority_truth) + 'c' + str(cluster_i) graph_node_label.append(str(majority_truth) + 'c' + str(cluster_i)) if (self.super_cluster_labels == False) | (found_super_and_sub_root == False): # print('self.super_cluster_labels', super_cluster_labels_sub, ' foundsuper_cluster_sub and super root',found_super_and_sub_root) for ic, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))): cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0] # print('cluster', cluster_i, 'set true labels', set(true_labels)) true_labels = np.asarray(true_labels) majority_truth = str(self.func_mode(list(true_labels[cluster_i_loc]))) print('cluster', cluster_i, 'has majority', majority_truth) # , 'with degree list', deg_list) if (str(root_user) in str(majority_truth)): # 'in' not == if deg_list[ic] < min_deg: root = cluster_i found_any_root = True min_deg = deg_list[ic] print(f"{datetime.now()}\tNew root is {root} and majority {majority_truth}") # print('len graph node label', graph_node_label) if found_any_root == False: print(f"{datetime.now()}\tSetting arbitrary root {cluster_i}") root = cluster_i return graph_node_label, majority_truth_labels, deg_list, root def make_majority_truth_cluster_labels(self, PARC_labels_leiden, true_labels, graph_dense): majority_truth_labels = np.empty((len(PARC_labels_leiden), 1), dtype=object) graph_node_label = [] true_labels = np.asarray(true_labels) deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0] for ci, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))): cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0] majority_truth = str(self.func_mode(list(true_labels[cluster_i_loc]))) majority_truth_labels[cluster_i_loc] = str(majority_truth) + 'c' + str(cluster_i) graph_node_label.append(str(majority_truth) + 'c' + str(cluster_i)) deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0] return graph_node_label, majority_truth_labels, deg_list def find_root(self, graph_dense, PARC_labels_leiden, root_user, true_labels): # root-user is the singlecell index given by the user when running VIA majority_truth_labels = np.empty((len(PARC_labels_leiden), 1), dtype=object) graph_node_label = [] true_labels = np.asarray(true_labels) deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0] for ci, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))): # print('cluster i', cluster_i) cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0] majority_truth = self.func_mode(list(true_labels[cluster_i_loc])) majority_truth_labels[cluster_i_loc] = str(majority_truth) + 'c' + str(cluster_i) graph_node_label.append(str(majority_truth) + 'c' + str(cluster_i)) root = PARC_labels_leiden[root_user] return graph_node_label, majority_truth_labels, deg_list, root def find_root_2Morgan(self, graph_dense, PARC_labels_leiden, root_idx, true_labels): # single cell index given corresponding to user defined root cell majority_truth_labels = np.empty((len(PARC_labels_leiden), 1), dtype=object) graph_node_label = [] true_labels = np.asarray(true_labels) deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0] secondary_annotations = np.asarray(self.secondary_annotations) for ci, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))): cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0] majority_truth = self.func_mode(list(true_labels[cluster_i_loc])) majority_truth_secondary = str(self.func_mode(list(secondary_annotations[cluster_i_loc]))) majority_truth_labels[cluster_i_loc] = str(majority_truth) + 'c' + str(cluster_i) # graph_node_label.append(str(majority_truth)[0:5] + 'C' + str(cluster_i)) graph_node_label.append(str(majority_truth)[0:5] + 'C' + str(cluster_i) + str(majority_truth_secondary)) root = PARC_labels_leiden[root_idx] return graph_node_label, majority_truth_labels, deg_list, root def full_graph_paths(self, data, n_components_original=1): # make igraph object of very low-K KNN using the knn_struct PCA-dimension space made in PARC. # This is later used by find_shortest_path for sc_bp visual # neighbor array is not listed in in any order of proximity print(f"{datetime.now()}\tThe number of components in the original full graph is {n_components_original}") print(f"{datetime.now()}\tFor downstream visualization purposes we are also constructing a low knn-graph ") first, k0, n_comp = True, 3, n_components_original + 1 while (n_components_original == 1 and n_comp > 1) or \ (n_components_original > 1 and k0 <= 5 and n_comp > n_components_original): neighbors, distances = self.knn_struct.knn_query(data, k=k0) csr_array = self._make_csrmatrix_noselfloop(neighbors, distances, auto_=False) n_comp, comp_labels = connected_components(csr_array, return_labels=True) if first: first = False else: k0 += 1 # print(f"{datetime.now()}\t the size of neighbor array in low-KNN in pca-space for visualization is {neighbors.shape}") n_cells, n_neighbors = neighbors.shape rows = np.transpose(np.ones((n_neighbors, n_cells)) * range(0, n_cells)).flatten() cols = neighbors.flatten() csr_full_graph = csr_matrix((distances.flatten(), (rows, cols)), shape=(n_cells, n_cells)) return ig.Graph(list(zip(*csr_full_graph.nonzero()))).simplify(combine_edges='sum') def do_impute(self, df_gene, magic_steps=3, gene_list=[]): # ad_gene is an ann data object from scanpy # normalize across columns to get Transition matrix. transition_full_graph = normalize(self.csr_full_graph, norm='l1', axis=1) ** magic_steps print('shape of transition matrix raised to power', magic_steps, transition_full_graph.shape) subset = df_gene[gene_list].values return pd.DataFrame(transition_full_graph.dot(subset), index=df_gene.index, columns=gene_list) def run_subVIA(self): # Construct graph or obtain from previous run if self.is_coarse: neighbors, distances = self.knn_struct.knn_query(self.data, k=self.knn) # these n, d will be used as a basis for both the clustergraph and graph which we apply community detection. These two graphs are constructed differently because in the community detection we want to emphasize dissimiliarities, # but the in the clustergraph we want to enhance connectivity adjacency_augmented = None adjacency = None if (self.time_series == True) and ( self.time_series_labels is not None): # refine the graph for clustering (leiden) using time_series labels if available print(f"{datetime.now()}\tUsing time series information to guide knn graph construction ") n_augmented, d_augmented = sequential_knn(self.data, self.time_series_labels, neighbors, distances, k_seq=self.knn_sequential, k_reverse=self.knn_sequential_reverse, num_threads=self.num_threads, distance=self.distance) adjacency_augmented = self._make_csrmatrix_noselfloop(n_augmented, d_augmented, time_series=self.time_series, time_series_labels=self.time_series_labels, t_diff_step=self.t_diff_step) # this function has local pruning which removes neighbors that are more than t_dif apart. Since the same type of local pruning wrt t_dif is applied pre-clustergraph, we only need to call this function once in the case of time_series data adjacency = self._make_csrmatrix_noselfloop(neighbors, distances) # not augmented or t_diff'ed if (self.do_spatial_knn) and ( self.spatial_coords is not None): # refine the graph for clustering (leiden) using time_series labels if available if len(self.spatial_aux) == 0: print(f"{datetime.now()}\tSince no slice-labels were provided, all cells are assumed to be from the same tissue slice ") self.spatial_aux = ['slice1'] * self.nsamples#['slice1' for i in range(self.nsamples)] if adjacency_augmented is None: #we did not augment based on time series data print(f"{datetime.now()}\tUsing spatial information to guide knn graph construction. Note! If cells are from different slices of tissue, provide a list of tissue-slice IDs ") #n_augmented, d_augmented = spatial_knn(coords=self.spatial_coords, neighbors=neighbors, distances=distances, k_spatial=self.spatial_knn) n_augmented, d_augmented= spatial_knn_new(spatial_coords=self.spatial_coords, spatial_slice_labels=self.spatial_aux, neighbors=neighbors, distances=distances, k_spatial=self.spatial_knn, distance_metric='l2', num_threads=-1, too_big=False) adjacency_augmented = self._make_csrmatrix_noselfloop(n_augmented, d_augmented) # this function has local pruning which removes neighbors that are more than t_dif apart. Since the same type of local pruning wrt t_dif is applied pre-clustergraph, we only need to call this function once in the case of time_series data else: print(f"{datetime.now()}\tUsing spatial information to guide knn graph construction + combining with time-series data") n_augmented, d_augmented = spatial_knn_new(spatial_coords=self.spatial_coords, spatial_slice_labels = self.spatial_aux, neighbors=n_augmented, distances=d_augmented, k_spatial=self.spatial_knn,distance_metric = 'l2', num_threads= -1, too_big= False) adjacency_augmented = self._make_csrmatrix_noselfloop(n_augmented, d_augmented) if adjacency is None: adjacency = self._make_csrmatrix_noselfloop(neighbors, distances) # not augmented or t_diff'ed if adjacency is None: adjacency = self._make_csrmatrix_noselfloop(neighbors, distances) # , min_max_scale=True) #this function has local pruning based on edge distances else: neighbors, distances = self.full_neighbor_array, self.full_distance_array adjacency = self.csr_array_locally_pruned edges = np.array(list(zip(*adjacency.nonzero()))) sim = np.array( ig.Graph(n=self.nsamples, edges=edges.tolist(), edge_attrs={'weight': adjacency.data}).similarity_jaccard( pairs=edges)) # used to do igraph jaccard does not use weight information to compute Jacc # sim = adjacency.data tot = len(sim) # Prune Jacc weighted edges off graph on global level threshold = np.median(sim) if self.edgepruning_clustering_resolution == 'median' else sim.mean() - self.edgepruning_clustering_resolution * sim.std() strong_locs = np.asarray(np.where(sim > threshold)[0]) # to be used for leiden paritionining clustering g_local_global_jac = ig.Graph(n=self.nsamples, edges=list(edges[strong_locs]), edge_attrs={'weight': list(sim[strong_locs])}).simplify( combine_edges='sum') # used for the clustering step and has been locally and globally pruned # jac_sim_list = g_local_global_jac.similarity_jaccard(pairs=list(edges[strong_locs])) # g_local_global_jac =ig.Graph(n=self.nsamples, edges=list(edges[strong_locs]), edge_attrs={'weight': jac_sim_list}).simplify(combine_edges='sum') #used for clustering (globally and locally pruned) and then jacc weighted (Jacc does not consider weights) self.sparse_glob_loc_pruned = get_sparse_from_igraph(g_local_global_jac, weight_attr='weight') print( f"{datetime.now()}\tFinished global pruning of {self.knn}-knn graph used for clustering at level of {self.edgepruning_clustering_resolution}. Kept {round(100 * len(strong_locs) / tot, 1)} % of edges. ") if self.is_coarse: # Construct full graph with no pruning - used for cluster graph edges, not listed in any order of proximity if (self.time_series == True) and (self.time_series_labels is not None): csr_full_graph = adjacency_augmented.copy() # when time_series data is available if self.RW2_mode: print("saving csr for RW2") from scipy.sparse import save_npz str_date = str(str(datetime.now())[-3:]) print(f'Unique ID for RW2 file {str_date}') save_npz(self.working_dir_fp + 'pc' + str(self.ncomp) + '_knn' + str(self.knn) + 'kseq' + str( self.knn_sequential) + 'krev' + str(self.knn_sequential_reverse) + 'RW2_' + str_date + '.npz', csr_full_graph) # when no time-series data is available, in this iteration of weighting the edges we do not locally prune based on edge-weights becasue we intend to use all neighborhood info towards the edges in the clustergraph and ensuring the clustergraph is connected elif (self.do_spatial_knn == True): print(f"{datetime.now()}\tusing spatial coords to augment clustergraph") csr_full_graph = adjacency_augmented.copy() else: csr_full_graph = self._make_csrmatrix_noselfloop(neighbors, distances, auto_=False, min_max_scale=False, time_series=self.time_series, time_series_labels=self.time_series_labels) # no local pruning: auto_ set to false #min_max_scale=True n_original_comp, n_original_comp_labels = connected_components(csr_full_graph, directed=False) print(f"{datetime.now()}\tNumber of connected components used for clustergraph is {n_original_comp}") if ((self.pca_loadings is not None) & (self.velocity_matrix is not None)) & (self.gene_matrix is not None): edges_preClustergraph = list( zip(*csr_full_graph.nonzero())) # in the case of time-series, this is the augmented graph # projected_distances are the inverted Projected_distances (affinities) based on velocity adjusted location of neighbors projected_distances = get_projected_distances(loadings=self.pca_loadings, gene_matrix=self.gene_matrix, velocity_matrix=self.velocity_matrix, edgelist=edges_preClustergraph, current_pc=self.data) self.ig_full_graph = _composite_jacAffinity_distanceAffinity(sc_csr_graph=csr_full_graph, projected_distances=projected_distances) else: # this will be used for making the clustergraph. Note that in the case of time-series, this is the augmented graph with weights given by a composite metric of Jaccard and affinities self.ig_full_graph = _composite_jacAffinity_distanceAffinity( csr_full_graph) # 0<J<1 #combining distance-based affinities and jac-based affinities # for VIA we local and global prune the vertex cluster graph *after* making the clustergraph self.csr_array_locally_pruned = adjacency # this graph has only been locally pruned and in the case of time-series, it is not the augmented knn. it then subsequently in later code will be subject to global pruning, followed by Leiden clustering. saving it here for non-coarse runs if self.time_series == True: self.csr_array_locally_pruned_augmented = adjacency_augmented # t_diff edges removed and is sequentially augmented affinity graph (no Jaccard metric is added) else: self.csr_array_locally_pruned_augmented = None self.csr_full_graph = csr_full_graph # for timeseries, this is the sequentially augmented graph, and later used in sc_transitions. This can also be used for hnsw_umap self.full_neighbor_array = neighbors self.full_distance_array = distances # knn graph used for making trajectory drawing on the visualization # self.full_graph_shortpath = self.full_graph_paths(self.data, n_original_comp) neighbors = self.full_neighbor_array if self.labels is None: print( f"{datetime.now()}\tCommencing community detection") # this is done on the non-time-series augmented graph to avoid clustering largely based on prior known time-level data weights = 'weight' if self.jac_weighted_edges else None # type = leidenalg.ModularityVertexPartition if self.partition_type == 'ModularityVP' else leidenalg.RBConfigurationVertexPartition if self.partition_type == 'ModularityVP': partition = leidenalg.find_partition(g_local_global_jac, leidenalg.ModularityVertexPartition, weights=weights, n_iterations=self.n_iter_leiden, seed=self.random_seed) # print('partition type MVP') else: partition = leidenalg.find_partition(g_local_global_jac, leidenalg.RBConfigurationVertexPartition, weights=weights, n_iterations=self.n_iter_leiden, seed=self.random_seed, resolution_parameter=self.resolution_parameter) # partition = leidenalg.find_partition(g_local_global_jac, partition_type=type, weights=weights, n_iterations=self.n_iter_leiden, seed=self.random_seed) labels = np.array(partition.membership) print(f"{datetime.now()}\tFinished community detection. Found {len(set(labels))} clusters.") # Searching for clusters that are too big and split them too_big_clusters = [k for k, v in Counter(labels).items() if v > self.too_big_factor * self.nsamples] if len(too_big_clusters): print(f"{datetime.now()}\tFound {len(too_big_clusters)} clusters that are too big") time0_big = time.time() count_big_pop = len(too_big_clusters) num_times_expanded = 0 # TODO - add max running time condition while len(too_big_clusters) > 0 & ( not ((time.time() - time0_big > 200) & (num_times_expanded >= count_big_pop))): # while len(too_big_clusters) & (not((time.time() - time0_big >200) & (num_times_expanded >= count_big_pop))): print(f"{datetime.now()}\tExamining clusters that are above the too_big threshold") cluster = too_big_clusters.pop(0) idx = labels == cluster print(f"{datetime.now()}\tCluster {cluster} contains " f"{idx.sum()}>{round(self.too_big_factor * self.nsamples)} samples and is too big") data = self.data[idx] membership = max(labels) + 1 + np.array(self.run_toobig_subPARC(data)) num_times_expanded += 1 if len(set(membership)) > 1: labels[idx] = membership too_big_clusters.extend( [k for k, v in Counter(membership).items() if v > self.too_big_factor * self.nsamples]) else: print(f"{datetime.now()}\tCould not expand cluster {cluster}") # Search for clusters that are too small (like singletons) and merge them to non-small clusters based on neighbors' majority vote # first we make a quick pass through all clusters to remove very small outliers by merging with a larger cluster # print('before final small cluster handling we have',len(set(labels)), 'communities') too_small_clusters = {k for k, v in Counter(labels).items() if v < self.small_pop} print(f"{datetime.now()}\tMerging {len(set(too_small_clusters))} very small clusters (<{self.small_pop})") idx = np.where(np.isin(labels, list(too_small_clusters)))[0] neighbours_labels = labels[neighbors[idx]] for i, nl in zip(*[idx, neighbours_labels]): # Retrieve the first non small cluster, with highest number of neighbours label = next((label for label, n in Counter(nl).most_common() if label not in too_small_clusters), None) # label = next((label for label, n in Counter(nl).most_common()), None) if label is not None: # recall 0 is a valid label value labels[i] = label # too_small_clusters = {k for k, v in Counter(labels).items() if v < self.small_pop} too_small_clusters = {k for k, v in Counter(labels).items() if v < self.small_pop} # in this pass we allow clusters to be merged even if they are not a Large Cluster.. as multiple smaller ones might come together to form an acceptably large cluster do_while_time = time.time() while len(too_small_clusters) & (time.time() - do_while_time < 15): # membership of neighbours of samples in small clusters idx = np.where(np.isin(labels, list(too_small_clusters)))[0] neighbours_labels = labels[neighbors[idx]] for i, nl in zip(*[idx, neighbours_labels]): # Retrieve the first non small cluster, with highest number of neighbours # label = next((label for label, n in Counter(nl).most_common() if label not in too_small_clusters), None) label = next((label for label, n in Counter(nl).most_common()), None) if label is not None: # recall 0 is a valid label value labels[i] = label # Update set of too small clusters, stopping if converged too_small_clusters = {k for k, v in Counter(labels).items() if v < self.small_pop} # Reset labels to begin from zero and with no missing numbers self.labels = labels = np.unique(labels, return_inverse=True)[1] print(f"{datetime.now()}\tFinished detecting communities. Found", len(set(self.labels)), 'communities') else: print(f'{datetime.now()}\tUsing predfined labels provided by user (this must be provided as an array)') # end community detection # do kmeans instead ''' from sklearn.cluster import KMeans kmeans = KMeans(n_clusters=20, random_state=1).fit(X_data) self.labels = kmeans.labels_ n_clus = len(set(self.labels)) self.labels = kmeans.labels_.flatten().tolist() pop_list = [] pop_list_raw = [] for item in range(len(set(PARC_labels_leiden))): pop_item = PARC_labels_leiden.count(item) pop_list.append((item, pop_item)) pop_list_raw.append(pop_item) ''' # Make cluster-graph print( f"{datetime.now()}\tMaking cluster graph. Global cluster graph pruning level: {self.cluster_graph_pruning}") graph = ig.VertexClustering(self.ig_full_graph, membership=self.labels).cluster_graph(combine_edges='sum') graph = recompute_weights(graph, Counter(self.labels)) # returns csr matrix self.cluster_graph_csr_not_pruned = graph # type csr NOT PRUNED edgeweights_pruned_clustergraph, edges_pruned_clustergraph, comp_labels = pruning_clustergraph(graph, global_pruning_std=self.cluster_graph_pruning, preserve_disconnected=self.preserve_disconnected, preserve_disconnected_after_pruning=self.preserve_disconnected_after_pruning) self.connected_comp_labels = comp_labels # ig.graph() creates an undirected graph,in which case get_spare_from_igraph will be symmetric for each edge (u,v) and edge (v,u) is created locallytrimmed_g = ig.Graph(edges_pruned_clustergraph, edge_attrs={ 'weight': edgeweights_pruned_clustergraph}) # .simplify(combine_edges='sum') #forward biasing happens later locallytrimmed_sparse_vc = get_sparse_from_igraph(locallytrimmed_g, weight_attr='weight') if self.edgebundle_pruning == self.cluster_graph_pruning: weights_for_layout = np.asarray(locallytrimmed_sparse_vc.data) # clip weights to prevent distorted visual scale weights_for_layout = np.clip(weights_for_layout, np.percentile(weights_for_layout, 10), np.percentile(weights_for_layout, 90)) # want to clip the weights used to get the layout weights_for_layout = list(weights_for_layout) g_layout = ig.Graph(list(zip(*locallytrimmed_sparse_vc.nonzero())), edge_attrs={'weight': weights_for_layout}) # if using a different pruning for visualized edge bundles than the one specified in self.cluster_graph_pruning else: edgeweights_layout, edges_layout, comp_labels_layout = pruning_clustergraph(graph, global_pruning_std=self.edgebundle_pruning, preserve_disconnected=self.preserve_disconnected, preserve_disconnected_after_pruning=self.preserve_disconnected_after_pruning) # layout = locallytrimmed_g.layout_fruchterman_reingold(weights='weight') #uses non-clipped weights but this can skew layout due to one or two outlier edges layout_g = ig.Graph(edges_layout, edge_attrs={ 'weight': edgeweights_layout}) # .simplify(combine_edges='sum') remving the 'combined_edges='sum' as it does not handle diagonal self loops which we require to be retained for singleton clusters that are not connected to other clusters layout_g_csr = get_sparse_from_igraph(layout_g, weight_attr='weight') weights_for_layout = np.asarray(layout_g_csr.data) for i in range(weights_for_layout.shape[0]): if weights_for_layout[i] == 0: print(i) # clip weights to prevent distorted visual scale in layout weights_for_layout = np.clip(weights_for_layout, np.percentile(weights_for_layout, 10), np.percentile(weights_for_layout, 90)) weights_for_layout = list(weights_for_layout) g_layout = ig.Graph(list(zip(*layout_g_csr.nonzero())), edge_attrs={'weight': weights_for_layout}) self.clustergraph_igraph_forlayout = g_layout # the layout of the graph is determine by a pruned clustergraph and the directionality of edges will be based on the final markov pseudotimes # the edgeweights of the bundle-edges is determined by the distance based metrics and jaccard similarities and not by the pseudotimes # for the transition matrix used in the markov pseudotime and differentiation probability computations, the edges will be further biased by the hittings times and markov pseudotimes # layout = g_layout.layout_fruchterman_reingold(weights='weight') # globally trimmed link self.edgelist_unique = set( tuple(sorted(l)) for l in zip(*locallytrimmed_sparse_vc.nonzero())) # keep only one of (0,1) and (1,0) self.edgelist = edges_pruned_clustergraph # after one round of global and local pruning of graph to be used for MCMCs # number of components n_components, labels_cc = connected_components(csgraph=locallytrimmed_sparse_vc, directed=False, return_labels=True) df_graph = pd.DataFrame(locallytrimmed_sparse_vc.todense()) if (self.velocity_matrix is not None) & (self.gene_matrix is not None): df_velocity = pd.DataFrame(self.velocity_matrix) print('size velocity matrix', len(self.labels), self.velocity_matrix.shape) df_velocity['labels'] = self.labels velocity_mean = df_velocity.groupby('labels', as_index=True).mean().to_numpy() gene_mean = pd.DataFrame(self.gene_matrix) gene_mean['labels'] = self.labels gene_mean = gene_mean.groupby('labels', as_index=True).mean().to_numpy() self.A_velo, self.CSM, velo_root_top3 = velocity_transition(A=df_graph.values, V=velocity_mean, G=gene_mean) # velocity directed transition matrix print('cell type of suggested roots:', [self.true_label[x] for x in velo_root_top3]) if self.time_series == True: print('cell stage of suggested roots:', [self.time_series_labels[x] for x in velo_root_top3]) df_graph['cc'] = labels_cc df_graph['pt'] = float('NaN') df_graph['majority_truth'] = 'maj truth' df_graph['graph_node_label'] = 'node label' PARC_labels_leiden = self.labels set_parc_labels = list(set(PARC_labels_leiden)) set_parc_labels.sort() tsi_list = [] df_graph['markov_pt'] = float('NaN') terminal_clus = [] # list of cluster labels that are found to be terminal clusters node_deg_list = [] super_terminal_clus_revised = [] pd_columnnames_terminal = [] dict_terminal_super_sub_pairs = {} self.root = [] large_components = [] for comp_i in range(n_components): loc_compi = np.where(labels_cc == comp_i)[0] if len(loc_compi) > 1: # need at least 2 nodes in in a component to make edges large_components.append(comp_i) elif len(loc_compi) == 0: df_graph.at[loc_compi[0], 'markov_pt'] = 1 # 0 df_graph.at[loc_compi[0], 'pt'] = 1 # 0 for comp_i in large_components: # range(n_components): print(f'{datetime.now()}\tcomponent number', comp_i, 'out of ', large_components) loc_compi = np.where(labels_cc == comp_i)[0] a_i = df_graph.iloc[loc_compi][loc_compi].values # transition matrix of relevant component # A_velo, CSM = velocity_transition(A=a_i, V=velocity_mean, G=gene_mean) # velocity directed transition matrix if self.A_velo is not None: A_velo, CSM = self.A_velo[loc_compi, :][loc_compi, :], self.CSM[loc_compi, :][ loc_compi, :] a_i = csr_matrix(a_i, (a_i.shape[0], a_i.shape[0])) cluster_labels_subi = [x for x in loc_compi] # print('cluster_labels_subi', cluster_labels_subi) sc_labels_subi = [PARC_labels_leiden[i] for i in range(len(PARC_labels_leiden)) if (PARC_labels_leiden[i] in cluster_labels_subi)] sc_truelabels_subi = [self.true_label[i] for i in range(len(PARC_labels_leiden)) if (PARC_labels_leiden[i] in cluster_labels_subi)] if (self.root_user is None) and (self.velocity_matrix is not None): # in case the root cluster is a singleton, move on to the next nominee in initial states root_i = velo_root_top3[0] cluster_i_loc = np.where(np.asarray(sc_labels_subi) == root_i)[0] if len(cluster_i_loc) < 2: root_i = velo_root_top3[1] cluster_i_loc = np.where(np.asarray(sc_labels_subi) == root_i)[0] if len(cluster_i_loc) < 2: root_i = velo_root_top3[2] cluster_i_loc = np.where(np.asarray(sc_labels_subi) == root_i)[0] if len(cluster_i_loc) < 2: root_i = velo_root_top3[0] graph_node_label, majority_truth_labels, node_deg_list_i = self.make_majority_truth_cluster_labels( PARC_labels_leiden=sc_labels_subi, true_labels=sc_truelabels_subi, graph_dense=a_i) for velo_root_candidate_i in velo_root_top3: cluster_i_loc = np.where(np.asarray(sc_labels_subi) == velo_root_candidate_i)[0] majority_truth = str(self.func_mode(list(np.asarray(sc_truelabels_subi)[cluster_i_loc]))) if velo_root_candidate_i == root_i: majority_truth_veloroot0 = majority_truth print( f"{datetime.now()}\tUsing the RNA velocity graph, A top3 candidate for initial state is {velo_root_candidate_i} comprising predominantly of {majority_truth} cells", ) print( f"{datetime.now()}\tUsing the RNA velocity graph, the suggested initial root state is {root_i} comprising predominantly of {majority_truth_veloroot0} cells") elif (self.dataset in ['toy', 'faced', 'mESC', 'iPSC', 'group']) and ( self.root_user is not None): # ((self.dataset == 'toy') | (self.dataset == 'faced')): root_user_ = None for ri in self.root_user: print(f"{datetime.now()}\tgroup root method") if ri in sc_truelabels_subi: root_user_ = ri print(f"{datetime.now()}\tfor component {comp_i}, the root is {root_user_} and ri {ri}") if root_user_ is None: root_user_ = sc_truelabels_subi[0] print(f"{datetime.now()}\tsetting a dummy root") graph_node_label, majority_truth_labels, node_deg_list_i, root_i = self.find_root_group(a_i, sc_labels_subi, root_user_, sc_truelabels_subi, [], []) elif (self.dataset == '2M'): for ri in self.root_user: if PARC_labels_leiden[ri] in cluster_labels_subi: root_user_ = ri graph_node_label, majority_truth_labels, node_deg_list_i, root_i = self.find_root_2Morgan(a_i, sc_labels_subi, root_user_, sc_truelabels_subi) # when root_user is given as a cell index else: root_user_ = None if (comp_i > len(self.root_user) - 1): root_user_ = 0 else: for ri in self.root_user: if PARC_labels_leiden[ri] in cluster_labels_subi: root_user_ = ri print( f"{datetime.now()}\tThe root index, {ri} provided by the user belongs to cluster number {PARC_labels_leiden[ri]} and corresponds to cell type {self.true_label[ri]}") if root_user_ is None: root_user_ = 0 graph_node_label, majority_truth_labels, node_deg_list_i, root_i = self.find_root(a_i, PARC_labels_leiden, root_user_, self.true_label) self.root.append(root_i) self.majority_truth_labels = majority_truth_labels # single cell level "Majority Truth of that Cluster + Clusterlabel" for item in node_deg_list_i: node_deg_list.append(item) new_root_index_found = False for ii, llabel in enumerate(cluster_labels_subi): if root_i == llabel: new_root_index = ii new_root_index_found = True if not new_root_index_found: print('cannot find the new root index') new_root_index = 0 print(f"{datetime.now()}\tComputing lazy-teleporting expected hitting times") hitting_times, roundtrip_times = self.compute_hitting_time(a_i, new_root_index, self.x_lazy, self.alpha_teleport) # rescale hitting times very_high = np.mean(hitting_times) + 1.5 * np.std(hitting_times) without_very_high_pt = [iii for iii in hitting_times if iii < very_high] new_very_high = np.mean(without_very_high_pt) + np.std(without_very_high_pt) # print('very high, and new very high', very_high, new_very_high) new_hitting_times = [x if x < very_high else very_high for x in hitting_times] hitting_times = np.asarray(new_hitting_times) scaling_fac = 10 / max(hitting_times) hitting_times = hitting_times * scaling_fac s_ai, t_ai = a_i.nonzero() edgelist_ai = list(zip(s_ai, t_ai)) edgeweights_ai = a_i.data # print('edgelist ai', edgelist_ai) # print('edgeweight ai', edgeweights_ai) # bias the edges based on pseudotime from "undirected" pruned graph biased_edgeweights_ai = get_biased_weights(edgelist_ai, edgeweights_ai, hitting_times) # biased_sparse = csr_matrix((biased_edgeweights, (row, col))) adjacency_matrix_ai = np.zeros((a_i.shape[0], a_i.shape[0])) for i, (start, end) in enumerate(edgelist_ai): adjacency_matrix_ai[start, end] = biased_edgeweights_ai[i] markov_hitting_times_ai = self._simulate_markov(adjacency_matrix_ai, new_root_index) # +adjacency_matrix.T)) ''' df_sc = pd.DataFrame() df_sc['parc'] = self.labels df_sc['true_time'] = self.time_series_labels df_ = df_sc.groupby(['parc']).mean() df_['via_pt'] = markov_hitting_times_ai df_['via_pt'] = df_['via_pt'].fillna(0) correlation = df_['via_pt'].corr(df_['true_time']) print(f'correlation cluster level via original mcmc pt before scaling, {correlation}') ''' #print('try rw2 hitting times setup') memory_pt = 2 # 2 rw2_hittingtimes = _compute_rw2_hittingtimes(A=adjacency_matrix_ai, root=new_root_index, memory=memory_pt, x_lazy=self.x_lazy, alpha_teleport=self.alpha_teleport) ''' df_sc = pd.DataFrame() df_sc['parc'] = self.labels df_sc['true_time'] = self.time_series_labels df_ = df_sc.groupby(['parc']).mean() df_['via_pt'] = rw2_hittingtimes df_['via_pt'] = df_['via_pt'].fillna(0) correlation = df_['via_pt'].corr(df_['true_time']) print(f'correlation via rw2 pt at memory q={memory} before scaling, {correlation}') ''' print(f'memory for rw2 hittings times {memory_pt}. Using rw2 based pt') markov_hitting_times_ai = rw2_hittingtimes # print('no rw2') # print('skip scaling of pt') #print('do scaling of pt') very_high = np.mean(markov_hitting_times_ai) + 1.5 * np.std(markov_hitting_times_ai) # 1.5 very_high = min(very_high, max(markov_hitting_times_ai)) # without_very_high_pt = [iii for iii in markov_hitting_times_ai if iii < very_high] # new_very_high = min(np.mean(without_very_high_pt) + np.std(without_very_high_pt), very_high) new_markov_hitting_times_ai = [x if x < very_high else very_high for x in markov_hitting_times_ai] markov_hitting_times_ai = np.asarray(new_markov_hitting_times_ai) scaling_fac = 1 / max(markov_hitting_times_ai) markov_hitting_times_ai = markov_hitting_times_ai * scaling_fac # for eee, ttt in enumerate(markov_hitting_times_ai):print('cluster ', eee, ' had markov time', ttt) # print('markov hitting times', [(i, j) for i, j in enumerate(markov_hitting_times_ai)]) # print('hitting times', [(i, j) for i, j in enumerate(hitting_times)]) # markov_hitting_times_ai = (markov_hitting_times_ai) # + hitting_times)*.5 #consensus adjacency_matrix_csr_ai = sparse.csr_matrix(adjacency_matrix_ai) # A_velo = velocity_transition(A=adjacency_matrix_ai, V=velocity_mean, G=gene_mean) #velocity directed transition matrix # print('A_velo', A_velo.shape) # print(A_velo) (sources, targets) = adjacency_matrix_csr_ai.nonzero() edgelist_ai = list(zip(sources, targets)) weights_ai = adjacency_matrix_csr_ai.data bias_weights_2_ai = get_biased_weights(edgelist_ai, weights_ai, markov_hitting_times_ai, round=2) adjacency_matrix2_ai = np.zeros((adjacency_matrix_ai.shape[0], adjacency_matrix_ai.shape[ 0])) # this will be the cluster transition matrix forward biased by pseudotimes for i, (start, end) in enumerate(edgelist_ai): adjacency_matrix2_ai[start, end] = bias_weights_2_ai[i] # print('adjacency_matrix2_ai col sum', np.sum(adjacency_matrix2_ai, axis=1)) # adjacency matrix used for branch prob and terminal states will be the weighted average of the pt_trans_matrix and the velo_trans_matrix # note that although A_velo and adjaccency_matrix2_ai do not have rows that sum to one due to the biasing steps, the transition matrix used for simulating branch probabilities will be normalized such that each row_sum = 1. This is done in the simulate_branch_prob function # however the biased weights without row normalization is used for visualization if self.A_velo is not None: adjacency_matrix2_ai = ( 1 - self.velo_weight) * adjacency_matrix2_ai + self.velo_weight * A_velo user_defined_terminal_clus_exists = False if (len(self.user_defined_terminal_group) > 0) | (len(self.user_defined_terminal_cell) > 0): user_defined_terminal_clus_exists = True if (self.super_terminal_cells == False) & ( user_defined_terminal_clus_exists == False): # when is_coarse = True, there is no list of terminal clusters/cells that are passed into VIA based on a previous iteration. # print('new_root_index', new_root_index, ' before get terminal') terminal_clus_ai = self._get_terminal_clusters(adjacency_matrix2_ai, markov_hitting_times_ai, new_root_index) temp_terminal_clus_ai = [] threshold_ = np.percentile(np.asarray(markov_hitting_times_ai), self.pseudotime_threshold_TS) #print('!!! Remove the [0:2] just using to speed up testing') for i in terminal_clus_ai:#[0:2]: if markov_hitting_times_ai[i] > threshold_: terminal_clus.append(cluster_labels_subi[i]) temp_terminal_clus_ai.append(i) terminal_clus_ai = temp_terminal_clus_ai elif (self.super_terminal_cells == False) & (user_defined_terminal_clus_exists == True): terminal_clus_ai = [] terminal_clus = [] threshold_ = np.percentile(np.asarray(markov_hitting_times_ai), self.pseudotime_threshold_TS) terminal_clus_temp = self._get_terminal_clusters_user_defined_(self.user_defined_terminal_cell, self.user_defined_terminal_group) for ti in terminal_clus_temp: if (ti in loc_compi): terminal_clus_ai.append(ti) terminal_clus.append(cluster_labels_subi[ti]) print( f"{datetime.now()}\tTerminal clusters corresponding to unique lineages in this component are {terminal_clus_ai} ") if self.memory == 0: use_rw2_prob = False else: use_rw2_prob = True if not use_rw2_prob: print(f'Via 1.0 lineage prob') for target_terminal in terminal_clus_ai: prob_ai = self.simulate_branch_probability(target_terminal, adjacency_matrix2_ai, new_root_index, num_sim=int(self.num_mcmc_simulations / 2)) print(f'terminal state {target_terminal} has probability {prob_ai}') df_graph['terminal_clus' + str(cluster_labels_subi[target_terminal])] = 0.0000000 pd_columnnames_terminal.append('terminal_clus' + str(cluster_labels_subi[target_terminal])) for k, prob_ii in enumerate(prob_ai): df_graph.at[cluster_labels_subi[k], 'terminal_clus' + str( cluster_labels_subi[target_terminal])] = prob_ii else: memory = self.memory p_memory = self.p_memory prob_lin_rw2 = _compute_rw2_lineageprobability(A=adjacency_matrix2_ai, memory=memory, root=new_root_index, terminal_states=terminal_clus_ai, x_lazy=self.x_lazy, alpha_teleport=self.alpha_teleport) for enum_ts, target_terminal in enumerate(terminal_clus_ai): df_graph['terminal_clus' + str(cluster_labels_subi[target_terminal])] = 0.0000000 pd_columnnames_terminal.append('terminal_clus' + str(cluster_labels_subi[target_terminal])) for k, prob_ii in enumerate(prob_lin_rw2[:, enum_ts]): df_graph.at[cluster_labels_subi[k], 'terminal_clus' + str( cluster_labels_subi[target_terminal])] = prob_ii bp_array = df_graph[pd_columnnames_terminal].values bp_array[np.isnan(bp_array)] = 1e-8 bp_array = bp_array / bp_array.sum(axis=1)[:, None] # row normalization at the cluster level bp_array[np.isnan(bp_array)] = 1e-8 for ei, ii in enumerate(loc_compi): df_graph.at[ii, 'pt'] = hitting_times[ei] df_graph.at[ii, 'graph_node_label'] = graph_node_label[ei] df_graph.at[ii, 'majority_truth'] = graph_node_label[ei] df_graph.at[ii, 'markov_pt'] = markov_hitting_times_ai[ei] locallytrimmed_g.vs["label"] = df_graph['graph_node_label'].values hitting_times = df_graph['pt'].values self.cluster_adjacency = adjacency_matrix2_ai self.hitting_times = hitting_times # not markov chain simulated # print('hitting times', hitting_times[0:10]) self.markov_hitting_times = df_graph['markov_pt'].values # hitting_times on the forward biased# # print('markov hitting times,', self.markov_hitting_times[0:10]) self.terminal_clusters = terminal_clus dict_ts_mode = {} for i in terminal_clus: loc__ = np.where(np.asarray(self.labels) == i)[0] i_mode = func_mode(list(np.asarray(self.true_label)[loc__])) dict_ts_mode[i] = i_mode print( f"{datetime.now()}\tThere are ({len(terminal_clus)}) terminal clusters corresponding to unique lineages {dict_ts_mode}") # {self.terminal_clusters} ") self.node_degree_list = node_deg_list print(f"{datetime.now()}\tBegin projection of pseudotime and lineage likelihood") self.cluster_bp = bp_array # there is row normalization at the cluster level so each row sums to 1 df_graph['markov_pt'] = df_graph['markov_pt'].fillna(10) # 0 # nan_count = df_graph['markov_pt'].isna().sum() # print('df graph markov pt nan count', nan_count) self.single_cell_bp, self.single_cell_pt_markov = self._project_branch_probability_sc(bp_array, df_graph[ 'markov_pt'].values) self.single_cell_pt_markov = [item for item in self.single_cell_pt_markov if not (math.isnan(item)) == True] if self.time_series_labels is not None: df_ = pd.DataFrame() df_['via_pt'] = self.single_cell_pt_markov df_['true_time'] = self.time_series_labels df_['via_pt'] = df_['via_pt'].fillna(0) correlation = df_['via_pt'].corr(df_['true_time']) print(f'{datetime.now()}\tStart reading data') print( f'{datetime.now()}\tCorrelation of Via pseudotime with developmental stage {round(correlation * 100, 2)} %') # the single_cell_bp are not re-rownormalized. In fact we scale each column (lineage) by the max value in the lineage (column) to prevent rarer/smaller lineages from being under-represented. # to get a row-normalized results of the single-cell branching probabilities, we offer single_cell_bp_rownormed as an attribute so that the probabilities of each cell sum to 1. single_cell_bp_rownormed = np.nan_to_num(self.single_cell_bp, nan=0.0, posinf=0.0, neginf=0.0) # row normalize row_sums = single_cell_bp_rownormed.sum(axis=1) single_cell_bp_rownormed = single_cell_bp_rownormed / row_sums[:, np.newaxis] # print(f'{datetime.now()}\tCheck cell 0 of sc pb rownormed {single_cell_bp_rownormed[0, :]},{single_cell_bp_rownormed.sum(axis=1)}') self.single_cell_bp_rownormed = single_cell_bp_rownormed self.dict_terminal_super_sub_pairs = dict_terminal_super_sub_pairs hitting_times = self.markov_hitting_times # cluster level ### CONSTRUCT GRAPH USED FOR VISUALIZATION by doing further pruning so we can easily visualize the TI # rebias the edge weights based on the recomputed final markov hitting time and do a final pruning used for visualization (not MCMCs) bias_weights_2_all = get_biased_weights(self.edgelist, edgeweights_pruned_clustergraph, self.markov_hitting_times, round=2) # edgeweights of initial globally and locally pruned graph (all components) need to be forward biased n_clus = len(set(self.labels)) temp_csr = csr_matrix((bias_weights_2_all, tuple(zip(*self.edgelist))), shape=(n_clus, n_clus)) # locally and globally pruned and used for adjacencyMatrix2ai if self.A_velo is None: final_transition_matrix_all_components = temp_csr.toarray() else: final_transition_matrix_all_components = (self.velo_weight) * self.A_velo + (1 - self.velo_weight) * ( temp_csr.toarray()) print(f"{datetime.now()}\tTransition matrix with weight of {self.velo_weight} on RNA velocity") # the final array used for making the visual edgelists will be weighted average of A_velo and pt_based_trans (temp_csr) final_transition_matrix_all_components = csr_matrix(final_transition_matrix_all_components) print(f'{datetime.now()}\tCluster graph layout based on forward biasing') # print("**** USING OLD LAYOUT BEFORE BIASING ****") # ensure no extreme values distort layout: ''' print('NEW, KEEP OR NOT? ensure no extreme values distort layout:') bias_weights_2_all_hi = np.mean(bias_weights_2_all) + 3*np.std(bias_weights_2_all) bias_weights_2_all_lo = np.mean(bias_weights_2_all) - 3*np.std(bias_weights_2_all) bias_weights_2_subset = [x for x in bias_weights_2_all if ((x < bias_weights_2_all_hi) & (x> bias_weights_2_all_lo))] threshold_hi = np.percentile(bias_weights_2_subset,99) threshold_lo = np.percentile(bias_weights_2_subset,1) bias_weights_2_all = [x if x < threshold_hi else threshold_hi for x in bias_weights_2_all ] bias_weights_2_all = [x if x > threshold_lo else threshold_lo for x in bias_weights_2_all ] ''' visual_g = ig.Graph(self.edgelist, edge_attrs={'weight': bias_weights_2_all}).simplify( combine_edges='sum') # used to be commented out # random.seed(self.random_seed) if (self.graph_init_pos is not None) & (self.do_spatial_layout): if self.graph_init_pos is None: self.graph_init_pos = self.spatial_coords list_graph_init_pos = [] print('using provided spatial coords to initialise viagraph layout. Only use this if all cells are from the same slice') for i in range(len(set(self.labels))): where = np.where(np.array(self.labels) == i)[0] #print('where are the clusters cells',where) x_mean = self.graph_init_pos[where, 0].mean() print('x_mean',x_mean) y_mean = self.graph_init_pos[where, 1].mean() list_graph_init_pos.append([x_mean, y_mean]) #list of lists print('list graph init pos') print(list_graph_init_pos) layout = visual_g.layout_fruchterman_reingold(weights='weight', seed=list_graph_init_pos) #layout = visual_g.layout_fruchterman_reingold(weights='weight') else: layout = visual_g.layout_fruchterman_reingold(weights='weight') #layout = visual_g.layout_fruchterman_reingold(weights='weight', seed=np.matrix(layout)) # used to be commented out self.graph_node_pos = layout.coords self.layout = layout # reassign if (self.embedding is None) & (self.do_compute_embedding == True): if self.embedding_type == 'via-atlas': print(f'{datetime.now()}\tRun via-atlas') from sklearn.preprocessing import normalize # row_stoch = normalize(self.csr_full_graph, norm='l1', axis=1) ##row_stoch = row_stoch ** 3 # temp_pca = csr_matrix(self.data) # X_diffused_data = row_stoch * temp_pca # matrix multiplication to diffuse the pcs # X_input=self.data for random_state in [self.random_seed]: # 6 self.random_seed,6 for min_dist in [0.1]: # 35,0.3,0.25,0.2,0.1]: for rw2_comp in [self.ncomp]: import random str_date = random.randint(0, 1000) #print('embedding pseudorand', str_date) distance_metric = 'euclidean' # 'cosine' # #print( f'distance metric {distance_metric} and min_dist {min_dist} and randomstate {random_state} and pseudoid {str_date}') # input = self.data # '/home/user/Trajectory/Datasets/EB_Phate/RW2_sparse_matrix510Pp1p5_R2Wemd.csv' # r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/Cao_ProtoVert/RW2/RW2_sparse_matrix5094_pc30_knn30_krev15_kseq15_rs0.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/Cao_ProtoVert/RW2/RW2_P1_Q1_sparse_matrix5820_pc20_knn30_krev15_kseq15_rs0.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/WagnerZebrafish/RW2/knn100_pc30_kseq50RW2_sparse_matrix5820.csv') # r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/Zebrafish_Lange2023/RW2/5000hvg/pc100_knn10kseq5krev5RW2_noparc137noK0871_walklength20_numwalks20_dim128.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/Zebrafish_Lange2023/RW2/5000hvg/pc100_knn10kseq5krev5RW2_672_walklength80_numwalks20_dim64.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/Zebrafish_Lange2023/RW2/2000hvg/pc100_knn20kseq5krev5RW2_120q10walklength20798.csv') # r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/EB_Phate/RW2/pc20_knn100kseq50krev50RW2_emd029.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_knn20kseq0krev0RW2_887_P20Q20.csv')# not this # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_knn50kseq0krev0RW2_gaussTrue_051_P1Q1.csv') # not this # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_knn20kseq0krev0RW2_887.csv') not this # r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/EB_Phate/RW2/pc20_knn100kseq50krev50RW2_sparse_matrix029_P1_Qp001_numwalk20.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/EB_Phate/RW2/pc20_knn100kseq50krev50RW2_sparse_matrix029_P1_Q100_numwalk20.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_P1_Q1000_knn50kseq10krev10RW2_988.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_P1_Q1_knn50kseq10krev10RW2_988.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/2MOrgan/RW2/pc50_knn30kseq10krev0RW2_970_P1_Q10_rs42_tdiffstep2_gaussFalse_970.csv') #r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/Pijuan_Gastrulation/RW2/pc30_knn30kseq15krev15RW2_sparse_matrix_122.csv') # pc30_knn30kseq15krev15RW2_emd_122.csv') # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/Pijuan_Gastrulation/RW2/pc30_P1_Qp001_knn30kseq15krev15RW2_sparse_matrix_122.csv') #r2w_input = r2w_input.drop(['Unnamed: 0'], axis=1).values #input = r2w_input[:, 0:rw2_comp] # print('NOT using rw2 embedding CANNOT USE THIS CODE ON GITHUB YET AS IT USES LOCAL FILES') input = self.data #print('USING rw2 embedding CANNOT USE THIS CODE ON GITHUB YET AS IT USES LOCAL FILES') ''' from sklearn.preprocessing import normalize row_stoch = self.csr_full_graph row_stoch.data = np.clip(row_stoch.data, np.percentile(row_stoch.data, 10), np.percentile(row_stoch.data, 90)) row_stoch = normalize(row_stoch, norm='l1', axis=1) row_stoch = row_stoch ** 2 # level of diffusion temp = csr_matrix(input) input = row_stoch * temp # matrix multiplication ''' do_initVia = True # when calling via_atlas_emb() from within the class, it automatically initializes using the via cluster graph n_epochs = 100 # 100 usually if do_initVia: self.embedding = via_atlas_emb(X_input=self.data, graph=self.csr_full_graph, n_epochs=n_epochs, spread=1, distance_metric=distance_metric, min_dist=min_dist, saveto='', random_state=random_state, init_pos='via', cluster_membership=self.labels, layout=layout.coords) else: self.embedding = via_atlas_emb(X_input=self.data, graph=self.csr_full_graph, n_epochs=n_epochs, spread=1, distance_metric=distance_metric, min_dist=min_dist, saveto='', random_state=random_state) print(f'{datetime.now()}\tCompleted via-atlas embedding') title_umap = 'via-atlas rw2 knn/pc/knnseq:' + str(self.knn) + '/' + str( self.ncomp) + '/' + str(self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str( min_dist) + 'rs' + str(self.random_seed) if self.time_series_labels is not None: color_labels = self.time_series_labels f1, ax = plot_scatter(embedding=self.embedding, labels=color_labels, cmap='plasma', s=5, alpha=0.5, edgecolors='None', title=title_umap, text_labels=False) f1.set_size_inches(10, 10) # str_date = str(str(datetime.now())[-3:]) # save_str = '/home/user/Trajectory/Datasets/WagnerZebrafish/viaumap_RW2_5820_k' + str( self.knn) + 'TimeseriesAug' + str(self.time_series) + '_kseq' + str( self.knn_sequential) + '_knnreverse' + str(self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str( random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) + 'viaInit' + str(do_initVia) + str_date # save_str = '/home/user/Trajectory/Datasets/WagnerZebrafish/viaumap_PCA_k' + str(self.knn) + 'TimeseriesAug'+str(self.time_series)+ '_kseq' + str(self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str( random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) + 'viaInit'+str(do_initVia)+ str_date # save_str = '/home/user/Trajectory/Datasets/MEF_Schiebinger/viaumap_k' + str( self.knn) + 'kseq' + str(self.knn_sequential) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + 'rs' + str( self.random_seed) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) + str_date + 'stage' # save_str = '/home/user/Trajectory/Datasets/EB_Phate/viaumap_R2W_029_P1_Qp001_k' + str( self.knn) +'TimeseriesAug'+str(self.time_series)+ 'kseq' + str(self.knn_sequential) +'_knnreverse'+str(self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + '_mainRS'+str(self.random_seed)+'mindist' + str(min_dist) + 'rsUmap' + str(random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) +'_ViaInitLayout'+str(do_initVia)+'_nEpochs'+str(n_epochs)+'_'+ str_date # save_str = '/home/user/Trajectory/Datasets/EB_Phate/viaumap_PCA_k' + str( self.knn) + 'TimeseriesAug' + str(self.time_series) + 'kseq' + str( self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + '_mainRS' + str(self.random_seed) + 'mindist' + str( min_dist) + 'rsUmap' + str(random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) + '_ViaInitLayout' + str( do_initVia) + '_nEpochs' + str(n_epochs) + '_' + str_date # save_str = '/home/user/Trajectory/Datasets/Cao_ProtoVert/viaumap_RW2_P1_Q1_5820_k' + str( self.knn) + 'TimeseriesAug'+str(self.time_series)+ '_kseq' + str(self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str( random_state) +'_rsVIA'+str(self.random_seed)+ 'doGauss' + str( self.do_gaussian_kernel_edgeweights) +'viaInitLayout'+str(do_initVia)+'_nEpochs'+str(n_epochs)+'_'+ str_date # save_str = '/home/user/Trajectory/Datasets/Cao_ProtoVert/viaumap_PCA_k' + str( self.knn) + 'TimeseriesAug' + str(self.time_series) + '_kseq' + str( self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str( random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) + 'viaInitLayout' + str( do_initVia) + '_nEpochs' + str(n_epochs) + '_' + str_date # save_str = '/home/user/Trajectory/Datasets/Pijuan_Gastrulation/viaumap_RW2_P1_Qp001_rw2comp'+str(rw2_comp)+'_122_k' + str( self.knn) +'_TimeAug'+str(self.time_series)+ '_kseq' + str(self.knn_sequential) +'_knnreverse'+str(self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + 'rs' + str( random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) + str_date +'_viaInitTrue' # save_str = '/home/user/Trajectory/Datasets/Pijuan_Gastrulation/viaumap_PCA_k' + str(self.knn) + '_TimeAug' + str( self.time_series) + '_kseq' + str(self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + 'rs' + str( random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights)+'_viaInit'+str(do_initVia) +str(n_epochs)+'_'+ str_date # save_str = '/home/user/Trajectory/Datasets/MouseNeuron/viaumap_RW2_P1_Q1000_rw2comp' + str(rw2_comp) + '_988_k' + str(self.knn) + 'TimeseriesAug'+str(self.time_series)+'_kseq' + str(self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(self.t_diff_step) + 'mindist' + str(min_dist) +'_rsMain'+str(self.random_seed)+ '_rs' + str(random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) +'_viaInitLayout'+str(do_initVia)+'_nEpochs'+str(n_epochs)+'_'+ str_date # save_str = '/home/user/Trajectory/Datasets/MouseNeuron/viaumap_PCA_k' + str(self.knn) + 'TimeseriesAug'+str(self.time_series)+'_kseq' + str(self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(self.t_diff_step) + 'mindist' + str(min_dist) +'_rsMain'+str(self.random_seed)+ '_rs' + str(random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) +'_viaInitLayout'+str(do_initVia)+'_nEpochs'+str(n_epochs)+'_'+ str_date # save_str = '/home/user/Trajectory/Datasets/2MOrgan/viaumap_RW2_P1_Q10_rw2comp' + str( rw2_comp) + '_970_k' + str(self.knn) + 'kseq' + str( self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + 'rs' + str( random_state) + 'doGauss' + str(self.do_gaussian_kernel_edgeweights) +'viaInitTrue'+ str_date # save_str = '/home/user/Trajectory/Datasets/Zebrafish_Lange2023/via_umaps_all/viaumap_2000hvg_k' + str( self.knn) + 'TimeseriesAug' + str(self.time_series) + '_kseq' + str( self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str( random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) +'_distMet'+distance_metric[0:3]+ 'viaInit' + str(do_initVia) + str_date # save_str = '/home/user/Trajectory/Datasets/Zebrafish_Lange2023/RW2/embeddings/rw2_npz871_2000hvg_k' + str( self.knn) + 'TimeseriesAug' + str(self.time_series) + '_kseq' + str( self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + '_cluster_gp'+str(self.cluster_graph_pruning)+'_npc' + str(self.ncomp) + 'tdiff' + str( self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str( random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) + '_distMet' + distance_metric[ 0:3] + 'viaInit' + str( do_initVia) + str_date #df_umap.to_csv('/home/user/Trajectory/Datasets/MouseNeuron/2000hvg_viaumap_k'+str(self.knn)+'kseq'+str(self.knn_sequential)+'nps'+str(self.ncomp)+'tdiff'+str(self.t_diff_step)+'mindist'+str(min_dist)+'rs'+str(self.random_seed)+'stage'+str_date+".csv") # savefig_ = '/home/user/Trajectory/Datasets/MouseNeuron/2000hvg_viaumap_k'+str(self.knn)+'kseq'+str(self.knn_sequential)+'nps'+str(self.ncomp)+'tdiff'+str(self.t_diff_step)+'mindist'+str(min_dist)+'rs'+str(self.random_seed)+str_date+'stage.png' # save_str = '/home/user/Trajectory/Datasets/Qiu_Mouse_GastrulaPup/via-umap/5000hvg_viaumap_k'+str(self.knn)+'kseq'+str(self.knn_sequential)+'npcs'+str(self.ncomp)+'tdiff'+str(self.t_diff_step)+'doGaussianEdge'+str(self.do_gaussian_kernel_edgeweights)+'mindist'+str(min_dist)+'rs'+str(self.random_seed)+'_distmet'+distance_metric+str(str_date) save_str = '/home/user/Trajectory/Datasets/Packer_Elegans2019/via-umap/5000hvg_viaumap_k' + str( self.knn) + 'kseq' + str(self.knn_sequential) + 'npcs' + str( self.ncomp) + 'tdiff' + str(self.t_diff_step) + 'doGaussianEdge' + str( self.do_gaussian_kernel_edgeweights) + 'mindist' + str(min_dist) + 'rs' + str( self.random_seed) + '_distmet' + distance_metric + str(str_date) df_umap = pd.DataFrame(self.embedding) # df_umap.to_csv(save_str + '.csv') # f1.savefig(save_str + 'stage.png', facecolor='white', transparent=False) # savefig_ = '/home/user/Trajectory/Datasets/MouseNeuron/2000hvg_viaumap_k'+str(self.knn)+'kseq'+str(self.knn_sequential)+'nps'+str(self.ncomp)+'tdiff'+str(self.t_diff_step)+'mindist'+str(min_dist)+'rs'+str(self.random_seed)+str_date+'celltype.png' celltype_label = [i.split("_")[0] for i in self.true_label] # self.true_label f2, ax = plot_scatter(embedding=self.embedding, labels=celltype_label, title=title_umap, alpha=0.5, s=5, color_dict=self.color_dict) f2.set_size_inches(10, 10) # save_str = '/home/user/Trajectory/Datasets/Rashmi2023/emt/'+self.embedding_type+'_knn'+str(self.knn) # f2.savefig(save_str + 'celltype.png', facecolor='white', transparent=False) plt.show() elif self.embedding_type == 'via-mds': str_date = str(str(datetime.now())[-3:]) print(f'{datetime.now()}\tRun via-mds') # n_milestones = min(self.nsamples, max(10000, int(0.1*self.nsamples))) for rs_i in [self.random_seed]: for k_project_milestones in [3]: # ,5,10]: for k_mds_i in [25]: # [5,10,15,25]: for diffusion_i in [2]: # [2,5,10]: t_difference = self.t_diff_step k_seq_i = 2 n_milestones_mds = min(3000, self.data.shape[0]) self.embedding = via_mds(X_pca=self.data, n_milestones=n_milestones_mds, k=k_mds_i, knn_seq=k_seq_i, k_project_milestones=k_project_milestones, # neighbors_distances=self.full_neighbor_array, diffusion_op=diffusion_i, random_seed=rs_i, viagraph_full=self.csr_full_graph, t_difference=t_difference, time_series_labels=self.time_series_labels, saveto='', double_diffusion=False) str_date = str(str(datetime.now())[-3:]) # save_str = '/home/user/Trajectory/Datasets/Pijuan_Gastrulation/mds/viamds_singlediffusion_pcs' + str( shape_data) + '_k' + str(self.knn) + '_milestones' + str( n_milestones_mds) + '_kprojectmilestones' + str( k_project_milestones) + 't_step' + str(t_difference) + '_knnmds' + str( k_mds_i) + '_kseqmds' + str(k_seq_i) + '_kseq' + str( self.knn_sequential) + '_nps' + str(self.ncomp) + '_tdiff' + str(self.t_diff_step) + '_randseed' + str(self.random_seed) + '_diffusionop' + str( diffusion_i) + '_RsMds' + str(rs_i) + '_' + str_date # save_str = '/home/user/Trajectory/Datasets/Zebrafish_Lange2023/via_mds/viamds_singlediffusion_pcs' + str( shape_data) + '_k' + str(self.knn) + '_milestones' + str( n_milestones_mds) + '_kprojectmilestones' + str( k_project_milestones) + 't_step' + str(t_difference) + '_knnmds' + str( k_mds_i) + '_kseqmds' + str(k_seq_i) + '_kseq' + str( self.knn_sequential) + '_nps' + str(self.ncomp) + '_tdiff' + str( self.t_diff_step) + '_randseed' + str(self.random_seed) + '_diffusionop' + str( diffusion_i) + '_RsMds' + str(rs_i) + '_' + str_date # save_str = '/home/user/Trajectory/Datasets/Qiu_Mouse_GastrulaPup/mds/viamds_singlediffusion_pcs' + str( shape_data) + '_k' + str(self.knn) + '_milestones' + str( n_milestones_mds) + '_kprojectmilestones' + str( k_project_milestones) + 't_step' + str(t_difference) + '_knnmds' + str( k_mds_i) + '_kseqmds' + str(k_seq_i) + '_kseq' + str( self.knn_sequential) + '_nps' + str(self.ncomp) + '_tdiff' + str( self.t_diff_step) + '_randseed' + str(self.random_seed) + '_diffusionop' + str( diffusion_i) + '_RsMds' + str(rs_i) + '_' + str_date # save_str = '/home/user/Trajectory/Datasets/WagnerZebrafish/2000hvg/mds/singlediffusion_viamds/2000hvg_viamds_singlediffusion_pcs'+str(shape_data)+'_k' + str(self.knn) + '_milestones'+str(n_milestones_mds)+'_kprojectmilestones'+str(k_project_milestones)+'t_step'+str(t_difference)+'_knnmds' + str(k_mds_i) +'_kseqmds' + str(k_seq_i) +'_kseq'+str(self.knn_sequential)+'_nps' + str(self.ncomp) + '_tdiff' + str(self.t_diff_step)+'_randseed'+str(self.random_seed)+ '_diffusionop'+str(diffusion_i)+'_RsMds'+str(rs_i)+'_'+str_date # save_str = '/home/user/Trajectory/Datasets/Cao_ProtoVert/mds_theseare1000hvg/singlediffusion_viamds/1000hvg_viamds_singlediffusion_doExp'+str(self.do_gaussian_kernel_edgeweights)+'_k' + str(self.knn) + '_milestones'+str(n_milestones_mds)+'_kprojectmilestones'+str(k_project_milestones)+'t_stepmds'+str(t_difference)+'_knnmds' + str(k_mds_i) +'_kseqmds' + str(k_seq_i) +'_kseq'+str(self.knn_sequential)+'_nps' + str(self.ncomp) + '_tdiff' + str(self.t_diff_step)+'_randseed'+str(self.random_seed)+ '_diffusionop'+str(diffusion_i)+'_rsMds'+str(rs_i)+'_'+str_date # save_str = '/home/user/Trajectory/Datasets/EB_Phate/viamds_R2W_029_P1_Qp001_singlediffusion_prescaled_doExp' + str( self.do_gaussian_kernel_edgeweights) + '_k' + str(self.knn) + '_milestones' + str( n_milestones_mds) + '_kprojectmilestones' + str( k_project_milestones) + 't_stepmds' + str(t_difference) + '_knnmds' + str( k_mds_i) + '_kseqmds' + str(k_seq_i) + '_kseq' + str( self.knn_sequential) + '_npc' + str(self.ncomp) + '_tdiff' + str( self.t_diff_step) + '_randseed' + str(self.random_seed) + '_diffusionop' + str( diffusion_i) + '_rsMds' + str(rs_i) + '_' + str_date # save_str = '/home/user/Trajectory/Datasets/MEF_Schiebinger/viamds_singlediffusion_prescaled_doExp'+str(self.do_gaussian_kernel_edgeweights)+'_k' + str(self.knn) + '_milestones'+str(n_milestones_mds)+'_kprojectmilestones'+str(k_project_milestones)+'t_stepmds'+str(t_difference)+'_knnmds' + str(k_mds_i) +'_kseqmds' + str(k_seq_i) +'_kseq'+str(self.knn_sequential)+'_npc' + str(self.ncomp) + '_tdiff' + str(self.t_diff_step)+'_randseed'+str(self.random_seed)+ '_diffusionop'+str(diffusion_i)+'_rsMds'+str(rs_i)+'_'+str_date print(f'{datetime.now()}\tCompleted via-mds') # save_str = '/home/user/Trajectory/Datasets/Qiu_Mouse_GastrulaPup/mds/' + self.embedding_type + '_knn' + str( self.knn) + '_projectmilestones' + str(k_project_milestones) + '_kmds' + str( k_mds_i) + '_diffusion' + str(diffusion_i)+'_strdate'+str(str_date) save_str = '/home/user/Trajectory/Datasets/Packer_Elegans2019/via-mds/' + self.embedding_type + '_knn' + str( self.knn) + '_projectmilestones' + str(k_project_milestones) + '_kmds' + str( k_mds_i) + '_diffusion' + str(diffusion_i) + '_strdate' + str(str_date) df_mds = pd.DataFrame(self.embedding) df_mds.to_csv(save_str + ".csv") celltype_label = [i.split("_")[0] for i in self.true_label] # self.true_label f1, ax = plot_scatter(embedding=self.embedding, labels=celltype_label, title='via-mds', alpha=0.5, cmap='rainbow', text_labels=True) f1.set_size_inches(10, 10) #f1.savefig(save_str + '_celltype.png', facecolor='white', transparent=False) if self.time_series_labels is not None: f2, ax2 = plot_scatter(embedding=self.embedding, labels=self.time_series_labels, title='via-mds', alpha=0.5, cmap='plasma') f2.set_size_inches(10, 10) #f2.savefig(save_str + '_stage.png', facecolor='white', transparent=False) elif self.embedding_type == 'via-force': print(f'{datetime.now()}\tRun via-force') # n_milestone = min(self.nsamples, max(5000, int(0.1*self.nsamples))) n_milestones = 3000 self.embedding = via_forcelayout(X_pca=self.data, viagraph_full=self.csr_full_graph, k=10, knn_seq=10, n_milestones=n_milestones) if self.time_series_labels is None: color_labels = self.true_label else: color_labels = self.time_series_labels if (isinstance(color_labels[0], str)) == True: categorical = True else: categorical = False plot_scatter(embedding=self.embedding, labels=color_labels, title='via-force', categorical=categorical) plot_scatter(embedding=self.embedding, labels=self.true_label, title='via-mds', categorical=True) plt.show() else: print( f'{datetime.now()}\tNo embedding will be computed: if you wish to compute a via-embedding specify one of via-force, via-mds or via-umap') if self.edgebundle_pruning_twice == False: # print('creating bundle with single round of global pruning at a level of', self.edgebundle_pruning) print(f"{datetime.now()}\tStarting make edgebundle viagraph...") self.hammerbundle_cluster, layout = make_edgebundle_viagraph(layout, g_layout, decay=self.viagraph_decay) # simplifying structure of edges used on the visual layout edgeweights_maxout_2, edgelist_maxout_2, comp_labels_2 = pruning_clustergraph( final_transition_matrix_all_components, global_pruning_std=self.visual_cluster_graph_pruning, max_outgoing=self.max_visual_outgoing_edges, preserve_disconnected=self.preserve_disconnected) if self.edgebundle_pruning_twice == True: print( f"{datetime.now()}\tAdditional Visual cluster graph pruning for edge bundling at level: {self.visual_cluster_graph_pruning}") layout_g = ig.Graph(self.edgelist, edge_attrs={'weight': edgeweights_maxout_2}).simplify( combine_edges='sum') layout_g_csr = get_sparse_from_igraph(layout_g, weight_attr='weight') weights_for_layout = np.asarray(layout_g_csr.data) # clip weights to prevent distorted visual scale in layout weights_for_layout = np.clip(weights_for_layout, np.percentile(weights_for_layout, 10), np.percentile(weights_for_layout, 90)) weights_for_layout = list(weights_for_layout) graph_for_layout = ig.Graph(list(zip(*layout_g_csr.nonzero())), edge_attrs={'weight': weights_for_layout}) self.clustergraph_igraph_forlayout = graph_for_layout # edge bundle is based on the visually (double) pruned graph rather than the inital graph used for Velo and Pseudotime self.hammerbundle_cluster, layout = make_edgebundle_viagraph(layout=layout, graph=graph_for_layout, decay=self.viagraph_decay) ''' else: print(f'redoing cluster graph layout based on forward biased edges') layout_g = ig.Graph(edgelist_maxout_2, edge_attrs={'weight': bias_weights_2_all}).simplify( combine_edges='sum') layout_g_csr = get_sparse_from_igraph(layout_g, weight_attr='weight') weights_for_layout = np.asarray(layout_g_csr.data) # clip weights to prevent distorted visual scale in layout weights_for_layout = np.clip(weights_for_layout, np.percentile(weights_for_layout, 10), np.percentile(weights_for_layout, 90)) weights_for_layout = list(weights_for_layout) graph_for_layout = ig.Graph(list(zip(*layout_g_csr.nonzero())), edge_attrs={'weight': weights_for_layout}) # edge bundle is based on the visually (double) pruned graph rather than the inital graph used for Velo and Pseudotime self.hammerbundle_cluster, layout = make_edgebundle_viagraph(layout, graph_for_layout) ''' temp_csr = csr_matrix((np.array(edgeweights_maxout_2), tuple(zip(*edgelist_maxout_2))), shape=(n_clus, n_clus)) temp_csr = temp_csr.transpose().todense() + temp_csr.todense() temp_csr = np.tril(temp_csr, -1) # elements along the main diagonal and above are set to zero temp_csr = csr_matrix(temp_csr) edgeweights_maxout_2 = temp_csr.data scale_factor = max(edgeweights_maxout_2) - min(edgeweights_maxout_2) edgeweights_maxout_2 = [((wi + .1) * 2.5 / scale_factor) + 0.1 for wi in edgeweights_maxout_2] sources, targets = temp_csr.nonzero() edgelist_maxout_2 = list(zip(sources.tolist(), targets.tolist())) self.edgelist_maxout = edgelist_maxout_2 self.edgeweights_maxout = edgeweights_maxout_2 remove_outliers = hitting_times threshold = np.percentile(remove_outliers, 95) # np.mean(remove_outliers) + 1* np.std(remove_outliers) th_hitting_times = [x if x < threshold else threshold for x in hitting_times] remove_outliers_low = hitting_times[hitting_times < (np.mean(hitting_times) - 0.3 * np.std(hitting_times))] threshold_low = 0 if remove_outliers_low.size else np.percentile(remove_outliers_low, 5) th_hitting_times = [x if x > threshold_low else threshold_low for x in th_hitting_times] scaled_hitting_times = (th_hitting_times - np.min(th_hitting_times)) npmax = np.max(scaled_hitting_times) or 1 scaled_hitting_times = scaled_hitting_times * (1000 / npmax) self.scaled_hitting_times = scaled_hitting_times # cluster mcmc pt cluster level scaled_hitting_times = scaled_hitting_times.astype(int) pal = ig.drawing.colors.AdvancedGradientPalette(['yellow', 'green', 'blue'], n=1001) # making a new "augmented" single-cell graph based on the computed pseudotimes - potentially useful when there are no time-series labels for (optionally) guiding the graph structure use_pt_to_guide_graph = False if use_pt_to_guide_graph == True: print(f'{datetime.now()}\tMake pt-augmented knn') pt_augmented_adjacency_igraph, adjacency_augmented = self._make_pt_augmented_adjacency_igraph( neighbors=neighbors, distances=distances, k_reverse=10, knn=20) print(f'{datetime.now()}\tRun via-umap on pt-augmented knn') # graph=csr_full_graph self.embedding = via_atlas_emb(X_input=self.data, graph=self.adjacency_pt_augmented, n_epochs=100, spread=1, distance_metric='euclidean', min_dist=0.3, saveto='/home/user/Trajectory/Datasets/HumanCD34/pt-aug/not_ptaug_umap_.csv') # usually min_dist default =0.1, for cd34 0.8 ''' self.embedding = via_mds(X_pca=self.data, k=15, t_diffusion=2, n_milestones=min(self.nsamples, max(10000, int(0.1 * self.nsamples))), viagraph_full=adjacency_pt_augmented, time_series_labels=[int(i*10) for i in self.single_cell_pt_markov], saveto='') ''' all_colors = [] # TODO clean up this paragraph/put into a separate function for i in scaled_hitting_times: all_colors.append(pal.get(int(i))[0:3]) locallytrimmed_g.vs['hitting_times'] = scaled_hitting_times locallytrimmed_g.vs['color'] = [pal.get(i)[0:3] for i in scaled_hitting_times] self.group_color = [colors.to_hex(v) for v in locallytrimmed_g.vs['color']] # based on ygb scale viridis_cmap = cm.get_cmap('viridis_r') self.group_color_cmap = [colors.to_hex(v) for v in viridis_cmap(scaled_hitting_times / 1000)] # based on ygb scale self.graph_node_label = df_graph['graph_node_label'].values self.edgeweight = [e['weight'] * 1 for e in locallytrimmed_g.es] # print('To draw viagraph edges without bundling and plot piechart composition of cell types: use self.draw_piechart_graph_nobundle()') # self.draw_piechart_graph_nobundle() # print('drawing with bundle piechart... Use self.draw_piechart_graph() to reproduce this plot') initial_bandwidth = 0.05 # 0.05 decay = 0.7 # 0.7 n_milestones = 150 # f you min(self.nsamples, max(500, int(0.1 * self.nsamples))) # print(f'milestones for edgebundling {n_milestones}') global_visual_pruning = 0.15 extra_title_text = 'jac_visual:' + str(global_visual_pruning) + ' bw:' + str( initial_bandwidth) + ' decay:' + str(decay) # sc_graph = self.csr_full_graph self.hammerbundle_milestone_dict = None if self.embedding is not None: print( f"{datetime.now()}\tStart making edgebundle milestone with {n_milestones} milestones...This can be recomputed with make_edgebundle_milestone()") self.hammerbundle_milestone_dict = make_edgebundle_milestone(embedding=self.embedding, sc_graph=self.ig_full_graph, n_milestones=n_milestones, global_visual_pruning=global_visual_pruning, initial_bandwidth=initial_bandwidth, decay=decay, weighted=True, sc_labels_numeric=self.time_series_labels, sc_pt=self.single_cell_pt_markov, terminal_cluster_list=self.terminal_clusters, single_cell_lineage_prob=self.single_cell_bp) self.labels = list(self.labels) for tsi in self.terminal_clusters: loc_i = np.where(np.asarray(self.labels) == tsi)[0] val_pt = [self.single_cell_pt_markov[i] for i in loc_i] th_pt = np.percentile(val_pt, 50) # 50 loc_i = [loc_i[i] for i in range(len(val_pt)) if val_pt[i] >= th_pt] temp = np.mean(self.data[loc_i], axis=0) labelsq, distances = self.knn_struct.knn_query(temp, k=1) tsi_list.append(labelsq[0][0]) if self.embedding is not None: print('REMEMBER TO RE-INCLUDE the PLT.SHOW HERE - COMMENTING IT OUT FOR NOW') plot_scatter(embedding=self.embedding, labels=self.single_cell_pt_markov, sc_index_terminal_states=tsi_list, title='pseudotime and terminal states', cmap='plasma', true_labels=self.true_label) plot_scatter(embedding=self.embedding, labels=self.true_label, sc_index_terminal_states=tsi_list, title='lineage and terminal states', cmap='rainbow', true_labels=self.labels) #plt.show() return def accuracy(self, onevsall=1): true_labels = self.true_label Index_dict = {} PARC_labels = self.labels N = len(PARC_labels) n_cancer = list(true_labels).count(onevsall) n_pbmc = N - n_cancer for k in range(N): Index_dict.setdefault(PARC_labels[k], []).append(true_labels[k]) num_groups = len(Index_dict) sorted_keys = list(sorted(Index_dict.keys())) error_count = [] pbmc_labels = [] thp1_labels = [] fp, fn, tp, tn, precision, recall, f1_score = 0, 0, 0, 0, 0, 0, 0 for kk in sorted_keys: vals = [t for t in Index_dict[kk]] majority_val = self.func_mode(vals) # if majority_val == onevsall: print('cluster', kk, ' has majority', onevsall, 'with population', len(vals)) if kk == -1: len_unknown = len(vals) # print('len unknown', len_unknown) if (majority_val == onevsall) and (kk != -1): thp1_labels.append(kk) fp = fp + len([e for e in vals if e != onevsall]) tp = tp + len([e for e in vals if e == onevsall]) list_error = [e for e in vals if e != majority_val] e_count = len(list_error) error_count.append(e_count) elif (majority_val != onevsall) and (kk != -1): pbmc_labels.append(kk) tn = tn + len([e for e in vals if e != onevsall]) fn = fn + len([e for e in vals if e == onevsall]) error_count.append(len([e for e in vals if e != majority_val])) predict_class_array = np.array(PARC_labels) PARC_labels_array = np.array(PARC_labels) number_clusters_for_target = len(thp1_labels) for cancer_class in thp1_labels: predict_class_array[PARC_labels_array == cancer_class] = 1 for benign_class in pbmc_labels: predict_class_array[PARC_labels_array == benign_class] = 0 predict_class_array.reshape((predict_class_array.shape[0], -1)) error_rate = sum(error_count) / N n_target = tp + fn tnr = tn / n_pbmc fnr = fn / n_cancer tpr = tp / n_cancer fpr = fp / n_pbmc if tp != 0 or fn != 0: recall = tp / (tp + fn) # ability to find all positives if tp != 0 or fp != 0: precision = tp / (tp + fp) # ability to not misclassify negatives as positives if precision != 0 or recall != 0: f1_score = precision * recall * 2 / (precision + recall) majority_truth_labels = np.empty((len(true_labels), 1), dtype=object) for cluster_i in set(PARC_labels): cluster_i_loc = np.where(np.asarray(PARC_labels) == cluster_i)[0] true_labels = np.asarray(true_labels) majority_truth = self.func_mode(list(true_labels[cluster_i_loc])) majority_truth_labels[cluster_i_loc] = majority_truth majority_truth_labels = list(majority_truth_labels.flatten()) accuracy_val = [error_rate, f1_score, tnr, fnr, tpr, fpr, precision, recall, num_groups, n_target] return accuracy_val, predict_class_array, majority_truth_labels, number_clusters_for_target def run_VIA(self): print( f'{datetime.now()}\tRunning VIA over input data of {self.data.shape[0]} (samples) x {self.data.shape[1]} (features)') print(f'{datetime.now()}\tKnngraph has {self.knn} neighbors') self.knn_struct = _construct_knn(self.data, knn=self.knn, distance=self.distance, num_threads=self.num_threads) st = time.time() self.run_subVIA() run_time = time.time() - st print(f'{datetime.now()}\tTime elapsed {round(run_time, 1)} seconds') do_accuracy = False if do_accuracy: print(f'{datetime.now()}\tCalculating accuracy scores for clusters in the StaVia graph. saved to via_object.stats_df as a dataframe') targets = list(set(self.true_label)) targets.sort() N = len(self.true_label) self.f1_accumulated = 0 self.f1_mean = 0 self.stats_df = pd.DataFrame( {'edgepruning_clustering_resolution': [self.edgepruning_clustering_resolution], 'edgepruning_clustering_resolution_local': [self.edgepruning_clustering_resolution_local], 'runtime(s)': [run_time]}) list_roc = [] if len(targets) > 1: f1_accumulated, f1_acc_noweighting = 0, 0 for onevsall_val in targets: # print('target is', onevsall_val) vals_roc, predict_class_array, majority_truth_labels, numclusters_targetval = \ self.accuracy(onevsall=onevsall_val) f1_current = vals_roc[1] f1_accumulated = f1_accumulated + f1_current * (list(self.true_label).count(onevsall_val)) / N f1_acc_noweighting = f1_acc_noweighting + f1_current list_roc.append([self.edgepruning_clustering_resolution, self.edgepruning_clustering_resolution_local, onevsall_val] + vals_roc + [numclusters_targetval] + [run_time]) f1_mean = f1_acc_noweighting / len(targets) df_accuracy = pd.DataFrame(list_roc, columns=['edgepruning_clustering_resolution', 'edgepruning_clustering_resolution_local', 'onevsall-target', 'error rate', 'f1-score', 'tnr', 'fnr', 'tpr', 'fpr', 'precision', 'recall', 'num_groups', 'population of target', 'num clusters', 'clustering runtime']) # df_accuracy.to_csv('/home/user/Trajectory/Datasets/Cao_ProtoVert/df_accuracy_.csv') self.f1_accumulated = f1_accumulated self.f1_mean = f1_mean self.stats_df = df_accuracy