Source code for VIA.core

import numpy as np
import pandas as pd
import scipy
from scipy import sparse
from scipy.sparse import csr_matrix, csgraph, find
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components
import igraph as ig
import leidenalg
import time
from datetime import datetime
import hnswlib
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from matplotlib.path import get_path_collection_extents
import multiprocessing
from collections import Counter
from typing import Optional, Union

from pyVIA.plotting_via import *
from pyVIA.utils_via import *
from pyVIA.utils_via import _construct_knn, sequential_knn

#from plotting_via import *
#from utils_via import *
#from utils_via import _construct_knn, sequential_knn
from sklearn.preprocessing import normalize
import math

from pecanpy import pecanpy as node2vec

### to upload to pip core via2 file


def _composite_jacAffinity_distanceAffinity(sc_csr_graph: csr_matrix, projected_distances: ndarray = None):
    '''
    make a composite score for distance based affinity and  jaccard score
    :param sc_csr_graph: distances based affinities (.data is affinity)
    :param projected_distances: if using projected distances adjusted based on velocity then provide them
    :return: igraph of affinities based on Jaccard and distance-based affinities
    '''
    edges_ = list(zip(*sc_csr_graph.nonzero()))  # in the case of time-series, this is the augmented graph
    # Jaccard computation in igraph does not consider weights. However, doing the global pruning on distance based weights seems less effective than pruning on Jaccard similarities
    jac_edgeweights_list = ig.Graph(edges_, edge_attrs={'weight': sc_csr_graph.data}).similarity_jaccard(
        pairs=edges_)

    # the graph has self loops, so we need to clip these values as they have a jacc of 1, and much higher than other neighbors
    jac_edgeweights_list = np.clip(np.array(jac_edgeweights_list), a_max=np.percentile(np.array(jac_edgeweights_list),
                                                                                       95), a_min=None).tolist()

    if projected_distances is None:
        composite_edgeweights = np.multiply(sc_csr_graph.data, np.array(
            jac_edgeweights_list))  # 0<J<1 #combining distance-based affinities and jac-based affinities
    else:

        composite_edgeweights = np.multiply(projected_distances, np.array(
            jac_edgeweights_list))

    composite_igraph = ig.Graph(edges_, edge_attrs={'weight': composite_edgeweights.tolist()}).simplify(
        combine_edges='sum')  # used for clustergraph #in the case of time-series, this is the augmented graph

    return composite_igraph


def _get_loc_terminal_states(via0, X_input: ndarray):
    '''

    we need the location of terminal states from first iteration (Via0) to pass onto the second iterations of Via (Via1)
    this will allow identification of the terminal-cluster in fine-grained Via1 that best captures the terminal state from coarse Via0

    :param via0: coarse grained iteration of object class VIA
    :param X_input: via0.data. the data matrix (PCs) on which the TI is inferred
    :return:
    '''

    tsi_list = []  # find the single-cell which is nearest to the average-location of a terminal cluster in PCA space
    for tsi in via0.terminal_clusters:
        loc_i = np.where(np.asarray(via0.labels) == tsi)[0]
        val_pt = [via0.single_cell_pt_markov[i] for i in loc_i]
        th_pt = np.percentile(val_pt, 50)  # 50
        loc_i = [loc_i[i] for i in range(len(val_pt)) if val_pt[i] >= th_pt]
        temp = np.mean(X_input[loc_i], axis=0)
        labelsq, distances = via0.knn_struct.knn_query(temp, k=1)
        tsi_list.append(labelsq[0][0])
    return tsi_list

def _prob_reaching_terminal_state(terminal_state, A, root, n_simulations, q,
                                 cumstateChangeHist, cumstateChangeHist_all, seed):
    # this function is defined outside the VIA class to enable smooth parallel processing in Windows

    n = A.shape[0]

    A /= np.max(A)
    for i, r in enumerate(A):
        if np.all(r == 0):
            A[i, i] = 1
    # ensure probabilities sum to 1 along the rows
    P = A / A.sum(axis=1).reshape((n, 1))

    n_steps, count_reach_terminal_state = 2 * n, 0
    for _ in range(n_simulations):
        cur_state = root
        change_hist = np.zeros((n, n))
        change_hist[root, root] = 1

        x, terminal_state_found = 0, False

        while x < n_steps and not terminal_state_found:
            seed += 1
            np.random.seed(seed)
            next_state = np.random.choice(range(P.shape[0]), p=P[cur_state])
            if next_state == terminal_state:
                terminal_state_found = True

            change_hist[cur_state, next_state] += 1
            cur_state = next_state
            x += 1

        if terminal_state_found:
            cumstateChangeHist += np.any(change_hist > 0, axis=0)
            count_reach_terminal_state += 1
        cumstateChangeHist_all += np.any(change_hist > 0, axis=0)

    cumstateChangeHist_all[cumstateChangeHist_all == 0] = 1
    q.append([cumstateChangeHist, cumstateChangeHist_all])


def _rw2_walks(A: ndarray, root: int, memory: float = 0.8, weighted: bool = True, implicit_ids: bool = False,
              num_walks: int = 100, p_memory: float = 1.0, x_lazy: float = 0.95,
              alpha_teleport: float = 0.99):
    '''
    :param A: cluster level transition matrix (forward biased)
    :param root: root cluster index
    :param memory: 1/q * edge weight to a next-node that is not a neighbor of previous node. larger number means more memory and more introspective walk. small number <1 means more exploration
    :param p_memory: 1/p * edge weight to next node = previous node
    :param weighted: default True edge weights are provided as part of the array A
    :param walk_length: default None (walk_length = num_nodes *2)
    :return: implicit_ids
    '''
    from pecanpy.graph import AdjlstGraph
    # initialize SparseGraph object
    # g = AdjlstGraph()
    # setting up the lazy-teleporting behaviour
    n_states = A.shape[0]
    P = A / A.sum(axis=1).reshape((n_states, 1))
    # print('row normed P',P.shape, P, P.sum(axis=1))

    # bias_P is the transition probability matrix

    P = x_lazy * P + (1 - x_lazy) * np.identity(n_states)
    # print(P, P.sum(axis=1))
    P = alpha_teleport * P + ((1 - alpha_teleport) * (1 / n_states) * (np.ones((n_states, n_states))))

    from numba_progress import ProgressBar
    A_csr = csr_matrix(A)  # csr_matrix(P) #use P if you want to add in the Lazy-teleporting. I find using A directly works a bit better.
    # print(f'A_csr {A_csr}')
    # print(f'A dense {A}')
    g = node2vec.SparseOTF(p=p_memory, q=memory, workers=4, verbose=True,
                           extend=True)  # larger q, more memory and introverted search #
    # using the extend=True which is the node2vec+ implementation from Jan 2023 Renming Liu Bioinformatics paper that differentiates between weakly connected (prev, next) and strongly connected (prev, next)
    # https://pecanpy.readthedocs.io/en/latest/pecanpy.html
    g.indptr = A_csr.indptr  # ["indptr"].astype(np.uint32)
    g.indices = A_csr.indices  # ["indices"].astype(np.uint32)
    g.data = A_csr.data  # ["data"].astype(np.float32)

    if g.data is None:
        raise ValueError("Adjacency matrix data not found.")
    elif not weighted:
        print('no weights so assign 1 to all edges')
        g.data[:] = 1.0  # overwrite edge weights with constant
    num_nodes = g.indptr.size
    walk_length = num_nodes * 2
    #print(f'g.indptr.size, {num_nodes}')
    # look deeper at how the node id mapping is done: https://pecanpy.readthedocs.io/en/latest/_modules/pecanpy/graph.html#DenseGraph.read_edg
    g.set_node_ids(node_ids=None,
                   implicit_ids=implicit_ids,
                   num_nodes=int(num_nodes - 1))

    # generate random walks, which could then be used to train w2v
    # walks = g.simulate_walks(num_walks=2, walk_length=3)
    # https://pecanpy.readthedocs.io/en/latest/_modules/pecanpy/pecanpy.html#Base.simulate_walks
    g._preprocess_transition_probs()

    nodes = np.array([root], dtype=np.uint32)  # range(g.num_nodes
    # print(f'nodes {range(g.num_nodes)} and nodes {nodes}')
    start_node_idx_ary = np.concatenate([nodes] * num_walks)  # root<=> nodes
    # print(f'start_node_array {start_node_idx_ary}')
    tot_num_jobs = start_node_idx_ary.size

    random_state = g.random_state
    np.random.seed(random_state)
    np.random.shuffle(start_node_idx_ary)  # for balanced work load

    move_forward = g.get_move_forward()
    has_nbrs = g.get_has_nbrs()
    verbose = g.verbose

    # Acquire numba progress proxy for displaying the progress bar
    with ProgressBar(total=tot_num_jobs, disable=not verbose) as progress:
        walk_idx_mat = g._random_walks(
            tot_num_jobs,
            walk_length,
            random_state,
            start_node_idx_ary,
            has_nbrs,
            move_forward,
            progress,
        )
    # print(f'walk mat {walk_idx_mat}')
    # Map node index back to node ID
    walks = [g._map_walk(walk_idx_ary) for walk_idx_ary in walk_idx_mat]
    walks = np.asarray(walks).astype(int)
    # print(f'via walks and shape of n_walks x n_steps {walks.shape}') #ndarray n_walks x n_steps

    # emd = g.embed(num_walks=5, walk_length=num_nodes*2)
    # print(f'emd {emd.shape} {emd}')
    return walks


def _compute_rw2_lineageprobability(A: ndarray, root: int, memory: float = 0.8, p_memory: float = 1.0,
                                   num_walks: int = 1000, terminal_states: list = None, x_lazy: float = 0.95,
                                   alpha_teleport: float = 0.99):
    print(f'{datetime.now()}\tCalculating lineage probability at memory {memory}')
    n_states = A.shape[0]
    n_terminal_states = len(terminal_states)
    ts_index_counter = 0
    prob_mat = np.zeros((n_states, n_terminal_states))
    walks = _rw2_walks(A=A, root=root, memory=memory, num_walks=num_walks, p_memory=p_memory, x_lazy=x_lazy,
                      alpha_teleport=alpha_teleport)

    # if n_steps is None: n_steps = 2 * n_states
    n_steps = walks.shape[1] - 1
    for terminal_state in terminal_states:

        cumstateChangeHist = np.zeros((1, n_states))
        cumstateChangeHist_all = np.zeros((1, n_states))
        terminal_state_found_inAllWalks = False

        count_reached_bool = 0
        count_reach_terminal_state = 0
        for row in walks:
            change_hist = np.zeros((n_states, n_states))
            change_hist[root, root] = 1
            x = 0
            # print(f'walk {row}')
            terminal_state_found = False
            while x < n_steps and not terminal_state_found:
                cur_state = row[x]
                next_state = row[x + 1]
                if next_state == terminal_state:
                    terminal_state_found = True
                    count_reached_bool += 1

                change_hist[cur_state, next_state] += 1
                x += 1

            if terminal_state_found:
                cumstateChangeHist += np.any(change_hist > 0, axis=0)
                count_reach_terminal_state += 1
            cumstateChangeHist_all += np.any(change_hist > 0, axis=0)
        # print(f'count reached bool {count_reached_bool}')
        cumstateChangeHist_all[
            cumstateChangeHist_all == 0] = 1  # to avoid division by zero we say that the state has been visited once
        count_reached = cumstateChangeHist_all[0, terminal_state]
        # print(f'terminal state HistAll {terminal_state} is reached {count_reached} times')
        count_reached = cumstateChangeHist[0, terminal_state]
        # print(f'terminal state {terminal_state} is reached {count_reached} times')
        # print( f"{datetime.now()}\tFrom root {root},  TS {terminal_state} is reached {int(count_reached)} times.")
        np.set_printoptions(precision=3)
        prob_ = cumstateChangeHist / cumstateChangeHist_all
        # print(f'cumstatechangeHist {cumstateChangeHist}')
        # print(f'cumstatechangeHist_all {cumstateChangeHist_all}')
        # print(f'prob_ pre scaling {prob_}')
        # given a cluster is traversed (cumhistory_vec_all), how many of these traversals led to a successful path to Terminal State X
        # i.e. with what probability does traversal of a cluster lead to the desired terminal state (number of times traversed along a sucessful path)  #cumhistory_vec: number of times cluster is traversed on a successful path

        if count_reached == 0:
            prob_[:, terminal_state] = 0
            print(
                f'{datetime.now()}\t{terminal_state} cluster/state is never reached. try increase number of KNN (which will increase number of edges) or try to increase the value of jac_std_global and cluster_graph_pruning which will lower edge pruning')
        else:
            loc_1 = np.where(prob_ == 1)
            loc_1 = loc_1[1]
            prob_[0, loc_1] = 0
            # print('zerod out prob', prob_)
            temp_ = np.max(prob_)
            if temp_ == 0: temp_ = 1
            prob_ = prob_ / min(1,
                                1.1 * temp_)  # do some scaling to amplify the probabiltiies of those clusters that are not 1 to make the range of values more compact
            prob_[0, loc_1] = 1  # put back probability =1
            print(
                f'{datetime.now()}\tCluster or terminal cell fate {terminal_state} is reached {count_reached} times')  # {prob_}
        prob_mat[:, ts_index_counter] = prob_
        ts_index_counter += 1

    return prob_mat


def _compute_rw2_hittingtimes(A: ndarray, root: int, memory: float = 0.8, num_walks: int = 1300, x_lazy: float = 0.95,
                             alpha_teleport: float = 0.99):
    '''

    :param walks: ndarray walks of shape n_walks x n_steps
    :param A: ndarray of clustergraph (values are edge weights, higher means stronger edge)
    :return:
    '''
    # from scipy.sparse import csc_matrix
    # A_csr = csr_matrix(A)
    # print(f'cluster graph A', A)
    # generate ndarray of walks (n_walks x n_steps+1)
    print(f'{datetime.now()}\tstart computing walks with rw2 method')
    walks = _rw2_walks(A=A, root=root, memory=memory, num_walks=num_walks, x_lazy=x_lazy, alpha_teleport=alpha_teleport)
    num_nodes = A.shape[0]
    num_walks = walks.shape[0]
    walk_length = walks.shape[                      1] - 1  # (the first element in the walk is the root, so the walks array has n_steps + 1 element
    hitting_array = np.zeros((num_nodes, num_walks))

    walk_count = 0
    for walk_i in walks:
        walk_i_dist_cumulative = [0]
        # print(f'walk_i {walk_i}')
        for step in range(walk_length):
            edge_weight = A[walk_i[step], walk_i[step + 1]]
            # print(f'edge weight {edge_weight}')
            edge_distance = 1. / (1 + math.exp(edge_weight - 1))  # convert to distance
            walk_i_dist_cumulative.append(walk_i_dist_cumulative[-1] + edge_distance)
            # print(f'edge distance',edge_distance)
            # print(f'cumulative walk {walk_i_dist_cumulative}')
        for node_i in range(num_nodes):
            # print(f'node {node_i}')
            first_time_at_statei = np.where(walk_i == node_i)[0]

            if len(first_time_at_statei) == 0:
                hitting_array[node_i, walk_count] = walk_length + 1
            else:
                # print(f'first time hitting node {node_i} is {first_time_at_statei[0]}')
                hitting_array[node_i, walk_count] = walk_i_dist_cumulative[first_time_at_statei[0]]
        walk_count += 1
    # print(f'hitting_array {hitting_array}')

    # extract relevant hitting times
    hitting_array_final = np.zeros((1, num_nodes))
    no_times_state_reached_array = np.zeros((1, num_nodes))
    for i in range(num_nodes):
        rowtemp = hitting_array[i, :]
        no_times_state_reached_array[0, i] = np.sum(rowtemp != (walk_length + 1))

    for i in range(num_nodes):
        rowtemp = hitting_array[i, :]
        no_times_state_reached = np.sum(rowtemp != (walk_length + 1))
        if no_times_state_reached != 0:
            perc = np.percentile(rowtemp[rowtemp != walk_length + 1], 20) + 0.001  # 15 for Human and Toy
            # print('state ', i,' has perc' ,perc)
            hitting_array_final[0, i] = np.mean(rowtemp[rowtemp <= perc])
        else:
            hitting_array_final[0, i] = (walk_length + 1)

    return hitting_array_final[0]


def _simulate_markov_sub(A, num_sim, hitting_array, q, root, seed):
    # sub process that actually does the mcmcs
    n_states = A.shape[0]
    P = A / A.sum(axis=1).reshape((n_states, 1))

    hitting_array_temp = np.zeros((P.shape[0], 1)).astype('float64')
    n_steps = int(2 * n_states)

    currentState = root

    state = np.zeros((1, n_states))
    state[0, currentState] = 1
    state_root = state.copy()
    for i in range(num_sim):
        dist_list = []
        state = state_root
        currentState = root
        stateHist = state
        for x in range(n_steps):
            seed += 1
            np.random.seed(seed)
            nextState = np.random.choice(range(P.shape[0]), p=P[currentState])
            dist = A[currentState, nextState]
            dist_list.append(1. / (1 + math.exp(dist - 1)))

            state = np.zeros((1, n_states))
            state[0, nextState] = 1.0
            currentState = nextState
            # Keep track of state history
            stateHist = np.append(stateHist, state, axis=0)

        for state_i in range(P.shape[0]):
            first_time_at_statei = np.where(stateHist[:, state_i] == 1)[0]
            if len(first_time_at_statei) == 0:
                hitting_array_temp[state_i, 0] = n_steps + 1
            else:
                total_dist = 0
                for ff in range(first_time_at_statei[0]):
                    total_dist = dist_list[ff] + total_dist

                hitting_array_temp[state_i, 0] = total_dist  # first_time_at_statei[0]
        hitting_array = np.append(hitting_array, hitting_array_temp, axis=1)

    hitting_array = hitting_array[:, 1:]
    q.append(hitting_array)


def get_biased_weights(edges, weights, pt, round=1):
    # small nu means less forward biasing (0.5 is quite mild)
    # larger nu (in our case 1/nu) means more aggressive forwards biasing https://en.wikipedia.org/wiki/Generalised_logistic_function

    # using the pseudotime calculated from lazy-jumping walk. Otherwise using the refined MCMC Psuedotimes before
    # calculating lineage likelihood paths
    b = 1 if round == 1 else 20

    weights_thr, pct_thr = weights.mean(), np.percentile(pt, 80)
    loc_high_pt = set(np.where(pt > pct_thr)[0])
    for i in np.where(weights > weights_thr)[0]:
        start, end = edges[i]
        if start in loc_high_pt or end in loc_high_pt:
            weights[i] = 0.5 * weights.mean()
    weights.clip(np.percentile(weights, 10), np.percentile(weights, 90))

    bias_weight, K, c, C, nu = [], 1, 0, 1, 1
    for (s, t), w in zip(edges, weights):
        t_ab = pt[s] - pt[t]
        bias_weight.append(w * K / (C + math.exp(b * (t_ab + c))) ** nu)

    return bias_weight


def expected_num_steps(start_i, N):
    return np.dot(N, np.ones(N.shape[0]))[start_i]


def absorption_probability(N, R, absorption_state_j):
    M = np.dot(N, R)
    return M, M[:, absorption_state_j]


[docs]class VIA:
    '''

    A class to represent the VIA analysis

    Parameters
    ----------

    data: ndarray
        input matrix of size n_cells x n_dims. Expects the PCs or features that will be used in the TI computation. Can be e.g. adata.obsm['X_pca][:,0:20]
    true_label:list
        list of str/int that correspond to the ground truth or reference annotations. Can also be None when no labels are available
    labels: ndarray (nsamples, )
        default is None. and PARC clusters are used for the viagraph. alternatively provide a list of clustermemberships that are integer values (not strings) to construct the viagraph using another clustering method or available annotations
    edgepruning_clustering_resolution_local:float
        default = 2
        local level of pruning for PARC graph clustering stage. Range (0.1,3) higher numbers mean more edge retention. For large datasets can stick to just tuning edgepruning_clustering_resolution
    edgepruning_clustering_resolution: float
        (optional, default = 0.15, can also set as 'median') graph pruning for PARC clustering stage. Higher value keeps more edges, results in fewer clusters. Smaller value removes more edges and results in more clusters. Number of standard deviations below the network’s mean-jaccard-weighted edges. 0.1-1 provide reasonable pruning.
        higher value means less pruning (more edges retained). e.g. a value of 0.15 means all edges that are above mean(edgeweight)-0.15*std(edge-weights) are retained.
        We find both 0.15 and ‘median’ to yield good results/starting point and resulting in pruning away ~ 50-60% edges
    keep_all_local_dist: bool, str
        default value of 'auto' means that for smaller datasets local-pruning is done prior to clustering, but for large datasets local pruning is set to False for speed.
        can also set to be bool of True or False
    too_big_factor: float
        (optional, default=0.4). Forces clusters > 0.4*n_cells to be re-clustered
    resolution_parameter: float
        (default =1) larger value means more and smaller clusters
    partition_type:str
        (default "ModularityVP") Options
    small_pop: int
         (default 10) Via attempts to merge Clusters with a population < 10 cells with larger clusters. If you have a very small dataset (e.g. few hundred cells), then consider lowering to e.g. 5
    jac_weighted_edges:bool
        (default = True) Use weighted edges in the PARC clustering step
    knn: int
        (optional, default = 30) number of K-Nearest Neighbors for HNSWlib KNN graph. Larger knn means more graph connectivity. Lower knn means more loosely connected clusters/cells
    n_iter_leiden:int
    random_seed: int
        Random seed to pass to clustering
    num_threads:
    distance:str
        (default 'l2') Euclidean distance 'l2' by default; other options 'ip' and 'cosine' for graph construction and similarity
    visual_cluster_graph_pruning: float
        (optional, default = 0.15) This only comes into play if the user deliberately chooses not to use the default edge-bundling method of visualizating edges (draw_piechart_graph()) and instead calls draw_piechart_graph_nobundle().
        It is often set to the same value as the PARC clustering level of edgepruning_clustering_resolution. This does not impact computation of terminal states, pseudotime or lineage likelihoods.
        It controls the number of edges plotted for visual effect
    cluster_graph_pruning: float
        (optional, default =0.15) Pruning level of the cluster graph (does not impact number of clusters). Only impacts the connectivity of the clustergraph. Often set to the same value as the PARC clustering level of edgepruning_clustering_resolution.Reasonable range [0.1,1]
        To retain more connectivity in the clustergraph underlying the trajectory computations, increase the value
    time_smallpop: max time to be allowed handling singletons

    x_lazy:float
        (default =0.95) 1-x = probability of staying in same node (lazy). Values between 0.9-0.99 are reasonable
    alpha_teleport: float
        (default = 0.99) 1-alpha is probability of jumping. Values between 0.95-0.99 are reasonable unless prior knowledge of teleportation
    root_user: list, None
        can be a list of strings, a list of int or None
        (default is None) When the root_user is set as None and an RNA velocity matrix is available, a root will be automatically computed
        if the root_user is None and not velocity matrix is provided, then an arbitrary root is selected
        if the root_user is ['celltype_earlystage'] where the str corresponds to an item in true_label, then a suitable starting point will be selected corresponding to this group
        if the root_user is [678], where 678 is the index of the cell chosen as a start cell, then this will be the designated starting cell.
        It is possible to give a list of root indices and groups. [120, 699] or ['traj1_earlystage', 'traj2_earlystage'] when there are more than one trajectories
    preserve_disconnected: bool
        (default = True) If you believe there may be disconnected trajectories then set this to False
    dataset: str
        Can be set to 'group' or '' (default). this refers to the type of root label (group level root or single cell index) you are going to provide.
        if your true_label has a sensible group of cells for a root then you can set dataset to 'group' and make the root parameter ['labelname_root_cell_type']
        if your root corresponds to one particular cell then set dataset = '' (default)
    embedding: ndarray
        (optional, default = None) embedding (e.g. precomputed tsne, umap, phate, via-umap) for plotting data. Size n_cells x 2
        If an embedding is provided when running VIA, then a scatterplot colored by pseudotime, highlighting terminal fates
    velo_weight: float
        (optional, default = 0.5) #float between [0,1]. the weight assigned to directionality and connectivity derived from scRNA-velocity
    neighboring_terminal_states_threshold:int
        (default = 3). Candidates for terminal states that are neighbors of each other may be removed from the list if they have this number of more of terminal states as neighbors
    knn_sequential:int
        (default =10) number of knn in the adjacent time-point for time-series data (t_i and t_i+1)
    knn_sequential_reverse: int
        (default = 0) number of knn enforced from current to previous time point
    t_diff_step: int
        (default =1) Number of permitted temporal intervals between connected nodes. If time data is labeled as [0,25,50,75,100,..]
        then t_diff_step=1 corresponds to '25' and only edges within t_diff_steps are retained
    is_coarse:bool
        (default = True) If running VIA in two iterations where you wish to link the second fine-grained iteration with the initial iteration, then you set to False
    via_coarse: VIA
        (default = None) If instantiating a second iteration of VIA that needs to be linked to a previous iteration (e.g. via0), then set via_coarse to the previous via0 object
    df_annot: DataFrame
        (default None) used for the Mouse Organ data
    preserve_disconnected_after_pruning:bool
        (default = False) If you believe there are disconnected trajectories then set this to True and test your hypothesis
    A_velo: ndarray
        Cluster Graph Transition matrix based on rna velocity [n_clus x n_clus]
    velocity_matrix: matrix
            (default None) matrix of size [n_samples x n_genes]. this is the velocity matrix computed by scVelo (or similar package) and stored in adata.layers['velocity']. The genes used for computing velocity should correspond to those useing in gene_matrix
            Requires gene_matrix to be provided too.
    gene_matrix: matrix
            (default None) Only used if Velocity_matrix is available. matrix of size [n_samples x n_genes]. We recommend using a subset like HVGs rather than full set of genes. (need to densify input if taking from adata = adata.X.todense())
    time_series:bool
        (default False) if the data has time-series labels then set to True
    time_series_labels:list
        (default None) list of integer values of temporal annoataions corresponding to e.g. hours (post fert), days, or sequential ordering
    pca_loadings: array
        (default None) the loadings of the pcs used to project the cells (to projected euclidean location based on velocity). n_cells x n_pcs
    secondary_annotations: None
        (default None)
    edgebundle_pruning:float
        (default=None) will by default be set to the same as the cluster_graph_pruning and influences the visualized level of pruning of edges.
        Typical values can be between [0,1] with higher numbers retaining more edges
    edgebundle_pruning_twice:bool
        default: False. When True, the edgebundling is applied to a further visually pruned (visual_cluster_graph_pruning) and can sometimes simplify the visualization. it does not impact the pseudotime and lineage computations
     piegraph_arrow_head_width: float
        (default = 0.1) size of arrow heads in via cluster graph
    piegraph_edgeweight_scalingfactor:
        (defaulf = 1.5) scaling factor for edge thickness in via cluster graph
    max_visual_outgoing_edges: int
        (default =2) Only allows max_visual_outgoing_edges to come out of any given node. Used in differentiation_flow()

    edgebundle_pruning:float
        (default=None) will by default be set to the same as the cluster_graph_pruning and influences the visualized level of pruning of edges.
        Typical values can be between [0,1] with higher numbers retaining more edges

    edgebundle_pruning_twice:bool
        default: False. When True, the edgebundling is applied to a further visually pruned (visual_cluster_graph_pruning) and can sometimes simplify the visualization for very cluttered graphs. it does not impact the pseudotime and lineage computations
    pseudotime_threshold_TS: int
        (default = 30) corresponds to the criteria for a state to be considered a candidate terminal cell fate to be 30% or later of the computed psuedotime range
    num_mcmc_simulations:int
        (default = 1300) number of random walk simulations conducted
    embedding_type: str
        (default = 'via-mds', other options are 'via-atlas' and 'via-force'
    do_compute_embedding: bool
        (default = False) If you want an embedding (n_samples x2) to be computed on the basis of the via sc graph then set this to True
    do_gaussian_kernel_edgeweights: bool
        (default = False) Type of edgeweighting on the graph edges
    memory: float
        (default = 2) higher q means more memory, more retrospective/inwards randomwalk. memory = 2 means run using the non-memory Via 1.0 mode
    viagraph_decay: float
        (default = 0.9) increasing decay causes more edges to merge
    memory: 1/q * edge weight to a next-node that is not a neighbor of previous node. larger number means more memory and more introspective walk. small number <1 means more exploration
    p_memory: 1/p * edge weight to next node = previous node. large value means more exploration
    graph_init_pos: matrix (or list of lists) to initialize the viagraph
    spatial_coords: np.ndarray of size n_cells x 2 (denoting x,y coordinates) of each spot/cell
    do_spatial_knn: Whether or not to do spatial mode of StaVia for graph augmentation
    do_spatial_layout: whether to use spatial coords for layout of the clustergraph
    spatial_knn:int = 15. number of knn's added based on spatial proximity indiciated by spatial_coords
    spatial_aux:list = [] a list of slice IDs so that only cells/spots on the same slice are considered when building the spatial_knn graph

    Attributes
    ------------
    labels: array
        length (n_samples, ) of cluster labels ndarray pre determined cluster labels user defined. #np.asarray(pre_labels).flatten()
    single_cell_pt_markov: list
        length n_samples of pseudotime
    single_cell_bp: ndarray
        [n_lineages x n_samples] array of single cell branching probabilities towards each lineage (lineage normalized).
        Each column corresponds to a terminal state, in the order presented by the terminal_clusters attribute
    single_cell_bp_rownormed: ndarray
        [n_lineages x n_samples] array of single cell branching probabilities towards each lineage (cell normalized).
        Each column corresponds to a terminal state, in the order presented by the terminal_clusters attribute
    terminal_clusters: list
        list of clusters that are cell fates/ unique lineages
    cluster_bp: ndarray
        [n_clusters x n_terminal_states]. Lineage probability of cluster towards a particular terminal cluster state
    CSM: ndarray
        [n_cluster x n_clusters] array of cosine similarity used to weight the cluster graph transition matrix by velocity
    single_cell_transition_matrix: ndarray
        [n_samples x n_samples]
    terminal_clusters:list
        (default None) list of terminal clusters
    csr_full_graph: csr matrix of single-cell graph (augmented with sequential data when providing time_series information)
    csr_array_locally_pruned: csr matrix
    ig_full_graph:
    full_neighbor_array:
    user_defined_terminal_cell:list=[]
    user_defined_terminal_group:list=[]
    n_milestones: int = None Number of milestones in the via-mds computation (anything more than 10,000 can be computationally heavy and time consuming) Typically auto-determined within the via-mds function
    embedding: ndarray
        [n_cells x 2] provided by user or autocomputed with via-mds or via-umap

    '''

    def __init__(self, data: ndarray, true_label=None, edgepruning_clustering_resolution_local: float = 1, edgepruning_clustering_resolution=0.15,
                 labels: ndarray = None,
                 keep_all_local_dist='auto', too_big_factor: float = 0.4, resolution_parameter: float = 1.0,
                 partition_type: str = "ModularityVP", small_pop: int = 10,
                 jac_weighted_edges: bool = True, knn: int = 30, n_iter_leiden: int = 5, random_seed: int = 42,
                 num_threads=-1, distance='l2', time_smallpop=15,
                 super_cluster_labels: bool = False, super_node_degree_list: bool = False,
                 super_terminal_cells: bool = False, x_lazy: float = 0.99, alpha_teleport: float = 0.99,
                 root_user=None, preserve_disconnected: bool = True, dataset: str = '',
                 super_terminal_clusters: list = [],
                 is_coarse=True, csr_full_graph: csr_matrix = '', csr_array_locally_pruned='', ig_full_graph='',
                 full_neighbor_array='', full_distance_array='', embedding: ndarray = None, df_annot=None,
                 preserve_disconnected_after_pruning: bool = False,
                 secondary_annotations: list = None, pseudotime_threshold_TS: int = 30,
                 cluster_graph_pruning: float = 0.15,
                 visual_cluster_graph_pruning: float = 0.15, neighboring_terminal_states_threshold=3,
                 num_mcmc_simulations=1300,
                 piegraph_arrow_head_width=0.1,
                 piegraph_edgeweight_scalingfactor=1.5, max_visual_outgoing_edges: int = 2, via_coarse=None,
                 velocity_matrix=None,
                 gene_matrix=None, velo_weight=0.5, edgebundle_pruning=None, A_velo=None, CSM=None,
                 edgebundle_pruning_twice=False, pca_loadings=None, time_series=False,
                 time_series_labels: list = None, knn_sequential: int = 10, knn_sequential_reverse: int = 0,
                 t_diff_step: int = 1, single_cell_transition_matrix=None,
                 embedding_type: str = 'via-mds', do_compute_embedding: bool = False, color_dict: {} = None,
                 user_defined_terminal_cell: list = [], user_defined_terminal_group: list = [],
                 do_gaussian_kernel_edgeweights: bool = False, RW2_mode: bool = False,
                 working_dir_fp: str = '/home/', memory=5, viagraph_decay=0.9, p_memory=1, graph_init_pos: np.ndarray=None, spatial_coords:np.ndarray=None, do_spatial_knn:bool=False, do_spatial_layout:bool=False, spatial_knn:int = 15, spatial_aux:list = []):

        self.data = data
        self.nsamples, self.ncomp = data.shape
        if true_label is not None:
            self.true_label = true_label
        else:
            self.true_label = [1] * self.nsamples
        if velocity_matrix is None: velo_weight = 0
        self.velo_weight = velo_weight  # float between 0,1. the weight assigned to directionality and connectivity derived from scRNA-velocity
        if edgebundle_pruning is None: edgebundle_pruning = cluster_graph_pruning
        self.edgebundle_pruning = edgebundle_pruning
        if (root_user is None) & (velocity_matrix is None):
            root_user = []
            dataset = ''
        self.root_user = root_user
        self.dataset = dataset
        self.knn_struct = None
        if isinstance(labels, list):
            labels = np.asarray(labels).flatten()
        self.labels = labels  # np.asarray(pre_labels).flatten() where pre_labels is a list
        self.connected_comp_labels = None
        self.edgelist = None
        self.edgelist_unique = None

        # higher edgepruning_clustering_resolution_local means more edges are kept
        # higher edgepruning_clustering_resolution means more edges are kept
        if keep_all_local_dist == 'auto':
            # If large dataset skip local pruning to increase speed
            keep_all_local_dist = data.shape[0] > 50000

        if resolution_parameter != 1:
            partition_type = "RBVP"  # Reichardt and Bornholdt’s Potts model. Note that this is the same as ModularityVertexPartition when setting 𝛾 = 1 and normalising by 2m

        self.edgepruning_clustering_resolution_local = edgepruning_clustering_resolution_local
        self.edgepruning_clustering_resolution = edgepruning_clustering_resolution  ##0.15 is also a recommended value performing empirically similar to 'median'
        self.keep_all_local_dist = keep_all_local_dist
        self.too_big_factor = too_big_factor  ##if a cluster exceeds this share of the entire cell population, then the PARC will be run on the large cluster. at 0.4 it does not come into play
        self.resolution_parameter = resolution_parameter
        self.partition_type = partition_type
        self.small_pop = small_pop  # smallest cluster population to be considered a community
        self.jac_weighted_edges = jac_weighted_edges
        self.knn = knn
        self.knn_sequential = knn_sequential  # number of knn in the adjacent time-point for time-series data
        self.knn_sequential_reverse = knn_sequential_reverse  # number of knn made between timepoint and previous time point
        self.n_iter_leiden = n_iter_leiden
        self.random_seed = random_seed  # enable reproducible Leiden clustering
        self.num_threads = num_threads  # number of threads used in KNN search/construction
        self.distance = distance  # Euclidean distance 'l2' by default; other options 'ip' and 'cosine'
        self.time_smallpop = time_smallpop

        self.super_cluster_labels = super_cluster_labels
        self.super_node_degree_list = super_node_degree_list
        self.super_terminal_clusters = super_terminal_clusters
        self.full_neighbor_array = full_neighbor_array
        self.full_distance_array = full_distance_array
        self.ig_full_graph = ig_full_graph
        self.csr_array_locally_pruned = csr_array_locally_pruned
        self.csr_full_graph = csr_full_graph
        self.super_terminal_cells = super_terminal_cells


        self.x_lazy = x_lazy  # 1-x = probability of staying in same node
        self.alpha_teleport = alpha_teleport  # 1-alpha is probability of jumping
        self.preserve_disconnected = preserve_disconnected
        self.is_coarse = is_coarse  # set to True for first round of VIA. if one chooses to run a second iteration of VIA that uses the terminal states from the first round, then set this to False for second iteration
        self.embedding = embedding
        self.df_annot = df_annot
        self.preserve_disconnected_after_pruning = preserve_disconnected_after_pruning  # pruning can cause some fragmentation of small clusters, these should be reattached to the relevant component (so typically this is False as we dont want to preserve these small extra components resulting from pruning)
        self.secondary_annotations = secondary_annotations
        self.pseudotime_threshold_TS = pseudotime_threshold_TS
        self.cluster_graph_pruning = cluster_graph_pruning
        self.visual_cluster_graph_pruning = visual_cluster_graph_pruning  # higher value means more edges retained. This is applied to the clustergraph before visulizing only if two rounds of edgbebundlng is done.
        self.neighboring_terminal_states_threshold = neighboring_terminal_states_threshold  # number of neighbors of a terminal state has before it is eliminated as a TS
        self.num_mcmc_simulations = num_mcmc_simulations  # number of mcmc simulations in second state of pseudotime computation
        self.piegraph_arrow_head_width = piegraph_arrow_head_width
        self.piegraph_edgeweight_scalingfactor = piegraph_edgeweight_scalingfactor
        self.max_visual_outgoing_edges = max_visual_outgoing_edges  # higher value means more edges retained. This is applied to the clustergraph and is a strong threshold for number of edges shown
        if velocity_matrix is not None: velocity_matrix[np.isnan(velocity_matrix)] = 0
        self.velocity_matrix = velocity_matrix  # matrix from scVelo with velocities
        if gene_matrix is not None: gene_matrix[np.isnan(gene_matrix)] = 0
        self.gene_matrix = gene_matrix  # matrix (not numpy array) with gene expression #such as adata.X.todense() or adata.layers["Mu"] from scVelo (first moments of spliced gene counts
        self.A_velo = A_velo  # the transition matrix weighted by the rna velocity [n_clus x n_clus]
        self.CSM = CSM  # the cosine similarity matrix using velocity [n_clusters x n_clusters]
        self.edgebundle_pruning_twice = edgebundle_pruning_twice  # default: False. When True, the edgebundling is applied to a further visually pruned (visual_cluster_graph_pruning) and can sometimes simplify the visualization. it does not impact the pseudotime and lineage computations
        self.pca_loadings = pca_loadings  # the loadings of the pcs used to project the cells (to projected euclidean location based on velocity)
        self.time_series = time_series
        self.time_series_labels = time_series_labels
        self.t_diff_step = t_diff_step
        self.single_cell_transition_matrix = single_cell_transition_matrix
        self.embedding_type = embedding_type
        self.do_compute_embedding = do_compute_embedding
        self.color_dict = color_dict
        self.user_defined_terminal_cell = user_defined_terminal_cell
        self.user_defined_terminal_group = user_defined_terminal_group
        self.do_gaussian_kernel_edgeweights = do_gaussian_kernel_edgeweights
        self.RW2_mode = RW2_mode
        self.working_dir_fp = working_dir_fp
        self.memory = memory
        self.p_memory = p_memory
        self.viagraph_decay = viagraph_decay
        self.graph_init_pos = graph_init_pos
        self.spatial_coords = spatial_coords
        self.do_spatial_knn = do_spatial_knn
        self.do_spatial_layout = do_spatial_layout
        self.spatial_knn = spatial_knn
        self.spatial_aux = spatial_aux  #list of labels that identify cells from the same tissue slice

    def _make_pt_augmented_adjacency_igraph(self, neighbors: ndarray, distances: ndarray, k_reverse: int = 10,
                                           k_seq: int = 0):
        '''
        If you dont have time-series labels to adjust the construction of the single-cell graph, you can try using the
        pseudotime (single-cell level) to achieve the same sequencing effect. However, since the pseudo-times are derived from
        the original unguided structure, there may not neccessarily be a large visible impact on the graph structure

        :param neighbors: ndarray (n_cells x n_neighbors) original unguided knn graph structure
        :param distances: ndarray (n_cell x n_neighbors)
        :param k_reverse: (int) default = 10
        :param k_seq: this is zero because terminal states that have lower pseudotimes should not be forced to look for neighbors in later pseudotimes
        :param knn:
        :return:
        '''
        n_pt_augmented, d_pt_augmented = sequential_knn(self.data,
                                                        [int(i * 10) for i in self.single_cell_pt_markov],
                                                        neighbors, distances,
                                                        k_seq=k_seq, k_reverse=k_reverse,
                                                        num_threads=self.num_threads, distance=self.distance)

        adjacency_pt_augmented = self._make_csrmatrix_noselfloop(n_pt_augmented, d_pt_augmented, time_series=True,
                                                                 time_series_labels=[int(i * 10) for i in
                                                                                     self.single_cell_pt_markov],
                                                                 t_diff_step=3)  # this function has local pruning which removes neighbors that are more than t_dif apart. Since the same type of local pruning wrt t_dif is applied pre-clustergraph, we only need to call this function once in the case of time_series data
        ig_full_graph_pt_guided = _composite_jacAffinity_distanceAffinity(sc_csr_graph=adjacency_pt_augmented)
        return ig_full_graph_pt_guided, adjacency_pt_augmented

    def _get_terminal_clusters_user_defined_(self, user_defined_terminal_cell: list = [],
                                            user_defined_terminal_group: list = []):
        '''
        Allow the user to optionally select group or cell index level terminal fates that override the automated cell fate detection

        :param user_defined_terminal_cell: list of cell indices corresponding to terminal fate cells
        :param user_defined_terminal_cell_group: list of group level labels corresponding to labels found in true_label, that represent cell fates
        :return: list of clusters to represent the cell fate clusters in the via-clustergraph
        '''
        dict_user_defined_terminal_clusters = {}
        terminal_cluster_list = []
        if len(user_defined_terminal_cell) > 0:
            for i in user_defined_terminal_cell:
                clus_ = self.labels[i]
                if clus_ not in terminal_cluster_list:
                    terminal_cluster_list.append(clus_)
                else:
                    print(f'{i} is a repeated terminal cluster')
                    terminal_cluster_list.append(clus_)
                    dict_user_defined_terminal_clusters[user_defined_terminal_cell] = clus_
        else:

            for user_terminal_group in user_defined_terminal_group:

                # location in indices of which cells belong to this user_terminal_group
                index_terminal_group = np.where(np.asarray(self.true_label) == user_terminal_group)[0]
                potential_clusters = self.labels[index_terminal_group]
                clus_ = func_mode(potential_clusters.tolist())
                if clus_ not in terminal_cluster_list:
                    terminal_cluster_list.append(clus_)
                    dict_user_defined_terminal_clusters[user_terminal_group] = clus_
        print(f'{datetime.now()}\tTerminal cluster list based on user defined cells/groups:',
              [(key, dict_user_defined_terminal_clusters[key]) for key in dict_user_defined_terminal_clusters])
        return terminal_cluster_list

    def _get_terminal_clusters(self, A, markov_pt, root_ai):
        n_ = A.shape[0]  # number of states in the graph component

        if n_ <= 10: n_outlier_std = 3
        if (n_ <= 40) & (n_ > 10): n_outlier_std = 2

        if n_ >= 40: n_outlier_std = 2  # 1

        pop_list = []

        # print('get terminal', set(self.labels), np.where(self.labels == 0))
        for i in list(set(self.labels)):
            pop_list.append(len(np.where(self.labels == i)[0]))
        # we weight the out-degree based on the population of clusters to avoid allowing small clusters to become the terminals based on population alone
        A_new = A.copy()
        for i in range(A.shape[0]):
            for j in range(A.shape[0]):
                A_new[i, j] = A[i, j] * (pop_list[i] + pop_list[j]) / (pop_list[i] * pop_list[j])

        # make an igraph graph to compute the closeness
        g_dis = ig.Graph.Adjacency(
            (A_new > 0).tolist())  # need to manually add the weights as igraph treates A>0 as boolean
        g_dis.es['weights'] = 1 / A_new[A_new.nonzero()]  # we want "distances" not weights for closeness and betweeness

        betweenness_score = g_dis.betweenness(weights='weights')
        betweenness_score_array = np.asarray(betweenness_score)
        betweenness_score_takeout_outlier = betweenness_score_array[betweenness_score_array < (
                np.mean(betweenness_score_array) + n_outlier_std * np.std(betweenness_score_array))]
        betweenness_list = [i for i, score in enumerate(betweenness_score) if score < (
                np.mean(betweenness_score_takeout_outlier) - 0 * np.std(betweenness_score_takeout_outlier))]

        closeness_score = g_dis.closeness(mode='ALL', cutoff=None, weights='weights', normalized=True)
        closeness_score_array = np.asarray(closeness_score)
        closeness_score_takeout_outlier = closeness_score_array[
            closeness_score_array < (np.mean(closeness_score_array) + n_outlier_std * np.std(closeness_score_array))]
        closeness_list = [i for i, score in enumerate(closeness_score) if
                          score < (np.mean(closeness_score_takeout_outlier) - 0 * np.std(
                              closeness_score_takeout_outlier))]

        out_deg = A_new.sum(axis=1)
        out_deg = np.asarray(out_deg)

        outdegree_score_takeout_outlier = out_deg[out_deg < (np.mean(out_deg) + n_outlier_std * np.std(out_deg))]
        loc_deg = [i for i, score in enumerate(out_deg) if
                   score < (np.mean(outdegree_score_takeout_outlier) - 0 * np.std(outdegree_score_takeout_outlier))]
        print(f"{datetime.now()}\tIdentifying terminal clusters corresponding to unique lineages...")
        print(f"{datetime.now()}\tCloseness:{closeness_list}")
        print(f"{datetime.now()}\tBetweenness:{betweenness_list}")
        print(f"{datetime.now()}\tOut Degree:{loc_deg}")

        markov_pt = np.asarray(markov_pt)
        pct = 10 if n_ <= 40 else 30
        loc_pt = np.where(markov_pt >= np.percentile(markov_pt, pct))[0]

        terminal_clusters_1 = list(set(closeness_list) & set(betweenness_list))
        terminal_clusters_2 = list(set(closeness_list) & set(loc_deg))
        terminal_clusters_3 = list(set(betweenness_list) & set(loc_deg))
        terminal_clusters = list(set(terminal_clusters_1) | set(terminal_clusters_2))
        terminal_clusters = list(set(terminal_clusters) | set(terminal_clusters_3))
        terminal_clusters = list(set(terminal_clusters) & set(loc_pt))

        terminal_org = terminal_clusters.copy()

        for terminal_i in terminal_org:
            if terminal_i in terminal_clusters:
                removed_terminal_i = False
            else:
                removed_terminal_i = True
            # print('terminal state', terminal_i)
            count_nn = 0
            ts_neigh = []
            neigh_terminal = np.where(A[:, terminal_i] > 0)[0]
            if neigh_terminal.size > 0:
                for item in neigh_terminal:
                    if item in terminal_clusters:
                        ts_neigh.append(item)
                        count_nn = count_nn + 1

                    if n_ >= 10:
                        if item == root_ai:  # if the terminal state is a neighbor of
                            if terminal_i in terminal_clusters:
                                terminal_clusters.remove(terminal_i)
                                print(
                                    f"{datetime.now()}\tWe removed cluster {terminal_i} from the shortlist of terminal states")
                                removed_terminal_i = True
                if count_nn >= self.neighboring_terminal_states_threshold:  # 2
                    if removed_terminal_i == False:
                        temp_remove = terminal_i
                        temp_time = markov_pt[terminal_i]
                        for to_remove_i in ts_neigh:
                            if markov_pt[to_remove_i] < temp_time:
                                temp_remove = to_remove_i
                                temp_time = markov_pt[to_remove_i]
                        terminal_clusters.remove(temp_remove)
                        print(
                            f"{datetime.now()}\tCluster {terminal_i} had {self.neighboring_terminal_states_threshold} or more neighboring terminal states {ts_neigh} and so we removed cluster {temp_remove}")
        if len(terminal_clusters) == 0: terminal_clusters = loc_deg
        # print('terminal_clusters', terminal_clusters)
        return terminal_clusters

    def compute_hitting_time(self, sparse_graph, root, x_lazy, alpha_teleport, number_eig=0):
        # 1- alpha is the probability of teleporting
        # 1- x_lazy is the probability of staying in current state (be lazy)
        # Computing lazy-teleporting expected hitting time
        beta_teleport = 2 * (1 - alpha_teleport) / (2 - alpha_teleport)
        N = sparse_graph.shape[0]

        Lsym = csgraph.laplacian(sparse_graph, normed=True)
        eigval, eigvec = scipy.sparse.linalg.eigsh(Lsym, number_eig or Lsym.shape[0] - 1)

        Greens, beta_norm_lap = np.zeros((N, N), float), np.zeros((N, N), float)
        Xu = np.zeros((N, N))
        Xu[:, root] = 1
        Xv_Xu = np.eye(N) - Xu

        for i in range(1 if alpha_teleport == 1 else 0, len(eigval)):
            vv = np.outer(eigvec[:, i], eigvec[:, i])
            factor = beta_teleport + 2 * eigval[i] * x_lazy * (1 - beta_teleport)

            Greens += vv / factor
            beta_norm_lap += vv * factor

        D = np.diag(np.nan_to_num(np.array(sparse_graph.sum(axis=1)).reshape(-1) ** -.5, posinf=0))
        t = D @ Greens @ D * beta_teleport

        hitting_matrix = np.diagonal(t)[np.newaxis, :] - t
        # Calculate only diagonal elements of Xv_Xu @ t
        return np.abs((Xv_Xu * t.T).sum(-1)), (hitting_matrix + hitting_matrix.T)[root]

    def simulate_branch_probability(self, terminal_state, A, root, num_sim=500):

        n_states = A.shape[0]

        ncpu = multiprocessing.cpu_count()
        if (ncpu == 1) | (ncpu == 2):
            n_jobs = 1
        elif ncpu > 2:
            n_jobs = min(ncpu - 1, 5)
        # print('njobs', n_jobs)
        num_sim_pp = int(num_sim / n_jobs)  # num of simulations per process
        jobs = []

        manager = multiprocessing.Manager()

        q = manager.list()
        seed_list_ = list(range(n_jobs))
        seed_list = [i + self.random_seed for i in seed_list_]

        for i in range(n_jobs):
            cumstateChangeHist = np.zeros((1, n_states))
            cumstateChangeHist_all = np.zeros((1, n_states))
            process = multiprocessing.Process(target=_prob_reaching_terminal_state, args=(
                terminal_state, A, root, num_sim_pp, q, cumstateChangeHist,
                cumstateChangeHist_all,
                seed_list[i]))
            jobs.append(process)

        for j in jobs:
            j.start()

        for j in jobs:
            j.join()

        cumhistory_vec = q[0][0]
        cumhistory_vec_all = q[0][1]

        count_reached = cumhistory_vec_all[0, terminal_state]

        for i in range(1, len(q)):
            cumhistory_vec = cumhistory_vec + q[i][0]
            cumhistory_vec_all = cumhistory_vec_all + q[i][1]

            count_reached = count_reached + q[i][1][0, terminal_state]

        print(
            f"{datetime.now()}\tFrom root {root},  the Terminal state {terminal_state} is reached {int(count_reached)} times.")
        # print('cumhistory_vec_all', cumhistory_vec_all)
        cumhistory_vec_all[
            cumhistory_vec_all == 0] = 1  # to avoid division by zero we say that the state has been visited once
        # print('cumhistory_vec_all', cumhistory_vec_all)
        # print('cumhistory_vec', cumhistory_vec)
        prob_ = cumhistory_vec / cumhistory_vec_all
        # given a cluster is traversed (cumhistory_vec_all), how many of these traversals led to a successful path to Terminal State X
        # i.e. with what probability does traversal of a cluster lead to the desired terminal state (number of times traversed along a sucessful path)  #cumhistory_vec: number of times cluster is traversed on a successful path

        np.set_printoptions(precision=3)

        if count_reached == 0:
            prob_[:, terminal_state] = 0
            print('never reached state', terminal_state)
        else:
            loc_1 = np.where(prob_ == 1)

            loc_1 = loc_1[1]

            prob_[0, loc_1] = 0
            # print('zerod out prob', prob_)
            temp_ = np.max(prob_)
            if temp_ == 0: temp_ = 1
            prob_ = prob_ / min(1,
                                1.1 * temp_)  # do some scaling to amplify the probabiltiies of those clusters that are not 1 to make the range of values more compact
        prob_[0, loc_1] = 1  # put back probability =1

        return list(prob_)[0]

    def _simulate_markov(self, A, root):
        '''
        Computes the MCMC based hitting times on the forward biased graph
        :param A: cluster graph adjacency matrix (forward biased)
        :param root:
        :return: mcmc hitting times
        '''
        n_states = A.shape[0]
        P = A / A.sum(axis=1).reshape((n_states, 1))
        # print('row normed P',P.shape, P, P.sum(axis=1))
        x_lazy = self.x_lazy  # 1-x is prob lazy
        alpha_teleport = self.alpha_teleport
        # bias_P is the transition probability matrix

        P = x_lazy * P + (1 - x_lazy) * np.identity(n_states)
        # print(P, P.sum(axis=1))
        P = alpha_teleport * P + ((1 - alpha_teleport) * (1 / n_states) * (np.ones((n_states, n_states))))
        # print('check prob of each row sum to one', P.sum(axis=1))

        currentState = root
        state = np.zeros((1, n_states))
        state[0, currentState] = 1
        num_sim = self.num_mcmc_simulations

        ncpu = multiprocessing.cpu_count()
        if (ncpu == 1) | (ncpu == 2):
            n_jobs = 1
        elif ncpu > 2:
            n_jobs = ncpu - 2  # min(ncpu - 2, 5)
        # print('njobs', n_jobs)
        num_sim_pp = int(num_sim / n_jobs)  # num of simulations per process

        n_steps = int(2 * n_states)

        jobs = []
        seed = self.random_seed
        dummy_list = []
        for dummy in range(n_jobs):
            hitting_array = np.ones((P.shape[0], 1)) * 1000
            dummy_list.append(hitting_array)

        with multiprocessing.Manager() as manager:
            q = manager.list()
            for i in range(2):  # range(n_jobs):
                hitting_array = np.ones((P.shape[0], 1)) * 1000
                process = multiprocessing.Process(target=_simulate_markov_sub,
                                                  args=(P, num_sim_pp, hitting_array, q, root, seed))
                jobs.append(process)
                process.start()
            for proc in jobs:
                proc.join()
            # for j in jobs:
            # j.start()

            # for j in jobs:
            # j.join()

            print(f"{datetime.now()}\tEnded all multiprocesses, will retrieve and reshape")
            hitting_array = q[0]
            for qi in q[1:]:
                hitting_array = np.append(hitting_array, qi, axis=1)  # .get(), axis=1)
            # print('finished getting from queue', hitting_array.shape)
        hitting_array_final = np.zeros((1, n_states))
        no_times_state_reached_array = np.zeros((1, n_states))

        for i in range(n_states):
            rowtemp = hitting_array[i, :]
            no_times_state_reached_array[0, i] = np.sum(rowtemp != (n_steps + 1))

        for i in range(n_states):
            rowtemp = hitting_array[i, :]
            no_times_state_reached = np.sum(rowtemp != (n_steps + 1))
            if no_times_state_reached != 0:
                perc = np.percentile(rowtemp[rowtemp != n_steps + 1], 15) + 0.001  # 15 for Human and Toy
                # print('state ', i,' has perc' ,perc)
                hitting_array_final[0, i] = np.mean(rowtemp[rowtemp <= perc])
            else:
                hitting_array_final[0, i] = (n_steps + 1)

        return hitting_array_final[0]

    def _compute_hitting_time_onbias(self, laplacian, inv_sqr_deg, root, x_lazy, alpha_teleport, number_eig=0):
        # 1- alpha is the probabilty of teleporting
        # 1- x_lazy is the probability of staying in current state (be lazy)
        beta_teleport = 2 * (1 - alpha_teleport) / (2 - alpha_teleport)
        N = laplacian.shape[0]
        print('is laplacian of biased symmetric', (laplacian.transpose() == laplacian).all())
        Id = np.zeros((N, N), float)
        np.fill_diagonal(Id, 1)
        # norm_lap = scipy.sparse.csr_matrix.todense(laplacian)

        eig_val, eig_vec = np.linalg.eig(
            laplacian)  # eig_vec[:,i] is eigenvector for eigenvalue eig_val[i] not eigh as this is only for symmetric. the eig vecs are not in decsending order
        print('eig val', eig_val.shape)
        if number_eig == 0: number_eig = eig_vec.shape[1]
        print('number of eig vec', number_eig)
        Greens_matrix = np.zeros((N, N), float)
        beta_norm_lap = np.zeros((N, N), float)
        Xu = np.zeros((N, N))
        Xu[:, root] = 1
        Id_Xv = np.zeros((N, N), int)
        np.fill_diagonal(Id_Xv, 1)
        Xv_Xu = Id_Xv - Xu
        start_ = 0
        if alpha_teleport == 1:
            start_ = 1  # if there are no jumps (alph_teleport ==1), then the first term in beta-normalized Green's function will have 0 in denominator (first eigenvalue==0)

        for i in range(start_, number_eig):  # 0 instead of 1st eg
            # print(i, 'th eigenvalue is', eig_val[i])
            vec_i = eig_vec[:, i]
            factor = beta_teleport + 2 * eig_val[i] * x_lazy * (1 - beta_teleport)

            vec_i = np.reshape(vec_i, (-1, 1))
            eigen_vec_mult = vec_i.dot(vec_i.T)
            Greens_matrix = Greens_matrix + (
                    eigen_vec_mult / factor)  # Greens function is the inverse of the beta-normalized laplacian
            beta_norm_lap = beta_norm_lap + (eigen_vec_mult * factor)  # beta-normalized laplacian

        temp = Greens_matrix.dot(inv_sqr_deg)
        temp = inv_sqr_deg.dot(temp) * beta_teleport
        hitting_matrix = np.zeros((N, N), float)
        diag_row = np.diagonal(temp)
        for i in range(N):
            hitting_matrix[i, :] = diag_row - temp[i, :]

        roundtrip_commute_matrix = hitting_matrix + hitting_matrix.T
        temp = Xv_Xu.dot(temp)
        final_hitting_times = np.diagonal(
            temp)  ## number_eig x 1 vector of hitting times from root (u) to number_eig of other nodes
        roundtrip_times = roundtrip_commute_matrix[root, :]
        return abs(final_hitting_times), roundtrip_times

    def _project_branch_probability_sc(self, bp_array_clus, pt):
        # print('sum of branch probabilities at cluster level', np.sum(bp_array_clus, axis=1))
        n_clus = len(list(set(self.labels)))
        n_cells = self.data.shape[0]

        knn_sc = 3 if self.data.shape[0] > 1000 else 10
        neighbors, _ = self.knn_struct.knn_query(self.data, k=knn_sc)

        rows, cols, weights = [], [], []
        for i, row in enumerate(neighbors):
            neighboring_clus = self.labels[row]
            for c in set(list(neighboring_clus)):
                rows.append(i)
                cols.append(c)
                weights.append(np.sum(neighboring_clus == c) / knn_sc)

        weights = csr_matrix((weights, (rows, cols)), shape=(n_cells, n_clus))

        bp_array_sc = weights.dot(bp_array_clus)
        bp_array_sc /= np.max(bp_array_sc,
                              axis=0)  # divide cell probability by max value in that column so that rare lineages dont have distortedly low lineage probabilities

        for i, label_ts in enumerate(self.terminal_clusters):  # list(set(self.terminal_clusters))

            loc_i = np.where(self.labels == label_ts)[0]
            loc_noti = np.where(self.labels != label_ts)[0]
            if np.max(bp_array_sc[loc_noti, i]) > 0.8: bp_array_sc[loc_i, i] = 1.2
        pt = np.asarray(pt)
        pt = np.reshape(pt, (n_clus, 1))
        pt_sc = weights.dot(pt)
        pt_sc /= np.amax(pt_sc)
        return bp_array_sc, pt_sc.flatten()

[docs]    def sc_transition_matrix(self, smooth_transition, b=10, use_sequentially_augmented=False):
        '''
        #computes the single cell level transition directions that are later used to calculate velocity of embedding
        #based on changes at single cell level in genes and single cell level velocity

        :param smooth_transition:
        :param b: slope of logistic function
        :return:
        '''
        # n_clus = len(list(set(self.labels)))
        # n_cells = self.data.shape[0]
        pt = self.single_cell_pt_markov * 2 / max(
            self.single_cell_pt_markov)  # some scaling so that the input to the logistic function covers a range of sensible inputs
        labels = self.labels
        if use_sequentially_augmented:
            T = self.csr_array_locally_pruned_augmented  # single cell edges are weighted as inverse of distance

        else:
            T = self.csr_array_locally_pruned
        # T = self.csr_full_graph #single cell

        thr_global = np.mean(T.data) - 2 * np.std(T.data)  # 2*
        T.data[T.data < thr_global] = 0
        T.setdiag(0)
        T.eliminate_zeros()

        size_T = T.size

        T.data = T.data.clip(np.percentile(T.data, 10), np.percentile(T.data, 90))
        find_T = find(T)
        bias_weight, K, c, C, nu = [], 1, 0, 1, 1
        bias_weight_pt, bias_weight_velo = [], []
        # print('transition before biasing')
        # print(T)
        if self.CSM is not None:
            # if rna velocity is available, then combine the single-cell directions inferred by sc-pt and scRNA velocity to get the sc directions
            for i in range(size_T):
                start = find_T[0][i]
                end = find_T[1][i]
                weight = find_T[2][i]
                t_dif = pt[find_T[1][i]] - pt[find_T[0][i]]

                delta_gene = np.array(self.gene_matrix[end, :] - self.gene_matrix[start,
                                                                 :])  # change in gene expression when going from start cell to end cells

                # print('shape delta_gene', delta_gene.shape, delta_gene)

                # csm_ = 1- distance.cosine(delta_gene[0,:], self.velocity_matrix[start, :])
                # print(csm_)
                csm_ = 0

                csm_ = cosine_sim(delta_gene, self.velocity_matrix[start, :])[0] * 10  # based on sc level CSM

                # csm_ = self.CSM[labels[find_T[0][i]],labels[find_T[1][i]]] #based on cluster level CSM
                # if (csm_<0)&(t_dif<0): csm_ *=10
                # if (csm_ >0) & (t_dif > 0): csm_ *= 10
                # if i%100000==0:  print(i,'out of', size_T, 'start', labels[start], 'end', labels[end], 'csm_', round(csm_,3), 't_diff', round(t_dif,3))
                # print('shape gene_matrix', self.gene_matrix.shape)
                # print('start-end, csm', start, end, csm_)
                # bias_weight_pt.append(t_dif)
                # bias_weight_velo.append(csm_)
                bias_weight_pt.append(1 * K / ((C + math.exp(b * (-t_dif + c))) ** nu))
                # bias_weight_velo.append(csm_)
                bias_weight_velo.append(1 * K / ((C + math.exp(b * (- csm_ + c))) ** nu))
            # scale both the pt and velocity weights
            bias_weight_pt = 2.0 * np.asarray(bias_weight_pt) / np.mean(
                np.array(bias_weight_pt))  # need to normalize the pt and velo weights
            # print('list shape',bias_weight_pt.shape)
            # print('argwhere is nan')
            # argwhere=np.argwhere(np.isnan(bias_weight_velo))
            # print(argwhere.shape)
            # print('bias weight velo is anywhere nan', np.isnan(bias_weight_velo).any(), np.mean(np.array(bias_weight_velo)))
            bias_weight_velo = np.nan_to_num(bias_weight_velo)
            bias_weight_velo = 2.0 * np.asarray(bias_weight_velo) / np.mean(np.array(bias_weight_velo))  # USE THIS
            # bias_weight_velo = np.asarray(bias_weight_velo)#
            # bias_weight_velo = np.clip(bias_weight_velo, 0, 1) #this approach zeros out negative edges
            # print('pt weight ', bias_weight_pt)
            bias_weight_velo = np.multiply(np.asarray(weight),
                                           bias_weight_velo)  ## consider changing this weight to the weight based on projected "i" -to- current "neighbor"
            bias_weight_pt = np.multiply(np.asarray(weight), bias_weight_pt)
            # print('velo weight * weigt', bias_weight_velo)
            # print('pt weight *weight', bias_weight_pt)
            bias_weight = (1 - self.velo_weight) * bias_weight_pt + self.velo_weight * bias_weight_velo
            # print('bias_weight size', bias_weight.shape)
            T = csr_matrix((bias_weight / np.mean(np.array(bias_weight)), (np.array(find_T[0]), np.array(find_T[1]))),
                           shape=T.shape)
            # print('sc transition weights T',np.isnan(T.data).any())
            T.data = np.nan_to_num(T.data)
            # print('sc transition weights T', np.isnan(T.data).any())
        else:
            # case where you dont have rna velocity, then the single-cell directions are only inferred using the single-cell pt
            b = 10
            for i in range(size_T):
                # start = find_T[0][i]
                # end = find_T[1][i]
                weight = find_T[2][i]
                t_dif = pt[find_T[1][i]] - pt[find_T[0][i]]
                bias_weight.append(weight * K / ((C + math.exp(b * (-t_dif + c))) ** nu))  # b * (-t_dif

            T = csr_matrix((bias_weight / np.mean(np.array(bias_weight)), (np.array(find_T[0]), np.array(find_T[1]))),
                           shape=T.shape)

        T.setdiag(0)

        T.eliminate_zeros()
        T = np.expm1(T)

        T = normalize(T, norm='l1', axis=1) ** smooth_transition

        T = T.multiply(csr_matrix(1.0 / np.abs(T).sum(1)))  # rows sum to one
        return T

    def _velocity_embedding(self, X_emb, smooth_transition, b, use_sequentially_augmented=False):
        '''

        :param X_emb:
        :param smooth_transition:
        :return: V_emb
        '''
        # T transition matrix at single cell level
        n_obs = X_emb.shape[0]
        V_emb = np.zeros(X_emb.shape)
        if self.single_cell_transition_matrix is None:
            self.single_cell_transition_matrix = self.sc_transition_matrix(smooth_transition, b,
                                                                           use_sequentially_augmented=use_sequentially_augmented)
            T = self.single_cell_transition_matrix
        else:
            T = self.single_cell_transition_matrix

        # the change in embedding distance when moving from cell i to its neighbors is given by dx
        for i in range(n_obs):
            indices = T[i].indices
            dX = X_emb[indices] - X_emb[i, None]  # shape (n_neighbors, 2)
            dX /= l2_norm(dX)[:, None]

            # dX /= np.sqrt(dX.multiply(dX).sum(axis=1).A1)[:, None]
            dX[np.isnan(dX)] = 0  # zero diff in a steady-state
            # neighbor edge weights are used to weight the overall dX or velocity from cell i.
            probs = T[i].data
            # if probs.size ==0: print('velocity embedding probs=0 length', probs, i, self.true_label[i])
            V_emb[i] = probs.dot(dX) - probs.mean() * dX.sum(0)
        V_emb /= 3 * quiver_autoscale(X_emb, V_emb)
        return V_emb

    def _make_csrmatrix_noselfloop(self, neighbors: np.ndarray, distances: np.ndarray,
                                   auto_: bool = True, distance_factor=.01, weights_as_inv_dist=True,
                                   min_max_scale=False, time_series=False, time_series_labels=None,
                                   t_diff_step=2) -> csr_matrix:
        """
        Create sparse matrix from weighted knn graph. Default does local pruning and returns an affinity graph

        Parameters
        ----------
        neighbors: np.ndarray of shape (n_samples, n_neighbors)
            Indicating neighbors of each sample. neighbors[i,j] means that sample j is a neighbor of sample i

        distances: np.ndarray of shape (n_samples, n_neighbors)
            Distances between neighboring samples corresponding `neighbors`

        auto_: bool, default=True
            If `False` and `self.keep_all_local_dist = False` perform local pruning (according to self.edgepruning_clustering_resolution_local)
            and remove self-loops

        distance_factor: float, default=0.01
            Factor used in calculation of edge weights. mean(sqrt(distances))^2 / (sqrt(distances) + distance_factor)
        weights_as_inv_dist: bool, default = True
            Whether to convert the distances into weights that are proportional to inverse of the distance
        min_max_scale: bool, default = False
            This is called with True for constructing the sc-knn used for vertexclustergraph.
            we use the inverse of the distance but scaled from 0.5-2 such that we can combine the distance derived weights with the
            Jaccard similarity (0-1). The jaccard similiarity in igraph does not consider weights, so we re-introduce the impact of the
            distances which then play a role in the transition matrix for trajectories as well as the visualization of edge-weights
            In the case of the sc-KNN used for clustering, we do not re-introduce the distance based weights but leave it at the
            jaccard similarity as this empirically seems to work well
        time_series: bool, default = False. Whether or not this is a time course data set. will require time_series_labels in order to be actioned on
        time_series_labels: list, default = None. a list of numerical values reflecting the sampling time of each cell
        t_diff_step=int, default =2, number of sequential time steps permissible between edges. (e.g. edges allowed to be 2 or fewer time steps apart)
        Returns
        -------
            sparse matrix representing the locally pruned weighted knn affinity graph. edge weight represents level of similarity. higher values mean stronger edge
        """
        distances = np.sqrt(distances.astype(np.float32))
        # distances =distances.astype(np.float32)

        # print(neighbors[0,:])
        # print(distances[0,:])
        neigh01 = neighbors[0, 1]
        # print('norm2', np.linalg.norm(self.data[0,:]-self.data[neigh01,:]))
        if time_series:
            msk = np.full_like(distances, True, dtype=np.bool_)
            # print('all edges', np.sum(msk))
            # Remove self-loops
            msk &= (neighbors != np.arange(neighbors.shape[0])[:, np.newaxis])
            # print('non-self edges', np.sum(msk))

            '''
            #doing local pruning and then adding back the augmented edges does not work so well because when the edge weights are scaled and inverted,
            # the sequentially added edges appear very weak and noisy compared to the fairly strong edges that remain after the local pruning from the inital round of knngraph. If you retain all edges from initial graph construction,
            # then the average weight of edges is exaggeratedly higher than those edge weights from the sequentially added edges, and creates a better gradient of edge weights
            # Local pruning based on neighbor being too far. msk where we want to keep neighbors
            msk = distances <= (np.mean(distances, axis=1) + self.edgepruning_clustering_resolution_local * np.std(distances, axis=1))[:,
                               np.newaxis]
            # Remove self-loops
            msk &= (neighbors != np.arange(neighbors.shape[0])[:, np.newaxis])
            last_n_columns = self.knn_sequential+1
            msk[:,-last_n_columns:] = True # add back the edges belonging to knn-sequentially built part of the graph
            '''
            # remove edges between nodes that >t_diff_step far apart in time_series_labels

            if t_diff_step is not None:
                time_series_set_order = list(sorted(list(set(time_series_labels))))
                t_diff_mean = np.mean(
                    np.array([abs(y - x) for x, y in zip(time_series_set_order[:-1], time_series_set_order[1:])]))
                print(
                    f"{datetime.now()}\tActual average allowable time difference between nodes is {round(t_diff_mean * t_diff_step, 4)}")
            time_series_labels = np.asarray(time_series_labels)
            # print(colored(f"inside time_series msk"))

            rr = 0
            count = 0
            for row in neighbors:
                # if rr%20000==0: print(row, type(row), row[0])
                rr += 1
                t_row = time_series_labels[row[0]]  # first neighbor is itself

                if t_diff_step is not None:
                    for e_i, n_i in enumerate(row):
                        if abs(time_series_labels[n_i] - t_row) > t_diff_mean * t_diff_step:
                            count = count + 1
                            if np.sum(msk[row[0]]) > 4: msk[
                                row[0], e_i] = False  # we want to ensure that each cell has at least 5 nn
            # print('links within tdiff',np.sum(msk))

        elif auto_ and not self.keep_all_local_dist:
            # print('elif condition')
            # Local pruning based on neighbor being too far. msk where we want to keep neighbors
            msk = distances <= (np.mean(distances, axis=1) + self.edgepruning_clustering_resolution_local * np.std(distances, axis=1))[:,
                               np.newaxis]
            # Remove self-loops
            msk &= (neighbors != np.arange(neighbors.shape[0])[:, np.newaxis])

        else:

            msk = np.full_like(distances, True, dtype=np.bool_)
            # print('msk shape', msk.shape)
            # Remove self-loops
            msk &= (neighbors != np.arange(neighbors.shape[0])[:, np.newaxis])

            # print('msk shape', msk.shape)
        # Inverting the distances outputs values in range [0-1]. This also causes many ``good'' neighbors ending up
        # having a weight near zero (misleading as non-neighbors have a weight of zero). Therefore we scale by the
        # mean distance. The inversed weights without min_max_scaling are used for the community detection graph

        if self.distance == 'cosine':
            rows = np.array([np.repeat(i, len(x)) for i, x in enumerate(neighbors)])[msk]
            cols = neighbors[msk]
            weights = distances[msk]
            weights.fill(1)  # CHECK THIS
            # weights = logistic_function(weights)

        elif (weights_as_inv_dist == True) & (min_max_scale == False):

            # weights = (np.mean(distances[msk]) ** 2) / (distances[msk] + distance_factor) #larger weight is a stronger edge

            # weights = 1 / (distances[msk] + distance_factor)
            # do_exp_weight= True
            if self.do_gaussian_kernel_edgeweights == True:
                print('do_exp edge weights true in core_working')
                stdd = np.std(distances, axis=1)
                import numpy.ma as ma
                import umap.umap_ as u
                mx = ma.masked_array(distances, mask=distances == 0)

                minn = mx.min(axis=1)  # np.min(distances,axis=1) min of nonzero entries per row
                new_distances = np.exp(-(distances - minn[:, None]) / stdd[:, None])
                weights = new_distances[msk]
                sigmas, rhos = u.smooth_knn_dist(
                    distances, k=self.knn, local_connectivity=float(1.0))

                rows, cols, weights, dist_weights = u.compute_membership_strengths(
                    neighbors, distances, sigmas, rhos)

                # print('local connectivity', float(distances.shape[1]))
                # print('k', self.knn)
                # print('sigmas', len(sigmas), sigmas)
                # print('rhos', len(rhos), rhos)
                result = csr_matrix((weights, (rows, cols)), shape=(len(neighbors), len(neighbors)), dtype=np.float32)
                result.eliminate_zeros()

                # print('applying symmetry in umap_')
                transpose = result.transpose()
                # print('size of symmetry only', (transpose + result).size)
                prod_matrix = result.multiply(transpose)

                result = (result + transpose - prod_matrix)
                # print('after fuzzy symm result.data.size, max, mean', result.data.size, max(result.data), np.mean(result.data))
                return result
            else:
                weights = (np.mean(distances[msk]) ** 2) / (
                        distances[msk] +np.min(distances[msk]))#distance_factor)
                # larger weight is a stronger edge #Dec12 replacing distance_factor with np.min(distances)
                #print('np.min(distances) or distance_factor', np.min(distances[msk]), distance_factor)
            # weights = np.exp(-distances[msk]/stdd[:,None])
            rows = np.array([np.repeat(i, len(x)) for i, x in enumerate(neighbors)])[msk]
            cols = neighbors[msk]
        elif min_max_scale == True:  # the jaccard similarity does not consider edge weights, we will use these min-max scaled weights to make a composite weight together with Jaccard to retain the distance based info
            weights = distances[msk]  # larger weight is a stronger edge
            weights = np.clip(weights, a_min=np.percentile(weights, 10),
                              a_max=None)  # the first neighbor is usually itself and hence has distance 0
            # print('min clip',np.percentile(weights,10))
            # print('clipped weights for current pc-dist', weights)
            weights = 0.5 + (weights - np.min(weights)) * (2 - 0.5) / (np.max(weights) - np.min(weights))
            weights = 1 / weights  # all weights between 0.5 and 2. these will later be multiplied by the Jac similarity which are between 0-1 to create a composite weight

            rows = np.array([np.repeat(i, len(x)) for i, x in enumerate(neighbors)])[msk]
            cols = neighbors[msk]

        # result = scipy.sparse.coo_matrix( (weights, (rows, cols)), shape=(len(neighbors), len(neighbors)) )
        result = csr_matrix((weights, (rows, cols)), shape=(len(neighbors), len(neighbors)), dtype=np.float32)
        # result.eliminate_zeros()
        # transpose = result.transpose()

        # prod_matrix = result.multiply(transpose)
        # result = result + transpose# - prod_matrix
        return result

    def func_mode(self, ll):
        # return MODE of list
        # If multiple items are maximal, the function returns the first one encountered.
        return max(set(ll), key=ll.count)

    def make_JSON(self, folderpath='/home/user/JavaCode/basicgraph/', filename='VIA_JSON.js'):
        import networkx as nx

        from networkx.readwrite import json_graph
        from collections import defaultdict
        edgelist = self.edgelist_maxout

        weightlist = self.edgeweights_maxout
        min_w = min(weightlist)
        max_w = max(weightlist)
        weightlist = [(10 * (i - min_w) / (max_w - min_w)) + 1 for i in weightlist]
        n_groups = len(set(self.labels))
        group_pop = np.zeros([n_groups, 1])
        cluster_population_dict = {}
        for group_i in set(self.labels):
            loc_i = np.where(self.labels == group_i)[0]
            group_pop[group_i] = len(loc_i)  # np.sum(loc_i) / 1000 + 1
            cluster_population_dict[group_i] = len(loc_i)

        pop_max = cluster_population_dict[max(cluster_population_dict, key=cluster_population_dict.get)]
        pop_min = cluster_population_dict[min(cluster_population_dict, key=cluster_population_dict.get)]
        print(cluster_population_dict, pop_min, pop_max)
        pt_max = max(self.scaled_hitting_times)
        pt_min = min(self.scaled_hitting_times)
        scaled_hitting_times = [10 * (i - pt_min) / (pt_max - pt_min) for i in self.scaled_hitting_times]
        node_majority_truth_labels = []
        for ci, cluster_i in enumerate(sorted(list(set(self.labels)))):
            cluster_i_loc = np.where(np.asarray(self.labels) == cluster_i)[0]
            majority_truth = str(self.func_mode(list(np.asarray(self.true_label)[cluster_i_loc])))
            node_majority_truth_labels.append(majority_truth)

        majority_truth_labels_dict = dict(enumerate(node_majority_truth_labels))
        temp = defaultdict(lambda: len(temp))

        df_edges = pd.DataFrame(edgelist, columns=['source', 'target'])
        df_edges['weight'] = weightlist
        df_edges['distance'] = [150 / i for i in weightlist]
        G = nx.DiGraph()  # directed graph
        for key in majority_truth_labels_dict:
            print('node', key, majority_truth_labels_dict[key], round(scaled_hitting_times[key], 1),
                  cluster_population_dict[key],
                  temp[majority_truth_labels_dict[key]])
            # val denotes size in d3 by default
            G.add_node(key, group=majority_truth_labels_dict[key], pseudotime=scaled_hitting_times[key],
                       val=(10 * (cluster_population_dict[key] - pop_min) / (pop_max - pop_min)) + 1,
                       group_num=temp[majority_truth_labels_dict[key]])
        for enum_i, i in enumerate(edgelist):
            print('edge', i, weightlist[enum_i], cluster_population_dict[i[0]], 150 / weightlist[enum_i])
            if (scaled_hitting_times[i[0]] < scaled_hitting_times[i[1]]):
                source_node = i[0]
                target_node = i[1]
            else:
                source_node = i[1]
                target_node = i[0]
            # val edge controls number of emitted particles
            G.add_edge(u_of_edge=source_node, v_of_edge=target_node, weight=weightlist[enum_i],
                       val=(5 * (cluster_population_dict[source_node] - pop_min) / (pop_max - pop_min)) + 1,
                       distance=150 / weightlist[enum_i])

        # Visualize the network:
        nx.draw_networkx(G)
        plt.show()
        import json

        j = json_graph.node_link_data(G)

        js = json.dumps(['var gData=', j], ensure_ascii=False, indent=2)
        with open(folderpath + filename, "w") as file:
            file.write(js)
        return

    def run_toobig_subPARC(self, X_data, jac_std_toobig=1,
                           jac_weighted_edges=True):
        n_elements = X_data.shape[0]
        hnsw = _construct_knn(X_data, knn=self.knn, distance=self.distance, num_threads=self.num_threads, too_big=True)
        if self.knn >= 0.8 * n_elements:
            k = int(0.5 * n_elements)
        else:
            k = self.knn
        neighbor_array, distance_array = hnsw.knn_query(X_data, k=k)
        csr_array = self._make_csrmatrix_noselfloop(neighbor_array, distance_array)
        sources, targets = csr_array.nonzero()
        mask = np.zeros(len(sources), dtype=bool)
        mask |= (csr_array.data > (
                np.mean(csr_array.data) + np.std(csr_array.data) * 5))  # smaller distance means stronger edge
        csr_array.data[mask] = 0
        csr_array.eliminate_zeros()
        sources, targets = csr_array.nonzero()
        edgelist = list(zip(sources.tolist(), targets.tolist()))
        edgelist_copy = edgelist.copy()
        G = ig.Graph(edgelist, edge_attrs={'weight': csr_array.data.tolist()})
        sim_list = G.similarity_jaccard(pairs=edgelist_copy)  # list of jaccard weights
        new_edgelist = []
        sim_list_array = np.asarray(sim_list)
        if jac_std_toobig == 'median':
            threshold = np.median(sim_list)
        else:
            threshold = np.mean(sim_list) - jac_std_toobig * np.std(sim_list)
        strong_locs = np.where(sim_list_array > threshold)[0]
        for ii in strong_locs: new_edgelist.append(edgelist_copy[ii])
        sim_list_new = list(sim_list_array[strong_locs])

        if jac_weighted_edges == True:
            G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist), edge_attrs={'weight': sim_list_new})
        else:
            G_sim = ig.Graph(n=n_elements, edges=list(new_edgelist))
        G_sim.simplify(combine_edges='sum')
        if jac_weighted_edges == True:
            if self.partition_type == 'ModularityVP':
                partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition, weights='weight',
                                                     n_iterations=self.n_iter_leiden, seed=self.random_seed)
                # print('partition type MVP')
            else:
                partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition, weights='weight',
                                                     n_iterations=self.n_iter_leiden, seed=self.random_seed,
                                                     resolution_parameter=self.resolution_parameter)

        else:
            if self.partition_type == 'ModularityVP':
                # print('partition type MVP')
                partition = leidenalg.find_partition(G_sim, leidenalg.ModularityVertexPartition,
                                                     n_iterations=self.n_iter_leiden, seed=self.random_seed)
            else:
                print('partition type RBC')
                partition = leidenalg.find_partition(G_sim, leidenalg.RBConfigurationVertexPartition,
                                                     n_iterations=self.n_iter_leiden, seed=self.random_seed,
                                                     resolution_parameter=self.resolution_parameter)
        PARC_labels_leiden = np.asarray(partition.membership)
        PARC_labels_leiden = np.reshape(PARC_labels_leiden, (n_elements, 1))
        small_pop_list = []
        small_cluster_list = []
        small_pop_exist = False
        dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True)
        for cluster in set(PARC_labels_leiden):
            population = len(np.where(PARC_labels_leiden == cluster)[0])
            if population < 5:
                small_pop_exist = True
                small_pop_list.append(list(np.where(PARC_labels_leiden == cluster)[0]))
                small_cluster_list.append(cluster)

        for small_cluster in small_pop_list:
            for single_cell in small_cluster:
                old_neighbors = neighbor_array[single_cell, :]
                group_of_old_neighbors = PARC_labels_leiden[old_neighbors]
                group_of_old_neighbors = list(group_of_old_neighbors.flatten())
                available_neighbours = set(group_of_old_neighbors) - set(small_cluster_list)
                if len(available_neighbours) > 0:
                    available_neighbours_list = [value for value in group_of_old_neighbors if
                                                 value in list(available_neighbours)]
                    best_group = max(available_neighbours_list, key=available_neighbours_list.count)
                    PARC_labels_leiden[single_cell] = best_group

        do_while_time = time.time()
        while (small_pop_exist == True) & (time.time() - do_while_time < 5):
            small_pop_list = []
            small_pop_exist = False
            for cluster in set(list(PARC_labels_leiden.flatten())):
                population = len(np.where(PARC_labels_leiden == cluster)[0])
                if population < 10:
                    small_pop_exist = True
                    small_pop_list.append(np.where(PARC_labels_leiden == cluster)[0])
            for small_cluster in small_pop_list:
                for single_cell in small_cluster:
                    old_neighbors = neighbor_array[single_cell, :]
                    group_of_old_neighbors = PARC_labels_leiden[old_neighbors]
                    group_of_old_neighbors = list(group_of_old_neighbors.flatten())
                    best_group = max(set(group_of_old_neighbors), key=group_of_old_neighbors.count)
                    PARC_labels_leiden[single_cell] = best_group

        dummy, PARC_labels_leiden = np.unique(list(PARC_labels_leiden.flatten()), return_inverse=True)
        self.labels = PARC_labels_leiden
        return PARC_labels_leiden

    def find_root_group(self, graph_dense, PARC_labels_leiden, root_user, true_labels, super_cluster_labels_sub,
                        super_node_degree_list):
        # PARC_labels_leiden is the subset belonging to the component of the graph being considered. graph_dense is a component of the full graph
        # returns the cluster most likely corresponding to the root (initial) state on the cluster the graph
        majority_truth_labels = np.empty((len(PARC_labels_leiden), 1), dtype=object)
        graph_node_label = []
        min_deg = 1000
        super_min_deg = 1000
        found_super_and_sub_root = False
        found_any_root = False
        true_labels = np.asarray(true_labels)

        deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0]

        for ci, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))):
            cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0]
            majority_truth = str(self.func_mode(list(true_labels[cluster_i_loc])))
            if self.super_cluster_labels != False:
                super_majority_cluster = self.func_mode(list(np.asarray(super_cluster_labels_sub)[cluster_i_loc]))
                super_majority_cluster_loc = np.where(np.asarray(super_cluster_labels_sub) == super_majority_cluster)[0]
                super_majority_truth = self.func_mode(list(true_labels[super_majority_cluster_loc]))
                super_node_degree = super_node_degree_list[super_majority_cluster]
                if (str(root_user) in majority_truth) & (str(root_user) in str(super_majority_truth)):
                    if super_node_degree < super_min_deg:
                        found_super_and_sub_root = True
                        root = cluster_i
                        found_any_root = True
                        min_deg = deg_list[ci]
                        super_min_deg = super_node_degree
                        print(f"{datetime.now()}\tNew root is {root}")
            majority_truth_labels[cluster_i_loc] = str(majority_truth) + 'c' + str(cluster_i)

            graph_node_label.append(str(majority_truth) + 'c' + str(cluster_i))
        if (self.super_cluster_labels == False) | (found_super_and_sub_root == False):
            # print('self.super_cluster_labels', super_cluster_labels_sub, ' foundsuper_cluster_sub and super root',found_super_and_sub_root)
            for ic, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))):
                cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0]
                # print('cluster', cluster_i, 'set true labels', set(true_labels))
                true_labels = np.asarray(true_labels)

                majority_truth = str(self.func_mode(list(true_labels[cluster_i_loc])))

                print('cluster', cluster_i, 'has majority', majority_truth)  # , 'with degree list', deg_list)

                if (str(root_user) in str(majority_truth)):  # 'in' not ==
                    if deg_list[ic] < min_deg:
                        root = cluster_i
                        found_any_root = True
                        min_deg = deg_list[ic]
                        print(f"{datetime.now()}\tNew root is {root} and majority {majority_truth}")
        # print('len graph node label', graph_node_label)
        if found_any_root == False:
            print(f"{datetime.now()}\tSetting arbitrary root {cluster_i}")
            root = cluster_i
        return graph_node_label, majority_truth_labels, deg_list, root

    def make_majority_truth_cluster_labels(self, PARC_labels_leiden, true_labels, graph_dense):
        majority_truth_labels = np.empty((len(PARC_labels_leiden), 1), dtype=object)
        graph_node_label = []
        true_labels = np.asarray(true_labels)
        deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0]

        for ci, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))):
            cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0]
            majority_truth = str(self.func_mode(list(true_labels[cluster_i_loc])))

            majority_truth_labels[cluster_i_loc] = str(majority_truth) + 'c' + str(cluster_i)

            graph_node_label.append(str(majority_truth) + 'c' + str(cluster_i))
            deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0]

        return graph_node_label, majority_truth_labels, deg_list

    def find_root(self, graph_dense, PARC_labels_leiden, root_user, true_labels):
        # root-user is the singlecell index given by the user when running VIA
        majority_truth_labels = np.empty((len(PARC_labels_leiden), 1), dtype=object)
        graph_node_label = []
        true_labels = np.asarray(true_labels)

        deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0]

        for ci, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))):
            # print('cluster i', cluster_i)
            cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0]

            majority_truth = self.func_mode(list(true_labels[cluster_i_loc]))

            majority_truth_labels[cluster_i_loc] = str(majority_truth) + 'c' + str(cluster_i)

            graph_node_label.append(str(majority_truth) + 'c' + str(cluster_i))
        root = PARC_labels_leiden[root_user]
        return graph_node_label, majority_truth_labels, deg_list, root

    def find_root_2Morgan(self, graph_dense, PARC_labels_leiden, root_idx, true_labels):
        # single cell index given corresponding to user defined root cell
        majority_truth_labels = np.empty((len(PARC_labels_leiden), 1), dtype=object)
        graph_node_label = []
        true_labels = np.asarray(true_labels)

        deg_list = graph_dense.sum(axis=1).reshape((1, -1)).tolist()[0]
        secondary_annotations = np.asarray(self.secondary_annotations)
        for ci, cluster_i in enumerate(sorted(list(set(PARC_labels_leiden)))):
            cluster_i_loc = np.where(np.asarray(PARC_labels_leiden) == cluster_i)[0]

            majority_truth = self.func_mode(list(true_labels[cluster_i_loc]))
            majority_truth_secondary = str(self.func_mode(list(secondary_annotations[cluster_i_loc])))
            majority_truth_labels[cluster_i_loc] = str(majority_truth) + 'c' + str(cluster_i)

            # graph_node_label.append(str(majority_truth)[0:5] + 'C' + str(cluster_i))
            graph_node_label.append(str(majority_truth)[0:5] + 'C' + str(cluster_i) + str(majority_truth_secondary))
        root = PARC_labels_leiden[root_idx]
        return graph_node_label, majority_truth_labels, deg_list, root

    def full_graph_paths(self, data, n_components_original=1):
        # make igraph object of very low-K KNN using the knn_struct PCA-dimension space made in PARC.
        # This is later used by find_shortest_path for sc_bp visual
        # neighbor array is not listed in in any order of proximity

        print(f"{datetime.now()}\tThe number of components in the original full graph is {n_components_original}")
        print(f"{datetime.now()}\tFor downstream visualization purposes we are also constructing a low knn-graph ")
        first, k0, n_comp = True, 3, n_components_original + 1
        while (n_components_original == 1 and n_comp > 1) or \
                (n_components_original > 1 and k0 <= 5 and n_comp > n_components_original):
            neighbors, distances = self.knn_struct.knn_query(data, k=k0)
            csr_array = self._make_csrmatrix_noselfloop(neighbors, distances, auto_=False)
            n_comp, comp_labels = connected_components(csr_array, return_labels=True)
            if first:
                first = False
            else:
                k0 += 1

        # print(f"{datetime.now()}\t the size of neighbor array in low-KNN in pca-space for visualization is {neighbors.shape}")
        n_cells, n_neighbors = neighbors.shape
        rows = np.transpose(np.ones((n_neighbors, n_cells)) * range(0, n_cells)).flatten()
        cols = neighbors.flatten()
        csr_full_graph = csr_matrix((distances.flatten(), (rows, cols)), shape=(n_cells, n_cells))

        return ig.Graph(list(zip(*csr_full_graph.nonzero()))).simplify(combine_edges='sum')



    def do_impute(self, df_gene, magic_steps=3, gene_list=[]):
        # ad_gene is an ann data object from scanpy
        # normalize across columns to get Transition matrix.
        transition_full_graph = normalize(self.csr_full_graph, norm='l1', axis=1) ** magic_steps

        print('shape of transition matrix raised to power', magic_steps, transition_full_graph.shape)
        subset = df_gene[gene_list].values
        return pd.DataFrame(transition_full_graph.dot(subset), index=df_gene.index, columns=gene_list)

    def run_subVIA(self):

        # Construct graph or obtain from previous run
        if self.is_coarse:

            neighbors, distances = self.knn_struct.knn_query(self.data,
                                                             k=self.knn)  # these n, d will be used as a basis for both the clustergraph and graph which we apply community detection. These two graphs are constructed differently because in the community detection we want to emphasize dissimiliarities,
            # but the in the clustergraph we want to enhance connectivity
            adjacency_augmented = None
            adjacency = None
            if (self.time_series == True) and (
                    self.time_series_labels is not None):  # refine the graph for clustering (leiden) using time_series labels if available
                print(f"{datetime.now()}\tUsing time series information to guide knn graph construction ")

                n_augmented, d_augmented = sequential_knn(self.data, self.time_series_labels, neighbors, distances,
                                                          k_seq=self.knn_sequential,
                                                          k_reverse=self.knn_sequential_reverse,
                                                          num_threads=self.num_threads, distance=self.distance)

                adjacency_augmented = self._make_csrmatrix_noselfloop(n_augmented, d_augmented,
                                                                      time_series=self.time_series,
                                                                      time_series_labels=self.time_series_labels,
                                                                      t_diff_step=self.t_diff_step)  # this function has local pruning which removes neighbors that are more than t_dif apart. Since the same type of local pruning wrt t_dif is applied pre-clustergraph, we only need to call this function once in the case of time_series data


                adjacency = self._make_csrmatrix_noselfloop(neighbors, distances)  # not augmented or t_diff'ed

            if (self.do_spatial_knn) and (
                    self.spatial_coords is not None):  # refine the graph for clustering (leiden) using time_series labels if available

                if len(self.spatial_aux) == 0:
                    print(f"{datetime.now()}\tSince no slice-labels were provided, all cells are assumed to be from the same tissue slice ")
                    self.spatial_aux =  ['slice1'] * self.nsamples#['slice1' for i in range(self.nsamples)]
                if adjacency_augmented is None: #we did not augment based on time series data
                    print(f"{datetime.now()}\tUsing spatial information to guide knn graph construction. Note! If cells are from different slices of tissue, provide a list of tissue-slice IDs ")
                    #n_augmented, d_augmented = spatial_knn(coords=self.spatial_coords, neighbors=neighbors,                                                           distances=distances,                                                           k_spatial=self.spatial_knn)
                    n_augmented, d_augmented=                    spatial_knn_new(spatial_coords=self.spatial_coords, spatial_slice_labels=self.spatial_aux,
                                    neighbors=neighbors,
                                    distances=distances, k_spatial=self.spatial_knn, distance_metric='l2',
                                    num_threads=-1, too_big=False)
                    adjacency_augmented = self._make_csrmatrix_noselfloop(n_augmented, d_augmented)  # this function has local pruning which removes neighbors that are more than t_dif apart. Since the same type of local pruning wrt t_dif is applied pre-clustergraph, we only need to call this function once in the case of time_series data
                else:
                    print(f"{datetime.now()}\tUsing spatial information to guide knn graph construction + combining with time-series data")
                    n_augmented, d_augmented = spatial_knn_new(spatial_coords=self.spatial_coords,  spatial_slice_labels = self.spatial_aux, neighbors=n_augmented,
                                                           distances=d_augmented,     k_spatial=self.spatial_knn,distance_metric = 'l2', num_threads= -1,                     too_big= False)
                    adjacency_augmented = self._make_csrmatrix_noselfloop(n_augmented, d_augmented)
                if adjacency is  None: adjacency = self._make_csrmatrix_noselfloop(neighbors, distances)  # not augmented or t_diff'ed

            if adjacency is None:
                adjacency = self._make_csrmatrix_noselfloop(neighbors,
                                                            distances)  # , min_max_scale=True) #this function has local pruning based on edge distances


        else:
            neighbors, distances = self.full_neighbor_array, self.full_distance_array
            adjacency = self.csr_array_locally_pruned

        edges = np.array(list(zip(*adjacency.nonzero())))
        sim = np.array(
            ig.Graph(n=self.nsamples, edges=edges.tolist(), edge_attrs={'weight': adjacency.data}).similarity_jaccard(
                pairs=edges))  # used to do igraph jaccard does not use  weight information to compute Jacc
        # sim = adjacency.data
        tot = len(sim)

        # Prune Jacc weighted edges off graph on global level
        threshold = np.median(sim) if self.edgepruning_clustering_resolution == 'median' else sim.mean() - self.edgepruning_clustering_resolution * sim.std()
        strong_locs = np.asarray(np.where(sim > threshold)[0])
        # to be used for leiden paritionining clustering
        g_local_global_jac = ig.Graph(n=self.nsamples, edges=list(edges[strong_locs]),
                                      edge_attrs={'weight': list(sim[strong_locs])}).simplify(
            combine_edges='sum')  # used for the clustering step and has been locally and globally pruned
        # jac_sim_list = g_local_global_jac.similarity_jaccard(pairs=list(edges[strong_locs]))
        # g_local_global_jac =ig.Graph(n=self.nsamples, edges=list(edges[strong_locs]), edge_attrs={'weight': jac_sim_list}).simplify(combine_edges='sum') #used for clustering (globally and locally pruned) and then jacc weighted (Jacc does not consider weights)
        self.sparse_glob_loc_pruned = get_sparse_from_igraph(g_local_global_jac, weight_attr='weight')

        print(
            f"{datetime.now()}\tFinished global pruning of {self.knn}-knn graph used for clustering at level of {self.edgepruning_clustering_resolution}. Kept {round(100 * len(strong_locs) / tot, 1)} % of edges. ")

        if self.is_coarse:
            # Construct full graph with no pruning - used for cluster graph edges, not listed in any order of proximity
            if (self.time_series == True) and (self.time_series_labels is not None):

                csr_full_graph = adjacency_augmented.copy()  # when time_series data is available
                if self.RW2_mode:
                    print("saving csr for RW2")
                    from scipy.sparse import save_npz
                    str_date = str(str(datetime.now())[-3:])
                    print(f'Unique ID for RW2 file {str_date}')
                    save_npz(self.working_dir_fp + 'pc' + str(self.ncomp) + '_knn' + str(self.knn) + 'kseq' + str(
                        self.knn_sequential) + 'krev' + str(self.knn_sequential_reverse) + 'RW2_' + str_date + '.npz',
                             csr_full_graph)

            # when no time-series data is available, in this iteration of weighting the edges we do not locally prune based on edge-weights becasue we intend to use all neighborhood info towards the edges in the clustergraph and ensuring the clustergraph is connected
            elif (self.do_spatial_knn == True):
                print(f"{datetime.now()}\tusing spatial coords to augment clustergraph")
                csr_full_graph = adjacency_augmented.copy()
            else:
                csr_full_graph = self._make_csrmatrix_noselfloop(neighbors, distances, auto_=False, min_max_scale=False,
                                                                 time_series=self.time_series,
                                                                 time_series_labels=self.time_series_labels)  # no local pruning: auto_ set to false #min_max_scale=True
            n_original_comp, n_original_comp_labels = connected_components(csr_full_graph, directed=False)
            print(f"{datetime.now()}\tNumber of connected components used for clustergraph  is {n_original_comp}")

            if ((self.pca_loadings is not None) & (self.velocity_matrix is not None)) & (self.gene_matrix is not None):

                edges_preClustergraph = list(
                    zip(*csr_full_graph.nonzero()))  # in the case of time-series, this is the augmented graph
                # projected_distances are the inverted Projected_distances (affinities) based on velocity adjusted location of neighbors
                projected_distances = get_projected_distances(loadings=self.pca_loadings, gene_matrix=self.gene_matrix,
                                                              velocity_matrix=self.velocity_matrix,
                                                              edgelist=edges_preClustergraph, current_pc=self.data)

                self.ig_full_graph = _composite_jacAffinity_distanceAffinity(sc_csr_graph=csr_full_graph,
                                                                             projected_distances=projected_distances)

            else:
                # this will be used for making the clustergraph. Note that in the case of time-series, this is the augmented graph with weights given by a composite metric of Jaccard and affinities
                self.ig_full_graph = _composite_jacAffinity_distanceAffinity(
                    csr_full_graph)  # 0<J<1 #combining distance-based affinities and jac-based affinities
            # for VIA we local and global prune the vertex cluster graph *after* making the clustergraph
            self.csr_array_locally_pruned = adjacency  # this graph has only been locally pruned and in the case of time-series, it is not the augmented knn. it then subsequently in later code will be subject to global pruning, followed by Leiden clustering. saving it here for non-coarse runs

            if self.time_series == True:
                self.csr_array_locally_pruned_augmented = adjacency_augmented  # t_diff edges removed and is sequentially augmented affinity graph (no Jaccard metric is added)
            else:
                self.csr_array_locally_pruned_augmented = None

            self.csr_full_graph = csr_full_graph  # for timeseries, this is the sequentially augmented graph, and later used in sc_transitions. This can also be used for hnsw_umap
            self.full_neighbor_array = neighbors
            self.full_distance_array = distances

            # knn graph used for making trajectory drawing on the visualization
            # self.full_graph_shortpath = self.full_graph_paths(self.data, n_original_comp)
            neighbors = self.full_neighbor_array
        if self.labels is None:
            print(
                f"{datetime.now()}\tCommencing community detection")  # this is done on the non-time-series augmented graph to avoid clustering largely based on prior known time-level data
            weights = 'weight' if self.jac_weighted_edges else None
            # type = leidenalg.ModularityVertexPartition if self.partition_type == 'ModularityVP' else leidenalg.RBConfigurationVertexPartition
            if self.partition_type == 'ModularityVP':
                partition = leidenalg.find_partition(g_local_global_jac, leidenalg.ModularityVertexPartition,
                                                     weights=weights,
                                                     n_iterations=self.n_iter_leiden, seed=self.random_seed)
                # print('partition type MVP')
            else:
                partition = leidenalg.find_partition(g_local_global_jac, leidenalg.RBConfigurationVertexPartition,
                                                     weights=weights,
                                                     n_iterations=self.n_iter_leiden, seed=self.random_seed,
                                                     resolution_parameter=self.resolution_parameter)
            # partition = leidenalg.find_partition(g_local_global_jac, partition_type=type, weights=weights,                                             n_iterations=self.n_iter_leiden, seed=self.random_seed)
            labels = np.array(partition.membership)

            print(f"{datetime.now()}\tFinished community detection. Found {len(set(labels))} clusters.")

            # Searching for clusters that are too big and split them
            too_big_clusters = [k for k, v in Counter(labels).items() if v > self.too_big_factor * self.nsamples]
            if len(too_big_clusters):
                print(f"{datetime.now()}\tFound {len(too_big_clusters)} clusters that are too big")

            time0_big = time.time()
            count_big_pop = len(too_big_clusters)
            num_times_expanded = 0
            # TODO - add max running time condition
            while len(too_big_clusters) > 0 & (
                    not ((time.time() - time0_big > 200) & (num_times_expanded >= count_big_pop))):
                # while len(too_big_clusters) & (not((time.time() - time0_big >200) & (num_times_expanded >= count_big_pop))):
                print(f"{datetime.now()}\tExamining clusters that are above the too_big threshold")
                cluster = too_big_clusters.pop(0)
                idx = labels == cluster
                print(f"{datetime.now()}\tCluster {cluster} contains "
                      f"{idx.sum()}>{round(self.too_big_factor * self.nsamples)} samples and is too big")

                data = self.data[idx]
                membership = max(labels) + 1 + np.array(self.run_toobig_subPARC(data))
                num_times_expanded += 1

                if len(set(membership)) > 1:
                    labels[idx] = membership
                    too_big_clusters.extend(
                        [k for k, v in Counter(membership).items() if v > self.too_big_factor * self.nsamples])
                else:
                    print(f"{datetime.now()}\tCould not expand cluster {cluster}")

            # Search for clusters that are too small (like singletons) and merge them to non-small clusters based on neighbors' majority vote
            # first we make a quick pass through all clusters to remove very small outliers by merging with a larger cluster
            # print('before final small cluster handling we have',len(set(labels)), 'communities')

            too_small_clusters = {k for k, v in Counter(labels).items() if v < self.small_pop}
            print(f"{datetime.now()}\tMerging {len(set(too_small_clusters))} very small clusters (<{self.small_pop})")
            idx = np.where(np.isin(labels, list(too_small_clusters)))[0]
            neighbours_labels = labels[neighbors[idx]]
            for i, nl in zip(*[idx, neighbours_labels]):
                # Retrieve the first non small cluster, with highest number of neighbours
                label = next((label for label, n in Counter(nl).most_common() if label not in too_small_clusters), None)
                # label = next((label for label, n in Counter(nl).most_common()), None)
                if label is not None:  # recall 0 is a valid label value
                    labels[i] = label

                # too_small_clusters = {k for k, v in Counter(labels).items() if v < self.small_pop}
            too_small_clusters = {k for k, v in Counter(labels).items() if v < self.small_pop}
            # in this pass we allow clusters to be merged even if they are not a Large Cluster.. as multiple smaller ones might come together to form an acceptably large cluster
            do_while_time = time.time()
            while len(too_small_clusters) & (time.time() - do_while_time < 15):
                # membership of neighbours of samples in small clusters
                idx = np.where(np.isin(labels, list(too_small_clusters)))[0]
                neighbours_labels = labels[neighbors[idx]]
                for i, nl in zip(*[idx, neighbours_labels]):
                    # Retrieve the first non small cluster, with highest number of neighbours
                    # label = next((label for label, n in Counter(nl).most_common() if label not in too_small_clusters), None)
                    label = next((label for label, n in Counter(nl).most_common()), None)
                    if label is not None:  # recall 0 is a valid label value
                        labels[i] = label
                # Update set of too small clusters, stopping if converged
                too_small_clusters = {k for k, v in Counter(labels).items() if v < self.small_pop}
            # Reset labels to begin from zero and with no missing numbers
            self.labels = labels = np.unique(labels, return_inverse=True)[1]

            print(f"{datetime.now()}\tFinished detecting communities. Found", len(set(self.labels)), 'communities')
        else:
            print(f'{datetime.now()}\tUsing predfined labels provided by user (this must be provided as an array)')

        # end community detection
        # do kmeans instead
        '''
        from sklearn.cluster import KMeans
        kmeans = KMeans(n_clusters=20, random_state=1).fit(X_data)
        self.labels = kmeans.labels_
        n_clus = len(set(self.labels))
        self.labels = kmeans.labels_.flatten().tolist()

        pop_list = []
        pop_list_raw = []
        for item in range(len(set(PARC_labels_leiden))):
            pop_item = PARC_labels_leiden.count(item)
            pop_list.append((item, pop_item))
            pop_list_raw.append(pop_item)
        '''
        # Make cluster-graph
        print(
            f"{datetime.now()}\tMaking cluster graph. Global cluster graph pruning level: {self.cluster_graph_pruning}")
        graph = ig.VertexClustering(self.ig_full_graph, membership=self.labels).cluster_graph(combine_edges='sum')

        graph = recompute_weights(graph, Counter(self.labels))  # returns csr matrix

        self.cluster_graph_csr_not_pruned = graph  # type csr NOT PRUNED
        edgeweights_pruned_clustergraph, edges_pruned_clustergraph, comp_labels = pruning_clustergraph(graph,
                                                                                                       global_pruning_std=self.cluster_graph_pruning,
                                                                                                       preserve_disconnected=self.preserve_disconnected,
                                                                                                       preserve_disconnected_after_pruning=self.preserve_disconnected_after_pruning)

        self.connected_comp_labels = comp_labels
        # ig.graph() creates an undirected graph,in which case get_spare_from_igraph will be symmetric for each edge (u,v) and edge (v,u) is created
        locallytrimmed_g = ig.Graph(edges_pruned_clustergraph, edge_attrs={
            'weight': edgeweights_pruned_clustergraph})  # .simplify(combine_edges='sum') #forward biasing happens later

        locallytrimmed_sparse_vc = get_sparse_from_igraph(locallytrimmed_g, weight_attr='weight')

        if self.edgebundle_pruning == self.cluster_graph_pruning:

            weights_for_layout = np.asarray(locallytrimmed_sparse_vc.data)
            # clip weights to prevent distorted visual scale
            weights_for_layout = np.clip(weights_for_layout, np.percentile(weights_for_layout, 10),
                                         np.percentile(weights_for_layout,
                                                       90))  # want to clip the weights used to get the layout
            weights_for_layout = list(weights_for_layout)

            g_layout = ig.Graph(list(zip(*locallytrimmed_sparse_vc.nonzero())),
                                edge_attrs={'weight': weights_for_layout})


        # if using a different pruning for visualized edge bundles than the one specified in self.cluster_graph_pruning
        else:

            edgeweights_layout, edges_layout, comp_labels_layout = pruning_clustergraph(graph,
                                                                                        global_pruning_std=self.edgebundle_pruning,
                                                                                        preserve_disconnected=self.preserve_disconnected,
                                                                                        preserve_disconnected_after_pruning=self.preserve_disconnected_after_pruning)

            # layout = locallytrimmed_g.layout_fruchterman_reingold(weights='weight') #uses non-clipped weights but this can skew layout due to one or two outlier edges
            layout_g = ig.Graph(edges_layout, edge_attrs={
                'weight': edgeweights_layout})  # .simplify(combine_edges='sum') remving the 'combined_edges='sum' as it does not handle diagonal self loops which we require to be retained for singleton clusters that are not connected to other clusters

            layout_g_csr = get_sparse_from_igraph(layout_g, weight_attr='weight')
            weights_for_layout = np.asarray(layout_g_csr.data)

            for i in range(weights_for_layout.shape[0]):
                if weights_for_layout[i] == 0: print(i)
            # clip weights to prevent distorted visual scale in layout
            weights_for_layout = np.clip(weights_for_layout, np.percentile(weights_for_layout, 10),
                                         np.percentile(weights_for_layout, 90))
            weights_for_layout = list(weights_for_layout)

            g_layout = ig.Graph(list(zip(*layout_g_csr.nonzero())), edge_attrs={'weight': weights_for_layout})
        self.clustergraph_igraph_forlayout = g_layout
        # the layout of the graph is determine by a pruned clustergraph and the directionality of edges will be based on the final markov pseudotimes
        # the edgeweights of the bundle-edges is determined by the distance based metrics and jaccard similarities and not by the pseudotimes
        # for the transition matrix used in the markov pseudotime and differentiation probability computations, the edges will be further biased by the hittings times and markov pseudotimes

        # layout = g_layout.layout_fruchterman_reingold(weights='weight')

        # globally trimmed link
        self.edgelist_unique = set(
            tuple(sorted(l)) for l in zip(*locallytrimmed_sparse_vc.nonzero()))  # keep only one of (0,1) and (1,0)
        self.edgelist = edges_pruned_clustergraph  # after one round of global and local pruning of graph to be used for MCMCs

        # number of components
        n_components, labels_cc = connected_components(csgraph=locallytrimmed_sparse_vc, directed=False,
                                                       return_labels=True)

        df_graph = pd.DataFrame(locallytrimmed_sparse_vc.todense())

        if (self.velocity_matrix is not None) & (self.gene_matrix is not None):
            df_velocity = pd.DataFrame(self.velocity_matrix)
            print('size velocity matrix', len(self.labels), self.velocity_matrix.shape)
            df_velocity['labels'] = self.labels
            velocity_mean = df_velocity.groupby('labels', as_index=True).mean().to_numpy()
            gene_mean = pd.DataFrame(self.gene_matrix)
            gene_mean['labels'] = self.labels
            gene_mean = gene_mean.groupby('labels', as_index=True).mean().to_numpy()

            self.A_velo, self.CSM, velo_root_top3 = velocity_transition(A=df_graph.values, V=velocity_mean,
                                                                        G=gene_mean)  # velocity directed transition matrix

            print('cell type of suggested roots:', [self.true_label[x] for x in velo_root_top3])
            if self.time_series == True:  print('cell stage of suggested roots:',
                                                [self.time_series_labels[x] for x in velo_root_top3])

        df_graph['cc'] = labels_cc
        df_graph['pt'] = float('NaN')
        df_graph['majority_truth'] = 'maj truth'
        df_graph['graph_node_label'] = 'node label'

        PARC_labels_leiden = self.labels
        set_parc_labels = list(set(PARC_labels_leiden))
        set_parc_labels.sort()

        tsi_list = []
        df_graph['markov_pt'] = float('NaN')
        terminal_clus = []  # list of cluster labels that are found to be terminal clusters
        node_deg_list = []
        super_terminal_clus_revised = []
        pd_columnnames_terminal = []
        dict_terminal_super_sub_pairs = {}
        self.root = []
        large_components = []
        for comp_i in range(n_components):
            loc_compi = np.where(labels_cc == comp_i)[0]

            if len(loc_compi) > 1:  # need at least 2 nodes in in a component to make edges
                large_components.append(comp_i)
            elif len(loc_compi) == 0:
                df_graph.at[loc_compi[0], 'markov_pt'] = 1  # 0
                df_graph.at[loc_compi[0], 'pt'] = 1  # 0

        for comp_i in large_components:  # range(n_components):
            print(f'{datetime.now()}\tcomponent number', comp_i, 'out of ', large_components)
            loc_compi = np.where(labels_cc == comp_i)[0]

            a_i = df_graph.iloc[loc_compi][loc_compi].values  # transition matrix of relevant component
            # A_velo, CSM = velocity_transition(A=a_i, V=velocity_mean,   G=gene_mean)  # velocity directed transition matrix
            if self.A_velo is not None: A_velo, CSM = self.A_velo[loc_compi, :][loc_compi, :], self.CSM[loc_compi, :][
                                                                                               loc_compi, :]
            a_i = csr_matrix(a_i, (a_i.shape[0], a_i.shape[0]))

            cluster_labels_subi = [x for x in loc_compi]
            # print('cluster_labels_subi', cluster_labels_subi)
            sc_labels_subi = [PARC_labels_leiden[i] for i in range(len(PARC_labels_leiden)) if
                              (PARC_labels_leiden[i] in cluster_labels_subi)]

            sc_truelabels_subi = [self.true_label[i] for i in range(len(PARC_labels_leiden)) if
                                  (PARC_labels_leiden[i] in cluster_labels_subi)]

            if (self.root_user is None) and (self.velocity_matrix is not None):
                # in case the root cluster is a singleton, move on to the next nominee in initial states
                root_i = velo_root_top3[0]
                cluster_i_loc = np.where(np.asarray(sc_labels_subi) == root_i)[0]
                if len(cluster_i_loc) < 2:
                    root_i = velo_root_top3[1]
                    cluster_i_loc = np.where(np.asarray(sc_labels_subi) == root_i)[0]
                    if len(cluster_i_loc) < 2:
                        root_i = velo_root_top3[2]
                        cluster_i_loc = np.where(np.asarray(sc_labels_subi) == root_i)[0]
                        if len(cluster_i_loc) < 2:
                            root_i = velo_root_top3[0]

                graph_node_label, majority_truth_labels, node_deg_list_i = self.make_majority_truth_cluster_labels(
                    PARC_labels_leiden=sc_labels_subi, true_labels=sc_truelabels_subi, graph_dense=a_i)
                for velo_root_candidate_i in velo_root_top3:
                    cluster_i_loc = np.where(np.asarray(sc_labels_subi) == velo_root_candidate_i)[0]
                    majority_truth = str(self.func_mode(list(np.asarray(sc_truelabels_subi)[cluster_i_loc])))
                    if velo_root_candidate_i == root_i: majority_truth_veloroot0 = majority_truth
                    print(
                        f"{datetime.now()}\tUsing the RNA velocity graph, A top3 candidate for initial state is {velo_root_candidate_i} comprising predominantly of {majority_truth} cells",
                    )
                print(
                    f"{datetime.now()}\tUsing the RNA velocity graph, the suggested initial root state is {root_i} comprising predominantly of {majority_truth_veloroot0} cells")


            elif (self.dataset in ['toy', 'faced', 'mESC', 'iPSC', 'group']) and (
                    self.root_user is not None):  # ((self.dataset == 'toy') | (self.dataset == 'faced')):
                root_user_ = None
                for ri in self.root_user:
                    print(f"{datetime.now()}\tgroup root method")
                    if ri in sc_truelabels_subi:
                        root_user_ = ri
                        print(f"{datetime.now()}\tfor component {comp_i}, the root is {root_user_} and ri {ri}")
                if root_user_ is None:
                    root_user_ = sc_truelabels_subi[0]
                    print(f"{datetime.now()}\tsetting a dummy root")


                graph_node_label, majority_truth_labels, node_deg_list_i, root_i = self.find_root_group(a_i,
                                                                                                            sc_labels_subi,
                                                                                                            root_user_,
                                                                                                            sc_truelabels_subi,
                                                                                                            [], [])


            elif (self.dataset == '2M'):
                for ri in self.root_user:
                    if PARC_labels_leiden[ri] in cluster_labels_subi: root_user_ = ri
                graph_node_label, majority_truth_labels, node_deg_list_i, root_i = self.find_root_2Morgan(a_i,
                                                                                                          sc_labels_subi,
                                                                                                          root_user_,
                                                                                                          sc_truelabels_subi)

            # when root_user is given as a cell index
            else:
                root_user_ = None
                if (comp_i > len(self.root_user) - 1):
                    root_user_ = 0

                else:
                    for ri in self.root_user:
                        if PARC_labels_leiden[ri] in cluster_labels_subi:
                            root_user_ = ri
                            print(
                                f"{datetime.now()}\tThe root index, {ri} provided by the user belongs to cluster number {PARC_labels_leiden[ri]} and corresponds to cell type {self.true_label[ri]}")
                if root_user_ is None: root_user_ = 0
                graph_node_label, majority_truth_labels, node_deg_list_i, root_i = self.find_root(a_i,
                                                                                                  PARC_labels_leiden,
                                                                                                  root_user_,
                                                                                                  self.true_label)

            self.root.append(root_i)
            self.majority_truth_labels = majority_truth_labels  # single cell level "Majority Truth of that Cluster + Clusterlabel"

            for item in node_deg_list_i:
                node_deg_list.append(item)

            new_root_index_found = False
            for ii, llabel in enumerate(cluster_labels_subi):
                if root_i == llabel:
                    new_root_index = ii
                    new_root_index_found = True

            if not new_root_index_found:
                print('cannot find the new root index')
                new_root_index = 0

            print(f"{datetime.now()}\tComputing lazy-teleporting expected hitting times")
            hitting_times, roundtrip_times = self.compute_hitting_time(a_i, new_root_index, self.x_lazy,
                                                                       self.alpha_teleport)
            # rescale hitting times
            very_high = np.mean(hitting_times) + 1.5 * np.std(hitting_times)
            without_very_high_pt = [iii for iii in hitting_times if iii < very_high]
            new_very_high = np.mean(without_very_high_pt) + np.std(without_very_high_pt)
            # print('very high, and new very high', very_high, new_very_high)
            new_hitting_times = [x if x < very_high else very_high for x in hitting_times]
            hitting_times = np.asarray(new_hitting_times)
            scaling_fac = 10 / max(hitting_times)
            hitting_times = hitting_times * scaling_fac
            s_ai, t_ai = a_i.nonzero()
            edgelist_ai = list(zip(s_ai, t_ai))
            edgeweights_ai = a_i.data
            # print('edgelist ai', edgelist_ai)
            # print('edgeweight ai', edgeweights_ai)
            # bias the edges based on pseudotime from "undirected" pruned graph
            biased_edgeweights_ai = get_biased_weights(edgelist_ai, edgeweights_ai, hitting_times)

            # biased_sparse = csr_matrix((biased_edgeweights, (row, col)))
            adjacency_matrix_ai = np.zeros((a_i.shape[0], a_i.shape[0]))

            for i, (start, end) in enumerate(edgelist_ai):
                adjacency_matrix_ai[start, end] = biased_edgeweights_ai[i]

            markov_hitting_times_ai = self._simulate_markov(adjacency_matrix_ai,
                                                           new_root_index)  # +adjacency_matrix.T))
            '''
            df_sc = pd.DataFrame()
            df_sc['parc'] = self.labels
            df_sc['true_time'] = self.time_series_labels
            df_ = df_sc.groupby(['parc']).mean()
            df_['via_pt'] = markov_hitting_times_ai
            df_['via_pt'] = df_['via_pt'].fillna(0)

            correlation = df_['via_pt'].corr(df_['true_time'])
            print(f'correlation cluster level via original mcmc pt before scaling, {correlation}')
            '''

            #print('try rw2 hitting times setup')
            memory_pt = 2  # 2
            rw2_hittingtimes = _compute_rw2_hittingtimes(A=adjacency_matrix_ai, root=new_root_index, memory=memory_pt,
                                                        x_lazy=self.x_lazy, alpha_teleport=self.alpha_teleport)

            '''
            df_sc = pd.DataFrame()
            df_sc['parc'] = self.labels
            df_sc['true_time'] = self.time_series_labels
            df_ = df_sc.groupby(['parc']).mean()
            df_['via_pt'] = rw2_hittingtimes
            df_['via_pt'] = df_['via_pt'].fillna(0)


            correlation = df_['via_pt'].corr(df_['true_time'])
            print(f'correlation via rw2 pt at memory q={memory} before scaling, {correlation}')
            '''
            print(f'memory for rw2 hittings times  {memory_pt}. Using rw2 based pt')
            markov_hitting_times_ai = rw2_hittingtimes

            # print('no rw2')
            # print('skip scaling of pt')
            #print('do scaling of pt')

            very_high = np.mean(markov_hitting_times_ai) + 1.5 * np.std(markov_hitting_times_ai)  # 1.5
            very_high = min(very_high, max(markov_hitting_times_ai))
            # without_very_high_pt = [iii for iii in markov_hitting_times_ai if iii < very_high]
            # new_very_high = min(np.mean(without_very_high_pt) + np.std(without_very_high_pt), very_high)

            new_markov_hitting_times_ai = [x if x < very_high else very_high for x in markov_hitting_times_ai]

            markov_hitting_times_ai = np.asarray(new_markov_hitting_times_ai)
            scaling_fac = 1 / max(markov_hitting_times_ai)
            markov_hitting_times_ai = markov_hitting_times_ai * scaling_fac

            # for eee, ttt in enumerate(markov_hitting_times_ai):print('cluster ', eee, ' had markov time', ttt)

            # print('markov hitting times', [(i, j) for i, j in enumerate(markov_hitting_times_ai)])
            # print('hitting times', [(i, j) for i, j in enumerate(hitting_times)])
            # markov_hitting_times_ai = (markov_hitting_times_ai)  # + hitting_times)*.5 #consensus
            adjacency_matrix_csr_ai = sparse.csr_matrix(adjacency_matrix_ai)
            # A_velo = velocity_transition(A=adjacency_matrix_ai, V=velocity_mean, G=gene_mean) #velocity directed transition matrix
            # print('A_velo', A_velo.shape)
            # print(A_velo)
            (sources, targets) = adjacency_matrix_csr_ai.nonzero()
            edgelist_ai = list(zip(sources, targets))
            weights_ai = adjacency_matrix_csr_ai.data
            bias_weights_2_ai = get_biased_weights(edgelist_ai, weights_ai, markov_hitting_times_ai, round=2)
            adjacency_matrix2_ai = np.zeros((adjacency_matrix_ai.shape[0], adjacency_matrix_ai.shape[
                0]))  # this will be the cluster transition matrix forward biased by pseudotimes
            for i, (start, end) in enumerate(edgelist_ai):
                adjacency_matrix2_ai[start, end] = bias_weights_2_ai[i]
            # print('adjacency_matrix2_ai col sum', np.sum(adjacency_matrix2_ai, axis=1))
            # adjacency matrix used for branch prob and terminal states will be the weighted average of the pt_trans_matrix and the velo_trans_matrix
            # note that although A_velo and adjaccency_matrix2_ai do not have rows that sum to one due to the biasing steps, the transition matrix used for simulating branch probabilities will be normalized such that each row_sum = 1. This is done in the simulate_branch_prob function
            # however the biased weights without row normalization is used for visualization
            if self.A_velo is not None: adjacency_matrix2_ai = (
                                                                           1 - self.velo_weight) * adjacency_matrix2_ai + self.velo_weight * A_velo
            user_defined_terminal_clus_exists = False
            if (len(self.user_defined_terminal_group) > 0) | (len(self.user_defined_terminal_cell) > 0):
                user_defined_terminal_clus_exists = True

            if (self.super_terminal_cells == False) & (
                    user_defined_terminal_clus_exists == False):  # when is_coarse = True, there is no list of terminal clusters/cells that are passed into VIA based on a previous iteration.
                # print('new_root_index', new_root_index, ' before get terminal')
                terminal_clus_ai = self._get_terminal_clusters(adjacency_matrix2_ai, markov_hitting_times_ai,
                                                              new_root_index)
                temp_terminal_clus_ai = []
                threshold_ = np.percentile(np.asarray(markov_hitting_times_ai), self.pseudotime_threshold_TS)
                #print('!!! Remove the [0:2] just using to speed up testing')
                for i in terminal_clus_ai:#[0:2]:

                    if markov_hitting_times_ai[i] > threshold_:
                        terminal_clus.append(cluster_labels_subi[i])
                        temp_terminal_clus_ai.append(i)
                terminal_clus_ai = temp_terminal_clus_ai
            elif (self.super_terminal_cells == False) & (user_defined_terminal_clus_exists == True):
                terminal_clus_ai = []
                terminal_clus = []
                threshold_ = np.percentile(np.asarray(markov_hitting_times_ai), self.pseudotime_threshold_TS)
                terminal_clus_temp = self._get_terminal_clusters_user_defined_(self.user_defined_terminal_cell,
                                                                              self.user_defined_terminal_group)
                for ti in terminal_clus_temp:
                    if (ti in loc_compi):
                        terminal_clus_ai.append(ti)
                        terminal_clus.append(cluster_labels_subi[ti])

            print(
                f"{datetime.now()}\tTerminal clusters corresponding to unique lineages in this component are {terminal_clus_ai} ")

            if self.memory == 0:
                use_rw2_prob = False
            else:
                use_rw2_prob = True
            if not use_rw2_prob:
                print(f'Via 1.0 lineage prob')
                for target_terminal in terminal_clus_ai:

                    prob_ai = self.simulate_branch_probability(target_terminal,
                                                               adjacency_matrix2_ai,
                                                               new_root_index,
                                                               num_sim=int(self.num_mcmc_simulations / 2))
                    print(f'terminal state {target_terminal} has probability {prob_ai}')
                    df_graph['terminal_clus' + str(cluster_labels_subi[target_terminal])] = 0.0000000

                    pd_columnnames_terminal.append('terminal_clus' + str(cluster_labels_subi[target_terminal]))

                    for k, prob_ii in enumerate(prob_ai):
                        df_graph.at[cluster_labels_subi[k], 'terminal_clus' + str(
                            cluster_labels_subi[target_terminal])] = prob_ii
            else:
                memory = self.memory
                p_memory = self.p_memory

                prob_lin_rw2 = _compute_rw2_lineageprobability(A=adjacency_matrix2_ai, memory=memory,
                                                              root=new_root_index,
                                                              terminal_states=terminal_clus_ai, x_lazy=self.x_lazy,
                                                              alpha_teleport=self.alpha_teleport)
                for enum_ts, target_terminal in enumerate(terminal_clus_ai):
                    df_graph['terminal_clus' + str(cluster_labels_subi[target_terminal])] = 0.0000000

                    pd_columnnames_terminal.append('terminal_clus' + str(cluster_labels_subi[target_terminal]))

                    for k, prob_ii in enumerate(prob_lin_rw2[:, enum_ts]):
                        df_graph.at[cluster_labels_subi[k], 'terminal_clus' + str(
                            cluster_labels_subi[target_terminal])] = prob_ii
            bp_array = df_graph[pd_columnnames_terminal].values
            bp_array[np.isnan(bp_array)] = 1e-8

            bp_array = bp_array / bp_array.sum(axis=1)[:, None]  # row normalization at the cluster level
            bp_array[np.isnan(bp_array)] = 1e-8

            for ei, ii in enumerate(loc_compi):
                df_graph.at[ii, 'pt'] = hitting_times[ei]
                df_graph.at[ii, 'graph_node_label'] = graph_node_label[ei]
                df_graph.at[ii, 'majority_truth'] = graph_node_label[ei]
                df_graph.at[ii, 'markov_pt'] = markov_hitting_times_ai[ei]

            locallytrimmed_g.vs["label"] = df_graph['graph_node_label'].values
            hitting_times = df_graph['pt'].values

        self.cluster_adjacency = adjacency_matrix2_ai
        self.hitting_times = hitting_times  # not markov chain simulated
        # print('hitting times', hitting_times[0:10])
        self.markov_hitting_times = df_graph['markov_pt'].values  # hitting_times on the forward biased#
        # print('markov hitting times,', self.markov_hitting_times[0:10])
        self.terminal_clusters = terminal_clus

        dict_ts_mode = {}
        for i in terminal_clus:
            loc__ = np.where(np.asarray(self.labels) == i)[0]
            i_mode = func_mode(list(np.asarray(self.true_label)[loc__]))
            dict_ts_mode[i] = i_mode
        print(
            f"{datetime.now()}\tThere are ({len(terminal_clus)}) terminal clusters corresponding to unique lineages {dict_ts_mode}")  # {self.terminal_clusters} ")
        self.node_degree_list = node_deg_list
        print(f"{datetime.now()}\tBegin projection of pseudotime and lineage likelihood")
        self.cluster_bp = bp_array  # there is row normalization at the cluster level so each row sums to 1

        df_graph['markov_pt'] = df_graph['markov_pt'].fillna(10)  # 0
        # nan_count = df_graph['markov_pt'].isna().sum()
        # print('df graph markov pt nan count', nan_count)
        self.single_cell_bp, self.single_cell_pt_markov = self._project_branch_probability_sc(bp_array, df_graph[
            'markov_pt'].values)

        self.single_cell_pt_markov = [item for item in self.single_cell_pt_markov if not (math.isnan(item)) == True]
        if self.time_series_labels is not None:
            df_ = pd.DataFrame()
            df_['via_pt'] = self.single_cell_pt_markov
            df_['true_time'] = self.time_series_labels
            df_['via_pt'] = df_['via_pt'].fillna(0)

            correlation = df_['via_pt'].corr(df_['true_time'])
            print(f'{datetime.now()}\tStart reading data')
            print(
                f'{datetime.now()}\tCorrelation of Via pseudotime with developmental stage {round(correlation * 100, 2)} %')

        # the single_cell_bp are not re-rownormalized. In fact we scale each column (lineage) by the max value in the lineage (column) to prevent rarer/smaller lineages from being under-represented.
        # to get a row-normalized results of the single-cell branching probabilities, we offer single_cell_bp_rownormed as an attribute so that the probabilities of each cell sum to 1.

        single_cell_bp_rownormed = np.nan_to_num(self.single_cell_bp, nan=0.0, posinf=0.0, neginf=0.0)
        # row normalize
        row_sums = single_cell_bp_rownormed.sum(axis=1)
        single_cell_bp_rownormed = single_cell_bp_rownormed / row_sums[:, np.newaxis]
        # print(f'{datetime.now()}\tCheck cell 0 of sc pb rownormed {single_cell_bp_rownormed[0, :]},{single_cell_bp_rownormed.sum(axis=1)}')
        self.single_cell_bp_rownormed = single_cell_bp_rownormed
        self.dict_terminal_super_sub_pairs = dict_terminal_super_sub_pairs
        hitting_times = self.markov_hitting_times  # cluster level

        ### CONSTRUCT GRAPH USED FOR VISUALIZATION by doing further pruning so we can easily visualize the TI

        # rebias the edge weights based on the recomputed final markov hitting time and do a final pruning used for visualization (not MCMCs)
        bias_weights_2_all = get_biased_weights(self.edgelist, edgeweights_pruned_clustergraph,
                                                self.markov_hitting_times,
                                                round=2)  # edgeweights of initial globally and locally pruned graph (all components) need to be forward biased

        n_clus = len(set(self.labels))
        temp_csr = csr_matrix((bias_weights_2_all, tuple(zip(*self.edgelist))),
                              shape=(n_clus, n_clus))  # locally and globally pruned and used for adjacencyMatrix2ai

        if self.A_velo is None:
            final_transition_matrix_all_components = temp_csr.toarray()
        else:
            final_transition_matrix_all_components = (self.velo_weight) * self.A_velo + (1 - self.velo_weight) * (
                temp_csr.toarray())
            print(f"{datetime.now()}\tTransition matrix with weight of {self.velo_weight} on RNA velocity")
        # the final array used for making the visual edgelists will be weighted average of A_velo and pt_based_trans (temp_csr)
        final_transition_matrix_all_components = csr_matrix(final_transition_matrix_all_components)

        print(f'{datetime.now()}\tCluster graph layout based on forward biasing')
        # print("**** USING OLD LAYOUT BEFORE BIASING ****")

        # ensure no extreme values distort layout:
        '''
        print('NEW, KEEP OR NOT? ensure no extreme values distort layout:')
        bias_weights_2_all_hi = np.mean(bias_weights_2_all) + 3*np.std(bias_weights_2_all)
        bias_weights_2_all_lo = np.mean(bias_weights_2_all) - 3*np.std(bias_weights_2_all)
        bias_weights_2_subset = [x for x in bias_weights_2_all if ((x < bias_weights_2_all_hi) & (x> bias_weights_2_all_lo))]
        threshold_hi = np.percentile(bias_weights_2_subset,99)
        threshold_lo = np.percentile(bias_weights_2_subset,1)
        bias_weights_2_all = [x if x < threshold_hi else threshold_hi for x in bias_weights_2_all ]
        bias_weights_2_all = [x if x > threshold_lo else threshold_lo for x in bias_weights_2_all ]
        '''
        visual_g = ig.Graph(self.edgelist, edge_attrs={'weight': bias_weights_2_all}).simplify(
            combine_edges='sum')  # used to be commented out
        # random.seed(self.random_seed)


        if (self.graph_init_pos is not None) & (self.do_spatial_layout):
            if self.graph_init_pos is None: self.graph_init_pos = self.spatial_coords
            list_graph_init_pos = []
            print('using provided spatial coords to initialise viagraph layout. Only use this if all cells are from the same slice')
            for i in range(len(set(self.labels))):
                where = np.where(np.array(self.labels) == i)[0]
                #print('where are the clusters cells',where)
                x_mean = self.graph_init_pos[where, 0].mean()
                print('x_mean',x_mean)
                y_mean = self.graph_init_pos[where, 1].mean()
                list_graph_init_pos.append([x_mean, y_mean]) #list of lists
                print('list graph init pos')
            print(list_graph_init_pos)

            layout = visual_g.layout_fruchterman_reingold(weights='weight', seed=list_graph_init_pos)
            #layout = visual_g.layout_fruchterman_reingold(weights='weight')
        else:
            layout = visual_g.layout_fruchterman_reingold(weights='weight')
        #layout = visual_g.layout_fruchterman_reingold(weights='weight',  seed=np.matrix(layout))  # used to be commented out

        self.graph_node_pos = layout.coords
        self.layout = layout  # reassign

        if (self.embedding is None) & (self.do_compute_embedding == True):
            if self.embedding_type == 'via-atlas':
                print(f'{datetime.now()}\tRun via-atlas')

                from sklearn.preprocessing import normalize
                # row_stoch = normalize(self.csr_full_graph, norm='l1', axis=1)
                ##row_stoch = row_stoch ** 3
                # temp_pca = csr_matrix(self.data)
                # X_diffused_data = row_stoch * temp_pca  # matrix multiplication to diffuse the pcs
                # X_input=self.data
                for random_state in [self.random_seed]:  # 6 self.random_seed,6
                    for min_dist in [0.1]:  # 35,0.3,0.25,0.2,0.1]:
                        for rw2_comp in [self.ncomp]:
                            import random
                            str_date = random.randint(0, 1000)
                            #print('embedding pseudorand', str_date)
                            distance_metric = 'euclidean'  # 'cosine' #
                            #print(                                f'distance metric {distance_metric} and min_dist {min_dist} and randomstate {random_state} and pseudoid {str_date}')
                            # input = self.data
                            # '/home/user/Trajectory/Datasets/EB_Phate/RW2_sparse_matrix510Pp1p5_R2Wemd.csv'

                            # r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/Cao_ProtoVert/RW2/RW2_sparse_matrix5094_pc30_knn30_krev15_kseq15_rs0.csv')
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/Cao_ProtoVert/RW2/RW2_P1_Q1_sparse_matrix5820_pc20_knn30_krev15_kseq15_rs0.csv')
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/WagnerZebrafish/RW2/knn100_pc30_kseq50RW2_sparse_matrix5820.csv')

                            # r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/Zebrafish_Lange2023/RW2/5000hvg/pc100_knn10kseq5krev5RW2_noparc137noK0871_walklength20_numwalks20_dim128.csv')
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/Zebrafish_Lange2023/RW2/5000hvg/pc100_knn10kseq5krev5RW2_672_walklength80_numwalks20_dim64.csv')

                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/Zebrafish_Lange2023/RW2/2000hvg/pc100_knn20kseq5krev5RW2_120q10walklength20798.csv')
                            # r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/EB_Phate/RW2/pc20_knn100kseq50krev50RW2_emd029.csv')

                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_knn20kseq0krev0RW2_887_P20Q20.csv')# not this
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_knn50kseq0krev0RW2_gaussTrue_051_P1Q1.csv')  # not this
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_knn20kseq0krev0RW2_887.csv') not this

                            # r2w_input = pd.read_csv( '/home/user/Trajectory/Datasets/EB_Phate/RW2/pc20_knn100kseq50krev50RW2_sparse_matrix029_P1_Qp001_numwalk20.csv')
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/EB_Phate/RW2/pc20_knn100kseq50krev50RW2_sparse_matrix029_P1_Q100_numwalk20.csv')
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_P1_Q1000_knn50kseq10krev10RW2_988.csv')
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/MouseNeuron/RW2/pc30_P1_Q1_knn50kseq10krev10RW2_988.csv')
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/2MOrgan/RW2/pc50_knn30kseq10krev0RW2_970_P1_Q10_rs42_tdiffstep2_gaussFalse_970.csv')
                            #r2w_input = pd.read_csv(                                '/home/user/Trajectory/Datasets/Pijuan_Gastrulation/RW2/pc30_knn30kseq15krev15RW2_sparse_matrix_122.csv')  # pc30_knn30kseq15krev15RW2_emd_122.csv')
                            # r2w_input = pd.read_csv('/home/user/Trajectory/Datasets/Pijuan_Gastrulation/RW2/pc30_P1_Qp001_knn30kseq15krev15RW2_sparse_matrix_122.csv')

                            #r2w_input = r2w_input.drop(['Unnamed: 0'], axis=1).values
                            #input = r2w_input[:, 0:rw2_comp]
                            # print('NOT using rw2 embedding CANNOT USE THIS CODE ON GITHUB YET AS IT USES LOCAL FILES')
                            input = self.data
                            #print('USING rw2 embedding CANNOT USE THIS CODE ON GITHUB YET AS IT USES LOCAL FILES')

                            '''
                            from sklearn.preprocessing import normalize
                            row_stoch = self.csr_full_graph
                            row_stoch.data = np.clip(row_stoch.data, np.percentile(row_stoch.data, 10),
                                                     np.percentile(row_stoch.data, 90))
                            row_stoch = normalize(row_stoch, norm='l1', axis=1)
                            row_stoch = row_stoch ** 2  # level of diffusion

                            temp = csr_matrix(input)

                            input = row_stoch * temp  # matrix multiplication
                            '''
                            do_initVia = True  # when calling via_atlas_emb() from within the class, it automatically initializes using the via cluster graph
                            n_epochs = 100  # 100 usually

                            if do_initVia:
                                self.embedding = via_atlas_emb(X_input=self.data, graph=self.csr_full_graph,
                                                               n_epochs=n_epochs,
                                                               spread=1,
                                                               distance_metric=distance_metric, min_dist=min_dist,
                                                               saveto='',
                                                               random_state=random_state, init_pos='via',
                                                               cluster_membership=self.labels, layout=layout.coords)
                            else:
                                self.embedding = via_atlas_emb(X_input=self.data, graph=self.csr_full_graph,
                                                               n_epochs=n_epochs,
                                                               spread=1,
                                                               distance_metric=distance_metric, min_dist=min_dist,
                                                               saveto='',
                                                               random_state=random_state)

                            print(f'{datetime.now()}\tCompleted via-atlas embedding')
                            title_umap = 'via-atlas rw2 knn/pc/knnseq:' + str(self.knn) + '/' + str(
                                self.ncomp) + '/' + str(self.knn_sequential) + '_knnreverse' + str(
                                self.knn_sequential_reverse) + 'tdiff' + str(
                                self.t_diff_step) + 'mindist' + str(
                                min_dist) + 'rs' + str(self.random_seed)
                            if self.time_series_labels is not None:
                                color_labels = self.time_series_labels
                                f1, ax = plot_scatter(embedding=self.embedding, labels=color_labels, cmap='plasma', s=5,
                                                      alpha=0.5, edgecolors='None',
                                                      title=title_umap, text_labels=False)
                                f1.set_size_inches(10, 10)

                                # str_date = str(str(datetime.now())[-3:])
                                # save_str = '/home/user/Trajectory/Datasets/WagnerZebrafish/viaumap_RW2_5820_k' + str(                                    self.knn) + 'TimeseriesAug' + str(self.time_series) + '_kseq' + str(                                   self.knn_sequential) + '_knnreverse' + str(self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str(                                    random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str(                                    self.do_gaussian_kernel_edgeweights) + 'viaInit' + str(do_initVia) + str_date

                                # save_str = '/home/user/Trajectory/Datasets/WagnerZebrafish/viaumap_PCA_k' + str(self.knn) + 'TimeseriesAug'+str(self.time_series)+ '_kseq' + str(self.knn_sequential) + '_knnreverse' + str(  self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str(                                random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str(                                self.do_gaussian_kernel_edgeweights) + 'viaInit'+str(do_initVia)+ str_date
                                # save_str = '/home/user/Trajectory/Datasets/MEF_Schiebinger/viaumap_k' + str(                                self.knn) + 'kseq' + str(self.knn_sequential) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                self.t_diff_step) + 'mindist' + str(min_dist) + 'rs' + str(                                self.random_seed) + 'doGauss' + str(                                self.do_gaussian_kernel_edgeweights) + str_date + 'stage'

                                # save_str = '/home/user/Trajectory/Datasets/EB_Phate/viaumap_R2W_029_P1_Qp001_k' + str( self.knn) +'TimeseriesAug'+str(self.time_series)+ 'kseq' + str(self.knn_sequential) +'_knnreverse'+str(self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + '_mainRS'+str(self.random_seed)+'mindist' + str(min_dist) + 'rsUmap' + str(random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) +'_ViaInitLayout'+str(do_initVia)+'_nEpochs'+str(n_epochs)+'_'+ str_date
                                # save_str = '/home/user/Trajectory/Datasets/EB_Phate/viaumap_PCA_k' + str(                                    self.knn) + 'TimeseriesAug' + str(self.time_series) + 'kseq' + str(                                    self.knn_sequential) + '_knnreverse' + str(                                    self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + '_mainRS' + str(self.random_seed) + 'mindist' + str(                                    min_dist) + 'rsUmap' + str(random_state) + 'doGauss' + str(                                    self.do_gaussian_kernel_edgeweights) + '_ViaInitLayout' + str(                                    do_initVia) + '_nEpochs' + str(n_epochs) + '_' + str_date
                                # save_str = '/home/user/Trajectory/Datasets/Cao_ProtoVert/viaumap_RW2_P1_Q1_5820_k' + str( self.knn) + 'TimeseriesAug'+str(self.time_series)+ '_kseq' + str(self.knn_sequential) + '_knnreverse' + str(                                self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str(                                random_state) +'_rsVIA'+str(self.random_seed)+ 'doGauss' + str(                                self.do_gaussian_kernel_edgeweights) +'viaInitLayout'+str(do_initVia)+'_nEpochs'+str(n_epochs)+'_'+ str_date
                                # save_str = '/home/user/Trajectory/Datasets/Cao_ProtoVert/viaumap_PCA_k' + str(                                    self.knn) + 'TimeseriesAug' + str(self.time_series) + '_kseq' + str(                                    self.knn_sequential) + '_knnreverse' + str(                                    self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str(                                    random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str(                                    self.do_gaussian_kernel_edgeweights) + 'viaInitLayout' + str(                                    do_initVia) + '_nEpochs' + str(n_epochs) + '_' + str_date
                                # save_str = '/home/user/Trajectory/Datasets/Pijuan_Gastrulation/viaumap_RW2_P1_Qp001_rw2comp'+str(rw2_comp)+'_122_k' + str(                                    self.knn) +'_TimeAug'+str(self.time_series)+ '_kseq' + str(self.knn_sequential) +'_knnreverse'+str(self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + 'mindist' + str(min_dist) + 'rs' + str(                                    random_state) + 'doGauss' + str(                                    self.do_gaussian_kernel_edgeweights) + str_date +'_viaInitTrue'
                                # save_str = '/home/user/Trajectory/Datasets/Pijuan_Gastrulation/viaumap_PCA_k' + str(self.knn) + '_TimeAug' + str(                                    self.time_series) + '_kseq' + str(self.knn_sequential) + '_knnreverse' + str(                                    self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + 'mindist' + str(min_dist) + 'rs' + str(                                    random_state) + 'doGauss' + str(                                    self.do_gaussian_kernel_edgeweights)+'_viaInit'+str(do_initVia)      +str(n_epochs)+'_'+ str_date
                                # save_str = '/home/user/Trajectory/Datasets/MouseNeuron/viaumap_RW2_P1_Q1000_rw2comp' + str(rw2_comp) + '_988_k' + str(self.knn) + 'TimeseriesAug'+str(self.time_series)+'_kseq' + str(self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(self.t_diff_step) + 'mindist' + str(min_dist) +'_rsMain'+str(self.random_seed)+ '_rs' + str(random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) +'_viaInitLayout'+str(do_initVia)+'_nEpochs'+str(n_epochs)+'_'+ str_date
                                # save_str = '/home/user/Trajectory/Datasets/MouseNeuron/viaumap_PCA_k' + str(self.knn) + 'TimeseriesAug'+str(self.time_series)+'_kseq' + str(self.knn_sequential) + '_knnreverse' + str( self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(self.t_diff_step) + 'mindist' + str(min_dist) +'_rsMain'+str(self.random_seed)+ '_rs' + str(random_state) + 'doGauss' + str( self.do_gaussian_kernel_edgeweights) +'_viaInitLayout'+str(do_initVia)+'_nEpochs'+str(n_epochs)+'_'+ str_date
                                # save_str = '/home/user/Trajectory/Datasets/2MOrgan/viaumap_RW2_P1_Q10_rw2comp' + str(                                    rw2_comp) + '_970_k' + str(self.knn) + 'kseq' + str(                                    self.knn_sequential) + '_knnreverse' + str(                                    self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + 'mindist' + str(min_dist) + 'rs' + str(                                    random_state) + 'doGauss' + str(self.do_gaussian_kernel_edgeweights) +'viaInitTrue'+ str_date
                                # save_str = '/home/user/Trajectory/Datasets/Zebrafish_Lange2023/via_umaps_all/viaumap_2000hvg_k' + str(                                    self.knn) + 'TimeseriesAug' + str(self.time_series) + '_kseq' + str(                                    self.knn_sequential) + '_knnreverse' + str(                                    self.knn_sequential_reverse) + 'npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str(                                    random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str(                                    self.do_gaussian_kernel_edgeweights) +'_distMet'+distance_metric[0:3]+ 'viaInit' + str(do_initVia) + str_date
                                # save_str = '/home/user/Trajectory/Datasets/Zebrafish_Lange2023/RW2/embeddings/rw2_npz871_2000hvg_k' + str(                                    self.knn) + 'TimeseriesAug' + str(self.time_series) + '_kseq' + str(                                    self.knn_sequential) + '_knnreverse' + str(                                    self.knn_sequential_reverse) + '_cluster_gp'+str(self.cluster_graph_pruning)+'_npc' + str(self.ncomp) + 'tdiff' + str(                                    self.t_diff_step) + 'mindist' + str(min_dist) + '_rsUmap' + str(                                    random_state) + '_rsVIA' + str(self.random_seed) + 'doGauss' + str(                                    self.do_gaussian_kernel_edgeweights) + '_distMet' + distance_metric[                                                                                        0:3] + 'viaInit' + str(                                    do_initVia) + str_date                                #df_umap.to_csv('/home/user/Trajectory/Datasets/MouseNeuron/2000hvg_viaumap_k'+str(self.knn)+'kseq'+str(self.knn_sequential)+'nps'+str(self.ncomp)+'tdiff'+str(self.t_diff_step)+'mindist'+str(min_dist)+'rs'+str(self.random_seed)+'stage'+str_date+".csv")
                                # savefig_ = '/home/user/Trajectory/Datasets/MouseNeuron/2000hvg_viaumap_k'+str(self.knn)+'kseq'+str(self.knn_sequential)+'nps'+str(self.ncomp)+'tdiff'+str(self.t_diff_step)+'mindist'+str(min_dist)+'rs'+str(self.random_seed)+str_date+'stage.png'
                                # save_str = '/home/user/Trajectory/Datasets/Qiu_Mouse_GastrulaPup/via-umap/5000hvg_viaumap_k'+str(self.knn)+'kseq'+str(self.knn_sequential)+'npcs'+str(self.ncomp)+'tdiff'+str(self.t_diff_step)+'doGaussianEdge'+str(self.do_gaussian_kernel_edgeweights)+'mindist'+str(min_dist)+'rs'+str(self.random_seed)+'_distmet'+distance_metric+str(str_date)
                                save_str = '/home/user/Trajectory/Datasets/Packer_Elegans2019/via-umap/5000hvg_viaumap_k' + str(
                                    self.knn) + 'kseq' + str(self.knn_sequential) + 'npcs' + str(
                                    self.ncomp) + 'tdiff' + str(self.t_diff_step) + 'doGaussianEdge' + str(
                                    self.do_gaussian_kernel_edgeweights) + 'mindist' + str(min_dist) + 'rs' + str(
                                    self.random_seed) + '_distmet' + distance_metric + str(str_date)
                                df_umap = pd.DataFrame(self.embedding)
                                # df_umap.to_csv(save_str + '.csv')
                                # f1.savefig(save_str + 'stage.png', facecolor='white', transparent=False)

                            # savefig_ = '/home/user/Trajectory/Datasets/MouseNeuron/2000hvg_viaumap_k'+str(self.knn)+'kseq'+str(self.knn_sequential)+'nps'+str(self.ncomp)+'tdiff'+str(self.t_diff_step)+'mindist'+str(min_dist)+'rs'+str(self.random_seed)+str_date+'celltype.png'

                            celltype_label = [i.split("_")[0] for i in self.true_label]  # self.true_label
                            f2, ax = plot_scatter(embedding=self.embedding, labels=celltype_label, title=title_umap,
                                                  alpha=0.5, s=5, color_dict=self.color_dict)
                            f2.set_size_inches(10, 10)
                            # save_str = '/home/user/Trajectory/Datasets/Rashmi2023/emt/'+self.embedding_type+'_knn'+str(self.knn)
                            # f2.savefig(save_str + 'celltype.png', facecolor='white', transparent=False)
                            plt.show()

            elif self.embedding_type == 'via-mds':
                str_date = str(str(datetime.now())[-3:])
                print(f'{datetime.now()}\tRun via-mds')
                # n_milestones = min(self.nsamples, max(10000, int(0.1*self.nsamples)))

                for rs_i in [self.random_seed]:
                    for k_project_milestones in [3]:  # ,5,10]:
                        for k_mds_i in [25]:  # [5,10,15,25]:
                            for diffusion_i in [2]:  # [2,5,10]:
                                t_difference = self.t_diff_step
                                k_seq_i = 2
                                n_milestones_mds = min(3000, self.data.shape[0])

                                self.embedding = via_mds(X_pca=self.data, n_milestones=n_milestones_mds, k=k_mds_i,
                                                         knn_seq=k_seq_i, k_project_milestones=k_project_milestones,
                                                         # neighbors_distances=self.full_neighbor_array,

                                                         diffusion_op=diffusion_i, random_seed=rs_i,
                                                         viagraph_full=self.csr_full_graph, t_difference=t_difference,
                                                         time_series_labels=self.time_series_labels, saveto='',
                                                         double_diffusion=False)

                                str_date = str(str(datetime.now())[-3:])
                                # save_str = '/home/user/Trajectory/Datasets/Pijuan_Gastrulation/mds/viamds_singlediffusion_pcs' + str(                                    shape_data) + '_k' + str(self.knn) + '_milestones' + str(                                   n_milestones_mds) + '_kprojectmilestones' + str(                                    k_project_milestones) + 't_step' + str(t_difference) + '_knnmds' + str(                                    k_mds_i) + '_kseqmds' + str(k_seq_i) + '_kseq' + str(                                    self.knn_sequential) + '_nps' + str(self.ncomp) + '_tdiff' + str(self.t_diff_step) + '_randseed' + str(self.random_seed) + '_diffusionop' + str(                                    diffusion_i) + '_RsMds' + str(rs_i) + '_' + str_date
                                # save_str = '/home/user/Trajectory/Datasets/Zebrafish_Lange2023/via_mds/viamds_singlediffusion_pcs' + str(                                    shape_data) + '_k' + str(self.knn) + '_milestones' + str(                                    n_milestones_mds) + '_kprojectmilestones' + str(                                    k_project_milestones) + 't_step' + str(t_difference) + '_knnmds' + str(                                    k_mds_i) + '_kseqmds' + str(k_seq_i) + '_kseq' + str(                                    self.knn_sequential) + '_nps' + str(self.ncomp) + '_tdiff' + str(                                    self.t_diff_step) + '_randseed' + str(self.random_seed) + '_diffusionop' + str(                                    diffusion_i) + '_RsMds' + str(rs_i) + '_' + str_date
                                # save_str = '/home/user/Trajectory/Datasets/Qiu_Mouse_GastrulaPup/mds/viamds_singlediffusion_pcs' + str(                                    shape_data) + '_k' + str(self.knn) + '_milestones' + str(                                    n_milestones_mds) + '_kprojectmilestones' + str(                                    k_project_milestones) + 't_step' + str(t_difference) + '_knnmds' + str(                                    k_mds_i) + '_kseqmds' + str(k_seq_i) + '_kseq' + str(                                    self.knn_sequential) + '_nps' + str(self.ncomp) + '_tdiff' + str(                                    self.t_diff_step) + '_randseed' + str(self.random_seed) + '_diffusionop' + str(                        diffusion_i) + '_RsMds' + str(rs_i) + '_' + str_date
                                # save_str = '/home/user/Trajectory/Datasets/WagnerZebrafish/2000hvg/mds/singlediffusion_viamds/2000hvg_viamds_singlediffusion_pcs'+str(shape_data)+'_k' + str(self.knn) + '_milestones'+str(n_milestones_mds)+'_kprojectmilestones'+str(k_project_milestones)+'t_step'+str(t_difference)+'_knnmds' + str(k_mds_i) +'_kseqmds' + str(k_seq_i) +'_kseq'+str(self.knn_sequential)+'_nps' + str(self.ncomp) + '_tdiff' + str(self.t_diff_step)+'_randseed'+str(self.random_seed)+ '_diffusionop'+str(diffusion_i)+'_RsMds'+str(rs_i)+'_'+str_date
                                # save_str = '/home/user/Trajectory/Datasets/Cao_ProtoVert/mds_theseare1000hvg/singlediffusion_viamds/1000hvg_viamds_singlediffusion_doExp'+str(self.do_gaussian_kernel_edgeweights)+'_k' + str(self.knn) + '_milestones'+str(n_milestones_mds)+'_kprojectmilestones'+str(k_project_milestones)+'t_stepmds'+str(t_difference)+'_knnmds' + str(k_mds_i) +'_kseqmds' + str(k_seq_i) +'_kseq'+str(self.knn_sequential)+'_nps' + str(self.ncomp) + '_tdiff' + str(self.t_diff_step)+'_randseed'+str(self.random_seed)+ '_diffusionop'+str(diffusion_i)+'_rsMds'+str(rs_i)+'_'+str_date
                                # save_str = '/home/user/Trajectory/Datasets/EB_Phate/viamds_R2W_029_P1_Qp001_singlediffusion_prescaled_doExp' + str(                                    self.do_gaussian_kernel_edgeweights) + '_k' + str(self.knn) + '_milestones' + str(                                    n_milestones_mds) + '_kprojectmilestones' + str(                                    k_project_milestones) + 't_stepmds' + str(t_difference) + '_knnmds' + str(                                    k_mds_i) + '_kseqmds' + str(k_seq_i) + '_kseq' + str(                                    self.knn_sequential) + '_npc' + str(self.ncomp) + '_tdiff' + str(                                    self.t_diff_step) + '_randseed' + str(self.random_seed) + '_diffusionop' + str(                                    diffusion_i) + '_rsMds' + str(rs_i) + '_' + str_date                                # save_str = '/home/user/Trajectory/Datasets/MEF_Schiebinger/viamds_singlediffusion_prescaled_doExp'+str(self.do_gaussian_kernel_edgeweights)+'_k' + str(self.knn) + '_milestones'+str(n_milestones_mds)+'_kprojectmilestones'+str(k_project_milestones)+'t_stepmds'+str(t_difference)+'_knnmds' + str(k_mds_i) +'_kseqmds' + str(k_seq_i) +'_kseq'+str(self.knn_sequential)+'_npc' + str(self.ncomp) + '_tdiff' + str(self.t_diff_step)+'_randseed'+str(self.random_seed)+ '_diffusionop'+str(diffusion_i)+'_rsMds'+str(rs_i)+'_'+str_date
                                print(f'{datetime.now()}\tCompleted via-mds')

                                # save_str = '/home/user/Trajectory/Datasets/Qiu_Mouse_GastrulaPup/mds/' + self.embedding_type + '_knn' + str(                                    self.knn) + '_projectmilestones' + str(k_project_milestones) + '_kmds' + str(                                    k_mds_i) + '_diffusion' + str(diffusion_i)+'_strdate'+str(str_date)
                                save_str = '/home/user/Trajectory/Datasets/Packer_Elegans2019/via-mds/' + self.embedding_type + '_knn' + str(
                                    self.knn) + '_projectmilestones' + str(k_project_milestones) + '_kmds' + str(
                                    k_mds_i) + '_diffusion' + str(diffusion_i) + '_strdate' + str(str_date)
                                df_mds = pd.DataFrame(self.embedding)
                                df_mds.to_csv(save_str + ".csv")
                                celltype_label = [i.split("_")[0] for i in self.true_label]  # self.true_label
                                f1, ax = plot_scatter(embedding=self.embedding, labels=celltype_label, title='via-mds',
                                                      alpha=0.5, cmap='rainbow', text_labels=True)

                                f1.set_size_inches(10, 10)
                                #f1.savefig(save_str + '_celltype.png', facecolor='white', transparent=False)

                                if self.time_series_labels is not None:
                                    f2, ax2 = plot_scatter(embedding=self.embedding, labels=self.time_series_labels,
                                                           title='via-mds', alpha=0.5, cmap='plasma')

                                    f2.set_size_inches(10, 10)
                                    #f2.savefig(save_str + '_stage.png', facecolor='white', transparent=False)



            elif self.embedding_type == 'via-force':
                print(f'{datetime.now()}\tRun via-force')
                # n_milestone = min(self.nsamples, max(5000, int(0.1*self.nsamples)))
                n_milestones = 3000
                self.embedding = via_forcelayout(X_pca=self.data, viagraph_full=self.csr_full_graph, k=10, knn_seq=10,
                                                 n_milestones=n_milestones)
                if self.time_series_labels is None:
                    color_labels = self.true_label
                else:
                    color_labels = self.time_series_labels
                if (isinstance(color_labels[0], str)) == True:
                    categorical = True
                else:
                    categorical = False

                plot_scatter(embedding=self.embedding, labels=color_labels, title='via-force', categorical=categorical)
                plot_scatter(embedding=self.embedding, labels=self.true_label, title='via-mds', categorical=True)
                plt.show()
            else:
                print(
                    f'{datetime.now()}\tNo embedding will be computed: if you wish to compute a via-embedding specify one of via-force, via-mds or via-umap')

        if self.edgebundle_pruning_twice == False:
            # print('creating bundle with single round of global pruning at a level of', self.edgebundle_pruning)
            print(f"{datetime.now()}\tStarting make edgebundle viagraph...")
            self.hammerbundle_cluster, layout = make_edgebundle_viagraph(layout, g_layout, decay=self.viagraph_decay)

        # simplifying structure of edges used on the visual layout
        edgeweights_maxout_2, edgelist_maxout_2, comp_labels_2 = pruning_clustergraph(
            final_transition_matrix_all_components,
            global_pruning_std=self.visual_cluster_graph_pruning,
            max_outgoing=self.max_visual_outgoing_edges,
            preserve_disconnected=self.preserve_disconnected)

        if self.edgebundle_pruning_twice == True:
            print(
                f"{datetime.now()}\tAdditional Visual cluster graph pruning for edge bundling at level: {self.visual_cluster_graph_pruning}")
            layout_g = ig.Graph(self.edgelist, edge_attrs={'weight': edgeweights_maxout_2}).simplify(
                combine_edges='sum')
            layout_g_csr = get_sparse_from_igraph(layout_g, weight_attr='weight')
            weights_for_layout = np.asarray(layout_g_csr.data)
            # clip weights to prevent distorted visual scale in layout
            weights_for_layout = np.clip(weights_for_layout, np.percentile(weights_for_layout, 10),
                                         np.percentile(weights_for_layout, 90))
            weights_for_layout = list(weights_for_layout)
            graph_for_layout = ig.Graph(list(zip(*layout_g_csr.nonzero())), edge_attrs={'weight': weights_for_layout})
            self.clustergraph_igraph_forlayout = graph_for_layout
            # edge bundle is based on the visually (double) pruned graph rather than the inital graph used for Velo and Pseudotime
            self.hammerbundle_cluster, layout = make_edgebundle_viagraph(layout=layout, graph=graph_for_layout, decay=self.viagraph_decay)
        '''
        else:
            print(f'redoing cluster graph layout based on forward biased edges')
            layout_g = ig.Graph(edgelist_maxout_2, edge_attrs={'weight': bias_weights_2_all}).simplify(
                combine_edges='sum')
            layout_g_csr = get_sparse_from_igraph(layout_g, weight_attr='weight')
            weights_for_layout = np.asarray(layout_g_csr.data)
            # clip weights to prevent distorted visual scale in layout
            weights_for_layout = np.clip(weights_for_layout, np.percentile(weights_for_layout, 10),
                                         np.percentile(weights_for_layout, 90))
            weights_for_layout = list(weights_for_layout)
            graph_for_layout = ig.Graph(list(zip(*layout_g_csr.nonzero())), edge_attrs={'weight': weights_for_layout})
            # edge bundle is based on the visually (double) pruned graph rather than the inital graph used for Velo and Pseudotime
            self.hammerbundle_cluster, layout = make_edgebundle_viagraph(layout, graph_for_layout)
        '''
        temp_csr = csr_matrix((np.array(edgeweights_maxout_2), tuple(zip(*edgelist_maxout_2))), shape=(n_clus, n_clus))
        temp_csr = temp_csr.transpose().todense() + temp_csr.todense()
        temp_csr = np.tril(temp_csr, -1)  # elements along the main diagonal and above are set to zero
        temp_csr = csr_matrix(temp_csr)
        edgeweights_maxout_2 = temp_csr.data
        scale_factor = max(edgeweights_maxout_2) - min(edgeweights_maxout_2)
        edgeweights_maxout_2 = [((wi + .1) * 2.5 / scale_factor) + 0.1 for wi in edgeweights_maxout_2]

        sources, targets = temp_csr.nonzero()
        edgelist_maxout_2 = list(zip(sources.tolist(), targets.tolist()))
        self.edgelist_maxout = edgelist_maxout_2
        self.edgeweights_maxout = edgeweights_maxout_2

        remove_outliers = hitting_times
        threshold = np.percentile(remove_outliers, 95)  # np.mean(remove_outliers) + 1* np.std(remove_outliers)
        th_hitting_times = [x if x < threshold else threshold for x in hitting_times]
        remove_outliers_low = hitting_times[hitting_times < (np.mean(hitting_times) - 0.3 * np.std(hitting_times))]
        threshold_low = 0 if remove_outliers_low.size else np.percentile(remove_outliers_low, 5)
        th_hitting_times = [x if x > threshold_low else threshold_low for x in th_hitting_times]

        scaled_hitting_times = (th_hitting_times - np.min(th_hitting_times))
        npmax = np.max(scaled_hitting_times) or 1
        scaled_hitting_times = scaled_hitting_times * (1000 / npmax)

        self.scaled_hitting_times = scaled_hitting_times  # cluster mcmc pt cluster level
        scaled_hitting_times = scaled_hitting_times.astype(int)
        pal = ig.drawing.colors.AdvancedGradientPalette(['yellow', 'green', 'blue'], n=1001)

        # making a new "augmented" single-cell graph based on the computed pseudotimes - potentially useful when there are no time-series labels for (optionally) guiding the graph structure
        use_pt_to_guide_graph = False
        if use_pt_to_guide_graph == True:
            print(f'{datetime.now()}\tMake pt-augmented knn')
            pt_augmented_adjacency_igraph, adjacency_augmented = self._make_pt_augmented_adjacency_igraph(
                neighbors=neighbors, distances=distances, k_reverse=10, knn=20)

            print(f'{datetime.now()}\tRun via-umap on pt-augmented knn')  # graph=csr_full_graph
            self.embedding = via_atlas_emb(X_input=self.data, graph=self.adjacency_pt_augmented, n_epochs=100, spread=1,
                                           distance_metric='euclidean', min_dist=0.3,
                                           saveto='/home/user/Trajectory/Datasets/HumanCD34/pt-aug/not_ptaug_umap_.csv')  # usually min_dist default =0.1, for cd34 0.8

        '''
        self.embedding = via_mds(X_pca=self.data, k=15, t_diffusion=2,
                                 n_milestones=min(self.nsamples, max(10000, int(0.1 * self.nsamples))),
                                 viagraph_full=adjacency_pt_augmented, time_series_labels=[int(i*10) for i in self.single_cell_pt_markov],
                                 saveto='')
        '''


        all_colors = []  # TODO clean up this paragraph/put into a separate function
        for i in scaled_hitting_times:
            all_colors.append(pal.get(int(i))[0:3])

        locallytrimmed_g.vs['hitting_times'] = scaled_hitting_times
        locallytrimmed_g.vs['color'] = [pal.get(i)[0:3] for i in scaled_hitting_times]

        self.group_color = [colors.to_hex(v) for v in locallytrimmed_g.vs['color']]  # based on ygb scale
        viridis_cmap = cm.get_cmap('viridis_r')

        self.group_color_cmap = [colors.to_hex(v) for v in
                                 viridis_cmap(scaled_hitting_times / 1000)]  # based on ygb scale

        self.graph_node_label = df_graph['graph_node_label'].values
        self.edgeweight = [e['weight'] * 1 for e in locallytrimmed_g.es]

        # print('To draw viagraph edges without bundling and plot piechart composition of cell types: use self.draw_piechart_graph_nobundle()')
        # self.draw_piechart_graph_nobundle()
        # print('drawing with bundle piechart... Use self.draw_piechart_graph() to reproduce this plot')

        initial_bandwidth = 0.05  # 0.05
        decay = 0.7  # 0.7
        n_milestones = 150  # f you min(self.nsamples, max(500, int(0.1 * self.nsamples)))

        # print(f'milestones for edgebundling {n_milestones}')
        global_visual_pruning = 0.15
        extra_title_text = 'jac_visual:' + str(global_visual_pruning) + ' bw:' + str(
            initial_bandwidth) + ' decay:' + str(decay)
        # sc_graph = self.csr_full_graph
        self.hammerbundle_milestone_dict = None
        if self.embedding is not None:
            print(
                f"{datetime.now()}\tStart making edgebundle milestone with {n_milestones} milestones...This can be recomputed with make_edgebundle_milestone()")
            self.hammerbundle_milestone_dict = make_edgebundle_milestone(embedding=self.embedding,
                                                                         sc_graph=self.ig_full_graph,
                                                                         n_milestones=n_milestones,
                                                                         global_visual_pruning=global_visual_pruning,
                                                                         initial_bandwidth=initial_bandwidth,
                                                                         decay=decay,
                                                                         weighted=True,
                                                                         sc_labels_numeric=self.time_series_labels,
                                                                         sc_pt=self.single_cell_pt_markov,
                                                                         terminal_cluster_list=self.terminal_clusters,
                                                                         single_cell_lineage_prob=self.single_cell_bp)

        self.labels = list(self.labels)


        for tsi in self.terminal_clusters:
            loc_i = np.where(np.asarray(self.labels) == tsi)[0]
            val_pt = [self.single_cell_pt_markov[i] for i in loc_i]

            th_pt = np.percentile(val_pt, 50)  # 50
            loc_i = [loc_i[i] for i in range(len(val_pt)) if val_pt[i] >= th_pt]
            temp = np.mean(self.data[loc_i], axis=0)
            labelsq, distances = self.knn_struct.knn_query(temp, k=1)

            tsi_list.append(labelsq[0][0])

        if self.embedding is not None:
            print('REMEMBER TO RE-INCLUDE the PLT.SHOW HERE - COMMENTING IT OUT FOR NOW')
            plot_scatter(embedding=self.embedding, labels=self.single_cell_pt_markov, sc_index_terminal_states=tsi_list,
                         title='pseudotime and terminal states', cmap='plasma', true_labels=self.true_label)

            plot_scatter(embedding=self.embedding, labels=self.true_label,
                         sc_index_terminal_states=tsi_list, title='lineage and terminal states', cmap='rainbow',
                         true_labels=self.labels)
            #plt.show()

        return


    def accuracy(self, onevsall=1):

        true_labels = self.true_label
        Index_dict = {}
        PARC_labels = self.labels
        N = len(PARC_labels)
        n_cancer = list(true_labels).count(onevsall)
        n_pbmc = N - n_cancer

        for k in range(N):
            Index_dict.setdefault(PARC_labels[k], []).append(true_labels[k])
        num_groups = len(Index_dict)
        sorted_keys = list(sorted(Index_dict.keys()))
        error_count = []
        pbmc_labels = []
        thp1_labels = []
        fp, fn, tp, tn, precision, recall, f1_score = 0, 0, 0, 0, 0, 0, 0

        for kk in sorted_keys:
            vals = [t for t in Index_dict[kk]]
            majority_val = self.func_mode(vals)
            # if majority_val == onevsall: print('cluster', kk, ' has majority', onevsall, 'with population', len(vals))
            if kk == -1:
                len_unknown = len(vals)
                # print('len unknown', len_unknown)
            if (majority_val == onevsall) and (kk != -1):
                thp1_labels.append(kk)
                fp = fp + len([e for e in vals if e != onevsall])
                tp = tp + len([e for e in vals if e == onevsall])
                list_error = [e for e in vals if e != majority_val]
                e_count = len(list_error)
                error_count.append(e_count)
            elif (majority_val != onevsall) and (kk != -1):
                pbmc_labels.append(kk)
                tn = tn + len([e for e in vals if e != onevsall])
                fn = fn + len([e for e in vals if e == onevsall])
                error_count.append(len([e for e in vals if e != majority_val]))

        predict_class_array = np.array(PARC_labels)
        PARC_labels_array = np.array(PARC_labels)
        number_clusters_for_target = len(thp1_labels)
        for cancer_class in thp1_labels:
            predict_class_array[PARC_labels_array == cancer_class] = 1
        for benign_class in pbmc_labels:
            predict_class_array[PARC_labels_array == benign_class] = 0
        predict_class_array.reshape((predict_class_array.shape[0], -1))
        error_rate = sum(error_count) / N
        n_target = tp + fn
        tnr = tn / n_pbmc
        fnr = fn / n_cancer
        tpr = tp / n_cancer
        fpr = fp / n_pbmc

        if tp != 0 or fn != 0: recall = tp / (tp + fn)  # ability to find all positives
        if tp != 0 or fp != 0: precision = tp / (tp + fp)  # ability to not misclassify negatives as positives
        if precision != 0 or recall != 0:
            f1_score = precision * recall * 2 / (precision + recall)

        majority_truth_labels = np.empty((len(true_labels), 1), dtype=object)
        for cluster_i in set(PARC_labels):
            cluster_i_loc = np.where(np.asarray(PARC_labels) == cluster_i)[0]
            true_labels = np.asarray(true_labels)
            majority_truth = self.func_mode(list(true_labels[cluster_i_loc]))
            majority_truth_labels[cluster_i_loc] = majority_truth

        majority_truth_labels = list(majority_truth_labels.flatten())
        accuracy_val = [error_rate, f1_score, tnr, fnr, tpr, fpr, precision,
                        recall, num_groups, n_target]

        return accuracy_val, predict_class_array, majority_truth_labels, number_clusters_for_target

    def run_VIA(self):
        print(
            f'{datetime.now()}\tRunning VIA over input data of {self.data.shape[0]} (samples) x {self.data.shape[1]} (features)')
        print(f'{datetime.now()}\tKnngraph has {self.knn} neighbors')

        self.knn_struct = _construct_knn(self.data, knn=self.knn, distance=self.distance, num_threads=self.num_threads)
        st = time.time()
        self.run_subVIA()
        run_time = time.time() - st
        print(f'{datetime.now()}\tTime elapsed {round(run_time, 1)} seconds')
        do_accuracy = False

        if do_accuracy:
            print(f'{datetime.now()}\tCalculating accuracy scores for clusters in the StaVia graph. saved to via_object.stats_df as a dataframe')
            targets = list(set(self.true_label))
            targets.sort()

            N = len(self.true_label)
            self.f1_accumulated = 0
            self.f1_mean = 0
            self.stats_df = pd.DataFrame(
                {'edgepruning_clustering_resolution': [self.edgepruning_clustering_resolution], 'edgepruning_clustering_resolution_local': [self.edgepruning_clustering_resolution_local],
                 'runtime(s)': [run_time]})

            list_roc = []
            if len(targets) > 1:
                f1_accumulated, f1_acc_noweighting = 0, 0
                for onevsall_val in targets:
                    # print('target is', onevsall_val)
                    vals_roc, predict_class_array, majority_truth_labels, numclusters_targetval = \
                        self.accuracy(onevsall=onevsall_val)
                    f1_current = vals_roc[1]
                    f1_accumulated = f1_accumulated + f1_current * (list(self.true_label).count(onevsall_val)) / N
                    f1_acc_noweighting = f1_acc_noweighting + f1_current

                    list_roc.append([self.edgepruning_clustering_resolution, self.edgepruning_clustering_resolution_local, onevsall_val] +
                                    vals_roc + [numclusters_targetval] + [run_time])

                f1_mean = f1_acc_noweighting / len(targets)

                df_accuracy = pd.DataFrame(list_roc,
                                           columns=['edgepruning_clustering_resolution', 'edgepruning_clustering_resolution_local', 'onevsall-target', 'error rate',
                                                    'f1-score', 'tnr', 'fnr',
                                                    'tpr', 'fpr', 'precision', 'recall', 'num_groups',
                                                    'population of target', 'num clusters', 'clustering runtime'])
                # df_accuracy.to_csv('/home/user/Trajectory/Datasets/Cao_ProtoVert/df_accuracy_.csv')

                self.f1_accumulated = f1_accumulated
                self.f1_mean = f1_mean
                self.stats_df = df_accuracy