import pandas as pd
import scanpy as sc
from scanpy import read
import wget
import gdown
import os
from scipy.io import loadmat
from datetime import datetime
def _zebrahub(foldername="./", use_velocity: bool = False):
    """Load the Zebrahub data as an AnnData object, downloading it if it is not on disk.

    The plain dataset is roughly 2 Gb. Set ``use_velocity=True`` to load the
    variant that also carries the velocity matrices (roughly 10 Gb).

    The data has already been filtered and log-normalised as follows::

        sc.pp.filter_cells(adata, min_genes=200)
        sc.pp.filter_genes(adata, min_cells=5)
        print('after filtering', adata)
        sc.pp.normalize_total(adata, target_sum=1e4)
        adata.raw = adata  # saving the raw counts
        sc.pp.log1p(adata)

    Args:
        foldername (string): path to the directory where the dataset is stored
            (or read from, if it has already been downloaded). Defaults to the
            current directory ``'./'``.
        use_velocity (bool): if True, load the larger file that includes the
            velocity matrices. Defaults to False.

    Returns:
        AnnData object

    .. image:: https://github.com/ShobiStassen/VIA/blob/master/Figures/AtlasGallery/zebrahub_labeled.png?raw=true
       :width: 200px
    """
    # Pick file name, download size hint and source URL for the requested variant.
    if use_velocity:
        file_name = "Zebrahub_data_via_obsmVelocity.h5ad"
        size_hint = "10 Gb"
        data_url = "https://drive.google.com/file/d/1eOJ734HufRlz2uGTXZE7DZkQ1a5FpRGk/view?usp=drive_link"
    else:
        file_name = "zebrahub_data_via.h5ad"
        size_hint = "2.5 Gb"
        data_url = "https://drive.google.com/file/d/1Pr_-5JDJYbpaUFwP5BvDrtfEQa_kb3Zq/view?usp=drive_link"

    # os.path.join keeps the path correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, file_name)

    # Already downloaded: read the local copy directly.
    if os.path.isfile(data_path):
        return sc.read_h5ad(filename=data_path)

    print(f'{datetime.now()}\tStart downloading data... This could take a few minutes ({size_hint} file)')
    # NOTE(review): the URLs above are Google Drive "view" page links, not
    # direct-download links -- confirm that scanpy's backup_url download
    # actually retrieves the h5ad file from them.
    adata = read(data_path, backup_url=data_url, sparse=True, cache=True)
    print(f'{datetime.now()}\tFinished downloading data. Saved to {data_path}')
    return adata
def _mouse_gastrulation_sala(foldername="./"):
    """Load the Mouse Gastrulation 2019 Pijuan Sala data as an AnnData object.

    The file is downloaded on first use if it is not found in ``foldername``.

    Args:
        foldername (string): path to the directory where the dataset is stored
            (or read from, if it has already been downloaded). Defaults to the
            current directory ``'./'``.

    Returns:
        AnnData object

    .. image:: https://github.com/ShobiStassen/VIA/blob/master/Figures/AtlasGallery/mouseGastrSala.png?raw=true
       :width: 200px
    """
    # os.path.join keeps the path correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "pijuan_gastrulation_via.h5ad")
    # Bound unconditionally so the read below never sees an undefined name.
    # NOTE(review): this is a Google Drive "view" page link, not a
    # direct-download link -- confirm scanpy's backup_url download handles it.
    data_url = 'https://drive.google.com/file/d/1rvH04WAF97nXd0UiHfcVIdIF6sxS3QhL/view?usp=drive_link'

    needs_download = not os.path.isfile(data_path)
    if needs_download:
        print(f'{datetime.now()}\tStart downloading data... This could take a few minutes')

    # The download (when required) happens inside read() via backup_url.
    adata = read(data_path, backup_url=data_url, sparse=True, cache=True)

    # Report completion only after the download has actually taken place.
    if needs_download:
        print(f'{datetime.now()}\tFinished downloading data. Saved to {data_path}')
    return adata
def toy_multifurcating(foldername="./"):
    """Load the Toy_Multifurcating data as an AnnData object.

    Missing CSV files are downloaded into ``foldername`` first.
    To access obs (label) as list, use AnnData.obs['group_id'].values.tolist()

    Args:
        foldername (string): path to the directory where the dataset is stored.
            Defaults to the current directory ``'./'``.

    Returns:
        AnnData object with ``obs`` columns 'group_id' and 'true_time'

    .. image:: https://github.com/ShobiStassen/VIA/blob/master/Figures/toy3_streamvia.png?raw=true
       :width: 200px
    """
    # os.path.join keeps the paths correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "toy_multifurcating_M8_n1000d1000.csv")
    ids_path = os.path.join(foldername, "toy_multifurcating_M8_n1000d1000_ids_with_truetime.csv")

    sources = (
        ("https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/toy_multifurcating_M8_n1000d1000.csv",
         data_path),
        ("https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/toy_multifurcating_M8_n1000d1000_ids_with_truetime.csv",
         ids_path),
    )
    # Fetch whichever file is missing and report the path that was actually written.
    for url, path in sources:
        if not os.path.isfile(path):
            print(f'{datetime.now()}\tStart downloading data...')
            wget.download(url, path)
            print(f'{datetime.now()}\tFinished downloading data. Saved to {path}')

    # read files as pandas objects
    df_counts = pd.read_csv(data_path).drop('Unnamed: 0', axis=1)
    df_ids = pd.read_csv(ids_path)

    # Rearrange df_ids in ascending order of the numeric part of cell_id
    # (the first character of each id is skipped).
    # NOTE(review): only the labels are re-ordered; this assumes the rows of the
    # counts CSV are already in ascending cell-id order -- TODO confirm.
    df_ids['cell_id_num'] = [int(cell_id[1:]) for cell_id in df_ids['cell_id']]
    df_ids = df_ids.sort_values(by=['cell_id_num']).reset_index(drop=True)
    true_label = df_ids[['group_id', 'true_time']]

    # create AnnData object
    adata = sc.AnnData(df_counts, obs=true_label)
    return adata
def toy_disconnected(foldername="./"):
    """Load the Toy_Disconnected data as an AnnData object.

    Missing CSV files are downloaded into ``foldername`` first.
    To access obs (label) as list, use AnnData.obs['group_id'].values.tolist()

    Args:
        foldername (string): path to the directory where the dataset is stored.
            Defaults to the current directory ``'./'``.

    Returns:
        AnnData object with ``obs`` columns 'group_id' and 'true_time'

    .. image:: https://github.com/ShobiStassen/VIA/blob/master/Figures/stream_plot_toy4.png?raw=true
       :width: 200px
    """
    # os.path.join keeps the paths correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "toy_disconnected_M9_n1000d1000.csv")
    ids_path = os.path.join(foldername, "toy_disconnected_M9_n1000d1000_ids_with_truetime.csv")

    # Download whichever of the two files is not on disk yet.
    if not os.path.isfile(data_path):
        data_url = "https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/toy_disconnected_M9_n1000d1000.csv"
        wget.download(data_url, data_path)
    if not os.path.isfile(ids_path):
        ids_url = "https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/toy_disconnected_M9_n1000d1000_ids_with_truetime.csv"
        wget.download(ids_url, ids_path)

    # read files as pandas objects
    df_counts = pd.read_csv(data_path).drop('Unnamed: 0', axis=1)
    df_ids = pd.read_csv(ids_path)

    # Rearrange df_ids in ascending order of the numeric part of cell_id
    # (the first character of each id is skipped).
    # NOTE(review): only the labels are re-ordered; this assumes the rows of the
    # counts CSV are already in ascending cell-id order -- TODO confirm.
    df_ids['cell_id_num'] = [int(cell_id[1:]) for cell_id in df_ids['cell_id']]
    df_ids = df_ids.sort_values(by=['cell_id_num']).reset_index(drop=True)
    true_label = df_ids[['group_id', 'true_time']]

    # create AnnData object
    adata = sc.AnnData(df_counts, obs=true_label)
    return adata
def cell_cycle_cyto_data(foldername="./"):
    """Load the cell cycle imaging-based flow-cyto features as an AnnData object.

    AnnData object with n_obs x n_vars = 2036 x 38
        obs: 'cell_cycle_phase'

    The h5ad file is downloaded into ``foldername`` if it is not already there,
    and a summary of the loaded object is printed.

    Args:
        foldername (string): path to the directory where the dataset is stored.
            Defaults to the current directory ``'./'``.

    Returns:
        AnnData object
    """
    # os.path.join keeps the path correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "cell_cycle_cyto.h5ad")
    if not os.path.isfile(data_path):
        data_url = "https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/cell_cycle_cyto.h5ad"
        wget.download(data_url, data_path)

    adata = sc.read_h5ad(filename=data_path)
    print(adata)
    return adata
def scRNA_hematopoiesis(foldername="./"):
    """Load the scRNA-seq Hematopoiesis data as an AnnData object.

    Missing files are downloaded into ``foldername`` first. The per-cell
    annotations read from the labels CSV (column 'x') are mapped to short
    abbreviations and stored in ``obs['label']``; the number of cells per
    abbreviation is printed.

    Args:
        foldername (string): Directory of dataset. Defaults to the current
            directory ``'./'``.

    Returns:
        AnnData object

    Raises:
        KeyError: if the labels CSV contains an annotation that has no entry
            in the abbreviation table.

    .. image:: https://github.com/ShobiStassen/VIA/blob/master/Figures/humancd34_streamplot.png?raw=true
       :width: 200px
    """
    from collections import Counter

    # os.path.join keeps the paths correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "human_cd34_bm_rep1.h5ad")
    ids_path = os.path.join(foldername, "Nover_Cor_PredFine_notLogNorm.csv")
    if not os.path.isfile(data_path):
        data_url = "https://docs.google.com/uc?id=1ZSZbMeTQQPfPBGcnfUNDNL4om98UiNcO"
        gdown.download(data_url, data_path, quiet=False)
    if not os.path.isfile(ids_path):
        ids_url = "https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/Nover_Cor_PredFine_notLogNorm.csv"
        wget.download(ids_url, ids_path)

    ad = sc.read(data_path)
    raw_labels = pd.read_csv(ids_path)['x'].values.tolist()

    # Full annotation name -> abbreviation used as the cell label.
    # NOTE: Myeloid DCs are now called Conventional Dendritic Cells cDCs
    dict_abb = {'Basophils': 'BASO1', 'CD4+ Effector Memory': 'TCEL7', 'Colony Forming Unit-Granulocytes': 'GRAN1',
                'Colony Forming Unit-Megakaryocytic': 'MEGA1', 'Colony Forming Unit-Monocytes': 'MONO1',
                'Common myeloid progenitors': "CMP", 'Early B cells': "PRE_B2", 'Eosinophils': "EOS2",
                'Erythroid_CD34- CD71+ GlyA-': "ERY2", 'Erythroid_CD34- CD71+ GlyA+': "ERY3",
                'Erythroid_CD34+ CD71+ GlyA-': "ERY1", 'Erythroid_CD34- CD71lo GlyA+': 'ERY4',
                'Granulocyte/monocyte progenitors': "GMP", 'Hematopoietic stem cells_CD133+ CD34dim': "HSC1",
                'Hematopoietic stem cells_CD38- CD34+': "HSC2",
                'Mature B cells class able to switch': "B_a2", 'Mature B cells class switched': "B_a4",
                'Mature NK cells_CD56- CD16- CD3-': "Nka3", 'Monocytes': "MONO2",
                'Megakaryocyte/erythroid progenitors': "MEP", 'Myeloid Dendritic Cells': 'mDC (cDC)',
                'Naïve B cells': "B_a1",
                'Plasmacytoid Dendritic Cells': "pDC", 'Pro B cells': 'PRE_B3'}
    nover_labels = [dict_abb[name] for name in raw_labels]

    # Single pass tally instead of calling list.count once per distinct label.
    for cell_type, n_cells in Counter(nover_labels).items():
        print('Cell type', cell_type, 'has ', n_cells, 'cells')

    # Attach the abbreviated labels to the loaded object.
    ad.obs['label'] = nover_labels
    return ad
def scATAC_hematopoiesis(foldername="./"):
    """Load the scATAC-seq Hematopoiesis data as an AnnData object.

    The CSV is downloaded into ``foldername`` if it is not already there. The
    returned object holds columns 'PC1'..'PC5' of the CSV as its data matrix,
    and ``obs['cell_type']`` holds a label derived from each row's 'cellname':
    the first known cell type whose name occurs in it, or 'unknown' if none do.

    Args:
        foldername (string): Directory of dataset. Defaults to the current
            directory ``'./'``.

    Returns:
        AnnData object
    """
    # os.path.join keeps the path correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "scATAC_hemato_Buenrostro.csv")
    if not os.path.isfile(data_path):
        data_url = "https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/scATAC_hemato_Buenrostro.csv"
        wget.download(data_url, data_path)

    # read file as pandas object
    df = pd.read_csv(data_path)
    print('number cells', df.shape[0])

    cell_types = ['GMP', 'HSC', 'MEP', 'CLP', 'CMP', 'LMuPP', 'MPP', 'pDC', 'mono', 'UNK']
    # Exactly one label per cell: take the first matching type. Appending once
    # per match would make the label list longer than the number of rows
    # whenever a cell name contains more than one of the type names.
    true_label = [
        next((cell_type for cell_type in cell_types if cell_type in annot), 'unknown')
        for annot in df['cellname'].values
    ]

    pc_columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']
    X_in = df[pc_columns].values

    # create AnnData object
    adata = sc.AnnData(X_in)
    adata.obs['cell_type'] = true_label
    return adata
def cell_cycle(foldername="./"):
    """Load the cell cycle data as an AnnData object.

    Missing CSV files are downloaded into ``foldername`` first. The feature
    table becomes the data matrix and the 'phase' column of the phases CSV is
    stored in ``obs['phase']``.

    Args:
        foldername (string): Directory of dataset. Defaults to the current
            directory ``'./'``.

    Returns:
        AnnData object

    .. image:: https://github.com/ShobiStassen/VIA/blob/master/Figures/mb231_overall_300dpi.png?raw=true
       :width: 200px
    """
    # os.path.join keeps the paths correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "mcf7_38features.csv")
    ids_path = os.path.join(foldername, "mcf7_phases.csv")
    if not os.path.isfile(data_path):
        data_url = "https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/mcf7_38features.csv"
        wget.download(data_url, data_path)
    if not os.path.isfile(ids_path):
        ids_url = "https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/mcf7_phases.csv"
        wget.download(ids_url, ids_path)

    # read files as pandas objects
    df = pd.read_csv(data_path)
    # axis must be passed by keyword: pandas >= 2.0 rejects it as a positional argument.
    df = df.drop('Unnamed: 0', axis=1)
    true_label = list(pd.read_csv(ids_path)['phase'].values.flatten())
    print('There are ', len(true_label), 'MCF7 cells and ', df.shape[1], 'features')

    adata = sc.AnnData(df)
    adata.obs["phase"] = true_label
    adata.var_names = df.columns
    return adata
def embryoid_body(foldername="./"):
    """Load the embryoid body data as an AnnData object.

    Missing files are downloaded into ``foldername`` first. The 'data' entry of
    the .mat file becomes the data matrix, 'EBgenes_name' provides the variable
    names, and 'cells' is stored in ``obs['time']`` as 'Day <value>' strings.

    The PHATE embedding CSV is downloaded next to the data so it is available
    locally, but it is not attached to the returned object.

    Args:
        foldername (string): Directory to save dataset. Defaults to the current
            directory ``'./'``.

    Returns:
        AnnData object
    """
    # os.path.join keeps the paths correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "EBdata.mat")
    emb_path = os.path.join(foldername, "EB_phate_embedding.csv")
    if not os.path.isfile(data_path):
        data_url = "https://docs.google.com/uc?id=1yz3zR1KAmghjYB_nLLUZoIlKN9Ew4RHf"
        gdown.download(data_url, data_path, quiet=False)
    if not os.path.isfile(emb_path):
        # Phate embedding (can also use Umap/tsne embedding if desired)
        emb_url = "https://raw.githubusercontent.com/ShobiStassen/VIA/master/Datasets/EB_phate_embedding.csv"
        wget.download(emb_url, emb_path)

    annots = loadmat(data_path)
    # has been filtered but not yet normed (by library size) nor other subsequent pre-processing steps
    data = annots['data'].toarray()
    time_labels = ['Day ' + str(day) for day in annots['cells'].flatten().tolist()]

    # construct AnnData object
    adata = sc.AnnData(data)
    # Each entry of 'EBgenes_name' is nested; the name itself sits at [0][0].
    adata.var_names = [entry[0][0] for entry in annots['EBgenes_name']]
    adata.obs['time'] = time_labels
    return adata
def moffitt_preoptic(foldername="./"):
    """Load the preoptic hypothalamus mouse data from Moffitt et al. as an AnnData object.

    The h5ad file is read from ``foldername``; the figshare URL is handed to
    scanpy as the backup download location.

    Args:
        foldername (string): path to the directory where the dataset is stored.
            Defaults to the current directory ``'./'``.

    Returns:
        AnnData object

    .. image:: https://github.com/ShobiStassen/VIA/blob/master/Figures/Bregma29_tissue.png?raw=true
       :width: 200px
    """
    # os.path.join keeps the path correct whether or not foldername ends with a separator.
    data_path = os.path.join(foldername, "anndata_moffit.h5ad")
    # Same file as the copy in the GitHub repository; the GitHub URL does not work for h5ad.
    data_url = "https://ndownloader.figshare.com/files/28169379"
    adata = sc.read(filename=data_path, backup_url=data_url)
    return adata
def zesta(foldername="./"):
    """Load the 'zesta' dataset as an AnnData object.

    The h5ad file is read from ``foldername``; the figshare URL is handed to
    scanpy as the backup download location.

    Args:
        foldername (string): path to the directory where the dataset is stored.
            Defaults to the current directory ``'./'``.

    Returns:
        AnnData object
    """
    # os.path.join keeps the path correct whether or not foldername ends with a separator.
    # NOTE(review): this is the same file name that moffitt_preoptic uses, so the
    # two loaders share one file inside a folder -- looks like a copy-paste
    # leftover; confirm the intended file name for this dataset.
    data_path = os.path.join(foldername, "anndata_moffit.h5ad")
    # NOTE(review): this looks like a figshare share-page link rather than a
    # direct file link -- confirm it is usable as a download URL.
    data_url = 'https://figshare.com/s/191076ef460ac933071e'
    adata = sc.read(filename=data_path, backup_url=data_url)
    return adata