Source code for spider.svi

import numpy as np
from scipy import optimize
import pandas as pd
import anndata
import os
from os.path import exists
import squidpy as sq
import warnings
warnings.filterwarnings('ignore')
import time
from importlib import resources
from scipy import sparse
from joblib import Parallel, delayed
from tqdm import tqdm


[docs]def tf_corr(idata, adata, is_human, out_f, threshold=0.3, n_jobs=20, overwrite=False, step=None, pathways=[], keep_top_score=None, smooth=False): """ Compute transcription factor correlations with ligand-receptor interactions. This function analyzes the given interface data and evaluates the correlations between each interaction profile and their target genes, saving the results to specified output files. Parameters ---------- idata : AnnData Input AnnData object containing interface data. adata : AnnData Input AnnData object containing count data. is_human : bool Indicator for whether the data pertains to human genes. out_f : str Output directory path where results will be saved. threshold : float, optional Minimum correlation coefficient threshold for calling TF support (default is 0.3). n_jobs : int, optional Number of parallel jobs to run during processing (default is 20). overwrite : bool, optional If True, existing output files will be overwritten (default is False). step : float, optional The step size for adjusting the threshold during the iterative process (default is None). pathways : list, optional A list of pathways to analyze; if empty, pathways will be loaded based on the input data. keep_top_score : int, optional The number of top scoring interactions to retain (default is None). smooth : bool, optional If True, applies smoothing to the expression data (default is False). Returns ------- None The function saves results directly to the specified output files and updates the `idata` object with correlation and score data. Notes ----- - The function checks if the output directory exists and creates it if it does not. - If smoothing is enabled, it applies a kernel smoothing method based on spatial data. - Results are saved in sparse matrix format and CSV files for easy access and further analysis. Examples -------- >>> tf_corr(idata, adata, adata.uns['is_human'], 'output_directory/', threshold=0.3, n_jobs=30, overwrite=True, step=None, keep_top_score=5) """ lr_df = idata.var[['ligand', 'receptor']] if len(pathways) == 0: pathways = subset_pathway_df(load_pathway_df(is_human), lr_df, adata) else: pathways = subset_pathway_df(pathways, lr_df, adata) print(pathways.head()) unique_genes = np.unique(pathways[['src', 'dest']]) value_to_index = {value: idx for idx, value in enumerate(unique_genes)} pathways['src_idx'] = pathways['src'].map(value_to_index) pathways['dest_idx'] = pathways['dest'].map(value_to_index) lr_df['receptor_idx'] = lr_df['receptor'].map(value_to_index) lr_df = lr_df.replace(np.nan, -1) lr_df['receptor_idx'] = lr_df['receptor_idx'].astype(int) hop1, sub_hop1, sub_hop2, sub_hop3, with_sv_tf_r, \ sv_target_genes, sv_target_genes_index = get_ref_hops(adata, pathways, unique_genes, lr_df, value_to_index) idata.var['with_sv_tf'] = lr_df['with_sv_tf'] print(f'Testing {idata.var["with_sv_tf"].sum()}/{len(idata.var)} LRIs ({int(100*round(idata.var["with_sv_tf"].sum()/len(idata.var), 2))}%) with SV target genes.') if (overwrite) | (not exists( f'{out_f}sparse_tf_corr.npz')): if smooth: neighborA = idata.obs.groupby('A')['B'].apply(list) neighborB = idata.obs.groupby('B')['A'].apply(list) neighbor = pd.concat([neighborA, neighborB], axis=0).reset_index(drop=False) neighbor.columns = ['A', 'B'] neighbor = pd.Series(neighbor.groupby('A')['B'].sum()) adata_exp = pd.DataFrame(idata.uns['tf_count'], columns=idata.uns['tf_header'], index=adata.obs_names) adata_exp_copy = adata_exp.copy() for cell in adata_exp.index: if cell in neighbor.index: related_spots = neighbor.loc[cell] adata_exp_copy.loc[cell] = (adata_exp.loc[cell] + algebraic_mean(related_spots, adata_exp)) * 0.5 adata_exp = adata_exp_copy.to_numpy() smth_eta = 1 smth_nu = 1 smth_kernel = 'exp' from scipy.spatial import distance_matrix kernel_D = distance_matrix(adata.obsm["spatial"], adata.obsm["spatial"]) for i in range(adata_exp.shape[1]): nzind = np.where(adata_exp[:,i] > 0)[0] phi = kernel_function(kernel_D[nzind,:][:,nzind], smth_eta, smth_nu, smth_kernel, normalization='unit_col_sum') adata_exp[nzind,i] = np.matmul( phi, adata_exp[nzind,i].reshape(-1,1) )[:,0] adata_exp = pd.DataFrame(adata_exp, columns=idata.uns['tf_header'], index=adata.obs_names) else: adata_exp = pd.DataFrame(idata.uns['tf_count'], columns=idata.uns['tf_header'], index=adata.obs_names) pseudo_count = 1e-10 adata_exp[adata_exp < pseudo_count] = pseudo_count gene_name_map = {value: key for key, value in value_to_index.items()} # adata_exp = pd.DataFrame(adata_exp, index=adata.obs_names, columns=adata.var_names) adata_exp = adata_exp / adata_exp.max(axis=0).values major_df = run_cells(adata_exp, unique_genes, sv_target_genes, with_sv_tf_r, hop1, sub_hop1, sub_hop2, sub_hop3, sv_target_genes_index, gene_name_map, n_jobs=n_jobs) results_corrs, results_tf_scores = run_receptors(major_df, idata, n_jobs=n_jobs, keep_top_score=keep_top_score) # save correlaitons result sparse.save_npz(f'{out_f}sparse_tf_corr.npz', sparse.csr_matrix(results_corrs.values)) pd.Series(results_corrs.columns).to_csv(f'{out_f}tf_corr_columns.csv') pd.Series(results_corrs.index).to_csv(f'{out_f}tf_corr_index.csv') # save score result sparse.save_npz(f'{out_f}sparse_tf_scores.npz', sparse.csr_matrix(results_tf_scores.values)) pd.Series(results_tf_scores.columns).to_csv(f'{out_f}tf_scores_columns.csv') pd.Series(results_tf_scores.index).to_csv(f'{out_f}tf_scores_index.csv') sparse_mat = sparse.csr_matrix(sparse.load_npz(f'{out_f}sparse_tf_corr.npz')) columns = pd.read_csv(f'{out_f}tf_corr_columns.csv', index_col=0).to_numpy().flatten() index = pd.read_csv(f'{out_f}tf_corr_index.csv', index_col=0).to_numpy().flatten() idata.uns['tf_corr'] = pd.DataFrame(sparse_mat.A, index=index, columns=columns) sparse_mat = sparse.csr_matrix(sparse.load_npz(f'{out_f}sparse_tf_scores.npz')) columns = pd.read_csv(f'{out_f}tf_scores_columns.csv', index_col=0).to_numpy().flatten() index = pd.read_csv(f'{out_f}tf_scores_index.csv', index_col=0).to_numpy().flatten() idata.obsm['tf_score'] = pd.DataFrame(sparse_mat.A, index=index, columns=columns) weight = count_tf(idata, threshold=threshold) if step is not None: while weight > 0.5 or threshold >= 0.1: threshold = round(threshold - step, 2) weight = count_tf(idata, threshold=threshold)
def count_tf(idata, threshold=0.3): tf_count = (idata.uns['tf_corr'] > threshold).sum(axis=1).sort_values(ascending=False) print(f'{sum(tf_count > 0)}/{idata.var["with_sv_tf"].sum()} LRIs ({int(100*round(sum(tf_count > 0)/idata.var["with_sv_tf"].sum(), 2))}%) are predicted to be activated by downstream genes (threshold={threshold}).') idata.var['tf_support_count'] = -1 idata.var.loc[tf_count.index, 'tf_support_count'] = tf_count print(f'Excluded {sum(tf_count == 0)}/{idata.shape[1]} LRIs ({int(100*round(sum(tf_count == 0)/idata.shape[1], 2))}%) without SV support (threshold={threshold}).') return 1-sum(tf_count > 0)/idata.var["with_sv_tf"].sum() def algebraic_mean(related_samples, df, alpha=0.3): # alpha is the portion for max values = df.loc[related_samples] mean = values.mean() * (1-alpha) + values.max() * alpha return mean def kernel_function(x, eta, nu, kernel, normalization=None): if kernel == "exp": phi = np.exp(-np.power(x/eta, nu)) elif kernel == "lorentz": phi = 1 / ( 1 + np.power(x/eta, nu) ) else: phi = x if normalization == "unit_row_sum": phi = (phi.T / np.sum(phi.T, axis=0)).T elif normalization == "unit_col_sum": phi = phi / np.sum(phi, axis=0) return phi def load_pathway_df(is_human): from importlib import resources with resources.path("spider.lrdb", "pathways.tsv") as pw_fn: pw_list = pd.read_csv(pw_fn, sep='\t', index_col=0) if is_human: print('Using human pathway dataset.') pw_list = pw_list[pw_list.species=='Human'] else: print('Using mouse pathway dataset.') pw_list = pw_list[pw_list.species=='Mouse'] return pw_list def subset_pathway_df(pathways, lr_df, adata): pathways = pathways[['src', 'dest']] pathways.index = pathways['src'] + '_' + pathways['dest'] # remove pathways that are in the lrdb by index as we only want the downstream targets pathways = pathways[~pathways.index.isin(lr_df.index)] pathways = pathways[pathways['src'].isin(adata.var_names) | pathways['dest'].isin(adata.var_names)] # to tolerate missing points in pathwaay pathways = pathways.drop_duplicates() return pathways def get_ref_hops(adata, pathways, unique_genes, lr_df, value_to_index): from scipy import sparse hop1 = sparse.csr_matrix((np.ones(len(pathways)), (pathways['src_idx'], pathways['dest_idx'])), shape=(len(unique_genes), len(unique_genes))) hop2 = hop1.dot(hop1) hop3 = hop2.dot(hop1) r_index = lr_df['receptor_idx'].unique() r_index = np.array([x for x in r_index if x != -1]) svg = adata.var[adata.var['label'] != -1].index.to_numpy() sv_target_genes = np.intersect1d(svg, unique_genes) sv_target_genes_index = np.array([value_to_index[gene] for gene in sv_target_genes]) print(f'Found {len(sv_target_genes)} SV pathway nodes.') # sub_hop: receptor x sv target genes sub_hop1 = hop1[r_index, :][:, sv_target_genes_index].A sub_hop2 = hop2[r_index, :][:, sv_target_genes_index].A sub_hop3 = hop3[r_index, :][:, sv_target_genes_index].A sum_path_all_htop = (sub_hop1 + sub_hop2 + sub_hop3).sum(axis=1) with_sv_tf_r = r_index[sum_path_all_htop != 0] print(f'Found {len(with_sv_tf_r)}/{len(sum_path_all_htop)} receptors ({int(100*round(len(with_sv_tf_r)/len(sum_path_all_htop), 2))}%) with SV target genes.') sub_hop1 = hop1[with_sv_tf_r, :][:, sv_target_genes_index].A sub_hop2 = hop2[with_sv_tf_r, :][:, sv_target_genes_index].A sub_hop3 = hop3[with_sv_tf_r, :][:, sv_target_genes_index].A lr_df['with_sv_tf'] = 0 lr_df.loc[lr_df['receptor_idx'].isin(with_sv_tf_r), 'with_sv_tf'] = 1 return hop1, sub_hop1, sub_hop2, sub_hop3, with_sv_tf_r, sv_target_genes, sv_target_genes_index def process_cell(cell, unique_genes, norm_exp, sv_target_genes, with_sv_tf_r, hop1, sub_hop1, sub_hop2, sub_hop3, sv_target_genes_index, gene_name_map, pseudo_count=1e-10): # build full exp for cell exp = np.zeros(len(unique_genes)) for i, gene in enumerate(unique_genes): if gene in norm_exp.columns: exp[i] = norm_exp.loc[cell, gene] exp[exp < pseudo_count] = pseudo_count # set pseudo count to exp weighted_hop1 = hop1.multiply(exp).tocsr() weighted_hop2 = (weighted_hop1.dot(hop1)).multiply(exp).tocsr() weighted_hop3 = (weighted_hop2.dot(hop1)).multiply(exp).tocsr() weighted_sub_hop1 = weighted_hop1[with_sv_tf_r, :][:, sv_target_genes_index].A / sub_hop1 weighted_sub_hop1[~np.isfinite(weighted_sub_hop1)] = 0 weighted_sub_hop2 = weighted_hop2[with_sv_tf_r, :][:, sv_target_genes_index].A / sub_hop2 weighted_sub_hop2[~np.isfinite(weighted_sub_hop2)] = 0 weighted_sub_hop2 = np.power(weighted_sub_hop2, 1/2) weighted_sub_hop3 = weighted_hop3[with_sv_tf_r, :][:, sv_target_genes_index].A / sub_hop3 weighted_sub_hop3[~np.isfinite(weighted_sub_hop3)] = 0 weighted_sub_hop3 = np.power(weighted_sub_hop3, 1/3) weighted_sum_path_all_htop = np.nansum([weighted_sub_hop1, weighted_sub_hop2, weighted_sub_hop3], axis=0) count_df = pd.DataFrame(weighted_sum_path_all_htop, columns=sv_target_genes) count_df['receptor'] = [gene_name_map[x] for x in with_sv_tf_r] count_df['cell'] = cell return count_df def run_cells(norm_exp, unique_genes, sv_target_genes, with_sv_tf_r, hop1, sub_hop1, sub_hop2, sub_hop3, sv_target_genes_index, gene_name_map, n_jobs=-1): results = Parallel(n_jobs=n_jobs)( delayed(process_cell)(cell, unique_genes, norm_exp, sv_target_genes, with_sv_tf_r, hop1, sub_hop1, sub_hop2, sub_hop3, sv_target_genes_index, gene_name_map)\ for cell in tqdm(norm_exp.index, desc="Processing cells") ) major_df = pd.concat(results) print('0227:1525') return major_df def process_receptor(r, major_df, idata, keep_top_score=10): receptor_df = major_df[major_df.receptor == r].set_index('receptor').reset_index(drop=True).set_index('cell') receptor_df = receptor_df.loc[:, receptor_df.sum(axis=0) > 0] lrs = idata.var_names[idata.var_names.str.split('_').str[1] == r].to_numpy().tolist() corrs = [] sv_tf_scores = [] for lr in lrs: j = np.argwhere(idata.var_names == lr)[0][0] receivers = [] for i in range(len(idata)): if idata.layers['direction'][i,j] == 1: receivers.append(idata.obs.iloc[i]['B']) else: receivers.append(idata.obs.iloc[i]['A']) sv_tf_score = receptor_df.loc[receivers].reset_index(drop=True) # drop the columns that are all zeros sv_tf_score = sv_tf_score[sv_tf_score.columns[sv_tf_score.sum()!=0]] sv_tf_score.index = idata.obs_names sv_tf_score[lr] = idata.to_df()[lr] if sv_tf_score[sv_tf_score.columns[:-1]].max().max() == 0: continue corr = sv_tf_score.corr(method='pearson')[lr][:-1] # the last row if the lr # only keep the positive correlations corr = corr[corr>0].sort_values(ascending=False) corrs.append(corr) sv_tf_score = sv_tf_score.drop(lr, axis=1)[corr.index[:keep_top_score]] sv_tf_score.columns = [f'{lr}-{c}' for c in sv_tf_score.columns] sv_tf_scores.append(sv_tf_score) corrs = pd.concat(corrs, axis=1) sv_tf_scores = pd.concat(sv_tf_scores, axis=1) return [corrs, sv_tf_scores] def run_receptors(major_df, idata, n_jobs=-1, keep_top_score=10): results = Parallel(n_jobs=n_jobs)( delayed(process_receptor)(r, major_df, idata, keep_top_score) for r in tqdm(major_df.receptor.unique(), desc="Processing receptors") ) corrs = [result[0] for result in results] sv_tf_scores = [result[1] for result in results] results_corrs = pd.concat(corrs, axis=1).T.astype(np.float16).replace(np.nan, 0) results_tf_scores = pd.concat(sv_tf_scores, axis=1).replace(1e-10, 0) return results_corrs, results_tf_scores # V1 code def abstract(idata, n_neighbors=10, alpha=0.3): # Method: Building abstract interfaces with self-organizing map from somde import SomNode df = idata.to_df().T corinfo = idata.obs corinfo["total_count"]=df.sum(0) X=corinfo[['row','col']].values.astype(np.float32) som = SomNode(X,n_neighbors) ndf,ninfo = som.mtx(df, alpha=alpha) meta_idata = anndata.AnnData(ndf.T) meta_idata.obs[['row', 'col', 'total_count']] = ninfo.to_numpy(dtype=float) meta_idata.obsm['spatial'] = meta_idata.obs[['row', 'col']].to_numpy() idata = som_mapping(som, idata, df) return som, idata, meta_idata def som_mapping(som, idata, df): bsmc = som.som.bmus soml = [] for i in np.arange(bsmc.shape[0]): u,v = bsmc[i] soml.append(v*som.somn+u) idata.obs['som_node'] = -1 ids = np.sort(np.unique(np.array(soml))) count = 0 for i in ids: idata.obs.loc[df.loc[:,np.array(soml)==i].columns,'som_node'] = count count += 1 return idata def meta_pattern_to_idata(idata, meta_idata): idata.obsm['pattern_score'] = meta_idata.obsm['pattern_score'][idata.obs['som_node'].to_numpy()] idata.var = meta_idata.var for i in np.array(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'])[np.isin(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'],list(meta_idata.uns.keys()))]: idata.uns[i] = meta_idata.uns[i] try: idata.uns[i+"_time"] = meta_idata.uns[i+"_time"] except: pass print(f'Added key pattern_score in idata.obsm and method results and running time in uns') def tf_pattern_to_idata(idata_raw, idata): idata_raw.uns['tf_corr'] = idata.uns['tf_corr'] idata_raw.obsm['pattern_score'] = idata.obsm['pattern_score'] idata_raw.var['is_svi'] = 0 idata_raw.var.loc[idata.var_names, 'is_svi'] = idata.var['is_svi'] idata_raw.var['label'] = -1 idata_raw.var.loc[idata.var_names, 'label'] = idata.var['label'] corr_cols = [f'pattern_correlation_{x}' for x in range(idata_raw.obsm['pattern_score'].shape[1])] idata_raw.var[corr_cols] = 0 idata_raw.var.loc[idata_raw[:,idata_raw.var['is_svi']==1].var_names,corr_cols] = idata.var[corr_cols] for i in np.array(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'])[np.isin(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'],list(idata.uns.keys()))]: idata_raw.uns[i] = idata.uns[i] try: idata_raw.uns[i+"_time"] = idata.uns[i+"_time"] except: pass def find_svi(idata, out_f, overwrite, R_path, som=None, n_jobs=10, skip_metric=False): # Method: Identifying spatially variable LR interactions # Gaussian models svi_nnSVG(idata,out_f,R_path,overwrite, n_jobs=n_jobs) svi_SOMDE(idata,out_f,overwrite, som=som, n_jobs=n_jobs) svi_SpatialDE2_omnibus(idata,out_f,overwrite, n_jobs=n_jobs) # svi_SpatialDE2(idata,out_f,overwrite) svi_SpatialDE(idata,out_f,overwrite) # Non-parametric covariance test svi_SPARKX(idata,out_f,R_path,overwrite, n_jobs=n_jobs) # HMRF svi_scGCO(idata,out_f,overwrite, n_jobs=n_jobs) # baseline auto-correlation metrics if not skip_metric: svi_moran(idata,out_f,overwrite, n_jobs=n_jobs) svi_geary(idata,out_f,overwrite, n_jobs=n_jobs) def svi_moran(idata, work_dir, overwrite=False, n_jobs=10): try: t0=time.time() n_perms=1000 if (overwrite) | (not exists( f'{work_dir}moranI.csv')): sq.gr.spatial_neighbors(idata, key_added='spatial') sq.gr.spatial_autocorr( idata, mode="moran", n_perms=n_perms, n_jobs=n_jobs, ) idata.uns['moranI_time'] = time.time()-t0 idata.uns['moranI'].to_csv(f'{work_dir}moranI.csv') result = pd.read_csv(f'{work_dir}moranI.csv', index_col=0) idata.uns['moranI'] = result print(f'Added key moranI in idata.uns') except: pass def svi_geary(idata, work_dir,overwrite=False, n_jobs=10): try: t0=time.time() n_perms=1000 if (overwrite) | (not exists( f'{work_dir}gearyC.csv')): sq.gr.spatial_neighbors(idata, key_added='spatial') sq.gr.spatial_autocorr( idata, mode="geary", n_perms=n_perms, n_jobs=n_jobs, ) idata.uns['gearyC_time'] = time.time()-t0 idata.uns['gearyC'].to_csv(f'{work_dir}gearyC.csv') result = pd.read_csv(f'{work_dir}gearyC.csv', index_col=0) idata.uns['gearyC'] = result print(f'Added key gearyC in idata.uns') except: pass def svi_nnSVG(idata, work_dir, R_path, overwrite=False, n_jobs=10): # overwrite = True try: count_f = f'{work_dir}idata_count.csv' meta_f = f'{work_dir}idata_meta.csv' if (overwrite) | ((not exists(count_f)) & (not exists(meta_f))): idata.to_df().to_csv(count_f) idata.obs[['row', 'col']].to_csv(meta_f) if (overwrite) | (not exists( f'{work_dir}nnSVG.csv')): t0=time.time() with resources.path("spider.R_script", "run_nnSVG.R") as pw_fn: os.system(str(f'/bin/bash -c "{R_path} -f {pw_fn} {count_f} {meta_f} {work_dir} {n_jobs}"')) idata.uns['nnSVG_time'] = time.time()-t0 result = pd.read_csv(f'{work_dir}nnSVG.csv', index_col=0) idata.uns['nnSVG'] = result print(f'Added key nnSVG in idata.uns') except Exception as e: print(e) def scGCO_sv(locs, data_norm, cellGraph, gmmDict, smooth_factor=5, unary_scale_factor=100, label_cost=10, algorithm='expansion'): from itertools import repeat from functools import reduce import operator import statsmodels.stats.multitest as multi import scGCO results = [scGCO.compute_single_fixed_sf(locs, data_norm, cellGraph, gmmDict, w=None, n=None, smooth_factor=smooth_factor, unary_scale_factor=unary_scale_factor, label_cost=label_cost, algorithm=algorithm)] nnn = [results[i][0] for i in np.arange(len(results))] nodes = reduce(operator.add, nnn) ppp = [results[i][1] for i in np.arange(len(results))] p_values=reduce(operator.add, ppp) ggg = [results[i][2] for i in np.arange(len(results))] genes = reduce(operator.add, ggg) fff = [results[i][3] for i in np.arange(len(results))] s_factors = reduce(operator.add, fff) lll = [results[i][4] for i in np.arange(len(results))] pred_labels = reduce(operator.add, lll) mmm = [results[i][5] for i in np.arange(len(results))] model_labels = reduce(operator.add, mmm) best_p_values=[min(i) for i in p_values] fdr = multi.multipletests(np.array(best_p_values), method='fdr_bh')[1] labels_array = np.array(pred_labels).reshape(len(genes), pred_labels[0].shape[0]) data_array = np.array((genes, p_values, fdr,s_factors, nodes, model_labels), dtype=object).T t_array = np.hstack((data_array, labels_array)) c_labels = ['p_value', 'fdr', 'smooth_factor', 'nodes','model_labels'] for i in np.arange(labels_array.shape[1]) + 1: temp_label = 'label_cell_' + str(i) c_labels.append(temp_label) result_df = pd.DataFrame(t_array[:,1:], index=t_array[:,0], columns=c_labels) return result_df def scGCO_log1p(data): ''' log transform normalized count data :param file: data (m, n); :rtype: data (m, n); ''' from scipy.sparse import issparse if not issparse(data): return np.log1p(data) else: return data.log1p() def scGCO_normalize_count_cellranger(data,Log=True, norm=True): ''' normalize count as in cellranger :param file: data: A dataframe of shape (m, n); :rtype: data shape (m, n); ''' normalizing_factor = np.sum(data, axis = 1)/np.median(np.sum(data, axis = 1)) # change to to .to_numpy() since some np/pd versions report "ValueError: Multi-dimensional indexing (e.g. `obj[:, None]`) is no longer supported. Convert to a numpy array before indexing instead" data = pd.DataFrame(data.values/normalizing_factor.to_numpy()[:,np.newaxis], columns=data.columns, index=data.index) # data = pd.DataFrame(data.values/normalizing_factor[:,np.newaxis], columns=data.columns, index=data.index) if Log==True: data=scGCO_log1p(data) else: data=data return data def svi_scGCO(idata, work_dir, overwrite=False, n_jobs=10): try: import scGCO if (overwrite) | (not exists( f'{work_dir}scGCO.csv')): t0=time.time() result_dfs = [] for smooth_factor in [1, 5, 10, 20]: for unary_scale_factor in [50, 100]: if (overwrite) | (not exists(f'{work_dir}scGCO_{smooth_factor}_{unary_scale_factor}.csv')): data_norm = idata.to_df() data_norm = scGCO_log1p(data_norm) exp = data_norm.mean(axis=1) pos_key=['row', 'col'] locs = idata.obs[pos_key].to_numpy() cellGraph= scGCO.create_graph_with_weight(locs, exp) gmmDict= scGCO.gmm_model(data_norm) result_df= scGCO_sv(locs, data_norm, cellGraph, gmmDict, smooth_factor = smooth_factor, unary_scale_factor=unary_scale_factor) result_df.to_csv(f'{work_dir}scGCO_{smooth_factor}_{unary_scale_factor}.csv') result_dfs.append(result_df) else: result_df = pd.read_csv(f'{work_dir}scGCO_{smooth_factor}_{unary_scale_factor}.csv', index_col=0) result_dfs.append(result_df) result_dfs = pd.concat(result_dfs) scGCO.write_result_to_csv(result_dfs, f'{work_dir}scGCO.csv') idata.uns['scGCO_time'] = time.time()-t0 result = pd.read_csv(f'{work_dir}scGCO.csv').groupby('Unnamed: 0')['fdr'].min().reset_index() idata.uns['scGCO'] = result print(f'Added key scGCO in idata.uns') except: pass def svi_SPARKX(idata, work_dir, R_path, overwrite=False, n_jobs=10): try: count_f = f'{work_dir}idata_count.csv' meta_f = f'{work_dir}idata_meta.csv' if (overwrite) | ((not exists(count_f)) & (not exists(meta_f))): idata.to_df().to_csv(count_f) idata.obs[['row', 'col']].to_csv(meta_f) if (overwrite) | (not exists( f'{work_dir}SPARKX.csv')): t0=time.time() with resources.path("spider.R_script", "run_SPARKX.R") as pw_fn: pw_fn = '/home/lishiying/data6/SPIDER-paper/spider_st/R_script/run_SPARKX.R' os.system(str(f'/bin/bash -c "{R_path} -f {pw_fn} {count_f} {meta_f} {work_dir} {n_jobs}"')) idata.uns['SPARKX_time'] = time.time()-t0 result = pd.read_csv(f'{work_dir}SPARKX.csv', index_col=0) idata.uns['SPARKX'] = result print(f'Added key SPARKX in idata.uns') except: pass def svi_SpatialDE2(idata, work_dir, overwrite=False, n_jobs=10): try: if (overwrite) | (not exists(f'{work_dir}SpatialDE.csv')): from spider import SpatialDE2 t0=time.time() np.random.seed(20230617) svg_full, individual = SpatialDE2.test(idata, omnibus=False) svg_full = pd.concat([svg_full.set_index('gene'), individual.loc[individual.groupby('gene').lengthscale.idxmin()].set_index('gene')], axis=1) svg_full.to_csv(f'{work_dir}SpatialDE.csv') individual.to_csv(f'{work_dir}SpatialDE_individual.csv') idata.uns['SpatialDE_time'] = time.time()-t0 result = pd.read_csv(f'{work_dir}SpatialDE.csv', index_col=0) idata.uns['SpatialDE'] = result print(f'Added key SpatialDE in idata.uns') except: pass def svi_SpatialDE2_omnibus(idata, work_dir, overwrite=False, n_jobs=10): try: if (overwrite) | (not exists(f'{work_dir}SpatialDE2_omnibus.csv')): from spider import SpatialDE2 t0=time.time() np.random.seed(20230617) svg_full, _ = SpatialDE2.test(idata, omnibus=True) svg_full = svg_full.set_index('gene') svg_full.to_csv(f'{work_dir}SpatialDE2_omnibus.csv') idata.uns['SpatialDE2_time'] = time.time()-t0 result = pd.read_csv(f'{work_dir}SpatialDE2_omnibus.csv', index_col=0) idata.uns['SpatialDE2'] = result print(f'Added key SpatialDE2 in idata.uns') except Exception as e: print(e) pass def svi_SpatialDE(idata, work_dir, overwrite=False, normalized=True): try: if (overwrite) | (not exists(f'{work_dir}SpatialDE.csv')): import NaiveDE import SpatialDE t0=time.time() counts = idata.to_df() sample_info = idata.obs[['row', 'col']] sample_info.columns = ['x', 'y'] sample_info['total_counts'] = counts.sum(axis=1) if normalized: resid_expr = counts else: counts = counts[sample_info['total_counts'] > 1] sample_info = sample_info[sample_info['total_counts'] >1] norm_expr = NaiveDE.stabilize(counts.T).T resid_expr = NaiveDE.regress_out(sample_info, norm_expr.T, 'np.log(total_counts)').T X = sample_info[['x', 'y']] results = SpatialDE.run(X, resid_expr) idata.uns['SpatialDE_time'] = time.time()-t0 results.to_csv(f'{work_dir}SpatialDE.csv') result = pd.read_csv(f'{work_dir}SpatialDE.csv', index_col=0) idata.uns['SpatialDE'] = result print(f'Added key SpatialDE in idata.uns') except Exception as e: print(e) pass def svi_SOMDE(idata, work_dir, overwrite=False, som=None, n_jobs=10): try: if (overwrite) | (not exists(f'{work_dir}SOMDE.csv')): t0=time.time() if som is None: from somde import SomNode df = idata.to_df().T corinfo = idata.obs corinfo["total_count"]=df.sum(0) X=corinfo[['row','col']].values.astype(np.float32) if len(df) < 1000: som = SomNode(X,4) else: som = SomNode(X,10) ndf,ninfo = som.mtx(df) # nres = som.norm() from scipy import optimize from somde import regress_out phi_hat, _ = optimize.curve_fit(lambda mu, phi: mu + phi * mu ** 2, som.ndf.mean(1), som.ndf.var(1)) dfm = np.log(som.ndf + 1. / (2 * np.abs(phi_hat[0]))) try: som.nres = regress_out(som.ninfo, dfm, 'np.log(total_count)').T except: som.nres = dfm.T result, SVnum =som.run() result.to_csv(f'{work_dir}SOMDE.csv') idata.uns['SOMDE_time'] = time.time()-t0 result = pd.read_csv(f'{work_dir}SOMDE.csv', index_col=0) idata.uns['SOMDE'] = result print(f'Added key SOMDE in idata.uns') except: pass
[docs]def combine_SVI(idata, threshold, svi_number=10): """ Combine SVIs based on both SV candidates and TF support. This function attempts to identify SVIs from the given input data. It first uses strict filtering, then falls back to relaxed filtering or SOMDE results if the detected number of SVIs is below a specified threshold. Parameters ---------- idata : AnnData Input AnnData object containing interaction data. threshold : float Threshold value for filtering SVI candidates based on significance. svi_number : int, optional The mimimum number of SVIs to identify (default is 10). The function will attempt to ensure that at least this many SVIs are returned. Returns ------- svi_df : DataFrame DataFrame containing results of all LRIs. svi_df_strict : DataFrame DataFrame containing strictly filtered SVIs based on the provided threshold and TF support. Notes ----- - The function checks the number of detected SVIs and applies different filtering strategies if the count is less than `svi_number`. - If TF support counts are available, genes without TF support are excluded from the strict SVI results. - Updates the `is_svi` column in `idata.var` to indicate which genes are identified as SVIs. Examples -------- >>> svi_df, svi_df_strict = combine_SVI(idata, threshold=0.05, svi_number=10) """ svi_df, svi_df_strict = combine_SVI_strict(idata,threshold=threshold) if len(svi_df_strict) <= svi_number: print(f'Detected SVI number is less than {svi_number}, falling back to relaxed filtering.') svi_df, svi_df_strict = combine_SVI_Fisher(idata,threshold=threshold) if len(svi_df_strict) <= svi_number: print(f'Detected SVI number is less than {svi_number}, falling back to use SOMDE result only.') svi_df, svi_df_strict = combine_SVI_somde(idata,threshold=threshold) org_len = len(svi_df_strict) if 'tf_support_count' in idata.var.columns: svi_df_strict = svi_df_strict[idata.var['tf_support_count'].loc[svi_df_strict.index] != 0] print(f'Excluding {org_len-len(svi_df_strict)} genes without TF support') print(f'{len(svi_df_strict)}/{len(svi_df)} tf-supported SVIs identified.') else: print('TF corr not previously calculated, falling back to sv pattern only') print(f'{len(svi_df_strict)}/{len(svi_df)} SVIs identified.') idata.var['is_svi'] = 0 idata.var.loc[svi_df_strict.index, 'is_svi'] = 1 return svi_df, svi_df_strict
def combine_SVI_Fisher(idata, threshold=0.05): from scipy.stats import combine_pvalues methods = np.array(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'])[np.isin(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'],list(idata.uns.keys()))] print(f'Using the results from SVI identification methods: {methods}') df = [] for i in methods: if i == 'SOMDE': df.append(idata.uns[i].set_index('g')[['qval']].rename(columns = {'qval': i})) elif i == 'SpatialDE2': df.append(idata.uns[i][['padj']].rename(columns = {'padj': i})) elif i == 'SpatialDE': df.append(idata.uns[i].set_index('g')[['qval']].rename(columns = {'qval': i})) elif i == 'SPARKX': df.append(idata.uns[i][['adjustedPval']].rename(columns = {'adjustedPval': i})) elif i == 'nnSVG': df.append(idata.uns[i][['padj']].rename(columns = {'padj': i})) elif i == 'scGCO': df.append(idata.uns[i].set_index(idata.uns['scGCO'].columns[0])[['fdr']].rename(columns = {'fdr': i})) df = pd.concat(df, axis=1).fillna(1) arr = [combine_pvalues(x, method='fisher')[1] for x in df.to_numpy()] df['adj_pvalue'] = arr df_sub = df[df['adj_pvalue']<threshold] print(f'{len(df_sub)}/{len(df)} SVIs identified (threshold={threshold}).') idata.uns['svi'] = df return df, df_sub def combine_SVI_Stouffer(idata, threshold=0.05): from scipy.stats import combine_pvalues methods = np.array(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'])[np.isin(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'],list(idata.uns.keys()))] print(f'Using the results from SVI identification methods: {methods}') df = [] for i in methods: if i == 'SOMDE': df.append(idata.uns[i].set_index('g')[['qval']].rename(columns = {'qval': i})) elif i == 'SpatialDE2': df.append(idata.uns[i][['padj']].rename(columns = {'padj': i})) elif i == 'SpatialDE': df.append(idata.uns[i].set_index('g')[['qval']].rename(columns = {'qval': i})) elif i == 'SPARKX': df.append(idata.uns[i][['adjustedPval']].rename(columns = {'adjustedPval': i})) elif i == 'nnSVG': df.append(idata.uns[i][['padj']].rename(columns = {'padj': i})) elif i == 'scGCO': df.append(idata.uns[i].set_index(idata.uns['scGCO'].columns[0])[['fdr']].rename(columns = {'fdr': i})) df = pd.concat(df, axis=1).fillna(1) arr = [combine_pvalues(x, method='stouffer')[1] for x in df.to_numpy()] df['adj_pvalue'] = arr df_sub = df[df['adj_pvalue']<threshold] print(f'{len(df_sub)}/{len(df)} SVIs identified (threshold={threshold}).') idata.uns['svi'] = df return df, df_sub def combine_SVI_strict(idata, threshold=0.01): methods = np.array(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'])[np.isin(['SOMDE', 'SpatialDE', 'SpatialDE2', 'SPARKX', 'nnSVG', 'scGCO', 'gearyC', 'moranI'],list(idata.uns.keys()))] print(f'Using the results from SVI identification methods: {methods}') df = [] for i in methods: if i == 'SOMDE': df.append(idata.uns[i].set_index('g')[['qval']].rename(columns = {'qval': i})) elif i == 'SpatialDE2': df.append(idata.uns[i][['padj']].rename(columns = {'padj': i})) elif i == 'SpatialDE': df.append(idata.uns[i].set_index('g')[['qval']].rename(columns = {'qval': i})) elif i == 'SPARKX': df.append(idata.uns[i][['adjustedPval']].rename(columns = {'adjustedPval': i})) # elif i == 'SPARKX': # df.append(idata.uns[i].set_index('rn')[['fdr']].rename(columns = {'fdr': i})) elif i == 'nnSVG': df.append(idata.uns[i][['padj']].rename(columns = {'padj': i})) elif i == 'scGCO': df.append(idata.uns[i].set_index(idata.uns['scGCO'].columns[0])[['fdr']].rename(columns = {'fdr': i})) df = pd.concat(df, axis=1).fillna(1) df_sub = df[(df<threshold).all(axis=1)] print(f'{len(df_sub)}/{len(df)} SVIs identified (threshold={threshold}).') idata.uns['svi'] = df return df, df_sub def combine_SVI_somde(idata, threshold=0.01): print(f'Using the SOMDE results') df = idata.uns['SOMDE'].set_index('g')[['qval']].rename(columns = {'qval': 'SOMDE'}).fillna(1) df_sub = df[(df<threshold).all(axis=1)] print(f'{len(df_sub)}/{len(df)} SVIs identified (threshold={threshold}).') idata.uns['svi'] = df return df, df_sub def eva_SVI_notf(idata, svi_df_strict): import seaborn as sns from statannotations.Annotator import Annotator methods = np.array(['moranI', 'gearyC', 'SOMDE', 'nnSVG'])[np.isin(['SOMDE', 'nnSVG', 'gearyC', 'moranI'],list(idata.uns.keys()))] print(f'evaluating with {methods}') dfs = [] metrics = [] for i in methods: if i == 'gearyC': dfs.append(-idata.uns['gearyC'][['C']]) metrics.append("Geary\nC (rev.)") elif i == 'moranI': dfs.append(idata.uns['moranI'][['I']]), metrics.append("Moran\nI") elif i == 'SOMDE': dfs.append(idata.uns['SOMDE'].set_index('g')['FSV']), metrics.append("FSV\n(SOMDE)") dfs.append(idata.uns['SOMDE'].set_index('g')['LLR']), metrics.append("LR\n(SOMDE)") elif i == 'nnSVG': dfs.append(idata.uns['nnSVG']['LR_stat']), metrics.append("LR\n(nnSVG)") df = pd.concat(dfs, axis=1) df.columns=metrics normalized_df=(df-df.min())/(df.max()-df.min()) normalized_df = normalized_df.fillna(0) normalized_df['Category'] = 'Excluded' normalized_df.loc[svi_df_strict.index, 'Category'] = 'SVI' normalized_df = normalized_df.melt(id_vars='Category', value_vars=metrics, var_name='Metric') normalized_df = normalized_df.sort_values('Category') if normalized_df['Category'].nunique()!=1: ax = sns.boxplot(data=normalized_df,x='Metric',y='value', hue='Category', palette={'SVI':'#80b1d3','Excluded': '#fb8072'}, width=0.8, hue_order=['SVI', 'Excluded']) pairs = [] for i in metrics: pairs.append( ((i, 'SVI'), (i, 'Excluded'))) annot = Annotator(ax, pairs, data=normalized_df, x='Metric',y='value', hue='Category', hue_order=['SVI', 'Excluded']) annot.configure(test='Mann-Whitney-gt',comparisons_correction="BH", correction_format="replace") annot.apply_test() annot.annotate() else: ax =sns.boxplot(data=normalized_df,x='Metric',y='value', hue='Category', palette={'SVI':'#80b1d3'}, width=0.8, hue_order=['SVI']) ax.legend(loc='upper center',ncol=2, bbox_to_anchor=(0.5, 1.1), frameon=False) ax.set_ylabel('') ax.set_xlabel('') def eva_SVI(idata, svi_df_strict): import seaborn as sns import matplotlib.pyplot as plt from statannotations.Annotator import Annotator methods = np.array(['moranI', 'gearyC', 'SOMDE', 'nnSVG'])[np.isin(['moranI', 'gearyC', 'SOMDE', 'nnSVG'],list(idata.uns.keys()))] print(f'evaluating with {methods}') dfs = [] metrics = [] for i in methods: if i == 'gearyC': dfs.append(-idata.uns['gearyC'][['C']]) metrics.append("Geary\nC (rev.)") elif i == 'moranI': dfs.append(idata.uns['moranI'][['I']]), metrics.append("Moran\nI") elif i == 'SOMDE': dfs.append(idata.uns['SOMDE'].set_index('g')['FSV']), metrics.append("FSV\n(SOMDE)") dfs.append(idata.uns['SOMDE'].set_index('g')['LLR']), metrics.append("LR\n(SOMDE)") elif i == 'nnSVG': dfs.append(idata.uns['nnSVG']['LR_stat']), metrics.append("LR\n(nnSVG)") dfs.append(idata.uns['tf_corr'].max(axis=1)) metrics.append('TF corr') df = pd.concat(dfs, axis=1) df.columns=metrics normalized_df=(df-df.min())/(df.max()-df.min()) normalized_df['Category'] = 'Excluded' normalized_df.loc[svi_df_strict.index, 'Category'] = 'SVI' normalized_df = normalized_df.melt(id_vars='Category', value_vars=metrics, var_name='Metric').dropna() plt.figure(figsize=(6, 4)) plt.rc('font', size=12) if normalized_df['Category'].nunique()!=1: ax = sns.boxplot(data=normalized_df,x='Metric',y='value', hue='Category', palette={'SVI':'#F7994A','Excluded': '#7F46C0'}, width=0.8, hue_order=['SVI', 'Excluded']) # ax = sns.boxplot(data=normalized_df,x='Metric',y='value', hue='Category', palette={'SVI':'#80b1d3','Excluded': '#fb8072'}, width=0.8, hue_order=['SVI', 'Excluded']) pairs = [] for i in metrics: pairs.append( ((i, 'SVI'), (i, 'Excluded'))) annot = Annotator(ax, pairs, data=normalized_df, x='Metric',y='value', hue='Category', hue_order=['SVI', 'Excluded']) annot.configure(test='Mann-Whitney-gt',comparisons_correction="BH", correction_format="replace") annot.apply_test() annot.annotate() else: ax =sns.boxplot(data=normalized_df,x='Metric',y='value', hue='Category', palette={'SVI':'#80b1d3'}, width=0.8, hue_order=['SVI']) ax.legend(loc='upper center',ncol=2, bbox_to_anchor=(0.5, 1.15), frameon=False) ax.set_ylabel('') ax.set_xlabel('') return normalized_df def eva_pattern(idata): import matplotlib.pyplot as plt import seaborn as sns from statannotations.Annotator import Annotator dfs = [] pairs = [] dummy_df = pd.get_dummies(idata.var['label']) dummy_df = pd.concat([idata.var[[f'pattern_correlation_{i}' for i in range(idata.obsm['pattern_score'].shape[1])]+['is_svi']], dummy_df], axis=1) dummy_df = dummy_df[dummy_df['is_svi'] == 1] for i in range(idata.obsm['pattern_score'].shape[1]): subdf = dummy_df[[i, f'pattern_correlation_{i}']] subdf.columns = ['membership', 'correlation'] subdf['pattern'] = i dfs.append(subdf) pairs.append(((i, 'member'), (i, 'non-member'))) maindf = pd.concat(dfs) maindf = maindf.sort_values('membership') maindf['membership'] = maindf['membership'].replace(0, 'non-member') maindf.loc[maindf['membership']!='non-member', 'membership'] = 'member' ax=sns.boxplot(data=maindf, x='pattern', y='correlation', hue='membership', hue_order=['member', 'non-member'], palette={'member':'#F7994A','non-member': '#7F46C0'}, width=0.8) ax.legend(loc='upper center',ncol=2, bbox_to_anchor=(0.5, 1.15), frameon=False) annot = Annotator(ax, pairs, data=maindf, x='pattern',y='correlation', hue='membership') annot.configure(test='Mann-Whitney-ls',comparisons_correction="BH", correction_format="replace", fontsize=13) annot.apply_test() annot.annotate() plt.xlabel('SPIDER SVI patterns', fontsize=12) plt.title('SPIDER SVI pattern evaluation', y=1.1, fontsize=12) plt.ylabel('LRI-pattern correlations', fontsize=12) class dotdict(dict): """dot.notation access to dictionary attributes""" __getattr__ = dict.get __setattr__ = dict.__setitem__ __delattr__ = dict.__delitem__ # SVI pattern generation with Gaussian process mixture model def SVI_patterns(idata, svi_df_strict, iter=1000, pattern_prune_threshold=1e-6, predefined_pattern_number=-1): from spider import SpatialDE2 allsignifgenes = svi_df_strict.index.to_numpy() if predefined_pattern_number == -1: param_obj = SpatialDE2.SpatialPatternParameters(maxiter=iter, pattern_prune_threshold=pattern_prune_threshold) else: param_obj = SpatialDE2.SpatialPatternParameters(maxiter=iter, pattern_prune_threshold=pattern_prune_threshold, nclasses=predefined_pattern_number) upper_patterns, _ = SpatialDE2.spatial_patterns(idata, normalized=True, genes=allsignifgenes, rng=np.random.default_rng(seed=45), params=param_obj, copy=False) if (predefined_pattern_number == -1) & ((upper_patterns.patterns.shape[1] > 20) | (upper_patterns.patterns.shape[1] < 5) | (len(np.unique(upper_patterns.patterns)) == 1)): param_obj = SpatialDE2.SpatialPatternParameters(maxiter=iter, pattern_prune_threshold=pattern_prune_threshold, nclasses=20) upper_patterns, _ = SpatialDE2.spatial_patterns(idata, normalized=True, genes=allsignifgenes, rng=np.random.default_rng(seed=45), params=param_obj, copy=False) if ((upper_patterns.patterns.shape[1] < 3) | (len(np.unique(upper_patterns.patterns)) == 1)) : param_obj = SpatialDE2.SpatialPatternParameters(maxiter=iter, pattern_prune_threshold=pattern_prune_threshold, nclasses=5) upper_patterns, _ = SpatialDE2.spatial_patterns(idata, normalized=True, genes=allsignifgenes, rng=np.random.default_rng(seed=45), params=param_obj, copy=False) # drop empty patterns labels, patterns, prob = upper_patterns.labels, upper_patterns.patterns, upper_patterns.pattern_probabilities uni_labels = np.sort(np.unique(labels)) value_counts = np.array([np.sum(labels == i) for i in uni_labels]) drop_labels = [] for i in range(patterns.shape[1]): if len(np.unique(patterns[:, i])) == 1: uni_labels = np.delete(uni_labels, np.where(uni_labels == i)) drop_labels.append(i) for i in uni_labels: if i >= patterns.shape[1]: drop_labels.append(i) uni_labels = np.delete(uni_labels, np.where(uni_labels == i)) new_label_map = dict(zip(uni_labels, range(len(uni_labels)))) for i in drop_labels: new_label_map[i] = -1 print(new_label_map) patterns = patterns[:, uni_labels] prob = prob[:, uni_labels] new_labels = np.array([new_label_map[l] for l in labels]) print(new_labels) upper_patterns = dotdict({ 'labels': new_labels, 'patterns': patterns, 'pattern_probabilities': prob }) value_counts = np.array([np.sum(new_labels == i) for i in np.unique(new_labels)]) print(value_counts) if ((upper_patterns.patterns.shape[1] < 3) | (len(np.unique(upper_patterns.patterns)) == 1)) : print(f'using controlled pattern') if predefined_pattern_number == -1: predefined_pattern_number = 10 histology_results, patterns, prob = SVI_patterns_v1(idata, svi_df_strict, components=predefined_pattern_number) patterns = patterns.to_numpy() labels = histology_results['pattern'].to_numpy() uni_labels = np.sort(np.unique(labels)) patterns = patterns[:, uni_labels] prob = prob[:, uni_labels] new_label_map = dict(zip(uni_labels, range(len(uni_labels)))) new_labels = np.array([new_label_map[l] for l in labels]) histology_results['pattern'] = new_labels upper_patterns = dotdict({ 'labels': histology_results['pattern'].to_numpy(), 'patterns': patterns, 'pattern_probabilities': prob }) print(f'eventually found {upper_patterns.patterns.shape[1]} patterns') idata.var['label'] = -1 if len(upper_patterns.labels) != len(allsignifgenes): allsignifgenes = svi_df_strict.index.to_numpy() print(upper_patterns.labels) idata.var.loc[allsignifgenes, 'label'] = upper_patterns.labels idata.var[[f'pattern_membership_{x}' for x in range(upper_patterns.pattern_probabilities.shape[1])]] = 0 idata.var.loc[allsignifgenes, [f'pattern_membership_{x}' for x in range(upper_patterns.pattern_probabilities.shape[1])]] = upper_patterns.pattern_probabilities idata.obsm['pattern_score'] = upper_patterns.patterns idata.var[[f'pattern_correlation_{x}' for x in range(idata.obsm['pattern_score'].shape[1])]] = 0 corr_df=pd.concat([idata.to_df(),pd.DataFrame(idata.obsm['pattern_score'],index=idata.obs_names)],axis=1).corr().loc[idata.var_names, range(idata.obsm['pattern_score'].shape[1])] idata.var[[f'pattern_correlation_{x}' for x in range(idata.obsm['pattern_score'].shape[1])]] = corr_df.to_numpy() def SVI_patterns_v2(idata, svi_df_strict, iter=1000, pattern_prune_threshold=1e-8, predefined_pattern_number=-1): from spider import SpatialDE2 allsignifgenes = svi_df_strict.index.to_numpy() if 'lengthscale' in idata.uns['SpatialDE'].columns: l=idata.uns['SpatialDE'].loc[allsignifgenes]['lengthscale'].to_list() if len(np.unique(l)) < 2: allsignifgenes = np.intersect1d(allsignifgenes, idata.uns['SOMDE']['g'].to_numpy()) l=idata.uns['SOMDE'].set_index('g').loc[allsignifgenes]['l'].to_list() else: l=idata.uns['SOMDE'].set_index('g').loc[allsignifgenes]['l'].to_list() pattern_number = 1000 print(predefined_pattern_number) if (len(np.unique(l)) <= 1) | (predefined_pattern_number != -1): pattern_number = -1 for count in range(5): print(pattern_number, count, pattern_prune_threshold) if (pattern_number > 100) and (pattern_prune_threshold<1) and (count < 4): param_obj = SpatialDE2.SpatialPatternParameters(lengthscales=l,maxiter=iter, pattern_prune_threshold=pattern_prune_threshold) upper_patterns, _ = SpatialDE2.spatial_patterns(idata, genes=allsignifgenes, rng=np.random.default_rng(seed=45), params=param_obj, copy=False) pattern_number = upper_patterns.patterns.shape[1] pattern_prune_threshold = pattern_prune_threshold*100 if pattern_number < 2: pattern_number = -1 elif (pattern_number < 100) and (pattern_number > 2): break else: print(f'using controlled pattern') if predefined_pattern_number == -1: predefined_pattern_number = 5 histology_results, patterns, prob = SVI_patterns_v1(idata, svi_df_strict, components=predefined_pattern_number) upper_patterns = dotdict({ 'labels': histology_results['pattern'].to_numpy(), 'patterns': patterns.to_numpy(), 'pattern_probabilities': prob }) pattern_number = patterns.shape[1] break print(f'eventually found {pattern_number} patterns') idata.var['label'] = -1 if len(upper_patterns.labels) != len(allsignifgenes): allsignifgenes = svi_df_strict.index.to_numpy() idata.var.loc[allsignifgenes, 'label'] = upper_patterns.labels idata.var[[f'pattern_membership_{x}' for x in range(upper_patterns.pattern_probabilities.shape[1])]] = 0 idata.var.loc[allsignifgenes, [f'pattern_membership_{x}' for x in range(upper_patterns.pattern_probabilities.shape[1])]] = upper_patterns.pattern_probabilities idata.obsm['pattern_score'] = upper_patterns.patterns idata.var[[f'pattern_correlation_{x}' for x in range(idata.obsm['pattern_score'].shape[1])]] = 0 corr_df=pd.concat([idata.to_df(),pd.DataFrame(idata.obsm['pattern_score'],index=idata.obs_names)],axis=1).corr().loc[idata.var_names, range(idata.obsm['pattern_score'].shape[1])] idata.var[[f'pattern_correlation_{x}' for x in range(idata.obsm['pattern_score'].shape[1])]] = corr_df.to_numpy() def SVI_patterns_v1(idata, svi_df_strict, components=5): import NaiveDE import SpatialDE np.random.seed(20230617) df = idata.to_df().T.loc[svi_df_strict.index] corinfo = idata.obs corinfo["total_counts"]=df.sum(0) X=corinfo[['row','col']].values.astype(np.float32) # norm_expr = NaiveDE.stabilize(df).T from scipy import optimize phi_hat, _ = optimize.curve_fit(lambda mu, phi: mu + phi * mu ** 2, df.mean(1), df.var(1)) dfm = np.log(df + 1. / (2 * np.abs(phi_hat[0]))) nres = NaiveDE.regress_out(corinfo, dfm, 'np.log(total_count)').T # resid_expr = NaiveDE.regress_out(corinfo, norm_expr.T, 'np.log(total_counts)').T print('finished regression') results = SpatialDE.run(X,nres) # for generating lengthscale print('finished lengthscale') histology_results, patterns, prob = spatial_patterns(X, nres, results, C=components,l=results['l'].median()+0.5, verbosity=1) print('finished fitting') return histology_results, patterns, prob def idata_pattern_to_spot(idata): belonging = {} cells = idata.uns['cell_meta'].index for i in cells: belonging[i] = [] for pair in idata.obs.reset_index()[['index', 'A', 'B']].to_numpy(): belonging[pair[1]].append(pair[0]) belonging[pair[2]].append(pair[0]) score = pd.DataFrame(idata.obsm['pattern_score'], index=idata.obs_names) df = pd.concat([score.loc[belonging[c]].mean() for c in cells], axis=1).T df.index = cells idata.uns['cell_pattern'] = df print(f'Added key cell_pattern in idata.uns') return df def pattern_label_corr(data, pattern_df, label_key): from sklearn.feature_selection import mutual_info_classif label_df = pd.get_dummies(data.obs[label_key]).T mi=pd.DataFrame([mutual_info_classif(pattern_df, x) for x in label_df.to_numpy()], index=label_df.index, columns=pattern_df.columns) corr = pd.concat([pattern_df, label_df.T], axis=1).corr().loc[label_df.index, pattern_df.columns] return mi, corr # code created by SpatialDE def spatial_patterns(X, exp_mat, DE_mll_results, C, l, **kwargs): ''' Group spatially variable genes into spatial patterns using automatic expression histology (AEH). X : Spatial coordinates exp_mat : Expression matrix, appropriately normalised. DE_mll_results : Results table from SpatialDE, after filtering for significance level. C : The number of spatial patterns **kwards are passed on to the function fit_patterns() Returns pattern_results : A DataFrame with pattern membership information for each gene patterns : The posterior mean underlying expression for genes in given spatial patterns. ''' Y = exp_mat[DE_mll_results['g']].values.T # This is important, we only care about co-expression, not absolute levels. Y = (Y.T - Y.mean(1)).T Y = (Y.T / Y.std(1)).T _, m, r, _ = fit_patterns(X, Y, C, l, **kwargs) cres = pd.DataFrame({'g': DE_mll_results['g'], 'pattern': r.argmax(1), 'membership': r.max(1)}) m = pd.DataFrame.from_records(m) m.index = exp_mat.index return cres, m, r def make_elbojective(Y, r, m, X, K_0, s2e_0, pi=None): def elbojective(log_s2e): return -ELBO(Y, r, m, np.exp(log_s2e), K_0, K_0, s2e_0, pi) return elbojective def SE_kernel(X, l): X = np.array(X) Xsq = np.sum(np.square(X), 1) R2 = -2. * np.dot(X, X.T) + (Xsq[:, None] + Xsq[None, :]) R2 = np.clip(R2, 1e-12, np.inf) return np.exp((-R2 / (2 * l ** 2)).astype(float)) def ELBO(Y, r, m, s2e, K, K_0, s2e_0, pi=None): L = ln_P_YZms(Y, r, m, s2e, pi) + ln_P_Z(r, pi) + ln_P_mu(m, K) \ - ln_Q_Z(r, r) - ln_Q_mu(K_0, r, s2e_0) return L def factor(K): S, U = np.linalg.eigh(K) # .clip removes negative eigenvalues return U, np.clip(S, 1e-8, None) def ln_Q_mu(K, Z, s2e): ''' Expecation of ln Q(mu) ''' N = K.shape[0] C = Z.shape[1] G_k = Z.sum(0) ll = 0 U, S = factor(K) for k in range(C): ll = ll - (1. / S + G_k[k] / s2e).sum() ll = ll + N * np.log(2 * np.pi) ll = -0.5 * ll return ll def ln_Q_Z(Z, r): ''' Expectation of ln Q(Z) ''' return np.sum(Z * np.log(r)) def ln_P_YZms(Y, Z, mu, s2e, pi=None): ''' Expecation of ln P(Y | Z, mu, s2e) ''' G = Y.shape[0] N = Y.shape[1] C = Z.shape[1] if pi is None: pi = np.ones(C) / C log_rho = np.log(pi[None, :]) \ - 0.5 * N * np.log(s2e) \ - 0.5 * np.sum((mu.T[None, :, :] - Y[:, None, :]) ** 2, 2) / s2e \ - 0.5 * N * np.log(2 * np.pi) return (Z * log_rho).sum() def ln_P_Z(Z, pi=None): ''' Expectation of ln P(Z) ''' C = Z.shape[1] if pi is None: pi = np.ones(C) / C return np.dot(Z, np.log(pi)).sum() def ln_P_mu(mu, K): ''' Expectation of ln P(mu) ''' N = K.shape[0] C = mu.shape[1] ll = 0 for k in range(C): ll = ll + np.linalg.det(K) ll = ll + mu[:, k].dot(np.linalg.solve(K, mu[:, k])) ll = ll + N * np.log(2 * np.pi) ll = -0.5 * ll return ll def fit_patterns(X, Y, C, l, s2e_0=1.0, verbosity=0, maxiter=100, printerval=1, opt_interval=1, delta_elbo_threshold=1e-2): ''' Fit spatial patterns using Automatic Expression Histology. X : Spatial coordinates Y : Gene expression values C : The number of patterns l : The charancteristic length scale of the clusters Returns final_elbo : The final ELBO value. m : The posterior mean underlying expression values for each cluster. r : The posterior pattern assignment probabilities for each gene and pattern. s2e : The estimated noise parameter of the model ''' # Set up constants G = Y.shape[0] N = Y.shape[1] eps = 1e-8 * np.eye(N) s2e = s2e_0 K = SE_kernel(X, l) + eps # Randomly initialize r = np.random.uniform(size=(G, C)) r = r / r.sum(0) pi = r.sum(0) / G m = np.random.normal(size=(N, C)) elbo_0 = ELBO(Y, r, m, s2e, K, K, s2e, pi) elbo_1 = elbo_0 if verbosity > 0: print('iter {}, ELBO: {:0.2e}'.format(0, elbo_1)) if verbosity > 1: print() for i in range(maxiter): if (i % opt_interval == (opt_interval - 1)): elbojective = make_elbojective(Y, r, m, X, K, s2e, pi) o = optimize.minimize_scalar(elbojective) s2e = np.exp(o.x) r = Q_Z_expectation(m, Y, s2e, N, C, G, pi) m = Q_mu_expectation(r, Y, K, s2e) pi = r.sum(0) / G elbo_0 = elbo_1 elbo_1 = ELBO(Y, r, m, s2e, K, K, s2e, pi) delta_elbo = np.abs(elbo_1 - elbo_0) if verbosity > 0 and (i % printerval == 0): print('iter {}, ELBO: {:0.2e}, delta_ELBO: {:0.2e}'.format(i + 1, elbo_1, delta_elbo)) if verbosity > 1: print('ln(l): {:0.2f}, ln(s2e): {:.2f}'.format(np.log(l), np.log(s2e))) if verbosity > 2: line1 = 'P(Y | Z, mu, s2e): {:0.2e}, P(Z): {:0.2e}, P(mu): {:0.2e}' \ .format(ln_P_YZms(Y, r, m, s2e, pi), ln_P_Z(r, pi), ln_P_mu(m, K)) line2 = 'Q(Z): {:0.2e}, Q(mu): {:0.2e}'.format(ln_Q_Z(r, r), ln_Q_mu(K, r, s2e)) print(line1 + '\n' + line2) if verbosity > 1: print() if delta_elbo < delta_elbo_threshold: if verbosity > 0: print('Converged on iter {}'.format(i + 1)) break else: print('Warning! ELBO dit not converge after {} iters!'.format(i + 1)) final_elbo = ELBO(Y, r, m, s2e, K, K, s2e, pi) return final_elbo, m, r, s2e def Q_Z_expectation(mu, Y, s2e, N, C, G, pi=None): if pi is None: pi = np.ones(C) / C log_rho = np.log(pi[None, :]) \ - 0.5 * N * np.log(s2e) \ - 0.5 * np.sum((mu.T[None, :, :] - Y[:, None, :]) ** 2, 2) / s2e \ - 0.5 * N * np.log(2 * np.pi) # Subtract max per row for numerical stability, and add offset from 0 for same reason. rho = np.exp(log_rho - log_rho.max(1)[:, None]) + 1e-12 # Then evaluate softmax r = (rho.T / (rho.sum(1))).T return r def Q_mu_k_expectation(Z_k, Y, K, s2e): y_k_tilde = np.dot(Z_k, Y) / s2e Sytk = np.dot(K, y_k_tilde) IpSDk = np.eye(K.shape[0]) + K * Z_k.sum() / s2e m_k = np.linalg.solve(IpSDk, Sytk) return m_k def Q_mu_expectation(Z, Y, K, s2e): m = np.zeros((Y.shape[1], Z.shape[1])) y_k_tilde = np.dot(Z.T, Y).T / s2e for k in range(Z.shape[1]): m[:, k] = Q_mu_k_expectation(Z[:, k], Y, K, s2e) return m