singleCellHaystack

This repository contains a Python implementation of singleCellHaystack (version >= 1.0.0).

This package is currently in beta. The most important functionality in the R package works, but some features are not yet available. Here is a (probably incomplete) list of missing features. Some will be added in the future.

weights.advanced.Q (formerly known as use.advanced.sampling).
Seeding method for calculating grid points.
Hierarchical clustering method for cluster_genes.

Installation

You can install singleCellHaystack from PyPI:

pip install singleCellHaystack

You can install singleCellHaystack from GitHub with:

pip install git+http://github.com/ddiez/singleCellHaystack-py

Support for conda installation will be added in the future.

Example

import scanpy as sc
import singleCellHaystack as hs

adata = sc.read_h5ad("data.h5ad")

[... process adata object ...]

res = hs.haystack(adata, basis="pca")
res.top_features(n=10)

References

Our manuscript describing the updated, more generally applicable version of singleCellHaystack, including this Python implementation, was published in Scientific Reports.
Our manuscript describing the original implementation of singleCellHaystack for R (version 0.3.4) was published in Nature Communications.

If you use singleCellHaystack in your research, please cite our work using:

Vandenbon A, Diez D (2023). “A universal tool for predicting differentially active features in single-cell and spatial genomics data.” Scientific Reports, 13(1), 11830. doi:10.1038/s41598-023-38965-2.

UnboundLocalError for `scale_coords=False` in `haystack` call

Minimal example using the current GitHub revision 587530e:

import scanpy as sc
import singleCellHaystack as hs

data = sc.datasets.pbmc3k()[:100, :100]
sc.pp.pca(data)
hs.haystack(data, 'pca', scale_coords=False, random_state=0, verbose=False)

computing PCA
    with n_comps=50
    finished (0:00:00)

/opt/conda/lib/python3.9/site-packages/singleCellHaystack/_grid.py:10: ConvergenceWarning: Number of distinct clusters (90) found smaller than n_clusters (100). Possibly due to duplicate points in X.
  res = KMeans(n_clusters=ngrid_points, random_state=random_state, n_init=10).fit(x)

---------------------------------------------------------------------------
UnboundLocalError                         Traceback (most recent call last)
Cell In[81], line 6
      4 data = sc.datasets.pbmc3k()[:100, :100]
      5 sc.pp.pca(data)
----> 6 hs.haystack(data, 'pca', scale_coords=False, random_state=0, verbose=False)

File /opt/conda/lib/python3.9/site-packages/singleCellHaystack/_haystack.py:31, in haystack(x, coord, features, scale_coords, ngrid_points, n_genes_to_randomize, select_genes_randomize_method, genes_to_randomize, spline_method, n_randomizations, grid_points, pseudo, random_state, verbose, kld_method)
     28 res = None
     30 if isinstance(x, AnnData) and isinstance(coord, str):
---> 31   res = haystack_adata(adata=x, basis=coord, dims=None, scale_coords=scale_coords,
     32       ngrid_points=ngrid_points, n_genes_to_randomize=n_genes_to_randomize,
     33       select_genes_randomize_method=select_genes_randomize_method, genes_to_randomize=genes_to_randomize, spline_method=spline_method,
     34       n_randomizations=n_randomizations, grid_points=grid_points, pseudo=pseudo, random_state=random_state, verbose=verbose, kld_method=kld_method)
     36 if (isinstance(x, ndarray) or isspmatrix(x)) and isinstance(coord, ndarray):
     37   res = haystack_array(weights=x, coord=coord, features=features, scale_coords=scale_coords,
     38       ngrid_points=ngrid_points, n_genes_to_randomize=n_genes_to_randomize,
     39       select_genes_randomize_method=select_genes_randomize_method, genes_to_randomize=genes_to_randomize, spline_method=spline_method,
     40       n_randomizations=n_randomizations, grid_points=grid_points, pseudo=pseudo, random_state=random_state, verbose=verbose, kld_method=kld_method)

File /opt/conda/lib/python3.9/site-packages/singleCellHaystack/_haystack.py:213, in haystack_adata(adata, basis, dims, scale_coords, ngrid_points, n_genes_to_randomize, select_genes_randomize_method, genes_to_randomize, spline_method, n_randomizations, grid_points, pseudo, random_state, verbose, kld_method)
    210 if dims is not None:
    211   coord = coord[:, dims]
--> 213 res = haystack_array(exprs, coord, features=genes, scale_coords=scale_coords, ngrid_points=ngrid_points,
    214     n_genes_to_randomize=n_genes_to_randomize, select_genes_randomize_method=select_genes_randomize_method, genes_to_randomize=genes_to_randomize,
    215     spline_method=spline_method, n_randomizations=n_randomizations, grid_points=grid_points, pseudo=pseudo, random_state=random_state, verbose=verbose, kld_method=kld_method)
    216 return(res)

File /opt/conda/lib/python3.9/site-packages/singleCellHaystack/_haystack.py:178, in haystack_array(weights, coord, features, scale_coords, ngrid_points, n_genes_to_randomize, select_genes_randomize_method, genes_to_randomize, spline_method, n_randomizations, grid_points, pseudo, random_state, verbose, kld_method)
    157 df = DataFrame({
    158   "gene": features,
    159   "KLD": KLD,
   (...)
    163   "logpval_adj": logpval_adj
    164 })
    166 df = df.sort_values("logpval")
    168 info = {
    169   "grid_points": grid_points,
    170   "grid_dist": grid_dist,
    171   "grid_density": grid_density,
    172   "Q": Q,
    173   "P": P,
    174   "KLD_rand": KLD_rand,
    175   "pval_info": pvalData,
    176   "genes_to_randomize": genes_to_randomize,
    177   "exprs_cv": exprs_cv,
--> 178   "coord_mean": coord_mean,
    179   "coord_std": coord_std
    180 }
    182 return {
    183   "results": df,
    184   "info": info
    185 }

UnboundLocalError: local variable 'coord_mean' referenced before assignment

ddiez / singlecellhaystack-py Goto Github PK