diff --git a/setup.cfg b/setup.cfg index 27ef08e..821acec 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,17 +5,17 @@ [metadata] name = scruncher -description = Add a short description here! +description = Helper functions for scRNA-seq author = Jayaram Kancherla author_email = jayaram.kancherla@gmail.com license = MIT license_files = LICENSE.txt long_description = file: README.md long_description_content_type = text/markdown; charset=UTF-8; variant=GFM -url = https://github.com/pyscaffold/pyscaffold/ +url = https://github.com/biocpy/scruncher # Add here related links, for example: project_urls = - Documentation = https://pyscaffold.org/ + Documentation = https://github.com/biocpy/scruncher # Source = https://github.com/pyscaffold/pyscaffold/ # Changelog = https://pyscaffold.org/en/latest/changelog.html # Tracker = https://github.com/pyscaffold/pyscaffold/issues @@ -41,7 +41,7 @@ package_dir = =src # Require a min/specific Python version (comma-separated conditions) -# python_requires = >=3.8 +python_requires = >=3.8 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in @@ -49,7 +49,10 @@ package_dir = # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - + numpy + pandas + singlecellexperiment + mopsy [options.packages.find] where = src diff --git a/src/scruncher/__init__.py b/src/scruncher/__init__.py index e451f10..78c8c80 100644 --- a/src/scruncher/__init__.py +++ b/src/scruncher/__init__.py @@ -14,3 +14,6 @@ __version__ = "unknown" finally: del version, PackageNotFoundError + + +from .zscores import compute_z_score, summarize_zscore \ No newline at end of file diff --git a/src/scruncher/skeleton.py b/src/scruncher/skeleton.py deleted file mode 100644 index 6112d13..0000000 --- a/src/scruncher/skeleton.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -This is a skeleton file that can serve as a starting point for a Python -console script. To run this script uncomment the following lines in the -``[options.entry_points]`` section in ``setup.cfg``:: - - console_scripts = - fibonacci = scruncher.skeleton:run - -Then run ``pip install .`` (or ``pip install -e .`` for editable mode) -which will install the command ``fibonacci`` inside your current environment. - -Besides console scripts, the header (i.e. until ``_logger``...) of this file can -also be used as template for Python modules. - -Note: - This file can be renamed depending on your needs or safely removed if not needed. - -References: - - https://setuptools.pypa.io/en/latest/userguide/entry_point.html - - https://pip.pypa.io/en/stable/reference/pip_install -""" - -import argparse -import logging -import sys - -from scruncher import __version__ - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - -_logger = logging.getLogger(__name__) - - -# ---- Python API ---- -# The functions defined in this section can be imported by users in their -# Python scripts/interactive interpreter, e.g. via -# `from scruncher.skeleton import fib`, -# when using this Python module as a library. - - -def fib(n): - """Fibonacci example function - - Args: - n (int): integer - - Returns: - int: n-th Fibonacci number - """ - assert n > 0 - a, b = 1, 1 - for _i in range(n - 1): - a, b = b, a + b - return a - - -# ---- CLI ---- -# The functions defined in this section are wrappers around the main Python -# API allowing them to be called directly from the terminal as a CLI -# executable/script. - - -def parse_args(args): - """Parse command line parameters - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--help"]``). - - Returns: - :obj:`argparse.Namespace`: command line parameters namespace - """ - parser = argparse.ArgumentParser(description="Just a Fibonacci demonstration") - parser.add_argument( - "--version", - action="version", - version=f"scruncher {__version__}", - ) - parser.add_argument(dest="n", help="n-th Fibonacci number", type=int, metavar="INT") - parser.add_argument( - "-v", - "--verbose", - dest="loglevel", - help="set loglevel to INFO", - action="store_const", - const=logging.INFO, - ) - parser.add_argument( - "-vv", - "--very-verbose", - dest="loglevel", - help="set loglevel to DEBUG", - action="store_const", - const=logging.DEBUG, - ) - return parser.parse_args(args) - - -def setup_logging(loglevel): - """Setup basic logging - - Args: - loglevel (int): minimum loglevel for emitting messages - """ - logformat = "[%(asctime)s] %(levelname)s:%(name)s:%(message)s" - logging.basicConfig( - level=loglevel, stream=sys.stdout, format=logformat, datefmt="%Y-%m-%d %H:%M:%S" - ) - - -def main(args): - """Wrapper allowing :func:`fib` to be called with string arguments in a CLI fashion - - Instead of returning the value from :func:`fib`, it prints the result to the - ``stdout`` in a nicely formatted message. - - Args: - args (List[str]): command line parameters as list of strings - (for example ``["--verbose", "42"]``). - """ - args = parse_args(args) - setup_logging(args.loglevel) - _logger.debug("Starting crazy calculations...") - print(f"The {args.n}-th Fibonacci number is {fib(args.n)}") - _logger.info("Script ends here") - - -def run(): - """Calls :func:`main` passing the CLI arguments extracted from :obj:`sys.argv` - - This function can be used as entry point to create console scripts with setuptools. - """ - main(sys.argv[1:]) - - -if __name__ == "__main__": - # ^ This is a guard statement that will prevent the following code from - # being executed in the case someone imports this file instead of - # executing it as a script. - # https://docs.python.org/3/library/__main__.html - - # After installing your project with pip, users can also run your Python - # modules as scripts via the ``-m`` flag, as defined in PEP 338:: - # - # python -m scruncher.skeleton 42 - # - run() diff --git a/src/scruncher/zscores.py b/src/scruncher/zscores.py new file mode 100644 index 0000000..50841a6 --- /dev/null +++ b/src/scruncher/zscores.py @@ -0,0 +1,211 @@ +from math import sqrt +from typing import Sequence, Union +from warnings import warn + +import numpy as np +import pandas as pd +from mopsy import rowmean +from mopsy.helpers import apply +from scipy import sparse as sp +from singlecellexperiment import SingleCellExperiment + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +# Modified to work directly with a matrix where rows are genes and columns are cells +def calc_background_mean_std(x: Union[sp.spmatrix, np.ndarray], means: np.ndarray, codes: np.ndarray, n_bins: int): + """Calculate background mean and standard deviation. + + Args: + x: + A matrix, rows are genes, columns are cells. + + means: + Pre-computed means per bin. + + codes: + Categories for each bin. + + n_bins: + Number of bins. + + Returns: + A tuple containing the mean and standard deviations. + """ + bin_count = np.zeros(n_bins, dtype=np.int32) + background_mean = np.zeros((n_bins, x.shape[1]), dtype=np.float64) + background_std = np.ones((n_bins, x.shape[1]), dtype=np.float64) + + for i in range(x.shape[0]): + bin_count[codes[i]] += 1 + + if sp.issparse(x): + if not isinstance(x, sp.csr_matrix): + x = x.tocsr() + + for i in range(x.shape[0]): + bin_idx = codes[i] + for j in range(x.indptr[i], x.indptr[i + 1]): + col = x.indices[j] + cexpr = x.data[j] - means[i] + background_mean[bin_idx, col] += cexpr + background_std[bin_idx, col] += cexpr * cexpr + else: + for i in range(x.shape[0]): + bin_idx = codes[i] + for j in range(x.shape[1]): + cexpr = x[i, j] - means[i] + background_mean[bin_idx, j] += cexpr + background_std[bin_idx, j] += cexpr * cexpr + + for i in range(n_bins): + if bin_count[i] > 0: + for j in range(x.shape[1]): + background_mean[i, j] /= bin_count[i] + estd = 0.0 + if bin_count[i] > 1: + estd = sqrt( + (background_std[i, j] - bin_count[i] * background_mean[i, j] * background_mean[i, j]) + / (bin_count[i] - 1) + ) + background_std[i, j] = estd if estd > 1e-4 else 1.0 + + return background_mean, background_std + + +def compute_z_score(data: SingleCellExperiment, assay_name: str = "counts", n_bins=50): + """Compute zscore. + + Args: + data: + A single-cell data object. + + assay_name: + Name of the assay to access for the count matrix. + Defaults to "counts". + + n_bins: + Number of bins. + Defaults to 50. + + Returns: + A NumPy dense matrix containing z-scores. + """ + + mat = data.assay(assay=assay_name) + + if not isinstance(mat, sp.csr_matrix): + mat = mat.tocsr() + + # No transpose needed now - mat already has genes as rows and cells as columns + matrix = mat + + # Calculate mean across cells (axis=1) + mean_vec = np.mean(matrix, axis=1, dtype=np.float64) + + if mean_vec.size <= n_bins: + warn( + f"Number of bins ({n_bins}) is larger or equal to the total number of genes ({mean_vec.size}).", + RuntimeWarning, + ) + + try: + bins = pd.qcut(mean_vec, n_bins) + except ValueError: + warn("Detected and dropped duplicate bin edges!") + bins = pd.qcut(mean_vec, n_bins, duplicates="drop") + + if bins.value_counts().min() == 1: + warn("Detected bins with only 1 gene!", RuntimeWarning) + + bins = bins.rename_categories(dict(zip(bins.categories, bins.categories.astype(str)))) + + n_bins = bins.categories.size + codes = bins.codes.astype(np.int32) + + mean_vec = mean_vec.astype(np.float64) + if matrix.dtype == np.float64: + matrix = matrix.astype(np.float32) + + fmeans, fstds = calc_background_mean_std(matrix, mean_vec, codes, n_bins) + + mat = matrix + if sp.issparse(mat): + mat = matrix.toarray() + + # Reshape means and stds for broadcasting + gene_bin_indices = np.arange(len(codes)) + result_reimp = np.zeros(mat.shape, dtype=np.float32) + + for i in range(mat.shape[0]): + bin_idx = codes[i] + result_reimp[i, :] = ( + mat[i, :].astype(np.float32) - mean_vec[i].astype(np.float32) - fmeans[bin_idx, :].astype(np.float32) + ) / fstds[bin_idx, :].astype(np.float32) + + return result_reimp + + +def summarize_zscore( + data: SingleCellExperiment, cell_labels: Sequence[str], assay_name: str = "counts", n_bins: int = 50 +) -> SingleCellExperiment: + """Generate pseudo-bulk like zscores averages based on cell groups. + + Args: + data: + A single-cell data object. + + cell_labels: + Cell labels to compute pseudo bulk over. + + assay_name: + Name of the assay to access for the count matrix. + Defaults to "counts". + + n_bins: + Number of bins. + Defaults to 50. + + Returns: + A `SingleCellExperiment` object woth mean zscores. + """ + matrix = data.assay(assay=assay_name) + # No transpose needed + mat = matrix + + z_mat = compute_z_score(data, assay_name=assay_name, n_bins=n_bins) + + feature_copy = data.get_row_data().copy() + + # Count non-zero cells for each gene + non_zero_cells, labels = count_non_zero(matrix=mat, cell_labels=cell_labels) + + # Calculate mean z-scores across cell groups + # Now we need to use column means since our matrix has genes as rows + z_mat_mean, labels = apply(lambda x: np.mean(x, axis=1), z_mat, 1, cell_labels) + + num_cells = pd.Series(cell_labels).value_counts()[labels].values + + return SingleCellExperiment( + column_data=pd.DataFrame({"cell_labels": labels, "num_cells": num_cells}), + row_data=feature_copy, + assays={"zscores_mean": z_mat_mean, "non_zero_cells": non_zero_cells}, + ) + + +def nz_func(arr: Union[np.ndarray, sp._csr.csr_matrix]) -> np.ndarray: + if isinstance(arr, np.ndarray): + return np.count_nonzero(arr, axis=1) # Changed axis from 0 to 1 + else: + return np.count_nonzero(arr.todense(), axis=1) # Changed axis from 0 to 1 + + +def count_non_zero(matrix, cell_labels: Sequence) -> np.ndarray: + # Changed axis from 0 to 1 since we're now counting non-zeros across cells for each gene + non_zero_cts, labels = apply(nz_func, matrix, 1, cell_labels) + if not isinstance(non_zero_cts, np.ndarray): + non_zero_cts = non_zero_cts.toarray() + + return non_zero_cts, labels \ No newline at end of file diff --git a/tests/test_skeleton.py b/tests/test_skeleton.py deleted file mode 100644 index 5f192c6..0000000 --- a/tests/test_skeleton.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from scruncher.skeleton import fib, main - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - - -def test_fib(): - """API Tests""" - assert fib(1) == 1 - assert fib(2) == 1 - assert fib(7) == 13 - with pytest.raises(AssertionError): - fib(-10) - - -def test_main(capsys): - """CLI Tests""" - # capsys is a pytest fixture that allows asserts against stdout/stderr - # https://docs.pytest.org/en/stable/capture.html - main(["7"]) - captured = capsys.readouterr() - assert "The 7-th Fibonacci number is 13" in captured.out