From d074b922d7814af65bbd9e0269019fcbd64a916a Mon Sep 17 00:00:00 2001 From: Pranav Rai Date: Wed, 15 May 2024 13:50:46 +0200 Subject: [PATCH 1/4] Updating regression requirements * Fitting the regression line for cerebellum group and SST cell type was using only 1 datapoint, and producing erroneous results. Now, the requirement for fitting the regression coefficient is to have a minimum of 5 datapoints. * Added some debug statements for logging, which help in checking how well the regression ran. * Changed how the compute_region_volumes function in atlas_densities.densities.utils calculates the "id_volume" column. Earlier it was generating nan values which led to errors downstream in the pipeline. --- atlas_densities/densities/fitting.py | 15 ++++++++++++--- atlas_densities/densities/utils.py | 3 +-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 6f9d558..ade9250 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -33,6 +33,7 @@ from atlas_densities.densities import utils from atlas_densities.densities.measurement_to_density import remove_unknown_regions from atlas_densities.exceptions import AtlasDensitiesError, AtlasDensitiesWarning +import json L = logging.getLogger(__name__) @@ -367,7 +368,8 @@ def compute_average_intensities( def linear_fitting_xy( - xdata: list[float], ydata: list[float], sigma: Union[list[float], FloatArray] + xdata: list[float], ydata: list[float], sigma: Union[list[float], FloatArray], + min_data_points: int = 5 ) -> dict[str, float]: """ Compute the coefficient of the linear least-squares regression of the point cloud @@ -380,6 +382,10 @@ def linear_fitting_xy( These are the standard deviations of the values in `ydata`. Zero values are replaced by the least positive value if it exists, otherwise by 1.0. + min_data_points: minimum number of datapoints required for running + the linear regression. 
If the number of datapoints is less than + min_data_points then no fitting is done, and np.nan values are + returned. Returns: a dict of the form {"coefficient": , @@ -390,8 +396,8 @@ def linear_fitting_xy( AtlasDensitiesError if some of the `sigma`values are negative. """ - if len(xdata) == 0: - return {"coefficient": np.nan, "standard_deviation": np.nan} + if len(xdata) < min_data_points: + return {"coefficient": np.nan, "standard_deviation": np.nan, "r_square": np.nan} sigma = np.array(sigma) if np.any(sigma < 0.0): @@ -422,6 +428,7 @@ def _optimize_func(x, coefficient): ) ss_reg = np.sum((_optimize_func(xdata, parameters[0][0]) - np.mean(ydata)) ** 2) ss_tot = np.sum((np.array(ydata) - _optimize_func(xdata, parameters[0][0])) ** 2) + ss_reg + L.debug(f"Length of xdata = {xdata}.The ss_reg is {ss_reg} and ss_tot is {ss_tot}\n") # if total sum of square is null, no variance can be explained by the fitting return { "coefficient": parameters[0][0], @@ -545,6 +552,8 @@ def compute_fitting_coefficients( L.info("Computing regression coefficients for %d cell types ...", len(cell_types)) for cell_type in tqdm(cell_types): cloud = clouds[group_name][cell_type] + L.debug( + f"The length of training data for {group_name} and {cell_type} is {cloud['xdata']}") result[group_name][cell_type] = linear_fitting_xy( cloud["xdata"], cloud["ydata"], cloud["sigma"] ) diff --git a/atlas_densities/densities/utils.py b/atlas_densities/densities/utils.py index a210ade..d5c8f56 100644 --- a/atlas_densities/densities/utils.py +++ b/atlas_densities/densities/utils.py @@ -530,7 +530,7 @@ def compute_region_volumes( ) ids, counts = np.unique(annotation, return_counts=True) - result["id_volume"] = pd.Series(counts * voxel_volume, index=ids) + result.update(pd.Series(counts * voxel_volume, index=ids).rename("id_volume")) volumes = [] for id_ in hierarchy_info.index: @@ -538,5 +538,4 @@ def compute_region_volumes( volume = result.loc[list(id_set), "id_volume"].sum() volumes.append(volume) 
result["volume"] = volumes - return result From 74798d5851a0a1fe8230127dab7faaf1388d60f8 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Wed, 15 May 2024 16:46:31 +0200 Subject: [PATCH 2/4] make `min_data_points` a command line parameter for fit-average-densities --- atlas_densities/app/cell_densities.py | 9 +++++++++ atlas_densities/densities/fitting.py | 26 ++++++++++++++++++-------- tests/densities/test_fitting.py | 22 +++++++++++++++------- 3 files changed, 42 insertions(+), 15 deletions(-) diff --git a/atlas_densities/app/cell_densities.py b/atlas_densities/app/cell_densities.py index 961b327..5baeb38 100644 --- a/atlas_densities/app/cell_densities.py +++ b/atlas_densities/app/cell_densities.py @@ -808,6 +808,13 @@ def measurements_to_average_densities( help="Path to density groups ids config", show_default=True, ) +@click.option( + "--min-data-points", + type=int, + default=5, + help="minimum number of datapoints required for running the linear regression.", + show_default=True, +) @log_args(L) def fit_average_densities( hierarchy_path, @@ -820,6 +827,7 @@ def fit_average_densities( fitted_densities_output_path, fitting_maps_output_path, group_ids_config_path, + min_data_points, ): # pylint: disable=too-many-arguments, too-many-locals """ Estimate average cell densities of brain regions in `hierarchy_path` for the cell types @@ -935,6 +943,7 @@ def fit_average_densities( cell_density_stddev, region_name=region_name, group_ids_config=group_ids_config, + min_data_points=min_data_points, ) # Turn index into column to ease off the save and load operations on csv files diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index ade9250..6c58f42 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -33,7 +33,6 @@ from atlas_densities.densities import utils from atlas_densities.densities.measurement_to_density import remove_unknown_regions from atlas_densities.exceptions import 
AtlasDensitiesError, AtlasDensitiesWarning -import json L = logging.getLogger(__name__) @@ -368,8 +367,10 @@ def compute_average_intensities( def linear_fitting_xy( - xdata: list[float], ydata: list[float], sigma: Union[list[float], FloatArray], - min_data_points: int = 5 + xdata: list[float], + ydata: list[float], + sigma: Union[list[float], FloatArray], + min_data_points: int, ) -> dict[str, float]: """ Compute the coefficient of the linear least-squares regression of the point cloud @@ -441,7 +442,10 @@ def _optimize_func(x, coefficient): def compute_fitting_coefficients( - groups: dict[str, set[str]], average_intensities: pd.DataFrame, densities: pd.DataFrame + groups: dict[str, set[str]], + average_intensities: pd.DataFrame, + densities: pd.DataFrame, + min_data_points: int, ) -> FittingData: """ Compute the linear fitting coefficient of the cloud of 2D points (average marker intensity, @@ -553,9 +557,10 @@ def compute_fitting_coefficients( for cell_type in tqdm(cell_types): cloud = clouds[group_name][cell_type] L.debug( - f"The length of training data for {group_name} and {cell_type} is {cloud['xdata']}") + f"The length of training data for {group_name} and {cell_type} is {cloud['xdata']}" + ) result[group_name][cell_type] = linear_fitting_xy( - cloud["xdata"], cloud["ydata"], cloud["sigma"] + cloud["xdata"], cloud["ydata"], cloud["sigma"], min_data_points ) if np.isnan(result[group_name][cell_type]["coefficient"]): warnings.warn( @@ -739,6 +744,7 @@ def linear_fitting( # pylint: disable=too-many-arguments cell_density_stddevs: Optional[dict[str, float]] = None, region_name: str = "root", group_ids_config: dict | None = None, + min_data_points: int = 5, ) -> pd.DataFrame: """ Estimate the average densities of every region in `region_map` using a linear fitting @@ -781,6 +787,10 @@ def linear_fitting( # pylint: disable=too-many-arguments standard deviations of average cell densities of the corresponding regions. 
region_name: (str) name of the root region of interest group_ids_config: mapping of regions to their constituent ids + min_data_points: minimum number of datapoints required for running + the linear regression. If the number of datapoints is less than + min_data_points then no fitting is done, and np.nan values are + returned. Returns: tuple (densities, fitting_coefficients) @@ -842,8 +852,8 @@ def linear_fitting( # pylint: disable=too-many-arguments L.info("Computing fitting coefficients ...") fitting_coefficients = compute_fitting_coefficients( - groups, average_intensities, densities.drop(densities.index[indexes]) - ) + groups, average_intensities, densities.drop(densities.index[indexes]), + min_data_points=min_data_points) L.info("Fitting unknown average densities ...") fit_unknown_densities(groups, average_intensities, densities, fitting_coefficients) diff --git a/tests/densities/test_fitting.py b/tests/densities/test_fitting.py index b696b6f..d63eb7b 100644 --- a/tests/densities/test_fitting.py +++ b/tests/densities/test_fitting.py @@ -273,17 +273,18 @@ def test_compute_average_intensities(region_map, hierarchy_info): def test_linear_fitting_xy(): - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0]) + min_data_points = 1 + actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0], min_data_points=min_data_points) assert np.allclose(actual["coefficient"], 2.0) assert np.allclose(actual["r_square"], 1.0) assert not np.isinf(actual["standard_deviation"]) - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5]) + actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5], min_data_points=min_data_points) assert np.allclose(actual["coefficient"], 2.0) assert not np.isinf(actual["standard_deviation"]) assert np.allclose(actual["r_square"], 0.89286) - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0]) + actual = 
tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0], min_data_points=min_data_points) assert np.allclose(actual["coefficient"], 2.0) assert not np.isinf(actual["standard_deviation"]) assert np.allclose(actual["r_square"], 1.0) @@ -319,7 +320,8 @@ def test_compute_fitting_coefficients(hierarchy_info): data = get_fitting_input_data_(hierarchy_info) actual = tested.compute_fitting_coefficients( - data["groups"], data["intensities"], data["densities"] + data["groups"], data["intensities"], data["densities"], + min_data_points=1, ) for group_name in ["Whole", "Central lobule"]: @@ -340,18 +342,24 @@ def test_compute_fitting_coefficients_exceptions(hierarchy_info): data["densities"].drop(index=["Central lobule"], inplace=True) with pytest.raises(AtlasDensitiesError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"]) + tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"], + min_data_points=1, + ) data = get_fitting_input_data_(hierarchy_info) data["densities"].drop(columns=["pv+"], inplace=True) with pytest.raises(AtlasDensitiesError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"]) + tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"], + min_data_points=1, + ) data = get_fitting_input_data_(hierarchy_info) data["densities"].at["Lobule II", "pv+_standard_deviation"] = np.nan with pytest.raises(AssertionError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"]) + tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"], + min_data_points=1, + ) @pytest.fixture From 16bdf61e8f5711861c597d1b73cfc7ced35080f8 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Wed, 15 May 2024 17:09:33 +0200 Subject: [PATCH 3/4] fix lints --- atlas_densities/app/cell_densities.py | 2 +- atlas_densities/densities/fitting.py | 17 
++++++++--- tests/densities/test_fitting.py | 44 +++++++++++++++++++-------- 3 files changed, 44 insertions(+), 19 deletions(-) diff --git a/atlas_densities/app/cell_densities.py b/atlas_densities/app/cell_densities.py index 5baeb38..e168de2 100644 --- a/atlas_densities/app/cell_densities.py +++ b/atlas_densities/app/cell_densities.py @@ -811,7 +811,7 @@ def measurements_to_average_densities( @click.option( "--min-data-points", type=int, - default=5, + default=1, help="minimum number of datapoints required for running the linear regression.", show_default=True, ) diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 6c58f42..6f41705 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -429,7 +429,7 @@ def _optimize_func(x, coefficient): ) ss_reg = np.sum((_optimize_func(xdata, parameters[0][0]) - np.mean(ydata)) ** 2) ss_tot = np.sum((np.array(ydata) - _optimize_func(xdata, parameters[0][0])) ** 2) + ss_reg - L.debug(f"Length of xdata = {xdata}.The ss_reg is {ss_reg} and ss_tot is {ss_tot}\n") + L.debug("Length of xdata = %s. The ss_reg is %s and ss_tot is %s", xdata, ss_reg, ss_tot) # if total sum of square is null, no variance can be explained by the fitting return { "coefficient": parameters[0][0], @@ -488,7 +488,7 @@ def compute_fitting_coefficients( The "standard_deviation" value is the standard deviation of the coefficient value. The "r_square" value is the coefficient of determination of the coefficient value. 
""" - + # pylint: disable=too-many-locals if len(densities.index) != len(average_intensities.index) or np.any( densities.index != average_intensities.index ): @@ -557,7 +557,10 @@ def compute_fitting_coefficients( for cell_type in tqdm(cell_types): cloud = clouds[group_name][cell_type] L.debug( - f"The length of training data for {group_name} and {cell_type} is {cloud['xdata']}" + "The length of training data for %s and %s is %s", + group_name, + cell_type, + cloud["xdata"], ) result[group_name][cell_type] = linear_fitting_xy( cloud["xdata"], cloud["ydata"], cloud["sigma"], min_data_points @@ -801,6 +804,7 @@ def linear_fitting( # pylint: disable=too-many-arguments fitting_coefficients: dict returned by :fun:`atlas_densities.densities.fitting.compute_fitting_coefficients`. """ + # pylint: disable=too-many-locals assert group_ids_config is not None L.info("Checking input data frames sanity ...") _check_average_densities_sanity(average_densities) @@ -852,8 +856,11 @@ def linear_fitting( # pylint: disable=too-many-arguments L.info("Computing fitting coefficients ...") fitting_coefficients = compute_fitting_coefficients( - groups, average_intensities, densities.drop(densities.index[indexes]), - min_data_points=min_data_points) + groups, + average_intensities, + densities.drop(densities.index[indexes]), + min_data_points=min_data_points, + ) L.info("Fitting unknown average densities ...") fit_unknown_densities(groups, average_intensities, densities, fitting_coefficients) diff --git a/tests/densities/test_fitting.py b/tests/densities/test_fitting.py index d63eb7b..935c016 100644 --- a/tests/densities/test_fitting.py +++ b/tests/densities/test_fitting.py @@ -274,17 +274,23 @@ def test_compute_average_intensities(region_map, hierarchy_info): def test_linear_fitting_xy(): min_data_points = 1 - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 1.0, 1.0], min_data_points=min_data_points) + actual = tested.linear_fitting_xy( + [0.0, 1.0, 2.0], [0.0, 
2.0, 4.0], [1.0, 1.0, 1.0], min_data_points=min_data_points + ) assert np.allclose(actual["coefficient"], 2.0) assert np.allclose(actual["r_square"], 1.0) assert not np.isinf(actual["standard_deviation"]) - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5], min_data_points=min_data_points) + actual = tested.linear_fitting_xy( + [0.0, 1.0, 2.0], [0.0, 1.0, 4.0], [1.0, 1.0, 1e-5], min_data_points=min_data_points + ) assert np.allclose(actual["coefficient"], 2.0) assert not np.isinf(actual["standard_deviation"]) assert np.allclose(actual["r_square"], 0.89286) - actual = tested.linear_fitting_xy([0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0], min_data_points=min_data_points) + actual = tested.linear_fitting_xy( + [0.0, 1.0, 2.0], [0.0, 2.0, 4.0], [1.0, 0.0, 1.0], min_data_points=min_data_points + ) assert np.allclose(actual["coefficient"], 2.0) assert not np.isinf(actual["standard_deviation"]) assert np.allclose(actual["r_square"], 1.0) @@ -320,7 +326,9 @@ def test_compute_fitting_coefficients(hierarchy_info): data = get_fitting_input_data_(hierarchy_info) actual = tested.compute_fitting_coefficients( - data["groups"], data["intensities"], data["densities"], + data["groups"], + data["intensities"], + data["densities"], min_data_points=1, ) @@ -342,24 +350,33 @@ def test_compute_fitting_coefficients_exceptions(hierarchy_info): data["densities"].drop(index=["Central lobule"], inplace=True) with pytest.raises(AtlasDensitiesError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"], - min_data_points=1, - ) + tested.compute_fitting_coefficients( + data["groups"], + data["intensities"], + data["densities"], + min_data_points=1, + ) data = get_fitting_input_data_(hierarchy_info) data["densities"].drop(columns=["pv+"], inplace=True) with pytest.raises(AtlasDensitiesError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"], - min_data_points=1, - ) + 
tested.compute_fitting_coefficients( + data["groups"], + data["intensities"], + data["densities"], + min_data_points=1, + ) data = get_fitting_input_data_(hierarchy_info) data["densities"].at["Lobule II", "pv+_standard_deviation"] = np.nan with pytest.raises(AssertionError): - tested.compute_fitting_coefficients(data["groups"], data["intensities"], data["densities"], - min_data_points=1, - ) + tested.compute_fitting_coefficients( + data["groups"], + data["intensities"], + data["densities"], + min_data_points=1, + ) @pytest.fixture @@ -533,6 +550,7 @@ def test_linear_fitting(group_ids_config): data["average_densities"], data["homogenous_regions"], group_ids_config=group_ids_config, + min_data_points=1, ) warnings_ = [w for w in warnings_ if isinstance(w.message, AtlasDensitiesWarning)] # Three warnings for recording NaN coefficients, three warnings for using them From 82730dc2785477d84f758652acb8049a66ee5cb9 Mon Sep 17 00:00:00 2001 From: Mike Gevaert Date: Fri, 17 May 2024 10:32:59 +0200 Subject: [PATCH 4/4] update changelog --- CHANGELOG.rst | 30 ++++++++++++++++++++++++++++ README.rst | 1 + atlas_densities/densities/fitting.py | 1 - 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5b0ff09..29bc6a7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,11 +1,41 @@ Changelog ========= +Version 0.2.6 +------------- +* Updating regression requirements, teach + `atlas-densities cell-densities fit-average-densities` to have `--min-data-points` (#81) + +Version 0.2.5 +------------- + +* Fix nans in compute_region_volumes (#80) + +Version 0.2.4 +------------- + +* Add r_square values to the results of the fitting. (#68) +* Add check to the region filter in case the input dataset is all equal to 0 for the region of interest. 
(#74) +* Fix fitting and optimization steps (#75) +* Faster _compute_region_cell_counts (#72) +* Cleanup app tests (#78) +* Faster compute average intensities (#73) +* Speedup mtype density creation from probability maps using voxcell's ValueToIndexVoxels (#79) + +Version 0.2.3 +------------- + +* remove pkg_resources, use importlib (#62) +* Drop literature values from regions that are not in hierarchy or not in the annotation volume (#61) + Version 0.2.2 +------------- + * add ``atlas-densities combination manipulate`` Version 0.2.1 ------------- + * The following *cell-density* sub-commands can now optionally take a ``--group-ids-config-path``: *cell-density*, *glia-cell-densities*, *inhibitory-and-excitatory-neuron-densities*, *fit-average-densities* diff --git a/README.rst b/README.rst index 7b3d708..0224a25 100644 --- a/README.rst +++ b/README.rst @@ -286,6 +286,7 @@ respective standard deviation and `coefficient of determination`_ (`r_square`). --fitted-densities-output-path=data/ccfv2/first_estimates/first_estimates.csv \ --fitting-maps-output-path=data/ccfv2/first_estimates/fitting.json +Note: One can use the ``--min-data-points`` option to require a minimum number of points for the linear regression; the default is 1. Compute inhibitory/excitatory neuron densities ---------------------------------------------- diff --git a/atlas_densities/densities/fitting.py b/atlas_densities/densities/fitting.py index 6f41705..4f6fe0b 100644 --- a/atlas_densities/densities/fitting.py +++ b/atlas_densities/densities/fitting.py @@ -396,7 +396,6 @@ def linear_fitting_xy( Raises: AtlasDensitiesError if some of the `sigma`values are negative. """ - if len(xdata) < min_data_points: return {"coefficient": np.nan, "standard_deviation": np.nan, "r_square": np.nan}