diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f5874846..ab5dae33 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -59,7 +59,7 @@ jobs:
         uses: actions/cache@v2
         with:
           path: testdata
-          key: hatchetcache03
+          key: hatchetcache04
 
       - name: Download Testing Data
         if: steps.cache-test-data.outputs.cache-hit != 'true'
diff --git a/docs/source/doc_cluster_bins.md b/docs/source/doc_cluster_bins.md
index e452356c..24387f64 100644
--- a/docs/source/doc_cluster_bins.md
+++ b/docs/source/doc_cluster_bins.md
@@ -1,11 +1,12 @@
 # cluster-bins
 
-This step globally clusters genomic bins along the entire genome and jointly across tumor samples, and estimate the corresponding values of RDR and BAF for every cluster in every sample.
-cluster-bins uses a non-parametric Gaussian mixture model (GMM) (scikit-learn implementation) for clustering; the main parameters can be tuned for dealing with special datasets, especially those with high variance or low tumor purity (see [Main Parameters](#main-parameters) below).
+This step globally clusters genomic bins along the entire genome and jointly across tumor samples.
+`cluster-bins` clusters bins while also taking into account their locations on the genome to preferentially form clusters that correspond to contiguous genomic segments on chromosome arms.
+The input/output files for `cluster-bins` are exactly the same as those for `cluster-bins-gmm`.
 
 ## Input
 
-cluster-bins takes in input a tab-separated file with the following fields.
+`cluster-bins` takes as input a tab-separated file with the following fields.
 
 | Field | Description |
 |-------|-------------|
@@ -24,7 +25,7 @@
 The fields `#SNPS`, `COV`, `ALPHA`, and `BETA` are currently deprecated and their values are ignored.
 
 ## Output
 
-cluster-bins produces two tab-separated files:
+`cluster-bins` produces two tab-separated files:
 
 1. A file of clustered genomic bins, specified by the flag `-O`, `--outbins`. The tab separated file has the same fields as the input plus a last field `CLUSTER` which specifies the name of the corresponding cluster.
@@ -45,41 +46,25 @@
 ## Main parameters
 
-cluster-bins has 4 main features with some main parameters that allow to improve the clustering.
+1. `cluster-bins` has a parameter `-d`, `--diploidbaf` that specifies the maximum expected shift from 0.5 in the BAF of a balanced cluster (i.e., diploid with copy-number state (1, 1) or tetraploid with copy-number state (2, 2)). This threshold is used to correct bias in the BAF of these balanced clusters.
+The default value of this parameter (0.1) is often sufficient, but the most appropriate value will vary depending on noise and coverage. In general, this value should be set to include only those clusters that are closest to 0.5 -- for example, if some clusters have centroids near 0.47 and others have centroids near 0.42, this parameter should be set to 0.035 or 0.04.
+To determine the best setting for this value, please check the plots produced by `plot-bins` and the centroid values described in `bbc/bulk.seg` (output from this command).
 
-1. cluster-bins has a parameter `-d`, `--diploidbaf` that specifies the maximum expected shift from 0.5 for BAF for a diploid or tetraploid cluster (i.e. with copy-number states (1, 1) or (2, 2)). This threshold is used for two goals: (1) To identify the diploid or tetraploid cluster which is used to correct the estimated BAF of potentially biased clusters. (2) To identify potentially biased clusters.
-The default value of this parameter (0.1) is typically sufficient for most of the datasets, but its value can be changed or tuned to accommodate the features of special datasets.
-In particular, the value of this threshold depends on the variance in the data (related to noise and coverage); generally, higher variance requires a higher shift.
-Information provided by plot-bins can be crucial to decide whether one needs to change this value in special datasets.
+2. By default, `cluster-bins` takes as input a minimum number of clusters (`--minK`, default `2`) and maximum number of clusters (`--maxK`, default `30`), and chooses the number `K` of clusters in this closed interval that maximizes the silhouette score. Users can also specify an exact number of clusters (`--exactK`) to infer, which skips the model selection step.
 
-2. cluster-bins has some main parameters to control the clustering; the default values for most of these parameters allow to deal with most of datasets, but their values can be changed or tuned to accommodate the features of special datasets.
-plot-bins provides informative plots that can be used to assess the quality of the clustering and evaluate the need of changing some parameters for special datasets.
-If your clusters do not appear to be cohesive, try lowering the maximum number of clusters (`-K`) which will force cluster-bins to infer fewer clusters.
+3. Other options are available to change aspects of the Gaussian hidden Markov model (GHMM) that is used by `cluster-bins`:
 
 | Name | Description | Usage | Default |
 |------|-------------|-------|---------|
-| `-K`, `--initclusters` | Maximum number of clusters | The parameter specifies the maximum number of clusters to infer, i.e., the maximum number of GMM components | 50 |
-| `-c`, `--concentration` | Concentration parameter for clustering | This parameter determines how much confidence the GMM has in different types of clusterings. Higher values (e.g., 10 or 100) favor fewer clusters, and smaller values (e.g., 0.01 or 0.001) favor more clusters. For experts, this is the alpha parameter for the Dirichlet process prior. | 1/K |
+| `--tau` | Off-diagonal value for initializing transition matrix | Must be `<= 1/(K-1)` | `1e-6` |
+| `-t`, `--transmat` | Type of transition matrix to infer | `fixed` (to off-diagonal = tau), `diag` (all diagonal elements are equal, all off-diagonal elements are equal) or `full` (freely varying) | `diag` |
+| `-c`, `--covar` | Type of covariance matrix to infer | Options described in [hmmlearn documentation](https://hmmlearn.readthedocs.io/en/latest/api.html#hmmlearn.hmm.GaussianHMM) | `diag` |
+| `-x`, `--decoding` | Decoding algorithm to use to infer final estimates of states | `map` for MAP inference, `viterbi` for Viterbi algorithm | `map` |
 
-3. cluster-bins offers a bootstraping approach that allows a succesfull clustering even when there is a limited number genomic bins that are considred. The bootstraping approach generates sinthetic (i.e. used only for clustering) bins based on the data of the given bins. The bootstraping is controlled by the following parameters.
-
-| Name | Description | Usage | Default |
-|------|-------------|-------|---------|
-| `-u`, `--bootclustering` | Number of sinthetic bins to generate | Sinthetic bins can be generated based on the RDR and BAF of given bins and are added only to the clustering to improve it when the total number of bins is low (e.g. when considering data from WES) | 0, not used |
-| `-dR`,`--ratiodeviation` | Standard deviation for generate RDR of sinthetic bins | The parameter affects the variance of the generated data, this value can be estimated from given bins and plot-bins generates informative plots to do this | 0.02 |
-| `-dB`,`--bafdeviation` | Standard deviation for generate BAF of sinthetic bins | The parameter affects the variance of the generated data, this value can be estimated from given bins and plot-bins generates informative plots to do this | 0.02 |
-| `-s`, `--seed` | Random seed | The value is used to seed the random generation of RDR and BAF of synthetic bins | 0 |
-
-4. cluster-bins offers a basic iterative process to merge clusters according to given tolerances. This feature can be used to refine the results of the GMM clustering and merge distinct clusters that are not sufficiently distinguished. This process can be controlled by the following parameters.
-
-| Name | Description | Usage | Default |
-|------|-------------|-------|---------|
-| `-tR`, `--tolerancerdr` | Tolerance for RDR | The value is used to determine when two clusters should be merged in terms of RDR | 0.0, merging is not performed |
-| `-tB`, `--tolerancebaf` | Tolerance for BAF | The value is used to determine when two clusters should be merged in terms of BAF | 0.0, merging is not performed |
+In particular, `tau` controls the balance between global information (RDR and BAF across samples) and local information (assigning adjacent bins to the same cluster): smaller values of `tau` put more weight on *local* information, and larger values of `tau` put more weight on *global* information. It may be appropriate to reduce `tau` by several orders of magnitude for noisier or lower-coverage datasets.
 
 ## Optional parameters
 
 | Name | Description | Usage | Default |
 |------|-------------|-------|---------|
-| `-v`, `--verbose` | Verbose logging flag | When enabled, combine-counts outputs a verbose log of the executiong | Not used |
-| `-r`, `--disablebar` | Disabling progress-bar flag | When enabled, the output progress bar is disabled | Not used |
+| `-e`, `--seed` | Random number generator seed | Used in model fitting | 0 |
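+
+For example, a typical invocation might look like the following (a sketch only: file names are placeholders and the parameter values are illustrative, to be tuned as described above):
+
+```shell
+hatchet cluster-bins bulk.bb -O bulk.bbc -o bulk.seg --minK 5 --maxK 15 --tau 1e-6 -d 0.08
+```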
diff --git a/docs/source/doc_cluster_bins_gmm.md b/docs/source/doc_cluster_bins_gmm.md
new file mode 100644
index 00000000..e763804b
--- /dev/null
+++ b/docs/source/doc_cluster_bins_gmm.md
@@ -0,0 +1,85 @@
+# cluster-bins-gmm
+
+This step globally clusters genomic bins along the entire genome and jointly across tumor samples, and estimates the corresponding values of RDR and BAF for every cluster in every sample.
+cluster-bins-gmm uses a non-parametric Gaussian mixture model (GMM) (scikit-learn implementation) for clustering; the main parameters can be tuned for dealing with special datasets, especially those with high variance or low tumor purity (see [Main Parameters](#main-parameters) below).
+
+## Input
+
+cluster-bins-gmm takes as input a tab-separated file with the following fields.
+
+| Field | Description |
+|-------|-------------|
+| `CHR` | Name of a chromosome |
+| `START` | Starting genomic position of a genomic bin in `CHR` |
+| `END` | Ending genomic position of a genomic bin in `CHR` |
+| `SAMPLE` | Name of a tumor sample |
+| `RD` | RDR of the bin in `SAMPLE` |
+| `#SNPS` | Number of SNPs present in the bin in `SAMPLE` |
+| `COV` | Average coverage in the bin in `SAMPLE` |
+| `ALPHA` | Alpha parameter related to the binomial model of BAF for the bin in `SAMPLE`, typically total number of reads from A allele |
+| `BETA` | Beta parameter related to the binomial model of BAF for the bin in `SAMPLE`, typically total number of reads from B allele |
+| `BAF` | BAF of the bin in `SAMPLE` |
+
+The fields `#SNPS`, `COV`, `ALPHA`, and `BETA` are currently deprecated and their values are ignored.
+
+## Output
+
+cluster-bins-gmm produces two tab-separated files:
+
+1. A file of clustered genomic bins, specified by the flag `-O`, `--outbins`. The tab separated file has the same fields as the input plus a last field `CLUSTER` which specifies the name of the corresponding cluster.
+
+2. A file of segments summarizing the clusters, specified by the flag `-o`, `--outsegments`. The tab separated file has the following fields.
+
+| Field | Description |
+|-------|-------------|
+| `ID` | The name of a cluster |
+| `SAMPLE` | The name of a sample |
+| `#BINS` | The number of bins included in `ID` |
+| `RD` | The RDR of the cluster `ID` in `SAMPLE` |
+| `#SNPS` | The total number of SNPs in the cluster `ID` |
+| `COV` | The average coverage in the cluster `ID` |
+| `ALPHA` | The alpha parameter of the binomial model for the BAF of the cluster `ID` |
+| `BETA` | The beta parameter of the binomial model for the BAF of the cluster `ID` |
+| `BAF` | The BAF of the cluster `ID` in `SAMPLE` |
+
+
+## Main parameters
+
+cluster-bins-gmm has 4 main features whose parameters can be used to improve the clustering.
+
+1. cluster-bins-gmm has a parameter `-d`, `--diploidbaf` that specifies the maximum expected shift from 0.5 in the BAF of a diploid or tetraploid cluster (i.e. with copy-number states (1, 1) or (2, 2)). This threshold is used for two goals: (1) to identify the diploid or tetraploid cluster which is used to correct the estimated BAF of potentially biased clusters, and (2) to identify potentially biased clusters.
+The default value of this parameter (0.1) is typically sufficient for most datasets, but its value can be tuned to accommodate the features of special datasets.
+In particular, the value of this threshold depends on the variance in the data (related to noise and coverage); generally, higher variance requires a higher shift.
+Information provided by plot-bins can be crucial for deciding whether this value needs to be changed for special datasets.
+
+2. cluster-bins-gmm has some main parameters to control the clustering; the default values for most of these parameters handle most datasets, but their values can be tuned to accommodate the features of special datasets.
+plot-bins provides informative plots that can be used to assess the quality of the clustering and to evaluate whether some parameters need to be changed for special datasets.
+If your clusters do not appear to be cohesive, try lowering the maximum number of clusters (`-K`), which will force cluster-bins-gmm to infer fewer clusters.
+
+| Name | Description | Usage | Default |
+|------|-------------|-------|---------|
+| `-K`, `--initclusters` | Maximum number of clusters | The parameter specifies the maximum number of clusters to infer, i.e., the maximum number of GMM components | 50 |
+| `-c`, `--concentration` | Concentration parameter for clustering | This parameter determines how much confidence the GMM has in different types of clusterings. Higher values (e.g., 10 or 100) favor fewer clusters, and smaller values (e.g., 0.01 or 0.001) favor more clusters. For experts, this is the alpha parameter for the Dirichlet process prior. | 1/K |
+
+3. cluster-bins-gmm offers a bootstrapping approach that enables a successful clustering even when only a limited number of genomic bins is considered. The bootstrapping approach generates synthetic bins (i.e. used only for clustering) based on the data of the given bins. Bootstrapping is controlled by the following parameters; see the example at the end of this page.
+
+| Name | Description | Usage | Default |
+|------|-------------|-------|---------|
+| `-u`, `--bootclustering` | Number of synthetic bins to generate | Synthetic bins can be generated based on the RDR and BAF of given bins and are added only to the clustering to improve it when the total number of bins is low (e.g. when considering data from WES) | 0, not used |
+| `-dR`,`--ratiodeviation` | Standard deviation used to generate the RDR of synthetic bins | The parameter affects the variance of the generated data; this value can be estimated from the given bins, and plot-bins generates informative plots to help with this | 0.02 |
+| `-dB`,`--bafdeviation` | Standard deviation used to generate the BAF of synthetic bins | The parameter affects the variance of the generated data; this value can be estimated from the given bins, and plot-bins generates informative plots to help with this | 0.02 |
+| `-s`, `--seed` | Random seed | The value is used to seed the random generation of RDR and BAF of synthetic bins | 0 |
+
+4. cluster-bins-gmm offers a basic iterative process to merge clusters according to given tolerances. This feature can be used to refine the results of the GMM clustering and merge distinct clusters that are not sufficiently distinguished. This process can be controlled by the following parameters.
+
+| Name | Description | Usage | Default |
+|------|-------------|-------|---------|
+| `-tR`, `--tolerancerdr` | Tolerance for RDR | The value is used to determine when two clusters should be merged in terms of RDR | 0.0, merging is not performed |
+| `-tB`, `--tolerancebaf` | Tolerance for BAF | The value is used to determine when two clusters should be merged in terms of BAF | 0.0, merging is not performed |
+
+## Optional parameters
+
+| Name | Description | Usage | Default |
+|------|-------------|-------|---------|
+| `-v`, `--verbose` | Verbose logging flag | When enabled, cluster-bins-gmm outputs a verbose log of the execution | Not used |
+| `-r`, `--disablebar` | Disabling progress-bar flag | When enabled, the output progress bar is disabled | Not used |
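+
+For example, on a WES dataset with few bins, bootstrapping can be activated with a command like the following (a sketch only: file names are placeholders, and the values are those suggested in the recommendations for WES data):
+
+```shell
+hatchet cluster-bins-gmm bulk.bb -O bulk.bbc -o bulk.seg -u 20 -dR 0.002 -dB 0.002
+```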
diff --git a/docs/source/doc_cluster_bins_loc.md b/docs/source/doc_cluster_bins_loc.md
deleted file mode 100644
index 66e078d9..00000000
--- a/docs/source/doc_cluster_bins_loc.md
+++ /dev/null
@@ -1,70 +0,0 @@
-# cluster-bins-loc
-
-This step globally clusters genomic bins along the entire genome and jointly across tumor samples.
-`cluster-bins-loc` clusters bins while also taking into account their locations on the genome to preferentially form clusters that correspond to contiguous genomic segments on chromosome arms.
-The input/output files for `cluster-bins-loc` are exactly the same as those for `cluster-bins`.
-
-## Input
-
-`cluster-bins-loc` takes in input a tab-separated file with the following fields.
-
-| Field | Description |
-|-------|-------------|
-| `CHR` | Name of a chromosome |
-| `START` | Starting genomic position of a genomic bin in `CHR` |
-| `END` | Ending genomic position of a genomic bin in `CHR` |
-| `SAMPLE` | Name of a tumor sample |
-| `RD` | RDR of the bin in `SAMPLE` |
-| `#SNPS` | Number of SNPs present in the bin in `SAMPLE` |
-| `COV` | Average coverage in the bin in `SAMPLE` |
-| `ALPHA` | Alpha parameter related to the binomial model of BAF for the bin in `SAMPLE`, typically total number of reads from A allele |
-| `BETA` | Beta parameter related to the binomial model of BAF for the bin in `SAMPLE`, typically total number of reads from B allele |
-| `BAF` | BAF of the bin in `SAMPLE` |
-
-The fields `#SNPS`, `COV`, `ALPHA`, and `BETA` are currently deprecated and their values are ignored.
-
-## Output
-
-`cluster-bins-loc` produces two tab-separated files:
-
-1. A file of clustered genomic bins, specified by the flag `-O`, `--outbins`. The tab separated file has the same fields as the input plus a last field `CLUSTER` which specifies the name of the corresponding cluster.
-
-2. A file of clustered genomic bins, specified by the flag `-o`, `--outsegments`. The tab separated file has the following fields.
-
-| Field | Description |
-|-------|-------------|
-| `ID` | The name of a cluster |
-| `SAMPLE` | The name of a sample |
-| `#BINS` | The number of bins included in `ID` |
-| `RD` | The RDR of the cluster `ID` in `SAMPLE` |
-| `#SNPS` | The total number of SNPs in the cluster `ID` |
-| `COV` | The average coverage in the cluster `ID` |
-| `ALPHA` | The alpha parameter of the binomial model for the BAF of the cluster `ID` |
-| `BETA` | The beta parameter of the binomial model for the BAF of the cluster `ID` |
-| `BAF` | The BAF of the cluster `ID` in `SAMPLE` |
-
-
-## Main parameters
-
-1. `cluster-bins-loc` has a parameter `-d`, `--diploidbaf` that specifies the maximum expected shift from 0.5 the BAF of a balanced cluster (i.e., diploid with copy-number state (1, 1) or tetraploid with copy-number state (2, 2)). This threshold is used to correct bias in the BAF of these balanced clusters.
-The default value of this parameter (0.1) is often sufficient, but the most appropriate value will vary depending on noise and coverage. In general, this value should be set to include only those clusters that are closest to 0.5 -- for example, if some clusters have centroids near 0.47 and others have centroids near 0.42, this parameter should be set to 0.035 or 0.04.
-To determine the best setting for this value, please check the plots produced by `plot-bins` and the centroid values described `bbc/bulk.seg` (output from this command).
-
-2. By default, `cluster-bins-loc` takes as input a minimum number of clusters (`--minK`, default `2`) and maximum number of clusters (`--maxK`, default `30`), and chooses the number `K` of clusters in this closed interval that maximizes the silhoutette score. Users can also specify an exact number of clusters (`--exactK`) to infer, which skips the model selection step.
-
-3. Other options are available to change aspects of the Gaussian Hidden Markov model (GHMM) that is used by `cluster-bins-loc`:
-
-| Name | Description | Usage | Default |
-|------|-------------|-------|---------|
-| `--tau` | Off-diagonal value for initializing transition matrix | must be `<= 1/(K-1)` | `1e-6` |
-| `-t`, `--transmat` | Type of transition matrix to infer | `fixed` (to off-diagonal = tau), `diag` (all diagonal elements are equal, all off-diagonal elements are equal) or `full` (freely varying) | `diag` |
-| `-c`, `--covar` | Type of covariance matrix to infer | options described in [hmmlearn documentation](https://hmmlearn.readthedocs.io/en/latest/api.html#hmmlearn.hmm.GaussianHMM) | `diag` |
-| `-x`, `--decoding` | Decoding algorithm to use to infer final estimates of states | `map` for MAP inference, `viterbi` for Viterbi algorithm | `map` |
-
-Particularly, `tau` controls the balance between global information (RDR and BAf across samples) and local information (assigning adjacent bins to the same cluster): smaller values of `tau` put more weight on *local* information, and larger values of `tau` put more weight on *global* information. It may be appropriate to reduce `tau` by several orders of magnitude for noisier or lower-coverage datasets.
-
-## Optional parameters
-
-| Name | Description | Usage | Default |
-|------|-------------|-------|---------|
-| `-e`, `--seed` | Random number generator seed used in model fitting | 0 |
diff --git a/docs/source/doc_fullpipeline.md b/docs/source/doc_fullpipeline.md
index 775feb32..cfbd7835 100644
--- a/docs/source/doc_fullpipeline.md
+++ b/docs/source/doc_fullpipeline.md
@@ -53,7 +53,10 @@ Each step `<step>` of HATCHet can be run with the following command within a HAT
 hatchet <step>
 ```
 
-**Note**: This version of HATCHet uses variable-width bins to ensure that each bin has comparable B-allele frequency (BAF) signal from heterogeneous germline SNPs and to account for sequencing coverage. To run the older versions of HATCHet with fixed-width bins, use [*count-reads-fw*](doc_count_reads_fw.html) (formerly binBAM) instead of *count-reads* and [*combine-counts-fw*](doc_combine_counts_fw.html) (formerly comBBo) instead of *combine-counts*. If you are using the `run` command, set `fixed_width = True` in your `.ini` file.
+**Note**: This version of HATCHet uses variable-width bins to ensure that each bin has comparable B-allele frequency (BAF) signal from heterozygous germline SNPs and to account for sequencing coverage. To run the older versions of HATCHet with fixed-width bins, use [*count-reads-fw*](doc_count_reads_fw.html) (formerly binBAM) instead of *count-reads* and [*combine-counts-fw*](doc_combine_counts_fw.html) (formerly comBBo) instead of *combine-counts*. If you are using the `run` command, set `fixed_width = True` under the `[run]` header in your `.ini` file.
+
+Additionally, this version of HATCHet uses a new locality-aware clustering module that incorporates local information along the genome. This replaces the previous GMM-based clustering (command named cluBB). Users can still use GMM-based clustering if they wish by setting `loc-clust` to `False` under the `[run]` header in `hatchet.ini`, or by running [*cluster-bins-gmm*](doc_cluster_bins_gmm.md) directly.
+
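+For reference, a minimal sketch of the corresponding `.ini` settings is shown below (key spellings follow the text above and are not verified here; check the `[run]` section of your own `hatchet.ini` for the exact names):
+
+```ini
+[run]
+# use the older fixed-width binning
+fixed_width = True
+# disable locality-aware clustering in favor of the GMM-based module
+loc-clust = False
+```
+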
 
 *Older versions of HATCHet used different names for these steps. The `Old Name` column lists those names.*
 
@@ -64,14 +67,12 @@
 | (3) | [*count-reads*](doc_count_reads.html) | N/A | This step splits the human reference genome into very small regions (representing candidate bins) according to germline SNP positions, and counts the number of total reads aligned to each region in each of the tumor samples and in the matched normal sample. |
 | (4) | [*phase-snps*](doc_phase_snps.html) | N/A | This **optional** step uses reference-based phasing to group together germline SNPs that are likely on the same haplotype. This improves the quality of BAF estimates for balanced regions of the genome. |
 | (5) | [*combine-counts*](doc_combine_counts.html) | N/A | This step constructs genomic bins with variable sizes, and combines the read counts and the allele counts for the identified germline SNPs to compute the read-depth ratio (RDR) and B-allele frequency (BAF) of every genomic bin. |
-| (6) | [*cluster-bins-loc*](doc_cluster_bins_loc.html)* | N/A | This step globally clusters genomic bins along the entire genome and jointly across tumor samples while incorporating local information. |
+| (6) | [*cluster-bins*](doc_cluster_bins.html) | N/A | This step globally clusters genomic bins along the entire genome and jointly across tumor samples while incorporating local information. |
 | (7) | [*plot-bins*](doc_plot_bins.html) | BBot | This **optional** step produces informative plots concerning the computed RDRs, BAFs, and clusters. The information produced by this step are important to validate the compute clusters of genomic regions. When this step is enabled in the `.ini` file, [plot-bins-1d2d](doc_plot_bins_1d2d.html) is run as well to generate additional plots. |
 | (8) | [*compute-cn*](doc_compute_cn.html) | hatchet | This step computes allele-specific fractional copy numbers, solves a constrained distance-based simultaneous factorization to compute allele and clone-specific copy numbers and clone proportions, and deploys a model-selection criterion select the number of clone by explicitly considering the trade-off between subclonal copy-number aberrations and whole-genome duplication. |
 | (9) | [*plot-cn*](doc_plot_cn.html) | BBeval | This **optional** step analyzes the inferred copy-number states and clone proportions and produces informative plots jointly considering all samples from the same patient. In addition, this step can also combine results obtained for different patients and perform integrative analysis. When this step is enabled in the `.ini` file, [plot-cn-1d2d](doc_plot_cn_1d2d.html) is run as well to generate additional plots. |
 | (10) | [*check*](doc_check.html) | check-solver | This **optional** step is a quick way to verify if your solver and other 3rd party dependencies are working correctly. It checks for the presence of several dependencies and tests `compute_cn` on a set of small data files pre-packaged. |
 
-*This is a new clusting module that replaces the previous GMM-based clustering (command named cluBB). Users can still use GMM-based clustering if they wish by setting `loc-clust` to `False` under the `[run]` header in `hatchet.ini`, or by running [*cluster-bins*](doc_cluster_bins.html) directly.
-
 
 ## Recommendations and quality control
diff --git a/docs/source/doc_plot_bins.md b/docs/source/doc_plot_bins.md
index 6316e2a3..a9f101f6 100644
--- a/docs/source/doc_plot_bins.md
+++ b/docs/source/doc_plot_bins.md
@@ -2,9 +2,9 @@
 
 This step produces informative plots concerning the computed RDRs, BAFs, and clusters.
 The information produced by this step are important to validate the compute clusters of genomic regions and help to tune the parameters for better deal with special datasets.
-cluster-bins produces different plots which need to be specified by different commands and require different input.
+plot-bins produces different plots which need to be specified by different commands and require different input.
 
-When `plot_bins = True` is indicated in `hatchet.ini`, the command [`plot-bins-1d2d`](plot_bins_1d2d.md) will also be run. This command produces alternate plots in which bins are colored by cluster and colors match across samples between the 2D cluster view and 1D genomic view.
+When `plot_bins = True` is indicated in `hatchet.ini`, the command [`plot-bins-1d2d`](doc_plot_bins_1d2d.md) will also be run. This command produces alternate plots in which bins are colored by cluster and colors match across samples between the 2D cluster view and 1D genomic view.
 
 ## Input
diff --git a/docs/source/doc_runhatchet.md b/docs/source/doc_runhatchet.md
index d7507b85..6522af5f 100644
--- a/docs/source/doc_runhatchet.md
+++ b/docs/source/doc_runhatchet.md
@@ -8,7 +8,7 @@ The tutorial is subdivided into some subsections and each of these describes seq
 3. [count-alleles](#count-alleles)
 4. [count-reads](#count-reads)
 5. [combine-counts](#combine-counts)
-6. [cluster-bins-loc](#cluster-bins-loc)
+6. [cluster-bins](#cluster-bins)
 7. [plot-bins](#plot-bins)
 8. [compute-cn](#compute-cn)
 9. [plot-cn](#plot-cn)
@@ -123,15 +123,15 @@ combine-counts constructs genomic bins such that in all samples, each bin has at
 See the [script](script/README.md) directory for a guide on how to run HATCHet with phasing. If a phased VCF file is supplied via `-p, --phase ${PHASE}` (e.g., `-p phase/phased.vcf.gz`), SNPs are merged into blocks before BAF inference. Each block contains at most `${max_spb}` such that no two SNPs in the same block are further apart than `${max_blocksize}`, and such that no two adjacent SNPs have significantly different marginal BAF estimates (at significance level `${alpha}` -- higher `${alpha}` corresponds to less trust in the phasing results). Then, blocks are passed to the EM which determines the relative phase of each block.
 
-## [cluster-bins-loc](doc_cluster_bins_loc.html)
-
+## [cluster-bins](doc_cluster_bins.html)
+
 ```shell
-python3 -m hatchet cluster-bins-loc ${BB}bulk.bb -o ${BBC}bulk.seg -O ${BBC}bulk.bbc \
+python3 -m hatchet cluster-bins ${BB}bulk.bb -o ${BBC}bulk.seg -O ${BBC}bulk.bbc \
         -minK 5 -maxK 10 -tau 0.0001 -d 0.08
 ```
 
-cluster-bins-loc globally clusters genomic bins based on RDR and BAF (specified in a BB file `${BB}bulk.bb`) jointly along the genome and across all tumor samples. This *locality-aware* clustering takes into account the contiguity of bins along the genome to inform the clustering.
+cluster-bins globally clusters genomic bins based on RDR and BAF (specified in a BB file `${BB}bulk.bb`) jointly along the genome and across all tumor samples. This *locality-aware* clustering takes into account the contiguity of bins along the genome to inform the clustering.
 
 The main parameters are as follows:
 - The maximum expected BAF shift `-d` for a balanced cluster, here set to `0.08`. This parameter should be as small as possible while still including the clusters with the closest BAF to 0.5.
diff --git a/docs/source/recommendation_clustering.md b/docs/source/recommendation_clustering.md
index dff85e90..aa7619ac 100644
--- a/docs/source/recommendation_clustering.md
+++ b/docs/source/recommendation_clustering.md
@@ -1,8 +1,8 @@
 # Analyze global clustering
 
-The global clustering performed along the genome and jointly across samples is a crucial feature of HATCHet and the quality of the final results is strongly affected by the quality of the clustering. This global clustering is performed by HATCHet's component `cluster-bins-loc`, whose default values are suitable for many datasets. However, for ideal results on specific datasets these parameters may need to be modified.
+The global clustering performed along the genome and jointly across samples is a crucial feature of HATCHet and the quality of the final results is strongly affected by the quality of the clustering. This global clustering is performed by HATCHet's component `cluster-bins`, whose default values are suitable for many datasets. However, for ideal results on specific datasets these parameters may need to be modified.
 
-The module `cluster-bins-loc` incorporates genomic position to improve clustering using a Gaussian hidden Markov model (GHMM), as opposed to the position-agnostic Gaussian mixture model (GMM) used in `cluster-bins` and described in the original HATCHet publication. This page describes how to tune the parameters of `cluster-bins-loc` -- for recommendations on `cluster-bins`, see [this page](recommendation_old_clustering.html) instead.
+The module `cluster-bins` incorporates genomic position to improve clustering using a Gaussian hidden Markov model (GHMM), as opposed to the position-agnostic Gaussian mixture model (GMM) used in `cluster-bins-gmm` and described in the original HATCHet publication. This page describes how to tune the parameters of `cluster-bins` -- for recommendations on `cluster-bins-gmm`, see [this page](recommendation_old_clustering.md) instead.
 
 The user should validate the results of the clustering, especially in noisy or suspicious cases, through the cluster figures produced by [plot-bins](doc_plot_bins.html) and [plot-bins-1d2d](doc_plot_bins_1d2d.html). More specifically, we suggest the following criteria to evaluate the clustering:
@@ -10,9 +10,9 @@ The user should validate the results of the clustering, especially in noisy or s
 2. Each cluster should contain regions with similar values of RDR and BAF in all samples
 
-`cluster-bins-loc` offers several parameters that can be used to tune the clustering.
+`cluster-bins` offers several parameters that can be used to tune the clustering.
 
 ## Number of clusters
 
-By default, `cluster-bins-loc` tries several possible values for the number `K` of clusters and selects the one that maximizes the silhouette score. In practice, this tends to *underestimate* the number of clusters that are visually apparent. This can be modified by
+By default, `cluster-bins` tries several possible values for the number `K` of clusters and selects the one that maximizes the silhouette score. In practice, this tends to *underestimate* the number of clusters that are visually apparent. This can be modified by
 1. Setting the parameters `--minK` and `--maxK` which specify the minimum and maximum number of clusters to consider, or
 2. Setting the parameter `--exactK` to fix the number of clusters to a given value.
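+
+For example, option 2 above could be used as follows (a hypothetical invocation: the value of `--exactK` should be chosen after inspecting the plots, and file names are placeholders):
+
+```shell
+hatchet cluster-bins bulk.bb -O bulk.bbc -o bulk.seg --exactK 12
+```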
diff --git a/docs/source/recommendation_datatype.md b/docs/source/recommendation_datatype.md
index da10d6a1..69deed03 100644
--- a/docs/source/recommendation_datatype.md
+++ b/docs/source/recommendation_datatype.md
@@ -8,8 +8,8 @@ The default values in the complete pipeline of HATCHet are typically used for an
 - *Read-count thresholds*. As suggested in the GATK best practices, `count-alleles` requires two parameters -c (the minimum coverage for SNPs) and -C (the maximum coverage for SNPs) to reliably call SNPs and exclude those in regions with artifacts. GATK suggests to consider a value of -C that is at least twice larger than the average coverage and -c should be large enough to exclude non-sequenced regions. For example, `-c 6` and `-C 300` are values previously used for WGS data whose coverage is typically between 30x and 90x. However, WES data are generally characterized by a much larger average coverage and thus require larger values, e.g. `-c 20` and `-C 600`. These values are also very usefule to discard off-target regions. In any case, the user should ideally pick values according to the considered data.
 
-  - *Bootstrapping for clustering*. (legacy `cluster-bins` only) Occasionally, WES may have very few points and much less data points than WGS. Only in these special cases with very few data points, the global clustering of cluster-bins may generally benefit from the integrated bootstrapping approach. This approach allow to generate a certain number of synthetic bins from the real ones to increase the power of the clustering. For example, the following cluster-bins parameters `-u 20 -dR 0.002 -dB 0.002` allow to activate the bootstraping which introduces 20 synthetic bins for each real bin with low variances.
+  - *Bootstrapping for clustering*. (legacy `cluster-bins-gmm` only) Occasionally, WES data may have very few data points, far fewer than WGS. Only in these special cases with very few data points, the global clustering of cluster-bins-gmm may benefit from the integrated bootstrapping approach. This approach generates a certain number of synthetic bins from the real ones to increase the power of the clustering. For example, the following cluster-bins-gmm parameters `-u 20 -dR 0.002 -dB 0.002` activate the bootstrapping, which introduces 20 synthetic bins for each real bin with low variances.
 
-  - *Bias in balanced BAF.* Depending on the data type (WGS vs. WES) and the sequencing coverage, the argument `-d, --diploidbaf` in `cluster-bins`/`cluster-bins-loc` may need to be adjusted to match the observed BAF bias. This threshold should be set so that the centroids of cluster(s) with BAF closest to 0.5 are included in the range [0.5 - `d`, 0.5] and other clusters with apparently different BAF are excluded.
+  - *Bias in balanced BAF.* Depending on the data type (WGS vs. WES) and the sequencing coverage, the argument `-d, --diploidbaf` in `cluster-bins`/`cluster-bins-gmm` may need to be adjusted to match the observed BAF bias. This threshold should be set so that the centroids of cluster(s) with BAF closest to 0.5 are included in the range [0.5 - `d`, 0.5] and other clusters with apparently different BAF are excluded.
 
 - *Genomic regions*. Provide a BED file defining the sequenced genomic regions when this is available. This helps to speed up the process.
diff --git a/docs/source/recommendation_inference.md b/docs/source/recommendation_inference.md
index eb4fe639..07957327 100644
--- a/docs/source/recommendation_inference.md
+++ b/docs/source/recommendation_inference.md
@@ -46,7 +46,7 @@
 ## Found pattern of size 886592751.0: {'8': (2, 0), '28': (4, 2), '50': (2, 2)}
 ## Chosen pattern of size 1080892751.0: {'8': (2, 1), '28': (3, 2), '50': (2, 2), '29': (4, 2), '23': (2, 0)}
 
-There are two parameters which control the identification of these patterns, which are `-tR`, i.e. RDR threshold, and `-tB`, i.e. BAF threshold. These thresholds are considered as the maximum allowed errors when estimating the clonal clusters. As such, the user can either decrease the thresholds (e.g. `-ts 0.03` and `-ts 0.02`) to consider more stringent constraints especially in low-noise datasets where subclonality can be accurately identified, or increase the thresholds (e.g. `-ts 0.15` and `-ts 0.05`) to accomodate the higher noise in certain datasets. The user can use the `bb_clustered.pdf` figure produced by the command `CBB` of `cluster-bins` to assess the identified clonal cluster. For example, in the following case
+There are two parameters which control the identification of these patterns, which are `-tR`, i.e. RDR threshold, and `-tB`, i.e. BAF threshold. These thresholds are considered as the maximum allowed errors when estimating the clonal clusters. As such, the user can either decrease the thresholds (e.g. `-tR 0.03` and `-tB 0.02`) to consider more stringent constraints especially in low-noise datasets where subclonality can be accurately identified, or increase the thresholds (e.g. `-tR 0.15` and `-tB 0.05`) to accommodate the higher noise in certain datasets. The user can use the `bb_clustered.pdf` figure produced by the command `CBB` of `plot-bins` to assess the identified clonal cluster. For example, in the following case
 
 ## 0: SIZE= 498 #CHRS= 22 (RDR, BAF)= (1.0, 0.5)
 ## 1: SIZE= 497 #CHRS= 22 (RDR, BAF)= (0.7, 0.25)
@@ -105,7 +105,7 @@ The final best solution, according to prediction of the presence or absence of a
 
 It is very important that the user verifies the results in different steps to guarantee the best-quality results, more specifically:
 
-- User can assess the validity of the chosen clonal cluster using the `bb_clustered.pdf` figure produced by the command `CBB` of `cluster-bins` and comparing the list of selected potential clonal clusters.
+- User can assess the validity of the chosen clonal cluster using the `bb_clustered.pdf` figure produced by the command `CBB` of `plot-bins` and comparing the list of selected potential clonal clusters.
 - User can assess the inferred copy numbers by analyzing the inferred maximum values and the inferred clone proportions which define the tumor clonal composition.
 - User can assess the joint inference of number of clones and WGD by analyzing the values of the objective function and the related scores.
diff --git a/docs/source/recommendation_old_clustering.md b/docs/source/recommendation_old_clustering.md
index 7673f6ab..92cc3e59 100644
--- a/docs/source/recommendation_old_clustering.md
+++ b/docs/source/recommendation_old_clustering.md
@@ -1,14 +1,16 @@
 # Analyze global clustering
 
-The global clustering performed along the genome and jointly across samples is a crucial feature of HATCHet and the quality of the final results is affected by the quality of the clustering. In particular, the global clustering is performed by HATCHet's component `cluster-bins`, whose default values for the main parameters allow to deal with most of the datasets. However, noisy or special datasets need tuning of these parameters, especially because the current version of `cluster-bins` uses [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html), an external implementation of a GMM with a Dirichlet process prior which is not specifically designed for sequencing data. Therefore, the user needs to validate the results and improve it to obtain best-quality results, especially when considering noisy and special datasets.
+**Note**: this page describes recommendations for tuning the older clustering module `cluster-bins-gmm`. We recommend using the locality-aware clustering module [`cluster-bins`](doc_cluster_bins.md) instead.
+
+The global clustering performed along the genome and jointly across samples is a crucial feature of HATCHet, and the quality of the final results is affected by the quality of the clustering. In particular, the global clustering is performed by HATCHet's component `cluster-bins-gmm`, whose default values for the main parameters handle most datasets. However, noisy or special datasets need tuning of these parameters, especially because the current version of `cluster-bins-gmm` uses [scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html), an external implementation of a GMM with a Dirichlet process prior which is not specifically designed for sequencing data. Therefore, the user needs to validate the results and improve them to obtain best-quality results, especially when considering noisy and special datasets.
 
 The user should validate the results of the clustering, especially in noisy or suspicious cases, through the cluster figure produced by the [command CBB of plot-bins](doc_plot_bins.md) and as explained in the available [demos](https://github.com/raphael-group/hatchet#demos). More specifically, we suggest to consider the following criterion to validate the clustering: **Every pair of clusters needs to be clearly distinct in terms of RDR or BAF in at least one sample and each cluster only contains regions with similar values of RDR and BAF in all samples**.
 
-`cluster-bins` offers two sets of parameters that the user can tune to improve the result of the clustering: (1) `cluster-bins`-specific and (2) BNPY-specific parameters.
+`cluster-bins-gmm` offers two sets of parameters that the user can tune to improve the result of the clustering: (1) `cluster-bins-gmm`-specific and (2) sklearn-specific parameters.
 
-### `cluster-bins`-specific parameters
+### `cluster-bins-gmm`-specific parameters
 
-`cluster-bins` provides two specific parameters that can be used to improve the results of the global clustering:
+`cluster-bins-gmm` provides two specific parameters that can be used to improve the results of the global clustering:
 - `-tR` threshold defines a RDR tolerance (default value is `-tR 0.15`)
 - `-tB` threshold defines a BAF tolerance (default value is `-tB 0.04`)
 
@@ -16,8 +18,8 @@ These tolerances are used to merge any pair of clusters which have differences i
 
 ### `sklearn`-specific parameters
 
-`cluster-bins` additionally provide 3 parameters which allow to directly control the clustering computed by the scikit-learn Gaussian Mixture Model:
-- `-R`: number of restarts (default is `-R 10`) which controls the number of restarts allows the method to escape local optima. While a lower number of restarts improves the running time of `cluster-bins` process, a higher number allows it to consider more solutions and choose a better clustering.
+`cluster-bins-gmm` additionally provides 3 parameters which directly control the clustering computed by the scikit-learn Gaussian Mixture Model:
+- `-R`: the number of restarts (default is `-R 10`); a higher number of restarts allows the method to escape local optima. While a lower number of restarts improves the running time of the `cluster-bins-gmm` process, a higher number allows it to consider more solutions and choose a better clustering.
 - `-K`: this value provides the maximum possible number of clusters (default is `-K 50`). Scikit-learn tends to infer close to this maximum number `K` of clusters, so if you have relatively simple data with few apparent clusters you may want to lower `K`.
 - `-c`: this value (default `1`) determines the model's relative confidence in many even clusters (large `c`) vs. an uneven distribution of points into clusters, potentially with fewer than `K` components (small `c`). For experts, this is the concentration parameter for the Dirichlet process prior. Reducing this value by many orders of magnitude (e.g., `-c 0.001` or `-c 0.00001`) may enable scikit-learn to infer fewer than `K` clusters.
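+
+For example, a hypothetical invocation combining these parameters for simple data with few apparent clusters (illustrative values; file names are placeholders) could be:
+
+```shell
+hatchet cluster-bins-gmm bulk.bb -O bulk.bbc -o bulk.seg -R 20 -K 20 -c 0.001
+```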
diff --git a/pyproject.toml b/pyproject.toml index 9d7ec46e..e0430d6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "hatchet" -version = "1.0.0" +version = "1.0.1" authors = [ { name="Simone Zaccaria", email="s.zaccaria@ucl.ac.uk" }, { name="Ben Raphael", email="braphael@cs.princeton.edu" }, diff --git a/src/hatchet/__init__.py b/src/hatchet/__init__.py index df578d8e..28a119ef 100644 --- a/src/hatchet/__init__.py +++ b/src/hatchet/__init__.py @@ -1,4 +1,4 @@ -__version__ = '1.0.0' +__version__ = '1.0.1' import os.path from importlib.resources import path diff --git a/src/hatchet/__main__.py b/src/hatchet/__main__.py index 02e2a16a..5884531b 100644 --- a/src/hatchet/__main__.py +++ b/src/hatchet/__main__.py @@ -15,8 +15,8 @@ from hatchet.utils.count_alleles import main as count_alleles # noqa: F401 from hatchet.utils.combine_counts import main as combine_counts # noqa: F401 from hatchet.utils.combine_counts_fw import main as combine_counts_fw # noqa: F401 +from hatchet.utils.cluster_bins_gmm import main as cluster_bins_gmm # noqa: F401 from hatchet.utils.cluster_bins import main as cluster_bins # noqa: F401 -from hatchet.utils.cluster_bins_loc import main as cluster_bins_loc # noqa: F401 from hatchet.utils.plot_bins import main as plot_bins # noqa: F401 from hatchet.utils.plot_bins_1d2d import main as plot_bins_1d2d # noqa: F401 diff --git a/src/hatchet/hatchet.ini b/src/hatchet/hatchet.ini index 626108c0..f9ef66bc 100644 --- a/src/hatchet/hatchet.ini +++ b/src/hatchet/hatchet.ini @@ -56,7 +56,7 @@ alpha = 0.1 ss_em = not_compressed = -[cluster_bins] +[cluster_bins_gmm] outsegments = outbins = diploidbaf = 0.1 @@ -72,7 +72,7 @@ restarts = 10 verbose = False disablebar = False -[cluster_bins_loc] +[cluster_bins] outsegments = outbins = diploidbaf = 0.1 diff --git a/src/hatchet/utils/ArgParsing.py b/src/hatchet/utils/ArgParsing.py index 8ae76946..17e08192 100644 --- a/src/hatchet/utils/ArgParsing.py +++ b/src/hatchet/utils/ArgParsing.py @@ -236,7 +236,7 @@ def parse_plot_cn_1d2d_args(args=None): } -def parse_cluster_bins_loc_args(args=None): +def parse_cluster_bins_args(args=None): """ Parse command line arguments Returns: @@ -262,7 +262,7 @@ def parse_cluster_bins_loc_args(args=None): '-o', '--outsegments', required=False, - default=config.cluster_bins_loc.outsegments, + default=config.cluster_bins.outsegments, type=str, help='Output filename for the segments computed by clustering bins (default: stdout)', ) @@ -270,7 +270,7 @@ def parse_cluster_bins_loc_args(args=None): '-O', '--outbins', required=False, - default=config.cluster_bins_loc.outbins, + default=config.cluster_bins.outbins, type=str, help='Output filename for a BB file adding the clusters (default: stdout)', ) @@ -279,7 +279,7 @@ def parse_cluster_bins_loc_args(args=None): '--diploidbaf', type=float, required=False, - default=config.cluster_bins_loc.diploidbaf, + default=config.cluster_bins.diploidbaf, help=( 'Maximum diploid-BAF shift used to determine the largest copy-neutral cluster and to rescale all the ' 'cluster inside this threshold accordingly (default: None, scaling is not performed)' @@ -290,28 +290,28 @@ def parse_cluster_bins_loc_args(args=None): '--seed', type=int, required=False, - default=config.cluster_bins_loc.seed, + default=config.cluster_bins.seed, help='Random seed used for clustering (default: 0)', ) parser.add_argument( '--minK', type=int, required=False, - default=config.cluster_bins_loc.mink, - help=f'Minimum number of clusters 
to infer (default = {config.cluster_bins_loc.mink})', + default=config.cluster_bins.mink, + help=f'Minimum number of clusters to infer (default = {config.cluster_bins.mink})', ) parser.add_argument( '--maxK', type=int, required=False, - default=config.cluster_bins_loc.maxk, - help=f'Maximum number of clusters to infer (default = {config.cluster_bins_loc.maxk})', + default=config.cluster_bins.maxk, + help=f'Maximum number of clusters to infer (default = {config.cluster_bins.maxk})', ) parser.add_argument( '--exactK', type=int, required=False, - default=config.cluster_bins_loc.exactk, + default=config.cluster_bins.exactk, help='Skip model selection and infer exactly this many clusters (default: None)', ) parser.add_argument( @@ -319,22 +319,22 @@ def parse_cluster_bins_loc_args(args=None): '--transmat', type=str, required=False, - default=config.cluster_bins_loc.transmat, + default=config.cluster_bins.transmat, help='Form of transition matrix to infer: fixed, diag (1-parameter), or full (default: diag)', ) parser.add_argument( '--tau', type=float, required=False, - default=config.cluster_bins_loc.tau, - help=f'Off-diagonal value for initializing transition matrix (default: {config.cluster_bins_loc.tau})', + default=config.cluster_bins.tau, + help=f'Off-diagonal value for initializing transition matrix (default: {config.cluster_bins.tau})', ) parser.add_argument( '-c', '--covar', type=str, required=False, - default=config.cluster_bins_loc.covar, + default=config.cluster_bins.covar, help='Form of covariance matrix: spherical, diag, full, or tied (default: diag)', ) parser.add_argument( @@ -342,7 +342,7 @@ def parse_cluster_bins_loc_args(args=None): '--decoding', type=str, required=False, - default=config.cluster_bins_loc.decoding, + default=config.cluster_bins.decoding, help='Decoding algorithm to use: map or viterbi (default: map)', ) parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {__version__}') @@ -1884,7 +1884,7 @@ def parse_combine_counts_fw_args(args=None): } -def parse_cluster_bins_args(args=None): +def parse_cluster_bins_gmm_args(args=None): parser = argparse.ArgumentParser( prog='hatchet cluster-bins', description=( @@ -1906,7 +1906,7 @@ def parse_cluster_bins_args(args=None): '-o', '--outsegments', required=False, - default=config.cluster_bins.outsegments, + default=config.cluster_bins_gmm.outsegments, type=str, help='Output filename for the segments computed by clustering bins (default: stdout)', ) @@ -1914,7 +1914,7 @@ def parse_cluster_bins_args(args=None): '-O', '--outbins', required=False, - default=config.cluster_bins.outbins, + default=config.cluster_bins_gmm.outbins, type=str, help='Output filename for a BB file adding the clusters (default: stdout)', ) @@ -1923,7 +1923,7 @@ def parse_cluster_bins_args(args=None): '--diploidbaf', type=float, required=False, - default=config.cluster_bins.diploidbaf, + default=config.cluster_bins_gmm.diploidbaf, help=( 'Maximum diploid-BAF shift used to determine the largest copy-neutral cluster and to rescale all the ' 'cluster inside this threshold accordingly (default: None, scaling is not performed)' @@ -1934,7 +1934,7 @@ def parse_cluster_bins_args(args=None): '--tolerancerdr', type=float, required=False, - default=config.cluster_bins.tolerancerdr, + default=config.cluster_bins_gmm.tolerancerdr, help=( 'Refine the clustering merging the clusters with this maximum difference in RDR values (default: None, ' 'gurobipy required)' @@ -1945,7 +1945,7 @@ def parse_cluster_bins_args(args=None): '--tolerancebaf', 
type=float, required=False, - default=config.cluster_bins.tolerancebaf, + default=config.cluster_bins_gmm.tolerancebaf, help=( 'Refine the clustering merging the clusters with this maximum difference in BAF values (default: None, ', 'gurobipy required)', @@ -1956,7 +1956,7 @@ def parse_cluster_bins_args(args=None): '--bootclustering', type=int, required=False, - default=config.cluster_bins.bootclustering, + default=config.cluster_bins_gmm.bootclustering, help=( 'Number of points to add for bootstraping each bin to improve the clustering. Each point is generated ' 'by drawing its values from a normal distribution centered on the values of the bin. This can help the ' @@ -1968,7 +1968,7 @@ def parse_cluster_bins_args(args=None): '--ratiodeviation', type=float, required=False, - default=config.cluster_bins.ratiodeviation, + default=config.cluster_bins_gmm.ratiodeviation, help='Standard deviation of the read ratios used to generate the points in the clouds (default: 0.02)', ) parser.add_argument( @@ -1976,7 +1976,7 @@ def parse_cluster_bins_args(args=None): '--bafdeviation', type=float, required=False, - default=config.cluster_bins.bafdeviation, + default=config.cluster_bins_gmm.bafdeviation, help='Standard deviation of the BAFs used to generate the points in the clouds (default: 0.02)', ) parser.add_argument( @@ -1984,7 +1984,7 @@ def parse_cluster_bins_args(args=None): '--seed', type=int, required=False, - default=config.cluster_bins.seed, + default=config.cluster_bins_gmm.seed, help='Random seed used for clustering AND the normal distributions used in the clouds (default: 0)', ) parser.add_argument( @@ -1992,7 +1992,7 @@ def parse_cluster_bins_args(args=None): '--initclusters', type=int, required=False, - default=config.cluster_bins.initclusters, + default=config.cluster_bins_gmm.initclusters, help='The maximum number of clusters to infer (default: 50)', ) parser.add_argument( @@ -2000,7 +2000,7 @@ def parse_cluster_bins_args(args=None): '--concentration', type=float, required=False, - default=config.cluster_bins.concentration, + default=config.cluster_bins_gmm.concentration, help=( 'Tuning parameter for clustering (concentration parameter for Dirichlet process prior). Higher favors ' 'more clusters, lower favors fewer clusters (default 0.02 = 1/K).' 
@@ -2011,21 +2011,21 @@ def parse_cluster_bins_args(args=None): '--restarts', type=int, required=False, - default=config.cluster_bins.restarts, + default=config.cluster_bins_gmm.restarts, help='Number of restarts performed by the clustering to choose the best (default: 10)', ) parser.add_argument( '-v', '--verbose', action='store_true', - default=config.cluster_bins.verbose, + default=config.cluster_bins_gmm.verbose, required=False, help='Use verbose log messages', ) parser.add_argument( '--disablebar', action='store_true', - default=config.cluster_bins.disablebar, + default=config.cluster_bins_gmm.disablebar, required=False, help='Disable progress bar', ) diff --git a/src/hatchet/utils/cluster_bins.py b/src/hatchet/utils/cluster_bins.py index 472028cf..d518c262 100644 --- a/src/hatchet/utils/cluster_bins.py +++ b/src/hatchet/utils/cluster_bins.py @@ -1,7 +1,12 @@ -import sys -import math -import numpy as np from collections import Counter +import numpy as np +import pandas as pd + +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import silhouette_score +from scipy.special import logsumexp +from scipy.spatial.distance import pdist, squareform +from hmmlearn import hmm from hatchet.utils.ArgParsing import parse_cluster_bins_args import hatchet.utils.Supporting as sp @@ -13,444 +18,307 @@ def main(args=None): sp.logArgs(args, 80) sp.log(msg='# Reading the combined BB file\n', level='STEP') - combo, samples = readBB(args['bbfile']) - - sp.log(msg='# Format data to cluster\n', level='STEP') - points, bintoidx = getPoints(data=combo, samples=samples) - - clouds = None - if args['cloud'] > 0: - sp.log(msg='# Bootstrap each bin for clustering\n', level='STEP') - clouds = generateClouds( - points=points, - density=args['cloud'], - seed=args['seed'], - sdeven=args['ratiodeviation'], - sdodd=args['bafdeviation'], - ) - - sp.log( - msg='# Clustering bins by RD and BAF across tumor samples\n', - level='STEP', - ) - mus, sigmas, clusterAssignments, numPoints, numClusters = cluster( - points=points, - clouds=clouds, - K=args['initclusters'], - concentration_prior=args['concentration'], - restarts=args['restarts'], - seed=args['seed'], - ) - - if args['rdtol'] > 0.0 or args['baftol'] > 0.0: - sp.log(msg='# Refining clustering using given tolerances\n', level='STEP') - before = len(set(clusterAssignments)) - clusterAssignments, numClusters = refineClustering( - combo=combo, - assign=clusterAssignments, - assignidx=bintoidx, - samples=samples, - rdtol=args['rdtol'], - baftol=args['baftol'], - ) - sp.log( - msg='The number of clusters have been reduced from {} to {} with given tolerances\n'.format( - before, numClusters - ), - level='INFO', - ) + tracks, bb, sample_labels, chr_labels = read_bb(args['bbfile']) - clusterAssignments = reindex(clusterAssignments) - - sp.log(msg='# Writing BBC output with resulting clusters\n', level='STEP') - if args['outbins'] is None: - outbins = sys.stdout + if args['exactK'] > 0: + minK = args['exactK'] + maxK = args['exactK'] else: - outbins = open(args['outbins'], 'w') - outbins.write('#CHR\tSTART\tEND\tSAMPLE\tRD\t#SNPS\tCOV\tALPHA\tBETA\tBAF\tCLUSTER\n') - for key in sorted(combo, key=(lambda x: (sp.numericOrder(x[0]), int(x[1]), int(x[2])))): - for sample in sorted(combo[key]): - outbins.write( - '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format( - key[0], - key[1], - key[2], - sample[0], - sample[1], - sample[2], - sample[3], - sample[4], - sample[5], - sample[6], - clusterAssignments[bintoidx[key]], - ) - ) - if outbins is not sys.stdout: - 
outbins.close() + minK = args['minK'] + maxK = args['maxK'] - sp.log(msg='# Segmenting bins\n', level='STEP') - clusters = { - cluster: set(key for key in combo if clusterAssignments[bintoidx[key]] == cluster) - for cluster in set(clusterAssignments) - } - segments = segmentBins(bb=combo, clusters=clusters, samples=samples) + if minK <= 1: + sp.log( + msg='# WARNING: model selection does not support comparing K=1 to K>1. K=1 will be ignored.\n', + level='WARNING', + ) - if args['diploidbaf'] is not None: + if args['exactK'] > 0 and args['exactK'] == 1: sp.log( - msg=( - '# Determining the largest cluster as diploid or tetraploid and rescaling all the clusters inside the ' - 'threshold accordingly\n' - ), + msg='# Found exactK=1, returning trivial clustering.\n', level='STEP', ) - segments = scaleBAF(segments=segments, samples=samples, diploidbaf=args['diploidbaf']) - - sp.log(msg='# Writing REF output with resulting segments\n', level='STEP') - if args['outsegments'] is None: - outsegments = sys.stdout + best_labels = [1] * int(len(bb) / len(sample_labels)) else: - outsegments = open(args['outsegments'], 'w') - outsegments.write('#ID\tSAMPLE\t#BINS\tRD\t#SNPS\tCOV\tALPHA\tBETA\tBAF\n') - for key in sorted(segments): - for sample in sorted(segments[key]): - record = segments[key][sample] - outsegments.write( - '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format( - key, - sample, - record[0], - record[1], - record[2], - record[3], - record[4], - record[5], - record[6], - ) - ) - if outsegments is not sys.stdout: - outsegments.close() - - -def readBB(bbfile): - read = {} - samples = set() - with open(bbfile, 'r') as f: - for line in f: - if line[0] != '#': - parsed = line.strip().split() - chromosome = parsed[0] - start = int(parsed[1]) - end = int(parsed[2]) - sample = parsed[3] - rd = float(parsed[4]) - snps = int(parsed[5]) - cov = float(parsed[6]) - alpha = int(parsed[7]) - beta = int(parsed[8]) - baf = float(parsed[9]) - - samples.add(sample) - try: - read[chromosome, start, end].append((sample, rd, snps, cov, alpha, beta, baf)) - except KeyError: - read[chromosome, start, end] = [(sample, rd, snps, cov, alpha, beta, baf)] - nonzeroab = lambda rb: all(rec[4] + rec[5] > 0 for rec in rb) - newread = {b: read[b] for b in read if nonzeroab(read[b])} - diff = len(read.keys()) - len(newread.keys()) - if diff > 0: sp.log( - msg='{} bins have been discarded because no covered by any SNP\n'.format(diff), - level='WARN', + msg='# Clustering bins by RD and BAF across tumor samples using locality\n', + level='STEP', + ) + (best_score, best_model, best_labels, best_K, results,) = hmm_model_select( + tracks, + minK=minK, + maxK=maxK, + seed=args['seed'], + covar=args['covar'], + decode_alg=args['decoding'], + tmat=args['transmat'], + tau=args['tau'], ) - return newread, samples - - -def getPoints(data, samples): - idx = 0 - points = [] - bintoidx = {} - for bi in sorted(data, key=(lambda x: (sp.numericOrder(x[0]), int(x[1]), int(x[2])))): - partition = {} - for x in data[bi]: - if x[0] in partition: - raise ValueError( - sp.error( - 'Found a bin ({}, {}) in chromosome {} defined multiple times for the same sample!'.format( - bi[1], bi[2], bi[0] - ) - ) - ) - else: - partition[x[0]] = [x[1], x[-1]] - if len(partition) != len(samples): - raise ValueError( - sp.error( - 'Found a bin ({}, {}) in chromosome {} that is not covered in all the samples!'.format( - bi[1], bi[2], bi[0] - ) - ) - ) - points.append([e for sample in samples for e in partition[sample]]) - bintoidx[bi] = idx - idx += 1 - return points, 
bintoidx
+    best_labels = reindex(best_labels)
+    bb['CLUSTER'] = np.repeat(best_labels, len(sample_labels))
+
+    sp.log(msg='# Checking consistency of results\n', level='STEP')
+    pivot_check = bb.pivot(index=['#CHR', 'START', 'END'], columns='SAMPLE', values='CLUSTER')
+    # Verify that the array lengths and order match the bins in the BB file
+    chr_idx = 0
+    bin_indices = pivot_check.index.to_numpy()
+    i = 0
+    while chr_idx < len(tracks):
+        my_chr = chr_labels[chr_idx][:-2]
+
+        start_row = bin_indices[i]
+        assert str(start_row[0]) == my_chr, (start_row[0], my_chr)
+
+        prev_end = start_row[-1]
+
+        start_idx = i
+        i += 1
+        while i < len(bin_indices) and bin_indices[i][0] == start_row[0] and bin_indices[i][1] == prev_end:
+            prev_end = bin_indices[i][2]
+            i += 1
+
+        # check the array lengths
+        assert tracks[chr_idx].shape[1] == i - start_idx, (
+            tracks[chr_idx].shape[1],
+            i - start_idx,
+        )
 
-def cluster(points, clouds=None, concentration_prior=None, K=100, restarts=10, seed=0):
+        chr_idx += 1
+
+    # Verify that cluster labels were applied correctly
+    cl_check = pivot_check.to_numpy().T
+    assert np.all(cl_check == cl_check[0])
+
+    sp.log(msg='# Writing output\n', level='STEP')
+    bb = bb[
+        [
+            '#CHR',
+            'START',
+            'END',
+            'SAMPLE',
+            'RD',
+            '#SNPS',
+            'COV',
+            'ALPHA',
+            'BETA',
+            'BAF',
+            'CLUSTER',
+        ]
+    ]
+    bb.to_csv(args['outbins'], index=False, sep='\t')
+
+    seg = form_seg(bb, args['diploidbaf'])
+    seg.to_csv(args['outsegments'], index=False, sep='\t')
+
+    sp.log(msg='# Done\n', level='STEP')
+
+
+def read_bb(bbfile, use_chr=True, compressed=False):
     """
-    Clusters a set of data points lying in an arbitrary number of clusters.
-    Arguments:
-        data (list of lists of floats): list of data points to be clustered.
-        clouds (list or lists of floats, same second dimension as data): bootstrapped bins for clustering
-        sampleName (string): The name of the input sample.
-        concentration_prior (float): Tuning parameter for clustering, must be between 0 and 1. Used to determine
-            concentration of points in clusters -- higher favors more clusters, lower favors fewer clusters.
-        K (int): maximum number of clusters to infer
-        restarts (int): number of initializations to try for GMM
-        seed (int): random number generator seed for GMM
+    Constructs arrays to represent the bins in each chromosome or arm.
+    If bbfile was binned around chromosome arms, then uses chromosome arms.
+    Otherwise, uses chromosomes.
+
     Returns:
-        mus (list of lists of floats): List of cluster means.
-        sigmas (list of 2D lists of floats): List of cluster covariances.
-        clusterAssignments (list of ints): The assignment of each interval to a cluster, where an entry
-                                            j at index i means the ith interval has been assigned to the
-                                            jth meta-interval.
-        numPoints (list of ints): Number of points assigned to each cluster
-        numClusters (int): The number of clusters.
+    tracks: list of np.ndarrays of size (n_tracks, n_bins)
+        where n_tracks = n_samples * 2
+    bb: table read from input bbfile
+    sample_labels: order in which samples are represented in each array in tracks
+    chr_labels: order in which chromosomes or arms are represented in tracks
+
+    Each array contains two tracks (BAF and RD) per sample
+    for a single chromosome arm.
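+
+    For example, with samples ('A', 'B') in sample_labels, the rows of each
+    array are ordered (BAF_A, RD_A, BAF_B, RD_B), i.e., one consecutive
+    (BAF, RD) pair of rows per sample.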
""" - from sklearn.mixture import BayesianGaussianMixture - from collections import Counter - sp.log( - msg='## Clustering with K={} and c={}...\n'.format(K, concentration_prior), - level='INFO', - ) - total = list(points) - if clouds is not None: - total.extend(list(clouds)) - npArray = np.array(total) - - gmm = BayesianGaussianMixture( - n_components=K, - n_init=restarts, - weight_concentration_prior=concentration_prior, - max_iter=int(1e6), - random_state=seed, + bb = pd.read_table(bbfile) + + tracks = [] + + sample_labels = [] + populated_labels = False + + chr_labels = [] + for ch, df0 in bb.groupby(['#CHR']): + df0 = df0.sort_values('START') + + p_arrs = [] + q_arrs = [] + + for sample, df in df0.groupby('SAMPLE'): + if not populated_labels: + sample_labels.append(sample) + + gaps = np.where(df.START.to_numpy()[1:] - df.END.to_numpy()[:-1] > 0)[0] + # print(ch, gaps) + + if len(gaps) > 0: + assert len(gaps) == 1, 'Found a chromosome with >1 gaps between bins' + gap = gaps[0] + 1 + + df_p = df.iloc[:gap] + df_q = df.iloc[gap:] + + p_arrs.append(df_p.BAF.to_numpy()) + p_arrs.append(df_p.RD.to_numpy()) + + q_arrs.append(df_q.BAF.to_numpy()) + q_arrs.append(df_q.RD.to_numpy()) + else: + df_p = df + p_arrs.append(df_p.BAF.to_numpy()) + p_arrs.append(df_p.RD.to_numpy()) + + if len(q_arrs) > 0: + tracks.append(np.array(p_arrs)) + chr_labels.append(str(ch) + '_p') + + tracks.append(np.array(q_arrs)) + chr_labels.append(str(ch) + '_q') + else: + tracks.append(np.array(p_arrs)) + chr_labels.append(str(ch) + '_p') + + populated_labels = True + + return ( + tracks, + bb.sort_values(by=['#CHR', 'START', 'SAMPLE']), + sample_labels, + chr_labels, ) - targetAssignments = gmm.fit_predict(npArray) - targetAssignments = targetAssignments[: len(points)] - mus = gmm.means_ - sigmas = gmm.covariances_ - cntr = Counter(targetAssignments) - numPoints = [cntr[i] if i in cntr else 0 for i in range(K)] - numClusters = len(cntr) - - return mus, sigmas, targetAssignments, numPoints, numClusters - - -def refineClustering(combo, assign, assignidx, samples, rdtol, baftol): - assignment = {b: assign[assignidx[b]] for b in combo} - clusters = set(assignment[b] for b in assignment) - size = {c: float(sum(c == assignment[b] for b in combo)) for c in clusters} - getbaf = lambda c, p: float(sum(e[6] for b in combo for e in combo[b] if assignment[b] == c and e[0] == p)) - baf = {c: {p: getbaf(c, p) / size[c] for p in samples} for c in clusters} - getrdr = lambda c, p: float(sum(e[1] for b in combo for e in combo[b] if assignment[b] == c and e[0] == p)) - rdr = {c: {p: getrdr(c, p) / size[c] for p in samples} for c in clusters} - - mbaf = lambda c: {p: baf[c][p] for p in samples} - mrdr = lambda c: {p: rdr[c][p] for p in samples} - merge = {c: {'BAF': mbaf(c), 'RDR': mrdr(c), 'SIZE': size[c], 'CLUS': {c}} for c in clusters} - - def mergable(m): - checkrdr = lambda f, s: False not in set(abs(m[f]['RDR'][p] - m[s]['RDR'][p]) <= rdtol for p in samples) - checkbaf = lambda f, s: False not in set(abs(m[f]['BAF'][p] - m[s]['BAF'][p]) <= baftol for p in samples) - check = lambda f, s: checkrdr(f, s) and checkbaf(f, s) - varrdr = lambda f, s: sum(abs(m[f]['RDR'][p] - m[s]['RDR'][p]) for p in samples) - varbaf = lambda f, s: sum(abs(m[f]['BAF'][p] - m[s]['BAF'][p]) for p in samples) - var = lambda f, s: varrdr(f, s) + varbaf(f, s) - - seq = sorted(m, key=(lambda x: m[x]['SIZE'])) - for idx, f in enumerate(seq): - opts = {s: var(f, s) for s in seq[idx + 1 :] for p in samples if s != f and check(f, s)} - if len(opts) > 0: - first = 
f - break - if len(opts) > 0: - f = first - return first, sp.argmin(opts) + + +def hmm_model_select( + tracks, + minK=20, + maxK=50, + tau=10e-6, + tmat='diag', + decode_alg='viterbi', + covar='diag', + seed=0, +): + assert tmat in ['fixed', 'diag', 'free'] + assert decode_alg in ['map', 'viterbi'] + + # format input + tracks = [a for a in tracks if a.shape[0] > 0 and a.shape[1] > 0] + if len(tracks) > 1: + X = np.concatenate(tracks, axis=1).T + lengths = [a.shape[1] for a in tracks] + else: + X = tracks[0].T + lengths = [tracks[0].shape[1]] + + best_K = 0 + best_score = 0 + best_model = None + best_labels = None + + scaler = StandardScaler() + X_scaled = scaler.fit_transform(X) + C = squareform(pdist(X_scaled)) + + rs = {} + for K in range(minK, maxK + 1): + # print(K, datetime.now()) + # construct initial transition matrix + A = make_transmat(1 - tau, K) + + if tmat == 'fixed': + model = hmm.GaussianHMM( + n_components=K, + init_params='mc', + params='smc', + covariance_type=covar, + random_state=0, + ) + elif tmat == 'free': + model = hmm.GaussianHMM( + n_components=K, + init_params='mc', + params='smct', + covariance_type=covar, + random_state=0, + ) else: - None - - m = mergable(merge) - while m is not None: - m1 = m[0] - m2 = m[1] - tot = float(merge[m1]['SIZE'] + merge[m2]['SIZE']) - newbaf = { - p: float(merge[m1]['BAF'][p] * merge[m1]['SIZE'] + merge[m2]['BAF'][p] * merge[m2]['SIZE']) / tot - for p in samples - } - newrdr = { - p: float(merge[m1]['RDR'][p] * merge[m1]['SIZE'] + merge[m2]['RDR'][p] * merge[m2]['SIZE']) / tot - for p in samples - } - newclu = merge[m1]['CLUS'] | merge[m2]['CLUS'] - merge = {c: merge[c] for c in merge if c != m1 and c != m2} - if len(merge) == 0: - merge = {} - merge[m1] = {'BAF': newbaf, 'RDR': newrdr, 'SIZE': tot, 'CLUS': newclu} - m = mergable(merge) - - newassign = [-1 for i in range(len(assign))] - for b in combo: - get = [c for c in merge if assign[assignidx[b]] in merge[c]['CLUS']] - assert len(get) == 1 - newassign[assignidx[b]] = get[0] - assert -1 not in set(newassign) - - return newassign, len(merge) - - -def generateClouds(points, density, seed, sdeven=0.02, sdodd=0.02): - res = [] - resappend = res.append - for point in points: - np.random.seed(seed=seed) - for d in range(density): - normal = np.random.normal - newpoint = [normal(point[i], sdeven) if i % 2 == 0 else normal(point[i], sdodd) for i in range(len(point))] - resappend(newpoint) - return res - - -def segmentBins(bb, clusters, samples): - sbb = {bi: {record[0]: record[1:] for record in bb[bi]} for bi in bb} - nbins = {cluster: {sample: len(clusters[cluster]) for sample in samples} for cluster in clusters} - rd = { - cluster: { - sample: float(sum(sbb[bi][sample][0] for bi in clusters[cluster])) / float(len(clusters[cluster])) - for sample in samples - } - for cluster in clusters - } - nsnps = { - cluster: {sample: sum(sbb[bi][sample][1] for bi in clusters[cluster]) for sample in samples} - for cluster in clusters - } - cov = { - cluster: { - sample: float(sum(sbb[bi][sample][2] for bi in clusters[cluster])) / float(len(clusters[cluster])) - for sample in samples - } - for cluster in clusters - } - return minSegmentBins(sbb, nbins, rd, nsnps, cov, clusters, samples) - - -def minSegmentBins(sbb, nbins, rd, nsnps, cov, clusters, samples): - alpha = { - cluster: { - sample: sum(min(sbb[bi][sample][3], sbb[bi][sample][4]) for bi in clusters[cluster]) for sample in samples - } - for cluster in clusters - } - beta = { - cluster: { - sample: sum(max(sbb[bi][sample][3], 
sbb[bi][sample][4]) for bi in clusters[cluster]) for sample in samples - } - for cluster in clusters - } - mean = { - cluster: { - sample: float(alpha[cluster][sample]) / float(alpha[cluster][sample] + beta[cluster][sample]) - for sample in samples - } - for cluster in clusters - } - return { - cluster: { - sample: ( - nbins[cluster][sample], - rd[cluster][sample], - nsnps[cluster][sample], - cov[cluster][sample], - alpha[cluster][sample], - beta[cluster][sample], - mean[cluster][sample], + model = DiagGHMM( + n_components=K, + init_params='mc', + params='smct', + covariance_type=covar, + random_state=0, ) - for sample in samples - } - for cluster in clusters - } - - -def scaleBAF(segments, samples, diploidbaf): - diploid = -1 - main = -1 - for key in segments: - if sum((0.5 - segments[key][sample][-1]) <= diploidbaf for sample in samples) == len(samples): - sample = list(samples)[0] - if main < segments[key][sample][0]: - main = segments[key][sample][0] - diploid = key - if diploid == -1: - raise ValueError( - sp.error('No potential neutral cluster has been found within the given threshold {}!'.format(diploidbaf)) - ) - scalings = {sample: segments[diploid][sample][-1] for sample in samples} - scale = ( - lambda value, scale, diploidbaf: min((float(value) / float(scale)) * 0.5, 0.5) - if (0.5 - value) <= diploidbaf and scale > 0 - else value - ) - scaledBAF = { - segment: {sample: scale(segments[segment][sample][-1], scalings[sample], diploidbaf) for sample in samples} - for segment in segments - } - regulate = ( - lambda record, baf: record[:-3] + splitBAF(baf, record[-3] + record[-2]) + (baf,) - if record[-1] != baf - else record - ) - return { - segment: {sample: regulate(segments[segment][sample], scaledBAF[segment][sample]) for sample in samples} - for segment in segments - } + model.startprob_ = np.ones(K) / K + model.transmat_ = A + model.fit(X, lengths) + + prob, labels = model.decode(X, lengths, algorithm=decode_alg) + score = silhouette_score(C, labels, metric='precomputed') + + rs[K] = prob, score, labels + if score > best_score: + best_score = score + best_model = model + best_labels = labels + best_K = K + + return best_score, best_model, best_labels, best_K, rs + + +class DiagGHMM(hmm.GaussianHMM): + def _accumulate_sufficient_statistics(self, stats, obs, framelogprob, posteriors, fwdlattice, bwdlattice): + super()._accumulate_sufficient_statistics(stats, obs, framelogprob, posteriors, fwdlattice, bwdlattice) -def splitBAF(baf, scale): - BAF = float(baf) - BAF = min(BAF, 1.0 - BAF) - SUM = float(scale) + if 't' in self.params: + # for each ij, recover sum_t xi_ij from the inferred transition matrix + bothlattice = fwdlattice + bwdlattice + loggamma = (bothlattice.T - logsumexp(bothlattice, axis=1)).T - roundings = [] - roundings.append((int(math.floor(BAF * SUM)), int(math.floor((1.0 - BAF) * SUM)))) - roundings.append((int(math.floor(BAF * SUM)), int(math.ceil((1.0 - BAF) * SUM)))) - roundings.append((int(math.ceil(BAF * SUM)), int(math.floor((1.0 - BAF) * SUM)))) - roundings.append((int(math.ceil(BAF * SUM)), int(math.ceil((1.0 - BAF) * SUM)))) - roundings = [(int(min(a, b)), int(max(a, b))) for (a, b) in roundings] + # denominator for each ij is the sum of gammas over i + denoms = np.sum(np.exp(loggamma), axis=0) + # transpose to perform row-wise multiplication + stats['denoms'] = denoms - estimations = [float(a) / float(a + b) if a + b > 0 else 1.0 for (a, b) in roundings] - diff = [abs(est - BAF) for est in estimations] - best = np.argmin(diff) - return 
roundings[best][0], roundings[best][1] + def _do_mstep(self, stats): + super()._do_mstep(stats) + if 't' in self.params: + denoms = stats['denoms'] + x = (self.transmat_.T * denoms).T -def roundAlphasBetas(baf, alpha, beta): - BAF = float(baf) - BAF = min(BAF, 1.0 - BAF) - ALPHA = min(alpha, beta) - BETA = max(alpha, beta) + # numerator is the sum of ii elements + num = np.sum(np.diag(x)) + # denominator is the sum of all elements + denom = np.sum(x) - roundings = [] - roundings.append((int(math.floor(ALPHA)), int(math.floor(BETA)))) - roundings.append((int(math.floor(ALPHA)), int(math.ceil(BETA)))) - roundings.append((int(math.ceil(ALPHA)), int(math.floor(BETA)))) - roundings.append((int(math.ceil(ALPHA)), int(math.ceil(BETA)))) - roundings = [(int(min(a, b)), int(max(a, b))) for (a, b) in roundings] + # (this is the same as sum_i gamma_i) + # assert np.isclose(denom, np.sum(denoms)) - estimations = [float(a) / float(a + b) if a + b > 0 else 1.0 for (a, b) in roundings] - diff = [abs(est - BAF) for est in estimations] - return roundings[np.argmin(diff)] + stats['diag'] = num / denom + # print(num.shape) + # print(denom.shape) + + self.transmat_ = self.form_transition_matrix(stats['diag']) + + def form_transition_matrix(self, diag): + tol = 1e-10 + diag = np.clip(diag, tol, 1 - tol) + + offdiag = (1 - diag) / (self.n_components - 1) + transmat_ = np.diag([diag - offdiag] * self.n_components) + transmat_ += offdiag + # assert np.all(transmat_ > 0), (diag, offdiag, transmat_) + return transmat_ + + +def make_transmat(diag, K): + offdiag = (1 - diag) / (K - 1) + transmat_ = np.diag([diag - offdiag] * K) + transmat_ += offdiag + return transmat_ def reindex(labels): @@ -468,5 +336,34 @@ def reindex(labels): return [old2newf(a) for a in labels] +def form_seg(bbc, balanced_threshold): + segments = [] + for (key, sample), df in bbc.groupby(['CLUSTER', 'SAMPLE']): + nbins = len(df) + rd = df.RD.mean() + nsnps = df['#SNPS'].sum() + cov = df.COV.mean() + a = np.sum(np.minimum(df.ALPHA, df.BETA)) + b = np.sum(np.maximum(df.ALPHA, df.BETA)) + baf = a / (a + b) + baf = baf if (0.5 - baf) > balanced_threshold else 0.5 + segments.append([key, sample, nbins, rd, nsnps, cov, a, b, baf]) + seg = pd.DataFrame( + segments, + columns=[ + '#ID', + 'SAMPLE', + '#BINS', + 'RD', + '#SNPS', + 'COV', + 'ALPHA', + 'BETA', + 'BAF', + ], + ) + return seg + + if __name__ == '__main__': main() diff --git a/src/hatchet/utils/cluster_bins_gmm.py b/src/hatchet/utils/cluster_bins_gmm.py new file mode 100644 index 00000000..dbb8069c --- /dev/null +++ b/src/hatchet/utils/cluster_bins_gmm.py @@ -0,0 +1,472 @@ +import sys +import math +import numpy as np +from collections import Counter + +from hatchet.utils.ArgParsing import parse_cluster_bins_gmm_args +import hatchet.utils.Supporting as sp + + +def main(args=None): + sp.log(msg='# Parsing and checking input arguments\n', level='STEP') + args = parse_cluster_bins_gmm_args(args) + sp.logArgs(args, 80) + + sp.log(msg='# Reading the combined BB file\n', level='STEP') + combo, samples = readBB(args['bbfile']) + + sp.log(msg='# Format data to cluster\n', level='STEP') + points, bintoidx = getPoints(data=combo, samples=samples) + + clouds = None + if args['cloud'] > 0: + sp.log(msg='# Bootstrap each bin for clustering\n', level='STEP') + clouds = generateClouds( + points=points, + density=args['cloud'], + seed=args['seed'], + sdeven=args['ratiodeviation'], + sdodd=args['bafdeviation'], + ) + + sp.log( + msg='# Clustering bins by RD and BAF across tumor samples\n', + level='STEP', + ) 
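+    # Each bin contributes one point with (RDR, BAF) coordinates for every
+    # sample; cluster() fits a Bayesian GMM to these points (together with any
+    # bootstrapped clouds) and assigns each bin to a mixture component.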
+    mus, sigmas, clusterAssignments, numPoints, numClusters = cluster(
+        points=points,
+        clouds=clouds,
+        K=args['initclusters'],
+        concentration_prior=args['concentration'],
+        restarts=args['restarts'],
+        seed=args['seed'],
+    )
+
+    if args['rdtol'] > 0.0 or args['baftol'] > 0.0:
+        sp.log(msg='# Refining clustering using given tolerances\n', level='STEP')
+        before = len(set(clusterAssignments))
+        clusterAssignments, numClusters = refineClustering(
+            combo=combo,
+            assign=clusterAssignments,
+            assignidx=bintoidx,
+            samples=samples,
+            rdtol=args['rdtol'],
+            baftol=args['baftol'],
+        )
+        sp.log(
+            msg='The number of clusters has been reduced from {} to {} with the given tolerances\n'.format(
+                before, numClusters
+            ),
+            level='INFO',
+        )
+
+    clusterAssignments = reindex(clusterAssignments)
+
+    sp.log(msg='# Writing BBC output with resulting clusters\n', level='STEP')
+    if args['outbins'] is None:
+        outbins = sys.stdout
+    else:
+        outbins = open(args['outbins'], 'w')
+    outbins.write('#CHR\tSTART\tEND\tSAMPLE\tRD\t#SNPS\tCOV\tALPHA\tBETA\tBAF\tCLUSTER\n')
+    for key in sorted(combo, key=(lambda x: (sp.numericOrder(x[0]), int(x[1]), int(x[2])))):
+        for sample in sorted(combo[key]):
+            outbins.write(
+                '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
+                    key[0],
+                    key[1],
+                    key[2],
+                    sample[0],
+                    sample[1],
+                    sample[2],
+                    sample[3],
+                    sample[4],
+                    sample[5],
+                    sample[6],
+                    clusterAssignments[bintoidx[key]],
+                )
+            )
+    if outbins is not sys.stdout:
+        outbins.close()
+
+    sp.log(msg='# Segmenting bins\n', level='STEP')
+    clusters = {
+        cluster: set(key for key in combo if clusterAssignments[bintoidx[key]] == cluster)
+        for cluster in set(clusterAssignments)
+    }
+    segments = segmentBins(bb=combo, clusters=clusters, samples=samples)
+
+    if args['diploidbaf'] is not None:
+        sp.log(
+            msg=(
+                '# Determining the largest cluster as diploid or tetraploid and rescaling all the clusters inside the '
+                'threshold accordingly\n'
+            ),
+            level='STEP',
+        )
+        segments = scaleBAF(segments=segments, samples=samples, diploidbaf=args['diploidbaf'])
+
+    sp.log(msg='# Writing REF output with resulting segments\n', level='STEP')
+    if args['outsegments'] is None:
+        outsegments = sys.stdout
+    else:
+        outsegments = open(args['outsegments'], 'w')
+    outsegments.write('#ID\tSAMPLE\t#BINS\tRD\t#SNPS\tCOV\tALPHA\tBETA\tBAF\n')
+    for key in sorted(segments):
+        for sample in sorted(segments[key]):
+            record = segments[key][sample]
+            outsegments.write(
+                '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
+                    key,
+                    sample,
+                    record[0],
+                    record[1],
+                    record[2],
+                    record[3],
+                    record[4],
+                    record[5],
+                    record[6],
+                )
+            )
+    if outsegments is not sys.stdout:
+        outsegments.close()
+
+
+def readBB(bbfile):
+    read = {}
+    samples = set()
+    with open(bbfile, 'r') as f:
+        for line in f:
+            if line[0] != '#':
+                parsed = line.strip().split()
+                chromosome = parsed[0]
+                start = int(parsed[1])
+                end = int(parsed[2])
+                sample = parsed[3]
+                rd = float(parsed[4])
+                snps = int(parsed[5])
+                cov = float(parsed[6])
+                alpha = int(parsed[7])
+                beta = int(parsed[8])
+                baf = float(parsed[9])
+
+                samples.add(sample)
+                try:
+                    read[chromosome, start, end].append((sample, rd, snps, cov, alpha, beta, baf))
+                except KeyError:
+                    read[chromosome, start, end] = [(sample, rd, snps, cov, alpha, beta, baf)]
+    nonzeroab = lambda rb: all(rec[4] + rec[5] > 0 for rec in rb)
+    newread = {b: read[b] for b in read if nonzeroab(read[b])}
+    diff = len(read.keys()) - len(newread.keys())
+    if diff > 0:
+        sp.log(
+            msg='{} bins have been discarded because they are not covered by any 
SNP\n'.format(diff),
+            level='WARN',
+        )
+    return newread, samples
+
+
+def getPoints(data, samples):
+    idx = 0
+    points = []
+    bintoidx = {}
+    for bi in sorted(data, key=(lambda x: (sp.numericOrder(x[0]), int(x[1]), int(x[2])))):
+        partition = {}
+        for x in data[bi]:
+            if x[0] in partition:
+                raise ValueError(
+                    sp.error(
+                        'Found a bin ({}, {}) in chromosome {} defined multiple times for the same sample!'.format(
+                            bi[1], bi[2], bi[0]
+                        )
+                    )
+                )
+            else:
+                partition[x[0]] = [x[1], x[-1]]
+        if len(partition) != len(samples):
+            raise ValueError(
+                sp.error(
+                    'Found a bin ({}, {}) in chromosome {} that is not covered in all the samples!'.format(
+                        bi[1], bi[2], bi[0]
+                    )
+                )
+            )
+        points.append([e for sample in samples for e in partition[sample]])
+        bintoidx[bi] = idx
+        idx += 1
+    return points, bintoidx
+
+
+def cluster(points, clouds=None, concentration_prior=None, K=100, restarts=10, seed=0):
+    """
+    Clusters a set of data points lying in an arbitrary number of clusters.
+    Arguments:
+        points (list of lists of floats): list of data points to be clustered.
+        clouds (list of lists of floats, same second dimension as points): bootstrapped bins for clustering
+        concentration_prior (float): Tuning parameter for clustering, must be positive. Used to determine
+            concentration of points in clusters -- higher favors more clusters, lower favors fewer clusters.
+        K (int): maximum number of clusters to infer
+        restarts (int): number of initializations to try for GMM
+        seed (int): random number generator seed for GMM
+    Returns:
+        mus (list of lists of floats): List of cluster means.
+        sigmas (list of 2D lists of floats): List of cluster covariances.
+        clusterAssignments (list of ints): The assignment of each interval to a cluster, where an entry
+                                            j at index i means the ith interval has been assigned to the
+                                            jth meta-interval.
+        numPoints (list of ints): Number of points assigned to each cluster
+        numClusters (int): The number of clusters.
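+
+    Note: numClusters counts only the mixture components that receive at
+    least one (non-bootstrap) point, so it can be smaller than K.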
+ """ + from sklearn.mixture import BayesianGaussianMixture + from collections import Counter + + sp.log( + msg='## Clustering with K={} and c={}...\n'.format(K, concentration_prior), + level='INFO', + ) + total = list(points) + if clouds is not None: + total.extend(list(clouds)) + npArray = np.array(total) + + gmm = BayesianGaussianMixture( + n_components=K, + n_init=restarts, + weight_concentration_prior=concentration_prior, + max_iter=int(1e6), + random_state=seed, + ) + targetAssignments = gmm.fit_predict(npArray) + targetAssignments = targetAssignments[: len(points)] + mus = gmm.means_ + sigmas = gmm.covariances_ + cntr = Counter(targetAssignments) + numPoints = [cntr[i] if i in cntr else 0 for i in range(K)] + numClusters = len(cntr) + + return mus, sigmas, targetAssignments, numPoints, numClusters + + +def refineClustering(combo, assign, assignidx, samples, rdtol, baftol): + assignment = {b: assign[assignidx[b]] for b in combo} + clusters = set(assignment[b] for b in assignment) + size = {c: float(sum(c == assignment[b] for b in combo)) for c in clusters} + getbaf = lambda c, p: float(sum(e[6] for b in combo for e in combo[b] if assignment[b] == c and e[0] == p)) + baf = {c: {p: getbaf(c, p) / size[c] for p in samples} for c in clusters} + getrdr = lambda c, p: float(sum(e[1] for b in combo for e in combo[b] if assignment[b] == c and e[0] == p)) + rdr = {c: {p: getrdr(c, p) / size[c] for p in samples} for c in clusters} + + mbaf = lambda c: {p: baf[c][p] for p in samples} + mrdr = lambda c: {p: rdr[c][p] for p in samples} + merge = {c: {'BAF': mbaf(c), 'RDR': mrdr(c), 'SIZE': size[c], 'CLUS': {c}} for c in clusters} + + def mergable(m): + checkrdr = lambda f, s: False not in set(abs(m[f]['RDR'][p] - m[s]['RDR'][p]) <= rdtol for p in samples) + checkbaf = lambda f, s: False not in set(abs(m[f]['BAF'][p] - m[s]['BAF'][p]) <= baftol for p in samples) + check = lambda f, s: checkrdr(f, s) and checkbaf(f, s) + varrdr = lambda f, s: sum(abs(m[f]['RDR'][p] - m[s]['RDR'][p]) for p in samples) + varbaf = lambda f, s: sum(abs(m[f]['BAF'][p] - m[s]['BAF'][p]) for p in samples) + var = lambda f, s: varrdr(f, s) + varbaf(f, s) + + seq = sorted(m, key=(lambda x: m[x]['SIZE'])) + for idx, f in enumerate(seq): + opts = {s: var(f, s) for s in seq[idx + 1 :] for p in samples if s != f and check(f, s)} + if len(opts) > 0: + first = f + break + if len(opts) > 0: + f = first + return first, sp.argmin(opts) + else: + None + + m = mergable(merge) + while m is not None: + m1 = m[0] + m2 = m[1] + tot = float(merge[m1]['SIZE'] + merge[m2]['SIZE']) + newbaf = { + p: float(merge[m1]['BAF'][p] * merge[m1]['SIZE'] + merge[m2]['BAF'][p] * merge[m2]['SIZE']) / tot + for p in samples + } + newrdr = { + p: float(merge[m1]['RDR'][p] * merge[m1]['SIZE'] + merge[m2]['RDR'][p] * merge[m2]['SIZE']) / tot + for p in samples + } + newclu = merge[m1]['CLUS'] | merge[m2]['CLUS'] + merge = {c: merge[c] for c in merge if c != m1 and c != m2} + if len(merge) == 0: + merge = {} + merge[m1] = {'BAF': newbaf, 'RDR': newrdr, 'SIZE': tot, 'CLUS': newclu} + m = mergable(merge) + + newassign = [-1 for i in range(len(assign))] + for b in combo: + get = [c for c in merge if assign[assignidx[b]] in merge[c]['CLUS']] + assert len(get) == 1 + newassign[assignidx[b]] = get[0] + assert -1 not in set(newassign) + + return newassign, len(merge) + + +def generateClouds(points, density, seed, sdeven=0.02, sdodd=0.02): + res = [] + resappend = res.append + for point in points: + np.random.seed(seed=seed) + for d in range(density): + normal = 
np.random.normal + newpoint = [normal(point[i], sdeven) if i % 2 == 0 else normal(point[i], sdodd) for i in range(len(point))] + resappend(newpoint) + return res + + +def segmentBins(bb, clusters, samples): + sbb = {bi: {record[0]: record[1:] for record in bb[bi]} for bi in bb} + nbins = {cluster: {sample: len(clusters[cluster]) for sample in samples} for cluster in clusters} + rd = { + cluster: { + sample: float(sum(sbb[bi][sample][0] for bi in clusters[cluster])) / float(len(clusters[cluster])) + for sample in samples + } + for cluster in clusters + } + nsnps = { + cluster: {sample: sum(sbb[bi][sample][1] for bi in clusters[cluster]) for sample in samples} + for cluster in clusters + } + cov = { + cluster: { + sample: float(sum(sbb[bi][sample][2] for bi in clusters[cluster])) / float(len(clusters[cluster])) + for sample in samples + } + for cluster in clusters + } + return minSegmentBins(sbb, nbins, rd, nsnps, cov, clusters, samples) + + +def minSegmentBins(sbb, nbins, rd, nsnps, cov, clusters, samples): + alpha = { + cluster: { + sample: sum(min(sbb[bi][sample][3], sbb[bi][sample][4]) for bi in clusters[cluster]) for sample in samples + } + for cluster in clusters + } + beta = { + cluster: { + sample: sum(max(sbb[bi][sample][3], sbb[bi][sample][4]) for bi in clusters[cluster]) for sample in samples + } + for cluster in clusters + } + mean = { + cluster: { + sample: float(alpha[cluster][sample]) / float(alpha[cluster][sample] + beta[cluster][sample]) + for sample in samples + } + for cluster in clusters + } + return { + cluster: { + sample: ( + nbins[cluster][sample], + rd[cluster][sample], + nsnps[cluster][sample], + cov[cluster][sample], + alpha[cluster][sample], + beta[cluster][sample], + mean[cluster][sample], + ) + for sample in samples + } + for cluster in clusters + } + + +def scaleBAF(segments, samples, diploidbaf): + diploid = -1 + main = -1 + for key in segments: + if sum((0.5 - segments[key][sample][-1]) <= diploidbaf for sample in samples) == len(samples): + sample = list(samples)[0] + if main < segments[key][sample][0]: + main = segments[key][sample][0] + diploid = key + if diploid == -1: + raise ValueError( + sp.error('No potential neutral cluster has been found within the given threshold {}!'.format(diploidbaf)) + ) + scalings = {sample: segments[diploid][sample][-1] for sample in samples} + scale = ( + lambda value, scale, diploidbaf: min((float(value) / float(scale)) * 0.5, 0.5) + if (0.5 - value) <= diploidbaf and scale > 0 + else value + ) + scaledBAF = { + segment: {sample: scale(segments[segment][sample][-1], scalings[sample], diploidbaf) for sample in samples} + for segment in segments + } + regulate = ( + lambda record, baf: record[:-3] + splitBAF(baf, record[-3] + record[-2]) + (baf,) + if record[-1] != baf + else record + ) + return { + segment: {sample: regulate(segments[segment][sample], scaledBAF[segment][sample]) for sample in samples} + for segment in segments + } + + +def splitBAF(baf, scale): + BAF = float(baf) + BAF = min(BAF, 1.0 - BAF) + SUM = float(scale) + + roundings = [] + roundings.append((int(math.floor(BAF * SUM)), int(math.floor((1.0 - BAF) * SUM)))) + roundings.append((int(math.floor(BAF * SUM)), int(math.ceil((1.0 - BAF) * SUM)))) + roundings.append((int(math.ceil(BAF * SUM)), int(math.floor((1.0 - BAF) * SUM)))) + roundings.append((int(math.ceil(BAF * SUM)), int(math.ceil((1.0 - BAF) * SUM)))) + roundings = [(int(min(a, b)), int(max(a, b))) for (a, b) in roundings] + + estimations = [float(a) / float(a + b) if a + b > 0 else 1.0 for (a, b) in 
roundings] + diff = [abs(est - BAF) for est in estimations] + best = np.argmin(diff) + return roundings[best][0], roundings[best][1] + + +def roundAlphasBetas(baf, alpha, beta): + BAF = float(baf) + BAF = min(BAF, 1.0 - BAF) + ALPHA = min(alpha, beta) + BETA = max(alpha, beta) + + roundings = [] + roundings.append((int(math.floor(ALPHA)), int(math.floor(BETA)))) + roundings.append((int(math.floor(ALPHA)), int(math.ceil(BETA)))) + roundings.append((int(math.ceil(ALPHA)), int(math.floor(BETA)))) + roundings.append((int(math.ceil(ALPHA)), int(math.ceil(BETA)))) + roundings = [(int(min(a, b)), int(max(a, b))) for (a, b) in roundings] + + estimations = [float(a) / float(a + b) if a + b > 0 else 1.0 for (a, b) in roundings] + diff = [abs(est - BAF) for est in estimations] + return roundings[np.argmin(diff)] + + +def reindex(labels): + """ + Given a list of labels, reindex them as integers from 1 to n_labels + Also orders them in nonincreasing order of prevalence + """ + old2new = {} + j = 1 + for i, _ in Counter(labels).most_common(): + old2new[i] = j + j += 1 + old2newf = lambda x: old2new[x] + + return [old2newf(a) for a in labels] + + +if __name__ == '__main__': + main() diff --git a/src/hatchet/utils/cluster_bins_loc.py b/src/hatchet/utils/cluster_bins_loc.py deleted file mode 100644 index 8d6f8592..00000000 --- a/src/hatchet/utils/cluster_bins_loc.py +++ /dev/null @@ -1,369 +0,0 @@ -from collections import Counter -import numpy as np -import pandas as pd - -from sklearn.preprocessing import StandardScaler -from sklearn.metrics import silhouette_score -from scipy.special import logsumexp -from scipy.spatial.distance import pdist, squareform -from hmmlearn import hmm - -from hatchet.utils.ArgParsing import parse_cluster_bins_loc_args -import hatchet.utils.Supporting as sp - - -def main(args=None): - sp.log(msg='# Parsing and checking input arguments\n', level='STEP') - args = parse_cluster_bins_loc_args(args) - sp.logArgs(args, 80) - - sp.log(msg='# Reading the combined BB file\n', level='STEP') - tracks, bb, sample_labels, chr_labels = read_bb(args['bbfile']) - - if args['exactK'] > 0: - minK = args['exactK'] - maxK = args['exactK'] - else: - minK = args['minK'] - maxK = args['maxK'] - - if minK <= 1: - sp.log( - msg='# WARNING: model selection does not support comparing K=1 to K>1. 
K=1 will be ignored.\n', - level='WARNING', - ) - - if args['exactK'] > 0 and args['exactK'] == 1: - sp.log( - msg='# Found exactK=1, returning trivial clustering.\n', - level='STEP', - ) - best_labels = [1] * int(len(bb) / len(sample_labels)) - else: - sp.log( - msg='# Clustering bins by RD and BAF across tumor samples using locality\n', - level='STEP', - ) - (best_score, best_model, best_labels, best_K, results,) = hmm_model_select( - tracks, - minK=minK, - maxK=maxK, - seed=args['seed'], - covar=args['covar'], - decode_alg=args['decoding'], - tmat=args['transmat'], - tau=args['tau'], - ) - - best_labels = reindex(best_labels) - bb['CLUSTER'] = np.repeat(best_labels, len(sample_labels)) - - sp.log(msg='# Checking consistency of results\n', level='STEP') - pivot_check = bb.pivot(index=['#CHR', 'START', 'END'], columns='SAMPLE', values='CLUSTER') - # Verify that the array lengths and order match the bins in the BB file - chr_idx = 0 - bin_indices = pivot_check.index.to_numpy() - i = 0 - while chr_idx < len(tracks): - my_chr = chr_labels[chr_idx][:-2] - - start_row = bin_indices[i] - assert str(start_row[0]) == my_chr, (start_row[0], my_chr) - - prev_end = start_row[-1] - - start_idx = i - i += 1 - while i < len(bin_indices) and bin_indices[i][0] == start_row[0] and bin_indices[i][1] == prev_end: - prev_end = bin_indices[i][2] - i += 1 - - # check the array lengths - assert tracks[chr_idx].shape[1] == i - start_idx, ( - tracks[chr_idx].shape[1], - i - start_idx, - ) - - chr_idx += 1 - - # Verify that cluster labels were applied correctly - cl_check = pivot_check.to_numpy().T - assert np.all(cl_check == cl_check[0]) - - sp.log(msg='# Writing output\n', level='STEP') - bb = bb[ - [ - '#CHR', - 'START', - 'END', - 'SAMPLE', - 'RD', - '#SNPS', - 'COV', - 'ALPHA', - 'BETA', - 'BAF', - 'CLUSTER', - ] - ] - bb.to_csv(args['outbins'], index=False, sep='\t') - - seg = form_seg(bb, args['diploidbaf']) - seg.to_csv(args['outsegments'], index=False, sep='\t') - - sp.log(msg='# Done\n', level='STEP') - - -def read_bb(bbfile, use_chr=True, compressed=False): - """ - Constructs arrays to represent the bin in each chromosome or arm. - If bbfile was binned around chromosome arm, then uses chromosome arms. - Otherwise, uses chromosomes. - - Returns: - botht: list of np.ndarrays of size (n_bins, n_tracks) - where n_tracks = n_samples * 2 - bb: table read from input bbfile - sample_labels: order in which samples are represented in each array in botht - chr_lables: order in which chromosomes or arms are represented in botht - - each array contains - 1 track per sample for a single chromosome arm. 
- """ - - bb = pd.read_table(bbfile) - - tracks = [] - - sample_labels = [] - populated_labels = False - - chr_labels = [] - for ch, df0 in bb.groupby(['#CHR']): - df0 = df0.sort_values('START') - - p_arrs = [] - q_arrs = [] - - for sample, df in df0.groupby('SAMPLE'): - if not populated_labels: - sample_labels.append(sample) - - gaps = np.where(df.START.to_numpy()[1:] - df.END.to_numpy()[:-1] > 0)[0] - # print(ch, gaps) - - if len(gaps) > 0: - assert len(gaps) == 1, 'Found a chromosome with >1 gaps between bins' - gap = gaps[0] + 1 - - df_p = df.iloc[:gap] - df_q = df.iloc[gap:] - - p_arrs.append(df_p.BAF.to_numpy()) - p_arrs.append(df_p.RD.to_numpy()) - - q_arrs.append(df_q.BAF.to_numpy()) - q_arrs.append(df_q.RD.to_numpy()) - else: - df_p = df - p_arrs.append(df_p.BAF.to_numpy()) - p_arrs.append(df_p.RD.to_numpy()) - - if len(q_arrs) > 0: - tracks.append(np.array(p_arrs)) - chr_labels.append(str(ch) + '_p') - - tracks.append(np.array(q_arrs)) - chr_labels.append(str(ch) + '_q') - else: - tracks.append(np.array(p_arrs)) - chr_labels.append(str(ch) + '_p') - - populated_labels = True - - return ( - tracks, - bb.sort_values(by=['#CHR', 'START', 'SAMPLE']), - sample_labels, - chr_labels, - ) - - -def hmm_model_select( - tracks, - minK=20, - maxK=50, - tau=10e-6, - tmat='diag', - decode_alg='viterbi', - covar='diag', - seed=0, -): - assert tmat in ['fixed', 'diag', 'free'] - assert decode_alg in ['map', 'viterbi'] - - # format input - tracks = [a for a in tracks if a.shape[0] > 0 and a.shape[1] > 0] - if len(tracks) > 1: - X = np.concatenate(tracks, axis=1).T - lengths = [a.shape[1] for a in tracks] - else: - X = tracks[0].T - lengths = [tracks[0].shape[1]] - - best_K = 0 - best_score = 0 - best_model = None - best_labels = None - - scaler = StandardScaler() - X_scaled = scaler.fit_transform(X) - C = squareform(pdist(X_scaled)) - - rs = {} - for K in range(minK, maxK + 1): - # print(K, datetime.now()) - # construct initial transition matrix - A = make_transmat(1 - tau, K) - - if tmat == 'fixed': - model = hmm.GaussianHMM( - n_components=K, - init_params='mc', - params='smc', - covariance_type=covar, - random_state=0, - ) - elif tmat == 'free': - model = hmm.GaussianHMM( - n_components=K, - init_params='mc', - params='smct', - covariance_type=covar, - random_state=0, - ) - else: - model = DiagGHMM( - n_components=K, - init_params='mc', - params='smct', - covariance_type=covar, - random_state=0, - ) - - model.startprob_ = np.ones(K) / K - model.transmat_ = A - model.fit(X, lengths) - - prob, labels = model.decode(X, lengths, algorithm=decode_alg) - score = silhouette_score(C, labels, metric='precomputed') - - rs[K] = prob, score, labels - if score > best_score: - best_score = score - best_model = model - best_labels = labels - best_K = K - - return best_score, best_model, best_labels, best_K, rs - - -class DiagGHMM(hmm.GaussianHMM): - def _accumulate_sufficient_statistics(self, stats, obs, framelogprob, posteriors, fwdlattice, bwdlattice): - super()._accumulate_sufficient_statistics(stats, obs, framelogprob, posteriors, fwdlattice, bwdlattice) - - if 't' in self.params: - # for each ij, recover sum_t xi_ij from the inferred transition matrix - bothlattice = fwdlattice + bwdlattice - loggamma = (bothlattice.T - logsumexp(bothlattice, axis=1)).T - - # denominator for each ij is the sum of gammas over i - denoms = np.sum(np.exp(loggamma), axis=0) - # transpose to perform row-wise multiplication - stats['denoms'] = denoms - - def _do_mstep(self, stats): - super()._do_mstep(stats) - if 't' in 
self.params: - - denoms = stats['denoms'] - x = (self.transmat_.T * denoms).T - - # numerator is the sum of ii elements - num = np.sum(np.diag(x)) - # denominator is the sum of all elements - denom = np.sum(x) - - # (this is the same as sum_i gamma_i) - # assert np.isclose(denom, np.sum(denoms)) - - stats['diag'] = num / denom - # print(num.shape) - # print(denom.shape) - - self.transmat_ = self.form_transition_matrix(stats['diag']) - - def form_transition_matrix(self, diag): - tol = 1e-10 - diag = np.clip(diag, tol, 1 - tol) - - offdiag = (1 - diag) / (self.n_components - 1) - transmat_ = np.diag([diag - offdiag] * self.n_components) - transmat_ += offdiag - # assert np.all(transmat_ > 0), (diag, offdiag, transmat_) - return transmat_ - - -def make_transmat(diag, K): - offdiag = (1 - diag) / (K - 1) - transmat_ = np.diag([diag - offdiag] * K) - transmat_ += offdiag - return transmat_ - - -def reindex(labels): - """ - Given a list of labels, reindex them as integers from 1 to n_labels - Also orders them in nonincreasing order of prevalence - """ - old2new = {} - j = 1 - for i, _ in Counter(labels).most_common(): - old2new[i] = j - j += 1 - old2newf = lambda x: old2new[x] - - return [old2newf(a) for a in labels] - - -def form_seg(bbc, balanced_threshold): - segments = [] - for (key, sample), df in bbc.groupby(['CLUSTER', 'SAMPLE']): - nbins = len(df) - rd = df.RD.mean() - nsnps = df['#SNPS'].sum() - cov = df.COV.mean() - a = np.sum(np.minimum(df.ALPHA, df.BETA)) - b = np.sum(np.maximum(df.ALPHA, df.BETA)) - baf = a / (a + b) - baf = baf if (0.5 - baf) > balanced_threshold else 0.5 - segments.append([key, sample, nbins, rd, nsnps, cov, a, b, baf]) - seg = pd.DataFrame( - segments, - columns=[ - '#ID', - 'SAMPLE', - '#BINS', - 'RD', - '#SNPS', - 'COV', - 'ALPHA', - 'BETA', - 'BAF', - ], - ) - return seg - - -if __name__ == '__main__': - main() diff --git a/src/hatchet/utils/commands.py b/src/hatchet/utils/commands.py index 727058aa..3da79dc5 100644 --- a/src/hatchet/utils/commands.py +++ b/src/hatchet/utils/commands.py @@ -14,7 +14,7 @@ 'check', 'download-panel', 'phase-snps', - 'cluster-bins-loc', + 'cluster-bins-gmm', 'plot-cn-1d2d', 'plot-bins-1d2d', ) @@ -26,7 +26,7 @@ 'SNPCaller': 'genotype-snps', 'deBAF': 'count-alleles', 'comBBo': 'combine-counts-fw', - 'cluBB': 'cluster-bins', + 'cluBB': 'cluster-bins-gmm', 'BBot': 'plot-bins', 'solve': 'compute-cn', 'BBeval': 'plot-cn', diff --git a/src/hatchet/utils/run.py b/src/hatchet/utils/run.py index 5789bc67..5344e795 100644 --- a/src/hatchet/utils/run.py +++ b/src/hatchet/utils/run.py @@ -11,8 +11,8 @@ from hatchet.utils.count_alleles import main as count_alleles from hatchet.utils.combine_counts import main as combine_counts from hatchet.utils.combine_counts_fw import main as combine_counts_fw +from hatchet.utils.cluster_bins_gmm import main as cluster_bins_gmm from hatchet.utils.cluster_bins import main as cluster_bins -from hatchet.utils.cluster_bins_loc import main as loc_clust from hatchet.utils.plot_bins import main as plot_bins from hatchet.utils.plot_bins_1d2d import main as plot_bins_1d2d from hatchet.bin.HATCHet import main as hatchet_main @@ -301,7 +301,7 @@ def main(args=None): os.makedirs(f'{output}/bbc', exist_ok=True) if config.run.loc_clust: - loc_clust( + cluster_bins( args=[ f'{output}/bb/bulk.bb', '-o', @@ -311,7 +311,7 @@ def main(args=None): ] ) else: - cluster_bins( + cluster_bins_gmm( args=[ f'{output}/bb/bulk.bb', '-o', diff --git a/tests/test_steps.py b/tests/test_steps.py index 7af2c568..afd103c4 100644 --- 
a/tests/test_steps.py +++ b/tests/test_steps.py @@ -17,7 +17,7 @@ from hatchet.utils.genotype_snps import main as genotype_snps from hatchet.utils.count_alleles import counting from hatchet.utils.combine_counts_fw import main as combine_counts -from hatchet.utils.cluster_bins import main as cluster_bins +from hatchet.utils.cluster_bins_gmm import main as cluster_bins_gmm from hatchet.utils.plot_bins import main as plot_bins from hatchet.bin.HATCHet import main as main from hatchet.utils.plot_cn import main as plot_cn @@ -229,8 +229,8 @@ def test_combine_counts(output_folder): assert out == f.read() -def test_cluster_bins(output_folder): - cluster_bins( +def test_cluster_bins_gmm(output_folder): + cluster_bins_gmm( args=[ f'{this_dir}/data/fw/bb/bulk.bb', '-o', diff --git a/tests/test_steps_vw.py b/tests/test_steps_vw.py index 0b0a5135..56470300 100644 --- a/tests/test_steps_vw.py +++ b/tests/test_steps_vw.py @@ -10,7 +10,7 @@ from hatchet import config from hatchet.utils.count_reads import main as count_reads -from hatchet.utils.cluster_bins_loc import main as cluster_bins_loc +from hatchet.utils.cluster_bins import main as cluster_bins from hatchet.utils.combine_counts import main as combine_counts this_dir = os.path.dirname(__file__) @@ -132,8 +132,8 @@ def test_combine_counts(_, output_folder): assert_frame_equal(df3, df4) -def test_cluster_bins_loc(output_folder): - cluster_bins_loc( +def test_cluster_bins(output_folder): + cluster_bins( args=[ f'{this_dir}/data/fw/bb/bulk.bb', '-o', @@ -162,8 +162,8 @@ def test_cluster_bins_loc(output_folder): assert_frame_equal(bbc1, bbc2) -def test_cluster_bins_loc_singleton(output_folder): - cluster_bins_loc( +def test_cluster_bins_singleton(output_folder): + cluster_bins( args=[ f'{this_dir}/data/fw/bb/bulk.bb', '-o',