diff --git a/docs/source/conf.py b/docs/source/conf.py index 6c8991f..bc83afd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,4 +1,3 @@ - # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py index b345d14..614a3d8 100644 --- a/edgePy/DGEList.py +++ b/edgePy/DGEList.py @@ -360,7 +360,7 @@ def rpkm( def get_gene_mask_and_lengths(self, gene_data): """ - use gene_data to get the gene lenths and a gene mask for the tranformation. + use gene_data to get the gene lengths and a gene mask for the tranformation. Args: gene_data: the object that holds gene data from ensembl @@ -400,15 +400,16 @@ def get_gene_mask_and_lengths(self, gene_data): def tpm( self, - gene_lengths: np.ndarray, + gene_data: CanonicalDataStore, transform_to_log: bool = False, prior_count: float = PRIOR_COUNT, - mean_fragment_lengths: np.ndarray = None, ) -> "DGEList": """Normalize the DGEList to transcripts per million. - Adapted from Wagner, et al. 'Measurement of mRNA abundance using RNA-seq data: - RPKM measure is inconsistent among samples.' doi:10.1007/s12064-012-0162-3 + Adapted from Li et al. "RNA-Seq gene expression estimation with read mapping uncertainty." + Bioinformatics, Volume 26, Issue 4, 15 February 2010, Pages 493–500, + https://doi.org/10.1093/bioinformatics/btp692 + Read counts :math:`X_i` (for each gene :math:`i` with gene length :math:`\widetilde{l_j}` ) are normalized as follows: @@ -419,32 +420,51 @@ def tpm( \\left(\\frac{1}{\sum_j \\frac{X_j}{\widetilde{l_j}}}\\right) \cdot 10^6 Args: - gene_lengths: 1D array of gene lengths for each gene in the rows of `DGEList.counts`. - transform_to_log: store log outputs - prior_count: - mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts` - (optional) + gene_data: An object that works to import Ensembl based data, for use in calculations + transform_to_log: true, if you wish to convert to log after converting to TPM + prior_count: a minimum value for genes, if you do log transforms. """ + current_log = self.current_log_status - # compute effective length not allowing negative lengths - if mean_fragment_lengths: - effective_lengths = ( - gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :] - ).clip(min=1) - else: - effective_lengths = gene_lengths[:, np.newaxis] + # checks current log status and converts if currently in log format + if self.current_log_status: + self.counts = np.exp(self.counts) + current_log = False - # how many counts per base - base_counts = self.counts / effective_lengths + gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) + + genes = self.genes[gene_mask].copy() + + rates = self.get_rates(gene_data) + rate_sum = np.sum(rates) + + # calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths + counts = (rates / rate_sum) * 1e6 - counts = 1e6 * base_counts / np.sum(base_counts, axis=0)[np.newaxis, :] - current_log = self.current_log_status if transform_to_log: counts = self.log_transform(counts, prior_count) current_log = True - return self.copy(counts=counts, current_log=current_log) + return self.copy(counts=counts, current_log=current_log, genes=genes) + + def get_rates(self, gene_data: CanonicalDataStore) -> "DGEList": + """Gets the number of counts per base, otherwise known as the rate, for all genes. + Currently used for TPM calculations. + + Args: + gene_data: An object that works to import Ensembl based data, for use in calculations + """ + + gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data) + counts = self.counts[gene_mask].copy() + + # calculates counts per base (ie, the rate) + rates = (counts.T / gene_len_ordered).T + + rates = rates / 1e3 + + return rates def __repr__(self) -> str: """Give a pretty non-executeable representation of this object.""" diff --git a/tests/mongodb/test_gene_functions.py b/tests/mongodb/test_gene_functions.py index 9a41aa2..a667a22 100644 --- a/tests/mongodb/test_gene_functions.py +++ b/tests/mongodb/test_gene_functions.py @@ -16,28 +16,34 @@ "size": 720, "canonical": "0", "exons": { - "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866}, - "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, + "ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6}, + "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6}, }, - "rpkm": 0.39507510837872767, + "rpkm": 0.395_075_108_378_727_67, }, "ENST00000576696": { "size": 1306, "canonical": "0", "exons": { - "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, - "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875}, + "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6}, + "ENSE00002672617": { + "raw": 5.564_356_435_643_564, + "rpkm": 0.160_022_655_238_508_75, + }, }, - "rpkm": 0.195204453741728, + "rpkm": 0.195_204_453_741_728, }, "ENST00000443778": { "size": 2084, "canonical": "1", "exons": { - "ENSE00001729822": {"raw": 2, "rpkm": 0.0997865579744511}, - "ENSE00001608298": {"raw": 1.1777177717771776, "rpkm": 0.22669418591124607}, + "ENSE00001729822": {"raw": 2, "rpkm": 0.099_786_557_974_451_1}, + "ENSE00001608298": { + "raw": 1.177_717_771_777_177_6, + "rpkm": 0.226_694_185_911_246_07, + }, }, - "rpkm": 0.05165702955135873, + "rpkm": 0.051_657_029_551_358_73, }, }, "star_rpkm": None, @@ -53,19 +59,22 @@ "size": 720, "canonical": "0", "exons": { - "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866}, - "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, + "ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6}, + "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6}, }, - "rpkm": 0.39507510837872767, + "rpkm": 0.395_075_108_378_727_67, }, "ENST00000576696": { "size": 1306, "canonical": "0", "exons": { - "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676}, - "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875}, + "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6}, + "ENSE00002672617": { + "raw": 5.564_356_435_643_564, + "rpkm": 0.160_022_655_238_508_75, + }, }, - "rpkm": 0.195204453741728, + "rpkm": 0.195_204_453_741_728, }, }, "star_rpkm": None, @@ -137,7 +146,7 @@ def test_get_sample_details(mongodb): def test_get_canonical_rpkm(): rpkm = get_canonical_rpkm(RNASeq_RECORD) - assert rpkm == 0.05165702955135873 + assert rpkm == 0.051_657_029_551_358_73 def test_get_canonical_rpkm_no_canonical(): diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py index bf208bc..bb3e00a 100644 --- a/tests/test_DGEList.py +++ b/tests/test_DGEList.py @@ -40,7 +40,6 @@ def test_sample_group_list(): def test_minimal_init(): - dge_list = DGEList( to_remove_zeroes=False, counts=np.ones(shape=(5, 5)), @@ -92,7 +91,6 @@ def test_library_size(): def test_setting_DGElist_counts(): - dge_list = DGEList( counts=np.zeros(shape=(5, 10)), groups_in_list=['A', 'A', 'B', 'B', 'B'], @@ -116,7 +114,6 @@ def test_setting_DGElist_counts(): def test_cycle_dge_npz(): - import tempfile import os @@ -214,34 +211,42 @@ def test_rpkm(): assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6))) -def test_tpm(): - # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w - counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]]) - gene_lengths = np.array([2000, 4000, 1000, 10000]) - - expected = np.array( - [ - [333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26], - [333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26], - [333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26], - [0.0, 0.0, 2217.294_900_22], - ] +def test_get_rates(): + dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) + icd = CanonicalDataStore( + get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS) ) + first_pos = dge_list.counts[0][0] + first_gene = dge_list.genes[0] - dge_list = DGEList( - counts=counts, - samples=np.array(['a', 'b', 'c']), - genes=np.array(['a', 'b', 'c', 'd']), - groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']}, + assert isinstance(first_pos, np.integer) + ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene)) + gene_len = icd.get_length_of_canonical_transcript(ensg_gene) + + rates = dge_list.get_rates(icd) + + assert rates[0][0] == first_pos / gene_len + + +def test_tpm(): + dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ))) + icd = CanonicalDataStore( + get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS) ) - assert isinstance(dge_list.counts[0][0], np.integer) - new_dge_list = dge_list.tpm(gene_lengths) + first_pos = dge_list.counts[0][0] + first_gene = dge_list.genes[0] + + rates = dge_list.get_rates(icd) + + rate_sum = np.sum(rates) - assert np.allclose(new_dge_list.counts, expected, atol=1e-1) + assert isinstance(first_pos, np.integer) + tpm_dge = dge_list.tpm(icd) + ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene)) + gene_len = icd.get_length_of_canonical_transcript(ensg_gene) - # make sure that the sums of all genes across are the same the each sample (an important property of TPM) - gene_sums = new_dge_list.counts.sum(axis=0) - assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums)) + # TPM = countsPerBase / sum[countsPerBase]) * 10^6 + assert tpm_dge.counts[0][0] == ((first_pos / gene_len) / rate_sum) * 1e6 # Unit tests for ``edgePy.data_import.Importer``.\