r-bioinformatics · zbohannan · Nov 20, 2018 · Nov 24, 2018 · Dec 1, 2018 · Dec 11, 2018
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -1,4 +1,3 @@
-
 # -*- coding: utf-8 -*-
 #
 # Configuration file for the Sphinx documentation builder.

diff --git a/edgePy/DGEList.py b/edgePy/DGEList.py
@@ -360,7 +360,7 @@ def rpkm(
     def get_gene_mask_and_lengths(self, gene_data):
 
         """
-        use gene_data to get the gene lenths and a gene mask for the tranformation.
+        use gene_data to get the gene lengths and a gene mask for the tranformation.
         Args:
             gene_data: the object that holds gene data from ensembl
 
@@ -400,15 +400,16 @@ def get_gene_mask_and_lengths(self, gene_data):
 
     def tpm(
         self,
-        gene_lengths: np.ndarray,
+        gene_data: CanonicalDataStore,
         transform_to_log: bool = False,
         prior_count: float = PRIOR_COUNT,
-        mean_fragment_lengths: np.ndarray = None,
     ) -> "DGEList":
         """Normalize the DGEList to transcripts per million.
 
-        Adapted from Wagner, et al. 'Measurement of mRNA abundance using RNA-seq data:
-        RPKM measure is inconsistent among samples.' doi:10.1007/s12064-012-0162-3
+        Adapted from Li et al. "RNA-Seq gene expression estimation with read mapping uncertainty."
+        Bioinformatics, Volume 26, Issue 4, 15 February 2010, Pages 493–500,
+        https://doi.org/10.1093/bioinformatics/btp692
+
 
         Read counts :math:`X_i` (for each gene :math:`i` with gene length :math:`\widetilde{l_j}` )
         are normalized as follows:
@@ -419,32 +420,51 @@ def tpm(
            \\left(\\frac{1}{\sum_j \\frac{X_j}{\widetilde{l_j}}}\\right) \cdot 10^6
 
         Args:
-            gene_lengths: 1D array of gene lengths for each gene in the rows of `DGEList.counts`.
-            transform_to_log: store log outputs
-            prior_count:
-            mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts`
-                (optional)
+            gene_data: An object that works to import Ensembl based data, for use in calculations
+            transform_to_log: true, if you wish to convert to log after converting to TPM
+            prior_count: a minimum value for genes, if you do log transforms.
 
         """
+        current_log = self.current_log_status
 
-        # compute effective length not allowing negative lengths
-        if mean_fragment_lengths:
-            effective_lengths = (
-                gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :]
-            ).clip(min=1)
-        else:
-            effective_lengths = gene_lengths[:, np.newaxis]
+        # checks current log status and converts if currently in log format
+        if self.current_log_status:
+            self.counts = np.exp(self.counts)
+            current_log = False
 
-        # how many counts per base
-        base_counts = self.counts / effective_lengths
+        gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
+
+        genes = self.genes[gene_mask].copy()
+
+        rates = self.get_rates(gene_data)
+        rate_sum = np.sum(rates)
+
+        # calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths
+        counts = (rates / rate_sum) * 1e6
 
-        counts = 1e6 * base_counts / np.sum(base_counts, axis=0)[np.newaxis, :]
-        current_log = self.current_log_status
         if transform_to_log:
             counts = self.log_transform(counts, prior_count)
             current_log = True
 
-        return self.copy(counts=counts, current_log=current_log)
+        return self.copy(counts=counts, current_log=current_log, genes=genes)
+
+    def get_rates(self, gene_data: CanonicalDataStore) -> "DGEList":
+        """Gets the number of counts per base, otherwise known as the rate, for all genes.
+        Currently used for TPM calculations.
+
+        Args:
+            gene_data: An object that works to import Ensembl based data, for use in calculations
+        """
+
+        gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
+        counts = self.counts[gene_mask].copy()
+
+        # calculates counts per base (ie, the rate)
+        rates = (counts.T / gene_len_ordered).T
+
+        rates = rates / 1e3
+
+        return rates
 
     def __repr__(self) -> str:
         """Give a pretty non-executeable representation of this object."""

diff --git a/tests/mongodb/test_gene_functions.py b/tests/mongodb/test_gene_functions.py
@@ -16,28 +16,34 @@
             "size": 720,
             "canonical": "0",
             "exons": {
-                "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866},
-                "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
+                "ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6},
+                "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
             },
-            "rpkm": 0.39507510837872767,
+            "rpkm": 0.395_075_108_378_727_67,
         },
         "ENST00000576696": {
             "size": 1306,
             "canonical": "0",
             "exons": {
-                "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
-                "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875},
+                "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
+                "ENSE00002672617": {
+                    "raw": 5.564_356_435_643_564,
+                    "rpkm": 0.160_022_655_238_508_75,
+                },
             },
-            "rpkm": 0.195204453741728,
+            "rpkm": 0.195_204_453_741_728,
         },
         "ENST00000443778": {
             "size": 2084,
             "canonical": "1",
             "exons": {
-                "ENSE00001729822": {"raw": 2, "rpkm": 0.0997865579744511},
-                "ENSE00001608298": {"raw": 1.1777177717771776, "rpkm": 0.22669418591124607},
+                "ENSE00001729822": {"raw": 2, "rpkm": 0.099_786_557_974_451_1},
+                "ENSE00001608298": {
+                    "raw": 1.177_717_771_777_177_6,
+                    "rpkm": 0.226_694_185_911_246_07,
+                },
             },
-            "rpkm": 0.05165702955135873,
+            "rpkm": 0.051_657_029_551_358_73,
         },
     },
     "star_rpkm": None,
@@ -53,19 +59,22 @@
             "size": 720,
             "canonical": "0",
             "exons": {
-                "ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866},
-                "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
+                "ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6},
+                "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
             },
-            "rpkm": 0.39507510837872767,
+            "rpkm": 0.395_075_108_378_727_67,
         },
         "ENST00000576696": {
             "size": 1306,
             "canonical": "0",
             "exons": {
-                "ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
-                "ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875},
+                "ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
+                "ENSE00002672617": {
+                    "raw": 5.564_356_435_643_564,
+                    "rpkm": 0.160_022_655_238_508_75,
+                },
             },
-            "rpkm": 0.195204453741728,
+            "rpkm": 0.195_204_453_741_728,
         },
     },
     "star_rpkm": None,
@@ -137,7 +146,7 @@ def test_get_sample_details(mongodb):
 
 def test_get_canonical_rpkm():
     rpkm = get_canonical_rpkm(RNASeq_RECORD)
-    assert rpkm == 0.05165702955135873
+    assert rpkm == 0.051_657_029_551_358_73
 
 
 def test_get_canonical_rpkm_no_canonical():

diff --git a/tests/test_DGEList.py b/tests/test_DGEList.py
@@ -40,7 +40,6 @@ def test_sample_group_list():
 
 
 def test_minimal_init():
-
     dge_list = DGEList(
         to_remove_zeroes=False,
         counts=np.ones(shape=(5, 5)),
@@ -92,7 +91,6 @@ def test_library_size():
 
 
 def test_setting_DGElist_counts():
-
     dge_list = DGEList(
         counts=np.zeros(shape=(5, 10)),
         groups_in_list=['A', 'A', 'B', 'B', 'B'],
@@ -116,7 +114,6 @@ def test_setting_DGElist_counts():
 
 
 def test_cycle_dge_npz():
-
     import tempfile
     import os
 
@@ -214,34 +211,42 @@ def test_rpkm():
     assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6)))
 
 
-def test_tpm():
-    # example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w
-    counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]])
-    gene_lengths = np.array([2000, 4000, 1000, 10000])
-
-    expected = np.array(
-        [
-            [333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26],
-            [333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26],
-            [333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26],
-            [0.0, 0.0, 2217.294_900_22],
-        ]
+def test_get_rates():
+    dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
+    icd = CanonicalDataStore(
+        get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS)
     )
+    first_pos = dge_list.counts[0][0]
+    first_gene = dge_list.genes[0]
 
-    dge_list = DGEList(
-        counts=counts,
-        samples=np.array(['a', 'b', 'c']),
-        genes=np.array(['a', 'b', 'c', 'd']),
-        groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']},
+    assert isinstance(first_pos, np.integer)
+    ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
+    gene_len = icd.get_length_of_canonical_transcript(ensg_gene)
+
+    rates = dge_list.get_rates(icd)
+
+    assert rates[0][0] == first_pos / gene_len
+
+
+def test_tpm():
+    dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
+    icd = CanonicalDataStore(
+        get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS)
     )
-    assert isinstance(dge_list.counts[0][0], np.integer)
-    new_dge_list = dge_list.tpm(gene_lengths)
+    first_pos = dge_list.counts[0][0]
+    first_gene = dge_list.genes[0]
+
+    rates = dge_list.get_rates(icd)
+
+    rate_sum = np.sum(rates)
 
-    assert np.allclose(new_dge_list.counts, expected, atol=1e-1)
+    assert isinstance(first_pos, np.integer)
+    tpm_dge = dge_list.tpm(icd)
+    ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
+    gene_len = icd.get_length_of_canonical_transcript(ensg_gene)
 
-    # make sure that the sums of all genes across are the same the each sample (an important property of TPM)
-    gene_sums = new_dge_list.counts.sum(axis=0)
-    assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums))
+    # TPM = countsPerBase / sum[countsPerBase]) * 10^6
+    assert tpm_dge.counts[0][0] == ((first_pos / gene_len) / rate_sum) * 1e6
 
 
 # Unit tests for ``edgePy.data_import.Importer``.\