Skip to content
This repository was archived by the owner on Jan 23, 2025. It is now read-only.

Fix norm fn sigs #80

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
Expand Down
64 changes: 42 additions & 22 deletions edgePy/DGEList.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ def rpkm(
def get_gene_mask_and_lengths(self, gene_data):

"""
use gene_data to get the gene lenths and a gene mask for the tranformation.
use gene_data to get the gene lengths and a gene mask for the tranformation.
Args:
gene_data: the object that holds gene data from ensembl

Expand Down Expand Up @@ -400,15 +400,16 @@ def get_gene_mask_and_lengths(self, gene_data):

def tpm(
self,
gene_lengths: np.ndarray,
gene_data: CanonicalDataStore,
transform_to_log: bool = False,
prior_count: float = PRIOR_COUNT,
mean_fragment_lengths: np.ndarray = None,
) -> "DGEList":
"""Normalize the DGEList to transcripts per million.

Adapted from Wagner, et al. 'Measurement of mRNA abundance using RNA-seq data:
RPKM measure is inconsistent among samples.' doi:10.1007/s12064-012-0162-3
Adapted from Li et al. "RNA-Seq gene expression estimation with read mapping uncertainty."
Bioinformatics, Volume 26, Issue 4, 15 February 2010, Pages 493–500,
https://doi.org/10.1093/bioinformatics/btp692


Read counts :math:`X_i` (for each gene :math:`i` with gene length :math:`\widetilde{l_j}` )
are normalized as follows:
Expand All @@ -419,32 +420,51 @@ def tpm(
\\left(\\frac{1}{\sum_j \\frac{X_j}{\widetilde{l_j}}}\\right) \cdot 10^6

Args:
gene_lengths: 1D array of gene lengths for each gene in the rows of `DGEList.counts`.
transform_to_log: store log outputs
prior_count:
mean_fragment_lengths: 1D array of mean fragment lengths for each sample in the columns of `DGEList.counts`
(optional)
gene_data: An object that works to import Ensembl based data, for use in calculations
transform_to_log: true, if you wish to convert to log after converting to TPM
prior_count: a minimum value for genes, if you do log transforms.

"""
current_log = self.current_log_status

# compute effective length not allowing negative lengths
if mean_fragment_lengths:
effective_lengths = (
gene_lengths[:, np.newaxis] - mean_fragment_lengths[np.newaxis, :]
).clip(min=1)
else:
effective_lengths = gene_lengths[:, np.newaxis]
# checks current log status and converts if currently in log format
if self.current_log_status:
self.counts = np.exp(self.counts)
current_log = False

# how many counts per base
base_counts = self.counts / effective_lengths
gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)

genes = self.genes[gene_mask].copy()

rates = self.get_rates(gene_data)
rate_sum = np.sum(rates)

# calculates tpm and adds kilobase conversion from get_gene_mask_and_lengths
counts = (rates / rate_sum) * 1e6

counts = 1e6 * base_counts / np.sum(base_counts, axis=0)[np.newaxis, :]
current_log = self.current_log_status
if transform_to_log:
counts = self.log_transform(counts, prior_count)
current_log = True

return self.copy(counts=counts, current_log=current_log)
return self.copy(counts=counts, current_log=current_log, genes=genes)

def get_rates(self, gene_data: CanonicalDataStore) -> "DGEList":
"""Gets the number of counts per base, otherwise known as the rate, for all genes.
Currently used for TPM calculations.

Args:
gene_data: An object that works to import Ensembl based data, for use in calculations
"""

gene_len_ordered, gene_mask = self.get_gene_mask_and_lengths(gene_data)
counts = self.counts[gene_mask].copy()

# calculates counts per base (ie, the rate)
rates = (counts.T / gene_len_ordered).T

rates = rates / 1e3

return rates

def __repr__(self) -> str:
"""Give a pretty non-executeable representation of this object."""
Expand Down
41 changes: 25 additions & 16 deletions tests/mongodb/test_gene_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,34 @@
"size": 720,
"canonical": "0",
"exons": {
"ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866},
"ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
"ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6},
"ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
},
"rpkm": 0.39507510837872767,
"rpkm": 0.395_075_108_378_727_67,
},
"ENST00000576696": {
"size": 1306,
"canonical": "0",
"exons": {
"ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
"ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875},
"ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
"ENSE00002672617": {
"raw": 5.564_356_435_643_564,
"rpkm": 0.160_022_655_238_508_75,
},
},
"rpkm": 0.195204453741728,
"rpkm": 0.195_204_453_741_728,
},
"ENST00000443778": {
"size": 2084,
"canonical": "1",
"exons": {
"ENSE00001729822": {"raw": 2, "rpkm": 0.0997865579744511},
"ENSE00001608298": {"raw": 1.1777177717771776, "rpkm": 0.22669418591124607},
"ENSE00001729822": {"raw": 2, "rpkm": 0.099_786_557_974_451_1},
"ENSE00001608298": {
"raw": 1.177_717_771_777_177_6,
"rpkm": 0.226_694_185_911_246_07,
},
},
"rpkm": 0.05165702955135873,
"rpkm": 0.051_657_029_551_358_73,
},
},
"star_rpkm": None,
Expand All @@ -53,19 +59,22 @@
"size": 720,
"canonical": "0",
"exons": {
"ENSE00002642039": {"raw": 6.435643564356435, "rpkm": 0.3682833603326866},
"ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
"ENSE00002642039": {"raw": 6.435_643_564_356_435, "rpkm": 0.368_283_360_332_686_6},
"ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
},
"rpkm": 0.39507510837872767,
"rpkm": 0.395_075_108_378_727_67,
},
"ENST00000576696": {
"size": 1306,
"canonical": "0",
"exons": {
"ENSE00002663544": {"raw": 1.960896089608961, "rpkm": 0.5189869430916676},
"ENSE00002672617": {"raw": 5.564356435643564, "rpkm": 0.16002265523850875},
"ENSE00002663544": {"raw": 1.960_896_089_608_961, "rpkm": 0.518_986_943_091_667_6},
"ENSE00002672617": {
"raw": 5.564_356_435_643_564,
"rpkm": 0.160_022_655_238_508_75,
},
},
"rpkm": 0.195204453741728,
"rpkm": 0.195_204_453_741_728,
},
},
"star_rpkm": None,
Expand Down Expand Up @@ -137,7 +146,7 @@ def test_get_sample_details(mongodb):

def test_get_canonical_rpkm():
rpkm = get_canonical_rpkm(RNASeq_RECORD)
assert rpkm == 0.05165702955135873
assert rpkm == 0.051_657_029_551_358_73


def test_get_canonical_rpkm_no_canonical():
Expand Down
57 changes: 31 additions & 26 deletions tests/test_DGEList.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ def test_sample_group_list():


def test_minimal_init():

dge_list = DGEList(
to_remove_zeroes=False,
counts=np.ones(shape=(5, 5)),
Expand Down Expand Up @@ -92,7 +91,6 @@ def test_library_size():


def test_setting_DGElist_counts():

dge_list = DGEList(
counts=np.zeros(shape=(5, 10)),
groups_in_list=['A', 'A', 'B', 'B', 'B'],
Expand All @@ -116,7 +114,6 @@ def test_setting_DGElist_counts():


def test_cycle_dge_npz():

import tempfile
import os

Expand Down Expand Up @@ -214,34 +211,42 @@ def test_rpkm():
assert rpm_dge.counts[0][0] == (first_pos / ((gene_len / 1e3) * (col_sum[0] / 1e6)))


def test_tpm():
# example hand calculated as in https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w
counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]])
gene_lengths = np.array([2000, 4000, 1000, 10000])

expected = np.array(
[
[333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26],
[333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26],
[333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26],
[0.0, 0.0, 2217.294_900_22],
]
def test_get_rates():
dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
icd = CanonicalDataStore(
get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS)
)
first_pos = dge_list.counts[0][0]
first_gene = dge_list.genes[0]

dge_list = DGEList(
counts=counts,
samples=np.array(['a', 'b', 'c']),
genes=np.array(['a', 'b', 'c', 'd']),
groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']},
assert isinstance(first_pos, np.integer)
ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is similar to how I designed the the test_tpm in the first run. Here is the comment by clintval to that:
2276d19#r224543989

That's the reason why I chose a very simple table that is manually calculatable

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess I'm a bit confused. Is this in reference to the comments about using random values? This test shouldn't be using anything random; it should be pulling the first entry from the test data, similar to what the RPKM test is doing.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The link I posted is not working as I expected! I'll copy the comment here:
clintval: "Not bad, you are dynamically creating an expected result by copying some code from the main application.

This makes it hard to track down bugs with the logic since the logic exists in two places.

Norm. methods should probably be tested with a very small, hand-designed test case so you can explicitly write out all inputs, and all outputs."

This is why I didn't implement the test_tpm function like the test_rpkm function in the end. As a consequence also test_rpkm shouldn't be implemented as is at the moment..

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, that makes much more sense. In that case, it may make sense to us apfejes' RPKM function sig and your test function as standards. It seems reasonable but probably worth discussing (unless it's already been discussed before I arrived).

Would it be worth it to possibly create a minimal test data set and run it through edgeR to match the edgeR outputs to ours? That's probably a larger discussion, but known and verified 3rd-party outputs may make sense for test cases.

Copy link
Member

@clintval clintval Dec 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider too, an additional layer of testing as property-based for these mathematical calculations:

https://hypothesis.readthedocs.io/en/latest/quickstart.html#writing-tests

gene_len = icd.get_length_of_canonical_transcript(ensg_gene)

rates = dge_list.get_rates(icd)

assert rates[0][0] == first_pos / gene_len


def test_tpm():
dge_list = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
icd = CanonicalDataStore(
get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS)
)
assert isinstance(dge_list.counts[0][0], np.integer)
new_dge_list = dge_list.tpm(gene_lengths)
first_pos = dge_list.counts[0][0]
first_gene = dge_list.genes[0]

rates = dge_list.get_rates(icd)

rate_sum = np.sum(rates)

assert np.allclose(new_dge_list.counts, expected, atol=1e-1)
assert isinstance(first_pos, np.integer)
tpm_dge = dge_list.tpm(icd)
ensg_gene = icd.pick_gene_id(icd.get_genes_from_symbol(first_gene))
gene_len = icd.get_length_of_canonical_transcript(ensg_gene)

# make sure that the sums of all genes across are the same the each sample (an important property of TPM)
gene_sums = new_dge_list.counts.sum(axis=0)
assert np.allclose(gene_sums, [gene_sums[0]] * len(gene_sums))
# TPM = countsPerBase / sum[countsPerBase]) * 10^6
assert tpm_dge.counts[0][0] == ((first_pos / gene_len) / rate_sum) * 1e6


# Unit tests for ``edgePy.data_import.Importer``.\
Expand Down