57 changes: 37 additions & 20 deletions .github/workflows/cloud_tests_full.yml
@@ -13,33 +13,50 @@ on:
- aws
- azure
- gcp
pull_request_review:
types: [submitted]
jobs:
run-full-tests-on-aws:
if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' || !github.event.inputs }}
if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' || !github.event.inputs || (github.repository == 'nf-core/rnaseq' && github.event.review.state == 'approved' && (github.event.pull_request.base.ref == 'master' || github.event.pull_request.base.ref == 'main')) }}
runs-on: ubuntu-latest
strategy:
matrix:
aligner: ["star_salmon", "star_rsem"]
arch: ["arm64", "x86_64"]
steps:
- uses: seqeralabs/action-tower-launch@v2
- name: Set revision variable
id: revision
run: |
echo "revision=${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'release') && github.sha || 'dev' }}" >> "$GITHUB_OUTPUT"
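          # workflow_dispatch and release runs test the triggering commit; PR-approval runs fall back to the 'dev' branch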

- name: Validate parameters
run: |
echo "Workspace ID: ${{ vars.TOWER_WORKSPACE_ID }}"
echo "Compute Env: ${{ matrix.arch == 'arm64' && vars.TOWER_COMPUTE_ENV_ARM || vars.TOWER_COMPUTE_ENV_CPU }}"
echo "Access token present: ${{ secrets.TOWER_ACCESS_TOKEN != '' }}"
- name: Launch workflow via Seqera Platform
uses: seqeralabs/action-tower-launch@refactor-javascript
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
workspace_id: ${{ vars.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_CE_AWS_CPU }}
workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/rnaseq/work-${{ github.sha }}"
run_name: "aws_rnaseq_full_${{ matrix.aligner }}"
revision: ${{ github.sha }}
profiles: test_full_aws
compute_env: ${{ matrix.arch == 'arm64' && vars.TOWER_COMPUTE_ENV_ARM || vars.TOWER_COMPUTE_ENV_CPU }}
workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/rnaseq/work-${{ steps.revision.outputs.revision }}-${{ matrix.aligner }}-${{ matrix.arch }}"
run_name: "aws_rnaseq_full_${{ matrix.aligner }}_${{ matrix.arch }}"
revision: ${{ steps.revision.outputs.revision }}
labels: ${{ matrix.arch == 'arm64' && 'full_test,arm,wave' || 'full_test' }}
profiles: test_full_aws${{ matrix.arch == 'arm64' && ',docker_arm' || '' }}
parameters: |
{
"hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}",
"aligner": "${{ matrix.aligner }}",
"outdir": "${{ secrets.TOWER_BUCKET_AWS }}/rnaseq/results-${{ github.sha }}/aligner_${{ matrix.aligner }}/"
"outdir": "${{ secrets.TOWER_BUCKET_AWS }}/rnaseq/results-${{ steps.revision.outputs.revision }}-${{ matrix.aligner }}-${{ matrix.arch }}/"
}
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: Tower debug log file
path: tower_action_*.log
name: Seqera Platform debug log file (${{ matrix.aligner }}-${{ matrix.arch }})
path: |
seqera_platform_action_*.log
seqera_platform_action_*.json

run-full-tests-on-azure:
if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'azure' || !github.event.inputs }}
@@ -48,11 +65,11 @@ jobs:
matrix:
aligner: ["star_salmon", "star_rsem"]
steps:
- uses: seqeralabs/action-tower-launch@v2
- uses: seqeralabs/action-tower-launch@refactor-javascript
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
workspace_id: ${{ vars.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }}
compute_env: ${{ vars.TOWER_COMPUTE_ENV_CPU }}
workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/rnaseq/work-${{ github.sha }}"
run_name: "azure_rnaseq_full_${{ matrix.aligner }}"
revision: ${{ github.sha }}
@@ -64,7 +81,7 @@ jobs:
"outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/rnaseq/results-${{ github.sha }}/aligner_${{ matrix.aligner }}/",
"igenomes_base": "${{ secrets.TOWER_IGENOMES_BASE_AZURE }}"
}
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: Tower debug log file
path: tower_action_*.log
@@ -76,11 +93,11 @@ jobs:
matrix:
aligner: ["star_salmon", "star_rsem"]
steps:
- uses: seqeralabs/action-tower-launch@v2
- uses: seqeralabs/action-tower-launch@refactor-javascript
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
workspace_id: ${{ vars.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_CE_GCP_CPU }}
compute_env: ${{ vars.TOWER_COMPUTE_ENV_CPU }}
workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/rnaseq/work-${{ github.sha }}"
run_name: "gcp_rnaseq_full_${{ matrix.aligner }}"
revision: ${{ github.sha }}
@@ -91,7 +108,7 @@
"aligner": "${{ matrix.aligner }}",
"outdir": "${{ secrets.TOWER_BUCKET_GCP }}/rnaseq/results-${{ github.sha }}/aligner_${{ matrix.aligner }}/"
}
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: Tower debug log file
path: tower_action_*.log
39 changes: 39 additions & 0 deletions bin/duplication_rates.awk
@@ -0,0 +1,39 @@
#!/usr/bin/awk -f
# AWK script to calculate duplication rates from featureCounts output
# Usage: awk -f duplication_rates.awk with_dups.txt no_dups.txt > sample_dup_rates.txt
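# Expected input: default featureCounts gene-level tables, where column 1 is the gene ID,
# column 6 the gene length and column 7 the read count; the two header lines of each file
# are skipped.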

BEGIN {
OFS="\t"
print "ID", "Length", "Counts", "CountsNodup", "DupRate", "RPK", "RPKM"
}

# First pass: read with_dups file
FNR==NR && NR>2 {
with_dups[$1] = $7
lengths[$1] = $6
next
}

# Second pass: read no_dups file and calculate metrics
FNR!=NR && FNR>2 {
gene = $1
    gene_length = lengths[gene]
counts_with_dups = with_dups[gene]
counts_no_dups = $7

if (counts_with_dups > 0) {
dup_rate = (counts_with_dups - counts_no_dups) / counts_with_dups
} else {
dup_rate = 0
}

    if (gene_length > 0) {
        rpk = counts_no_dups / (gene_length / 1000)
} else {
rpk = 0
}

rpkm = rpk # Simplified - could add total reads normalization if needed
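    # Sketch (not implemented here): with a second pass to obtain the library size, full
    # RPKM would be rpkm = counts_no_dups / ((gene_length / 1000) * (total_reads / 1000000))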

    print gene, gene_length, counts_with_dups, counts_no_dups, dup_rate, rpk, rpkm
}
179 changes: 179 additions & 0 deletions bin/dupradar_fast_analysis.py
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
"""
Fast dupRadar analysis script
Replaces R/dupRadar with Python for high-performance processing
Maintains full MultiQC compatibility
"""

import sys
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so plotting works in headless/container runs
import matplotlib.pyplot as plt
from scipy.stats import linregress
import argparse
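
# Assumed inputs: two featureCounts gene-level count tables for the same BAM (with and
# without duplicate-marked reads), row-aligned by gene and carrying 'Geneid' and 'Length'
# columns; outputs reuse dupRadar-style file names (dupMatrix, intercept/slope,
# *_mqc.txt tables, PDF plots) so the MultiQC reporting downstream can pick them up.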

def main():
parser = argparse.ArgumentParser(description='Fast dupRadar analysis')
parser.add_argument('--with-dups', required=True, help='featureCounts output with duplicates')
parser.add_argument('--no-dups', required=True, help='featureCounts output without duplicates')
parser.add_argument('--prefix', required=True, help='Output prefix')
args = parser.parse_args()

# Read featureCounts outputs
with_dups = pd.read_csv(args.with_dups, sep='\t', comment='#', skiprows=1)
no_dups = pd.read_csv(args.no_dups, sep='\t', comment='#', skiprows=1)

# Extract the count columns (the last column holds the counts for the BAM file)
with_dups_counts = with_dups.iloc[:, -1]
no_dups_counts = no_dups.iloc[:, -1]

# Calculate RPK (Reads per Kilobase)
gene_lengths = with_dups['Length'] / 1000 # Convert to kb
with_dups_rpk = with_dups_counts / gene_lengths
no_dups_rpk = no_dups_counts / gene_lengths

# Calculate duplication rate
duplication_rate = np.where(with_dups_rpk > 0, 1 - (no_dups_rpk / with_dups_rpk), 0)

# Create dupMatrix
dup_matrix = pd.DataFrame({
'gene_id': with_dups['Geneid'],
'rpkm': with_dups_rpk, # Using RPK instead of RPKM for simplicity
'duplication_rate': duplication_rate
})

# Filter out genes with zero expression
dup_matrix_filtered = dup_matrix[dup_matrix['rpkm'] > 0]

# Save dupMatrix
dup_matrix_filtered.to_csv(f"{args.prefix}_dupMatrix.txt", sep='\t', index=False)

# Calculate intercept and slope
if len(dup_matrix_filtered) > 1:
log_rpkm = np.log10(dup_matrix_filtered['rpkm'])
slope, intercept, r_value, p_value, std_err = linregress(log_rpkm, dup_matrix_filtered['duplication_rate'])
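        # Note: a straight-line fit of duplication rate vs log10(RPK) is used here; the R
        # dupRadar package fits a logistic model (assumption about the original
        # implementation), so the reported intercept and slope approximate that fit.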

# Save intercept and slope
with open(f"{args.prefix}_intercept_slope.txt", 'w') as f:
f.write(f"intercept\t{intercept}\n")
f.write(f"slope\t{slope}\n")
f.write(f"r_squared\t{r_value**2}\n")

# Generate MultiQC files
with open(f"{args.prefix}_dup_intercept_mqc.txt", 'w') as f:
f.write("# plot_type: 'generalstats'\n")
f.write("# pconfig:\n")
f.write("# dupradar_intercept:\n")
f.write("# title: 'dupRadar Intercept'\n")
f.write("# description: 'Intercept of the dupRadar fitted curve'\n")
f.write("# format: '{:.3f}'\n")
f.write("Sample\tdupradar_intercept\n")
f.write(f"{args.prefix}\t{intercept:.3f}\n")

with open(f"{args.prefix}_duprateExpDensCurve_mqc.txt", 'w') as f:
f.write("# plot_type: 'linegraph'\n")
f.write("# section_name: 'dupRadar'\n")
f.write("# description: 'Expression vs Duplication Rate'\n")
f.write("# pconfig:\n")
f.write("# title: 'dupRadar: Expression vs Duplication Rate'\n")
f.write("# xlab: 'Expression (log10 RPK)'\n")
f.write("# ylab: 'Duplication Rate'\n")

# Sample data points for the curve
x_points = np.linspace(log_rpkm.min(), log_rpkm.max(), 100)
y_points = slope * x_points + intercept

f.write("log10_rpk\tduplication_rate\n")
for x, y in zip(x_points, y_points):
f.write(f"{x:.3f}\t{y:.3f}\n")

# Generate basic plots using matplotlib
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))

# Scatter plot
ax1.scatter(log_rpkm, dup_matrix_filtered['duplication_rate'], alpha=0.6, s=1)
ax1.plot(x_points, y_points, 'r-', linewidth=2)
ax1.set_xlabel('Expression (log10 RPK)')
ax1.set_ylabel('Duplication Rate')
ax1.set_title('dupRadar: Expression vs Duplication Rate')

# Box plot (simplified)
ax2.boxplot([dup_matrix_filtered['duplication_rate']])
ax2.set_ylabel('Duplication Rate')
ax2.set_title('Duplication Rate Distribution')

# Expression histogram
ax3.hist(log_rpkm, bins=50, alpha=0.7)
ax3.set_xlabel('Expression (log10 RPK)')
ax3.set_ylabel('Frequency')
ax3.set_title('Expression Distribution')

# Duplication rate histogram
ax4.hist(dup_matrix_filtered['duplication_rate'], bins=50, alpha=0.7)
ax4.set_xlabel('Duplication Rate')
ax4.set_ylabel('Frequency')
ax4.set_title('Duplication Rate Distribution')

plt.tight_layout()
plt.savefig(f"{args.prefix}_duprateExpDens.pdf")
plt.close()

# Individual plots for compatibility
plt.figure(figsize=(8, 6))
plt.boxplot([dup_matrix_filtered['duplication_rate']])
plt.ylabel('Duplication Rate')
plt.title('Duplication Rate Distribution')
plt.savefig(f"{args.prefix}_duprateExpBoxplot.pdf")
plt.close()

plt.figure(figsize=(8, 6))
plt.hist(log_rpkm, bins=50, alpha=0.7)
plt.xlabel('Expression (log10 RPK)')
plt.ylabel('Frequency')
plt.title('Expression Distribution')
plt.savefig(f"{args.prefix}_expressionHist.pdf")
plt.close()

else:
# Handle case with no/insufficient data
with open(f"{args.prefix}_intercept_slope.txt", 'w') as f:
f.write("intercept\tNA\n")
f.write("slope\tNA\n")
f.write("r_squared\tNA\n")

# Generate empty MultiQC files
with open(f"{args.prefix}_dup_intercept_mqc.txt", 'w') as f:
f.write("# plot_type: 'generalstats'\n")
f.write("# pconfig:\n")
f.write("# dupradar_intercept:\n")
f.write("# title: 'dupRadar Intercept'\n")
f.write("# description: 'Intercept of the dupRadar fitted curve'\n")
f.write("# format: '{:.3f}'\n")
f.write("Sample\tdupradar_intercept\n")
f.write(f"{args.prefix}\tNA\n")

with open(f"{args.prefix}_duprateExpDensCurve_mqc.txt", 'w') as f:
f.write("# plot_type: 'linegraph'\n")
f.write("# section_name: 'dupRadar'\n")
f.write("# description: 'Expression vs Duplication Rate'\n")
f.write("# pconfig:\n")
f.write("# title: 'dupRadar: Expression vs Duplication Rate'\n")
f.write("# xlab: 'Expression (log10 RPK)'\n")
f.write("# ylab: 'Duplication Rate'\n")
f.write("log10_rpk\tduplication_rate\n")
f.write("0\t0\n")
f.write("1\t0\n")

# Create empty plots
fig, ax = plt.subplots(figsize=(8, 6))
ax.text(0.5, 0.5, 'Insufficient data for analysis',
horizontalalignment='center', verticalalignment='center', transform=ax.transAxes)
plt.savefig(f"{args.prefix}_duprateExpDens.pdf")
plt.savefig(f"{args.prefix}_duprateExpBoxplot.pdf")
plt.savefig(f"{args.prefix}_expressionHist.pdf")
plt.close()

print("Analysis complete")

if __name__ == "__main__":
main()