1
- use std:: collections:: HashSet ;
2
1
use log:: info;
3
- use pyo3:: prelude:: * ;
4
2
use pyo3:: exceptions:: PyIOError ;
3
+ use pyo3:: prelude:: * ;
4
+ use std:: collections:: HashSet ;
5
5
6
6
/// Prefix that marks the `AS:i` alignment-score tag among SAM optional fields.
const AS_TAG_PREFIX: &str = "AS:i:";
7
7
8
8
/// Extract AS:i alignment score from SAM optional fields
9
- ///
9
+ ///
10
10
/// # Arguments
11
11
/// * `fields` - SAM fields starting from the optional fields (field 11+)
12
- ///
12
+ ///
13
13
/// # Returns
14
14
/// Option containing the AS:i score as f64, None if not found or invalid
15
15
fn extract_as_score ( fields : & [ & str ] ) -> Option < f64 > {
@@ -24,14 +24,14 @@ fn extract_as_score(fields: &[&str]) -> Option<f64> {
24
24
}
25
25
26
26
/// Parse a single SAM line and extract candidate OTU information
27
- ///
27
+ ///
28
28
/// This function processes one SAM line and determines if the read meets the score cutoff.
29
29
/// Used for testing and by the streaming functions.
30
30
///
31
31
/// # Arguments
32
32
/// * `line` - A SAM format line as string
33
33
/// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
34
- ///
34
+ ///
35
35
/// # Returns
36
36
/// Option containing the reference name if the read meets the cutoff, None otherwise
37
37
pub fn parse_sam_line ( line : & str , p_score_cutoff : f64 ) -> Option < String > {
@@ -42,14 +42,14 @@ pub fn parse_sam_line(line: &str, p_score_cutoff: f64) -> Option<String> {
42
42
43
43
// Parse SAM line - tab-separated format
44
44
let fields: Vec < & str > = line. split ( '\t' ) . collect ( ) ;
45
-
45
+
46
46
// SAM format requires at least 11 fields
47
47
if fields. len ( ) < 11 {
48
48
return None ;
49
49
}
50
50
51
51
// Extract key fields:
52
- // 1: FLAG
52
+ // 1: FLAG
53
53
// 2: RNAME (reference name)
54
54
// 9: SEQ (read sequence)
55
55
let flag: u16 = fields[ 1 ] . parse ( ) . unwrap_or ( 4 ) ; // Default to unmapped if parse fails
@@ -75,19 +75,18 @@ pub fn parse_sam_line(line: &str, p_score_cutoff: f64) -> Option<String> {
75
75
None
76
76
}
77
77
78
-
79
78
/// Extract candidate OTU reference IDs by running bowtie2 directly with streaming
80
- ///
79
+ ///
81
80
/// This function spawns a bowtie2 process directly from Rust and streams its output
82
81
/// to avoid memory issues with large SAM files. It processes SAM lines as they arrive
83
82
/// and returns only the unique reference IDs that meet the score cutoff.
84
- ///
83
+ ///
85
84
/// # Arguments
86
85
/// * `bowtie_index_path` - Path to the bowtie2 index
87
86
/// * `read_paths` - List of paths to the input read files
88
87
/// * `proc` - Number of processor threads for bowtie2
89
88
/// * `p_score_cutoff` - Minimum score threshold (AS:i score + read length)
90
- ///
89
+ ///
91
90
/// # Returns
92
91
/// Set of reference IDs that have reads meeting the score cutoff
93
92
pub fn find_candidate_otus_with_bowtie2 (
@@ -97,52 +96,57 @@ pub fn find_candidate_otus_with_bowtie2(
97
96
proc : i32 ,
98
97
p_score_cutoff : f64 ,
99
98
) -> PyResult < HashSet < String > > {
100
- use std:: process:: { Command , Stdio } ;
101
99
use std:: io:: { BufRead , BufReader } ;
102
-
103
- info ! ( "running bowtie2: index={}, reads={:?}, cutoff={}" ,
104
- bowtie_index_path, read_paths, p_score_cutoff) ;
100
+ use std:: process:: { Command , Stdio } ;
101
+
102
+ info ! (
103
+ "running bowtie2: index={}, reads={:?}, cutoff={}" ,
104
+ bowtie_index_path, read_paths, p_score_cutoff
105
+ ) ;
105
106
py. allow_threads ( || {
106
107
let mut cmd = Command :: new ( "bowtie2" ) ;
107
- cmd. arg ( "-p" ) . arg ( proc. to_string ( ) )
108
- . arg ( "--local" )
109
- . arg ( "--no-unal" )
110
- . arg ( "--score-min" ) . arg ( "L,20,1.0" )
111
- . arg ( "-N" ) . arg ( "0" )
112
- . arg ( "-L" ) . arg ( "15" )
113
- . arg ( "-x" ) . arg ( bowtie_index_path)
114
- . arg ( "-U" ) . arg ( read_paths. join ( "," ) )
115
- . stdout ( Stdio :: piped ( ) )
116
- . stderr ( Stdio :: piped ( ) ) ;
117
-
108
+ cmd. arg ( "-p" )
109
+ . arg ( proc. to_string ( ) )
110
+ . arg ( "--local" )
111
+ . arg ( "--no-unal" )
112
+ . arg ( "--score-min" )
113
+ . arg ( "L,20,1.0" )
114
+ . arg ( "-N" )
115
+ . arg ( "0" )
116
+ . arg ( "-L" )
117
+ . arg ( "15" )
118
+ . arg ( "-x" )
119
+ . arg ( bowtie_index_path)
120
+ . arg ( "-U" )
121
+ . arg ( read_paths. join ( "," ) )
122
+ . stdout ( Stdio :: piped ( ) )
123
+ . stderr ( Stdio :: piped ( ) ) ;
124
+
118
125
info ! ( "spawning bowtie2 process" ) ;
119
- let mut child = cmd. spawn ( )
120
- . map_err ( |e| PyErr :: new :: < PyIOError , _ > ( format ! ( "Failed to spawn bowtie2: {}" , e) ) ) ?;
121
-
126
+ let mut child = cmd. spawn ( ) ?;
127
+
122
128
let stdout = child. stdout . take ( ) . unwrap ( ) ;
123
129
let reader = BufReader :: new ( stdout) ;
124
-
130
+
125
131
let mut candidate_otus = HashSet :: new ( ) ;
126
132
let mut line_count = 0u64 ;
127
133
let mut passing_count = 0u64 ;
128
-
134
+
129
135
for line_result in reader. lines ( ) {
130
- let line = line_result
131
- . map_err ( |e| PyErr :: new :: < PyIOError , _ > ( format ! ( "Error reading bowtie2 output: {}" , e) ) ) ?;
132
-
136
+ let line = line_result?;
137
+
133
138
line_count += 1 ;
134
-
139
+
135
140
// Use the extracted SAM parsing function
136
141
if let Some ( ref_name) = parse_sam_line ( & line, p_score_cutoff) {
137
142
candidate_otus. insert ( ref_name) ;
138
143
passing_count += 1 ;
139
144
}
140
145
}
141
-
146
+
142
147
// Wait for bowtie2 to finish and check exit status
143
- let status = child. wait ( )
144
- . map_err ( |e| PyErr :: new :: < PyIOError , _ > ( format ! ( "Error waiting for bowtie2: {}" , e) ) ) ?;
145
-
148
+ let status = child. wait ( ) ?;
149
+
146
150
if !status. success ( ) {
147
151
// Read stderr for error details
148
152
let stderr_output = if let Some ( mut stderr) = child. stderr . take ( ) {
@@ -152,22 +156,25 @@ pub fn find_candidate_otus_with_bowtie2(
152
156
} else {
153
157
"Unknown error" . to_string ( )
154
158
} ;
155
-
159
+
156
160
return Err ( PyErr :: new :: < PyIOError , _ > ( format ! (
157
- "bowtie2 failed with exit code {:?}: {}" ,
158
- status. code( ) ,
161
+ "bowtie2 failed with exit code {:?}: {}" ,
162
+ status. code( ) ,
159
163
stderr_output
160
164
) ) ) ;
161
165
}
162
-
163
- info ! ( "processed {} sam lines, {} passed cutoff, found {} unique otus" ,
164
- line_count, passing_count, candidate_otus. len( ) ) ;
165
-
166
+
167
+ info ! (
168
+ "processed {} sam lines, {} passed cutoff, found {} unique otus" ,
169
+ line_count,
170
+ passing_count,
171
+ candidate_otus. len( )
172
+ ) ;
173
+
166
174
Ok ( candidate_otus)
167
175
} )
168
176
}
169
177
170
-
171
178
#[ cfg( test) ]
172
179
mod tests {
173
180
use super :: * ;
@@ -176,7 +183,7 @@ mod tests {
176
183
fn test_parse_sam_line_basic ( ) {
177
184
let line = "read1\t 0\t ref1\t 100\t 255\t 50M\t *\t 0\t 0\t AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t *\t AS:i:45" ;
178
185
let result = parse_sam_line ( line, 0.01 ) ;
179
-
186
+
180
187
// AS:i:45 + seq_len(50) = 95.0, should pass cutoff of 0.01
181
188
assert_eq ! ( result, Some ( "ref1" . to_string( ) ) ) ;
182
189
}
@@ -185,7 +192,7 @@ mod tests {
185
192
fn test_parse_sam_line_below_cutoff ( ) {
186
193
let line = "read1\t 0\t ref1\t 100\t 255\t 50M\t *\t 0\t 0\t AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t *\t AS:i:45" ;
187
194
let result = parse_sam_line ( line, 100.0 ) ;
188
-
195
+
189
196
// AS:i:45 + seq_len(50) = 95.0, should not pass cutoff of 100.0
190
197
assert_eq ! ( result, None ) ;
191
198
}
@@ -194,7 +201,7 @@ mod tests {
194
201
fn test_parse_sam_line_unmapped ( ) {
195
202
let line = "read1\t 4\t *\t 0\t 0\t *\t *\t 0\t 0\t AAAAA\t *" ;
196
203
let result = parse_sam_line ( line, 0.01 ) ;
197
-
204
+
198
205
// Unmapped read (flag & 4 != 0), should return None
199
206
assert_eq ! ( result, None ) ;
200
207
}
@@ -203,7 +210,7 @@ mod tests {
203
210
fn test_parse_sam_line_no_as_score ( ) {
204
211
let line = "read1\t 0\t ref1\t 100\t 255\t 50M\t *\t 0\t 0\t AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\t *" ;
205
212
let result = parse_sam_line ( line, 0.01 ) ;
206
-
213
+
207
214
// No AS:i score, should return None
208
215
assert_eq ! ( result, None ) ;
209
216
}
@@ -212,10 +219,8 @@ mod tests {
212
219
fn test_parse_sam_line_header ( ) {
213
220
let line = "@HD\t VN:1.0\t SO:unsorted" ;
214
221
let result = parse_sam_line ( line, 0.01 ) ;
215
-
222
+
216
223
// Header line, should return None
217
224
assert_eq ! ( result, None ) ;
218
225
}
219
-
220
-
221
- }
226
+ }
0 commit comments