diff --git a/NEWS.md b/NEWS.md index 5412986c5..9043f3a51 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,7 @@ * `expect_lint()` has a new argument `ignore_order` (default `FALSE`), which, if `TRUE`, allows the `checks=` to be provided in arbitary order vs. how `lint()` produces them (@MichaelChirico). * `undesirable_function_linter()` accepts unnamed entries, treating them as undesirable functions, e.g. `undesirable_function_linter("sum")` (#2536, @MichaelChirico). * `any_duplicated_linter()` is extended to recognize some usages from {dplyr} and {data.table} that could be replaced by `anyDuplicated()`, e.g. `n_distinct(col) == n()` or `uniqueN(col) == .N` (#2482, @MichaelChirico). +* `fixed_regex_linter()` recognizes usage of the new (R 4.5.0) `grepv()` wrapper of `grep()`; `regex_subset_linter()` also recommends `grepv()` alternatives (#2855, @MichaelChirico). ### New linters diff --git a/R/fixed_regex_linter.R b/R/fixed_regex_linter.R index 02ce1e576..5ab8680d5 100644 --- a/R/fixed_regex_linter.R +++ b/R/fixed_regex_linter.R @@ -77,7 +77,7 @@ fixed_regex_linter <- function(allow_unescaped = FALSE) { # regular expression pattern is the first argument pos_1_regex_funs <- c( - "grep", "gsub", "sub", "regexec", "grepl", "regexpr", "gregexpr" + "grep", "grepv", "gsub", "sub", "regexec", "grepl", "regexpr", "gregexpr" ) # regular expression pattern is the second argument diff --git a/R/regex_subset_linter.R b/R/regex_subset_linter.R index b6471e72f..bf55b4827 100644 --- a/R/regex_subset_linter.R +++ b/R/regex_subset_linter.R @@ -1,8 +1,8 @@ #' Require usage of direct methods for subsetting strings via regex #' -#' Using `value = TRUE` in [grep()] returns the subset of the input that matches -#' the pattern, e.g. `grep("[a-m]", letters, value = TRUE)` will return the -#' first 13 elements (`a` through `m`). +# TODO(R>=4.5.0): Just use [grepv()] directly. Need this while ?grepv doesn't exist. +#' Using [`grepv()`][grep] returns the subset of the input that matches the pattern, +#' e.g. `grepv("[a-m]", letters)` will return the first 13 elements (`a` through `m`). #' #' `letters[grep("[a-m]", letters)]` and `letters[grepl("[a-m]", letters)]` #' both return the same thing, but more circuitously and more verbosely. @@ -12,12 +12,12 @@ #' `str_detect()` and `str_which()`. #' #' @section Exceptions: -#' Note that `x[grep(pattern, x)]` and `grep(pattern, x, value = TRUE)` -#' are not _completely_ interchangeable when `x` is not character -#' (most commonly, when `x` is a factor), because the output of the -#' latter will be a character vector while the former remains a factor. -#' It still may be preferable to refactor such code, as it may be faster -#' to match the pattern on `levels(x)` and use that to subset instead. +#' Note that `x[grep(pattern, x)]` and `grepv(pattern, x)` are not +#' _completely_ interchangeable when `x` is not character (most commonly, +#' when `x` is a factor), because the output of the latter will be a +#' character vector while the former remains a factor. It still may be +#' preferable to refactor such code, as it may be faster to match the +#' pattern on `levels(x)` and use that to subset instead. #' #' @evalRd rd_tags("regex_subset_linter") #' @@ -35,7 +35,7 @@ #' #' # okay #' lint( -#' text = "grep(pattern, x, value = TRUE)", +#' text = "grepv(pattern, x)", #' linters = regex_subset_linter() #' ) #' @@ -71,8 +71,10 @@ regex_subset_linter <- function() { grep_lints <- xml_nodes_to_lints( grep_expr, source_expression = source_expression, - lint_message = - "Prefer grep(pattern, x, ..., value = TRUE) over x[grep(pattern, x, ...)] and x[grepl(pattern, x, ...)].", + lint_message = paste( + "Prefer grepv(pattern, x, ...) over x[grep(pattern, x, ...)] and x[grepl(pattern, x, ...)].", + "Code required to run on R versions before 4.5.0 can use grep(pattern, x, ..., value = TRUE)." + ), type = "warning" ) diff --git a/man/regex_subset_linter.Rd b/man/regex_subset_linter.Rd index b030cf748..267bc3e23 100644 --- a/man/regex_subset_linter.Rd +++ b/man/regex_subset_linter.Rd @@ -7,9 +7,8 @@ regex_subset_linter() } \description{ -Using \code{value = TRUE} in \code{\link[=grep]{grep()}} returns the subset of the input that matches -the pattern, e.g. \code{grep("[a-m]", letters, value = TRUE)} will return the -first 13 elements (\code{a} through \code{m}). +Using \code{\link[=grep]{grepv()}} returns the subset of the input that matches the pattern, +e.g. \code{grepv("[a-m]", letters)} will return the first 13 elements (\code{a} through \code{m}). } \details{ \code{letters[grep("[a-m]", letters)]} and \code{letters[grepl("[a-m]", letters)]} @@ -21,12 +20,12 @@ namely \code{str_subset()}, which should be preferred to versions using } \section{Exceptions}{ -Note that \code{x[grep(pattern, x)]} and \code{grep(pattern, x, value = TRUE)} -are not \emph{completely} interchangeable when \code{x} is not character -(most commonly, when \code{x} is a factor), because the output of the -latter will be a character vector while the former remains a factor. -It still may be preferable to refactor such code, as it may be faster -to match the pattern on \code{levels(x)} and use that to subset instead. +Note that \code{x[grep(pattern, x)]} and \code{grepv(pattern, x)} are not +\emph{completely} interchangeable when \code{x} is not character (most commonly, +when \code{x} is a factor), because the output of the latter will be a +character vector while the former remains a factor. It still may be +preferable to refactor such code, as it may be faster to match the +pattern on \code{levels(x)} and use that to subset instead. } \examples{ @@ -43,7 +42,7 @@ lint( # okay lint( - text = "grep(pattern, x, value = TRUE)", + text = "grepv(pattern, x)", linters = regex_subset_linter() ) diff --git a/tests/testthat/test-fixed_regex_linter.R b/tests/testthat/test-fixed_regex_linter.R index 83a00c141..e0dcae72e 100644 --- a/tests/testthat/test-fixed_regex_linter.R +++ b/tests/testthat/test-fixed_regex_linter.R @@ -1,30 +1,31 @@ test_that("fixed_regex_linter skips allowed usages", { linter <- fixed_regex_linter() - expect_lint("gsub('^x', '', y)", NULL, linter) - expect_lint("grep('x$', '', y)", NULL, linter) - expect_lint("sub('[a-zA-Z]', '', y)", NULL, linter) - expect_lint("grepl(fmt, y)", NULL, linter) - expect_lint(R"{regexec('\\s', '', y)}", NULL, linter) - expect_lint("grep('a(?=b)', x, perl = TRUE)", NULL, linter) - expect_lint("grep('0+1', x, perl = TRUE)", NULL, linter) - expect_lint("grep('1*2', x)", NULL, linter) - expect_lint("grep('a|b', x)", NULL, linter) - expect_lint(R"{grep('\\[|\\]', x)}", NULL, linter) + expect_no_lint("gsub('^x', '', y)", linter) + expect_no_lint("grep('x$', y)", linter) + expect_no_lint("grepv('x$', y)", linter) + expect_no_lint("sub('[a-zA-Z]', '', y)", linter) + expect_no_lint("grepl(fmt, y)", linter) + expect_no_lint(R"{regexec('\\s', '', y)}", linter) + expect_no_lint("grep('a(?=b)', x, perl = TRUE)", linter) + expect_no_lint("grep('0+1', x, perl = TRUE)", linter) + expect_no_lint("grep('1*2', x)", linter) + expect_no_lint("grep('a|b', x)", linter) + expect_no_lint(R"{grep('\\[|\\]', x)}", linter) # if fixed=TRUE is already set, regex patterns don't matter - expect_lint(R"{gsub('\\.', '', y, fixed = TRUE)}", NULL, linter) + expect_no_lint(R"{gsub('\\.', '', y, fixed = TRUE)}", linter) # ignore.case=TRUE implies regex interpretation - expect_lint("gsub('abcdefg', '', y, ignore.case = TRUE)", NULL, linter) + expect_no_lint("gsub('abcdefg', '', y, ignore.case = TRUE)", linter) # char classes starting with [] might contain other characters -> not fixed - expect_lint("sub('[][]', '', y)", NULL, linter) - expect_lint("sub('[][ ]', '', y)", NULL, linter) - expect_lint("sub('[],[]', '', y)", NULL, linter) + expect_no_lint("sub('[][]', '', y)", linter) + expect_no_lint("sub('[][ ]', '', y)", linter) + expect_no_lint("sub('[],[]', '', y)", linter) # wrapper functions don't throw - expect_lint("gregexpr(pattern = pattern, data, perl = TRUE, ...)", NULL, linter) + expect_no_lint("gregexpr(pattern = pattern, data, perl = TRUE, ...)", linter) }) test_that("fixed_regex_linter blocks simple disallowed usages", { @@ -36,6 +37,7 @@ test_that("fixed_regex_linter blocks simple disallowed usages", { expect_lint("gregexpr('a-z', y)", lint_msg, linter) expect_lint(R"{regexec('\\$', x)}", lint_msg, linter) expect_lint("grep('\n', x)", lint_msg, linter) + expect_lint("grepv('\n', x)", lint_msg, linter) # naming the argument doesn't matter (if it's still used positionally) expect_lint("gregexpr(pattern = 'a-z', y)", lint_msg, linter) @@ -77,19 +79,19 @@ test_that("fixed_regex_linter catches regex like [.] or [$]", { test_that("fixed_regex_linter catches null calls to strsplit as well", { linter <- fixed_regex_linter() - expect_lint("strsplit(x, '^x')", NULL, linter) - expect_lint(R"{strsplit(x, '\\s')}", NULL, linter) - expect_lint("strsplit(x, 'a(?=b)', perl = TRUE)", NULL, linter) - expect_lint("strsplit(x, '0+1', perl = TRUE)", NULL, linter) - expect_lint("strsplit(x, 'a|b')", NULL, linter) + expect_no_lint("strsplit(x, '^x')", linter) + expect_no_lint(R"{strsplit(x, '\\s')}", linter) + expect_no_lint("strsplit(x, 'a(?=b)', perl = TRUE)", linter) + expect_no_lint("strsplit(x, '0+1', perl = TRUE)", linter) + expect_no_lint("strsplit(x, 'a|b')", linter) - expect_lint("tstrsplit(x, '1*2')", NULL, linter) - expect_lint("tstrsplit(x, '[a-zA-Z]')", NULL, linter) - expect_lint("tstrsplit(x, fmt)", NULL, linter) + expect_no_lint("tstrsplit(x, '1*2')", linter) + expect_no_lint("tstrsplit(x, '[a-zA-Z]')", linter) + expect_no_lint("tstrsplit(x, fmt)", linter) # if fixed=TRUE is already set, regex patterns don't matter - expect_lint(R"{strsplit(x, '\\.', fixed = TRUE)}", NULL, linter) - expect_lint(R"{strsplit(x, '\\.', fixed = T)}", NULL, linter) + expect_no_lint(R"{strsplit(x, '\\.', fixed = TRUE)}", linter) + expect_no_lint(R"{strsplit(x, '\\.', fixed = T)}", linter) }) test_that("fixed_regex_linter catches calls to strsplit as well", { @@ -106,7 +108,7 @@ test_that("fixed_regex_linter is more exact about distinguishing \\s from \\:", linter <- fixed_regex_linter() lint_msg <- rex::rex("This regular expression is static") - expect_lint(R"{grep('\\s', '', x)}", NULL, linter) + expect_no_lint(R"{grep('\\s', '', x)}", linter) expect_lint(R"{grep('\\:', '', x)}", lint_msg, linter) }) @@ -114,18 +116,18 @@ test_that("fixed_regex_linter is more exact about distinguishing \\s from \\:", test_that("fixed_regex_linter skips allowed stringr usages", { linter <- fixed_regex_linter() - expect_lint("str_replace(y, '[a-zA-Z]', '')", NULL, linter) - expect_lint("str_replace_all(y, '^x', '')", NULL, linter) - expect_lint("str_detect(y, fmt)", NULL, linter) - expect_lint(R"{str_extract(y, '\\s')}", NULL, linter) - expect_lint(R"{str_extract_all(y, '\\s')}", NULL, linter) - expect_lint("str_which(x, '1*2')", NULL, linter) + expect_no_lint("str_replace(y, '[a-zA-Z]', '')", linter) + expect_no_lint("str_replace_all(y, '^x', '')", linter) + expect_no_lint("str_detect(y, fmt)", linter) + expect_no_lint(R"{str_extract(y, '\\s')}", linter) + expect_no_lint(R"{str_extract_all(y, '\\s')}", linter) + expect_no_lint("str_which(x, '1*2')", linter) # if fixed() is already set, regex patterns don't matter - expect_lint(R"{str_replace(y, fixed('\\.'), '')}", NULL, linter) + expect_no_lint(R"{str_replace(y, fixed('\\.'), '')}", linter) # namespace qualification doesn't matter - expect_lint("stringr::str_replace(y, stringr::fixed('abcdefg'), '')", NULL, linter) + expect_no_lint("stringr::str_replace(y, stringr::fixed('abcdefg'), '')", linter) }) test_that("fixed_regex_linter blocks simple disallowed usages of stringr functions", { @@ -148,11 +150,11 @@ test_that("fixed_regex_linter catches calls to str_split as well", { linter <- fixed_regex_linter() lint_msg <- rex::rex("This regular expression is static") - expect_lint("str_split(x, '^x')", NULL, linter) - expect_lint("str_split(x, fmt)", NULL, linter) + expect_no_lint("str_split(x, '^x')", linter) + expect_no_lint("str_split(x, fmt)", linter) # if fixed() is already set, regex patterns don't matter - expect_lint(R"{str_split(x, fixed('\\.'))}", NULL, linter) + expect_no_lint(R"{str_split(x, fixed('\\.'))}", linter) expect_lint(R"{str_split(x, '\\.')}", lint_msg, linter) expect_lint("str_split(x, '[.]')", lint_msg, linter) }) @@ -163,8 +165,8 @@ test_that("str_replace_all's multi-replacement version is handled", { # While each of the replacements is fixed, and this _could_ in principle be replaced by # a pipeline where each step does one of the replacements and fixed() is used, this is overkill. # Instead, ensure that no lint is returned for this case - expect_lint('str_replace_all(x, c("one" = "1", "two" = "2", "three" = "3"))', NULL, linter) - expect_lint('grepl(c("a" = "b"), x)', NULL, linter) + expect_no_lint('str_replace_all(x, c("one" = "1", "two" = "2", "three" = "3"))', linter) + expect_no_lint('grepl(c("a" = "b"), x)', linter) }) test_that("1- or 2-width octal escape sequences are handled", { @@ -209,20 +211,20 @@ test_that("bracketed unicode escapes are caught", { test_that("escaped characters are handled correctly", { linter <- fixed_regex_linter() - expect_lint(R"{gsub('\n+', '', sql)}", NULL, linter) - expect_lint('gsub("\\n{2,}", "\n", D)', NULL, linter) - expect_lint(R'{gsub("[\r\n]", "", x)}', NULL, linter) - expect_lint(R'{gsub("\n $", "", y)}', NULL, linter) - expect_lint(R'{gsub("```\n*```r*\n*", "", x)}', NULL, linter) - expect_lint('strsplit(x, "(;|\n)")', NULL, linter) - expect_lint(R'{strsplit(x, "(;|\n)")}', NULL, linter) - expect_lint(R'{grepl("[\\W]", x, perl = TRUE)}', NULL, linter) - expect_lint(R'{grepl("[\\W]", x)}', NULL, linter) + expect_no_lint(R"{gsub('\n+', '', sql)}", linter) + expect_no_lint('gsub("\\n{2,}", "\n", D)', linter) + expect_no_lint(R'{gsub("[\r\n]", "", x)}', linter) + expect_no_lint(R'{gsub("\n $", "", y)}', linter) + expect_no_lint(R'{gsub("```\n*```r*\n*", "", x)}', linter) + expect_no_lint('strsplit(x, "(;|\n)")', linter) + expect_no_lint(R'{strsplit(x, "(;|\n)")}', linter) + expect_no_lint(R'{grepl("[\\W]", x, perl = TRUE)}', linter) + expect_no_lint(R'{grepl("[\\W]", x)}', linter) }) # make sure the logic is properly vectorized test_that("single expression with multiple regexes is OK", { - expect_lint('c(grep("^a", x), grep("b$", x))', NULL, fixed_regex_linter()) + expect_no_lint('c(grep("^a", x), grep("b$", x))', fixed_regex_linter()) }) test_that("fixed replacements vectorize and recognize str_detect", { @@ -344,8 +346,8 @@ local({ test_that("'unescaped' regex can optionally be skipped", { linter <- fixed_regex_linter(allow_unescaped = TRUE) - expect_lint("grepl('a', x)", NULL, linter) - expect_lint("str_detect(x, 'a')", NULL, linter) + expect_no_lint("grepl('a', x)", linter) + expect_no_lint("str_detect(x, 'a')", linter) expect_lint("grepl('[$]', x)", rex::rex('Use "$" with fixed = TRUE'), linter) }) @@ -358,18 +360,18 @@ local({ lint_msg <- "This regular expression is static" expect_lint(paste("x", pipe, "grepl(pattern = 'a')"), lint_msg, linter) - expect_lint(paste("x", pipe, "grepl(pattern = '^a')"), NULL, linter) - expect_lint(paste("x", pipe, "grepl(pattern = 'a', fixed = TRUE)"), NULL, linter) + expect_no_lint(paste("x", pipe, "grepl(pattern = '^a')"), linter) + expect_no_lint(paste("x", pipe, "grepl(pattern = 'a', fixed = TRUE)"), linter) expect_lint(paste("x", pipe, "str_detect('a')"), lint_msg, linter) - expect_lint(paste("x", pipe, "str_detect('^a')"), NULL, linter) - expect_lint(paste("x", pipe, "str_detect(fixed('a'))"), NULL, linter) + expect_no_lint(paste("x", pipe, "str_detect('^a')"), linter) + expect_no_lint(paste("x", pipe, "str_detect(fixed('a'))"), linter) expect_lint(paste("x", pipe, "gsub(pattern = 'a', replacement = '')"), lint_msg, linter) - expect_lint(paste("x", pipe, "gsub(pattern = '^a', replacement = '')"), NULL, linter) - expect_lint(paste("x", pipe, "gsub(pattern = 'a', replacement = '', fixed = TRUE)"), NULL, linter) + expect_no_lint(paste("x", pipe, "gsub(pattern = '^a', replacement = '')"), linter) + expect_no_lint(paste("x", pipe, "gsub(pattern = 'a', replacement = '', fixed = TRUE)"), linter) expect_lint(paste("x", pipe, "str_replace('a', '')"), lint_msg, linter) - expect_lint(paste("x", pipe, "str_replace('^a', '')"), NULL, linter) - expect_lint(paste("x", pipe, "str_replace(fixed('a'), '')"), NULL, linter) + expect_no_lint(paste("x", pipe, "str_replace('^a', '')"), linter) + expect_no_lint(paste("x", pipe, "str_replace(fixed('a'), '')"), linter) }, pipe = pipes, .test_name = names(pipes) diff --git a/tests/testthat/test-regex_subset_linter.R b/tests/testthat/test-regex_subset_linter.R index 27303ee40..0c3e0c6e9 100644 --- a/tests/testthat/test-regex_subset_linter.R +++ b/tests/testthat/test-regex_subset_linter.R @@ -5,7 +5,7 @@ test_that("regex_subset_linter skips allowed usages", { test_that("regex_subset_linter blocks simple disallowed usages", { linter <- regex_subset_linter() - lint_msg <- rex::rex("Prefer grep(pattern, x, ..., value = TRUE)") + lint_msg <- rex::rex("Prefer grepv(pattern, x, ...)") expect_lint("x[grep(ptn, x)]", lint_msg, linter) expect_lint("names(y)[grepl(ptn, names(y), perl = TRUE)]", lint_msg, linter)