Skip to content

Commit 94d23ff

Browse files
Solumin and facebook-github-bot
authored and committed
Turn mypy exclude regexes into lists of globs
Summary: Next CLs: - Add an option to verify the globs against the list of files that the exclude regex matches. - Refine the algorithm, e.g. don't ignore `^`, treat repetitions and classes more accurately. - Fall back to the list of files if conversion fails or is inaccurate. Reviewed By: connernilsen Differential Revision: D73400130 fbshipit-source-id: 059529e32916d1ad925fe81ae95ee3b510953c0d
1 parent e3d828c commit 94d23ff

File tree

3 files changed

+275
-11
lines changed

3 files changed

+275
-11
lines changed

pyrefly/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ path-absolutize = { version = "3.0", features = ["use_unix_paths_on_wasm"] }
4545
pyrefly_derive = { path = "../pyrefly_derive" }
4646
rayon = "1.9.0"
4747
regex = "1.11.1"
48+
regex-syntax = "0.7.5"
4849
ruff_python_ast = { git = "https://github.com/astral-sh/ruff/", rev = "3acf4e716d5d1fb9bd1316dc268deb6824a300c8" }
4950
ruff_python_parser = { git = "https://github.com/astral-sh/ruff/", rev = "3acf4e716d5d1fb9bd1316dc268deb6824a300c8" }
5051
ruff_source_file = { git = "https://github.com/astral-sh/ruff/", rev = "3acf4e716d5d1fb9bd1316dc268deb6824a300c8" }

pyrefly/lib/config/mypy.rs renamed to pyrefly/lib/config/mypy/mod.rs

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
* LICENSE file in the root directory of this source tree.
66
*/
77

8+
mod regex_converter;
9+
810
use std::path::Path;
911
use std::path::PathBuf;
1012
use std::process::Command;
@@ -34,8 +36,6 @@ pub struct MypyConfig {
3436
python_version: Option<PythonVersion>,
3537
#[serde(rename = "python_executable")]
3638
python_interpreter: Option<PathBuf>,
37-
#[serde(rename = "follow_untyped_imports")]
38-
use_untyped_imports: bool,
3939
}
4040

4141
#[derive(Deserialize)]
@@ -53,7 +53,6 @@ with open(sys.argv[1]) as f:
5353
cp.read_file(f)
5454
cfg = {}
5555
replace_imports = []
56-
follow_untyped_imports = False
5756
for section in cp.sections():
5857
cfg[section] = {}
5958
for key, value in cp.items(section):
@@ -64,15 +63,12 @@ for section in cp.sections():
6463
value = [x.strip() for x in value.split(',') if x.strip()]
6564
elif key == 'mypy_path':
6665
value = [x.strip() for x in re.split('[,:]', value) if x.strip()]
67-
elif key == 'follow_untyped_imports':
68-
follow_untyped_imports |= value == 'True'
6966
elif value in ('True', 'False'):
7067
value = value == 'True'
7168
cfg[section][key] = value
7269
if not cfg[section]:
7370
del cfg[section]
7471
mypy = cfg.pop('mypy', {})
75-
mypy['follow_untyped_imports'] = follow_untyped_imports
7672
print(json.dumps({'mypy': mypy, 'per_module': cfg, 'replace_imports': replace_imports}))
7773
";
7874
let mut cmd = Command::new(
@@ -106,6 +102,11 @@ print(json.dumps({'mypy': mypy, 'per_module': cfg, 'replace_imports': replace_im
106102
);
107103
cfg.project_includes = project_includes;
108104

105+
if let Some(exclude_regex) = mypy.exclude_regex {
106+
let patterns = regex_converter::convert(&exclude_regex)?;
107+
cfg.project_excludes = Globs::new(patterns);
108+
}
109+
109110
if let Some(search_path) = mypy.search_path {
110111
cfg.search_path = search_path;
111112
}
@@ -118,7 +119,6 @@ print(json.dumps({'mypy': mypy, 'per_module': cfg, 'replace_imports': replace_im
118119
if mypy.python_interpreter.is_some() {
119120
cfg.python_interpreter = mypy.python_interpreter;
120121
}
121-
cfg.use_untyped_imports = mypy.use_untyped_imports;
122122

123123
cfg.root.replace_imports_with_any = Some(replace_imports);
124124

@@ -156,9 +156,6 @@ ignore_missing_imports = True
156156
157157
[mypy-stricter.on.this.*]
158158
check_untyped_defs = True
159-
160-
[mypy-do.follow.*]
161-
follow_untyped_imports = True
162159
"#;
163160
fs_anyhow::write(&input_path, mypy)?;
164161

@@ -180,8 +177,14 @@ follow_untyped_imports = True
180177
]
181178
);
182179

180+
let expected_excludes = Globs::new(vec![
181+
"**/src/include/".to_owned(),
182+
"**/other_src/include/".to_owned(),
183+
"**/src/specific/bad/file.py".to_owned(),
184+
]);
185+
assert_eq!(cfg.project_excludes, expected_excludes);
186+
183187
assert_eq!(cfg.replace_imports_with_any().len(), 2);
184-
assert!(cfg.use_untyped_imports);
185188
Ok(())
186189
}
187190
}
Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
*
4+
* This source code is licensed under the MIT license found in the
5+
* LICENSE file in the root directory of this source tree.
6+
*/
7+
8+
use regex_syntax::hir;
9+
use regex_syntax::hir::Hir;
10+
use regex_syntax::hir::HirKind;
11+
use regex_syntax::hir::Visitor;
12+
13+
/// Ir is the intermediate representation that the visitor uses to track the regex components it has processed.
14+
#[derive(Debug)]
15+
enum Ir {
16+
/// Represents a literal string.
17+
Part(String),
18+
/// Represents a sequence of strings that will be concatenated into one (or more) final strings.
19+
Concat(Vec<Ir>),
20+
/// Represents a choice of strings.
21+
Alter(Vec<Ir>),
22+
}
23+
24+
impl Ir {
25+
/// Consumes the IR to create the list of strings.
26+
fn to_strings(self) -> Vec<String> {
27+
match self {
28+
// Part is easy: its string is itself.
29+
Self::Part(s) => vec![s],
30+
// For concat, the components are glued together.
31+
// An Alter in this Concat produces multiple strings:
32+
// `a(b|c)` -> Concat(a, Alter(b, c)) -> ["ab", "ac"]
33+
// To handle this, each Ir in the Concat is glued to each of the strings that is being built.
34+
Self::Concat(parts) => parts
35+
.into_iter()
36+
.map(Ir::to_strings)
37+
.reduce(|acc, ps| {
38+
acc.iter()
39+
.flat_map(|a| ps.iter().map(|p| format!("{a}{p}")).collect::<Vec<_>>())
40+
.collect::<Vec<_>>()
41+
})
42+
.unwrap_or_default(),
43+
// Alter is also easy: each Ir is handled independently.
44+
Self::Alter(parts) => parts
45+
.into_iter()
46+
.flat_map(Ir::to_strings)
47+
.collect::<Vec<_>>(),
48+
}
49+
}
50+
}
51+
52+
/// RegexConverter is a regex_syntax Visitor for turning (simple) mypy exclude regexes into pyrefly globs.
53+
// Conversion follows simple rules:
54+
// - directories end in `/` => prepended with `**/`.
55+
// - files end in `.py` => prepended with `**/`.
56+
// - ambiguous names are neither files nor directories => prepend with **/ and append *.
57+
// - all reptitions (e.g. `.*`) are turned into wildcards (`*`).
58+
// - all other `.`s are treated as literal `.` instead of wildcards.
59+
// - alternations (`|`) are split into individual items.
60+
// Conversion proceeds by turning each component of the regex into a literal, a sequence of components, or an
61+
// alternation of components. This results in a tree of Strings and Vec<String>s.
62+
#[derive(Debug)]
63+
struct RegexConverter {
64+
stack: Vec<Ir>,
65+
dot: hir::Hir,
66+
repetition: Option<Hir>,
67+
}
68+
69+
impl RegexConverter {
70+
fn new() -> Self {
71+
let dot = hir::Hir::dot(hir::Dot::AnyChar);
72+
Self {
73+
stack: vec![],
74+
dot,
75+
repetition: None,
76+
}
77+
}
78+
79+
fn push_ir(&mut self, ir: Ir) {
80+
match &mut self.stack.last_mut() {
81+
Some(Ir::Concat(v)) => v.push(ir),
82+
Some(Ir::Alter(v)) => v.push(ir),
83+
_ => self.stack.push(ir),
84+
}
85+
}
86+
}
87+
88+
impl Visitor for RegexConverter {
89+
type Output = Vec<String>;
90+
type Err = anyhow::Error;
91+
92+
fn finish(mut self) -> Result<Self::Output, Self::Err> {
93+
if self.stack.len() != 1 {
94+
return Err(anyhow::anyhow!(
95+
"Expected to find exactly one element on the stack, but found {}",
96+
self.stack.len()
97+
));
98+
}
99+
let curr = self.stack.pop().unwrap();
100+
let curr = curr.to_strings();
101+
let globs = curr
102+
.iter()
103+
.map(|g| {
104+
let g = g.strip_prefix('/').unwrap_or(g);
105+
let mut g = format!("**/{g}");
106+
if !(g.ends_with('/') || g.ends_with(".py")) {
107+
g.push('*');
108+
}
109+
g
110+
})
111+
.collect();
112+
Ok(globs)
113+
}
114+
115+
fn visit_pre(&mut self, hir: &Hir) -> Result<(), Self::Err> {
116+
if self.repetition.is_some() {
117+
return Ok(());
118+
}
119+
120+
match hir.kind() {
121+
HirKind::Empty => {}
122+
HirKind::Literal(lit) => {
123+
let slice = lit.0.as_ref();
124+
self.push_ir(Ir::Part(String::from_utf8_lossy(slice).to_string()));
125+
}
126+
HirKind::Look(_) => {}
127+
// It's common to see `.` as in `.py`. Treat it as just a `.`.
128+
// If it was inside a repetition, where it would actually matter, then it would have been skipped.
129+
HirKind::Class(_) if *hir == self.dot => self.push_ir(Ir::Part(".".to_owned())),
130+
// Otherwise, we're ignoring Classes for now, since they're uncommon.
131+
HirKind::Class(_) => {}
132+
// Treat all repetitions as .* => *
133+
HirKind::Repetition(_) => {
134+
self.repetition = Some(hir.clone());
135+
self.push_ir(Ir::Part("*".to_owned()));
136+
}
137+
// Ignore captures. We'll just process their subexpression normally.
138+
HirKind::Capture(_) => {}
139+
HirKind::Concat(_) => self.stack.push(Ir::Concat(vec![])),
140+
HirKind::Alternation(_) => self.stack.push(Ir::Alter(vec![])),
141+
}
142+
Ok(())
143+
}
144+
145+
fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
146+
if self.repetition.as_ref() == Some(_hir) {
147+
self.repetition = None;
148+
}
149+
if matches!(_hir.kind(), HirKind::Concat(_) | HirKind::Alternation(_)) {
150+
let top = self.stack.pop().unwrap();
151+
self.push_ir(top);
152+
}
153+
Ok(())
154+
}
155+
}
156+
157+
pub fn convert(regex: &str) -> anyhow::Result<Vec<String>> {
158+
let mut parser = regex_syntax::ParserBuilder::new()
159+
// This enables `.` to match all characters, which is a small simplification that makes Hir easier to reason about.
160+
.dot_matches_new_line(true)
161+
.build();
162+
let h = parser.parse(regex)?;
163+
hir::visit(&h, RegexConverter::new())
164+
}
165+
166+
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts `input` and checks the resulting globs, naming the input on failure.
    fn assert_converts(input: &str, want: &[&str]) -> anyhow::Result<()> {
        let got = convert(input)?;
        assert_eq!(got, want, "input: {input:?}");
        Ok(())
    }

    /// Plain names: ambiguous, directory, and Python file.
    #[test]
    fn test_simple_regex() -> anyhow::Result<()> {
        assert_converts("ambiguous", &["**/ambiguous*"])?;
        assert_converts("dir/", &["**/dir/"])?;
        assert_converts("unambiguous.py", &["**/unambiguous.py"])?;
        Ok(())
    }

    /// Escapes, repetitions and anchors.
    #[test]
    fn test_special_chars() -> anyhow::Result<()> {
        assert_converts(r"src/foo/bar\.py", &["**/src/foo/bar.py"])?;
        assert_converts(r"dev/.*\.py", &["**/dev/*.py"])?;
        assert_converts(r"^bar/", &["**/bar/"])?;
        assert_converts(r"some/file.py$", &["**/some/file.py"])?;
        assert_converts(r"^try/both.py$", &["**/try/both.py"])?;
        Ok(())
    }

    /// Each branch of an alternation becomes its own glob.
    #[test]
    fn test_alternation() -> anyhow::Result<()> {
        assert_converts("foo|bar", &["**/foo*", "**/bar*"])?;
        assert_converts(
            r"(src/foo/bar\.py|dev/.*\.py)",
            &["**/src/foo/bar.py", "**/dev/*.py"],
        )?;
        Ok(())
    }

    /// Verbose (`(?x)`) regexes spread over several lines are supported.
    #[test]
    fn test_x_mode() -> anyhow::Result<()> {
        let input = r"(?x)(
alpha/
| beta/
| gamma.py
)";
        let got = convert(input)?;
        let want = vec!["**/alpha/", "**/beta/", "**/gamma.py"];
        assert_eq!(got, want);
        Ok(())
    }
}

0 commit comments

Comments
 (0)