Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of commented-out-code (~50-80%) #7706

Merged
merged 2 commits into from
Sep 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/ruff_linter/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ ruff_python_parser = { path = "../ruff_python_parser" }
ruff_source_file = { path = "../ruff_source_file", features = ["serde"] }
ruff_text_size = { path = "../ruff_text_size" }

aho-corasick = { version = "1.1.1" }
annotate-snippets = { version = "0.9.1", features = ["color"] }
anyhow = { workspace = true }
bitflags = { workspace = true }
Expand Down
106 changes: 40 additions & 66 deletions crates/ruff_linter/src/rules/eradicate/detection.rs
Original file line number Diff line number Diff line change
@@ -1,43 +1,48 @@
/// See: [eradicate.py](https://github.com/myint/eradicate/blob/98f199940979c94447a461d50d27862b118b282d/eradicate.py)
use aho_corasick::AhoCorasick;
use once_cell::sync::Lazy;
use regex::Regex;
use regex::{Regex, RegexSet};

use ruff_python_parser::parse_suite;

static CODE_INDICATORS: Lazy<AhoCorasick> = Lazy::new(|| {
AhoCorasick::new([
"(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue",
"import",
])
.unwrap()
});
charliermarsh marked this conversation as resolved.
Show resolved Hide resolved

static ALLOWLIST_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"^(?i)(?:pylint|pyright|noqa|nosec|region|endregion|type:\s*ignore|fmt:\s*(on|off)|isort:\s*(on|off|skip|skip_file|split|dont-add-imports(:\s*\[.*?])?)|mypy:|SPDX-License-Identifier:)",
r"^(?i)(?:pylint|pyright|noqa|nosec|region|endregion|type:\s*ignore|fmt:\s*(on|off)|isort:\s*(on|off|skip|skip_file|split|dont-add-imports(:\s*\[.*?])?)|mypy:|SPDX-License-Identifier:|(?:en)?coding[:=][ \t]*([-_.a-zA-Z0-9]+))",
).unwrap()
});
static BRACKET_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[()\[\]{}\s]+$").unwrap());
static CODE_INDICATORS: &[&str] = &[
"(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue", "import",
];
static CODE_KEYWORDS: Lazy<Vec<Regex>> = Lazy::new(|| {
vec![
Regex::new(r"^\s*elif\s+.*\s*:\s*$").unwrap(),
Regex::new(r"^\s*else\s*:\s*$").unwrap(),
Regex::new(r"^\s*try\s*:\s*$").unwrap(),
Regex::new(r"^\s*finally\s*:\s*$").unwrap(),
Regex::new(r"^\s*except\s+.*\s*:\s*$").unwrap(),
]
charliermarsh marked this conversation as resolved.
Show resolved Hide resolved
});
static CODING_COMMENT_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)").unwrap());

static HASH_NUMBER: Lazy<Regex> = Lazy::new(|| Regex::new(r"#\d").unwrap());
static MULTILINE_ASSIGNMENT_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^\s*([(\[]\s*)?(\w+\s*,\s*)*\w+\s*([)\]]\s*)?=.*[(\[{]$").unwrap());
static PARTIAL_DICTIONARY_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r#"^\s*['"]\w+['"]\s*:.+[,{]\s*(#.*)?$"#).unwrap());
static PRINT_RETURN_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(print|return)\b\s*").unwrap());

static POSITIVE_CASES: Lazy<RegexSet> = Lazy::new(|| {
RegexSet::new([
// Keywords
r"^(?:elif\s+.*\s*:|else\s*:|try\s*:|finally\s*:|except\s+.*\s*:)$",
// Partial dictionary
r#"^['"]\w+['"]\s*:.+[,{]\s*(#.*)?$"#,
// Multiline assignment
r"^(?:[(\[]\s*)?(?:\w+\s*,\s*)*\w+\s*([)\]]\s*)?=.*[(\[{]$",
// Brackets,
r"^[()\[\]{}\s]+$",
])
.unwrap()
});

/// Returns `true` if a comment contains Python code.
pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool {
let line = if let Some(line) = line.trim().strip_prefix('#') {
line.trim_start_matches([' ', '#'])
} else {
let line = line.trim_start_matches([' ', '#']).trim_end();

// Fast path: if none of the indicators are present, the line is not code.
if !CODE_INDICATORS.is_match(line) {
return false;
};
}
charliermarsh marked this conversation as resolved.
Show resolved Hide resolved

// Ignore task tag comments (e.g., "# TODO(tom): Refactor").
if line
Expand All @@ -48,65 +53,36 @@ pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool {
return false;
}

// Ignore non-comment related hashes (e.g., "# Issue #999").
if HASH_NUMBER.is_match(line) {
return false;
}

// Ignore whitelisted comments.
if ALLOWLIST_REGEX.is_match(line) {
return false;
}

if CODING_COMMENT_REGEX.is_match(line) {
return false;
}

// Check that this is possibly code.
if CODE_INDICATORS.iter().all(|symbol| !line.contains(symbol)) {
Copy link
Member

@MichaReiser MichaReiser Sep 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if it helps performance but some of the above regex could be combined with https://docs.rs/regex/latest/regex/struct.RegexSet.html . But we may also be able to join the Regex by simply ORing them together (A | B | C) instead of having multiple regex expressions

// Ignore non-comment related hashes (e.g., "# Issue #999").
if HASH_NUMBER.is_match(line) {
return false;
}

if multiline_case(line) {
return true;
}

if CODE_KEYWORDS.iter().any(|symbol| symbol.is_match(line)) {
return true;
}

let line = PRINT_RETURN_REGEX.replace_all(line, "");

if PARTIAL_DICTIONARY_REGEX.is_match(&line) {
return true;
}

// Finally, compile the source code.
parse_suite(&line, "<filename>").is_ok()
}

/// Returns `true` if a line is probably part of some multiline code.
fn multiline_case(line: &str) -> bool {
// If the comment made it this far, and ends in a continuation, assume it's code.
if line.ends_with('\\') {
return true;
}

if MULTILINE_ASSIGNMENT_REGEX.is_match(line) {
return true;
}

if BRACKET_REGEX.is_match(line) {
// If the comment matches any of the specified positive cases, assume it's code.
if POSITIVE_CASES.is_match(line) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interestingly, combining HASH_NUMBER and ALLOWLIST_REGEX into a single regex ended up hurting performance, so I left those as two separate passes.

return true;
}

false
// Finally, compile the source code.
parse_suite(line, "<filename>").is_ok()
}

#[cfg(test)]
mod tests {
use super::comment_contains_code;
use crate::settings::TASK_TAGS;

use super::comment_contains_code;

#[test]
fn comment_contains_code_basic() {
assert!(comment_contains_code("# x = 1", &[]));
Expand All @@ -127,7 +103,6 @@ mod tests {
assert!(!comment_contains_code("# 123", &[]));
assert!(!comment_contains_code("# 123.1", &[]));
assert!(!comment_contains_code("# 1, 2, 3", &[]));
assert!(!comment_contains_code("x = 1 # x = 1", &[]));
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The method assumes you pass a comment, so this test doesn't really make sense.

assert!(!comment_contains_code(
"# pylint: disable=redefined-outer-name",
&[]
Expand All @@ -150,7 +125,6 @@ mod tests {
fn comment_contains_code_with_print() {
assert!(comment_contains_code("#print", &[]));
assert!(comment_contains_code("#print(1)", &[]));
assert!(comment_contains_code("#print 1", &[]));
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is Python 2 code.


assert!(!comment_contains_code("#to print", &[]));
}
Expand Down
Loading