-
Notifications
You must be signed in to change notification settings - Fork 580
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Delete orphan files for topics. #8185
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,7 @@ | |
#include <seastar/core/with_scheduling_group.hh> | ||
#include <seastar/coroutine/maybe_yield.hh> | ||
#include <seastar/coroutine/parallel_for_each.hh> | ||
#include <seastar/util/file.hh> | ||
|
||
#include <fmt/format.h> | ||
|
||
|
@@ -401,6 +402,75 @@ ss::future<> log_manager::remove(model::ntp ntp) { | |
}); | ||
} | ||
|
||
/// Removes leftover on-disk partition directories for `ntp`.
///
/// The topic directory is `data_directory_path` joined with `ntp.topic_path()`.
/// The call returns without touching the disk when `ntp` is still present in
/// `_logs`, or when the topic directory does not exist.
///
/// Otherwise the topic directory is walked and every entry that parses as a
/// partition directory with the same partition id as `ntp` and a revision id
/// less than or equal to `rev` is removed recursively. Finally
/// dispatch_topic_dir_deletion() is invoked for the topic directory.
///
/// A std::filesystem::filesystem_error or ss::broken_promise raised by the
/// walk is rethrown only if the topic directory still exists afterwards; if
/// the directory is gone, the error is dropped and the coroutine returns.
ss::future<> log_manager::remove_orphan( | ||
ss::sstring data_directory_path, model::ntp ntp, model::revision_id rev) { | ||
vlog(stlog.info, "Asked to remove orphan for: {} revision: {}", ntp, rev); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: can we be more specific here i.e. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think no, because we remove all directories with rev less than provided. |
||
// The ntp is still tracked in _logs, so nothing is removed for it.
if (_logs.contains(ntp)) { | ||
co_return; | ||
} | ||
Comment on lines
+408
to
+410
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This check can be done before creating a directory path |
||
|
||
const auto topic_directory_path | ||
= (std::filesystem::path(data_directory_path) / ntp.topic_path()) | ||
.string(); | ||
|
||
auto topic_directory_exist = co_await ss::file_exists(topic_directory_path); | ||
if (!topic_directory_exist) { | ||
co_return; | ||
} | ||
|
||
// Holds an exception from the walk so that it can be examined after the
// try block; co_await is not permitted inside a catch handler.
std::exception_ptr eptr; | ||
try { | ||
co_await directory_walker::walk( | ||
topic_directory_path, | ||
[&ntp, &topic_directory_path, &rev](ss::directory_entry entry) { | ||
auto ntp_directory_data | ||
= ntp_directory_path::parse_partition_directory(entry.name); | ||
// parse_partition_directory yielded no value for this entry: skip it.
if (!ntp_directory_data) { | ||
return ss::now(); | ||
} | ||
if ( | ||
ntp_directory_data->partition_id == ntp.tp.partition | ||
&& ntp_directory_data->revision_id <= rev) { | ||
auto ntp_directory = std::filesystem::path( | ||
topic_directory_path) | ||
/ std::filesystem::path(entry.name); | ||
vlog( | ||
stlog.info, | ||
"Cleaning up ntp [{}] rev {} directory {} ", | ||
ntp, | ||
ntp_directory_data->revision_id, | ||
ntp_directory); | ||
return ss::recursive_remove_directory(ntp_directory); | ||
} | ||
return ss::now(); | ||
}); | ||
} catch (std::filesystem::filesystem_error const&) { | ||
eptr = std::current_exception(); | ||
} catch (ss::broken_promise const&) { | ||
// List directory can throw ss::broken_promise exception when directory | ||
// was deleted while list directory is processing | ||
eptr = std::current_exception(); | ||
} | ||
// Re-check the topic directory: the stashed error is propagated only when
// the directory is still there.
if (eptr) { | ||
topic_directory_exist = co_await ss::file_exists(topic_directory_path); | ||
if (topic_directory_exist) { | ||
std::rethrow_exception(eptr); | ||
} else { | ||
vlog( | ||
stlog.debug, | ||
"Cleaning orphan. Topic directory was deleted: {}", | ||
topic_directory_path); | ||
co_return; | ||
} | ||
} | ||
vlog( | ||
stlog.info, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: think we can lower this to debug (logged for every orphan cleanup).
We don't know if the topic directory is "orphan" at this point? there may be other partitions we are just attempting to schedule a deletion if it is empty. |
||
"Trying to clean up orphan topic directory: {}", | ||
topic_directory_path); | ||
co_await dispatch_topic_dir_deletion(std::move(topic_directory_path)); | ||
co_return; | ||
} | ||
|
||
ss::future<> log_manager::dispatch_topic_dir_deletion(ss::sstring dir) { | ||
return ss::smp::submit_to( | ||
0, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -127,16 +127,23 @@ def __init__(self, test_context): | |
|
||
self.kafka_tools = KafkaCliTools(self.redpanda) | ||
|
||
# Produce to self.topic, then return True when storage lists exactly 9
# partitions for that topic in the "kafka" namespace.
# NOTE(review): the two 1024 arguments are presumably message count and
# message size; confirm against KafkaCliTools.produce.
def produce_until_partitions(self): | ||
self.kafka_tools.produce(self.topic, 1024, 1024) | ||
storage = self.redpanda.storage() | ||
return len(list(storage.partitions("kafka", self.topic))) == 9 | ||
|
||
# Diagnostic helper: for every node, run `find` over the redpanda data
# directory via ssh and log each output line at error level.
def dump_storage_listing(self): | ||
for node in self.redpanda.nodes: | ||
self.logger.error(f"Storage listing on {node.name}:") | ||
for line in node.account.ssh_capture( | ||
f"find {self.redpanda.DATA_DIR}"): | ||
self.logger.error(line.strip()) | ||
|
||
@cluster(num_nodes=3) | ||
@parametrize(with_restart=False) | ||
@parametrize(with_restart=True) | ||
def topic_delete_test(self, with_restart): | ||
def produce_until_partitions(): | ||
self.kafka_tools.produce(self.topic, 1024, 1024) | ||
storage = self.redpanda.storage() | ||
return len(list(storage.partitions("kafka", self.topic))) == 9 | ||
|
||
wait_until(lambda: produce_until_partitions(), | ||
wait_until(lambda: self.produce_until_partitions(), | ||
timeout_sec=30, | ||
backoff_sec=2, | ||
err_msg="Expected partition did not materialize") | ||
|
@@ -160,13 +167,69 @@ def produce_until_partitions(): | |
err_msg="Topic storage was not removed") | ||
|
||
except: | ||
# On errors, dump listing of the storage location | ||
for node in self.redpanda.nodes: | ||
self.logger.error(f"Storage listing on {node.name}:") | ||
for line in node.account.ssh_capture( | ||
f"find {self.redpanda.DATA_DIR}"): | ||
self.logger.error(line.strip()) | ||
self.dump_storage_listing() | ||
raise | ||
|
||
# Checks that topic files left behind on one node are removed once that node
# is restarted.
#
# Steps: wait for the topic's partitions to appear, mark the topic directory
# immutable on the last node, delete the topic, wait until the topic storage is
# gone from every other node but still present on that node, stop the node,
# clear the immutable flag, start the node again and wait until the topic
# storage is purged.
#
# NOTE(review): the allowed 'filesystem error: remove failed' log line is
# presumably emitted when deletion hits the immutable directory; confirm.
@cluster(num_nodes=3, log_allow_list=[r'filesystem error: remove failed']) | ||
def topic_delete_orphan_files_test(self): | ||
wait_until(lambda: self.produce_until_partitions(), | ||
timeout_sec=30, | ||
backoff_sec=2, | ||
err_msg="Expected partition did not materialize") | ||
|
||
# Sanity check the kvstore checks: there should be at least one kvstore entry | ||
# per partition while the topic exists. | ||
assert sum(get_kvstore_topic_key_counts( | ||
self.redpanda).values()) >= self.topics[0].partition_count | ||
|
||
# The last node is the one whose topic directory is made immutable and which
# is later stopped and restarted.
down_node = self.redpanda.nodes[-1] | ||
try: | ||
# Make topic directory immutable to prevent deleting | ||
down_node.account.ssh( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nice trick 👍 |
||
f"chattr +i {self.redpanda.DATA_DIR}/kafka/{self.topic}") | ||
|
||
self.kafka_tools.delete_topic(self.topic) | ||
|
||
# True when the topic is still listed in the "kafka" namespace on down_node
# and is absent from that namespace on every other node.
def topic_deleted_on_all_nodes_except_one(redpanda, down_node, | ||
topic_name): | ||
storage = redpanda.storage() | ||
log_not_removed_on_down = topic_name in next( | ||
filter(lambda x: x.name == down_node.name, | ||
storage.nodes)).ns["kafka"].topics | ||
logs_removed_on_others = all( | ||
map( | ||
lambda n: topic_name not in n.ns["kafka"].topics, | ||
filter(lambda x: x.name != down_node.name, | ||
storage.nodes))) | ||
return log_not_removed_on_down and logs_removed_on_others | ||
|
||
try: | ||
wait_until( | ||
lambda: topic_deleted_on_all_nodes_except_one( | ||
self.redpanda, down_node, self.topic), | ||
timeout_sec=30, | ||
backoff_sec=2, | ||
err_msg= | ||
"Topic storage was not removed from running nodes or removed from down node" | ||
) | ||
except: | ||
self.dump_storage_listing() | ||
raise | ||
|
||
self.redpanda.stop_node(down_node) | ||
# Clear the immutable flag whether or not the steps above succeeded.
finally: | ||
down_node.account.ssh( | ||
f"chattr -i {self.redpanda.DATA_DIR}/kafka/{self.topic}") | ||
|
||
self.redpanda.start_node(down_node) | ||
|
||
# With the node running again, wait for topic_storage_purged to report that
# the topic storage is gone.
try: | ||
wait_until(lambda: topic_storage_purged(self.redpanda, self.topic), | ||
timeout_sec=30, | ||
backoff_sec=2, | ||
err_msg="Topic storage was not removed") | ||
except: | ||
self.dump_storage_listing() | ||
raise | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
coroutine?