Optimize `VecDeque::drain` for (half-)open ranges #118264

lukas-code · 2023-11-24T19:54:27Z

The most common use cases of VecDeque::drain consume either the entire queue or elements from the front or back.¹ This PR makes these operations faster by optimizing the generated code of the destructor of the drain:

.drain(..) is now the same as .clear().
.drain(n..) is now (almost²) the same as .truncate(n).
.drain(..n) is now an efficient "advance" function. This operation is not provided by a dedicated function and optimizing it is my main motivation for this PR.

Previously, all of these cases generated a function call to the destructor of the DropGuard, emitting a lot of unused machine code as well as unnecessary branches and loads/stores of stack variables.

There are no algorithmic changes in this PR, but it simplifies the code enough to allow LLVM to recognize the special cases and optimize accordingly. Most notably, it allows elimination of the rather large wrap_copy function.

Some rudimentary microbenchmarks show a performance improvement of ~3x-4x on my machine for the special cases and roughly equal performance for the general case.

Best reviewed commit by commit.

generated assembly

before:

clear:
	sub rsp, 40
	mov rax, qword ptr [rdi + 24]
	mov qword ptr [rdi + 24], 0
	mov qword ptr [rsp], rdi
	mov qword ptr [rsp + 8], rax
	xorps xmm0, xmm0
	movups xmmword ptr [rsp + 16], xmm0
	mov qword ptr [rsp + 32], rax
	test rax, rax
	je .LBB1_2
	mov rcx, qword ptr [rdi]
	mov rdx, qword ptr [rdi + 16]
	xor esi, esi
	cmp rdx, rcx
	cmovae rsi, rcx
	sub rdx, rsi
	mov rsi, rcx
	sub rsi, rdx
	lea rdi, [rdx + rax]
	cmp rsi, rax
	cmovb rdi, rcx
	sub rdi, rdx
	mov qword ptr [rsp + 16], rdi
	mov qword ptr [rsp + 32], 0
.LBB1_2:
	mov rdi, rsp
	call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>
	add rsp, 40
	ret

truncate:
	mov rax, qword ptr [rdi + 24]
	sub rax, rsi
	jbe .LBB2_2
	sub rsp, 40
	mov qword ptr [rdi + 24], rsi
	mov qword ptr [rsp], rdi
	mov qword ptr [rsp + 8], rax
	mov rcx, qword ptr [rdi]
	mov rdx, qword ptr [rdi + 16]
	add rdx, rsi
	xor edi, edi
	cmp rdx, rcx
	cmovae rdi, rcx
	mov qword ptr [rsp + 24], 0
	sub rdx, rdi
	mov rdi, rcx
	sub rdi, rdx
	lea r8, [rdx + rax]
	cmp rdi, rax
	cmovb r8, rcx
	sub rsi, rdx
	add rsi, r8
	mov qword ptr [rsp + 16], rsi
	mov qword ptr [rsp + 32], 0
	mov rdi, rsp
	call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>
	add rsp, 40

advance:
	mov rcx, qword ptr [rdi + 24]
	mov rax, rcx
	sub rax, rsi
	jbe .LBB3_1
	sub rsp, 40
	mov qword ptr [rdi + 24], 0
	mov qword ptr [rsp], rdi
	mov qword ptr [rsp + 8], rsi
	mov qword ptr [rsp + 16], 0
	mov qword ptr [rsp + 24], rax
	mov qword ptr [rsp + 32], rsi
	test rsi, rsi
	je .LBB3_6
	mov rax, qword ptr [rdi]
	mov rcx, qword ptr [rdi + 16]
	xor edx, edx
	cmp rcx, rax
	cmovae rdx, rax
	sub rcx, rdx
	mov rdx, rax
	sub rdx, rcx
	lea rdi, [rcx + rsi]
	cmp rdx, rsi
	cmovb rdi, rax
	sub rdi, rcx
	mov qword ptr [rsp + 16], rdi
	mov qword ptr [rsp + 32], 0
.LBB3_6:
	mov rdi, rsp
	call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>
	add rsp, 40
	ret
.LBB3_1:
	test rcx, rcx
	je .LBB3_3
	mov qword ptr [rdi + 24], 0
.LBB3_3:
	mov qword ptr [rdi + 16], 0
	ret

remove:
	sub rsp, 40
	cmp rdx, rsi
	jb .LBB4_5
	mov rax, qword ptr [rdi + 24]
	mov rcx, rax
	sub rcx, rdx
	jb .LBB4_6
	mov qword ptr [rdi + 24], rsi
	mov qword ptr [rsp], rdi
	sub rdx, rsi
	mov qword ptr [rsp + 8], rdx
	mov qword ptr [rsp + 16], rsi
	mov qword ptr [rsp + 24], rcx
	mov qword ptr [rsp + 32], rdx
	je .LBB4_4
	mov rax, qword ptr [rdi]
	mov rcx, qword ptr [rdi + 16]
	add rcx, rsi
	xor edi, edi
	cmp rcx, rax
	cmovae rdi, rax
	sub rcx, rdi
	mov rdi, rax
	sub rdi, rcx
	lea r8, [rcx + rdx]
	cmp rdi, rdx
	cmovb r8, rax
	sub rsi, rcx
	add rsi, r8
	mov qword ptr [rsp + 16], rsi
	mov qword ptr [rsp + 32], 0
.LBB4_4:
	mov rdi, rsp
	call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>
	add rsp, 40
	ret
.LBB4_5:
	lea rax, [rip + .L__unnamed_2]
	mov rdi, rsi
	mov rsi, rdx
	mov rdx, rax
	call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL]
.LBB4_6:
	lea rcx, [rip + .L__unnamed_2]
	mov rdi, rdx
	mov rsi, rax
	mov rdx, rcx
	call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL]

core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>:
	push rbp
	push r15
	push r14
	push r13
	push r12
	push rbx
	sub rsp, 24
	mov rsi, qword ptr [rdi + 32]
	test rsi, rsi
	je .LBB0_2
	mov rax, qword ptr [rdi + 16]
	add rsi, rax
	jb .LBB0_45
.LBB0_2:
	mov r13, qword ptr [rdi]
	mov rbp, qword ptr [rdi + 8]
	mov rbx, qword ptr [r13 + 24]
	lea r12, [rbx + rbp]
	mov r15, qword ptr [rdi + 24]
	lea rsi, [r15 + r12]
	test rbx, rbx
	je .LBB0_10
	test r15, r15
	je .LBB0_42
	cmp rbx, r15
	jbe .LBB0_12
	mov r14, qword ptr [r13]
	mov rax, qword ptr [r13 + 16]
	add r12, rax
	xor ecx, ecx
	cmp r12, r14
	mov rdx, r14
	cmovb rdx, rcx
	sub r12, rdx
	add rbx, rax
	cmp rbx, r14
	cmovae rcx, r14
	sub rbx, rcx
	mov rcx, rbx
	sub rcx, r12
	je .LBB0_42
	mov rdi, qword ptr [r13 + 8]
	mov rax, rcx
	add rax, r14
	cmovae rax, rcx
	mov r8, r14
	sub r8, r12
	mov rcx, r14
	sub rcx, rbx
	mov rdx, r15
	sub rdx, r8
	mov qword ptr [rsp + 16], rsi
	jbe .LBB0_18
	cmp rax, r15
	jae .LBB0_24
	mov rdx, r15
	sub rdx, r8
	shl rdx, 2
	cmp r15, rcx
	jbe .LBB0_30
	sub r8, rcx
	mov qword ptr [rsp], rdi
	mov rax, qword ptr [rsp]
	lea rdi, [rax + 4*r8]
	mov rsi, qword ptr [rsp]
	mov qword ptr [rsp + 8], rcx
	mov r15, r8
	call qword ptr [rip + memmove@GOTPCREL]
	sub r14, r15
	mov rax, qword ptr [rsp]
	lea rsi, [rax + 4*r14]
	shl r15, 2
	mov rdi, qword ptr [rsp]
	mov rdx, r15
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, qword ptr [rsp]
	lea rsi, [rdi + 4*r12]
	lea rdi, [rdi + 4*rbx]
	mov r15, qword ptr [rsp + 8]
	jmp .LBB0_36
.LBB0_10:
	test r15, r15
	je .LBB0_17
	mov rax, qword ptr [r13]
	sub rsi, rbp
	add rbp, qword ptr [r13 + 16]
	xor ecx, ecx
	cmp rbp, rax
	cmovae rcx, rax
	sub rbp, rcx
	mov qword ptr [r13 + 16], rbp
	jmp .LBB0_43
.LBB0_12:
	mov rdx, qword ptr [r13 + 16]
	mov r15, qword ptr [r13]
	lea rax, [rdx + rbp]
	xor ecx, ecx
	cmp rax, r15
	cmovae rcx, r15
	mov r12, rax
	sub r12, rcx
	mov rcx, r12
	sub rcx, rdx
	je .LBB0_41
	mov rdi, qword ptr [r13 + 8]
	mov rax, rcx
	add rax, r15
	cmovae rax, rcx
	mov r8, r15
	sub r8, rdx
	mov rcx, r15
	sub rcx, r12
	mov r14, rbx
	sub r14, r8
	mov qword ptr [rsp + 16], rsi
	jbe .LBB0_21
	cmp rax, rbx
	jae .LBB0_26
	mov qword ptr [rsp], rdx
	mov rdx, rbx
	sub rdx, r8
	shl rdx, 2
	cmp rbx, rcx
	jbe .LBB0_32
	sub r8, rcx
	mov rbx, rdi
	lea rdi, [rdi + 4*r8]
	mov rsi, rbx
	mov qword ptr [rsp + 8], rcx
	mov r14, r8
	call qword ptr [rip + memmove@GOTPCREL]
	sub r15, r14
	lea rsi, [rbx + 4*r15]
	shl r14, 2
	mov rdi, rbx
	mov rdx, r14
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, rbx
	mov rax, qword ptr [rsp]
	lea rsi, [rbx + 4*rax]
	lea rdi, [rbx + 4*r12]
	mov rbx, qword ptr [rsp + 8]
	jmp .LBB0_40
.LBB0_17:
	xorps xmm0, xmm0
	movups xmmword ptr [r13 + 16], xmm0
	jmp .LBB0_44
.LBB0_18:
	mov r14, r15
	sub r14, rcx
	jbe .LBB0_28
	cmp rax, r15
	jae .LBB0_33
	lea rax, [rcx + r12]
	sub r15, rcx
	lea rsi, [rdi + 4*rax]
	shl r15, 2
	mov r14, rdi
	mov rdx, r15
	mov r15, rcx
	jmp .LBB0_31
.LBB0_21:
	mov r14, rbx
	sub r14, rcx
	jbe .LBB0_29
	cmp rax, rbx
	jae .LBB0_34
	lea rax, [rcx + rdx]
	sub rbx, rcx
	lea rsi, [rdi + 4*rax]
	shl rbx, 2
	mov r14, rdi
	mov r15, rdx
	mov rdx, rbx
	mov rbx, rcx
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, r14
	lea rsi, [r14 + 4*r15]
	lea rdi, [r14 + 4*r12]
	jmp .LBB0_40
.LBB0_24:
	sub r15, rcx
	jbe .LBB0_35
	sub rcx, r8
	mov qword ptr [rsp + 8], rcx
	lea rsi, [rdi + 4*r12]
	mov r12, rdi
	lea rdi, [rdi + 4*rbx]
	lea rdx, [4*r8]
	mov r14, r8
	call qword ptr [rip + memmove@GOTPCREL]
	add r14, rbx
	lea rdi, [r12 + 4*r14]
	mov rbx, qword ptr [rsp + 8]
	lea rdx, [4*rbx]
	mov rsi, r12
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, r12
	lea rsi, [r12 + 4*rbx]
	jmp .LBB0_36
.LBB0_26:
	sub rbx, rcx
	jbe .LBB0_37
	sub rcx, r8
	lea rsi, [rdi + 4*rdx]
	mov r15, rdi
	lea rdi, [rdi + 4*r12]
	lea rdx, [4*r8]
	mov r14, rcx
	mov qword ptr [rsp], r8
	call qword ptr [rip + memmove@GOTPCREL]
	add r12, qword ptr [rsp]
	lea rdi, [r15 + 4*r12]
	lea rdx, [4*r14]
	mov rsi, r15
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, r15
	lea rsi, [r15 + 4*r14]
	jmp .LBB0_40
.LBB0_28:
	lea rsi, [rdi + 4*r12]
	lea rdi, [rdi + 4*rbx]
	jmp .LBB0_36
.LBB0_29:
	lea rsi, [rdi + 4*rdx]
	lea rdi, [rdi + 4*r12]
	jmp .LBB0_40
.LBB0_30:
	lea rax, [r8 + rbx]
	mov r14, rdi
	lea rdi, [rdi + 4*rax]
	mov rsi, r14
	mov r15, r8
.LBB0_31:
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, r14
	lea rsi, [r14 + 4*r12]
	lea rdi, [r14 + 4*rbx]
	jmp .LBB0_36
.LBB0_32:
	lea rax, [r12 + r8]
	mov rbx, rdi
	lea rdi, [rdi + 4*rax]
	mov rsi, rbx
	mov r14, r8
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, rbx
	mov rax, qword ptr [rsp]
	lea rsi, [rbx + 4*rax]
	jmp .LBB0_38
.LBB0_33:
	lea rsi, [rdi + 4*r12]
	mov r15, rdi
	lea rdi, [rdi + 4*rbx]
	lea rdx, [4*rcx]
	mov rbx, rcx
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, r15
	add rbx, r12
	lea rsi, [r15 + 4*rbx]
	mov r15, r14
	jmp .LBB0_36
.LBB0_34:
	lea rsi, [rdi + 4*rdx]
	mov rbx, rdi
	lea rdi, [rdi + 4*r12]
	mov r15, rdx
	lea rdx, [4*rcx]
	mov r12, rcx
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, rbx
	add r12, r15
	lea rsi, [rbx + 4*r12]
	jmp .LBB0_39
.LBB0_35:
	lea rsi, [rdi + 4*r12]
	mov r14, rdi
	lea rdi, [rdi + 4*rbx]
	mov r12, rdx
	lea rdx, [4*r8]
	mov r15, r8
	call qword ptr [rip + memmove@GOTPCREL]
	add r15, rbx
	mov rsi, r14
	lea rdi, [r14 + 4*r15]
	mov r15, r12
.LBB0_36:
	shl r15, 2
	mov rdx, r15
	call qword ptr [rip + memmove@GOTPCREL]
	mov rsi, qword ptr [rsp + 16]
	jmp .LBB0_42
.LBB0_37:
	lea rsi, [rdi + 4*rdx]
	mov rbx, rdi
	lea rdi, [rdi + 4*r12]
	lea rdx, [4*r8]
	mov r15, r8
	call qword ptr [rip + memmove@GOTPCREL]
	add r12, r15
	mov rsi, rbx
.LBB0_38:
	lea rdi, [rbx + 4*r12]
.LBB0_39:
	mov rbx, r14
.LBB0_40:
	shl rbx, 2
	mov rdx, rbx
	call qword ptr [rip + memmove@GOTPCREL]
	mov r15, qword ptr [r13]
	mov rax, qword ptr [r13 + 16]
	add rax, rbp
	mov rsi, qword ptr [rsp + 16]
.LBB0_41:
	xor ecx, ecx
	cmp rax, r15
	cmovae rcx, r15
	sub rax, rcx
	mov qword ptr [r13 + 16], rax
.LBB0_42:
	sub rsi, rbp
.LBB0_43:
	mov qword ptr [r13 + 24], rsi
.LBB0_44:
	add rsp, 24
	pop rbx
	pop r12
	pop r13
	pop r14
	pop r15
	pop rbp
	ret
.LBB0_45:
	lea rdx, [rip + .L__unnamed_1]
	mov rdi, rax
	call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL]

after:

clear:
	movups xmmword ptr [rdi + 16], xmm0
	ret

truncate:
	cmp qword ptr [rdi + 24], rsi
	jbe .LBB2_4
	test rsi, rsi
	jne .LBB2_3
	mov qword ptr [rdi + 16], 0
.LBB2_3:
	mov qword ptr [rdi + 24], rsi
.LBB2_4:
	ret

advance:
	mov rcx, qword ptr [rdi + 24]
	mov rax, rcx
	sub rax, rsi
	jbe .LBB3_1
	mov rcx, qword ptr [rdi]
	add rsi, qword ptr [rdi + 16]
	xor edx, edx
	cmp rsi, rcx
	cmovae rdx, rcx
	sub rsi, rdx
	mov qword ptr [rdi + 16], rsi
	mov qword ptr [rdi + 24], rax
	ret
.LBB3_1:
	test rcx, rcx
	je .LBB3_3
	mov qword ptr [rdi + 24], 0
.LBB3_3:
	mov qword ptr [rdi + 16], 0
	ret

remove:
	push rbp
	push r15
	push r14
	push r13
	push r12
	push rbx
	push rax
	mov r15, rsi
	mov r14, rdx
	sub r14, rsi
	jb .LBB4_9
	mov rbx, rdi
	mov r12, qword ptr [rdi + 24]
	mov r13, r12
	sub r13, rdx
	jb .LBB4_10
	mov qword ptr [rbx + 24], r15
	mov rbp, r12
	sub rbp, r14
	test r15, r15
	je .LBB4_4
	cmp rbp, r15
	jne .LBB4_11
.LBB4_4:
	cmp r12, r14
	jne .LBB4_6
.LBB4_5:
	mov qword ptr [rbx + 16], 0
	jmp .LBB4_8
.LBB4_11:
	mov rdi, rbx
	mov rsi, r14
	mov rdx, r15
	mov rcx, r13
	call <<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data
	cmp r12, r14
	je .LBB4_5
.LBB4_6:
	cmp r13, r15
	jbe .LBB4_8
	mov rax, qword ptr [rbx]
	add r14, qword ptr [rbx + 16]
	xor ecx, ecx
	cmp r14, rax
	cmovae rcx, rax
	sub r14, rcx
	mov qword ptr [rbx + 16], r14
.LBB4_8:
	mov qword ptr [rbx + 24], rbp
	add rsp, 8
	pop rbx
	pop r12
	pop r13
	pop r14
	pop r15
	pop rbp
	ret
.LBB4_9:
	lea rax, [rip + .L__unnamed_1]
	mov rdi, r15
	mov rsi, rdx
	mov rdx, rax
	call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL]
.LBB4_10:
	lea rax, [rip + .L__unnamed_1]
	mov rdi, rdx
	mov rsi, r12
	mov rdx, rax
	call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL]

<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data:
	push rbp
	push r15
	push r14
	push r13
	push r12
	push rbx
	push rax
	mov r14, rsi
	cmp rdx, rcx
	jae .LBB0_1
	mov r12, qword ptr [rdi]
	mov rax, qword ptr [rdi + 16]
	add r14, rax
	xor ecx, ecx
	cmp r14, r12
	cmovae rcx, r12
	sub r14, rcx
	mov r15, rdx
	mov r13, r14
	mov r14, rax
	mov rcx, r13
	sub rcx, r14
	je .LBB0_18
.LBB0_4:
	mov rdi, qword ptr [rdi + 8]
	mov rax, rcx
	add rax, r12
	cmovae rax, rcx
	mov rbx, r12
	sub rbx, r14
	mov rcx, r12
	sub rcx, r13
	mov rbp, r15
	sub rbp, rbx
	jbe .LBB0_5
	cmp rax, r15
	jae .LBB0_12
	mov rdx, r15
	sub rdx, rbx
	shl rdx, 2
	cmp r15, rcx
	jbe .LBB0_16
	sub rbx, rcx
	mov rbp, rdi
	lea rdi, [rdi + 4*rbx]
	mov r15, qword ptr [rip + memmove@GOTPCREL]
	mov rsi, rbp
	mov qword ptr [rsp], rcx
	call r15
	sub r12, rbx
	lea rsi, [4*r12]
	add rsi, rbp
	shl rbx, 2
	mov rdi, rbp
	mov rdx, rbx
	call r15
	mov rdi, rbp
	lea rsi, [4*r14]
	add rsi, rbp
	lea rdi, [4*r13]
	add rdi, rbp
	mov r15, qword ptr [rsp]
	jmp .LBB0_7
.LBB0_1:
	mov r15, rcx
	add r14, rdx
	mov r12, qword ptr [rdi]
	mov r13, qword ptr [rdi + 16]
	add r14, r13
	xor eax, eax
	cmp r14, r12
	mov rcx, r12
	cmovb rcx, rax
	sub r14, rcx
	add r13, rdx
	cmp r13, r12
	cmovae rax, r12
	sub r13, rax
	mov rcx, r13
	sub rcx, r14
	jne .LBB0_4
.LBB0_18:
	add rsp, 8
	pop rbx
	pop r12
	pop r13
	pop r14
	pop r15
	pop rbp
	ret
.LBB0_5:
	mov rbx, r15
	sub rbx, rcx
	jbe .LBB0_6
	cmp rax, r15
	jae .LBB0_9
	lea rax, [rcx + r14]
	sub r15, rcx
	lea rsi, [rdi + 4*rax]
	shl r15, 2
	mov rbx, rdi
	mov rdx, r15
	mov r15, rcx
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, rbx
	lea rsi, [rbx + 4*r14]
	lea rdi, [rbx + 4*r13]
	jmp .LBB0_7
.LBB0_12:
	sub r15, rcx
	jbe .LBB0_13
	sub rcx, rbx
	lea rsi, [rdi + 4*r14]
	mov r12, rdi
	lea rdi, [rdi + 4*r13]
	lea rdx, [4*rbx]
	mov r14, qword ptr [rip + memmove@GOTPCREL]
	mov rbp, rcx
	call r14
	add rbx, r13
	lea rdi, [r12 + 4*rbx]
	lea rdx, [4*rbp]
	mov rsi, r12
	call r14
	mov rdi, r12
	lea rsi, [r12 + 4*rbp]
	jmp .LBB0_7
.LBB0_6:
	lea rsi, [rdi + 4*r14]
	lea rdi, [rdi + 4*r13]
	jmp .LBB0_7
.LBB0_16:
	lea rax, [rbx + r13]
	mov r15, rdi
	lea rdi, [rdi + 4*rax]
	mov rsi, r15
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, r15
	lea rsi, [r15 + 4*r14]
	lea rdi, [r15 + 4*r13]
	mov r15, rbx
	jmp .LBB0_7
.LBB0_9:
	lea rsi, [rdi + 4*r14]
	mov r15, rdi
	lea rdi, [rdi + 4*r13]
	lea rdx, [4*rcx]
	mov r12, rcx
	call qword ptr [rip + memmove@GOTPCREL]
	mov rdi, r15
	add r12, r14
	lea rsi, [r15 + 4*r12]
	mov r15, rbx
	jmp .LBB0_7
.LBB0_13:
	lea rsi, [rdi + 4*r14]
	mov r14, rdi
	lea rdi, [rdi + 4*r13]
	lea rdx, [4*rbx]
	call qword ptr [rip + memmove@GOTPCREL]
	add rbx, r13
	mov rsi, r14
	lea rdi, [r14 + 4*rbx]
	mov r15, rbp
.LBB0_7:
	shl r15, 2
	mov rdx, r15
	add rsp, 8
	pop rbx
	pop r12
	pop r13
	pop r14
	pop r15
	pop rbp
	jmp qword ptr [rip + memmove@GOTPCREL]

source: GitHub code search: full range drain(..) = 7.5k results, from front drain(..n) = 3.2k results, from back drain(n..) = 1.6k results, from middle drain(n..m) = <500 results ↩
.drain(0..) and .clear() reset the head to 0, but .truncate(0) does not. ↩

rustbot · 2023-11-24T19:54:36Z

r? @thomcc

(rustbot has picked a reviewer for you, use r? to override)

AngelicosPhosphoros · 2023-11-25T02:37:12Z

I don't understand this claim:

.drain(0..) and .clear() reset the head to 0, but .truncate(0) does not.

According to docs:

Truncating when len == 0 is equivalent to calling the clear method.

This statements contradict each other. May you elaborate, please?

lukas-code · 2023-11-25T10:36:20Z

@AngelicosPhosphoros This PR is about VecDeque, but you're looking at the docs of Vec. I don't see any guarantee like that on the VecDeque docs, neither on truncate, nor on clear.

AngelicosPhosphoros · 2023-11-25T10:45:33Z

Oh, sorry, missed that.

lukas-code · 2024-01-17T17:26:04Z

Ping @thomcc this PR is waiting for your review.

lukas-code · 2024-02-01T07:48:48Z

r? @the8472

library/alloc/src/collections/vec_deque/drain.rs

the8472

The implementation looks ok. But I think readability has suffered.
Perhaps things can be shuffled around a bit to recover some of that.

library/alloc/src/collections/vec_deque/drain.rs

the8472 · 2024-02-10T09:54:02Z

library/alloc/src/collections/vec_deque/drain.rs

+                if new_len == 0 {
+                    source_deque.head = 0;
+                } else if head_len < tail_len {
+                    source_deque.head = source_deque.to_physical_idx(drain_len);
+                }
+                source_deque.len = new_len;


I think the match was easier to read and grouped things more nicely. Now it's somewhat scattered across this block and the outlined function.

I'm not sure what to do about that. Maybe some comments how there can only be 4 cases and how the control flow handles some cases jointly.

the8472 · 2024-02-10T10:00:53Z

library/alloc/src/collections/vec_deque/drain.rs

+                source_deque.len = new_len;
+
+                #[cold]
+                fn copy_data<T, A: Allocator>(


Maybe it could be called shift_blocks or restore_wrapping or something that describes its purpose rather than what it does internally?

It may copy the bits (in the ptr::copy sense, including potential overlap), but logically we don't want a copy, we want to shift the parts to restore deque invariants.

the8472 · 2024-02-15T18:42:06Z

Thanks for the detailed documentation.

@bors r+

bors · 2024-02-15T18:42:09Z

📌 Commit da1c7f6 has been approved by the8472

It is now in the queue for this repository.

…he8472 Optimize `VecDeque::drain` for (half-)open ranges The most common use cases of `VecDeque::drain` consume either the entire queue or elements from the front or back.[^1] This PR makes these operations faster by optimizing the generated code of the destructor of the drain: * `.drain(..)` is now the same as `.clear()`. * `.drain(n..)` is now (almost[^2]) the same as `.truncate(n)`. * `.drain(..n)` is now an efficient "advance" function. This operation is not provided by a dedicated function and optimizing it is my main motivation for this PR. Previously, all of these cases generated a function call to the destructor of the `DropGuard`, emitting a lot of unused machine code as well as unnecessary branches and loads/stores of stack variables. There are no algorithmic changes in this PR, but it simplifies the code enough to allow LLVM to recognize the special cases and optimize accordingly. Most notably, it allows elimination of the rather large [`wrap_copy`] function. Some [rudimentary microbenchmarks][benches] show a performance improvement of **~3x-4x** on my machine for the special cases and roughly equal performance for the general case. Best reviewed commit by commit. [^1]: source: GitHub code search: [full range `drain(..)` = 7.5k results][full], [from front `drain(..n)` = 3.2k results][front], [from back `drain(n..)` = 1.6k results][back], [from middle `drain(n..m)` = <500 results][middle] [^2]: `.drain(0..)` and `.clear()` reset the head to 0, but `.truncate(0)` does not. [full]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%280%3F%5C.%5C.%5C%29%2F+lang%3ARust [front]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%280%3F%5C.%5C.%5B%5E%29%5D.*%5C%29%2F+lang%3ARust [back]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%28%5B%5E0%5D.*%5C.%5C.%5C%29%2F+lang%3ARust [middle]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%28%5B%5E0%5D.*%5C.%5C.%5B%5E%29%5D.*%5C%29%2F+lang%3ARust [`wrap_copy`]: https://github.com/rust-lang/rust/blob/4fd68eb47bad1c121417ac4450b2f0456150db86/library/alloc/src/collections/vec_deque/mod.rs#L262-L391 [benches]: https://gist.github.com/lukas-code/c97bd707d074c4cc31f241edbc7fd2a2 <details> <summary>generated assembly</summary> before: ```asm clear: sub rsp, 40 mov rax, qword ptr [rdi + 24] mov qword ptr [rdi + 24], 0 mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rax xorps xmm0, xmm0 movups xmmword ptr [rsp + 16], xmm0 mov qword ptr [rsp + 32], rax test rax, rax je .LBB1_2 mov rcx, qword ptr [rdi] mov rdx, qword ptr [rdi + 16] xor esi, esi cmp rdx, rcx cmovae rsi, rcx sub rdx, rsi mov rsi, rcx sub rsi, rdx lea rdi, [rdx + rax] cmp rsi, rax cmovb rdi, rcx sub rdi, rdx mov qword ptr [rsp + 16], rdi mov qword ptr [rsp + 32], 0 .LBB1_2: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret truncate: mov rax, qword ptr [rdi + 24] sub rax, rsi jbe .LBB2_2 sub rsp, 40 mov qword ptr [rdi + 24], rsi mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rax mov rcx, qword ptr [rdi] mov rdx, qword ptr [rdi + 16] add rdx, rsi xor edi, edi cmp rdx, rcx cmovae rdi, rcx mov qword ptr [rsp + 24], 0 sub rdx, rdi mov rdi, rcx sub rdi, rdx lea r8, [rdx + rax] cmp rdi, rax cmovb r8, rcx sub rsi, rdx add rsi, r8 mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 32], 0 mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 advance: mov rcx, qword ptr [rdi + 24] mov rax, rcx sub rax, rsi jbe .LBB3_1 sub rsp, 40 mov qword ptr [rdi + 24], 0 mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rsi mov qword ptr [rsp + 16], 0 mov qword ptr [rsp + 24], rax mov qword ptr [rsp + 32], rsi test rsi, rsi je .LBB3_6 mov rax, qword ptr [rdi] mov rcx, qword ptr [rdi + 16] xor edx, edx cmp rcx, rax cmovae rdx, rax sub rcx, rdx mov rdx, rax sub rdx, rcx lea rdi, [rcx + rsi] cmp rdx, rsi cmovb rdi, rax sub rdi, rcx mov qword ptr [rsp + 16], rdi mov qword ptr [rsp + 32], 0 .LBB3_6: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret .LBB3_1: test rcx, rcx je .LBB3_3 mov qword ptr [rdi + 24], 0 .LBB3_3: mov qword ptr [rdi + 16], 0 ret remove: sub rsp, 40 cmp rdx, rsi jb .LBB4_5 mov rax, qword ptr [rdi + 24] mov rcx, rax sub rcx, rdx jb .LBB4_6 mov qword ptr [rdi + 24], rsi mov qword ptr [rsp], rdi sub rdx, rsi mov qword ptr [rsp + 8], rdx mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 24], rcx mov qword ptr [rsp + 32], rdx je .LBB4_4 mov rax, qword ptr [rdi] mov rcx, qword ptr [rdi + 16] add rcx, rsi xor edi, edi cmp rcx, rax cmovae rdi, rax sub rcx, rdi mov rdi, rax sub rdi, rcx lea r8, [rcx + rdx] cmp rdi, rdx cmovb r8, rax sub rsi, rcx add rsi, r8 mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 32], 0 .LBB4_4: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret .LBB4_5: lea rax, [rip + .L__unnamed_2] mov rdi, rsi mov rsi, rdx mov rdx, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] .LBB4_6: lea rcx, [rip + .L__unnamed_2] mov rdi, rdx mov rsi, rax mov rdx, rcx call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL] core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>: push rbp push r15 push r14 push r13 push r12 push rbx sub rsp, 24 mov rsi, qword ptr [rdi + 32] test rsi, rsi je .LBB0_2 mov rax, qword ptr [rdi + 16] add rsi, rax jb .LBB0_45 .LBB0_2: mov r13, qword ptr [rdi] mov rbp, qword ptr [rdi + 8] mov rbx, qword ptr [r13 + 24] lea r12, [rbx + rbp] mov r15, qword ptr [rdi + 24] lea rsi, [r15 + r12] test rbx, rbx je .LBB0_10 test r15, r15 je .LBB0_42 cmp rbx, r15 jbe .LBB0_12 mov r14, qword ptr [r13] mov rax, qword ptr [r13 + 16] add r12, rax xor ecx, ecx cmp r12, r14 mov rdx, r14 cmovb rdx, rcx sub r12, rdx add rbx, rax cmp rbx, r14 cmovae rcx, r14 sub rbx, rcx mov rcx, rbx sub rcx, r12 je .LBB0_42 mov rdi, qword ptr [r13 + 8] mov rax, rcx add rax, r14 cmovae rax, rcx mov r8, r14 sub r8, r12 mov rcx, r14 sub rcx, rbx mov rdx, r15 sub rdx, r8 mov qword ptr [rsp + 16], rsi jbe .LBB0_18 cmp rax, r15 jae .LBB0_24 mov rdx, r15 sub rdx, r8 shl rdx, 2 cmp r15, rcx jbe .LBB0_30 sub r8, rcx mov qword ptr [rsp], rdi mov rax, qword ptr [rsp] lea rdi, [rax + 4*r8] mov rsi, qword ptr [rsp] mov qword ptr [rsp + 8], rcx mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] sub r14, r15 mov rax, qword ptr [rsp] lea rsi, [rax + 4*r14] shl r15, 2 mov rdi, qword ptr [rsp] mov rdx, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, qword ptr [rsp] lea rsi, [rdi + 4*r12] lea rdi, [rdi + 4*rbx] mov r15, qword ptr [rsp + 8] jmp .LBB0_36 .LBB0_10: test r15, r15 je .LBB0_17 mov rax, qword ptr [r13] sub rsi, rbp add rbp, qword ptr [r13 + 16] xor ecx, ecx cmp rbp, rax cmovae rcx, rax sub rbp, rcx mov qword ptr [r13 + 16], rbp jmp .LBB0_43 .LBB0_12: mov rdx, qword ptr [r13 + 16] mov r15, qword ptr [r13] lea rax, [rdx + rbp] xor ecx, ecx cmp rax, r15 cmovae rcx, r15 mov r12, rax sub r12, rcx mov rcx, r12 sub rcx, rdx je .LBB0_41 mov rdi, qword ptr [r13 + 8] mov rax, rcx add rax, r15 cmovae rax, rcx mov r8, r15 sub r8, rdx mov rcx, r15 sub rcx, r12 mov r14, rbx sub r14, r8 mov qword ptr [rsp + 16], rsi jbe .LBB0_21 cmp rax, rbx jae .LBB0_26 mov qword ptr [rsp], rdx mov rdx, rbx sub rdx, r8 shl rdx, 2 cmp rbx, rcx jbe .LBB0_32 sub r8, rcx mov rbx, rdi lea rdi, [rdi + 4*r8] mov rsi, rbx mov qword ptr [rsp + 8], rcx mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] sub r15, r14 lea rsi, [rbx + 4*r15] shl r14, 2 mov rdi, rbx mov rdx, r14 call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx mov rax, qword ptr [rsp] lea rsi, [rbx + 4*rax] lea rdi, [rbx + 4*r12] mov rbx, qword ptr [rsp + 8] jmp .LBB0_40 .LBB0_17: xorps xmm0, xmm0 movups xmmword ptr [r13 + 16], xmm0 jmp .LBB0_44 .LBB0_18: mov r14, r15 sub r14, rcx jbe .LBB0_28 cmp rax, r15 jae .LBB0_33 lea rax, [rcx + r12] sub r15, rcx lea rsi, [rdi + 4*rax] shl r15, 2 mov r14, rdi mov rdx, r15 mov r15, rcx jmp .LBB0_31 .LBB0_21: mov r14, rbx sub r14, rcx jbe .LBB0_29 cmp rax, rbx jae .LBB0_34 lea rax, [rcx + rdx] sub rbx, rcx lea rsi, [rdi + 4*rax] shl rbx, 2 mov r14, rdi mov r15, rdx mov rdx, rbx mov rbx, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r14 lea rsi, [r14 + 4*r15] lea rdi, [r14 + 4*r12] jmp .LBB0_40 .LBB0_24: sub r15, rcx jbe .LBB0_35 sub rcx, r8 mov qword ptr [rsp + 8], rcx lea rsi, [rdi + 4*r12] mov r12, rdi lea rdi, [rdi + 4*rbx] lea rdx, [4*r8] mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] add r14, rbx lea rdi, [r12 + 4*r14] mov rbx, qword ptr [rsp + 8] lea rdx, [4*rbx] mov rsi, r12 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r12 lea rsi, [r12 + 4*rbx] jmp .LBB0_36 .LBB0_26: sub rbx, rcx jbe .LBB0_37 sub rcx, r8 lea rsi, [rdi + 4*rdx] mov r15, rdi lea rdi, [rdi + 4*r12] lea rdx, [4*r8] mov r14, rcx mov qword ptr [rsp], r8 call qword ptr [rip + memmove@GOTPCREL] add r12, qword ptr [rsp] lea rdi, [r15 + 4*r12] lea rdx, [4*r14] mov rsi, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 lea rsi, [r15 + 4*r14] jmp .LBB0_40 .LBB0_28: lea rsi, [rdi + 4*r12] lea rdi, [rdi + 4*rbx] jmp .LBB0_36 .LBB0_29: lea rsi, [rdi + 4*rdx] lea rdi, [rdi + 4*r12] jmp .LBB0_40 .LBB0_30: lea rax, [r8 + rbx] mov r14, rdi lea rdi, [rdi + 4*rax] mov rsi, r14 mov r15, r8 .LBB0_31: call qword ptr [rip + memmove@GOTPCREL] mov rdi, r14 lea rsi, [r14 + 4*r12] lea rdi, [r14 + 4*rbx] jmp .LBB0_36 .LBB0_32: lea rax, [r12 + r8] mov rbx, rdi lea rdi, [rdi + 4*rax] mov rsi, rbx mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx mov rax, qword ptr [rsp] lea rsi, [rbx + 4*rax] jmp .LBB0_38 .LBB0_33: lea rsi, [rdi + 4*r12] mov r15, rdi lea rdi, [rdi + 4*rbx] lea rdx, [4*rcx] mov rbx, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 add rbx, r12 lea rsi, [r15 + 4*rbx] mov r15, r14 jmp .LBB0_36 .LBB0_34: lea rsi, [rdi + 4*rdx] mov rbx, rdi lea rdi, [rdi + 4*r12] mov r15, rdx lea rdx, [4*rcx] mov r12, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx add r12, r15 lea rsi, [rbx + 4*r12] jmp .LBB0_39 .LBB0_35: lea rsi, [rdi + 4*r12] mov r14, rdi lea rdi, [rdi + 4*rbx] mov r12, rdx lea rdx, [4*r8] mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] add r15, rbx mov rsi, r14 lea rdi, [r14 + 4*r15] mov r15, r12 .LBB0_36: shl r15, 2 mov rdx, r15 call qword ptr [rip + memmove@GOTPCREL] mov rsi, qword ptr [rsp + 16] jmp .LBB0_42 .LBB0_37: lea rsi, [rdi + 4*rdx] mov rbx, rdi lea rdi, [rdi + 4*r12] lea rdx, [4*r8] mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] add r12, r15 mov rsi, rbx .LBB0_38: lea rdi, [rbx + 4*r12] .LBB0_39: mov rbx, r14 .LBB0_40: shl rbx, 2 mov rdx, rbx call qword ptr [rip + memmove@GOTPCREL] mov r15, qword ptr [r13] mov rax, qword ptr [r13 + 16] add rax, rbp mov rsi, qword ptr [rsp + 16] .LBB0_41: xor ecx, ecx cmp rax, r15 cmovae rcx, r15 sub rax, rcx mov qword ptr [r13 + 16], rax .LBB0_42: sub rsi, rbp .LBB0_43: mov qword ptr [r13 + 24], rsi .LBB0_44: add rsp, 24 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB0_45: lea rdx, [rip + .L__unnamed_1] mov rdi, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] ``` after: ```asm clear: movups xmmword ptr [rdi + 16], xmm0 ret truncate: cmp qword ptr [rdi + 24], rsi jbe .LBB2_4 test rsi, rsi jne .LBB2_3 mov qword ptr [rdi + 16], 0 .LBB2_3: mov qword ptr [rdi + 24], rsi .LBB2_4: ret advance: mov rcx, qword ptr [rdi + 24] mov rax, rcx sub rax, rsi jbe .LBB3_1 mov rcx, qword ptr [rdi] add rsi, qword ptr [rdi + 16] xor edx, edx cmp rsi, rcx cmovae rdx, rcx sub rsi, rdx mov qword ptr [rdi + 16], rsi mov qword ptr [rdi + 24], rax ret .LBB3_1: test rcx, rcx je .LBB3_3 mov qword ptr [rdi + 24], 0 .LBB3_3: mov qword ptr [rdi + 16], 0 ret remove: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov r15, rsi mov r14, rdx sub r14, rsi jb .LBB4_9 mov rbx, rdi mov r12, qword ptr [rdi + 24] mov r13, r12 sub r13, rdx jb .LBB4_10 mov qword ptr [rbx + 24], r15 mov rbp, r12 sub rbp, r14 test r15, r15 je .LBB4_4 cmp rbp, r15 jne .LBB4_11 .LBB4_4: cmp r12, r14 jne .LBB4_6 .LBB4_5: mov qword ptr [rbx + 16], 0 jmp .LBB4_8 .LBB4_11: mov rdi, rbx mov rsi, r14 mov rdx, r15 mov rcx, r13 call <<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data cmp r12, r14 je .LBB4_5 .LBB4_6: cmp r13, r15 jbe .LBB4_8 mov rax, qword ptr [rbx] add r14, qword ptr [rbx + 16] xor ecx, ecx cmp r14, rax cmovae rcx, rax sub r14, rcx mov qword ptr [rbx + 16], r14 .LBB4_8: mov qword ptr [rbx + 24], rbp add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB4_9: lea rax, [rip + .L__unnamed_1] mov rdi, r15 mov rsi, rdx mov rdx, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] .LBB4_10: lea rax, [rip + .L__unnamed_1] mov rdi, rdx mov rsi, r12 mov rdx, rax call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL] <<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov r14, rsi cmp rdx, rcx jae .LBB0_1 mov r12, qword ptr [rdi] mov rax, qword ptr [rdi + 16] add r14, rax xor ecx, ecx cmp r14, r12 cmovae rcx, r12 sub r14, rcx mov r15, rdx mov r13, r14 mov r14, rax mov rcx, r13 sub rcx, r14 je .LBB0_18 .LBB0_4: mov rdi, qword ptr [rdi + 8] mov rax, rcx add rax, r12 cmovae rax, rcx mov rbx, r12 sub rbx, r14 mov rcx, r12 sub rcx, r13 mov rbp, r15 sub rbp, rbx jbe .LBB0_5 cmp rax, r15 jae .LBB0_12 mov rdx, r15 sub rdx, rbx shl rdx, 2 cmp r15, rcx jbe .LBB0_16 sub rbx, rcx mov rbp, rdi lea rdi, [rdi + 4*rbx] mov r15, qword ptr [rip + memmove@GOTPCREL] mov rsi, rbp mov qword ptr [rsp], rcx call r15 sub r12, rbx lea rsi, [4*r12] add rsi, rbp shl rbx, 2 mov rdi, rbp mov rdx, rbx call r15 mov rdi, rbp lea rsi, [4*r14] add rsi, rbp lea rdi, [4*r13] add rdi, rbp mov r15, qword ptr [rsp] jmp .LBB0_7 .LBB0_1: mov r15, rcx add r14, rdx mov r12, qword ptr [rdi] mov r13, qword ptr [rdi + 16] add r14, r13 xor eax, eax cmp r14, r12 mov rcx, r12 cmovb rcx, rax sub r14, rcx add r13, rdx cmp r13, r12 cmovae rax, r12 sub r13, rax mov rcx, r13 sub rcx, r14 jne .LBB0_4 .LBB0_18: add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB0_5: mov rbx, r15 sub rbx, rcx jbe .LBB0_6 cmp rax, r15 jae .LBB0_9 lea rax, [rcx + r14] sub r15, rcx lea rsi, [rdi + 4*rax] shl r15, 2 mov rbx, rdi mov rdx, r15 mov r15, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx lea rsi, [rbx + 4*r14] lea rdi, [rbx + 4*r13] jmp .LBB0_7 .LBB0_12: sub r15, rcx jbe .LBB0_13 sub rcx, rbx lea rsi, [rdi + 4*r14] mov r12, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rbx] mov r14, qword ptr [rip + memmove@GOTPCREL] mov rbp, rcx call r14 add rbx, r13 lea rdi, [r12 + 4*rbx] lea rdx, [4*rbp] mov rsi, r12 call r14 mov rdi, r12 lea rsi, [r12 + 4*rbp] jmp .LBB0_7 .LBB0_6: lea rsi, [rdi + 4*r14] lea rdi, [rdi + 4*r13] jmp .LBB0_7 .LBB0_16: lea rax, [rbx + r13] mov r15, rdi lea rdi, [rdi + 4*rax] mov rsi, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 lea rsi, [r15 + 4*r14] lea rdi, [r15 + 4*r13] mov r15, rbx jmp .LBB0_7 .LBB0_9: lea rsi, [rdi + 4*r14] mov r15, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rcx] mov r12, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 add r12, r14 lea rsi, [r15 + 4*r12] mov r15, rbx jmp .LBB0_7 .LBB0_13: lea rsi, [rdi + 4*r14] mov r14, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rbx] call qword ptr [rip + memmove@GOTPCREL] add rbx, r13 mov rsi, r14 lea rdi, [r14 + 4*rbx] mov r15, rbp .LBB0_7: shl r15, 2 mov rdx, r15 add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp jmp qword ptr [rip + memmove@GOTPCREL] ``` </details>

…llaumeGomez Rollup of 13 pull requests Successful merges: - rust-lang#118264 (Optimize `VecDeque::drain` for (half-)open ranges) - rust-lang#120741 (Make `io::BorrowedCursor::advance` safe) - rust-lang#120777 (Bump Unicode to version 15.1.0, regenerate tables) - rust-lang#120971 (Fix comment in core/src/str/validations.rs) - rust-lang#121034 (Improve wording of `static_mut_ref`) - rust-lang#121095 (Add extra indent spaces for rust-playground link) - rust-lang#121109 (Add an ErrorGuaranteed to ast::TyKind::Err (attempt 2)) - rust-lang#121119 (Make `async Fn` trait kind errors better) - rust-lang#121141 (Fix closure kind docs) - rust-lang#121145 (Update aarch64 target feature docs to match LLVM) - rust-lang#121146 (Only point out non-diverging arms for match suggestions) - rust-lang#121147 (Avoid debug logging entire MIR body) - rust-lang#121155 (doc: add note about panicking examples for strict_overflow_ops) r? `@ghost` `@rustbot` modify labels: rollup

…8472 Optimize `VecDeque::drain` for (half-)open ranges The most common use cases of `VecDeque::drain` consume either the entire queue or elements from the front or back.[^1] This PR makes these operations faster by optimizing the generated code of the destructor of the drain: * `.drain(..)` is now the same as `.clear()`. * `.drain(n..)` is now (almost[^2]) the same as `.truncate(n)`. * `.drain(..n)` is now an efficient "advance" function. This operation is not provided by a dedicated function and optimizing it is my main motivation for this PR. Previously, all of these cases generated a function call to the destructor of the `DropGuard`, emitting a lot of unused machine code as well as unnecessary branches and loads/stores of stack variables. There are no algorithmic changes in this PR, but it simplifies the code enough to allow LLVM to recognize the special cases and optimize accordingly. Most notably, it allows elimination of the rather large [`wrap_copy`] function. Some [rudimentary microbenchmarks][benches] show a performance improvement of **~3x-4x** on my machine for the special cases and roughly equal performance for the general case. Best reviewed commit by commit. [^1]: source: GitHub code search: [full range `drain(..)` = 7.5k results][full], [from front `drain(..n)` = 3.2k results][front], [from back `drain(n..)` = 1.6k results][back], [from middle `drain(n..m)` = <500 results][middle] [^2]: `.drain(0..)` and `.clear()` reset the head to 0, but `.truncate(0)` does not. [full]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%280%3F%5C.%5C.%5C%29%2F+lang%3ARust [front]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%280%3F%5C.%5C.%5B%5E%29%5D.*%5C%29%2F+lang%3ARust [back]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%28%5B%5E0%5D.*%5C.%5C.%5C%29%2F+lang%3ARust [middle]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%28%5B%5E0%5D.*%5C.%5C.%5B%5E%29%5D.*%5C%29%2F+lang%3ARust [`wrap_copy`]: https://github.com/rust-lang/rust/blob/4fd68eb47bad1c121417ac4450b2f0456150db86/library/alloc/src/collections/vec_deque/mod.rs#L262-L391 [benches]: https://gist.github.com/lukas-code/c97bd707d074c4cc31f241edbc7fd2a2 <details> <summary>generated assembly</summary> before: ```asm clear: sub rsp, 40 mov rax, qword ptr [rdi + 24] mov qword ptr [rdi + 24], 0 mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rax xorps xmm0, xmm0 movups xmmword ptr [rsp + 16], xmm0 mov qword ptr [rsp + 32], rax test rax, rax je .LBB1_2 mov rcx, qword ptr [rdi] mov rdx, qword ptr [rdi + 16] xor esi, esi cmp rdx, rcx cmovae rsi, rcx sub rdx, rsi mov rsi, rcx sub rsi, rdx lea rdi, [rdx + rax] cmp rsi, rax cmovb rdi, rcx sub rdi, rdx mov qword ptr [rsp + 16], rdi mov qword ptr [rsp + 32], 0 .LBB1_2: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret truncate: mov rax, qword ptr [rdi + 24] sub rax, rsi jbe .LBB2_2 sub rsp, 40 mov qword ptr [rdi + 24], rsi mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rax mov rcx, qword ptr [rdi] mov rdx, qword ptr [rdi + 16] add rdx, rsi xor edi, edi cmp rdx, rcx cmovae rdi, rcx mov qword ptr [rsp + 24], 0 sub rdx, rdi mov rdi, rcx sub rdi, rdx lea r8, [rdx + rax] cmp rdi, rax cmovb r8, rcx sub rsi, rdx add rsi, r8 mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 32], 0 mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 advance: mov rcx, qword ptr [rdi + 24] mov rax, rcx sub rax, rsi jbe .LBB3_1 sub rsp, 40 mov qword ptr [rdi + 24], 0 mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rsi mov qword ptr [rsp + 16], 0 mov qword ptr [rsp + 24], rax mov qword ptr [rsp + 32], rsi test rsi, rsi je .LBB3_6 mov rax, qword ptr [rdi] mov rcx, qword ptr [rdi + 16] xor edx, edx cmp rcx, rax cmovae rdx, rax sub rcx, rdx mov rdx, rax sub rdx, rcx lea rdi, [rcx + rsi] cmp rdx, rsi cmovb rdi, rax sub rdi, rcx mov qword ptr [rsp + 16], rdi mov qword ptr [rsp + 32], 0 .LBB3_6: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret .LBB3_1: test rcx, rcx je .LBB3_3 mov qword ptr [rdi + 24], 0 .LBB3_3: mov qword ptr [rdi + 16], 0 ret remove: sub rsp, 40 cmp rdx, rsi jb .LBB4_5 mov rax, qword ptr [rdi + 24] mov rcx, rax sub rcx, rdx jb .LBB4_6 mov qword ptr [rdi + 24], rsi mov qword ptr [rsp], rdi sub rdx, rsi mov qword ptr [rsp + 8], rdx mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 24], rcx mov qword ptr [rsp + 32], rdx je .LBB4_4 mov rax, qword ptr [rdi] mov rcx, qword ptr [rdi + 16] add rcx, rsi xor edi, edi cmp rcx, rax cmovae rdi, rax sub rcx, rdi mov rdi, rax sub rdi, rcx lea r8, [rcx + rdx] cmp rdi, rdx cmovb r8, rax sub rsi, rcx add rsi, r8 mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 32], 0 .LBB4_4: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret .LBB4_5: lea rax, [rip + .L__unnamed_2] mov rdi, rsi mov rsi, rdx mov rdx, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] .LBB4_6: lea rcx, [rip + .L__unnamed_2] mov rdi, rdx mov rsi, rax mov rdx, rcx call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL] core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>: push rbp push r15 push r14 push r13 push r12 push rbx sub rsp, 24 mov rsi, qword ptr [rdi + 32] test rsi, rsi je .LBB0_2 mov rax, qword ptr [rdi + 16] add rsi, rax jb .LBB0_45 .LBB0_2: mov r13, qword ptr [rdi] mov rbp, qword ptr [rdi + 8] mov rbx, qword ptr [r13 + 24] lea r12, [rbx + rbp] mov r15, qword ptr [rdi + 24] lea rsi, [r15 + r12] test rbx, rbx je .LBB0_10 test r15, r15 je .LBB0_42 cmp rbx, r15 jbe .LBB0_12 mov r14, qword ptr [r13] mov rax, qword ptr [r13 + 16] add r12, rax xor ecx, ecx cmp r12, r14 mov rdx, r14 cmovb rdx, rcx sub r12, rdx add rbx, rax cmp rbx, r14 cmovae rcx, r14 sub rbx, rcx mov rcx, rbx sub rcx, r12 je .LBB0_42 mov rdi, qword ptr [r13 + 8] mov rax, rcx add rax, r14 cmovae rax, rcx mov r8, r14 sub r8, r12 mov rcx, r14 sub rcx, rbx mov rdx, r15 sub rdx, r8 mov qword ptr [rsp + 16], rsi jbe .LBB0_18 cmp rax, r15 jae .LBB0_24 mov rdx, r15 sub rdx, r8 shl rdx, 2 cmp r15, rcx jbe .LBB0_30 sub r8, rcx mov qword ptr [rsp], rdi mov rax, qword ptr [rsp] lea rdi, [rax + 4*r8] mov rsi, qword ptr [rsp] mov qword ptr [rsp + 8], rcx mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] sub r14, r15 mov rax, qword ptr [rsp] lea rsi, [rax + 4*r14] shl r15, 2 mov rdi, qword ptr [rsp] mov rdx, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, qword ptr [rsp] lea rsi, [rdi + 4*r12] lea rdi, [rdi + 4*rbx] mov r15, qword ptr [rsp + 8] jmp .LBB0_36 .LBB0_10: test r15, r15 je .LBB0_17 mov rax, qword ptr [r13] sub rsi, rbp add rbp, qword ptr [r13 + 16] xor ecx, ecx cmp rbp, rax cmovae rcx, rax sub rbp, rcx mov qword ptr [r13 + 16], rbp jmp .LBB0_43 .LBB0_12: mov rdx, qword ptr [r13 + 16] mov r15, qword ptr [r13] lea rax, [rdx + rbp] xor ecx, ecx cmp rax, r15 cmovae rcx, r15 mov r12, rax sub r12, rcx mov rcx, r12 sub rcx, rdx je .LBB0_41 mov rdi, qword ptr [r13 + 8] mov rax, rcx add rax, r15 cmovae rax, rcx mov r8, r15 sub r8, rdx mov rcx, r15 sub rcx, r12 mov r14, rbx sub r14, r8 mov qword ptr [rsp + 16], rsi jbe .LBB0_21 cmp rax, rbx jae .LBB0_26 mov qword ptr [rsp], rdx mov rdx, rbx sub rdx, r8 shl rdx, 2 cmp rbx, rcx jbe .LBB0_32 sub r8, rcx mov rbx, rdi lea rdi, [rdi + 4*r8] mov rsi, rbx mov qword ptr [rsp + 8], rcx mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] sub r15, r14 lea rsi, [rbx + 4*r15] shl r14, 2 mov rdi, rbx mov rdx, r14 call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx mov rax, qword ptr [rsp] lea rsi, [rbx + 4*rax] lea rdi, [rbx + 4*r12] mov rbx, qword ptr [rsp + 8] jmp .LBB0_40 .LBB0_17: xorps xmm0, xmm0 movups xmmword ptr [r13 + 16], xmm0 jmp .LBB0_44 .LBB0_18: mov r14, r15 sub r14, rcx jbe .LBB0_28 cmp rax, r15 jae .LBB0_33 lea rax, [rcx + r12] sub r15, rcx lea rsi, [rdi + 4*rax] shl r15, 2 mov r14, rdi mov rdx, r15 mov r15, rcx jmp .LBB0_31 .LBB0_21: mov r14, rbx sub r14, rcx jbe .LBB0_29 cmp rax, rbx jae .LBB0_34 lea rax, [rcx + rdx] sub rbx, rcx lea rsi, [rdi + 4*rax] shl rbx, 2 mov r14, rdi mov r15, rdx mov rdx, rbx mov rbx, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r14 lea rsi, [r14 + 4*r15] lea rdi, [r14 + 4*r12] jmp .LBB0_40 .LBB0_24: sub r15, rcx jbe .LBB0_35 sub rcx, r8 mov qword ptr [rsp + 8], rcx lea rsi, [rdi + 4*r12] mov r12, rdi lea rdi, [rdi + 4*rbx] lea rdx, [4*r8] mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] add r14, rbx lea rdi, [r12 + 4*r14] mov rbx, qword ptr [rsp + 8] lea rdx, [4*rbx] mov rsi, r12 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r12 lea rsi, [r12 + 4*rbx] jmp .LBB0_36 .LBB0_26: sub rbx, rcx jbe .LBB0_37 sub rcx, r8 lea rsi, [rdi + 4*rdx] mov r15, rdi lea rdi, [rdi + 4*r12] lea rdx, [4*r8] mov r14, rcx mov qword ptr [rsp], r8 call qword ptr [rip + memmove@GOTPCREL] add r12, qword ptr [rsp] lea rdi, [r15 + 4*r12] lea rdx, [4*r14] mov rsi, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 lea rsi, [r15 + 4*r14] jmp .LBB0_40 .LBB0_28: lea rsi, [rdi + 4*r12] lea rdi, [rdi + 4*rbx] jmp .LBB0_36 .LBB0_29: lea rsi, [rdi + 4*rdx] lea rdi, [rdi + 4*r12] jmp .LBB0_40 .LBB0_30: lea rax, [r8 + rbx] mov r14, rdi lea rdi, [rdi + 4*rax] mov rsi, r14 mov r15, r8 .LBB0_31: call qword ptr [rip + memmove@GOTPCREL] mov rdi, r14 lea rsi, [r14 + 4*r12] lea rdi, [r14 + 4*rbx] jmp .LBB0_36 .LBB0_32: lea rax, [r12 + r8] mov rbx, rdi lea rdi, [rdi + 4*rax] mov rsi, rbx mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx mov rax, qword ptr [rsp] lea rsi, [rbx + 4*rax] jmp .LBB0_38 .LBB0_33: lea rsi, [rdi + 4*r12] mov r15, rdi lea rdi, [rdi + 4*rbx] lea rdx, [4*rcx] mov rbx, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 add rbx, r12 lea rsi, [r15 + 4*rbx] mov r15, r14 jmp .LBB0_36 .LBB0_34: lea rsi, [rdi + 4*rdx] mov rbx, rdi lea rdi, [rdi + 4*r12] mov r15, rdx lea rdx, [4*rcx] mov r12, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx add r12, r15 lea rsi, [rbx + 4*r12] jmp .LBB0_39 .LBB0_35: lea rsi, [rdi + 4*r12] mov r14, rdi lea rdi, [rdi + 4*rbx] mov r12, rdx lea rdx, [4*r8] mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] add r15, rbx mov rsi, r14 lea rdi, [r14 + 4*r15] mov r15, r12 .LBB0_36: shl r15, 2 mov rdx, r15 call qword ptr [rip + memmove@GOTPCREL] mov rsi, qword ptr [rsp + 16] jmp .LBB0_42 .LBB0_37: lea rsi, [rdi + 4*rdx] mov rbx, rdi lea rdi, [rdi + 4*r12] lea rdx, [4*r8] mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] add r12, r15 mov rsi, rbx .LBB0_38: lea rdi, [rbx + 4*r12] .LBB0_39: mov rbx, r14 .LBB0_40: shl rbx, 2 mov rdx, rbx call qword ptr [rip + memmove@GOTPCREL] mov r15, qword ptr [r13] mov rax, qword ptr [r13 + 16] add rax, rbp mov rsi, qword ptr [rsp + 16] .LBB0_41: xor ecx, ecx cmp rax, r15 cmovae rcx, r15 sub rax, rcx mov qword ptr [r13 + 16], rax .LBB0_42: sub rsi, rbp .LBB0_43: mov qword ptr [r13 + 24], rsi .LBB0_44: add rsp, 24 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB0_45: lea rdx, [rip + .L__unnamed_1] mov rdi, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] ``` after: ```asm clear: movups xmmword ptr [rdi + 16], xmm0 ret truncate: cmp qword ptr [rdi + 24], rsi jbe .LBB2_4 test rsi, rsi jne .LBB2_3 mov qword ptr [rdi + 16], 0 .LBB2_3: mov qword ptr [rdi + 24], rsi .LBB2_4: ret advance: mov rcx, qword ptr [rdi + 24] mov rax, rcx sub rax, rsi jbe .LBB3_1 mov rcx, qword ptr [rdi] add rsi, qword ptr [rdi + 16] xor edx, edx cmp rsi, rcx cmovae rdx, rcx sub rsi, rdx mov qword ptr [rdi + 16], rsi mov qword ptr [rdi + 24], rax ret .LBB3_1: test rcx, rcx je .LBB3_3 mov qword ptr [rdi + 24], 0 .LBB3_3: mov qword ptr [rdi + 16], 0 ret remove: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov r15, rsi mov r14, rdx sub r14, rsi jb .LBB4_9 mov rbx, rdi mov r12, qword ptr [rdi + 24] mov r13, r12 sub r13, rdx jb .LBB4_10 mov qword ptr [rbx + 24], r15 mov rbp, r12 sub rbp, r14 test r15, r15 je .LBB4_4 cmp rbp, r15 jne .LBB4_11 .LBB4_4: cmp r12, r14 jne .LBB4_6 .LBB4_5: mov qword ptr [rbx + 16], 0 jmp .LBB4_8 .LBB4_11: mov rdi, rbx mov rsi, r14 mov rdx, r15 mov rcx, r13 call <<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data cmp r12, r14 je .LBB4_5 .LBB4_6: cmp r13, r15 jbe .LBB4_8 mov rax, qword ptr [rbx] add r14, qword ptr [rbx + 16] xor ecx, ecx cmp r14, rax cmovae rcx, rax sub r14, rcx mov qword ptr [rbx + 16], r14 .LBB4_8: mov qword ptr [rbx + 24], rbp add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB4_9: lea rax, [rip + .L__unnamed_1] mov rdi, r15 mov rsi, rdx mov rdx, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] .LBB4_10: lea rax, [rip + .L__unnamed_1] mov rdi, rdx mov rsi, r12 mov rdx, rax call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL] <<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov r14, rsi cmp rdx, rcx jae .LBB0_1 mov r12, qword ptr [rdi] mov rax, qword ptr [rdi + 16] add r14, rax xor ecx, ecx cmp r14, r12 cmovae rcx, r12 sub r14, rcx mov r15, rdx mov r13, r14 mov r14, rax mov rcx, r13 sub rcx, r14 je .LBB0_18 .LBB0_4: mov rdi, qword ptr [rdi + 8] mov rax, rcx add rax, r12 cmovae rax, rcx mov rbx, r12 sub rbx, r14 mov rcx, r12 sub rcx, r13 mov rbp, r15 sub rbp, rbx jbe .LBB0_5 cmp rax, r15 jae .LBB0_12 mov rdx, r15 sub rdx, rbx shl rdx, 2 cmp r15, rcx jbe .LBB0_16 sub rbx, rcx mov rbp, rdi lea rdi, [rdi + 4*rbx] mov r15, qword ptr [rip + memmove@GOTPCREL] mov rsi, rbp mov qword ptr [rsp], rcx call r15 sub r12, rbx lea rsi, [4*r12] add rsi, rbp shl rbx, 2 mov rdi, rbp mov rdx, rbx call r15 mov rdi, rbp lea rsi, [4*r14] add rsi, rbp lea rdi, [4*r13] add rdi, rbp mov r15, qword ptr [rsp] jmp .LBB0_7 .LBB0_1: mov r15, rcx add r14, rdx mov r12, qword ptr [rdi] mov r13, qword ptr [rdi + 16] add r14, r13 xor eax, eax cmp r14, r12 mov rcx, r12 cmovb rcx, rax sub r14, rcx add r13, rdx cmp r13, r12 cmovae rax, r12 sub r13, rax mov rcx, r13 sub rcx, r14 jne .LBB0_4 .LBB0_18: add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB0_5: mov rbx, r15 sub rbx, rcx jbe .LBB0_6 cmp rax, r15 jae .LBB0_9 lea rax, [rcx + r14] sub r15, rcx lea rsi, [rdi + 4*rax] shl r15, 2 mov rbx, rdi mov rdx, r15 mov r15, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx lea rsi, [rbx + 4*r14] lea rdi, [rbx + 4*r13] jmp .LBB0_7 .LBB0_12: sub r15, rcx jbe .LBB0_13 sub rcx, rbx lea rsi, [rdi + 4*r14] mov r12, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rbx] mov r14, qword ptr [rip + memmove@GOTPCREL] mov rbp, rcx call r14 add rbx, r13 lea rdi, [r12 + 4*rbx] lea rdx, [4*rbp] mov rsi, r12 call r14 mov rdi, r12 lea rsi, [r12 + 4*rbp] jmp .LBB0_7 .LBB0_6: lea rsi, [rdi + 4*r14] lea rdi, [rdi + 4*r13] jmp .LBB0_7 .LBB0_16: lea rax, [rbx + r13] mov r15, rdi lea rdi, [rdi + 4*rax] mov rsi, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 lea rsi, [r15 + 4*r14] lea rdi, [r15 + 4*r13] mov r15, rbx jmp .LBB0_7 .LBB0_9: lea rsi, [rdi + 4*r14] mov r15, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rcx] mov r12, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 add r12, r14 lea rsi, [r15 + 4*r12] mov r15, rbx jmp .LBB0_7 .LBB0_13: lea rsi, [rdi + 4*r14] mov r14, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rbx] call qword ptr [rip + memmove@GOTPCREL] add rbx, r13 mov rsi, r14 lea rdi, [r14 + 4*rbx] mov r15, rbp .LBB0_7: shl r15, 2 mov rdx, r15 add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp jmp qword ptr [rip + memmove@GOTPCREL] ``` </details>

bors · 2024-02-16T07:20:45Z

⌛ Testing commit da1c7f6 with merge 45665a8...

bors · 2024-02-16T07:45:24Z

💔 Test failed - checks-actions

lukas-code · 2024-02-16T12:59:18Z

I fixed the codegen test for 32-bit.

@rustbot label -A-testsuite -T-infra

the8472 · 2024-02-17T11:04:36Z

@bors r+

bors · 2024-02-17T11:04:39Z

📌 Commit 5d98977 has been approved by the8472

It is now in the queue for this repository.

…he8472 Optimize `VecDeque::drain` for (half-)open ranges The most common use cases of `VecDeque::drain` consume either the entire queue or elements from the front or back.[^1] This PR makes these operations faster by optimizing the generated code of the destructor of the drain: * `.drain(..)` is now the same as `.clear()`. * `.drain(n..)` is now (almost[^2]) the same as `.truncate(n)`. * `.drain(..n)` is now an efficient "advance" function. This operation is not provided by a dedicated function and optimizing it is my main motivation for this PR. Previously, all of these cases generated a function call to the destructor of the `DropGuard`, emitting a lot of unused machine code as well as unnecessary branches and loads/stores of stack variables. There are no algorithmic changes in this PR, but it simplifies the code enough to allow LLVM to recognize the special cases and optimize accordingly. Most notably, it allows elimination of the rather large [`wrap_copy`] function. Some [rudimentary microbenchmarks][benches] show a performance improvement of **~3x-4x** on my machine for the special cases and roughly equal performance for the general case. Best reviewed commit by commit. [^1]: source: GitHub code search: [full range `drain(..)` = 7.5k results][full], [from front `drain(..n)` = 3.2k results][front], [from back `drain(n..)` = 1.6k results][back], [from middle `drain(n..m)` = <500 results][middle] [^2]: `.drain(0..)` and `.clear()` reset the head to 0, but `.truncate(0)` does not. [full]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%280%3F%5C.%5C.%5C%29%2F+lang%3ARust [front]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%280%3F%5C.%5C.%5B%5E%29%5D.*%5C%29%2F+lang%3ARust [back]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%28%5B%5E0%5D.*%5C.%5C.%5C%29%2F+lang%3ARust [middle]: https://github.com/search?type=code&q=%2FVecDeque%28.%7C%5Cn%29%2B%5C.drain%5C%28%5B%5E0%5D.*%5C.%5C.%5B%5E%29%5D.*%5C%29%2F+lang%3ARust [`wrap_copy`]: https://github.com/rust-lang/rust/blob/4fd68eb47bad1c121417ac4450b2f0456150db86/library/alloc/src/collections/vec_deque/mod.rs#L262-L391 [benches]: https://gist.github.com/lukas-code/c97bd707d074c4cc31f241edbc7fd2a2 <details> <summary>generated assembly</summary> before: ```asm clear: sub rsp, 40 mov rax, qword ptr [rdi + 24] mov qword ptr [rdi + 24], 0 mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rax xorps xmm0, xmm0 movups xmmword ptr [rsp + 16], xmm0 mov qword ptr [rsp + 32], rax test rax, rax je .LBB1_2 mov rcx, qword ptr [rdi] mov rdx, qword ptr [rdi + 16] xor esi, esi cmp rdx, rcx cmovae rsi, rcx sub rdx, rsi mov rsi, rcx sub rsi, rdx lea rdi, [rdx + rax] cmp rsi, rax cmovb rdi, rcx sub rdi, rdx mov qword ptr [rsp + 16], rdi mov qword ptr [rsp + 32], 0 .LBB1_2: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret truncate: mov rax, qword ptr [rdi + 24] sub rax, rsi jbe .LBB2_2 sub rsp, 40 mov qword ptr [rdi + 24], rsi mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rax mov rcx, qword ptr [rdi] mov rdx, qword ptr [rdi + 16] add rdx, rsi xor edi, edi cmp rdx, rcx cmovae rdi, rcx mov qword ptr [rsp + 24], 0 sub rdx, rdi mov rdi, rcx sub rdi, rdx lea r8, [rdx + rax] cmp rdi, rax cmovb r8, rcx sub rsi, rdx add rsi, r8 mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 32], 0 mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 advance: mov rcx, qword ptr [rdi + 24] mov rax, rcx sub rax, rsi jbe .LBB3_1 sub rsp, 40 mov qword ptr [rdi + 24], 0 mov qword ptr [rsp], rdi mov qword ptr [rsp + 8], rsi mov qword ptr [rsp + 16], 0 mov qword ptr [rsp + 24], rax mov qword ptr [rsp + 32], rsi test rsi, rsi je .LBB3_6 mov rax, qword ptr [rdi] mov rcx, qword ptr [rdi + 16] xor edx, edx cmp rcx, rax cmovae rdx, rax sub rcx, rdx mov rdx, rax sub rdx, rcx lea rdi, [rcx + rsi] cmp rdx, rsi cmovb rdi, rax sub rdi, rcx mov qword ptr [rsp + 16], rdi mov qword ptr [rsp + 32], 0 .LBB3_6: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret .LBB3_1: test rcx, rcx je .LBB3_3 mov qword ptr [rdi + 24], 0 .LBB3_3: mov qword ptr [rdi + 16], 0 ret remove: sub rsp, 40 cmp rdx, rsi jb .LBB4_5 mov rax, qword ptr [rdi + 24] mov rcx, rax sub rcx, rdx jb .LBB4_6 mov qword ptr [rdi + 24], rsi mov qword ptr [rsp], rdi sub rdx, rsi mov qword ptr [rsp + 8], rdx mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 24], rcx mov qword ptr [rsp + 32], rdx je .LBB4_4 mov rax, qword ptr [rdi] mov rcx, qword ptr [rdi + 16] add rcx, rsi xor edi, edi cmp rcx, rax cmovae rdi, rax sub rcx, rdi mov rdi, rax sub rdi, rcx lea r8, [rcx + rdx] cmp rdi, rdx cmovb r8, rax sub rsi, rcx add rsi, r8 mov qword ptr [rsp + 16], rsi mov qword ptr [rsp + 32], 0 .LBB4_4: mov rdi, rsp call core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>> add rsp, 40 ret .LBB4_5: lea rax, [rip + .L__unnamed_2] mov rdi, rsi mov rsi, rdx mov rdx, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] .LBB4_6: lea rcx, [rip + .L__unnamed_2] mov rdi, rdx mov rsi, rax mov rdx, rcx call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL] core::ptr::drop_in_place<<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<i32,alloc::alloc::Global>>: push rbp push r15 push r14 push r13 push r12 push rbx sub rsp, 24 mov rsi, qword ptr [rdi + 32] test rsi, rsi je .LBB0_2 mov rax, qword ptr [rdi + 16] add rsi, rax jb .LBB0_45 .LBB0_2: mov r13, qword ptr [rdi] mov rbp, qword ptr [rdi + 8] mov rbx, qword ptr [r13 + 24] lea r12, [rbx + rbp] mov r15, qword ptr [rdi + 24] lea rsi, [r15 + r12] test rbx, rbx je .LBB0_10 test r15, r15 je .LBB0_42 cmp rbx, r15 jbe .LBB0_12 mov r14, qword ptr [r13] mov rax, qword ptr [r13 + 16] add r12, rax xor ecx, ecx cmp r12, r14 mov rdx, r14 cmovb rdx, rcx sub r12, rdx add rbx, rax cmp rbx, r14 cmovae rcx, r14 sub rbx, rcx mov rcx, rbx sub rcx, r12 je .LBB0_42 mov rdi, qword ptr [r13 + 8] mov rax, rcx add rax, r14 cmovae rax, rcx mov r8, r14 sub r8, r12 mov rcx, r14 sub rcx, rbx mov rdx, r15 sub rdx, r8 mov qword ptr [rsp + 16], rsi jbe .LBB0_18 cmp rax, r15 jae .LBB0_24 mov rdx, r15 sub rdx, r8 shl rdx, 2 cmp r15, rcx jbe .LBB0_30 sub r8, rcx mov qword ptr [rsp], rdi mov rax, qword ptr [rsp] lea rdi, [rax + 4*r8] mov rsi, qword ptr [rsp] mov qword ptr [rsp + 8], rcx mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] sub r14, r15 mov rax, qword ptr [rsp] lea rsi, [rax + 4*r14] shl r15, 2 mov rdi, qword ptr [rsp] mov rdx, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, qword ptr [rsp] lea rsi, [rdi + 4*r12] lea rdi, [rdi + 4*rbx] mov r15, qword ptr [rsp + 8] jmp .LBB0_36 .LBB0_10: test r15, r15 je .LBB0_17 mov rax, qword ptr [r13] sub rsi, rbp add rbp, qword ptr [r13 + 16] xor ecx, ecx cmp rbp, rax cmovae rcx, rax sub rbp, rcx mov qword ptr [r13 + 16], rbp jmp .LBB0_43 .LBB0_12: mov rdx, qword ptr [r13 + 16] mov r15, qword ptr [r13] lea rax, [rdx + rbp] xor ecx, ecx cmp rax, r15 cmovae rcx, r15 mov r12, rax sub r12, rcx mov rcx, r12 sub rcx, rdx je .LBB0_41 mov rdi, qword ptr [r13 + 8] mov rax, rcx add rax, r15 cmovae rax, rcx mov r8, r15 sub r8, rdx mov rcx, r15 sub rcx, r12 mov r14, rbx sub r14, r8 mov qword ptr [rsp + 16], rsi jbe .LBB0_21 cmp rax, rbx jae .LBB0_26 mov qword ptr [rsp], rdx mov rdx, rbx sub rdx, r8 shl rdx, 2 cmp rbx, rcx jbe .LBB0_32 sub r8, rcx mov rbx, rdi lea rdi, [rdi + 4*r8] mov rsi, rbx mov qword ptr [rsp + 8], rcx mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] sub r15, r14 lea rsi, [rbx + 4*r15] shl r14, 2 mov rdi, rbx mov rdx, r14 call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx mov rax, qword ptr [rsp] lea rsi, [rbx + 4*rax] lea rdi, [rbx + 4*r12] mov rbx, qword ptr [rsp + 8] jmp .LBB0_40 .LBB0_17: xorps xmm0, xmm0 movups xmmword ptr [r13 + 16], xmm0 jmp .LBB0_44 .LBB0_18: mov r14, r15 sub r14, rcx jbe .LBB0_28 cmp rax, r15 jae .LBB0_33 lea rax, [rcx + r12] sub r15, rcx lea rsi, [rdi + 4*rax] shl r15, 2 mov r14, rdi mov rdx, r15 mov r15, rcx jmp .LBB0_31 .LBB0_21: mov r14, rbx sub r14, rcx jbe .LBB0_29 cmp rax, rbx jae .LBB0_34 lea rax, [rcx + rdx] sub rbx, rcx lea rsi, [rdi + 4*rax] shl rbx, 2 mov r14, rdi mov r15, rdx mov rdx, rbx mov rbx, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r14 lea rsi, [r14 + 4*r15] lea rdi, [r14 + 4*r12] jmp .LBB0_40 .LBB0_24: sub r15, rcx jbe .LBB0_35 sub rcx, r8 mov qword ptr [rsp + 8], rcx lea rsi, [rdi + 4*r12] mov r12, rdi lea rdi, [rdi + 4*rbx] lea rdx, [4*r8] mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] add r14, rbx lea rdi, [r12 + 4*r14] mov rbx, qword ptr [rsp + 8] lea rdx, [4*rbx] mov rsi, r12 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r12 lea rsi, [r12 + 4*rbx] jmp .LBB0_36 .LBB0_26: sub rbx, rcx jbe .LBB0_37 sub rcx, r8 lea rsi, [rdi + 4*rdx] mov r15, rdi lea rdi, [rdi + 4*r12] lea rdx, [4*r8] mov r14, rcx mov qword ptr [rsp], r8 call qword ptr [rip + memmove@GOTPCREL] add r12, qword ptr [rsp] lea rdi, [r15 + 4*r12] lea rdx, [4*r14] mov rsi, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 lea rsi, [r15 + 4*r14] jmp .LBB0_40 .LBB0_28: lea rsi, [rdi + 4*r12] lea rdi, [rdi + 4*rbx] jmp .LBB0_36 .LBB0_29: lea rsi, [rdi + 4*rdx] lea rdi, [rdi + 4*r12] jmp .LBB0_40 .LBB0_30: lea rax, [r8 + rbx] mov r14, rdi lea rdi, [rdi + 4*rax] mov rsi, r14 mov r15, r8 .LBB0_31: call qword ptr [rip + memmove@GOTPCREL] mov rdi, r14 lea rsi, [r14 + 4*r12] lea rdi, [r14 + 4*rbx] jmp .LBB0_36 .LBB0_32: lea rax, [r12 + r8] mov rbx, rdi lea rdi, [rdi + 4*rax] mov rsi, rbx mov r14, r8 call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx mov rax, qword ptr [rsp] lea rsi, [rbx + 4*rax] jmp .LBB0_38 .LBB0_33: lea rsi, [rdi + 4*r12] mov r15, rdi lea rdi, [rdi + 4*rbx] lea rdx, [4*rcx] mov rbx, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 add rbx, r12 lea rsi, [r15 + 4*rbx] mov r15, r14 jmp .LBB0_36 .LBB0_34: lea rsi, [rdi + 4*rdx] mov rbx, rdi lea rdi, [rdi + 4*r12] mov r15, rdx lea rdx, [4*rcx] mov r12, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx add r12, r15 lea rsi, [rbx + 4*r12] jmp .LBB0_39 .LBB0_35: lea rsi, [rdi + 4*r12] mov r14, rdi lea rdi, [rdi + 4*rbx] mov r12, rdx lea rdx, [4*r8] mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] add r15, rbx mov rsi, r14 lea rdi, [r14 + 4*r15] mov r15, r12 .LBB0_36: shl r15, 2 mov rdx, r15 call qword ptr [rip + memmove@GOTPCREL] mov rsi, qword ptr [rsp + 16] jmp .LBB0_42 .LBB0_37: lea rsi, [rdi + 4*rdx] mov rbx, rdi lea rdi, [rdi + 4*r12] lea rdx, [4*r8] mov r15, r8 call qword ptr [rip + memmove@GOTPCREL] add r12, r15 mov rsi, rbx .LBB0_38: lea rdi, [rbx + 4*r12] .LBB0_39: mov rbx, r14 .LBB0_40: shl rbx, 2 mov rdx, rbx call qword ptr [rip + memmove@GOTPCREL] mov r15, qword ptr [r13] mov rax, qword ptr [r13 + 16] add rax, rbp mov rsi, qword ptr [rsp + 16] .LBB0_41: xor ecx, ecx cmp rax, r15 cmovae rcx, r15 sub rax, rcx mov qword ptr [r13 + 16], rax .LBB0_42: sub rsi, rbp .LBB0_43: mov qword ptr [r13 + 24], rsi .LBB0_44: add rsp, 24 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB0_45: lea rdx, [rip + .L__unnamed_1] mov rdi, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] ``` after: ```asm clear: movups xmmword ptr [rdi + 16], xmm0 ret truncate: cmp qword ptr [rdi + 24], rsi jbe .LBB2_4 test rsi, rsi jne .LBB2_3 mov qword ptr [rdi + 16], 0 .LBB2_3: mov qword ptr [rdi + 24], rsi .LBB2_4: ret advance: mov rcx, qword ptr [rdi + 24] mov rax, rcx sub rax, rsi jbe .LBB3_1 mov rcx, qword ptr [rdi] add rsi, qword ptr [rdi + 16] xor edx, edx cmp rsi, rcx cmovae rdx, rcx sub rsi, rdx mov qword ptr [rdi + 16], rsi mov qword ptr [rdi + 24], rax ret .LBB3_1: test rcx, rcx je .LBB3_3 mov qword ptr [rdi + 24], 0 .LBB3_3: mov qword ptr [rdi + 16], 0 ret remove: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov r15, rsi mov r14, rdx sub r14, rsi jb .LBB4_9 mov rbx, rdi mov r12, qword ptr [rdi + 24] mov r13, r12 sub r13, rdx jb .LBB4_10 mov qword ptr [rbx + 24], r15 mov rbp, r12 sub rbp, r14 test r15, r15 je .LBB4_4 cmp rbp, r15 jne .LBB4_11 .LBB4_4: cmp r12, r14 jne .LBB4_6 .LBB4_5: mov qword ptr [rbx + 16], 0 jmp .LBB4_8 .LBB4_11: mov rdi, rbx mov rsi, r14 mov rdx, r15 mov rcx, r13 call <<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data cmp r12, r14 je .LBB4_5 .LBB4_6: cmp r13, r15 jbe .LBB4_8 mov rax, qword ptr [rbx] add r14, qword ptr [rbx + 16] xor ecx, ecx cmp r14, rax cmovae rcx, rax sub r14, rcx mov qword ptr [rbx + 16], r14 .LBB4_8: mov qword ptr [rbx + 24], rbp add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB4_9: lea rax, [rip + .L__unnamed_1] mov rdi, r15 mov rsi, rdx mov rdx, rax call qword ptr [rip + core::slice::index::slice_index_order_fail@GOTPCREL] .LBB4_10: lea rax, [rip + .L__unnamed_1] mov rdi, rdx mov rsi, r12 mov rdx, rax call qword ptr [rip + core::slice::index::slice_end_index_len_fail@GOTPCREL] <<alloc::collections::vec_deque::drain::Drain<T,A> as core::ops::drop::Drop>::drop::DropGuard<T,A> as core::ops::drop::Drop>::drop::copy_data: push rbp push r15 push r14 push r13 push r12 push rbx push rax mov r14, rsi cmp rdx, rcx jae .LBB0_1 mov r12, qword ptr [rdi] mov rax, qword ptr [rdi + 16] add r14, rax xor ecx, ecx cmp r14, r12 cmovae rcx, r12 sub r14, rcx mov r15, rdx mov r13, r14 mov r14, rax mov rcx, r13 sub rcx, r14 je .LBB0_18 .LBB0_4: mov rdi, qword ptr [rdi + 8] mov rax, rcx add rax, r12 cmovae rax, rcx mov rbx, r12 sub rbx, r14 mov rcx, r12 sub rcx, r13 mov rbp, r15 sub rbp, rbx jbe .LBB0_5 cmp rax, r15 jae .LBB0_12 mov rdx, r15 sub rdx, rbx shl rdx, 2 cmp r15, rcx jbe .LBB0_16 sub rbx, rcx mov rbp, rdi lea rdi, [rdi + 4*rbx] mov r15, qword ptr [rip + memmove@GOTPCREL] mov rsi, rbp mov qword ptr [rsp], rcx call r15 sub r12, rbx lea rsi, [4*r12] add rsi, rbp shl rbx, 2 mov rdi, rbp mov rdx, rbx call r15 mov rdi, rbp lea rsi, [4*r14] add rsi, rbp lea rdi, [4*r13] add rdi, rbp mov r15, qword ptr [rsp] jmp .LBB0_7 .LBB0_1: mov r15, rcx add r14, rdx mov r12, qword ptr [rdi] mov r13, qword ptr [rdi + 16] add r14, r13 xor eax, eax cmp r14, r12 mov rcx, r12 cmovb rcx, rax sub r14, rcx add r13, rdx cmp r13, r12 cmovae rax, r12 sub r13, rax mov rcx, r13 sub rcx, r14 jne .LBB0_4 .LBB0_18: add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp ret .LBB0_5: mov rbx, r15 sub rbx, rcx jbe .LBB0_6 cmp rax, r15 jae .LBB0_9 lea rax, [rcx + r14] sub r15, rcx lea rsi, [rdi + 4*rax] shl r15, 2 mov rbx, rdi mov rdx, r15 mov r15, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, rbx lea rsi, [rbx + 4*r14] lea rdi, [rbx + 4*r13] jmp .LBB0_7 .LBB0_12: sub r15, rcx jbe .LBB0_13 sub rcx, rbx lea rsi, [rdi + 4*r14] mov r12, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rbx] mov r14, qword ptr [rip + memmove@GOTPCREL] mov rbp, rcx call r14 add rbx, r13 lea rdi, [r12 + 4*rbx] lea rdx, [4*rbp] mov rsi, r12 call r14 mov rdi, r12 lea rsi, [r12 + 4*rbp] jmp .LBB0_7 .LBB0_6: lea rsi, [rdi + 4*r14] lea rdi, [rdi + 4*r13] jmp .LBB0_7 .LBB0_16: lea rax, [rbx + r13] mov r15, rdi lea rdi, [rdi + 4*rax] mov rsi, r15 call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 lea rsi, [r15 + 4*r14] lea rdi, [r15 + 4*r13] mov r15, rbx jmp .LBB0_7 .LBB0_9: lea rsi, [rdi + 4*r14] mov r15, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rcx] mov r12, rcx call qword ptr [rip + memmove@GOTPCREL] mov rdi, r15 add r12, r14 lea rsi, [r15 + 4*r12] mov r15, rbx jmp .LBB0_7 .LBB0_13: lea rsi, [rdi + 4*r14] mov r14, rdi lea rdi, [rdi + 4*r13] lea rdx, [4*rbx] call qword ptr [rip + memmove@GOTPCREL] add rbx, r13 mov rsi, r14 lea rdi, [r14 + 4*rbx] mov r15, rbp .LBB0_7: shl r15, 2 mov rdx, r15 add rsp, 8 pop rbx pop r12 pop r13 pop r14 pop r15 pop rbp jmp qword ptr [rip + memmove@GOTPCREL] ``` </details>

…llaumeGomez Rollup of 7 pull requests Successful merges: - rust-lang#118264 (Optimize `VecDeque::drain` for (half-)open ranges) - rust-lang#121079 (distribute tool documentations and avoid file conflicts on `x install`) - rust-lang#121100 (Detect when method call on argument could be removed to fulfill failed trait bound) - rust-lang#121160 (rustdoc: fix and refactor HTML rendering a bit) - rust-lang#121198 (Add more checks for `unnamed_fields` during HIR analysis) - rust-lang#121221 (AstConv: Refactor lowering of associated item bindings a bit) - rust-lang#121237 (Use better heuristic for printing Cargo specific diagnostics) r? `@ghost` `@rustbot` modify labels: rollup

bors · 2024-02-18T00:03:42Z

⌛ Testing commit 5d98977 with merge 158f00a...

bors · 2024-02-18T01:59:16Z

☀️ Test successful - checks-actions
Approved by: the8472
Pushing 158f00a to master...

rust-timer · 2024-02-18T03:13:51Z

Finished benchmarking commit (158f00a): comparison URL.

Overall result: ❌✅ regressions and improvements - no action needed

@rustbot label: -perf-regression

Instruction count

This is a highly reliable metric that was used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	-	-	0
Regressions ❌ (secondary)	2.9%	[2.9%, 2.9%]	1
Improvements ✅ (primary)	-	-	0
Improvements ✅ (secondary)	-0.4%	[-0.4%, -0.4%]	1
All ❌✅ (primary)	-	-	0

Max RSS (memory usage)

Results

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	3.8%	[1.9%, 8.0%]	7
Regressions ❌ (secondary)	2.7%	[1.0%, 4.2%]	29
Improvements ✅ (primary)	-	-	0
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	3.8%	[1.9%, 8.0%]	7

Cycles

Results

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	-	-	0
Regressions ❌ (secondary)	3.3%	[3.2%, 3.5%]	3
Improvements ✅ (primary)	-	-	0
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	-	-	0

Binary size

Results

This is a less reliable metric that may be of interest but was not used to determine the overall result at the top of this comment.

	mean	range	count
Regressions ❌ (primary)	0.2%	[0.2%, 0.2%]	1
Regressions ❌ (secondary)	-	-	0
Improvements ✅ (primary)	-0.0%	[-0.0%, -0.0%]	4
Improvements ✅ (secondary)	-	-	0
All ❌✅ (primary)	0.0%	[-0.0%, 0.2%]	5

Bootstrap: 641.933s -> 639.353s (-0.40%)
Artifact size: 306.43 MiB -> 306.34 MiB (-0.03%)

rustbot assigned thomcc Nov 24, 2023

rustbot added S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. T-libs Relevant to the library team, which will review and decide on the PR/issue. labels Nov 24, 2023

lukas-code force-pushed the optimized-draining branch from d24ef27 to 63ce0c8 Compare January 17, 2024 16:34

rustbot assigned the8472 and unassigned thomcc Feb 1, 2024

the8472 reviewed Feb 10, 2024

View reviewed changes

library/alloc/src/collections/vec_deque/drain.rs Outdated Show resolved Hide resolved

the8472 reviewed Feb 10, 2024

View reviewed changes

lukas-code force-pushed the optimized-draining branch 3 times, most recently from 218fee2 to da1c7f6 Compare February 15, 2024 12:02

bors added S-waiting-on-bors Status: Waiting on bors to run and complete tests. Bors will change the label on completion. and removed S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. labels Feb 15, 2024

GuillaumeGomez mentioned this pull request Feb 15, 2024

Rollup of 13 pull requests #121158

Closed

oli-obk mentioned this pull request Feb 15, 2024

Rollup of 20 pull requests #121164

Closed

This comment has been minimized.

Sign in to view

bors added S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. and removed S-waiting-on-bors Status: Waiting on bors to run and complete tests. Bors will change the label on completion. labels Feb 16, 2024

Lukas Markeffsky added 6 commits February 16, 2024 13:10

simplify codegen for trivially droppable types

ec95d25

reduce amount of math

500ef67

reduce branchiness

1e3849a

outline large copies

b670293

add codegen test

8f259ad

address review comments

5d98977

lukas-code force-pushed the optimized-draining branch from da1c7f6 to 07bd9b4 Compare February 16, 2024 12:13

rustbot added A-testsuite Area: The testsuite used to check the correctness of rustc T-infra Relevant to the infrastructure team, which will review and decide on the PR/issue. labels Feb 16, 2024

lukas-code force-pushed the optimized-draining branch from 07bd9b4 to 5d98977 Compare February 16, 2024 12:58

rustbot removed A-testsuite Area: The testsuite used to check the correctness of rustc T-infra Relevant to the infrastructure team, which will review and decide on the PR/issue. labels Feb 16, 2024

bors added S-waiting-on-bors Status: Waiting on bors to run and complete tests. Bors will change the label on completion. and removed S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. labels Feb 17, 2024

GuillaumeGomez mentioned this pull request Feb 17, 2024

Rollup of 7 pull requests #121244

Closed

bors added the merged-by-bors This PR was explicitly merged by bors. label Feb 18, 2024

bors merged commit 158f00a into rust-lang:master Feb 18, 2024
12 checks passed

rustbot added this to the 1.78.0 milestone Feb 18, 2024

lukas-code deleted the optimized-draining branch February 18, 2024 09:46

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Optimize `VecDeque::drain` for (half-)open ranges #118264

Optimize `VecDeque::drain` for (half-)open ranges #118264

lukas-code commented Nov 24, 2023 •

edited

Loading

rustbot commented Nov 24, 2023

AngelicosPhosphoros commented Nov 25, 2023 •

edited

Loading

lukas-code commented Nov 25, 2023

AngelicosPhosphoros commented Nov 25, 2023

lukas-code commented Jan 17, 2024

lukas-code commented Feb 1, 2024

the8472 left a comment

the8472 Feb 10, 2024

the8472 Feb 10, 2024

the8472 commented Feb 15, 2024

bors commented Feb 15, 2024

bors commented Feb 16, 2024

This comment has been minimized.

bors commented Feb 16, 2024

lukas-code commented Feb 16, 2024

the8472 commented Feb 17, 2024

bors commented Feb 17, 2024

bors commented Feb 18, 2024

bors commented Feb 18, 2024

rust-timer commented Feb 18, 2024

Optimize VecDeque::drain for (half-)open ranges #118264

Optimize VecDeque::drain for (half-)open ranges #118264

Conversation

lukas-code commented Nov 24, 2023 • edited Loading

Footnotes

rustbot commented Nov 24, 2023

AngelicosPhosphoros commented Nov 25, 2023 • edited Loading

lukas-code commented Nov 25, 2023

AngelicosPhosphoros commented Nov 25, 2023

lukas-code commented Jan 17, 2024

lukas-code commented Feb 1, 2024

the8472 left a comment

Choose a reason for hiding this comment

the8472 Feb 10, 2024

Choose a reason for hiding this comment

the8472 Feb 10, 2024

Choose a reason for hiding this comment

the8472 commented Feb 15, 2024

bors commented Feb 15, 2024

bors commented Feb 16, 2024

This comment has been minimized.

bors commented Feb 16, 2024

lukas-code commented Feb 16, 2024

the8472 commented Feb 17, 2024

bors commented Feb 17, 2024

bors commented Feb 18, 2024

bors commented Feb 18, 2024

rust-timer commented Feb 18, 2024

Overall result: ❌✅ regressions and improvements - no action needed

Instruction count

Max RSS (memory usage)

Cycles

Binary size

Optimize `VecDeque::drain` for (half-)open ranges #118264

Optimize `VecDeque::drain` for (half-)open ranges #118264

lukas-code commented Nov 24, 2023 •

edited

Loading

AngelicosPhosphoros commented Nov 25, 2023 •

edited

Loading