linebender/druid#1025 (comment)
According to cargo bloat, Event::parse is more than 9kb of code (this is only with randr). Then there's also a bunch of things like KeyPressEvent::try_parse at around the 3kb mark. I bet that rustc is doing a lot of inlining, and that the error-handling code could be de-duplicated.
I'm not sure I understand this correctly (is it "there are multiple things (e.g. KeyPressEvent::try_parse) that are in total 3kb in size" or is it "KeyPressEvent::try_parse is 3kb in size"?), but I cannot really reproduce.
I copied together some self-contained code for `KeyPressEvent::try_parse`:
use std::convert::TryInto;
/// Error returned when raw bytes cannot be parsed into the requested type.
///
/// The single variant carries no payload; every parse failure in this snippet is
/// reported as `ParseError::ParseError`.
pub enum ParseError { ParseError }
// Plain integer aliases used as field types by the parsing code in this snippet.
// Every alias is `u32` except `Keycode` and `Button`, which are `u8`.
// Only `Window`, `Timestamp` and `Keycode` are referenced by `KeyPressEvent`.
// NOTE(review): the names look like X11 protocol ID types — confirm against the generator.
pub type Window = u32;
pub type Pixmap = u32;
pub type Cursor = u32;
pub type Font = u32;
pub type Gcontext = u32;
pub type Colormap = u32;
pub type Atom = u32;
pub type Drawable = u32;
pub type Fontable = u32;
pub type Bool32 = u32;
pub type Visualid = u32;
pub type Timestamp = u32;
pub type Keysym = u32;
pub type Keycode = u8;
pub type Keycode32 = u32;
pub type Button = u8;
/// A type implementing this trait can be parsed from some raw bytes.
///
/// The implementations in this file read from the front of the input and return the
/// unread tail, which lets callers chain several `try_parse` calls over one buffer.
pub trait TryParse: Sized {
/// Try to parse the given values into an instance of this type.
///
/// If parsing is successful, an instance of the type and a slice for the remaining data should
/// be returned. Otherwise, an error is returned.
///
/// # Errors
///
/// The implementations in this file return `ParseError::ParseError` when `value`
/// holds fewer bytes than the type needs.
fn try_parse(value: &[u8]) -> Result<(Self, &[u8]), ParseError>;
}
/// Implements `TryParse` for a fixed-width integer type `$t`.
///
/// The generated `try_parse` decodes the first `size_of::<$t>()` bytes of the input in
/// native byte order and returns the decoded value together with the unread tail.
/// Inputs shorter than the integer's width yield `ParseError::ParseError`.
macro_rules! implement_try_parse {
    ($t:ty) => {
        impl TryParse for $t {
            fn try_parse(value: &[u8]) -> Result<(Self, &[u8]), ParseError> {
                // Width of the integer in bytes; known at compile time for each `$t`.
                const WIDTH: usize = std::mem::size_of::<$t>();
                if value.len() < WIDTH {
                    return Err(ParseError::ParseError);
                }
                // The guard above makes this split infallible.
                let (head, tail) = value.split_at(WIDTH);
                let mut raw = [0u8; WIDTH];
                raw.copy_from_slice(head);
                Ok((<$t>::from_ne_bytes(raw), tail))
            }
        }
    };
}
/// Parses a `bool` from a single byte.
impl TryParse for bool {
    /// Reads one byte via `u8::try_parse`; zero decodes to `false`, any other value to `true`.
    ///
    /// The unread tail is passed through unchanged, and an error from the
    /// underlying `u8` parse is returned as-is.
    fn try_parse(value: &[u8]) -> Result<(Self, &[u8]), ParseError> {
        u8::try_parse(value).map(|(raw, rest)| (raw != 0, rest))
    }
}
// Generate `TryParse` impls for the signed and unsigned 8/16/32/64-bit integers.
// The `bool` impl and the `KeyPressEvent` parser are built on top of these.
implement_try_parse!(u8);
implement_try_parse!(i8);
implement_try_parse!(u16);
implement_try_parse!(i16);
implement_try_parse!(u32);
implement_try_parse!(i32);
implement_try_parse!(u64);
implement_try_parse!(i64);
/// Decoded form of a key-press event.
///
/// The fields are declared in the same order in which `KeyPressEvent::try_parse`
/// reads them from the input; the encoded form occupies 32 bytes (31 bytes of
/// fields followed by one skipped byte).
/// NOTE(review): field names appear to follow the X11 core-protocol KeyPress event — confirm.
pub struct KeyPressEvent {
pub response_type: u8,
pub detail: Keycode,
pub sequence: u16,
pub time: Timestamp,
pub root: Window,
pub event: Window,
pub child: Window,
pub root_x: i16,
pub root_y: i16,
pub event_x: i16,
pub event_y: i16,
pub state: u16,
// Decoded from a single byte; any non-zero value becomes `true`.
pub same_screen: bool,
}
/// Field-by-field parser for the 32-byte encoded form of `KeyPressEvent`.
impl TryParse for KeyPressEvent {
    /// Decodes a `KeyPressEvent` from the front of `initial_value`.
    ///
    /// Each field is read in declaration order with its own `try_parse`, then one
    /// further byte is skipped. On success the returned slice starts 32 bytes into
    /// `initial_value`.
    ///
    /// # Errors
    ///
    /// Returns `ParseError::ParseError` if the input ends before all fields and the
    /// skipped byte have been read, i.e. if it is shorter than 32 bytes.
    fn try_parse(initial_value: &[u8]) -> Result<(Self, &[u8]), ParseError> {
        let cursor = initial_value;
        let (response_type, cursor) = u8::try_parse(cursor)?;
        let (detail, cursor) = Keycode::try_parse(cursor)?;
        let (sequence, cursor) = u16::try_parse(cursor)?;
        let (time, cursor) = Timestamp::try_parse(cursor)?;
        let (root, cursor) = Window::try_parse(cursor)?;
        let (event, cursor) = Window::try_parse(cursor)?;
        let (child, cursor) = Window::try_parse(cursor)?;
        let (root_x, cursor) = i16::try_parse(cursor)?;
        let (root_y, cursor) = i16::try_parse(cursor)?;
        let (event_x, cursor) = i16::try_parse(cursor)?;
        let (event_y, cursor) = i16::try_parse(cursor)?;
        let (state, cursor) = u16::try_parse(cursor)?;
        let (same_screen, cursor) = bool::try_parse(cursor)?;

        // One more byte must be present after `same_screen`; its value is ignored.
        let _after_skipped_byte = cursor.get(1..).ok_or(ParseError::ParseError)?;

        let parsed = KeyPressEvent {
            response_type,
            detail,
            sequence,
            time,
            root,
            event,
            child,
            root_x,
            root_y,
            event_x,
            event_y,
            state,
            same_screen,
        };

        // The tail is taken relative to the original input rather than the running
        // cursor: the event always consumes exactly 32 bytes.
        match initial_value.get(32..) {
            Some(tail) => Ok((parsed, tail)),
            None => Err(ParseError::ParseError),
        }
    }
}
The resulting compiler output with `-Copt-level=3 --edition=2018` is 41 KiB of text (according to https://rust.godbolt.org).
I cannot easily see the binary size, but here is the assembly for `KeyPressEvent::try_parse`:
<example::KeyPressEvent as example::TryParse>::try_parse:
push rbp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 1
mov rax, rdi
test rdx, rdx
je .LBB1_16
cmp rdx, 1
je .LBB1_16
mov rcx, rdx
and rcx, -2
cmp rcx, 2
je .LBB1_16
mov rdi, rdx
and rdi, -4
cmp rdi, 4
je .LBB1_16
cmp rdi, 8
je .LBB1_16
cmp rdi, 12
je .LBB1_16
cmp rdi, 16
je .LBB1_16
cmp rcx, 20
je .LBB1_16
cmp rcx, 22
je .LBB1_16
cmp rcx, 24
je .LBB1_16
cmp rcx, 26
je .LBB1_16
cmp rcx, 28
je .LBB1_16
cmp rdx, 30
je .LBB1_16
cmp byte ptr [rsi + 30], 0
setne cl
cmp rdx, 31
je .LBB1_16
cmp rdx, 32
jae .LBB1_15
.LBB1_16:
mov byte ptr [rax + 30], 2
.LBB1_17:
add rsp, 1
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB1_15:
mov r9b, byte ptr [rsi]
mov bl, byte ptr [rsi + 1]
mov byte ptr [rsp], bl
movzx r10d, word ptr [rsi + 2]
mov edi, dword ptr [rsi + 4]
mov r11d, dword ptr [rsi + 8]
mov ebx, dword ptr [rsi + 12]
mov ebp, dword ptr [rsi + 16]
movzx r14d, word ptr [rsi + 20]
movzx r15d, word ptr [rsi + 22]
movzx r12d, word ptr [rsi + 24]
movzx r13d, word ptr [rsi + 26]
movzx r8d, word ptr [rsi + 28]
add rsi, 32
add rdx, -32
mov dword ptr [rax], edi
mov dword ptr [rax + 4], r11d
mov dword ptr [rax + 8], ebx
mov dword ptr [rax + 12], ebp
mov word ptr [rax + 16], r10w
mov word ptr [rax + 18], r14w
mov word ptr [rax + 20], r15w
mov word ptr [rax + 22], r12w
mov word ptr [rax + 24], r13w
mov word ptr [rax + 26], r8w
mov byte ptr [rax + 28], r9b
mov bl, byte ptr [rsp]
mov byte ptr [rax + 29], bl
mov byte ptr [rax + 30], cl
mov qword ptr [rax + 32], rsi
mov qword ptr [rax + 40], rdx
jmp .LBB1_17
That's just 90 lines of assembly and it does not call any other code. This can't be 3 KiB of binary code.
Without optimisation, the output is a lot more ugly, but I do not think that looking at this output makes sense.
One thing I notice: llvm managed to merge all the error handling, but it does not notice that it can simplify `if length < 4 then goto error; if length < 8 then goto error;` etc. Adding `if initial_value.len() < 32 { return Err(ParseError::ParseError); }` as a new first line to `KeyPressEvent::try_parse` helps here. The assembly now only has 56 lines. There are some simplifications that I do not immediately understand, but all of this "`cmp` with small number, then jump" was merged into a single `cmp rdx, 31`. I guess generating something like this "everywhere" in the code generator shouldn't be too hard and should help a lot.
For the timeline: Optimisation just for the sake of optimisation is hard. It makes more sense to take "size of some program" as a measurement. Thus, I suggest not to merge anything on this before the release and instead proceed carefully.
A goal for optimisation might be to take one of the examples in this repo and check their binary size. For example cargo build --release --example xclock_utc results in a 7.3 MiB binary which strip turns into 503 KiB.
After the following patch, this turns into 7.3 MiB and 499 KiB. That's already 4 KiB less, just by adding more code to the generated code. :-)
diff --git a/generator/src/generator/namespace.rs b/generator/src/generator/namespace.rs
index c54f996..826744b 100644
--- a/generator/src/generator/namespace.rs
+++ b/generator/src/generator/namespace.rs
@@ -2560,6 +2560,9 @@ impl<'ns, 'c> NamespaceGenerator<'ns, 'c> {
if parse_size_constraint != StructSizeConstraint::None {
outln!(out, "let remaining = initial_value;");
}
+ if let StructSizeConstraint::Fixed(size) = parse_size_constraint {
+ outln!(out, "if remaining.len() < {} {{ return Err(ParseError::ParseError); }}", size);
+ }
Self::emit_let_value_for_dynamic_align(fields, out);
for field in fields.iter() {
self.emit_field_parse(
CC @jneem I'd be happy about your input here. (And I have never worked with cargo bloat before.)
One quick question: Did you use cargo build --release? Or did I perhaps misunderstand you?
linebender/druid#1025 (comment)
I'm not sure I understand this correctly (is it "there are multiple things (e.g. `KeyPressEvent::try_parse`) that are in total 3kb in size" or is it "`KeyPressEvent::try_parse` is 3kb in size"?), but I cannot really reproduce.
I copied together some self-contained code for `KeyPressEvent::try_parse`
I cannot easily see the binary size, but here is the assembly for `KeyPressEvent::try_parse`:
Without optimisation, the output is a lot more ugly, but I do not think that looking at this output makes sense.
One thing I notice: llvm managed to merge all the error handling, but it does not notice that it can simplify `if length < 4 then goto error; if length < 8 then goto error;` etc. Adding `if initial_value.len() < 32 { return Err(ParseError::ParseError); }` as a new first line to `KeyPressEvent::try_parse` helps here. The assembly now only has 56 lines. There are some simplifications that I do not immediately understand, but all of this "`cmp` with small number, then jump" was merged into a single `cmp rdx, 31`. I guess generating something like this "everywhere" in the code generator shouldn't be too hard and should help a lot.
For the timeline: Optimisation just for the sake of optimisation is hard. It makes more sense to take "size of some program" as a measurement. Thus, I suggest not to merge anything on this before the release and instead proceed carefully.
A goal for optimisation might be to take one of the examples in this repo and check their binary size. For example `cargo build --release --example xclock_utc` results in a 7.3 MiB binary which `strip` turns into 503 KiB.
After the following patch, this turns into 7.3 MiB and 499 KiB. That's already 4 KiB less, just by adding more code to the generated code. :-)
CC @jneem I'd be happy about your input here. (And I have never worked with `cargo bloat` before.)
One quick question: Did you use `cargo build --release`? Or did I perhaps misunderstand you?