1- #![ allow( dead_code) ] // runtime init functions not used during testing
1+ //! The Windows command line is just a string
2+ //! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
3+ //!
4+ //! This module implements the parsing necessary to turn that string into a list of arguments.
25
36#[ cfg( test) ]
47mod tests;
58
69use crate :: ffi:: OsString ;
710use crate :: fmt;
11+ use crate :: marker:: PhantomData ;
12+ use crate :: num:: NonZeroU16 ;
813use crate :: os:: windows:: prelude:: * ;
914use crate :: path:: PathBuf ;
10- use crate :: slice ;
15+ use crate :: ptr :: NonNull ;
1116use crate :: sys:: c;
1217use crate :: sys:: windows:: os:: current_exe;
1318use crate :: vec;
1419
1520use core:: iter;
1621
1722pub fn args ( ) -> Args {
23+ // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
24+ // string so it's safe for `WStrUnits` to use.
1825 unsafe {
1926 let lp_cmd_line = c:: GetCommandLineW ( ) ;
20- let parsed_args_list = parse_lp_cmd_line ( lp_cmd_line as * const u16 , || {
27+ let parsed_args_list = parse_lp_cmd_line ( WStrUnits :: new ( lp_cmd_line) , || {
2128 current_exe ( ) . map ( PathBuf :: into_os_string) . unwrap_or_else ( |_| OsString :: new ( ) )
2229 } ) ;
2330
@@ -28,129 +35,120 @@ pub fn args() -> Args {
2835/// Implements the Windows command-line argument parsing algorithm.
2936///
3037/// Microsoft's documentation for the Windows CLI argument format can be found at
31- /// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>.
38+ /// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
3239///
33- /// Windows includes a function to do this in shell32.dll,
34- /// but linking with that DLL causes the process to be registered as a GUI application.
40+ /// A more in-depth explanation is here:
41+ /// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
42+ ///
43+ /// Windows includes a function to do command line parsing in shell32.dll.
44+ /// However, this is not used for two reasons:
45+ ///
46+ /// 1. Linking with that DLL causes the process to be registered as a GUI application.
3547/// GUI applications add a bunch of overhead, even if no windows are drawn. See
3648/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
3749///
38- /// This function was tested for equivalence to the shell32.dll implementation in
39- /// Windows 10 Pro v1803, using an exhaustive test suite available at
40- /// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or
41- /// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>.
42- unsafe fn parse_lp_cmd_line < F : Fn ( ) -> OsString > (
43- lp_cmd_line : * const u16 ,
50+ /// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
51+ ///
52+ /// This function was tested for equivalence to the C/C++ parsing rules using an
53+ /// extensive test suite available at
54+ /// <https://github.com/ChrisDenton/winarg/tree/std>.
55+ fn parse_lp_cmd_line < ' a , F : Fn ( ) -> OsString > (
56+ lp_cmd_line : Option < WStrUnits < ' a > > ,
4457 exe_name : F ,
4558) -> Vec < OsString > {
46- const BACKSLASH : u16 = '\\' as u16 ;
47- const QUOTE : u16 = '"' as u16 ;
48- const TAB : u16 = '\t' as u16 ;
49- const SPACE : u16 = ' ' as u16 ;
59+ const BACKSLASH : NonZeroU16 = NonZeroU16 :: new ( b'\\' as u16 ) . unwrap ( ) ;
60+ const QUOTE : NonZeroU16 = NonZeroU16 :: new ( b'"' as u16 ) . unwrap ( ) ;
61+ const TAB : NonZeroU16 = NonZeroU16 :: new ( b'\t' as u16 ) . unwrap ( ) ;
62+ const SPACE : NonZeroU16 = NonZeroU16 :: new ( b' ' as u16 ) . unwrap ( ) ;
63+
5064 let mut ret_val = Vec :: new ( ) ;
51- if lp_cmd_line. is_null ( ) || * lp_cmd_line == 0 {
65+ // If the cmd line pointer is null or it points to an empty string then
66+ // return the name of the executable as argv[0].
67+ if lp_cmd_line. as_ref ( ) . and_then ( |cmd| cmd. peek ( ) ) . is_none ( ) {
5268 ret_val. push ( exe_name ( ) ) ;
5369 return ret_val;
5470 }
55- let mut cmd_line = {
56- let mut end = 0 ;
57- while * lp_cmd_line. offset ( end) != 0 {
58- end += 1 ;
59- }
60- slice:: from_raw_parts ( lp_cmd_line, end as usize )
61- } ;
71+ let mut code_units = lp_cmd_line. unwrap ( ) ;
72+
6273 // The executable name at the beginning is special.
63- cmd_line = match cmd_line[ 0 ] {
64- // The executable name ends at the next quote mark,
65- // no matter what.
66- QUOTE => {
67- let args = {
68- let mut cut = cmd_line[ 1 ..] . splitn ( 2 , |& c| c == QUOTE ) ;
69- if let Some ( exe) = cut. next ( ) {
70- ret_val. push ( OsString :: from_wide ( exe) ) ;
71- }
72- cut. next ( )
73- } ;
74- if let Some ( args) = args {
75- args
76- } else {
77- return ret_val;
78- }
79- }
80- // Implement quirk: when they say whitespace here,
81- // they include the entire ASCII control plane:
82- // "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW
83- // will consider the first argument to be an empty string. Excess whitespace at the
84- // end of lpCmdLine is ignored."
85- 0 ..=SPACE => {
86- ret_val. push ( OsString :: new ( ) ) ;
87- & cmd_line[ 1 ..]
88- }
89- // The executable name ends at the next whitespace,
90- // no matter what.
91- _ => {
92- let args = {
93- let mut cut = cmd_line. splitn ( 2 , |& c| c > 0 && c <= SPACE ) ;
94- if let Some ( exe) = cut. next ( ) {
95- ret_val. push ( OsString :: from_wide ( exe) ) ;
96- }
97- cut. next ( )
98- } ;
99- if let Some ( args) = args {
100- args
101- } else {
102- return ret_val;
103- }
74+ let mut in_quotes = false ;
75+ let mut cur = Vec :: new ( ) ;
76+ for w in & mut code_units {
77+ match w {
78+ // A quote mark always toggles `in_quotes` no matter what because
79+ // there are no escape characters when parsing the executable name.
80+ QUOTE => in_quotes = !in_quotes,
81+ // If not `in_quotes` then whitespace ends argv[0].
82+ SPACE | TAB if !in_quotes => break ,
83+ // In all other cases the code unit is taken literally.
84+ _ => cur. push ( w. get ( ) ) ,
10485 }
105- } ;
86+ }
87+ // Skip whitespace.
88+ code_units. advance_while ( |w| w == SPACE || w == TAB ) ;
89+ ret_val. push ( OsString :: from_wide ( & cur) ) ;
90+
91+ // Parse the arguments according to these rules:
92+ // * All code units are taken literally except space, tab, quote and backslash.
93+ // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
94+ // treated as a single separator.
95+ // * A space or tab `in_quotes` is taken literally.
96+ // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
97+ // * A quote can be escaped if preceded by an odd number of backslashes.
98+ // * If any number of backslashes is immediately followed by a quote then the number of
99+ // backslashes is halved (rounding down).
100+ // * Backslashes not followed by a quote are all taken literally.
101+ // * If `in_quotes` then a quote can also be escaped using another quote
102+ // (i.e. two consecutive quotes become one literal quote).
106103 let mut cur = Vec :: new ( ) ;
107104 let mut in_quotes = false ;
108- let mut was_in_quotes = false ;
109- let mut backslash_count: usize = 0 ;
110- for & c in cmd_line {
111- match c {
112- // backslash
113- BACKSLASH => {
114- backslash_count += 1 ;
115- was_in_quotes = false ;
105+ while let Some ( w) = code_units. next ( ) {
106+ match w {
107+ // If not `in_quotes`, a space or tab ends the argument.
108+ SPACE | TAB if !in_quotes => {
109+ ret_val. push ( OsString :: from_wide ( & cur[ ..] ) ) ;
110+ cur. truncate ( 0 ) ;
111+
112+ // Skip whitespace.
113+ code_units. advance_while ( |w| w == SPACE || w == TAB ) ;
116114 }
117- QUOTE if backslash_count % 2 == 0 => {
118- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count / 2 ) ) ;
119- backslash_count = 0 ;
120- if was_in_quotes {
121- cur. push ( '"' as u16 ) ;
122- was_in_quotes = false ;
115+ // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
116+ BACKSLASH => {
117+ let backslash_count = code_units. advance_while ( |w| w == BACKSLASH ) + 1 ;
118+ if code_units. peek ( ) == Some ( QUOTE ) {
119+ cur. extend ( iter:: repeat ( BACKSLASH . get ( ) ) . take ( backslash_count / 2 ) ) ;
120+ // The quote is escaped if there are an odd number of backslashes.
121+ if backslash_count % 2 == 1 {
122+ code_units. next ( ) ;
123+ cur. push ( QUOTE . get ( ) ) ;
124+ }
123125 } else {
124- was_in_quotes = in_quotes ;
125- in_quotes = !in_quotes ;
126+ // If there is no quote on the end then there is no escaping.
127+ cur . extend ( iter :: repeat ( BACKSLASH . get ( ) ) . take ( backslash_count ) ) ;
126128 }
127129 }
128- QUOTE if backslash_count % 2 != 0 => {
129- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count / 2 ) ) ;
130- backslash_count = 0 ;
131- was_in_quotes = false ;
132- cur. push ( b'"' as u16 ) ;
133- }
134- SPACE | TAB if !in_quotes => {
135- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count) ) ;
136- if !cur. is_empty ( ) || was_in_quotes {
137- ret_val. push ( OsString :: from_wide ( & cur[ ..] ) ) ;
138- cur. truncate ( 0 ) ;
130+ // If `in_quotes` and not backslash escaped (see above) then a quote either
131+ // unsets `in_quotes` or is escaped by another quote.
132+ QUOTE if in_quotes => match code_units. peek ( ) {
133+ // Two consecutive quotes when `in_quotes` produces one literal quote.
134+ Some ( QUOTE ) => {
135+ cur. push ( QUOTE . get ( ) ) ;
136+ code_units. next ( ) ;
139137 }
140- backslash_count = 0 ;
141- was_in_quotes = false ;
142- }
143- _ => {
144- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count) ) ;
145- backslash_count = 0 ;
146- was_in_quotes = false ;
147- cur. push ( c) ;
148- }
138+ // Otherwise set `in_quotes`.
139+ Some ( _) => in_quotes = false ,
140+ // The end of the command line.
141+ // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
142+ None => break ,
143+ } ,
144+ // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quotes`.
145+ QUOTE => in_quotes = true ,
146+ // Everything else is always taken literally.
147+ _ => cur. push ( w. get ( ) ) ,
149148 }
150149 }
151- cur. extend ( iter:: repeat ( b'\\' as u16 ) . take ( backslash_count) ) ;
152- // include empty quoted strings at the end of the arguments list
153- if !cur. is_empty ( ) || was_in_quotes || in_quotes {
150+ // Push the final argument, if any.
151+ if !cur. is_empty ( ) || in_quotes {
154152 ret_val. push ( OsString :: from_wide ( & cur[ ..] ) ) ;
155153 }
156154 ret_val
@@ -187,3 +185,52 @@ impl ExactSizeIterator for Args {
187185 self . parsed_args_list . len ( )
188186 }
189187}
188+
/// A safe iterator over a LPWSTR
/// (aka a pointer to a series of UTF-16 code units terminated by a NULL).
///
/// Each code unit up to, but not including, the terminating NULL is yielded
/// as a `NonZeroU16`. The pointer is never advanced past the terminator, so
/// once `None` has been returned every later call returns `None` as well.
struct WStrUnits<'a> {
    // The pointer must never be null...
    lpwstr: NonNull<u16>,
    // ...and the memory it points to must be valid for this lifetime.
    lifetime: PhantomData<&'a [u16]>,
}

impl WStrUnits<'_> {
    /// Creates the iterator. Returns `None` if `lpwstr` is null.
    ///
    /// # Safety
    ///
    /// `lpwstr` must point to a null-terminated wide string that lives
    /// at least as long as the lifetime of this struct.
    unsafe fn new(lpwstr: *const u16) -> Option<Self> {
        // The cast to `*mut` only exists to satisfy `NonNull`; this type
        // never writes through the pointer.
        let lpwstr = NonNull::new(lpwstr as *mut u16)?;
        Some(Self { lpwstr, lifetime: PhantomData })
    }

    /// Returns the current code unit without advancing the iterator,
    /// or `None` if the iterator is at the terminating NULL.
    fn peek(&self) -> Option<NonZeroU16> {
        // SAFETY: It's always safe to read the current item because we don't
        // ever move out of the array's bounds: the pointer starts inside a
        // null-terminated string (see `new`) and is only advanced by `next`,
        // which stops at the terminator.
        let unit = unsafe { *self.lpwstr.as_ptr() };
        NonZeroU16::new(unit)
    }

    /// Advance the iterator while `predicate` returns true.
    /// Returns the number of items it advanced by.
    fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
        let mut counter = 0;
        while let Some(w) = self.peek() {
            if !predicate(w) {
                break;
            }
            counter += 1;
            self.next();
        }
        counter
    }
}

impl Iterator for WStrUnits<'_> {
    // This can never return zero as that marks the end of the string.
    type Item = NonZeroU16;

    fn next(&mut self) -> Option<NonZeroU16> {
        // If NULL is reached we immediately return without moving the pointer.
        let next = self.peek()?;
        // SAFETY: The current unit is non-zero, so the terminator has not been
        // reached yet. The following element is therefore still part of the
        // string (at worst it is the terminator itself), and a pointer derived
        // from a non-null in-bounds pointer is itself non-null.
        self.lpwstr = unsafe { NonNull::new_unchecked(self.lpwstr.as_ptr().add(1)) };
        Some(next)
    }
}
0 commit comments