@@ -44,9 +44,6 @@ def get_content_el(root_el):
4444VERSION_CXX11 = 102
4545VERSION_CXX14 = 103
4646
47- DESC_CHAR_LIMIT = 200
48- MAX_PAREN_SIZE = 40
49-
5047''' Returns the declaration of the feature with name 'name'.
5148 If several declarations with the same name are present, and entries
5249 superseded in the later standards (as determined by the presence of until
@@ -118,13 +115,155 @@ def get_declarations(root_el, name):
def del_all_attrs(el):
    '''Remove every attribute from the element `el`, in place.

    `el.attrib` is emptied with a single `clear()` call instead of deleting
    keys while iterating over the mapping: mutating a plain dict during
    iteration (as used by xml.etree.ElementTree) raises RuntimeError.
    NOTE(review): assumes `el.attrib` is a mutable mapping providing
    `clear()` (true for dict; lxml's attribute proxy is expected to provide
    it as well - confirm against the element type used by callers).
    '''
    el.attrib.clear()
118+
def iterate_top_text(text, on_text=None):
    '''Report every chunk of `text` that lies outside <code>, <i> and <b> tags.

    `on_text(pos, chunk)` is called once per top-level chunk, in order,
    where `pos` is the offset of `chunk` within `text`. A chunk is reported
    each time a tag opens at nesting depth zero (the chunk may be empty) and
    once more for the text after the last closed tag. If a tag is still open
    at the end of `text`, no trailing chunk is reported.

    When `on_text` is None there is nothing to report to, so the function
    returns immediately instead of failing on the first call.
    '''
    if on_text is None:
        return

    last_close = 0   # offset just past the most recent depth-zero close tag
    open_count = 0   # current tag nesting depth

    for match in re.finditer(r'(<code>|</code>|<i>|</i>|<b>|</b>)', text):
        tag = match.group(1)
        if tag[1] != '/':
            # opening tag: the text since the last close is top-level text
            if open_count == 0:
                on_text(last_close, text[last_close:match.start()])
            open_count += 1
        else:
            open_count -= 1
            if open_count == 0:
                last_close = match.end()

    if open_count == 0:
        on_text(last_close, text[last_close:])
136+
def remove_parentheses(desc, max_paren_text_size):
    '''Return `desc` with selected top-level parenthesized blocks removed.

    Only parentheses found in top-level text (as reported by
    iterate_top_text, i.e. outside <code>, <i> and <b> tags) are considered.
    An outermost "(...)" block is removed, parentheses included, when its
    length exceeds `max_paren_text_size` characters or when it contains one
    of the placeholder characters 'ᚃ' / 'ᚄ'. Nested parentheses are treated
    as part of the outermost block.
    '''
    open_paren_count = 0   # nesting depth of top-level parentheses
    last_paren_open = 0    # offset in desc of the outermost '('
    del_ranges = []        # (begin, end) slices of desc to delete

    def on_text(pos, text):
        nonlocal open_paren_count, last_paren_open
        for match in re.finditer(r'[()]', text):
            if match.group(0) == '(':
                if open_paren_count == 0:
                    last_paren_open = pos + match.start()
                open_paren_count += 1
            elif open_paren_count > 0:
                # unmatched ')' is ignored so the depth never goes negative
                open_paren_count -= 1
                if open_paren_count == 0:
                    end = pos + match.end()
                    # inspect the parenthesized text itself, not the whole
                    # chunk, and record each range only once
                    paren_text = desc[last_paren_open:end]
                    if (len(paren_text) > max_paren_text_size
                            or 'ᚃ' in paren_text or 'ᚄ' in paren_text):
                        del_ranges.append((last_paren_open, end))

    # without this scan the callback never runs and nothing is removed
    iterate_top_text(desc, on_text)

    # delete from the back so earlier offsets stay valid
    for begin, end in reversed(del_ranges):
        desc = desc[:begin] + desc[end:]
    return desc
168+
def split_sentences(desc):
    '''Split `desc` into sentences and return them as a list of str.

    A sentence ends at a dot found in top-level text (as reported by
    iterate_top_text, i.e. outside <code>, <i> and <b> tags); the dot is kept
    in the sentence. Every dot of a top-level chunk ends a sentence, so
    several sentences inside one chunk are split individually. Text
    remaining after the last dot is returned as a final sentence if it is
    not blank. Concatenating the returned pieces reproduces `desc`, except
    for a blank tail which is dropped.
    '''
    sentences = []
    sentence_start_pos = 0   # offset in desc where the current sentence began

    def on_text(pos, text):
        nonlocal sentence_start_pos
        dot_pos = text.find('.')
        while dot_pos != -1:
            sentence_end = pos + dot_pos + 1
            sentences.append(desc[sentence_start_pos:sentence_end])
            sentence_start_pos = sentence_end
            dot_pos = text.find('.', dot_pos + 1)

    iterate_top_text(desc, on_text)

    remainder = desc[sentence_start_pos:]
    if remainder.strip():
        sentences.append(remainder)

    return sentences
189+
def remove_punctuation_at_end(sentence):
    '''Return `sentence` with trailing spaces, dots, commas, colons and
    semicolons stripped.
    '''
    trailing_chars = ' .,:;'
    return sentence.rstrip(trailing_chars)
192+
def trim_single_sentence_at_word(sentence, max_chars):
    '''Cut `sentence` at a word boundary near `max_chars` and return it.

    The last top-level chunk (text outside <code>, <i> and <b> tags, as
    reported by iterate_top_text) that starts at an offset not greater than
    `max_chars` is located. Everything before that chunk is kept verbatim;
    the chunk itself is cut after the last space-separated word that still
    fits (counting one separator per word), and anything following the chunk
    is dropped. The first word of the chunk is always kept.
    '''
    chunk = None
    chunk_start = 0

    def remember_chunk(pos, text):
        nonlocal chunk, chunk_start
        if pos > max_chars:
            return
        chunk, chunk_start = text, pos

    iterate_top_text(sentence, remember_chunk)

    # only the selected top-level chunk is split into words
    words = chunk.split(' ')
    kept = 1                 # number of leading words to keep
    consumed = chunk_start   # offset reached after each word + separator
    for count, word in enumerate(words, start=1):
        consumed += len(word) + 1
        if consumed > max_chars:
            break
        kept = count

    return sentence[:chunk_start] + ' '.join(words[:kept])
219+
def trim_single_sentence(text, max_chars):
    '''Shorten a single sentence to roughly `max_chars` characters.

    A sentence that already fits is returned unchanged. Otherwise the first
    placeholder character ('ᚃ' or 'ᚄ') in top-level text is looked up:
      - if it sits within the first three characters, '' is returned;
      - if it sits within the limit, the sentence is cut just before it;
      - if it is beyond the limit or absent, the sentence is cut at a word
        boundary by trim_single_sentence_at_word.
    Trailing punctuation is then stripped and '...' is appended.
    '''
    if len(text) <= max_chars:
        return text

    marker_pos = None   # offset of the first placeholder in top-level text

    def locate_marker(pos, chunk):
        nonlocal marker_pos
        if marker_pos is not None:
            return
        found = re.search('[ᚃᚄ]', chunk)
        if found is not None:
            marker_pos = pos + found.start()

    iterate_top_text(text, locate_marker)

    if marker_pos is not None and marker_pos <= 2:
        return ''

    if marker_pos is None or marker_pos > max_chars:
        shortened = trim_single_sentence_at_word(text, max_chars)
    else:
        shortened = text[:marker_pos]

    return remove_punctuation_at_end(shortened) + '...'
252+
121253''' Processes description text. Drops all tags except <code> and <i>. Replaces
122254 <b> with <i>. Replaces span.mw-geshi with <code>. Returns the processed
123- description as str. The description is limited to one sentence (delimited
124- by a dot) and a maximum of 200 characters. If the sentence is longer than
125- 200 characters, '...' is appended.
255+ description as str.
256+
257+ The description is limited to max_sentences sentences (each delimited by
258+ a dot) and to max_chars characters.
259+ If a single sentence is longer than max_chars characters, '...' is appended.
260+
261+ Setting max_paren_text_size controls the maximum number of characters in
262+ parenthesized text. If the size of a parenthesized block exceeds that, it is
263+ removed. Such blocks within <code>, <b> or <i> tags are ignored.
126264'''
127- def process_description (el , debug = False ):
265+ def process_description (el , max_sentences = 1 , max_chars = 200 ,
266+ max_paren_text_size = 40 , debug = False ):
128267
129268 el = deepcopy (el ) # we'll modify the tree
130269 el .tag = 'root'
@@ -151,163 +290,40 @@ def process_description(el, debug=False):
151290 desc = desc .replace ('that is,' , 'ᚄ' )
152291
153292 # process the description:
154- # remove text in parentheses (except when it's within a tags
293+ # remove text in parentheses (except when it's within some tag)
155294 # get the position of the cut of the description
156295
157296 open_count = 0
158297 open_paren_count = 0
159298
160- del_ranges = []
161-
162- # remove parentheses
163- for t in re .finditer ('(<code>|</code>|<i>|</i>|<b>|</b>|\(|\))' , desc ):
164- mt = t .group (1 )
165-
166- if mt == '(' :
167- if open_count == 0 :
168- open_paren_count += 1
169- if open_paren_count == 1 :
170- last_paren_open = t .start ()
171-
172- elif mt == ')' :
173- if open_count == 0 and open_paren_count > 0 :
174- open_paren_count -= 1
175- if open_paren_count == 0 :
176- end = t .start ()+ 1
177- text = desc [last_paren_open :end ]
178- if (text .find ('ᚃ' ) != - 1 or
179- text .find ('ᚄ' ) != - 1 or
180- len (text ) > MAX_PAREN_SIZE ):
181- del_ranges .append ((last_paren_open , t .start ()+ 1 ))
182-
183- else :
184- if mt [1 ] != '/' :
185- open_count += 1
186- else :
187- open_count -= 1
188-
189- for r in reversed (del_ranges ):
190- begin ,end = r
191- desc = desc [:begin ] + desc [end :]
192-
193- if debug :
194- print ("PAREN: " + desc )
195-
196- # find the first dot, actual limit when ignoring the tags
197- last_open = - 1
198- last_close = 0
199- open_count = 0
200- first_dot = - 1
201-
202- curr_limit = DESC_CHAR_LIMIT
299+ desc = remove_parentheses (desc , max_paren_text_size )
300+ sentences = split_sentences (desc )
203301
204- for t in re .finditer ('(<code>|</code>|<i>|</i>|<b>|</b>)' , desc ):
205- mt = t .group (1 )
302+ # limit sentence count
303+ if len (sentences ) > max_sentences :
304+ sentences = sentences [:max_sentences ]
206305
207- if t .start () > curr_limit + len (mt ):
306+ # coarse character limit
307+ char_count = 0
308+ last_sentence = len (sentences )
309+ for i , s in enumerate (sentences ):
310+ char_count += len (s )
311+ if char_count > max_chars :
312+ last_sentence = i + 1
208313 break
314+ sentences = sentences [:last_sentence ]
209315
210- curr_limit += len (mt )
211-
212- if t .group (1 )[1 ] != '/' :
213- if open_count == 0 :
214- last_open = t .start ()
215- # find any dots in the top level text
216- pos = desc [last_close :last_open ].find ('.' )
217- if pos != - 1 and first_dot == - 1 :
218- first_dot = last_close + pos
219-
220- open_count += 1
221-
222- else :
223- open_count -= 1
224- if open_count == 0 :
225- last_close = t .start ()
226-
227- # find dot if there were no tags (last_close == 0) or in the range after
228- # the last close tag
229- if first_dot == - 1 :
230- pos = desc [last_close :].find ('.' )
231- if pos != - 1 :
232- first_dot = last_close + pos
233-
234- # limit desc to the adjusted limit
235- # additionally strip unclosed tags (last_open < curr_limit)
236- if debug :
237- print ("open_count: " + str (open_count ))
238- print ("last_open: " + str (last_open ))
239- print ("first_dot: " + str (first_dot ))
240- print ("len: " + str (len (desc )))
241-
242- limited = False
243- if len (desc ) > curr_limit :
244- limited = True
245- if open_count == 0 :
246- desc = desc [:curr_limit ]
247- else :
248- desc = desc [:last_open ]
249-
250- if debug :
251- print ("limited: " + str (limited ))
252- print ("open_count: " + str (open_count ))
253- print ("last_open: " + str (last_open ))
254- print ("first_dot: " + str (first_dot ))
255- print ("LIMIT: " + desc )
256-
257- # limit desc to the first sentence. If first sentence is longer than the
258- # limit, then try to cut desc at "i.e." if present. Otherwise, cut desc
259- # in the middle of the sentence, preferably at the end of a word
260- if limited and (first_dot == - 1 or first_dot > len (desc )):
261- # interrupted in the middle of a sentence. Polish the result
262-
263- #find the last match
264- m = None
265- for m in re .finditer ('[ᚃᚄ]' , desc ):
266- pass
267- if m and m .start () > 2 :
268- pos = m .start ()
269- char = m .group (0 )
270-
271- # string is too long but we can cut it at 'i.e.'
272- if desc [pos - 2 :pos + 1 ] == ', ' + char :
273- desc = desc [:pos - 2 ] + '.'
274- elif desc [pos - 2 :pos + 1 ] == ' ,' + char :
275- desc = desc [:pos - 2 ] + '.'
276- elif desc [pos - 1 :pos + 1 ] == ',' + char :
277- desc = desc [:pos - 1 ] + '.'
278- elif desc [pos - 1 :pos + 1 ] == ' ' + char :
279- desc = desc [:pos - 1 ] + '.'
280- else :
281- desc = desc [:pos ]
282- else :
283- # open_count != 0 means that we are not within a word already
284- if open_count == 0 :
285- m = None
286- for m in re .finditer ('[\s]+' , desc ):
287- pass
288- if m :
289- desc = desc [:m .start ()]
290-
291- desc = desc + '...'
316+ # trim the single sentence if needed
317+ if char_count > max_chars and len (sentences ) == 1 :
318+ sentences [0 ] = trim_single_sentence (sentences [0 ], max_chars )
292319 else :
293- desc = desc .rstrip ()
294- if first_dot == - 1 :
295- # fix the punctuation at the end
296- if desc [- 1 ] in [';' , ',' ]:
297- desc = desc [:- 1 ] + '.'
298- if desc [- 1 ] in [':' , '-' ]:
299- desc = desc + ' ...'
300- elif desc [- 1 ] != '.' :
301- desc = desc + '.'
302- else :
303- # cut the summary at the end of the first sentence
304- desc = desc [:first_dot ] + '.'
320+ if sentences [- 1 ].rstrip ()[- 1 ] != '.' :
321+ sentences [- 1 ] = remove_punctuation_at_end (sentences [- 1 ]) + '...'
305322
323+ desc = '\n ' .join (sentences )
306324 desc = desc .replace ('ᚃ' , 'i.e.' )
307325 desc = desc .replace ('ᚄ' , 'that is,' )
308326
309- if debug :
310- print ("FINAL: " + desc )
311327 return desc
312328
313329''' Returns a short description of a feature. This is the first sentence after
0 commit comments