|
7 | 7 | from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit, urlunsplit |
8 | 8 |
|
9 | 9 | from django.utils.encoding import punycode |
10 | | -from django.utils.functional import Promise, keep_lazy, keep_lazy_text |
| 10 | +from django.utils.functional import Promise, cached_property, keep_lazy, keep_lazy_text |
11 | 11 | from django.utils.http import RFC3986_GENDELIMS, RFC3986_SUBDELIMS |
12 | 12 | from django.utils.regex_helper import _lazy_re_compile |
13 | 13 | from django.utils.safestring import SafeData, SafeString, mark_safe |
@@ -225,6 +225,16 @@ def unquote_quote(segment): |
225 | 225 | return urlunsplit((scheme, netloc, path, query, fragment)) |
226 | 226 |
|
227 | 227 |
|
class CountsDict(dict):
    """
    Dict that lazily counts occurrences of its keys in a fixed string.

    Looking up a key that is not present stores ``word.count(key)`` under
    that key and returns it. Stored values are ordinary dict items: they can
    be adjusted by the caller afterwards and are not recomputed.
    """

    def __init__(self, *args, word, **kwargs):
        # Keyword arguments must be forwarded as keywords (``**kwargs``) so
        # they become initial items; unpacking them positionally would pass
        # only their names to dict().
        super().__init__(*args, **kwargs)
        # The string in which missing keys are counted.
        self.word = word

    def __missing__(self, key):
        # Called by dict.__getitem__ only when `key` is absent: compute the
        # count once, remember it, and return it.
        count = self.word.count(key)
        self[key] = count
        return count
| 236 | + |
| 237 | + |
228 | 238 | class Urlizer: |
229 | 239 | """ |
230 | 240 | Convert any URLs in text into clickable links. |
@@ -330,40 +340,72 @@ def trim_url(self, x, *, limit): |
330 | 340 | return x |
331 | 341 | return "%s…" % x[: max(0, limit - 1)] |
332 | 342 |
|
@cached_property
def wrapping_punctuation_openings(self):
    """
    Return every opening character from `wrapping_punctuation` joined into
    one string, suitable as the argument of `str.lstrip()`.
    """
    # Building a dict from the (opening, closing) pairs keeps one entry per
    # distinct opening; iterating the dict yields those openings in order.
    closing_by_opening = dict(self.wrapping_punctuation)
    return "".join(closing_by_opening)
| 346 | + |
@cached_property
def trailing_punctuation_chars_no_semicolon(self):
    """
    Return `trailing_punctuation_chars` with every ";" removed.
    """
    chars = self.trailing_punctuation_chars
    return "".join(ch for ch in chars if ch != ";")
| 350 | + |
@cached_property
def trailing_punctuation_chars_has_semicolon(self):
    """
    Return True if ";" is one of the `trailing_punctuation_chars`.
    """
    chars = self.trailing_punctuation_chars
    has_semicolon = ";" in chars
    return has_semicolon
| 354 | + |
def trim_punctuation(self, word):
    """
    Trim trailing and wrapping punctuation from `word`. Return the items of
    the new state.

    The result is a `(lead, middle, trail)` tuple of strings: `lead` holds
    the opening wrapping characters stripped from the start, `trail` holds
    the punctuation stripped from the end (in its original order), and
    `middle` is what remains.
    """
    # Strip all opening wrapping punctuation.
    middle = word.lstrip(self.wrapping_punctuation_openings)
    lead = word[: len(word) - len(middle)]
    trail = ""

    # Continue trimming until middle remains unchanged.
    trimmed_something = True
    # Occurrence counts are computed once per character on first lookup and
    # then adjusted in place, instead of re-counting `middle` on every pass.
    counts = CountsDict(word=middle)
    while trimmed_something and middle:
        trimmed_something = False
        # Trim wrapping punctuation.
        for opening, closing in self.wrapping_punctuation:
            # Only unbalanced closing characters are candidates.
            if counts[opening] < counts[closing]:
                rstripped = middle.rstrip(closing)
                if rstripped != middle:
                    # NOTE(review): the amount removed is the surplus of
                    # closings over openings, taken in a single slice; it
                    # is not capped to the trailing run of `closing` --
                    # confirm that is intended.
                    strip = counts[closing] - counts[opening]
                    # Prepend to the existing trail: punctuation trimmed
                    # on an earlier pass must not be discarded.
                    trail = middle[-strip:] + trail
                    middle = middle[:-strip]
                    trimmed_something = True
                    counts[closing] -= strip

        # Trim trailing punctuation other than ';', which needs the entity
        # check below.
        rstripped = middle.rstrip(self.trailing_punctuation_chars_no_semicolon)
        if rstripped != middle:
            trail = middle[len(rstripped) :] + trail
            middle = rstripped
            trimmed_something = True

        if self.trailing_punctuation_chars_has_semicolon and middle.endswith(";"):
            # Only strip if not part of an HTML entity.
            amp = middle.rfind("&")
            if amp == -1:
                can_strip = True
            else:
                # The text from the last '&' onwards may be an entity. The
                # ';' is strippable if unescaping changes nothing, or if
                # the unescaped text still ends with ';'.
                potential_entity = middle[amp:]
                escaped = html.unescape(potential_entity)
                can_strip = (escaped == potential_entity) or escaped.endswith(";")

            if can_strip:
                rstripped = middle.rstrip(";")
                amount_stripped = len(middle) - len(rstripped)
                if amp > -1 and amount_stripped > 1:
                    # Several semicolons after an '&': keep one on
                    # `middle`, as it might terminate an entity, and move
                    # the rest to the trail.
                    trail = middle[len(rstripped) + 1 :] + trail
                    middle = rstripped + ";"
                else:
                    trail = middle[len(rstripped) :] + trail
                    middle = rstripped
                trimmed_something = True

    return lead, middle, trail
368 | 410 |
|
369 | 411 | @staticmethod |
|
0 commit comments