diff options
Diffstat (limited to 'venv/lib/python3.9/site-packages/mdurl/_decode.py')
-rw-r--r-- | venv/lib/python3.9/site-packages/mdurl/_decode.py | 104 |
1 files changed, 104 insertions, 0 deletions
diff --git a/venv/lib/python3.9/site-packages/mdurl/_decode.py b/venv/lib/python3.9/site-packages/mdurl/_decode.py new file mode 100644 index 00000000..9b50a2dd --- /dev/null +++ b/venv/lib/python3.9/site-packages/mdurl/_decode.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from collections.abc import Sequence +import functools +import re + +DECODE_DEFAULT_CHARS = ";/?:@&=+$,#" +DECODE_COMPONENT_CHARS = "" + +decode_cache: dict[str, list[str]] = {} + + +def get_decode_cache(exclude: str) -> Sequence[str]: + if exclude in decode_cache: + return decode_cache[exclude] + + cache: list[str] = [] + decode_cache[exclude] = cache + + for i in range(128): + ch = chr(i) + cache.append(ch) + + for i in range(len(exclude)): + ch_code = ord(exclude[i]) + cache[ch_code] = "%" + ("0" + hex(ch_code)[2:].upper())[-2:] + + return cache + + +# Decode percent-encoded string. +# +def decode(string: str, exclude: str = DECODE_DEFAULT_CHARS) -> str: + cache = get_decode_cache(exclude) + repl_func = functools.partial(repl_func_with_cache, cache=cache) + return re.sub(r"(%[a-f0-9]{2})+", repl_func, string, flags=re.IGNORECASE) + + +def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str: + seq = match.group() + result = "" + + i = 0 + l = len(seq) # noqa: E741 + while i < l: + b1 = int(seq[i + 1 : i + 3], 16) + + if b1 < 0x80: + result += cache[b1] + i += 3 # emulate JS for loop statement3 + continue + + if (b1 & 0xE0) == 0xC0 and (i + 3 < l): + # 110xxxxx 10xxxxxx + b2 = int(seq[i + 4 : i + 6], 16) + + if (b2 & 0xC0) == 0x80: + all_bytes = bytes((b1, b2)) + try: + result += all_bytes.decode() + except UnicodeDecodeError: + result += "\ufffd" * 2 + + i += 3 + i += 3 # emulate JS for loop statement3 + continue + + if (b1 & 0xF0) == 0xE0 and (i + 6 < l): + # 1110xxxx 10xxxxxx 10xxxxxx + b2 = int(seq[i + 4 : i + 6], 16) + b3 = int(seq[i + 7 : i + 9], 16) + + if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80: + all_bytes = bytes((b1, b2, b3)) + try: + result += all_bytes.decode() + except UnicodeDecodeError: + result += "\ufffd" * 3 + + i += 6 + i += 3 # emulate JS for loop statement3 + continue + + if (b1 & 0xF8) == 0xF0 and (i + 9 < l): + # 111110xx 10xxxxxx 10xxxxxx 10xxxxxx + b2 = int(seq[i + 4 : i + 6], 16) + b3 = int(seq[i + 7 : i + 9], 16) + b4 = int(seq[i + 10 : i + 12], 16) + + if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80 and (b4 & 0xC0) == 0x80: + all_bytes = bytes((b1, b2, b3, b4)) + try: + result += all_bytes.decode() + except UnicodeDecodeError: + result += "\ufffd" * 4 + + i += 9 + i += 3 # emulate JS for loop statement3 + continue + + result += "\ufffd" + i += 3 # emulate JS for loop statement3 + + return result |