This repository has been archived by the owner on Jan 7, 2024. It is now read-only.
forked from g2p/rfc6266
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rfc6266.py
139 lines (101 loc) · 4.19 KB
/
rfc6266.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""Implements RFC 6266, the Content-Disposition HTTP header.

build_header handles the sender side; it is the only public name this
file defines and exports (see __all__).

The receiver side is not defined in this file: parse_headers, with its
shortcuts for some http libraries (parse_httplib2_response and
parse_requests_response), which returns a ContentDisposition object with
attributes like is_inline, filename_unsafe, filename_sanitized.
"""
import logging
from string import ascii_letters, digits
from urllib.parse import quote
# Module-level logger. A library should not configure logging on behalf of
# the application, so only a no-op NullHandler is attached here.
# logging.NullHandler is always available on Python 3, which this module
# already requires (it imports urllib.parse), so no AttributeError fallback
# is needed; a silent "except: pass" would only hide genuine errors.
LOGGER = logging.getLogger("rfc6266")
LOGGER.addHandler(logging.NullHandler())
# Public API: build_header is the only name exported by a star-import.
__all__ = ("build_header",)
def percent_encode(string, safe, encoding):
    """Percent-encode *string* using urllib's ``quote``.

    Characters listed in *safe* are left unescaped, in addition to the
    ones ``quote`` never escapes. *string* is encoded with *encoding*
    before escaping; error handling is strict, so characters that do not
    fit the codec raise ``UnicodeEncodeError``.
    """
    return quote(string, safe=safe, encoding=encoding, errors="strict")
# Character classes from RFC 2616 (section 2.2).
# Separators: punctuation plus space and horizontal tab.
separator_chars = '()<>@,;:\\"/[]?={} \t'
# Control characters: code points 0-31 and DEL (127).
ctl_chars = "".join(map(chr, range(32))) + "\x7f"
# Everything that may not appear in a token.
nontoken_chars = "".join((separator_chars, ctl_chars))

# Character classes from RFC 5987.
# attr-char is ALPHA / DIGIT / one of these punctuation characters.
attr_chars_nonalnum = "!#$&+-.^_`|~"
attr_chars = "".join((ascii_letters, digits, attr_chars_nonalnum))
# RFC 5987 gives this alternative construction of the token character
# class: attr-char plus the three characters it excludes.
token_chars = "".join((attr_chars, "*'%"))
def is_token_char(ch):
    """Return True if the single character *ch* may appear in a token.

    A token character is ASCII, is not a control character (code points
    0-31 and 127), and is not one of the separator characters.
    """
    code = ord(ch)
    # Rejects control characters (0-31, 127) and everything non-ASCII.
    if code <= 31 or code >= 127:
        return False
    return ch not in separator_chars
def usesonlycharsfrom(candidate, chars):
    """Return True if every character of *candidate* occurs in *chars*.

    Stripping all trailing characters that belong to *chars* leaves an
    empty string exactly when no other character is present (a shortcut
    found in urllib.quote).
    """
    leftover = candidate.rstrip(chars)
    return leftover == ""
def is_token(candidate):
    """Return True if *candidate* is a valid RFC 2616 token.

    A token is a non-empty sequence of token characters, so the empty
    string is rejected and every character must satisfy is_token_char.
    """
    # all() is vacuously true for an empty string, which is not a token
    # (the grammar requires at least one character); check that first.
    return bool(candidate) and all(is_token_char(ch) for ch in candidate)
def is_ascii(text):
    """Return True if every character of *text* has a code point below 128.

    The empty string counts as ASCII.
    """
    for ch in text:
        if ord(ch) > 127:
            return False
    return True
def fits_inside_codec(text, codec):
    """Return True if *text* can be encoded with *codec* without error.

    Only UnicodeEncodeError is treated as "does not fit"; any other
    exception raised by the encoding attempt propagates to the caller.
    """
    try:
        text.encode(codec)
        return True
    except UnicodeEncodeError:
        return False
def is_lws_safe(text):
    """Return True if whitespace normalization leaves *text* unchanged.

    That is the case when *text* equals the result of normalize_ws(text).
    """
    normalized = normalize_ws(text)
    return text == normalized
def normalize_ws(text):
    """Collapse whitespace in *text*.

    Leading and trailing whitespace is dropped and every internal run of
    whitespace is replaced by a single space.
    """
    words = text.split()
    return " ".join(words)
def qd_quote(text):
    """Backslash-escape *text* for use inside a quoted-string.

    Every backslash and every double quote is prefixed with a backslash;
    all other characters are copied unchanged. The surrounding quotes
    are not added.
    """
    # One pass over the text: each special character maps to its escape.
    escapes = {ord("\\"): "\\\\", ord('"'): '\\"'}
    return text.translate(escapes)
class _HeaderValueError(ValueError, AssertionError):
    """A value passed to build_header cannot be emitted safely.

    It is also an AssertionError because build_header used to validate
    its arguments with ``assert``; callers catching that type keep working.
    """


def build_header(filename, disposition="attachment", filename_compat=None):
    """Generate a Content-Disposition header value for a given filename.

    Args:
        filename: The file name to advertise, as a character string.
        disposition: The disposition type. It must be a token.
        filename_compat: Optional fallback name for legacy clients that
            don't understand the filename* parameter. It is only used
            when filename is not ASCII or is not whitespace-normalized.
            It should either be ascii-only (recommended) or iso-8859-1
            only, and must be whitespace-normalized unless it is a token.

    Options for generating filename_compat (only useful for legacy clients):
    - ignore (will only send filename*);
    - strip accents using unicode's decomposing normalisations,
      which can be done from unicode data (stdlib), and keep only ascii;
    - use the ascii transliteration tables from Unidecode (PyPI);
    - use iso-8859-1.
    Ignore is the safest, and can be used to trigger a fallback
    to the document location (which can be percent-encoded utf-8
    if you control the URLs).

    See https://tools.ietf.org/html/rfc6266#appendix-D

    Returns:
        The header value. NOTE: it is a str when filename can be sent as a
        bare token or as a quoted-string that needs no escaping; in every
        other case the value also carries filename* and is returned as
        bytes encoded with iso-8859-1.

    Raises:
        ValueError: (also an AssertionError) if disposition is not a
            token, or if filename_compat is neither a token nor
            whitespace-normalized.
        UnicodeEncodeError: if filename cannot be encoded as utf-8, or the
            finished header cannot be encoded as iso-8859-1.
    """
    # While this method exists, it could also sanitize the filename
    # by rejecting slashes or other weirdness that might upset a receiver.

    # Validate with an explicit raise rather than assert: assert statements
    # are stripped under "python -O", which would let arbitrary text
    # (including separators and line breaks) into the header.
    if not is_token(disposition):
        raise _HeaderValueError(
            "disposition must be a token, got %r" % (disposition,)
        )

    rv = disposition

    if is_token(filename):
        # A bare token needs neither quoting nor the filename* parameter.
        rv += "; filename=%s" % (filename,)
        return rv
    elif is_ascii(filename) and is_lws_safe(filename):
        qd_filename = qd_quote(filename)
        rv += '; filename="%s"' % (qd_filename,)
        if qd_filename == filename:
            # Nothing had to be escaped, so the quoted form is enough.
            return rv
        # RFC 6266 claims some implementations are iffy on qdtext's
        # backslash-escaping, so filename* is included below in that case.
    elif filename_compat:
        if is_token(filename_compat):
            rv += "; filename=%s" % (filename_compat,)
        else:
            if not is_lws_safe(filename_compat):
                raise _HeaderValueError(
                    "filename_compat must be whitespace-normalized, got %r"
                    % (filename_compat,)
                )
            rv += '; filename="%s"' % (qd_quote(filename_compat),)

    # alnum are already considered always-safe, but the rest isn't.
    # Python encodes ~ when it shouldn't, for example.
    rv += "; filename*=utf-8''%s" % (
        percent_encode(filename, safe=attr_chars_nonalnum, encoding="utf-8"),
    )

    # This will only encode filename_compat, if it used non-ascii iso-8859-1.
    return rv.encode("iso-8859-1")