From 6032c4f2352260d32032800a2ff694339e2af6b2 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Sat, 2 Jul 2016 01:51:47 -0700 Subject: message.content -> .raw_content, implement .text This PR improves our handling of HTTP message body encodings: - The unaltered message body is now accessible as `.raw_content` - The "content-encoding"-decoded content (i.e. gzip removed) is now `.content`, as this is what we want in 99% of the cases. - `.text` now provides the "content-encoding"-decoded and then "content-type charset"-decoded message body. - The decoded values for `.content` and `.text` are cached, so that repeated access and `x.text = x.text` are cheap. - The `decoded()` context manager is now deprecated, as we can now just use `.content`. Similarly `HTTPMessage.get_decoded_content()` is deprecated. --- netlib/encoding.py | 97 +++++++++++++-------- netlib/http/http1/assemble.py | 4 +- netlib/http/message.py | 192 +++++++++++++++++++++++++++++------------- netlib/http/request.py | 4 +- netlib/http/response.py | 5 +- 5 files changed, 202 insertions(+), 100 deletions(-) (limited to 'netlib') diff --git a/netlib/encoding.py b/netlib/encoding.py index 98502451..8b67b543 100644 --- a/netlib/encoding.py +++ b/netlib/encoding.py @@ -1,39 +1,62 @@ """ - Utility functions for decoding response bodies. +Utility functions for decoding response bodies. 
""" from __future__ import absolute_import + +import codecs from io import BytesIO import gzip import zlib +from typing import Union # noqa + -ENCODINGS = {"identity", "gzip", "deflate"} +def decode(obj, encoding, errors='strict'): + # type: (Union[str, bytes], str) -> Union[str, bytes] + """ + Decode the given input object + Returns: + The decoded value -def decode(e, content): - if not isinstance(content, bytes): - return None - encoding_map = { - "identity": identity, - "gzip": decode_gzip, - "deflate": decode_deflate, - } - if e not in encoding_map: - return None - return encoding_map[e](content) + Raises: + ValueError, if decoding fails. + """ + try: + try: + return custom_decode[encoding](obj) + except KeyError: + return codecs.decode(obj, encoding, errors) + except Exception as e: + raise ValueError("{} when decoding {} with {}".format( + type(e).__name__, + repr(obj)[:10], + repr(encoding), + )) + + +def encode(obj, encoding, errors='strict'): + # type: (Union[str, bytes], str) -> Union[str, bytes] + """ + Encode the given input object + Returns: + The encoded value -def encode(e, content): - if not isinstance(content, bytes): - return None - encoding_map = { - "identity": identity, - "gzip": encode_gzip, - "deflate": encode_deflate, - } - if e not in encoding_map: - return None - return encoding_map[e](content) + Raises: + ValueError, if encoding fails. 
+ """ + try: + try: + return custom_encode[encoding](obj) + except KeyError: + return codecs.encode(obj, encoding, errors) + except Exception as e: + raise ValueError("{} when encoding {} with {}".format( + type(e).__name__, + repr(obj)[:10], + repr(encoding), + )) def identity(content): @@ -46,10 +69,7 @@ def identity(content): def decode_gzip(content): gfile = gzip.GzipFile(fileobj=BytesIO(content)) - try: - return gfile.read() - except (IOError, EOFError): - return None + return gfile.read() def encode_gzip(content): @@ -70,12 +90,9 @@ def decode_deflate(content): http://bugs.python.org/issue5784 """ try: - try: - return zlib.decompress(content) - except zlib.error: - return zlib.decompress(content, -15) + return zlib.decompress(content) except zlib.error: - return None + return zlib.decompress(content, -15) def encode_deflate(content): @@ -84,4 +101,16 @@ def encode_deflate(content): """ return zlib.compress(content) -__all__ = ["ENCODINGS", "encode", "decode"] + +custom_decode = { + "identity": identity, + "gzip": decode_gzip, + "deflate": decode_deflate, +} +custom_encode = { + "identity": identity, + "gzip": encode_gzip, + "deflate": encode_deflate, +} + +__all__ = ["encode", "decode"] diff --git a/netlib/http/http1/assemble.py b/netlib/http/http1/assemble.py index 511328f1..e74732d2 100644 --- a/netlib/http/http1/assemble.py +++ b/netlib/http/http1/assemble.py @@ -5,7 +5,7 @@ from netlib import exceptions def assemble_request(request): - if request.content is None: + if request.data.content is None: raise exceptions.HttpException("Cannot assemble flow with missing content") head = assemble_request_head(request) body = b"".join(assemble_body(request.data.headers, [request.data.content])) @@ -19,7 +19,7 @@ def assemble_request_head(request): def assemble_response(response): - if response.content is None: + if response.data.content is None: raise exceptions.HttpException("Cannot assemble flow with missing content") head = assemble_response_head(response) body 
= b"".join(assemble_body(response.data.headers, [response.data.content])) diff --git a/netlib/http/message.py b/netlib/http/message.py index 0583c246..668198f8 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -52,7 +52,22 @@ class MessageData(basetypes.Serializable): return cls(**state) +class CachedDecode(object): + __slots__ = ["encoded", "encoding", "decoded"] + + def __init__(self, object, encoding, decoded): + self.encoded = object + self.encoding = encoding + self.decoded = decoded + +no_cached_decode = CachedDecode(None, None, None) + + class Message(basetypes.Serializable): + def __init__(self): + self._content_cache = no_cached_decode # type: CachedDecode + self._text_cache = no_cached_decode # type: CachedDecode + def __eq__(self, other): if isinstance(other, Message): return self.data == other.data @@ -90,19 +105,65 @@ class Message(basetypes.Serializable): self.data.headers = h @property - def content(self): + def raw_content(self): + # type: () -> bytes """ The raw (encoded) HTTP message body - See also: :py:attr:`text` + See also: :py:attr:`content`, :py:class:`text` """ return self.data.content - @content.setter - def content(self, content): + @raw_content.setter + def raw_content(self, content): self.data.content = content - if isinstance(content, bytes): - self.headers["content-length"] = str(len(content)) + + @property + def content(self): + # type: () -> bytes + """ + The HTTP message body decoded with the content-encoding header (e.g. 
gzip) + + See also: :py:class:`raw_content`, :py:attr:`text` + """ + ce = self.headers.get("content-encoding") + cached = ( + self._content_cache.encoded == self.raw_content and + self._content_cache.encoding == ce + ) + if not cached: + try: + if not ce: + raise ValueError() + decoded = encoding.decode(self.raw_content, ce) + except ValueError: + decoded = self.raw_content + self._content_cache = CachedDecode(self.raw_content, ce, decoded) + return self._content_cache.decoded + + @content.setter + def content(self, value): + ce = self.headers.get("content-encoding") + cached = ( + self._content_cache.decoded == value and + self._content_cache.encoding == ce + ) + if not cached: + try: + if not ce: + raise ValueError() + encoded = encoding.encode(value, ce) + except ValueError: + # Do we have an unknown content-encoding? + # If so, we want to remove it. + if value and ce: + self.headers.pop("content-encoding", None) + ce = None + encoded = value + self._content_cache = CachedDecode(encoded, ce, value) + self.raw_content = self._content_cache.encoded + if isinstance(self.raw_content, bytes): + self.headers["content-length"] = str(len(self.raw_content)) @property def http_version(self): @@ -137,56 +198,81 @@ class Message(basetypes.Serializable): def timestamp_end(self, timestamp_end): self.data.timestamp_end = timestamp_end + def _get_content_type_charset(self): + # type: () -> Optional[str] + ct = headers.parse_content_type(self.headers.get("content-type", "")) + if ct: + return ct[2].get("charset") + @property def text(self): + # type: () -> six.text_type """ - The decoded HTTP message body. - Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive. - - .. note:: - This is not implemented yet. + The HTTP message body decoded with both content-encoding header (e.g. gzip) + and content-type header charset. 
- See also: :py:attr:`content`, :py:class:`decoded` + See also: :py:attr:`content`, :py:class:`raw_content` """ # This attribute should be called text, because that's what requests does. - raise NotImplementedError() + enc = self._get_content_type_charset() + + # We may also want to check for HTML meta tags here at some point. + + cached = ( + self._text_cache.encoded == self.content and + self._text_cache.encoding == enc + ) + if not cached: + try: + if not enc: + raise ValueError() + decoded = encoding.decode(self.content, enc) + except ValueError: + decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(self.content, enc, decoded) + return self._text_cache.decoded @text.setter def text(self, text): - raise NotImplementedError() + enc = self._get_content_type_charset() + cached = ( + self._text_cache.decoded == text and + self._text_cache.encoding == enc + ) + if not cached: + try: + if not enc: + raise ValueError() + encoded = encoding.encode(text, enc) + except ValueError: + # Do we have an unknown content-type charset? + # If so, we want to replace it with utf8. + if text and enc: + self.headers["content-type"] = re.sub( + "charset=[^;]+", + "charset=utf-8", + self.headers["content-type"] + ) + encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(encoded, enc, text) + self.content = self._text_cache.encoded def decode(self): """ - Decodes body based on the current Content-Encoding header, then - removes the header. If there is no Content-Encoding header, no - action is taken. - - Returns: - True, if decoding succeeded. - False, otherwise. + Decodes body based on the current Content-Encoding header, then + removes the header. If there is no Content-Encoding header, no + action is taken. 
""" - ce = self.headers.get("content-encoding") - data = encoding.decode(ce, self.content) - if data is None: - return False - self.content = data + self.raw_content = self.content self.headers.pop("content-encoding", None) - return True def encode(self, e): """ - Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". - - Returns: - True, if decoding succeeded. - False, otherwise. + Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". """ - data = encoding.encode(e, self.content) - if data is None: - return False - self.content = data + self.decode() # remove the current encoding self.headers["content-encoding"] = e - return True + self.content = self.raw_content def replace(self, pattern, repl, flags=0): """ @@ -203,10 +289,9 @@ class Message(basetypes.Serializable): repl = strutils.escaped_str_to_bytes(repl) replacements = 0 if self.content: - with decoded(self): - self.content, replacements = re.subn( - pattern, repl, self.content, flags=flags - ) + self.content, replacements = re.subn( + pattern, repl, self.content, flags=flags + ) replacements += self.headers.replace(pattern, repl, flags) return replacements @@ -225,29 +310,16 @@ class Message(basetypes.Serializable): class decoded(object): """ - A context manager that decodes a request or response, and then - re-encodes it with the same encoding after execution of the block. - - Example: - - .. code-block:: python - - with decoded(request): - request.content = request.content.replace("foo", "bar") + Deprecated: You can now directly use :py:attr:`content`. + :py:attr:`raw_content` has the encoded content. """ def __init__(self, message): - self.message = message - ce = message.headers.get("content-encoding") - if ce in encoding.ENCODINGS: - self.ce = ce - else: - self.ce = None + warnings.warn("decoded() is deprecated, you can now directly use .content instead. 
" + ".raw_content has the encoded content.", DeprecationWarning) def __enter__(self): - if self.ce: - self.message.decode() + pass def __exit__(self, type, value, tb): - if self.ce: - self.message.encode(self.ce) + pass \ No newline at end of file diff --git a/netlib/http/request.py b/netlib/http/request.py index d9f4ed00..4ce94549 100644 --- a/netlib/http/request.py +++ b/netlib/http/request.py @@ -5,7 +5,6 @@ import re import six from six.moves import urllib -from netlib import encoding from netlib import multidict from netlib import strutils from netlib.http import multipart @@ -44,6 +43,7 @@ class Request(message.Message): An HTTP request. """ def __init__(self, *args, **kwargs): + super(Request, self).__init__() self.data = RequestData(*args, **kwargs) def __repr__(self): @@ -327,7 +327,7 @@ class Request(message.Message): self.headers["accept-encoding"] = ( ', '.join( e - for e in encoding.ENCODINGS + for e in {"gzip", "identity", "deflate"} if e in accept_encoding ) ) diff --git a/netlib/http/response.py b/netlib/http/response.py index 17d69418..d2273edd 100644 --- a/netlib/http/response.py +++ b/netlib/http/response.py @@ -30,13 +30,14 @@ class Response(message.Message): An HTTP response. 
""" def __init__(self, *args, **kwargs): + super(Response, self).__init__() self.data = ResponseData(*args, **kwargs) def __repr__(self): - if self.content: + if self.raw_content: details = "{}, {}".format( self.headers.get("content-type", "unknown content type"), - human.pretty_size(len(self.content)) + human.pretty_size(len(self.raw_content)) ) else: details = "no content" -- cgit v1.2.3 From d9f797e7e6936809171d9c99144fb5ded3ee131f Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Sat, 2 Jul 2016 02:11:00 -0700 Subject: make the linter happy --- netlib/http/message.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'netlib') diff --git a/netlib/http/message.py b/netlib/http/message.py index 668198f8..28278bd2 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -322,4 +322,4 @@ class decoded(object): pass def __exit__(self, type, value, tb): - pass \ No newline at end of file + pass -- cgit v1.2.3 From 2f8a1fd2cb1374941f436f36bbfa0d0b3d9213c7 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Sat, 2 Jul 2016 03:03:42 -0700 Subject: tests++ --- netlib/http/message.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'netlib') diff --git a/netlib/http/message.py b/netlib/http/message.py index 28278bd2..ca3a4145 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -314,12 +314,12 @@ class decoded(object): :py:attr:`raw_content` has the encoded content. """ - def __init__(self, message): + def __init__(self, message): # pragma no cover warnings.warn("decoded() is deprecated, you can now directly use .content instead. 
" ".raw_content has the encoded content.", DeprecationWarning) - def __enter__(self): + def __enter__(self): # pragma no cover pass - def __exit__(self, type, value, tb): + def __exit__(self, type, value, tb): # pragma no cover pass -- cgit v1.2.3 From a6b3551934e2b8768177d6831ca08f97f5bdae44 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Mon, 4 Jul 2016 13:58:09 -0700 Subject: raise ValueError if content-encoding is invalid --- netlib/http/message.py | 42 +++++++++++++++++++++++++++++------------- netlib/http/request.py | 12 +++++++++--- netlib/wsgi.py | 6 +++++- 3 files changed, 43 insertions(+), 17 deletions(-) (limited to 'netlib') diff --git a/netlib/http/message.py b/netlib/http/message.py index ca3a4145..86ff64d1 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -124,6 +124,9 @@ class Message(basetypes.Serializable): """ The HTTP message body decoded with the content-encoding header (e.g. gzip) + Raises: + ValueError, when getting the content and the content-encoding is invalid. + See also: :py:class:`raw_content`, :py:attr:`text` """ ce = self.headers.get("content-encoding") @@ -132,17 +135,21 @@ class Message(basetypes.Serializable): self._content_cache.encoding == ce ) if not cached: - try: - if not ce: - raise ValueError() + if ce: decoded = encoding.decode(self.raw_content, ce) - except ValueError: + else: decoded = self.raw_content self._content_cache = CachedDecode(self.raw_content, ce, decoded) return self._content_cache.decoded @content.setter def content(self, value): + if value is not None and not isinstance(value, bytes): + raise TypeError( + "Message content must be bytes, not {}. " + "Please use .text if you want to assign a str." 
+ .format(type(value).__name__) + ) ce = self.headers.get("content-encoding") cached = ( self._content_cache.decoded == value and @@ -150,15 +157,15 @@ class Message(basetypes.Serializable): ) if not cached: try: - if not ce: - raise ValueError() - encoded = encoding.encode(value, ce) + if ce and value is not None: + encoded = encoding.encode(value, ce) + else: + encoded = value except ValueError: - # Do we have an unknown content-encoding? - # If so, we want to remove it. - if value and ce: - self.headers.pop("content-encoding", None) - ce = None + # So we have an invalid content-encoding? + # Let's remove it! + del self.headers["content-encoding"] + ce = None encoded = value self._content_cache = CachedDecode(encoded, ce, value) self.raw_content = self._content_cache.encoded @@ -262,6 +269,9 @@ class Message(basetypes.Serializable): Decodes body based on the current Content-Encoding header, then removes the header. If there is no Content-Encoding header, no action is taken. + + Raises: + ValueError, when the content-encoding is invalid. """ self.raw_content = self.content self.headers.pop("content-encoding", None) @@ -269,10 +279,16 @@ class Message(basetypes.Serializable): def encode(self, e): """ Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". + Any existing content-encodings are overwritten, + the content is not decoded beforehand. + + Raises: + ValueError, when the specified content-encoding is invalid. 
""" - self.decode() # remove the current encoding self.headers["content-encoding"] = e self.content = self.raw_content + if "content-encoding" not in self.headers: + raise ValueError("Invalid content encoding {}".format(repr(e))) def replace(self, pattern, repl, flags=0): """ diff --git a/netlib/http/request.py b/netlib/http/request.py index 4ce94549..a8ec6238 100644 --- a/netlib/http/request.py +++ b/netlib/http/request.py @@ -347,7 +347,10 @@ class Request(message.Message): def _get_urlencoded_form(self): is_valid_content_type = "application/x-www-form-urlencoded" in self.headers.get("content-type", "").lower() if is_valid_content_type: - return tuple(netlib.http.url.decode(self.content)) + try: + return tuple(netlib.http.url.decode(self.content)) + except ValueError: + pass return () def _set_urlencoded_form(self, value): @@ -356,7 +359,7 @@ class Request(message.Message): This will overwrite the existing content if there is one. """ self.headers["content-type"] = "application/x-www-form-urlencoded" - self.content = netlib.http.url.encode(value) + self.content = netlib.http.url.encode(value).encode() @urlencoded_form.setter def urlencoded_form(self, value): @@ -376,7 +379,10 @@ class Request(message.Message): def _get_multipart_form(self): is_valid_content_type = "multipart/form-data" in self.headers.get("content-type", "").lower() if is_valid_content_type: - return multipart.decode(self.headers, self.content) + try: + return multipart.decode(self.headers, self.content) + except ValueError: + pass return () def _set_multipart_form(self, value): diff --git a/netlib/wsgi.py b/netlib/wsgi.py index c66fddc2..2444f449 100644 --- a/netlib/wsgi.py +++ b/netlib/wsgi.py @@ -60,10 +60,14 @@ class WSGIAdaptor(object): else: path_info = path query = '' + try: + content = flow.request.content + except ValueError: + content = flow.request.raw_content environ = { 'wsgi.version': (1, 0), 'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"), - 'wsgi.input': 
BytesIO(flow.request.content or b""), + 'wsgi.input': BytesIO(content or b""), 'wsgi.errors': errsoc, 'wsgi.multithread': True, 'wsgi.multiprocess': False, -- cgit v1.2.3 From a3c7c84d49c3e6563e7f37ef60c989f99ed96788 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Fri, 15 Jul 2016 22:50:33 -0700 Subject: improve message content semantics --- netlib/http/headers.py | 12 +++++ netlib/http/message.py | 133 ++++++++++++++++++++++++++++++------------------- netlib/wsgi.py | 10 ++-- 3 files changed, 98 insertions(+), 57 deletions(-) (limited to 'netlib') diff --git a/netlib/http/headers.py b/netlib/http/headers.py index f052a53b..13a8c98f 100644 --- a/netlib/http/headers.py +++ b/netlib/http/headers.py @@ -204,3 +204,15 @@ def parse_content_type(c): if len(clause) == 2: d[clause[0].strip()] = clause[1].strip() return ts[0].lower(), ts[1].lower(), d + + +def assemble_content_type(type, subtype, parameters): + if not parameters: + return "{}/{}".format(type, subtype) + params = "; ".join( + "{}={}".format(k, v) + for k, v in parameters.items() + ) + return "{}/{}; {}".format( + type, subtype, params + ) diff --git a/netlib/http/message.py b/netlib/http/message.py index 86ff64d1..1252ed25 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -53,14 +53,15 @@ class MessageData(basetypes.Serializable): class CachedDecode(object): - __slots__ = ["encoded", "encoding", "decoded"] + __slots__ = ["encoded", "encoding", "strict", "decoded"] - def __init__(self, object, encoding, decoded): + def __init__(self, object, encoding, strict, decoded): self.encoded = object self.encoding = encoding + self.strict = strict self.decoded = decoded -no_cached_decode = CachedDecode(None, None, None) +no_cached_decode = CachedDecode(None, None, None, None) class Message(basetypes.Serializable): @@ -118,33 +119,44 @@ class Message(basetypes.Serializable): def raw_content(self, content): self.data.content = content - @property - def content(self): - # type: () -> bytes + def 
get_content(self, strict=True): + # type: (bool) -> bytes """ The HTTP message body decoded with the content-encoding header (e.g. gzip) Raises: - ValueError, when getting the content and the content-encoding is invalid. + ValueError, when the content-encoding is invalid and strict is True. See also: :py:class:`raw_content`, :py:attr:`text` """ + if self.raw_content is None: + return None ce = self.headers.get("content-encoding") cached = ( self._content_cache.encoded == self.raw_content and + (self._content_cache.strict or not strict) and self._content_cache.encoding == ce ) if not cached: + is_strict = True if ce: - decoded = encoding.decode(self.raw_content, ce) + try: + decoded = encoding.decode(self.raw_content, ce) + except ValueError: + if strict: + raise + is_strict = False + decoded = self.raw_content else: decoded = self.raw_content - self._content_cache = CachedDecode(self.raw_content, ce, decoded) + self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded) return self._content_cache.decoded - @content.setter - def content(self, value): - if value is not None and not isinstance(value, bytes): + def set_content(self, value): + if value is None: + self.raw_content = None + return + if not isinstance(value, bytes): raise TypeError( "Message content must be bytes, not {}. " "Please use .text if you want to assign a str." @@ -153,24 +165,23 @@ class Message(basetypes.Serializable): ce = self.headers.get("content-encoding") cached = ( self._content_cache.decoded == value and - self._content_cache.encoding == ce + self._content_cache.encoding == ce and + self._content_cache.strict ) if not cached: try: - if ce and value is not None: - encoded = encoding.encode(value, ce) - else: - encoded = value + encoded = encoding.encode(value, ce or "identity") except ValueError: # So we have an invalid content-encoding? # Let's remove it! 
del self.headers["content-encoding"] ce = None encoded = value - self._content_cache = CachedDecode(encoded, ce, value) + self._content_cache = CachedDecode(encoded, ce, True, value) self.raw_content = self._content_cache.encoded - if isinstance(self.raw_content, bytes): - self.headers["content-length"] = str(len(self.raw_content)) + self.headers["content-length"] = str(len(self.raw_content)) + + content = property(get_content, set_content) @property def http_version(self): @@ -211,69 +222,87 @@ class Message(basetypes.Serializable): if ct: return ct[2].get("charset") - @property - def text(self): - # type: () -> six.text_type + def _guess_encoding(self): + # type: () -> str + enc = self._get_content_type_charset() + if enc: + return enc + + if "json" in self.headers.get("content-type", ""): + return "utf8" + else: + # We may also want to check for HTML meta tags here at some point. + return "latin-1" + + def get_text(self, strict=True): + # type: (bool) -> six.text_type """ The HTTP message body decoded with both content-encoding header (e.g. gzip) and content-type header charset. + Raises: + ValueError, when either content-encoding or charset is invalid and strict is True. + See also: :py:attr:`content`, :py:class:`raw_content` """ - # This attribute should be called text, because that's what requests does. - enc = self._get_content_type_charset() - - # We may also want to check for HTML meta tags here at some point. 
+ if self.raw_content is None: + return None + enc = self._guess_encoding() + content = self.get_content(strict) cached = ( - self._text_cache.encoded == self.content and + self._text_cache.encoded == content and + (self._text_cache.strict or not strict) and self._text_cache.encoding == enc ) if not cached: + is_strict = self._content_cache.strict try: - if not enc: - raise ValueError() - decoded = encoding.decode(self.content, enc) + decoded = encoding.decode(content, enc) except ValueError: - decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") - self._text_cache = CachedDecode(self.content, enc, decoded) + if strict: + raise + is_strict = False + decoded = self.content.decode(enc, "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(content, enc, is_strict, decoded) return self._text_cache.decoded - @text.setter - def text(self, text): - enc = self._get_content_type_charset() + def set_text(self, text): + if text is None: + self.content = None + return + enc = self._guess_encoding() + cached = ( self._text_cache.decoded == text and - self._text_cache.encoding == enc + self._text_cache.encoding == enc and + self._text_cache.strict ) if not cached: try: - if not enc: - raise ValueError() encoded = encoding.encode(text, enc) except ValueError: - # Do we have an unknown content-type charset? - # If so, we want to replace it with utf8. - if text and enc: - self.headers["content-type"] = re.sub( - "charset=[^;]+", - "charset=utf-8", - self.headers["content-type"] - ) - encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape") - self._text_cache = CachedDecode(encoded, enc, text) + # Fall back to UTF-8 and update the content-type header. 
+ ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {}) + ct[2]["charset"] = "utf-8" + self.headers["content-type"] = headers.assemble_content_type(*ct) + enc = "utf8" + encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(encoded, enc, True, text) self.content = self._text_cache.encoded - def decode(self): + text = property(get_text, set_text) + + def decode(self, strict=True): """ Decodes body based on the current Content-Encoding header, then removes the header. If there is no Content-Encoding header, no action is taken. Raises: - ValueError, when the content-encoding is invalid. + ValueError, when the content-encoding is invalid and strict is True. """ - self.raw_content = self.content + self.raw_content = self.get_content(strict) self.headers.pop("content-encoding", None) def encode(self, e): diff --git a/netlib/wsgi.py b/netlib/wsgi.py index 2444f449..0def75b5 100644 --- a/netlib/wsgi.py +++ b/netlib/wsgi.py @@ -54,20 +54,20 @@ class WSGIAdaptor(object): self.app, self.domain, self.port, self.sversion = app, domain, port, sversion def make_environ(self, flow, errsoc, **extra): + """ + Raises: + ValueError, if the content-encoding is invalid. + """ path = strutils.native(flow.request.path, "latin-1") if '?' 
in path: path_info, query = strutils.native(path, "latin-1").split('?', 1) else: path_info = path query = '' - try: - content = flow.request.content - except ValueError: - content = flow.request.raw_content environ = { 'wsgi.version': (1, 0), 'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"), - 'wsgi.input': BytesIO(content or b""), + 'wsgi.input': BytesIO(flow.request.content or b""), 'wsgi.errors': errsoc, 'wsgi.multithread': True, 'wsgi.multiprocess': False, -- cgit v1.2.3 From e6e39ce80f4daaf6a1d6f8d87616409486d358a5 Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Fri, 15 Jul 2016 23:46:12 -0700 Subject: preserve content-type parameter order --- netlib/http/headers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'netlib') diff --git a/netlib/http/headers.py b/netlib/http/headers.py index b8aa212a..9fa7e1e6 100644 --- a/netlib/http/headers.py +++ b/netlib/http/headers.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division import re +import collections import six from netlib import multidict from netlib import strutils @@ -206,7 +207,7 @@ def parse_content_type(c): ts = parts[0].split("/", 1) if len(ts) != 2: return None - d = {} + d = collections.OrderedDict() if len(parts) == 2: for i in parts[1].split(";"): clause = i.split("=", 1) -- cgit v1.2.3