From 6032c4f2352260d32032800a2ff694339e2af6b2 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Sat, 2 Jul 2016 01:51:47 -0700
Subject: message.content -> .raw_content, implement .text

This PR improves our handling of HTTP message body encodings:

- The unaltered message body is now accessible as `.raw_content`
- The "content-encoding"-decoded content (i.e. gzip removed)
  is now `.content`, as this is what we want in 99% of the cases.
- `.text` now provides the "content-encoding"-decoded and then
  "content-type charset"-decoded message body.
- The decoded values for `.content` and `.text` are cached,
  so that repeated access and `x.text = x.text` are cheap.
- The `decoded()` context manager is now deprecated, as we can now just
  use `.content`. Similarly, `HTTPMessage.get_decoded_content()` is
  deprecated.
---
 netlib/encoding.py            |  97 +++++++++++++--------
 netlib/http/http1/assemble.py |   4 +-
 netlib/http/message.py        | 192 +++++++++++++++++++++++++++++-------------
 netlib/http/request.py        |   4 +-
 netlib/http/response.py       |   5 +-
 5 files changed, 202 insertions(+), 100 deletions(-)

(limited to 'netlib')

diff --git a/netlib/encoding.py b/netlib/encoding.py
index 98502451..8b67b543 100644
--- a/netlib/encoding.py
+++ b/netlib/encoding.py
@@ -1,39 +1,62 @@
 """
-    Utility functions for decoding response bodies.
+Utility functions for decoding response bodies.
 """
 from __future__ import absolute_import
+
+import codecs
 from io import BytesIO
 import gzip
 import zlib
 
+from typing import Union  # noqa
+
 
-ENCODINGS = {"identity", "gzip", "deflate"}
+def decode(obj, encoding, errors='strict'):
+    # type: (Union[str, bytes], str) -> Union[str, bytes]
+    """
+    Decode the given input object
 
+    Returns:
+        The decoded value
 
-def decode(e, content):
-    if not isinstance(content, bytes):
-        return None
-    encoding_map = {
-        "identity": identity,
-        "gzip": decode_gzip,
-        "deflate": decode_deflate,
-    }
-    if e not in encoding_map:
-        return None
-    return encoding_map[e](content)
+    Raises:
+        ValueError, if decoding fails.
+    """
+    try:
+        try:
+            return custom_decode[encoding](obj)
+        except KeyError:
+            return codecs.decode(obj, encoding, errors)
+    except Exception as e:
+        raise ValueError("{} when decoding {} with {}".format(
+            type(e).__name__,
+            repr(obj)[:10],
+            repr(encoding),
+        ))
+
+
+def encode(obj, encoding, errors='strict'):
+    # type: (Union[str, bytes], str) -> Union[str, bytes]
+    """
+    Encode the given input object
 
+    Returns:
+        The encoded value
 
-def encode(e, content):
-    if not isinstance(content, bytes):
-        return None
-    encoding_map = {
-        "identity": identity,
-        "gzip": encode_gzip,
-        "deflate": encode_deflate,
-    }
-    if e not in encoding_map:
-        return None
-    return encoding_map[e](content)
+    Raises:
+        ValueError, if encoding fails.
+    """
+    try:
+        try:
+            return custom_encode[encoding](obj)
+        except KeyError:
+            return codecs.encode(obj, encoding, errors)
+    except Exception as e:
+        raise ValueError("{} when encoding {} with {}".format(
+            type(e).__name__,
+            repr(obj)[:10],
+            repr(encoding),
+        ))
 
 
 def identity(content):
@@ -46,10 +69,7 @@ def identity(content):
 
 def decode_gzip(content):
     gfile = gzip.GzipFile(fileobj=BytesIO(content))
-    try:
-        return gfile.read()
-    except (IOError, EOFError):
-        return None
+    return gfile.read()
 
 
 def encode_gzip(content):
@@ -70,12 +90,9 @@ def decode_deflate(content):
         http://bugs.python.org/issue5784
     """
     try:
-        try:
-            return zlib.decompress(content)
-        except zlib.error:
-            return zlib.decompress(content, -15)
+        return zlib.decompress(content)
     except zlib.error:
-        return None
+        return zlib.decompress(content, -15)
 
 
 def encode_deflate(content):
@@ -84,4 +101,16 @@ def encode_deflate(content):
     """
     return zlib.compress(content)
 
-__all__ = ["ENCODINGS", "encode", "decode"]
+
+custom_decode = {
+    "identity": identity,
+    "gzip": decode_gzip,
+    "deflate": decode_deflate,
+}
+custom_encode = {
+    "identity": identity,
+    "gzip": encode_gzip,
+    "deflate": encode_deflate,
+}
+
+__all__ = ["encode", "decode"]
diff --git a/netlib/http/http1/assemble.py b/netlib/http/http1/assemble.py
index 511328f1..e74732d2 100644
--- a/netlib/http/http1/assemble.py
+++ b/netlib/http/http1/assemble.py
@@ -5,7 +5,7 @@ from netlib import exceptions
 
 
 def assemble_request(request):
-    if request.content is None:
+    if request.data.content is None:
         raise exceptions.HttpException("Cannot assemble flow with missing content")
     head = assemble_request_head(request)
     body = b"".join(assemble_body(request.data.headers, [request.data.content]))
@@ -19,7 +19,7 @@ def assemble_request_head(request):
 
 
 def assemble_response(response):
-    if response.content is None:
+    if response.data.content is None:
         raise exceptions.HttpException("Cannot assemble flow with missing content")
     head = assemble_response_head(response)
     body = b"".join(assemble_body(response.data.headers, [response.data.content]))
diff --git a/netlib/http/message.py b/netlib/http/message.py
index 0583c246..668198f8 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -52,7 +52,22 @@ class MessageData(basetypes.Serializable):
         return cls(**state)
 
 
+class CachedDecode(object):
+    __slots__ = ["encoded", "encoding", "decoded"]
+
+    def __init__(self, object, encoding, decoded):
+        self.encoded = object
+        self.encoding = encoding
+        self.decoded = decoded
+
+no_cached_decode = CachedDecode(None, None, None)
+
+
 class Message(basetypes.Serializable):
+    def __init__(self):
+        self._content_cache = no_cached_decode  # type: CachedDecode
+        self._text_cache = no_cached_decode  # type: CachedDecode
+
     def __eq__(self, other):
         if isinstance(other, Message):
             return self.data == other.data
@@ -90,19 +105,65 @@ class Message(basetypes.Serializable):
         self.data.headers = h
 
     @property
-    def content(self):
+    def raw_content(self):
+        # type: () -> bytes
         """
         The raw (encoded) HTTP message body
 
-        See also: :py:attr:`text`
+        See also: :py:attr:`content`, :py:class:`text`
         """
         return self.data.content
 
-    @content.setter
-    def content(self, content):
+    @raw_content.setter
+    def raw_content(self, content):
         self.data.content = content
-        if isinstance(content, bytes):
-            self.headers["content-length"] = str(len(content))
+
+    @property
+    def content(self):
+        # type: () -> bytes
+        """
+        The HTTP message body decoded with the content-encoding header (e.g. gzip)
+
+        See also: :py:class:`raw_content`, :py:attr:`text`
+        """
+        ce = self.headers.get("content-encoding")
+        cached = (
+            self._content_cache.encoded == self.raw_content and
+            self._content_cache.encoding == ce
+        )
+        if not cached:
+            try:
+                if not ce:
+                    raise ValueError()
+                decoded = encoding.decode(self.raw_content, ce)
+            except ValueError:
+                decoded = self.raw_content
+            self._content_cache = CachedDecode(self.raw_content, ce, decoded)
+        return self._content_cache.decoded
+
+    @content.setter
+    def content(self, value):
+        ce = self.headers.get("content-encoding")
+        cached = (
+            self._content_cache.decoded == value and
+            self._content_cache.encoding == ce
+        )
+        if not cached:
+            try:
+                if not ce:
+                    raise ValueError()
+                encoded = encoding.encode(value, ce)
+            except ValueError:
+                # Do we have an unknown content-encoding?
+                # If so, we want to remove it.
+                if value and ce:
+                    self.headers.pop("content-encoding", None)
+                    ce = None
+                encoded = value
+            self._content_cache = CachedDecode(encoded, ce, value)
+        self.raw_content = self._content_cache.encoded
+        if isinstance(self.raw_content, bytes):
+            self.headers["content-length"] = str(len(self.raw_content))
 
     @property
     def http_version(self):
@@ -137,56 +198,81 @@ class Message(basetypes.Serializable):
     def timestamp_end(self, timestamp_end):
         self.data.timestamp_end = timestamp_end
 
+    def _get_content_type_charset(self):
+        # type: () -> Optional[str]
+        ct = headers.parse_content_type(self.headers.get("content-type", ""))
+        if ct:
+            return ct[2].get("charset")
+
     @property
     def text(self):
+        # type: () -> six.text_type
         """
-        The decoded HTTP message body.
-        Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive.
-
-        .. note::
-            This is not implemented yet.
+        The HTTP message body decoded with both content-encoding header (e.g. gzip)
+        and content-type header charset.
 
-        See also: :py:attr:`content`, :py:class:`decoded`
+        See also: :py:attr:`content`, :py:class:`raw_content`
         """
         # This attribute should be called text, because that's what requests does.
-        raise NotImplementedError()
+        enc = self._get_content_type_charset()
+
+        # We may also want to check for HTML meta tags here at some point.
+
+        cached = (
+            self._text_cache.encoded == self.content and
+            self._text_cache.encoding == enc
+        )
+        if not cached:
+            try:
+                if not enc:
+                    raise ValueError()
+                decoded = encoding.decode(self.content, enc)
+            except ValueError:
+                decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(self.content, enc, decoded)
+        return self._text_cache.decoded
 
     @text.setter
     def text(self, text):
-        raise NotImplementedError()
+        enc = self._get_content_type_charset()
+        cached = (
+            self._text_cache.decoded == text and
+            self._text_cache.encoding == enc
+        )
+        if not cached:
+            try:
+                if not enc:
+                    raise ValueError()
+                encoded = encoding.encode(text, enc)
+            except ValueError:
+                # Do we have an unknown content-type charset?
+                # If so, we want to replace it with utf8.
+                if text and enc:
+                    self.headers["content-type"] = re.sub(
+                        "charset=[^;]+",
+                        "charset=utf-8",
+                        self.headers["content-type"]
+                    )
+                encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(encoded, enc, text)
+        self.content = self._text_cache.encoded
 
     def decode(self):
         """
-            Decodes body based on the current Content-Encoding header, then
-            removes the header. If there is no Content-Encoding header, no
-            action is taken.
-
-            Returns:
-                True, if decoding succeeded.
-                False, otherwise.
+        Decodes body based on the current Content-Encoding header, then
+        removes the header. If there is no Content-Encoding header, no
+        action is taken.
         """
-        ce = self.headers.get("content-encoding")
-        data = encoding.decode(ce, self.content)
-        if data is None:
-            return False
-        self.content = data
+        self.raw_content = self.content
         self.headers.pop("content-encoding", None)
-        return True
 
     def encode(self, e):
         """
-            Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
-
-            Returns:
-                True, if decoding succeeded.
-                False, otherwise.
+        Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
         """
-        data = encoding.encode(e, self.content)
-        if data is None:
-            return False
-        self.content = data
+        self.decode()  # remove the current encoding
         self.headers["content-encoding"] = e
-        return True
+        self.content = self.raw_content
 
     def replace(self, pattern, repl, flags=0):
         """
@@ -203,10 +289,9 @@ class Message(basetypes.Serializable):
             repl = strutils.escaped_str_to_bytes(repl)
         replacements = 0
         if self.content:
-            with decoded(self):
-                self.content, replacements = re.subn(
-                    pattern, repl, self.content, flags=flags
-                )
+            self.content, replacements = re.subn(
+                pattern, repl, self.content, flags=flags
+            )
         replacements += self.headers.replace(pattern, repl, flags)
         return replacements
 
@@ -225,29 +310,16 @@ class Message(basetypes.Serializable):
 
 class decoded(object):
     """
-    A context manager that decodes a request or response, and then
-    re-encodes it with the same encoding after execution of the block.
-
-    Example:
-
-    .. code-block:: python
-
-        with decoded(request):
-            request.content = request.content.replace("foo", "bar")
+    Deprecated: You can now directly use :py:attr:`content`.
+    :py:attr:`raw_content` has the encoded content.
     """
 
     def __init__(self, message):
-        self.message = message
-        ce = message.headers.get("content-encoding")
-        if ce in encoding.ENCODINGS:
-            self.ce = ce
-        else:
-            self.ce = None
+        warnings.warn("decoded() is deprecated, you can now directly use .content instead. "
+                      ".raw_content has the encoded content.", DeprecationWarning)
 
     def __enter__(self):
-        if self.ce:
-            self.message.decode()
+        pass
 
     def __exit__(self, type, value, tb):
-        if self.ce:
-            self.message.encode(self.ce)
+        pass
\ No newline at end of file
diff --git a/netlib/http/request.py b/netlib/http/request.py
index d9f4ed00..4ce94549 100644
--- a/netlib/http/request.py
+++ b/netlib/http/request.py
@@ -5,7 +5,6 @@ import re
 import six
 from six.moves import urllib
 
-from netlib import encoding
 from netlib import multidict
 from netlib import strutils
 from netlib.http import multipart
@@ -44,6 +43,7 @@ class Request(message.Message):
     An HTTP request.
     """
     def __init__(self, *args, **kwargs):
+        super(Request, self).__init__()
         self.data = RequestData(*args, **kwargs)
 
     def __repr__(self):
@@ -327,7 +327,7 @@ class Request(message.Message):
             self.headers["accept-encoding"] = (
                 ', '.join(
                     e
-                    for e in encoding.ENCODINGS
+                    for e in {"gzip", "identity", "deflate"}
                     if e in accept_encoding
                 )
             )
diff --git a/netlib/http/response.py b/netlib/http/response.py
index 17d69418..d2273edd 100644
--- a/netlib/http/response.py
+++ b/netlib/http/response.py
@@ -30,13 +30,14 @@ class Response(message.Message):
     An HTTP response.
     """
     def __init__(self, *args, **kwargs):
+        super(Response, self).__init__()
         self.data = ResponseData(*args, **kwargs)
 
     def __repr__(self):
-        if self.content:
+        if self.raw_content:
             details = "{}, {}".format(
                 self.headers.get("content-type", "unknown content type"),
-                human.pretty_size(len(self.content))
+                human.pretty_size(len(self.raw_content))
             )
         else:
             details = "no content"
-- 
cgit v1.2.3


From d9f797e7e6936809171d9c99144fb5ded3ee131f Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Sat, 2 Jul 2016 02:11:00 -0700
Subject: make the linter happy

---
 netlib/http/message.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'netlib')

diff --git a/netlib/http/message.py b/netlib/http/message.py
index 668198f8..28278bd2 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -322,4 +322,4 @@ class decoded(object):
         pass
 
     def __exit__(self, type, value, tb):
-        pass
\ No newline at end of file
+        pass
-- 
cgit v1.2.3


From 2f8a1fd2cb1374941f436f36bbfa0d0b3d9213c7 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Sat, 2 Jul 2016 03:03:42 -0700
Subject: tests++

---
 netlib/http/message.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'netlib')

diff --git a/netlib/http/message.py b/netlib/http/message.py
index 28278bd2..ca3a4145 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -314,12 +314,12 @@ class decoded(object):
     :py:attr:`raw_content` has the encoded content.
     """
 
-    def __init__(self, message):
+    def __init__(self, message):  # pragma no cover
         warnings.warn("decoded() is deprecated, you can now directly use .content instead. "
                       ".raw_content has the encoded content.", DeprecationWarning)
 
-    def __enter__(self):
+    def __enter__(self):  # pragma no cover
         pass
 
-    def __exit__(self, type, value, tb):
+    def __exit__(self, type, value, tb):  # pragma no cover
         pass
-- 
cgit v1.2.3


From a6b3551934e2b8768177d6831ca08f97f5bdae44 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Mon, 4 Jul 2016 13:58:09 -0700
Subject: raise ValueError if content-encoding is invalid

---
 netlib/http/message.py | 42 +++++++++++++++++++++++++++++-------------
 netlib/http/request.py | 12 +++++++++---
 netlib/wsgi.py         |  6 +++++-
 3 files changed, 43 insertions(+), 17 deletions(-)

(limited to 'netlib')

diff --git a/netlib/http/message.py b/netlib/http/message.py
index ca3a4145..86ff64d1 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -124,6 +124,9 @@ class Message(basetypes.Serializable):
         """
         The HTTP message body decoded with the content-encoding header (e.g. gzip)
 
+        Raises:
+            ValueError, when getting the content and the content-encoding is invalid.
+
         See also: :py:class:`raw_content`, :py:attr:`text`
         """
         ce = self.headers.get("content-encoding")
@@ -132,17 +135,21 @@ class Message(basetypes.Serializable):
             self._content_cache.encoding == ce
         )
         if not cached:
-            try:
-                if not ce:
-                    raise ValueError()
+            if ce:
                 decoded = encoding.decode(self.raw_content, ce)
-            except ValueError:
+            else:
                 decoded = self.raw_content
             self._content_cache = CachedDecode(self.raw_content, ce, decoded)
         return self._content_cache.decoded
 
     @content.setter
     def content(self, value):
+        if value is not None and not isinstance(value, bytes):
+            raise TypeError(
+                "Message content must be bytes, not {}. "
+                "Please use .text if you want to assign a str."
+                .format(type(value).__name__)
+            )
         ce = self.headers.get("content-encoding")
         cached = (
             self._content_cache.decoded == value and
@@ -150,15 +157,15 @@ class Message(basetypes.Serializable):
         )
         if not cached:
             try:
-                if not ce:
-                    raise ValueError()
-                encoded = encoding.encode(value, ce)
+                if ce and value is not None:
+                    encoded = encoding.encode(value, ce)
+                else:
+                    encoded = value
             except ValueError:
-                # Do we have an unknown content-encoding?
-                # If so, we want to remove it.
-                if value and ce:
-                    self.headers.pop("content-encoding", None)
-                    ce = None
+                # So we have an invalid content-encoding?
+                # Let's remove it!
+                del self.headers["content-encoding"]
+                ce = None
                 encoded = value
             self._content_cache = CachedDecode(encoded, ce, value)
         self.raw_content = self._content_cache.encoded
@@ -262,6 +269,9 @@ class Message(basetypes.Serializable):
         Decodes body based on the current Content-Encoding header, then
         removes the header. If there is no Content-Encoding header, no
         action is taken.
+
+        Raises:
+            ValueError, when the content-encoding is invalid.
         """
         self.raw_content = self.content
         self.headers.pop("content-encoding", None)
@@ -269,10 +279,16 @@ class Message(basetypes.Serializable):
     def encode(self, e):
         """
         Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
+        Any existing content-encodings are overwritten,
+        the content is not decoded beforehand.
+
+        Raises:
+            ValueError, when the specified content-encoding is invalid.
         """
-        self.decode()  # remove the current encoding
         self.headers["content-encoding"] = e
         self.content = self.raw_content
+        if "content-encoding" not in self.headers:
+            raise ValueError("Invalid content encoding {}".format(repr(e)))
 
     def replace(self, pattern, repl, flags=0):
         """
diff --git a/netlib/http/request.py b/netlib/http/request.py
index 4ce94549..a8ec6238 100644
--- a/netlib/http/request.py
+++ b/netlib/http/request.py
@@ -347,7 +347,10 @@ class Request(message.Message):
     def _get_urlencoded_form(self):
         is_valid_content_type = "application/x-www-form-urlencoded" in self.headers.get("content-type", "").lower()
         if is_valid_content_type:
-            return tuple(netlib.http.url.decode(self.content))
+            try:
+                return tuple(netlib.http.url.decode(self.content))
+            except ValueError:
+                pass
         return ()
 
     def _set_urlencoded_form(self, value):
@@ -356,7 +359,7 @@ class Request(message.Message):
         This will overwrite the existing content if there is one.
         """
         self.headers["content-type"] = "application/x-www-form-urlencoded"
-        self.content = netlib.http.url.encode(value)
+        self.content = netlib.http.url.encode(value).encode()
 
     @urlencoded_form.setter
     def urlencoded_form(self, value):
@@ -376,7 +379,10 @@ class Request(message.Message):
     def _get_multipart_form(self):
         is_valid_content_type = "multipart/form-data" in self.headers.get("content-type", "").lower()
         if is_valid_content_type:
-            return multipart.decode(self.headers, self.content)
+            try:
+                return multipart.decode(self.headers, self.content)
+            except ValueError:
+                pass
         return ()
 
     def _set_multipart_form(self, value):
diff --git a/netlib/wsgi.py b/netlib/wsgi.py
index c66fddc2..2444f449 100644
--- a/netlib/wsgi.py
+++ b/netlib/wsgi.py
@@ -60,10 +60,14 @@ class WSGIAdaptor(object):
         else:
             path_info = path
             query = ''
+        try:
+            content = flow.request.content
+        except ValueError:
+            content = flow.request.raw_content
         environ = {
             'wsgi.version': (1, 0),
             'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"),
-            'wsgi.input': BytesIO(flow.request.content or b""),
+            'wsgi.input': BytesIO(content or b""),
             'wsgi.errors': errsoc,
             'wsgi.multithread': True,
             'wsgi.multiprocess': False,
-- 
cgit v1.2.3


From a3c7c84d49c3e6563e7f37ef60c989f99ed96788 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Fri, 15 Jul 2016 22:50:33 -0700
Subject: improve message content semantics

---
 netlib/http/headers.py |  12 +++++
 netlib/http/message.py | 133 ++++++++++++++++++++++++++++++-------------------
 netlib/wsgi.py         |  10 ++--
 3 files changed, 98 insertions(+), 57 deletions(-)

(limited to 'netlib')

diff --git a/netlib/http/headers.py b/netlib/http/headers.py
index f052a53b..13a8c98f 100644
--- a/netlib/http/headers.py
+++ b/netlib/http/headers.py
@@ -204,3 +204,15 @@ def parse_content_type(c):
             if len(clause) == 2:
                 d[clause[0].strip()] = clause[1].strip()
     return ts[0].lower(), ts[1].lower(), d
+
+
+def assemble_content_type(type, subtype, parameters):
+    if not parameters:
+        return "{}/{}".format(type, subtype)
+    params = "; ".join(
+        "{}={}".format(k, v)
+        for k, v in parameters.items()
+    )
+    return "{}/{}; {}".format(
+        type, subtype, params
+    )
diff --git a/netlib/http/message.py b/netlib/http/message.py
index 86ff64d1..1252ed25 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -53,14 +53,15 @@ class MessageData(basetypes.Serializable):
 
 
 class CachedDecode(object):
-    __slots__ = ["encoded", "encoding", "decoded"]
+    __slots__ = ["encoded", "encoding", "strict", "decoded"]
 
-    def __init__(self, object, encoding, decoded):
+    def __init__(self, object, encoding, strict, decoded):
         self.encoded = object
         self.encoding = encoding
+        self.strict = strict
         self.decoded = decoded
 
-no_cached_decode = CachedDecode(None, None, None)
+no_cached_decode = CachedDecode(None, None, None, None)
 
 
 class Message(basetypes.Serializable):
@@ -118,33 +119,44 @@ class Message(basetypes.Serializable):
     def raw_content(self, content):
         self.data.content = content
 
-    @property
-    def content(self):
-        # type: () -> bytes
+    def get_content(self, strict=True):
+        # type: (bool) -> bytes
         """
         The HTTP message body decoded with the content-encoding header (e.g. gzip)
 
         Raises:
-            ValueError, when getting the content and the content-encoding is invalid.
+            ValueError, when the content-encoding is invalid and strict is True.
 
         See also: :py:class:`raw_content`, :py:attr:`text`
         """
+        if self.raw_content is None:
+            return None
         ce = self.headers.get("content-encoding")
         cached = (
             self._content_cache.encoded == self.raw_content and
+            (self._content_cache.strict or not strict) and
             self._content_cache.encoding == ce
         )
         if not cached:
+            is_strict = True
             if ce:
-                decoded = encoding.decode(self.raw_content, ce)
+                try:
+                    decoded = encoding.decode(self.raw_content, ce)
+                except ValueError:
+                    if strict:
+                        raise
+                    is_strict = False
+                    decoded = self.raw_content
             else:
                 decoded = self.raw_content
-            self._content_cache = CachedDecode(self.raw_content, ce, decoded)
+            self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded)
         return self._content_cache.decoded
 
-    @content.setter
-    def content(self, value):
-        if value is not None and not isinstance(value, bytes):
+    def set_content(self, value):
+        if value is None:
+            self.raw_content = None
+            return
+        if not isinstance(value, bytes):
             raise TypeError(
                 "Message content must be bytes, not {}. "
                 "Please use .text if you want to assign a str."
@@ -153,24 +165,23 @@ class Message(basetypes.Serializable):
         ce = self.headers.get("content-encoding")
         cached = (
             self._content_cache.decoded == value and
-            self._content_cache.encoding == ce
+            self._content_cache.encoding == ce and
+            self._content_cache.strict
         )
         if not cached:
             try:
-                if ce and value is not None:
-                    encoded = encoding.encode(value, ce)
-                else:
-                    encoded = value
+                encoded = encoding.encode(value, ce or "identity")
             except ValueError:
                 # So we have an invalid content-encoding?
                 # Let's remove it!
                 del self.headers["content-encoding"]
                 ce = None
                 encoded = value
-            self._content_cache = CachedDecode(encoded, ce, value)
+            self._content_cache = CachedDecode(encoded, ce, True, value)
         self.raw_content = self._content_cache.encoded
-        if isinstance(self.raw_content, bytes):
-            self.headers["content-length"] = str(len(self.raw_content))
+        self.headers["content-length"] = str(len(self.raw_content))
+
+    content = property(get_content, set_content)
 
     @property
     def http_version(self):
@@ -211,69 +222,87 @@ class Message(basetypes.Serializable):
         if ct:
             return ct[2].get("charset")
 
-    @property
-    def text(self):
-        # type: () -> six.text_type
+    def _guess_encoding(self):
+        # type: () -> str
+        enc = self._get_content_type_charset()
+        if enc:
+            return enc
+
+        if "json" in self.headers.get("content-type", ""):
+            return "utf8"
+        else:
+            # We may also want to check for HTML meta tags here at some point.
+            return "latin-1"
+
+    def get_text(self, strict=True):
+        # type: (bool) -> six.text_type
         """
         The HTTP message body decoded with both content-encoding header (e.g. gzip)
         and content-type header charset.
 
+        Raises:
+            ValueError, when either content-encoding or charset is invalid and strict is True.
+
         See also: :py:attr:`content`, :py:class:`raw_content`
         """
-        # This attribute should be called text, because that's what requests does.
-        enc = self._get_content_type_charset()
-
-        # We may also want to check for HTML meta tags here at some point.
+        if self.raw_content is None:
+            return None
+        enc = self._guess_encoding()
 
+        content = self.get_content(strict)
         cached = (
-            self._text_cache.encoded == self.content and
+            self._text_cache.encoded == content and
+            (self._text_cache.strict or not strict) and
             self._text_cache.encoding == enc
         )
         if not cached:
+            is_strict = self._content_cache.strict
             try:
-                if not enc:
-                    raise ValueError()
-                decoded = encoding.decode(self.content, enc)
+                decoded = encoding.decode(content, enc)
             except ValueError:
-                decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape")
-            self._text_cache = CachedDecode(self.content, enc, decoded)
+                if strict:
+                    raise
+                is_strict = False
+                decoded = self.content.decode(enc, "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(content, enc, is_strict, decoded)
         return self._text_cache.decoded
 
-    @text.setter
-    def text(self, text):
-        enc = self._get_content_type_charset()
+    def set_text(self, text):
+        if text is None:
+            self.content = None
+            return
+        enc = self._guess_encoding()
+
         cached = (
             self._text_cache.decoded == text and
-            self._text_cache.encoding == enc
+            self._text_cache.encoding == enc and
+            self._text_cache.strict
         )
         if not cached:
             try:
-                if not enc:
-                    raise ValueError()
                 encoded = encoding.encode(text, enc)
             except ValueError:
-                # Do we have an unknown content-type charset?
-                # If so, we want to replace it with utf8.
-                if text and enc:
-                    self.headers["content-type"] = re.sub(
-                        "charset=[^;]+",
-                        "charset=utf-8",
-                        self.headers["content-type"]
-                    )
-                encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape")
-            self._text_cache = CachedDecode(encoded, enc, text)
+                # Fall back to UTF-8 and update the content-type header.
+                ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {})
+                ct[2]["charset"] = "utf-8"
+                self.headers["content-type"] = headers.assemble_content_type(*ct)
+                enc = "utf8"
+                encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(encoded, enc, True, text)
         self.content = self._text_cache.encoded
 
-    def decode(self):
+    text = property(get_text, set_text)
+
+    def decode(self, strict=True):
         """
         Decodes body based on the current Content-Encoding header, then
         removes the header. If there is no Content-Encoding header, no
         action is taken.
 
         Raises:
-            ValueError, when the content-encoding is invalid.
+            ValueError, when the content-encoding is invalid and strict is True.
         """
-        self.raw_content = self.content
+        self.raw_content = self.get_content(strict)
         self.headers.pop("content-encoding", None)
 
     def encode(self, e):
diff --git a/netlib/wsgi.py b/netlib/wsgi.py
index 2444f449..0def75b5 100644
--- a/netlib/wsgi.py
+++ b/netlib/wsgi.py
@@ -54,20 +54,20 @@ class WSGIAdaptor(object):
         self.app, self.domain, self.port, self.sversion = app, domain, port, sversion
 
     def make_environ(self, flow, errsoc, **extra):
+        """
+        Raises:
+            ValueError, if the content-encoding is invalid.
+        """
         path = strutils.native(flow.request.path, "latin-1")
         if '?' in path:
             path_info, query = strutils.native(path, "latin-1").split('?', 1)
         else:
             path_info = path
             query = ''
-        try:
-            content = flow.request.content
-        except ValueError:
-            content = flow.request.raw_content
         environ = {
             'wsgi.version': (1, 0),
             'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"),
-            'wsgi.input': BytesIO(content or b""),
+            'wsgi.input': BytesIO(flow.request.content or b""),
             'wsgi.errors': errsoc,
             'wsgi.multithread': True,
             'wsgi.multiprocess': False,
-- 
cgit v1.2.3


From e6e39ce80f4daaf6a1d6f8d87616409486d358a5 Mon Sep 17 00:00:00 2001
From: Maximilian Hils <git@maximilianhils.com>
Date: Fri, 15 Jul 2016 23:46:12 -0700
Subject: preserve content-type parameter order

---
 netlib/http/headers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'netlib')

diff --git a/netlib/http/headers.py b/netlib/http/headers.py
index b8aa212a..9fa7e1e6 100644
--- a/netlib/http/headers.py
+++ b/netlib/http/headers.py
@@ -2,6 +2,7 @@ from __future__ import absolute_import, print_function, division
 
 import re
 
+import collections
 import six
 from netlib import multidict
 from netlib import strutils
@@ -206,7 +207,7 @@ def parse_content_type(c):
     ts = parts[0].split("/", 1)
     if len(ts) != 2:
         return None
-    d = {}
+    d = collections.OrderedDict()
     if len(parts) == 2:
         for i in parts[1].split(";"):
             clause = i.split("=", 1)
-- 
cgit v1.2.3