diff options
author | Maximilian Hils <git@maximilianhils.com> | 2016-07-16 00:13:58 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-07-16 00:13:58 -0700 |
commit | b27d59095d799436fed41eaeaba502ecceb40f76 (patch) | |
tree | 152440c1e22850b81aa115817bee4d661f2435de /netlib/http/message.py | |
parent | 903807292b42b2481a3d72d6dbdc72939fc39b01 (diff) | |
parent | e6e39ce80f4daaf6a1d6f8d87616409486d358a5 (diff) | |
download | mitmproxy-b27d59095d799436fed41eaeaba502ecceb40f76.tar.gz mitmproxy-b27d59095d799436fed41eaeaba502ecceb40f76.tar.bz2 mitmproxy-b27d59095d799436fed41eaeaba502ecceb40f76.zip |
Merge pull request #1306 from mitmproxy/message-body-encoding
Improve Message Body Encoding
Diffstat (limited to 'netlib/http/message.py')
-rw-r--r-- | netlib/http/message.py | 254 |
1 files changed, 184 insertions, 70 deletions
diff --git a/netlib/http/message.py b/netlib/http/message.py index b268fec9..34709f0a 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -52,7 +52,23 @@ class MessageData(basetypes.Serializable): return cls(**state) +class CachedDecode(object): + __slots__ = ["encoded", "encoding", "strict", "decoded"] + + def __init__(self, object, encoding, strict, decoded): + self.encoded = object + self.encoding = encoding + self.strict = strict + self.decoded = decoded + +no_cached_decode = CachedDecode(None, None, None, None) + + class Message(basetypes.Serializable): + def __init__(self): + self._content_cache = no_cached_decode # type: CachedDecode + self._text_cache = no_cached_decode # type: CachedDecode + def __eq__(self, other): if isinstance(other, Message): return self.data == other.data @@ -90,22 +106,82 @@ class Message(basetypes.Serializable): self.data.headers = h @property - def content(self): + def raw_content(self): + # type: () -> bytes """ The raw (encoded) HTTP message body - See also: :py:attr:`text` + See also: :py:attr:`content`, :py:class:`text` """ return self.data.content - @content.setter - def content(self, content): - # type: (Optional[bytes]) -> None + @raw_content.setter + def raw_content(self, content): self.data.content = content - if isinstance(content, six.text_type): - raise ValueError("Message content must be bytes, not {}".format(type(content).__name__)) - if isinstance(content, bytes): - self.headers["content-length"] = str(len(content)) + + def get_content(self, strict=True): + # type: (bool) -> bytes + """ + The HTTP message body decoded with the content-encoding header (e.g. gzip) + + Raises: + ValueError, when the content-encoding is invalid and strict is True. + + See also: :py:class:`raw_content`, :py:attr:`text` + """ + if self.raw_content is None: + return None + ce = self.headers.get("content-encoding") + cached = ( + self._content_cache.encoded == self.raw_content and + (self._content_cache.strict or not strict) and + self._content_cache.encoding == ce + ) + if not cached: + is_strict = True + if ce: + try: + decoded = encoding.decode(self.raw_content, ce) + except ValueError: + if strict: + raise + is_strict = False + decoded = self.raw_content + else: + decoded = self.raw_content + self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded) + return self._content_cache.decoded + + def set_content(self, value): + if value is None: + self.raw_content = None + return + if not isinstance(value, bytes): + raise TypeError( + "Message content must be bytes, not {}. " + "Please use .text if you want to assign a str." + .format(type(value).__name__) + ) + ce = self.headers.get("content-encoding") + cached = ( + self._content_cache.decoded == value and + self._content_cache.encoding == ce and + self._content_cache.strict + ) + if not cached: + try: + encoded = encoding.encode(value, ce or "identity") + except ValueError: + # So we have an invalid content-encoding? + # Let's remove it! + del self.headers["content-encoding"] + ce = None + encoded = value + self._content_cache = CachedDecode(encoded, ce, True, value) + self.raw_content = self._content_cache.encoded + self.headers["content-length"] = str(len(self.raw_content)) + + content = property(get_content, set_content) @property def http_version(self): @@ -140,56 +216,108 @@ class Message(basetypes.Serializable): def timestamp_end(self, timestamp_end): self.data.timestamp_end = timestamp_end - @property - def text(self): - """ - The decoded HTTP message body. - Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive. + def _get_content_type_charset(self): + # type: () -> Optional[str] + ct = headers.parse_content_type(self.headers.get("content-type", "")) + if ct: + return ct[2].get("charset") - .. note:: - This is not implemented yet. + def _guess_encoding(self): + # type: () -> str + enc = self._get_content_type_charset() + if enc: + return enc - See also: :py:attr:`content`, :py:class:`decoded` + if "json" in self.headers.get("content-type", ""): + return "utf8" + else: + # We may also want to check for HTML meta tags here at some point. + return "latin-1" + + def get_text(self, strict=True): + # type: (bool) -> six.text_type """ - # This attribute should be called text, because that's what requests does. - raise NotImplementedError() + The HTTP message body decoded with both content-encoding header (e.g. gzip) + and content-type header charset. - @text.setter - def text(self, text): - raise NotImplementedError() + Raises: + ValueError, when either content-encoding or charset is invalid and strict is True. - def decode(self): + See also: :py:attr:`content`, :py:class:`raw_content` + """ + if self.raw_content is None: + return None + enc = self._guess_encoding() + + content = self.get_content(strict) + cached = ( + self._text_cache.encoded == content and + (self._text_cache.strict or not strict) and + self._text_cache.encoding == enc + ) + if not cached: + is_strict = self._content_cache.strict + try: + decoded = encoding.decode(content, enc) + except ValueError: + if strict: + raise + is_strict = False + decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(content, enc, is_strict, decoded) + return self._text_cache.decoded + + def set_text(self, text): + if text is None: + self.content = None + return + enc = self._guess_encoding() + + cached = ( + self._text_cache.decoded == text and + self._text_cache.encoding == enc and + self._text_cache.strict + ) + if not cached: + try: + encoded = encoding.encode(text, enc) + except ValueError: + # Fall back to UTF-8 and update the content-type header. + ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {}) + ct[2]["charset"] = "utf-8" + self.headers["content-type"] = headers.assemble_content_type(*ct) + enc = "utf8" + encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(encoded, enc, True, text) + self.content = self._text_cache.encoded + + text = property(get_text, set_text) + + def decode(self, strict=True): """ - Decodes body based on the current Content-Encoding header, then - removes the header. If there is no Content-Encoding header, no - action is taken. + Decodes body based on the current Content-Encoding header, then + removes the header. If there is no Content-Encoding header, no + action is taken. - Returns: - True, if decoding succeeded. - False, otherwise. + Raises: + ValueError, when the content-encoding is invalid and strict is True. """ - ce = self.headers.get("content-encoding") - data = encoding.decode(ce, self.content) - if data is None: - return False - self.content = data + self.raw_content = self.get_content(strict) self.headers.pop("content-encoding", None) - return True def encode(self, e): """ - Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". + Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". + Any existing content-encodings are overwritten, + the content is not decoded beforehand. - Returns: - True, if decoding succeeded. - False, otherwise. + Raises: + ValueError, when the specified content-encoding is invalid. """ - data = encoding.encode(e, self.content) - if data is None: - return False - self.content = data self.headers["content-encoding"] = e - return True + self.content = self.raw_content + if "content-encoding" not in self.headers: + raise ValueError("Invalid content encoding {}".format(repr(e))) def replace(self, pattern, repl, flags=0): """ @@ -206,10 +334,9 @@ class Message(basetypes.Serializable): repl = strutils.escaped_str_to_bytes(repl) replacements = 0 if self.content: - with decoded(self): - self.content, replacements = re.subn( - pattern, repl, self.content, flags=flags - ) + self.content, replacements = re.subn( + pattern, repl, self.content, flags=flags + ) replacements += self.headers.replace(pattern, repl, flags) return replacements @@ -228,29 +355,16 @@ class Message(basetypes.Serializable): class decoded(object): """ - A context manager that decodes a request or response, and then - re-encodes it with the same encoding after execution of the block. - - Example: - - .. code-block:: python - - with decoded(request): - request.content = request.content.replace("foo", "bar") + Deprecated: You can now directly use :py:attr:`content`. + :py:attr:`raw_content` has the encoded content. """ - def __init__(self, message): - self.message = message - ce = message.headers.get("content-encoding") - if ce in encoding.ENCODINGS: - self.ce = ce - else: - self.ce = None + def __init__(self, message): # pragma no cover + warnings.warn("decoded() is deprecated, you can now directly use .content instead. " + ".raw_content has the encoded content.", DeprecationWarning) - def __enter__(self): - if self.ce: - self.message.decode() + def __enter__(self): # pragma no cover + pass - def __exit__(self, type, value, tb): - if self.ce: - self.message.encode(self.ce) + def __exit__(self, type, value, tb): # pragma no cover + pass |