diff options
author | Maximilian Hils <git@maximilianhils.com> | 2016-07-02 01:51:47 -0700 |
---|---|---|
committer | Maximilian Hils <git@maximilianhils.com> | 2016-07-02 01:51:47 -0700 |
commit | 6032c4f2352260d32032800a2ff694339e2af6b2 (patch) | |
tree | e242ede8ebb828f424f270aeb5143516ed048939 /netlib | |
parent | 2c09e0416bcf94d9ebef7c11bb1883388e8e2c5d (diff) | |
download | mitmproxy-6032c4f2352260d32032800a2ff694339e2af6b2.tar.gz mitmproxy-6032c4f2352260d32032800a2ff694339e2af6b2.tar.bz2 mitmproxy-6032c4f2352260d32032800a2ff694339e2af6b2.zip |
message.content -> .raw_content, implement .text
This PR improves our handling of HTTP message body encodings:
- The unaltered message body is now accessible as `.raw_content`
- The "content-encoding"-decoded content (i.e. with gzip removed)
is now `.content`, as this is what we want in 99% of the cases.
- `.text` now provides the "content-encoding"-decoded and then
"content-type charset"-decoded message body.
- The decoded values for `.content` and `.text` are cached,
so that repeated access and `x.text = x.text` are cheap.
- The `decoded()` decorator is now deprecated, as we can now just use
`.content`. Similarly `HTTPMessage.get_decoded_content()` is
deprecated.
Diffstat (limited to 'netlib')
-rw-r--r-- | netlib/encoding.py | 97 | ||||
-rw-r--r-- | netlib/http/http1/assemble.py | 4 | ||||
-rw-r--r-- | netlib/http/message.py | 192 | ||||
-rw-r--r-- | netlib/http/request.py | 4 | ||||
-rw-r--r-- | netlib/http/response.py | 5 |
5 files changed, 202 insertions, 100 deletions
diff --git a/netlib/encoding.py b/netlib/encoding.py index 98502451..8b67b543 100644 --- a/netlib/encoding.py +++ b/netlib/encoding.py @@ -1,39 +1,62 @@ """ - Utility functions for decoding response bodies. +Utility functions for decoding response bodies. """ from __future__ import absolute_import + +import codecs from io import BytesIO import gzip import zlib +from typing import Union # noqa + -ENCODINGS = {"identity", "gzip", "deflate"} +def decode(obj, encoding, errors='strict'): + # type: (Union[str, bytes], str) -> Union[str, bytes] + """ + Decode the given input object + Returns: + The decoded value -def decode(e, content): - if not isinstance(content, bytes): - return None - encoding_map = { - "identity": identity, - "gzip": decode_gzip, - "deflate": decode_deflate, - } - if e not in encoding_map: - return None - return encoding_map[e](content) + Raises: + ValueError, if decoding fails. + """ + try: + try: + return custom_decode[encoding](obj) + except KeyError: + return codecs.decode(obj, encoding, errors) + except Exception as e: + raise ValueError("{} when decoding {} with {}".format( + type(e).__name__, + repr(obj)[:10], + repr(encoding), + )) + + +def encode(obj, encoding, errors='strict'): + # type: (Union[str, bytes], str) -> Union[str, bytes] + """ + Encode the given input object + Returns: + The encoded value -def encode(e, content): - if not isinstance(content, bytes): - return None - encoding_map = { - "identity": identity, - "gzip": encode_gzip, - "deflate": encode_deflate, - } - if e not in encoding_map: - return None - return encoding_map[e](content) + Raises: + ValueError, if encoding fails. 
+ """ + try: + try: + return custom_encode[encoding](obj) + except KeyError: + return codecs.encode(obj, encoding, errors) + except Exception as e: + raise ValueError("{} when encoding {} with {}".format( + type(e).__name__, + repr(obj)[:10], + repr(encoding), + )) def identity(content): @@ -46,10 +69,7 @@ def identity(content): def decode_gzip(content): gfile = gzip.GzipFile(fileobj=BytesIO(content)) - try: - return gfile.read() - except (IOError, EOFError): - return None + return gfile.read() def encode_gzip(content): @@ -70,12 +90,9 @@ def decode_deflate(content): http://bugs.python.org/issue5784 """ try: - try: - return zlib.decompress(content) - except zlib.error: - return zlib.decompress(content, -15) + return zlib.decompress(content) except zlib.error: - return None + return zlib.decompress(content, -15) def encode_deflate(content): @@ -84,4 +101,16 @@ def encode_deflate(content): """ return zlib.compress(content) -__all__ = ["ENCODINGS", "encode", "decode"] + +custom_decode = { + "identity": identity, + "gzip": decode_gzip, + "deflate": decode_deflate, +} +custom_encode = { + "identity": identity, + "gzip": encode_gzip, + "deflate": encode_deflate, +} + +__all__ = ["encode", "decode"] diff --git a/netlib/http/http1/assemble.py b/netlib/http/http1/assemble.py index 511328f1..e74732d2 100644 --- a/netlib/http/http1/assemble.py +++ b/netlib/http/http1/assemble.py @@ -5,7 +5,7 @@ from netlib import exceptions def assemble_request(request): - if request.content is None: + if request.data.content is None: raise exceptions.HttpException("Cannot assemble flow with missing content") head = assemble_request_head(request) body = b"".join(assemble_body(request.data.headers, [request.data.content])) @@ -19,7 +19,7 @@ def assemble_request_head(request): def assemble_response(response): - if response.content is None: + if response.data.content is None: raise exceptions.HttpException("Cannot assemble flow with missing content") head = assemble_response_head(response) body 
= b"".join(assemble_body(response.data.headers, [response.data.content])) diff --git a/netlib/http/message.py b/netlib/http/message.py index 0583c246..668198f8 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -52,7 +52,22 @@ class MessageData(basetypes.Serializable): return cls(**state) +class CachedDecode(object): + __slots__ = ["encoded", "encoding", "decoded"] + + def __init__(self, object, encoding, decoded): + self.encoded = object + self.encoding = encoding + self.decoded = decoded + +no_cached_decode = CachedDecode(None, None, None) + + class Message(basetypes.Serializable): + def __init__(self): + self._content_cache = no_cached_decode # type: CachedDecode + self._text_cache = no_cached_decode # type: CachedDecode + def __eq__(self, other): if isinstance(other, Message): return self.data == other.data @@ -90,19 +105,65 @@ class Message(basetypes.Serializable): self.data.headers = h @property - def content(self): + def raw_content(self): + # type: () -> bytes """ The raw (encoded) HTTP message body - See also: :py:attr:`text` + See also: :py:attr:`content`, :py:class:`text` """ return self.data.content - @content.setter - def content(self, content): + @raw_content.setter + def raw_content(self, content): self.data.content = content - if isinstance(content, bytes): - self.headers["content-length"] = str(len(content)) + + @property + def content(self): + # type: () -> bytes + """ + The HTTP message body decoded with the content-encoding header (e.g. 
gzip) + + See also: :py:class:`raw_content`, :py:attr:`text` + """ + ce = self.headers.get("content-encoding") + cached = ( + self._content_cache.encoded == self.raw_content and + self._content_cache.encoding == ce + ) + if not cached: + try: + if not ce: + raise ValueError() + decoded = encoding.decode(self.raw_content, ce) + except ValueError: + decoded = self.raw_content + self._content_cache = CachedDecode(self.raw_content, ce, decoded) + return self._content_cache.decoded + + @content.setter + def content(self, value): + ce = self.headers.get("content-encoding") + cached = ( + self._content_cache.decoded == value and + self._content_cache.encoding == ce + ) + if not cached: + try: + if not ce: + raise ValueError() + encoded = encoding.encode(value, ce) + except ValueError: + # Do we have an unknown content-encoding? + # If so, we want to remove it. + if value and ce: + self.headers.pop("content-encoding", None) + ce = None + encoded = value + self._content_cache = CachedDecode(encoded, ce, value) + self.raw_content = self._content_cache.encoded + if isinstance(self.raw_content, bytes): + self.headers["content-length"] = str(len(self.raw_content)) @property def http_version(self): @@ -137,56 +198,81 @@ class Message(basetypes.Serializable): def timestamp_end(self, timestamp_end): self.data.timestamp_end = timestamp_end + def _get_content_type_charset(self): + # type: () -> Optional[str] + ct = headers.parse_content_type(self.headers.get("content-type", "")) + if ct: + return ct[2].get("charset") + @property def text(self): + # type: () -> six.text_type """ - The decoded HTTP message body. - Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive. - - .. note:: - This is not implemented yet. + The HTTP message body decoded with both content-encoding header (e.g. gzip) + and content-type header charset. 
- See also: :py:attr:`content`, :py:class:`decoded` + See also: :py:attr:`content`, :py:class:`raw_content` """ # This attribute should be called text, because that's what requests does. - raise NotImplementedError() + enc = self._get_content_type_charset() + + # We may also want to check for HTML meta tags here at some point. + + cached = ( + self._text_cache.encoded == self.content and + self._text_cache.encoding == enc + ) + if not cached: + try: + if not enc: + raise ValueError() + decoded = encoding.decode(self.content, enc) + except ValueError: + decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(self.content, enc, decoded) + return self._text_cache.decoded @text.setter def text(self, text): - raise NotImplementedError() + enc = self._get_content_type_charset() + cached = ( + self._text_cache.decoded == text and + self._text_cache.encoding == enc + ) + if not cached: + try: + if not enc: + raise ValueError() + encoded = encoding.encode(text, enc) + except ValueError: + # Do we have an unknown content-type charset? + # If so, we want to replace it with utf8. + if text and enc: + self.headers["content-type"] = re.sub( + "charset=[^;]+", + "charset=utf-8", + self.headers["content-type"] + ) + encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(encoded, enc, text) + self.content = self._text_cache.encoded def decode(self): """ - Decodes body based on the current Content-Encoding header, then - removes the header. If there is no Content-Encoding header, no - action is taken. - - Returns: - True, if decoding succeeded. - False, otherwise. + Decodes body based on the current Content-Encoding header, then + removes the header. If there is no Content-Encoding header, no + action is taken. 
""" - ce = self.headers.get("content-encoding") - data = encoding.decode(ce, self.content) - if data is None: - return False - self.content = data + self.raw_content = self.content self.headers.pop("content-encoding", None) - return True def encode(self, e): """ - Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". - - Returns: - True, if decoding succeeded. - False, otherwise. + Encodes body with the encoding e, where e is "gzip", "deflate" or "identity". """ - data = encoding.encode(e, self.content) - if data is None: - return False - self.content = data + self.decode() # remove the current encoding self.headers["content-encoding"] = e - return True + self.content = self.raw_content def replace(self, pattern, repl, flags=0): """ @@ -203,10 +289,9 @@ class Message(basetypes.Serializable): repl = strutils.escaped_str_to_bytes(repl) replacements = 0 if self.content: - with decoded(self): - self.content, replacements = re.subn( - pattern, repl, self.content, flags=flags - ) + self.content, replacements = re.subn( + pattern, repl, self.content, flags=flags + ) replacements += self.headers.replace(pattern, repl, flags) return replacements @@ -225,29 +310,16 @@ class Message(basetypes.Serializable): class decoded(object): """ - A context manager that decodes a request or response, and then - re-encodes it with the same encoding after execution of the block. - - Example: - - .. code-block:: python - - with decoded(request): - request.content = request.content.replace("foo", "bar") + Deprecated: You can now directly use :py:attr:`content`. + :py:attr:`raw_content` has the encoded content. """ def __init__(self, message): - self.message = message - ce = message.headers.get("content-encoding") - if ce in encoding.ENCODINGS: - self.ce = ce - else: - self.ce = None + warnings.warn("decoded() is deprecated, you can now directly use .content instead. 
" + ".raw_content has the encoded content.", DeprecationWarning) def __enter__(self): - if self.ce: - self.message.decode() + pass def __exit__(self, type, value, tb): - if self.ce: - self.message.encode(self.ce) + pass
\ No newline at end of file diff --git a/netlib/http/request.py b/netlib/http/request.py index d9f4ed00..4ce94549 100644 --- a/netlib/http/request.py +++ b/netlib/http/request.py @@ -5,7 +5,6 @@ import re import six from six.moves import urllib -from netlib import encoding from netlib import multidict from netlib import strutils from netlib.http import multipart @@ -44,6 +43,7 @@ class Request(message.Message): An HTTP request. """ def __init__(self, *args, **kwargs): + super(Request, self).__init__() self.data = RequestData(*args, **kwargs) def __repr__(self): @@ -327,7 +327,7 @@ class Request(message.Message): self.headers["accept-encoding"] = ( ', '.join( e - for e in encoding.ENCODINGS + for e in {"gzip", "identity", "deflate"} if e in accept_encoding ) ) diff --git a/netlib/http/response.py b/netlib/http/response.py index 17d69418..d2273edd 100644 --- a/netlib/http/response.py +++ b/netlib/http/response.py @@ -30,13 +30,14 @@ class Response(message.Message): An HTTP response. """ def __init__(self, *args, **kwargs): + super(Response, self).__init__() self.data = ResponseData(*args, **kwargs) def __repr__(self): - if self.content: + if self.raw_content: details = "{}, {}".format( self.headers.get("content-type", "unknown content type"), - human.pretty_size(len(self.content)) + human.pretty_size(len(self.raw_content)) ) else: details = "no content" |