aboutsummaryrefslogtreecommitdiffstats
path: root/netlib
diff options
context:
space:
mode:
author: Maximilian Hils <git@maximilianhils.com> 2016-07-02 01:51:47 -0700
committer: Maximilian Hils <git@maximilianhils.com> 2016-07-02 01:51:47 -0700
commit: 6032c4f2352260d32032800a2ff694339e2af6b2 (patch)
tree: e242ede8ebb828f424f270aeb5143516ed048939 /netlib
parent: 2c09e0416bcf94d9ebef7c11bb1883388e8e2c5d (diff)
downloadmitmproxy-6032c4f2352260d32032800a2ff694339e2af6b2.tar.gz
mitmproxy-6032c4f2352260d32032800a2ff694339e2af6b2.tar.bz2
mitmproxy-6032c4f2352260d32032800a2ff694339e2af6b2.zip
message.content -> .raw_content, implement .text
This PR improves our handling of HTTP message body encodings:
- The unaltered message body is now accessible as `.raw_content`.
- The "content-encoding"-decoded content (i.e. gzip removed) is now `.content`, as this is what we want in 99% of the cases.
- `.text` now provides the "content-encoding"-decoded and then "content-type charset"-decoded message body.
- The decoded values for `.content` and `.text` are cached, so that repeated access and `x.text = x.text` is cheap.
- The `decoded()` decorator is now deprecated, as we can now just use `.content`. Similarly, `HTTPMessage.get_decoded_content()` is deprecated.
Diffstat (limited to 'netlib')
-rw-r--r-- netlib/encoding.py 97
-rw-r--r-- netlib/http/http1/assemble.py 4
-rw-r--r-- netlib/http/message.py 192
-rw-r--r-- netlib/http/request.py 4
-rw-r--r-- netlib/http/response.py 5
5 files changed, 202 insertions, 100 deletions
diff --git a/netlib/encoding.py b/netlib/encoding.py
index 98502451..8b67b543 100644
--- a/netlib/encoding.py
+++ b/netlib/encoding.py
@@ -1,39 +1,62 @@
"""
- Utility functions for decoding response bodies.
+Utility functions for decoding response bodies.
"""
from __future__ import absolute_import
+
+import codecs
from io import BytesIO
import gzip
import zlib
+from typing import Union # noqa
+
-ENCODINGS = {"identity", "gzip", "deflate"}
+def decode(obj, encoding, errors='strict'):
+ # type: (Union[str, bytes], str) -> Union[str, bytes]
+ """
+ Decode the given input object
+ Returns:
+ The decoded value
-def decode(e, content):
- if not isinstance(content, bytes):
- return None
- encoding_map = {
- "identity": identity,
- "gzip": decode_gzip,
- "deflate": decode_deflate,
- }
- if e not in encoding_map:
- return None
- return encoding_map[e](content)
+ Raises:
+ ValueError, if decoding fails.
+ """
+ try:
+ try:
+ return custom_decode[encoding](obj)
+ except KeyError:
+ return codecs.decode(obj, encoding, errors)
+ except Exception as e:
+ raise ValueError("{} when decoding {} with {}".format(
+ type(e).__name__,
+ repr(obj)[:10],
+ repr(encoding),
+ ))
+
+
+def encode(obj, encoding, errors='strict'):
+ # type: (Union[str, bytes], str) -> Union[str, bytes]
+ """
+ Encode the given input object
+ Returns:
+ The encoded value
-def encode(e, content):
- if not isinstance(content, bytes):
- return None
- encoding_map = {
- "identity": identity,
- "gzip": encode_gzip,
- "deflate": encode_deflate,
- }
- if e not in encoding_map:
- return None
- return encoding_map[e](content)
+ Raises:
+ ValueError, if encoding fails.
+ """
+ try:
+ try:
+ return custom_encode[encoding](obj)
+ except KeyError:
+ return codecs.encode(obj, encoding, errors)
+ except Exception as e:
+ raise ValueError("{} when encoding {} with {}".format(
+ type(e).__name__,
+ repr(obj)[:10],
+ repr(encoding),
+ ))
def identity(content):
@@ -46,10 +69,7 @@ def identity(content):
def decode_gzip(content):
gfile = gzip.GzipFile(fileobj=BytesIO(content))
- try:
- return gfile.read()
- except (IOError, EOFError):
- return None
+ return gfile.read()
def encode_gzip(content):
@@ -70,12 +90,9 @@ def decode_deflate(content):
http://bugs.python.org/issue5784
"""
try:
- try:
- return zlib.decompress(content)
- except zlib.error:
- return zlib.decompress(content, -15)
+ return zlib.decompress(content)
except zlib.error:
- return None
+ return zlib.decompress(content, -15)
def encode_deflate(content):
@@ -84,4 +101,16 @@ def encode_deflate(content):
"""
return zlib.compress(content)
-__all__ = ["ENCODINGS", "encode", "decode"]
+
+custom_decode = {
+ "identity": identity,
+ "gzip": decode_gzip,
+ "deflate": decode_deflate,
+}
+custom_encode = {
+ "identity": identity,
+ "gzip": encode_gzip,
+ "deflate": encode_deflate,
+}
+
+__all__ = ["encode", "decode"]
diff --git a/netlib/http/http1/assemble.py b/netlib/http/http1/assemble.py
index 511328f1..e74732d2 100644
--- a/netlib/http/http1/assemble.py
+++ b/netlib/http/http1/assemble.py
@@ -5,7 +5,7 @@ from netlib import exceptions
def assemble_request(request):
- if request.content is None:
+ if request.data.content is None:
raise exceptions.HttpException("Cannot assemble flow with missing content")
head = assemble_request_head(request)
body = b"".join(assemble_body(request.data.headers, [request.data.content]))
@@ -19,7 +19,7 @@ def assemble_request_head(request):
def assemble_response(response):
- if response.content is None:
+ if response.data.content is None:
raise exceptions.HttpException("Cannot assemble flow with missing content")
head = assemble_response_head(response)
body = b"".join(assemble_body(response.data.headers, [response.data.content]))
diff --git a/netlib/http/message.py b/netlib/http/message.py
index 0583c246..668198f8 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -52,7 +52,22 @@ class MessageData(basetypes.Serializable):
return cls(**state)
+class CachedDecode(object):
+ __slots__ = ["encoded", "encoding", "decoded"]
+
+ def __init__(self, object, encoding, decoded):
+ self.encoded = object
+ self.encoding = encoding
+ self.decoded = decoded
+
+no_cached_decode = CachedDecode(None, None, None)
+
+
class Message(basetypes.Serializable):
+ def __init__(self):
+ self._content_cache = no_cached_decode # type: CachedDecode
+ self._text_cache = no_cached_decode # type: CachedDecode
+
def __eq__(self, other):
if isinstance(other, Message):
return self.data == other.data
@@ -90,19 +105,65 @@ class Message(basetypes.Serializable):
self.data.headers = h
@property
- def content(self):
+ def raw_content(self):
+ # type: () -> bytes
"""
The raw (encoded) HTTP message body
- See also: :py:attr:`text`
+ See also: :py:attr:`content`, :py:class:`text`
"""
return self.data.content
- @content.setter
- def content(self, content):
+ @raw_content.setter
+ def raw_content(self, content):
self.data.content = content
- if isinstance(content, bytes):
- self.headers["content-length"] = str(len(content))
+
+ @property
+ def content(self):
+ # type: () -> bytes
+ """
+ The HTTP message body decoded with the content-encoding header (e.g. gzip)
+
+ See also: :py:class:`raw_content`, :py:attr:`text`
+ """
+ ce = self.headers.get("content-encoding")
+ cached = (
+ self._content_cache.encoded == self.raw_content and
+ self._content_cache.encoding == ce
+ )
+ if not cached:
+ try:
+ if not ce:
+ raise ValueError()
+ decoded = encoding.decode(self.raw_content, ce)
+ except ValueError:
+ decoded = self.raw_content
+ self._content_cache = CachedDecode(self.raw_content, ce, decoded)
+ return self._content_cache.decoded
+
+ @content.setter
+ def content(self, value):
+ ce = self.headers.get("content-encoding")
+ cached = (
+ self._content_cache.decoded == value and
+ self._content_cache.encoding == ce
+ )
+ if not cached:
+ try:
+ if not ce:
+ raise ValueError()
+ encoded = encoding.encode(value, ce)
+ except ValueError:
+ # Do we have an unknown content-encoding?
+ # If so, we want to remove it.
+ if value and ce:
+ self.headers.pop("content-encoding", None)
+ ce = None
+ encoded = value
+ self._content_cache = CachedDecode(encoded, ce, value)
+ self.raw_content = self._content_cache.encoded
+ if isinstance(self.raw_content, bytes):
+ self.headers["content-length"] = str(len(self.raw_content))
@property
def http_version(self):
@@ -137,56 +198,81 @@ class Message(basetypes.Serializable):
def timestamp_end(self, timestamp_end):
self.data.timestamp_end = timestamp_end
+ def _get_content_type_charset(self):
+ # type: () -> Optional[str]
+ ct = headers.parse_content_type(self.headers.get("content-type", ""))
+ if ct:
+ return ct[2].get("charset")
+
@property
def text(self):
+ # type: () -> six.text_type
"""
- The decoded HTTP message body.
- Decoded contents are not cached, so accessing this attribute repeatedly is relatively expensive.
-
- .. note::
- This is not implemented yet.
+ The HTTP message body decoded with both content-encoding header (e.g. gzip)
+ and content-type header charset.
- See also: :py:attr:`content`, :py:class:`decoded`
+ See also: :py:attr:`content`, :py:class:`raw_content`
"""
# This attribute should be called text, because that's what requests does.
- raise NotImplementedError()
+ enc = self._get_content_type_charset()
+
+ # We may also want to check for HTML meta tags here at some point.
+
+ cached = (
+ self._text_cache.encoded == self.content and
+ self._text_cache.encoding == enc
+ )
+ if not cached:
+ try:
+ if not enc:
+ raise ValueError()
+ decoded = encoding.decode(self.content, enc)
+ except ValueError:
+ decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape")
+ self._text_cache = CachedDecode(self.content, enc, decoded)
+ return self._text_cache.decoded
@text.setter
def text(self, text):
- raise NotImplementedError()
+ enc = self._get_content_type_charset()
+ cached = (
+ self._text_cache.decoded == text and
+ self._text_cache.encoding == enc
+ )
+ if not cached:
+ try:
+ if not enc:
+ raise ValueError()
+ encoded = encoding.encode(text, enc)
+ except ValueError:
+ # Do we have an unknown content-type charset?
+ # If so, we want to replace it with utf8.
+ if text and enc:
+ self.headers["content-type"] = re.sub(
+ "charset=[^;]+",
+ "charset=utf-8",
+ self.headers["content-type"]
+ )
+ encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape")
+ self._text_cache = CachedDecode(encoded, enc, text)
+ self.content = self._text_cache.encoded
def decode(self):
"""
- Decodes body based on the current Content-Encoding header, then
- removes the header. If there is no Content-Encoding header, no
- action is taken.
-
- Returns:
- True, if decoding succeeded.
- False, otherwise.
+ Decodes body based on the current Content-Encoding header, then
+ removes the header. If there is no Content-Encoding header, no
+ action is taken.
"""
- ce = self.headers.get("content-encoding")
- data = encoding.decode(ce, self.content)
- if data is None:
- return False
- self.content = data
+ self.raw_content = self.content
self.headers.pop("content-encoding", None)
- return True
def encode(self, e):
"""
- Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
-
- Returns:
- True, if decoding succeeded.
- False, otherwise.
+ Encodes body with the encoding e, where e is "gzip", "deflate" or "identity".
"""
- data = encoding.encode(e, self.content)
- if data is None:
- return False
- self.content = data
+ self.decode() # remove the current encoding
self.headers["content-encoding"] = e
- return True
+ self.content = self.raw_content
def replace(self, pattern, repl, flags=0):
"""
@@ -203,10 +289,9 @@ class Message(basetypes.Serializable):
repl = strutils.escaped_str_to_bytes(repl)
replacements = 0
if self.content:
- with decoded(self):
- self.content, replacements = re.subn(
- pattern, repl, self.content, flags=flags
- )
+ self.content, replacements = re.subn(
+ pattern, repl, self.content, flags=flags
+ )
replacements += self.headers.replace(pattern, repl, flags)
return replacements
@@ -225,29 +310,16 @@ class Message(basetypes.Serializable):
class decoded(object):
"""
- A context manager that decodes a request or response, and then
- re-encodes it with the same encoding after execution of the block.
-
- Example:
-
- .. code-block:: python
-
- with decoded(request):
- request.content = request.content.replace("foo", "bar")
+ Deprecated: You can now directly use :py:attr:`content`.
+ :py:attr:`raw_content` has the encoded content.
"""
def __init__(self, message):
- self.message = message
- ce = message.headers.get("content-encoding")
- if ce in encoding.ENCODINGS:
- self.ce = ce
- else:
- self.ce = None
+ warnings.warn("decoded() is deprecated, you can now directly use .content instead. "
+ ".raw_content has the encoded content.", DeprecationWarning)
def __enter__(self):
- if self.ce:
- self.message.decode()
+ pass
def __exit__(self, type, value, tb):
- if self.ce:
- self.message.encode(self.ce)
+ pass \ No newline at end of file
diff --git a/netlib/http/request.py b/netlib/http/request.py
index d9f4ed00..4ce94549 100644
--- a/netlib/http/request.py
+++ b/netlib/http/request.py
@@ -5,7 +5,6 @@ import re
import six
from six.moves import urllib
-from netlib import encoding
from netlib import multidict
from netlib import strutils
from netlib.http import multipart
@@ -44,6 +43,7 @@ class Request(message.Message):
An HTTP request.
"""
def __init__(self, *args, **kwargs):
+ super(Request, self).__init__()
self.data = RequestData(*args, **kwargs)
def __repr__(self):
@@ -327,7 +327,7 @@ class Request(message.Message):
self.headers["accept-encoding"] = (
', '.join(
e
- for e in encoding.ENCODINGS
+ for e in {"gzip", "identity", "deflate"}
if e in accept_encoding
)
)
diff --git a/netlib/http/response.py b/netlib/http/response.py
index 17d69418..d2273edd 100644
--- a/netlib/http/response.py
+++ b/netlib/http/response.py
@@ -30,13 +30,14 @@ class Response(message.Message):
An HTTP response.
"""
def __init__(self, *args, **kwargs):
+ super(Response, self).__init__()
self.data = ResponseData(*args, **kwargs)
def __repr__(self):
- if self.content:
+ if self.raw_content:
details = "{}, {}".format(
self.headers.get("content-type", "unknown content type"),
- human.pretty_size(len(self.content))
+ human.pretty_size(len(self.raw_content))
)
else:
details = "no content"