aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--mitmproxy/console/common.py30
-rw-r--r--mitmproxy/console/flowview.py12
-rw-r--r--mitmproxy/dump.py2
-rw-r--r--mitmproxy/filt.py28
-rw-r--r--mitmproxy/flow/export.py11
-rw-r--r--netlib/http/headers.py12
-rw-r--r--netlib/http/message.py133
-rw-r--r--netlib/wsgi.py10
-rw-r--r--test/netlib/http/test_headers.py9
-rw-r--r--test/netlib/http/test_message.py77
10 files changed, 194 insertions, 130 deletions
diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py
index ef220b4c..41f4f243 100644
--- a/mitmproxy/console/common.py
+++ b/mitmproxy/console/common.py
@@ -257,16 +257,13 @@ def copy_flow_format_data(part, scope, flow):
data = ""
if scope in ("q", "a"):
request = flow.request.copy()
- try:
- request.decode()
- except ValueError:
- pass
- if request.raw_content is None:
+ request.decode(strict=False)
+ if request.content is None:
return None, "Request content is missing"
if part == "h":
data += netlib.http.http1.assemble_request(request)
elif part == "c":
- data += request.raw_content
+ data += request.content
else:
raise ValueError("Unknown part: {}".format(part))
if scope == "a" and flow.request.raw_content and flow.response:
@@ -274,16 +271,13 @@ def copy_flow_format_data(part, scope, flow):
data += "\r\n" * 2
if scope in ("s", "a") and flow.response:
response = flow.response.copy()
- try:
- response.decode()
- except ValueError:
- pass
- if response.raw_content is None:
+ response.decode(strict=False)
+ if response.content is None:
return None, "Response content is missing"
if part == "h":
data += netlib.http.http1.assemble_response(response)
elif part == "c":
- data += response.raw_content
+ data += response.content
else:
raise ValueError("Unknown part: {}".format(part))
return data, False
@@ -393,22 +387,14 @@ def ask_save_body(part, master, state, flow):
ask_save_body("q", master, state, flow)
elif part == "q" and request_has_content:
- try:
- content = flow.request.content
- except ValueError:
- content = flow.request.raw_content
ask_save_path(
"Save request content",
- content
+ flow.request.get_content(strict=False),
)
elif part == "s" and response_has_content:
- try:
- content = flow.response.content
- except ValueError:
- content = flow.response.raw_content
ask_save_path(
"Save response content",
- content
+ flow.response.get_content(strict=False),
)
else:
signals.status_message.send(message="No content to save.")
diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py
index d994e670..f8686b41 100644
--- a/mitmproxy/console/flowview.py
+++ b/mitmproxy/console/flowview.py
@@ -427,11 +427,7 @@ class FlowView(tabs.Tabs):
# editing message bodies, this can cause problems. For now, I
# just strip the newlines off the end of the body when we return
# from an editor.
- try:
- content = message.content
- except ValueError:
- content = message.raw_content
- c = self.master.spawn_editor(content or b"")
+ c = self.master.spawn_editor(message.get_content(strict=False) or b"")
message.content = c.rstrip(b"\n")
elif part == "f":
if not message.urlencoded_form and message.raw_content:
@@ -697,11 +693,7 @@ class FlowView(tabs.Tabs):
if conn.raw_content:
t = conn.headers.get("content-type")
if "EDITOR" in os.environ or "PAGER" in os.environ:
- try:
- content = conn.content
- except ValueError:
- content = conn.raw_content
- self.master.spawn_external_viewer(content, t)
+ self.master.spawn_external_viewer(conn.get_content(strict=False), t)
else:
signals.status_message.send(
message = "Error! Set $EDITOR or $PAGER."
diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py
index 0a9b76a7..14d55cd1 100644
--- a/mitmproxy/dump.py
+++ b/mitmproxy/dump.py
@@ -190,7 +190,7 @@ class DumpMaster(flow.FlowMaster):
try:
content = message.content
except ValueError:
- content = message.raw_content
+ content = message.get_content(strict=False)
if content is None:
self.echo("(content missing)", indent=4)
diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py
index e8687b9f..a42988f1 100644
--- a/mitmproxy/filt.py
+++ b/mitmproxy/filt.py
@@ -194,17 +194,11 @@ class FBod(_Rex):
def __call__(self, f):
if f.request and f.request.raw_content:
- try:
- if self.re.search(f.request.content):
- return True
- except ValueError:
- pass
+ if self.re.search(f.request.get_content(strict=False)):
+ return True
if f.response and f.response.raw_content:
- try:
- if self.re.search(f.response.content):
- return True
- except ValueError:
- pass
+ if self.re.search(f.response.get_content(strict=False)):
+ return True
return False
@@ -214,11 +208,8 @@ class FBodRequest(_Rex):
def __call__(self, f):
if f.request and f.request.raw_content:
- try:
- if self.re.search(f.request.content):
- return True
- except ValueError:
- pass
+ if self.re.search(f.request.get_content(strict=False)):
+ return True
class FBodResponse(_Rex):
@@ -227,11 +218,8 @@ class FBodResponse(_Rex):
def __call__(self, f):
if f.response and f.response.raw_content:
- try:
- if self.re.search(f.response.content):
- return True
- except ValueError:
- pass
+ if self.re.search(f.response.get_content(strict=False)):
+ return True
class FMethod(_Rex):
diff --git a/mitmproxy/flow/export.py b/mitmproxy/flow/export.py
index 9da18f22..4659af7b 100644
--- a/mitmproxy/flow/export.py
+++ b/mitmproxy/flow/export.py
@@ -20,12 +20,9 @@ def curl_command(flow):
data = "curl "
request = flow.request.copy()
- try:
- request.decode()
- except ValueError:
- pass
+ request.decode(strict=False)
- for k, v in request.headers.fields:
+ for k, v in request.headers.items(multi=True):
data += "-H '%s:%s' " % (k, v)
if request.method != "GET":
@@ -34,8 +31,8 @@ def curl_command(flow):
full_url = request.scheme + "://" + request.host + request.path
data += "'%s'" % full_url
- if request.raw_content:
- data += " --data-binary '%s'" % request.raw_content
+ if request.content:
+ data += " --data-binary '%s'" % request.content
return data
diff --git a/netlib/http/headers.py b/netlib/http/headers.py
index f052a53b..13a8c98f 100644
--- a/netlib/http/headers.py
+++ b/netlib/http/headers.py
@@ -204,3 +204,15 @@ def parse_content_type(c):
if len(clause) == 2:
d[clause[0].strip()] = clause[1].strip()
return ts[0].lower(), ts[1].lower(), d
+
+
+def assemble_content_type(type, subtype, parameters):
+ if not parameters:
+ return "{}/{}".format(type, subtype)
+ params = "; ".join(
+ "{}={}".format(k, v)
+ for k, v in parameters.items()
+ )
+ return "{}/{}; {}".format(
+ type, subtype, params
+ )
diff --git a/netlib/http/message.py b/netlib/http/message.py
index 86ff64d1..1252ed25 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -53,14 +53,15 @@ class MessageData(basetypes.Serializable):
class CachedDecode(object):
- __slots__ = ["encoded", "encoding", "decoded"]
+ __slots__ = ["encoded", "encoding", "strict", "decoded"]
- def __init__(self, object, encoding, decoded):
+ def __init__(self, object, encoding, strict, decoded):
self.encoded = object
self.encoding = encoding
+ self.strict = strict
self.decoded = decoded
-no_cached_decode = CachedDecode(None, None, None)
+no_cached_decode = CachedDecode(None, None, None, None)
class Message(basetypes.Serializable):
@@ -118,33 +119,44 @@ class Message(basetypes.Serializable):
def raw_content(self, content):
self.data.content = content
- @property
- def content(self):
- # type: () -> bytes
+ def get_content(self, strict=True):
+ # type: (bool) -> bytes
"""
The HTTP message body decoded with the content-encoding header (e.g. gzip)
Raises:
- ValueError, when getting the content and the content-encoding is invalid.
+ ValueError, when the content-encoding is invalid and strict is True.
See also: :py:class:`raw_content`, :py:attr:`text`
"""
+ if self.raw_content is None:
+ return None
ce = self.headers.get("content-encoding")
cached = (
self._content_cache.encoded == self.raw_content and
+ (self._content_cache.strict or not strict) and
self._content_cache.encoding == ce
)
if not cached:
+ is_strict = True
if ce:
- decoded = encoding.decode(self.raw_content, ce)
+ try:
+ decoded = encoding.decode(self.raw_content, ce)
+ except ValueError:
+ if strict:
+ raise
+ is_strict = False
+ decoded = self.raw_content
else:
decoded = self.raw_content
- self._content_cache = CachedDecode(self.raw_content, ce, decoded)
+ self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded)
return self._content_cache.decoded
- @content.setter
- def content(self, value):
- if value is not None and not isinstance(value, bytes):
+ def set_content(self, value):
+ if value is None:
+ self.raw_content = None
+ return
+ if not isinstance(value, bytes):
raise TypeError(
"Message content must be bytes, not {}. "
"Please use .text if you want to assign a str."
@@ -153,24 +165,23 @@ class Message(basetypes.Serializable):
ce = self.headers.get("content-encoding")
cached = (
self._content_cache.decoded == value and
- self._content_cache.encoding == ce
+ self._content_cache.encoding == ce and
+ self._content_cache.strict
)
if not cached:
try:
- if ce and value is not None:
- encoded = encoding.encode(value, ce)
- else:
- encoded = value
+ encoded = encoding.encode(value, ce or "identity")
except ValueError:
# So we have an invalid content-encoding?
# Let's remove it!
del self.headers["content-encoding"]
ce = None
encoded = value
- self._content_cache = CachedDecode(encoded, ce, value)
+ self._content_cache = CachedDecode(encoded, ce, True, value)
self.raw_content = self._content_cache.encoded
- if isinstance(self.raw_content, bytes):
- self.headers["content-length"] = str(len(self.raw_content))
+ self.headers["content-length"] = str(len(self.raw_content))
+
+ content = property(get_content, set_content)
@property
def http_version(self):
@@ -211,69 +222,87 @@ class Message(basetypes.Serializable):
if ct:
return ct[2].get("charset")
- @property
- def text(self):
- # type: () -> six.text_type
+ def _guess_encoding(self):
+ # type: () -> str
+ enc = self._get_content_type_charset()
+ if enc:
+ return enc
+
+ if "json" in self.headers.get("content-type", ""):
+ return "utf8"
+ else:
+ # We may also want to check for HTML meta tags here at some point.
+ return "latin-1"
+
+ def get_text(self, strict=True):
+ # type: (bool) -> six.text_type
"""
The HTTP message body decoded with both content-encoding header (e.g. gzip)
and content-type header charset.
+ Raises:
+ ValueError, when either content-encoding or charset is invalid and strict is True.
+
See also: :py:attr:`content`, :py:class:`raw_content`
"""
- # This attribute should be called text, because that's what requests does.
- enc = self._get_content_type_charset()
-
- # We may also want to check for HTML meta tags here at some point.
+ if self.raw_content is None:
+ return None
+ enc = self._guess_encoding()
+ content = self.get_content(strict)
cached = (
- self._text_cache.encoded == self.content and
+ self._text_cache.encoded == content and
+ (self._text_cache.strict or not strict) and
self._text_cache.encoding == enc
)
if not cached:
+ is_strict = self._content_cache.strict
try:
- if not enc:
- raise ValueError()
- decoded = encoding.decode(self.content, enc)
+ decoded = encoding.decode(content, enc)
except ValueError:
- decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape")
- self._text_cache = CachedDecode(self.content, enc, decoded)
+ if strict:
+ raise
+ is_strict = False
+ decoded = self.content.decode(enc, "replace" if six.PY2 else "surrogateescape")
+ self._text_cache = CachedDecode(content, enc, is_strict, decoded)
return self._text_cache.decoded
- @text.setter
- def text(self, text):
- enc = self._get_content_type_charset()
+ def set_text(self, text):
+ if text is None:
+ self.content = None
+ return
+ enc = self._guess_encoding()
+
cached = (
self._text_cache.decoded == text and
- self._text_cache.encoding == enc
+ self._text_cache.encoding == enc and
+ self._text_cache.strict
)
if not cached:
try:
- if not enc:
- raise ValueError()
encoded = encoding.encode(text, enc)
except ValueError:
- # Do we have an unknown content-type charset?
- # If so, we want to replace it with utf8.
- if text and enc:
- self.headers["content-type"] = re.sub(
- "charset=[^;]+",
- "charset=utf-8",
- self.headers["content-type"]
- )
- encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape")
- self._text_cache = CachedDecode(encoded, enc, text)
+ # Fall back to UTF-8 and update the content-type header.
+ ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {})
+ ct[2]["charset"] = "utf-8"
+ self.headers["content-type"] = headers.assemble_content_type(*ct)
+ enc = "utf8"
+ encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape")
+ self._text_cache = CachedDecode(encoded, enc, True, text)
self.content = self._text_cache.encoded
- def decode(self):
+ text = property(get_text, set_text)
+
+ def decode(self, strict=True):
"""
Decodes body based on the current Content-Encoding header, then
removes the header. If there is no Content-Encoding header, no
action is taken.
Raises:
- ValueError, when the content-encoding is invalid.
+ ValueError, when the content-encoding is invalid and strict is True.
"""
- self.raw_content = self.content
+ self.raw_content = self.get_content(strict)
self.headers.pop("content-encoding", None)
def encode(self, e):
diff --git a/netlib/wsgi.py b/netlib/wsgi.py
index 2444f449..0def75b5 100644
--- a/netlib/wsgi.py
+++ b/netlib/wsgi.py
@@ -54,20 +54,20 @@ class WSGIAdaptor(object):
self.app, self.domain, self.port, self.sversion = app, domain, port, sversion
def make_environ(self, flow, errsoc, **extra):
+ """
+ Raises:
+ ValueError, if the content-encoding is invalid.
+ """
path = strutils.native(flow.request.path, "latin-1")
if '?' in path:
path_info, query = strutils.native(path, "latin-1").split('?', 1)
else:
path_info = path
query = ''
- try:
- content = flow.request.content
- except ValueError:
- content = flow.request.raw_content
environ = {
'wsgi.version': (1, 0),
'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"),
- 'wsgi.input': BytesIO(content or b""),
+ 'wsgi.input': BytesIO(flow.request.content or b""),
'wsgi.errors': errsoc,
'wsgi.multithread': True,
'wsgi.multiprocess': False,
diff --git a/test/netlib/http/test_headers.py b/test/netlib/http/test_headers.py
index 51819b86..8462a5af 100644
--- a/test/netlib/http/test_headers.py
+++ b/test/netlib/http/test_headers.py
@@ -1,4 +1,4 @@
-from netlib.http import Headers, parse_content_type
+from netlib.http.headers import Headers, parse_content_type, assemble_content_type
from netlib.tutils import raises
@@ -81,3 +81,10 @@ def test_parse_content_type():
v = p("text/html; charset=UTF-8")
assert v == ('text', 'html', {'charset': 'UTF-8'})
+
+
+def test_assemble_content_type():
+ p = assemble_content_type
+ assert p("text", "html", {}) == "text/html"
+ assert p("text", "html", {"charset": "utf8"}) == "text/html; charset=utf8"
+ assert p("text", "html", {"charset": "utf8", "foo": "bar"}) == "text/html; charset=utf8; foo=bar"
diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py
index ed7d3da5..8b178e04 100644
--- a/test/netlib/http/test_message.py
+++ b/test/netlib/http/test_message.py
@@ -142,6 +142,9 @@ class TestMessageContentEncoding(object):
r.content = b"bar"
assert e.call_count == 1
+ with tutils.raises(TypeError):
+ r.content = u"foo"
+
def test_unknown_ce(self):
r = tresp()
r.headers["content-encoding"] = "zopfli"
@@ -149,6 +152,7 @@ class TestMessageContentEncoding(object):
with tutils.raises(ValueError):
assert r.content
assert r.headers["content-encoding"]
+ assert r.get_content(strict=False) == b"foo"
def test_cannot_decode(self):
r = tresp()
@@ -157,12 +161,25 @@ class TestMessageContentEncoding(object):
with tutils.raises(ValueError):
assert r.content
assert r.headers["content-encoding"]
+ assert r.get_content(strict=False) == b"foo"
with tutils.raises(ValueError):
r.decode()
assert r.raw_content == b"foo"
assert "content-encoding" in r.headers
+ r.decode(strict=False)
+ assert r.content == b"foo"
+ assert "content-encoding" not in r.headers
+
+ def test_none(self):
+ r = tresp(content=None)
+ assert r.content is None
+ r.content = b"foo"
+ assert r.content is not None
+ r.content = None
+ assert r.content is None
+
def test_cannot_encode(self):
r = tresp()
r.encode("gzip")
@@ -175,12 +192,17 @@ class TestMessageContentEncoding(object):
assert "content-encoding" not in r.headers
assert r.raw_content == b"foo"
+ with tutils.raises(ValueError):
+ r.encode("zopfli")
+ assert r.raw_content == b"foo"
+ assert "content-encoding" not in r.headers
+
class TestMessageText(object):
def test_simple(self):
- r = tresp(content=b'\xc3\xbc')
- assert r.raw_content == b"\xc3\xbc"
- assert r.content == b"\xc3\xbc"
+ r = tresp(content=b'\xfc')
+ assert r.raw_content == b"\xfc"
+ assert r.content == b"\xfc"
assert r.text == u"ü"
r.encode("gzip")
@@ -189,8 +211,10 @@ class TestMessageText(object):
assert r.text == u"ü"
r.headers["content-type"] = "text/html; charset=latin1"
- assert r.content == b"\xc3\xbc"
+ r.content = b"\xc3\xbc"
assert r.text == u"ü"
+ r.headers["content-type"] = "text/html; charset=utf8"
+ assert r.text == u"ü"
r.encode("identity")
r.raw_content = b"foo"
@@ -201,16 +225,29 @@ class TestMessageText(object):
assert r.text
assert e.call_count == 0
+ def test_guess_json(self):
+ r = tresp(content=b'"\xc3\xbc"')
+ r.headers["content-type"] = "application/json"
+ assert r.text == u'"ü"'
+
+ def test_none(self):
+ r = tresp(content=None)
+ assert r.text is None
+ r.text = b"foo"
+ assert r.text is not None
+ r.text = None
+ assert r.text is None
+
def test_modify(self):
r = tresp()
r.text = u"ü"
- assert r.raw_content == b"\xc3\xbc"
+ assert r.raw_content == b"\xfc"
- r.headers["content-type"] = "text/html; charset=latin1"
+ r.headers["content-type"] = "text/html; charset=utf8"
r.text = u"ü"
- assert r.raw_content == b"\xfc"
- assert r.headers["content-length"] == "1"
+ assert r.raw_content == b"\xc3\xbc"
+ assert r.headers["content-length"] == "2"
r.encode("identity")
with mock.patch("netlib.encoding.encode") as e:
@@ -224,12 +261,18 @@ class TestMessageText(object):
r = tresp()
r.headers["content-type"] = "text/html; charset=wtf"
r.raw_content = b"foo"
- assert r.text == u"foo"
+ with tutils.raises(ValueError):
+ assert r.text == u"foo"
+ assert r.get_text(strict=False) == u"foo"
def test_cannot_decode(self):
r = tresp()
+ r.headers["content-type"] = "text/html; charset=utf8"
r.raw_content = b"\xFF"
- assert r.text == u'\ufffd' if six.PY2 else '\udcff'
+ with tutils.raises(ValueError):
+ assert r.text
+
+ assert r.get_text(strict=False) == u'\ufffd' if six.PY2 else '\udcff'
def test_cannot_encode(self):
r = tresp()
@@ -237,9 +280,19 @@ class TestMessageText(object):
assert "content-type" not in r.headers
assert r.raw_content is None
- r.headers["content-type"] = "text/html; charset=latin1"
+ r.headers["content-type"] = "text/html; charset=latin1; foo=bar"
r.text = u"☃"
- assert r.headers["content-type"] == "text/html; charset=utf-8"
+ assert r.headers["content-type"] == "text/html; charset=utf-8; foo=bar"
+ assert r.raw_content == b'\xe2\x98\x83'
+
+ r.headers["content-type"] = "gibberish"
+ r.text = u"☃"
+ assert r.headers["content-type"] == "text/plain; charset=utf-8"
+ assert r.raw_content == b'\xe2\x98\x83'
+
+ del r.headers["content-type"]
+ r.text = u"☃"
+ assert r.headers["content-type"] == "text/plain; charset=utf-8"
assert r.raw_content == b'\xe2\x98\x83'
r.headers["content-type"] = "text/html; charset=latin1"