diff options
author | Maximilian Hils <git@maximilianhils.com> | 2016-07-15 22:50:33 -0700 |
---|---|---|
committer | Maximilian Hils <git@maximilianhils.com> | 2016-07-15 22:50:33 -0700 |
commit | a3c7c84d49c3e6563e7f37ef60c989f99ed96788 (patch) | |
tree | 578d596bb630f44cf0f6a26078d1070977f8a53d | |
parent | ca9de786fd7ed3edf7a485f7c019ac83d5abfc7f (diff) | |
download | mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.tar.gz mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.tar.bz2 mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.zip |
improve message content semantics
-rw-r--r-- | mitmproxy/console/common.py | 30 | ||||
-rw-r--r-- | mitmproxy/console/flowview.py | 12 | ||||
-rw-r--r-- | mitmproxy/dump.py | 2 | ||||
-rw-r--r-- | mitmproxy/filt.py | 28 | ||||
-rw-r--r-- | mitmproxy/flow/export.py | 11 | ||||
-rw-r--r-- | netlib/http/headers.py | 12 | ||||
-rw-r--r-- | netlib/http/message.py | 133 | ||||
-rw-r--r-- | netlib/wsgi.py | 10 | ||||
-rw-r--r-- | test/netlib/http/test_headers.py | 9 | ||||
-rw-r--r-- | test/netlib/http/test_message.py | 77 |
10 files changed, 194 insertions, 130 deletions
diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py index ef220b4c..41f4f243 100644 --- a/mitmproxy/console/common.py +++ b/mitmproxy/console/common.py @@ -257,16 +257,13 @@ def copy_flow_format_data(part, scope, flow): data = "" if scope in ("q", "a"): request = flow.request.copy() - try: - request.decode() - except ValueError: - pass - if request.raw_content is None: + request.decode(strict=False) + if request.content is None: return None, "Request content is missing" if part == "h": data += netlib.http.http1.assemble_request(request) elif part == "c": - data += request.raw_content + data += request.content else: raise ValueError("Unknown part: {}".format(part)) if scope == "a" and flow.request.raw_content and flow.response: @@ -274,16 +271,13 @@ def copy_flow_format_data(part, scope, flow): data += "\r\n" * 2 if scope in ("s", "a") and flow.response: response = flow.response.copy() - try: - response.decode() - except ValueError: - pass - if response.raw_content is None: + response.decode(strict=False) + if response.content is None: return None, "Response content is missing" if part == "h": data += netlib.http.http1.assemble_response(response) elif part == "c": - data += response.raw_content + data += response.content else: raise ValueError("Unknown part: {}".format(part)) return data, False @@ -393,22 +387,14 @@ def ask_save_body(part, master, state, flow): ask_save_body("q", master, state, flow) elif part == "q" and request_has_content: - try: - content = flow.request.content - except ValueError: - content = flow.request.raw_content ask_save_path( "Save request content", - content + flow.request.get_content(strict=False), ) elif part == "s" and response_has_content: - try: - content = flow.response.content - except ValueError: - content = flow.response.raw_content ask_save_path( "Save response content", - content + flow.response.get_content(strict=False), ) else: signals.status_message.send(message="No content to save.") diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py index d994e670..f8686b41 100644 --- a/mitmproxy/console/flowview.py +++ b/mitmproxy/console/flowview.py @@ -427,11 +427,7 @@ class FlowView(tabs.Tabs): # editing message bodies, this can cause problems. For now, I # just strip the newlines off the end of the body when we return # from an editor. - try: - content = message.content - except ValueError: - content = message.raw_content - c = self.master.spawn_editor(content or b"") + c = self.master.spawn_editor(message.get_content(strict=False) or b"") message.content = c.rstrip(b"\n") elif part == "f": if not message.urlencoded_form and message.raw_content: @@ -697,11 +693,7 @@ class FlowView(tabs.Tabs): if conn.raw_content: t = conn.headers.get("content-type") if "EDITOR" in os.environ or "PAGER" in os.environ: - try: - content = conn.content - except ValueError: - content = conn.raw_content - self.master.spawn_external_viewer(content, t) + self.master.spawn_external_viewer(conn.get_content(strict=False), t) else: signals.status_message.send( message = "Error! Set $EDITOR or $PAGER." diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py index 0a9b76a7..14d55cd1 100644 --- a/mitmproxy/dump.py +++ b/mitmproxy/dump.py @@ -190,7 +190,7 @@ class DumpMaster(flow.FlowMaster): try: content = message.content except ValueError: - content = message.raw_content + content = message.get_content(strict=False) if content is None: self.echo("(content missing)", indent=4) diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py index e8687b9f..a42988f1 100644 --- a/mitmproxy/filt.py +++ b/mitmproxy/filt.py @@ -194,17 +194,11 @@ class FBod(_Rex): def __call__(self, f): if f.request and f.request.raw_content: - try: - if self.re.search(f.request.content): - return True - except ValueError: - pass + if self.re.search(f.request.get_content(strict=False)): + return True if f.response and f.response.raw_content: - try: - if self.re.search(f.response.content): - return True - except ValueError: - pass + if self.re.search(f.response.get_content(strict=False)): + return True return False @@ -214,11 +208,8 @@ class FBodRequest(_Rex): def __call__(self, f): if f.request and f.request.raw_content: - try: - if self.re.search(f.request.content): - return True - except ValueError: - pass + if self.re.search(f.request.get_content(strict=False)): + return True class FBodResponse(_Rex): @@ -227,11 +218,8 @@ class FBodResponse(_Rex): def __call__(self, f): if f.response and f.response.raw_content: - try: - if self.re.search(f.response.content): - return True - except ValueError: - pass + if self.re.search(f.response.get_content(strict=False)): + return True class FMethod(_Rex): diff --git a/mitmproxy/flow/export.py b/mitmproxy/flow/export.py index 9da18f22..4659af7b 100644 --- a/mitmproxy/flow/export.py +++ b/mitmproxy/flow/export.py @@ -20,12 +20,9 @@ def curl_command(flow): data = "curl " request = flow.request.copy() - try: - request.decode() - except ValueError: - pass + request.decode(strict=False) - for k, v in request.headers.fields: + for k, v in request.headers.items(multi=True): data += "-H '%s:%s' " % (k, v) if request.method != "GET": @@ -34,8 +31,8 @@ def curl_command(flow): full_url = request.scheme + "://" + request.host + request.path data += "'%s'" % full_url - if request.raw_content: - data += " --data-binary '%s'" % request.raw_content + if request.content: + data += " --data-binary '%s'" % request.content return data diff --git a/netlib/http/headers.py b/netlib/http/headers.py index f052a53b..13a8c98f 100644 --- a/netlib/http/headers.py +++ b/netlib/http/headers.py @@ -204,3 +204,15 @@ def parse_content_type(c): if len(clause) == 2: d[clause[0].strip()] = clause[1].strip() return ts[0].lower(), ts[1].lower(), d + + +def assemble_content_type(type, subtype, parameters): + if not parameters: + return "{}/{}".format(type, subtype) + params = "; ".join( + "{}={}".format(k, v) + for k, v in parameters.items() + ) + return "{}/{}; {}".format( + type, subtype, params + ) diff --git a/netlib/http/message.py b/netlib/http/message.py index 86ff64d1..1252ed25 100644 --- a/netlib/http/message.py +++ b/netlib/http/message.py @@ -53,14 +53,15 @@ class MessageData(basetypes.Serializable): class CachedDecode(object): - __slots__ = ["encoded", "encoding", "decoded"] + __slots__ = ["encoded", "encoding", "strict", "decoded"] - def __init__(self, object, encoding, decoded): + def __init__(self, object, encoding, strict, decoded): self.encoded = object self.encoding = encoding + self.strict = strict self.decoded = decoded -no_cached_decode = CachedDecode(None, None, None) +no_cached_decode = CachedDecode(None, None, None, None) class Message(basetypes.Serializable): @@ -118,33 +119,44 @@ class Message(basetypes.Serializable): def raw_content(self, content): self.data.content = content - @property - def content(self): - # type: () -> bytes + def get_content(self, strict=True): + # type: (bool) -> bytes """ The HTTP message body decoded with the content-encoding header (e.g. gzip) Raises: - ValueError, when getting the content and the content-encoding is invalid. + ValueError, when the content-encoding is invalid and strict is True. See also: :py:class:`raw_content`, :py:attr:`text` """ + if self.raw_content is None: + return None ce = self.headers.get("content-encoding") cached = ( self._content_cache.encoded == self.raw_content and + (self._content_cache.strict or not strict) and self._content_cache.encoding == ce ) if not cached: + is_strict = True if ce: - decoded = encoding.decode(self.raw_content, ce) + try: + decoded = encoding.decode(self.raw_content, ce) + except ValueError: + if strict: + raise + is_strict = False + decoded = self.raw_content else: decoded = self.raw_content - self._content_cache = CachedDecode(self.raw_content, ce, decoded) + self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded) return self._content_cache.decoded - @content.setter - def content(self, value): - if value is not None and not isinstance(value, bytes): + def set_content(self, value): + if value is None: + self.raw_content = None + return + if not isinstance(value, bytes): raise TypeError( "Message content must be bytes, not {}. " "Please use .text if you want to assign a str." @@ -153,24 +165,23 @@ class Message(basetypes.Serializable): ce = self.headers.get("content-encoding") cached = ( self._content_cache.decoded == value and - self._content_cache.encoding == ce + self._content_cache.encoding == ce and + self._content_cache.strict ) if not cached: try: - if ce and value is not None: - encoded = encoding.encode(value, ce) - else: - encoded = value + encoded = encoding.encode(value, ce or "identity") except ValueError: # So we have an invalid content-encoding? # Let's remove it! del self.headers["content-encoding"] ce = None encoded = value - self._content_cache = CachedDecode(encoded, ce, value) + self._content_cache = CachedDecode(encoded, ce, True, value) self.raw_content = self._content_cache.encoded - if isinstance(self.raw_content, bytes): - self.headers["content-length"] = str(len(self.raw_content)) + self.headers["content-length"] = str(len(self.raw_content)) + + content = property(get_content, set_content) @property def http_version(self): @@ -211,69 +222,87 @@ class Message(basetypes.Serializable): if ct: return ct[2].get("charset") - @property - def text(self): - # type: () -> six.text_type + def _guess_encoding(self): + # type: () -> str + enc = self._get_content_type_charset() + if enc: + return enc + + if "json" in self.headers.get("content-type", ""): + return "utf8" + else: + # We may also want to check for HTML meta tags here at some point. + return "latin-1" + + def get_text(self, strict=True): + # type: (bool) -> six.text_type """ The HTTP message body decoded with both content-encoding header (e.g. gzip) and content-type header charset. + Raises: + ValueError, when either content-encoding or charset is invalid and strict is True. + See also: :py:attr:`content`, :py:class:`raw_content` """ - # This attribute should be called text, because that's what requests does. - enc = self._get_content_type_charset() - - # We may also want to check for HTML meta tags here at some point. + if self.raw_content is None: + return None + enc = self._guess_encoding() + content = self.get_content(strict) cached = ( - self._text_cache.encoded == self.content and + self._text_cache.encoded == content and + (self._text_cache.strict or not strict) and self._text_cache.encoding == enc ) if not cached: + is_strict = self._content_cache.strict try: - if not enc: - raise ValueError() - decoded = encoding.decode(self.content, enc) + decoded = encoding.decode(content, enc) except ValueError: - decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape") - self._text_cache = CachedDecode(self.content, enc, decoded) + if strict: + raise + is_strict = False + decoded = self.content.decode(enc, "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(content, enc, is_strict, decoded) return self._text_cache.decoded - @text.setter - def text(self, text): - enc = self._get_content_type_charset() + def set_text(self, text): + if text is None: + self.content = None + return + enc = self._guess_encoding() + cached = ( self._text_cache.decoded == text and - self._text_cache.encoding == enc + self._text_cache.encoding == enc and + self._text_cache.strict ) if not cached: try: - if not enc: - raise ValueError() encoded = encoding.encode(text, enc) except ValueError: - # Do we have an unknown content-type charset? - # If so, we want to replace it with utf8. - if text and enc: - self.headers["content-type"] = re.sub( - "charset=[^;]+", - "charset=utf-8", - self.headers["content-type"] - ) - encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape") - self._text_cache = CachedDecode(encoded, enc, text) + # Fall back to UTF-8 and update the content-type header. + ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {}) + ct[2]["charset"] = "utf-8" + self.headers["content-type"] = headers.assemble_content_type(*ct) + enc = "utf8" + encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape") + self._text_cache = CachedDecode(encoded, enc, True, text) self.content = self._text_cache.encoded - def decode(self): + text = property(get_text, set_text) + + def decode(self, strict=True): """ Decodes body based on the current Content-Encoding header, then removes the header. If there is no Content-Encoding header, no action is taken. Raises: - ValueError, when the content-encoding is invalid. + ValueError, when the content-encoding is invalid and strict is True. """ - self.raw_content = self.content + self.raw_content = self.get_content(strict) self.headers.pop("content-encoding", None) def encode(self, e): diff --git a/netlib/wsgi.py b/netlib/wsgi.py index 2444f449..0def75b5 100644 --- a/netlib/wsgi.py +++ b/netlib/wsgi.py @@ -54,20 +54,20 @@ class WSGIAdaptor(object): self.app, self.domain, self.port, self.sversion = app, domain, port, sversion def make_environ(self, flow, errsoc, **extra): + """ + Raises: + ValueError, if the content-encoding is invalid. + """ path = strutils.native(flow.request.path, "latin-1") if '?' in path: path_info, query = strutils.native(path, "latin-1").split('?', 1) else: path_info = path query = '' - try: - content = flow.request.content - except ValueError: - content = flow.request.raw_content environ = { 'wsgi.version': (1, 0), 'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"), - 'wsgi.input': BytesIO(content or b""), + 'wsgi.input': BytesIO(flow.request.content or b""), 'wsgi.errors': errsoc, 'wsgi.multithread': True, 'wsgi.multiprocess': False, diff --git a/test/netlib/http/test_headers.py b/test/netlib/http/test_headers.py index 51819b86..8462a5af 100644 --- a/test/netlib/http/test_headers.py +++ b/test/netlib/http/test_headers.py @@ -1,4 +1,4 @@ -from netlib.http import Headers, parse_content_type +from netlib.http.headers import Headers, parse_content_type, assemble_content_type from netlib.tutils import raises @@ -81,3 +81,10 @@ def test_parse_content_type(): v = p("text/html; charset=UTF-8") assert v == ('text', 'html', {'charset': 'UTF-8'}) + + +def test_assemble_content_type(): + p = assemble_content_type + assert p("text", "html", {}) == "text/html" + assert p("text", "html", {"charset": "utf8"}) == "text/html; charset=utf8" + assert p("text", "html", {"charset": "utf8", "foo": "bar"}) == "text/html; charset=utf8; foo=bar" diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py index ed7d3da5..8b178e04 100644 --- a/test/netlib/http/test_message.py +++ b/test/netlib/http/test_message.py @@ -142,6 +142,9 @@ class TestMessageContentEncoding(object): r.content = b"bar" assert e.call_count == 1 + with tutils.raises(TypeError): + r.content = u"foo" + def test_unknown_ce(self): r = tresp() r.headers["content-encoding"] = "zopfli" @@ -149,6 +152,7 @@ class TestMessageContentEncoding(object): with tutils.raises(ValueError): assert r.content assert r.headers["content-encoding"] + assert r.get_content(strict=False) == b"foo" def test_cannot_decode(self): r = tresp() @@ -157,12 +161,25 @@ class TestMessageContentEncoding(object): with tutils.raises(ValueError): assert r.content assert r.headers["content-encoding"] + assert r.get_content(strict=False) == b"foo" with tutils.raises(ValueError): r.decode() assert r.raw_content == b"foo" assert "content-encoding" in r.headers + r.decode(strict=False) + assert r.content == b"foo" + assert "content-encoding" not in r.headers + + def test_none(self): + r = tresp(content=None) + assert r.content is None + r.content = b"foo" + assert r.content is not None + r.content = None + assert r.content is None + def test_cannot_encode(self): r = tresp() r.encode("gzip") @@ -175,12 +192,17 @@ class TestMessageContentEncoding(object): assert "content-encoding" not in r.headers assert r.raw_content == b"foo" + with tutils.raises(ValueError): + r.encode("zopfli") + assert r.raw_content == b"foo" + assert "content-encoding" not in r.headers + class TestMessageText(object): def test_simple(self): - r = tresp(content=b'\xc3\xbc') - assert r.raw_content == b"\xc3\xbc" - assert r.content == b"\xc3\xbc" + r = tresp(content=b'\xfc') + assert r.raw_content == b"\xfc" + assert r.content == b"\xfc" assert r.text == u"ü" r.encode("gzip") @@ -189,8 +211,10 @@ class TestMessageText(object): assert r.text == u"ü" r.headers["content-type"] = "text/html; charset=latin1" - assert r.content == b"\xc3\xbc" + r.content = b"\xc3\xbc" assert r.text == u"ü" + r.headers["content-type"] = "text/html; charset=utf8" + assert r.text == u"ü" r.encode("identity") r.raw_content = b"foo" @@ -201,16 +225,29 @@ class TestMessageText(object): assert r.text assert e.call_count == 0 + def test_guess_json(self): + r = tresp(content=b'"\xc3\xbc"') + r.headers["content-type"] = "application/json" + assert r.text == u'"ü"' + + def test_none(self): + r = tresp(content=None) + assert r.text is None + r.text = b"foo" + assert r.text is not None + r.text = None + assert r.text is None + def test_modify(self): r = tresp() r.text = u"ü" - assert r.raw_content == b"\xc3\xbc" + assert r.raw_content == b"\xfc" - r.headers["content-type"] = "text/html; charset=latin1" + r.headers["content-type"] = "text/html; charset=utf8" r.text = u"ü" - assert r.raw_content == b"\xfc" - assert r.headers["content-length"] == "1" + assert r.raw_content == b"\xc3\xbc" + assert r.headers["content-length"] == "2" r.encode("identity") with mock.patch("netlib.encoding.encode") as e: @@ -224,12 +261,18 @@ class TestMessageText(object): r = tresp() r.headers["content-type"] = "text/html; charset=wtf" r.raw_content = b"foo" - assert r.text == u"foo" + with tutils.raises(ValueError): + assert r.text == u"foo" + assert r.get_text(strict=False) == u"foo" def test_cannot_decode(self): r = tresp() + r.headers["content-type"] = "text/html; charset=utf8" r.raw_content = b"\xFF" - assert r.text == u'\ufffd' if six.PY2 else '\udcff' + with tutils.raises(ValueError): + assert r.text + + assert r.get_text(strict=False) == u'\ufffd' if six.PY2 else '\udcff' def test_cannot_encode(self): r = tresp() @@ -237,9 +280,19 @@ class TestMessageText(object): assert "content-type" not in r.headers assert r.raw_content is None - r.headers["content-type"] = "text/html; charset=latin1" + r.headers["content-type"] = "text/html; charset=latin1; foo=bar" r.text = u"☃" - assert r.headers["content-type"] == "text/html; charset=utf-8" + assert r.headers["content-type"] == "text/html; charset=utf-8; foo=bar" + assert r.raw_content == b'\xe2\x98\x83' + + r.headers["content-type"] = "gibberish" + r.text = u"☃" + assert r.headers["content-type"] == "text/plain; charset=utf-8" + assert r.raw_content == b'\xe2\x98\x83' + + del r.headers["content-type"] + r.text = u"☃" + assert r.headers["content-type"] == "text/plain; charset=utf-8" assert r.raw_content == b'\xe2\x98\x83' r.headers["content-type"] = "text/html; charset=latin1" |