improve message content semantics

author: Maximilian Hils <git@maximilianhils.com> 2016-07-15 22:50:33 -0700
committer: Maximilian Hils <git@maximilianhils.com> 2016-07-15 22:50:33 -0700
commit: a3c7c84d49c3e6563e7f37ef60c989f99ed96788 (patch)
tree: 578d596bb630f44cf0f6a26078d1070977f8a53d
parent: ca9de786fd7ed3edf7a485f7c019ac83d5abfc7f (diff)
download: mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.tar.gz
mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.tar.bz2
mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.zip
10 files changed, 194 insertions, 130 deletions
diff --git a/mitmproxy/console/common.py b/mitmproxy/console/common.py
index ef220b4c..41f4f243 100644
--- a/mitmproxy/console/common.py
+++ b/mitmproxy/console/common.py
@@ -257,16 +257,13 @@ def copy_flow_format_data(part, scope, flow):
         data = ""
         if scope in ("q", "a"):
             request = flow.request.copy()
-            try:
-                request.decode()
-            except ValueError:
-                pass
-            if request.raw_content is None:
+            request.decode(strict=False)
+            if request.content is None:
                 return None, "Request content is missing"
             if part == "h":
                 data += netlib.http.http1.assemble_request(request)
             elif part == "c":
-                data += request.raw_content
+                data += request.content
             else:
                 raise ValueError("Unknown part: {}".format(part))
         if scope == "a" and flow.request.raw_content and flow.response:
@@ -274,16 +271,13 @@ def copy_flow_format_data(part, scope, flow):
             data += "\r\n" * 2
         if scope in ("s", "a") and flow.response:
             response = flow.response.copy()
-            try:
-                response.decode()
-            except ValueError:
-                pass
-            if response.raw_content is None:
+            response.decode(strict=False)
+            if response.content is None:
                 return None, "Response content is missing"
             if part == "h":
                 data += netlib.http.http1.assemble_response(response)
             elif part == "c":
-                data += response.raw_content
+                data += response.content
             else:
                 raise ValueError("Unknown part: {}".format(part))
     return data, False
@@ -393,22 +387,14 @@ def ask_save_body(part, master, state, flow):
             ask_save_body("q", master, state, flow)
 
     elif part == "q" and request_has_content:
-        try:
-            content = flow.request.content
-        except ValueError:
-            content = flow.request.raw_content
         ask_save_path(
             "Save request content",
-            content
+            flow.request.get_content(strict=False),
         )
     elif part == "s" and response_has_content:
-        try:
-            content = flow.response.content
-        except ValueError:
-            content = flow.response.raw_content
         ask_save_path(
             "Save response content",
-            content
+            flow.response.get_content(strict=False),
         )
     else:
         signals.status_message.send(message="No content to save.")
diff --git a/mitmproxy/console/flowview.py b/mitmproxy/console/flowview.py
index d994e670..f8686b41 100644
--- a/mitmproxy/console/flowview.py
+++ b/mitmproxy/console/flowview.py
@@ -427,11 +427,7 @@ class FlowView(tabs.Tabs):
             # editing message bodies, this can cause problems. For now, I
             # just strip the newlines off the end of the body when we return
             # from an editor.
-            try:
-                content = message.content
-            except ValueError:
-                content = message.raw_content
-            c = self.master.spawn_editor(content or b"")
+            c = self.master.spawn_editor(message.get_content(strict=False) or b"")
             message.content = c.rstrip(b"\n")
         elif part == "f":
             if not message.urlencoded_form and message.raw_content:
@@ -697,11 +693,7 @@ class FlowView(tabs.Tabs):
                 if conn.raw_content:
                     t = conn.headers.get("content-type")
                     if "EDITOR" in os.environ or "PAGER" in os.environ:
-                        try:
-                            content = conn.content
-                        except ValueError:
-                            content = conn.raw_content
-                        self.master.spawn_external_viewer(content, t)
+                        self.master.spawn_external_viewer(conn.get_content(strict=False), t)
                     else:
                         signals.status_message.send(
                             message = "Error! Set $EDITOR or $PAGER."
diff --git a/mitmproxy/dump.py b/mitmproxy/dump.py
index 0a9b76a7..14d55cd1 100644
--- a/mitmproxy/dump.py
+++ b/mitmproxy/dump.py
@@ -190,7 +190,7 @@ class DumpMaster(flow.FlowMaster):
             try:
                 content = message.content
             except ValueError:
-                content = message.raw_content
+                content = message.get_content(strict=False)
 
             if content is None:
                 self.echo("(content missing)", indent=4)
diff --git a/mitmproxy/filt.py b/mitmproxy/filt.py
index e8687b9f..a42988f1 100644
--- a/mitmproxy/filt.py
+++ b/mitmproxy/filt.py
@@ -194,17 +194,11 @@ class FBod(_Rex):
 
     def __call__(self, f):
         if f.request and f.request.raw_content:
-            try:
-                if self.re.search(f.request.content):
-                    return True
-            except ValueError:
-                pass
+            if self.re.search(f.request.get_content(strict=False)):
+                return True
         if f.response and f.response.raw_content:
-            try:
-                if self.re.search(f.response.content):
-                    return True
-            except ValueError:
-                pass
+            if self.re.search(f.response.get_content(strict=False)):
+                return True
         return False
 
 
@@ -214,11 +208,8 @@ class FBodRequest(_Rex):
 
     def __call__(self, f):
         if f.request and f.request.raw_content:
-            try:
-                if self.re.search(f.request.content):
-                    return True
-            except ValueError:
-                pass
+            if self.re.search(f.request.get_content(strict=False)):
+                return True
 
 
 class FBodResponse(_Rex):
@@ -227,11 +218,8 @@ class FBodResponse(_Rex):
 
     def __call__(self, f):
         if f.response and f.response.raw_content:
-            try:
-                if self.re.search(f.response.content):
-                    return True
-            except ValueError:
-                pass
+            if self.re.search(f.response.get_content(strict=False)):
+                return True
 
 
 class FMethod(_Rex):
diff --git a/mitmproxy/flow/export.py b/mitmproxy/flow/export.py
index 9da18f22..4659af7b 100644
--- a/mitmproxy/flow/export.py
+++ b/mitmproxy/flow/export.py
@@ -20,12 +20,9 @@ def curl_command(flow):
     data = "curl "
 
     request = flow.request.copy()
-    try:
-        request.decode()
-    except ValueError:
-        pass
+    request.decode(strict=False)
 
-    for k, v in request.headers.fields:
+    for k, v in request.headers.items(multi=True):
         data += "-H '%s:%s' " % (k, v)
 
     if request.method != "GET":
@@ -34,8 +31,8 @@ def curl_command(flow):
     full_url = request.scheme + "://" + request.host + request.path
     data += "'%s'" % full_url
 
-    if request.raw_content:
-        data += " --data-binary '%s'" % request.raw_content
+    if request.content:
+        data += " --data-binary '%s'" % request.content
 
     return data
 
diff --git a/netlib/http/headers.py b/netlib/http/headers.py
index f052a53b..13a8c98f 100644
--- a/netlib/http/headers.py
+++ b/netlib/http/headers.py
@@ -204,3 +204,15 @@ def parse_content_type(c):
             if len(clause) == 2:
                 d[clause[0].strip()] = clause[1].strip()
     return ts[0].lower(), ts[1].lower(), d
+
+
+def assemble_content_type(type, subtype, parameters):
+    if not parameters:
+        return "{}/{}".format(type, subtype)
+    params = "; ".join(
+        "{}={}".format(k, v)
+        for k, v in parameters.items()
+    )
+    return "{}/{}; {}".format(
+        type, subtype, params
+    )
diff --git a/netlib/http/message.py b/netlib/http/message.py
index 86ff64d1..1252ed25 100644
--- a/netlib/http/message.py
+++ b/netlib/http/message.py
@@ -53,14 +53,15 @@ class MessageData(basetypes.Serializable):
 
 
 class CachedDecode(object):
-    __slots__ = ["encoded", "encoding", "decoded"]
+    __slots__ = ["encoded", "encoding", "strict", "decoded"]
 
-    def __init__(self, object, encoding, decoded):
+    def __init__(self, object, encoding, strict, decoded):
         self.encoded = object
         self.encoding = encoding
+        self.strict = strict
         self.decoded = decoded
 
-no_cached_decode = CachedDecode(None, None, None)
+no_cached_decode = CachedDecode(None, None, None, None)
 
 
 class Message(basetypes.Serializable):
@@ -118,33 +119,44 @@ class Message(basetypes.Serializable):
     def raw_content(self, content):
         self.data.content = content
 
-    @property
-    def content(self):
-        # type: () -> bytes
+    def get_content(self, strict=True):
+        # type: (bool) -> bytes
         """
         The HTTP message body decoded with the content-encoding header (e.g. gzip)
 
         Raises:
-            ValueError, when getting the content and the content-encoding is invalid.
+            ValueError, when the content-encoding is invalid and strict is True.
 
         See also: :py:class:`raw_content`, :py:attr:`text`
         """
+        if self.raw_content is None:
+            return None
         ce = self.headers.get("content-encoding")
         cached = (
             self._content_cache.encoded == self.raw_content and
+            (self._content_cache.strict or not strict) and
             self._content_cache.encoding == ce
         )
         if not cached:
+            is_strict = True
             if ce:
-                decoded = encoding.decode(self.raw_content, ce)
+                try:
+                    decoded = encoding.decode(self.raw_content, ce)
+                except ValueError:
+                    if strict:
+                        raise
+                    is_strict = False
+                    decoded = self.raw_content
             else:
                 decoded = self.raw_content
-            self._content_cache = CachedDecode(self.raw_content, ce, decoded)
+            self._content_cache = CachedDecode(self.raw_content, ce, is_strict, decoded)
         return self._content_cache.decoded
 
-    @content.setter
-    def content(self, value):
-        if value is not None and not isinstance(value, bytes):
+    def set_content(self, value):
+        if value is None:
+            self.raw_content = None
+            return
+        if not isinstance(value, bytes):
             raise TypeError(
                 "Message content must be bytes, not {}. "
                 "Please use .text if you want to assign a str."
@@ -153,24 +165,23 @@ class Message(basetypes.Serializable):
         ce = self.headers.get("content-encoding")
         cached = (
             self._content_cache.decoded == value and
-            self._content_cache.encoding == ce
+            self._content_cache.encoding == ce and
+            self._content_cache.strict
         )
         if not cached:
             try:
-                if ce and value is not None:
-                    encoded = encoding.encode(value, ce)
-                else:
-                    encoded = value
+                encoded = encoding.encode(value, ce or "identity")
             except ValueError:
                 # So we have an invalid content-encoding?
                 # Let's remove it!
                 del self.headers["content-encoding"]
                 ce = None
                 encoded = value
-            self._content_cache = CachedDecode(encoded, ce, value)
+            self._content_cache = CachedDecode(encoded, ce, True, value)
         self.raw_content = self._content_cache.encoded
-        if isinstance(self.raw_content, bytes):
-            self.headers["content-length"] = str(len(self.raw_content))
+        self.headers["content-length"] = str(len(self.raw_content))
+
+    content = property(get_content, set_content)
 
     @property
     def http_version(self):
@@ -211,69 +222,87 @@ class Message(basetypes.Serializable):
         if ct:
             return ct[2].get("charset")
 
-    @property
-    def text(self):
-        # type: () -> six.text_type
+    def _guess_encoding(self):
+        # type: () -> str
+        enc = self._get_content_type_charset()
+        if enc:
+            return enc
+
+        if "json" in self.headers.get("content-type", ""):
+            return "utf8"
+        else:
+            # We may also want to check for HTML meta tags here at some point.
+            return "latin-1"
+
+    def get_text(self, strict=True):
+        # type: (bool) -> six.text_type
         """
         The HTTP message body decoded with both content-encoding header (e.g. gzip)
         and content-type header charset.
 
+        Raises:
+            ValueError, when either content-encoding or charset is invalid and strict is True.
+
         See also: :py:attr:`content`, :py:class:`raw_content`
         """
-        # This attribute should be called text, because that's what requests does.
-        enc = self._get_content_type_charset()
-
-        # We may also want to check for HTML meta tags here at some point.
+        if self.raw_content is None:
+            return None
+        enc = self._guess_encoding()
 
+        content = self.get_content(strict)
         cached = (
-            self._text_cache.encoded == self.content and
+            self._text_cache.encoded == content and
+            (self._text_cache.strict or not strict) and
             self._text_cache.encoding == enc
         )
         if not cached:
+            is_strict = self._content_cache.strict
             try:
-                if not enc:
-                    raise ValueError()
-                decoded = encoding.decode(self.content, enc)
+                decoded = encoding.decode(content, enc)
             except ValueError:
-                decoded = self.content.decode("utf8", "replace" if six.PY2 else "surrogateescape")
-            self._text_cache = CachedDecode(self.content, enc, decoded)
+                if strict:
+                    raise
+                is_strict = False
+                decoded = content.decode(enc, "replace" if six.PY2 else "surrogateescape")  # not self.content: that getter is strict
+            self._text_cache = CachedDecode(content, enc, is_strict, decoded)
         return self._text_cache.decoded
 
-    @text.setter
-    def text(self, text):
-        enc = self._get_content_type_charset()
+    def set_text(self, text):
+        if text is None:
+            self.content = None
+            return
+        enc = self._guess_encoding()
+
         cached = (
             self._text_cache.decoded == text and
-            self._text_cache.encoding == enc
+            self._text_cache.encoding == enc and
+            self._text_cache.strict
         )
         if not cached:
             try:
-                if not enc:
-                    raise ValueError()
                 encoded = encoding.encode(text, enc)
             except ValueError:
-                # Do we have an unknown content-type charset?
-                # If so, we want to replace it with utf8.
-                if text and enc:
-                    self.headers["content-type"] = re.sub(
-                        "charset=[^;]+",
-                        "charset=utf-8",
-                        self.headers["content-type"]
-                    )
-                encoded = text.encode("utf8", "replace" if six.PY2 else "surrogateescape")
-            self._text_cache = CachedDecode(encoded, enc, text)
+                # Fall back to UTF-8 and update the content-type header.
+                ct = headers.parse_content_type(self.headers.get("content-type", "")) or ("text", "plain", {})
+                ct[2]["charset"] = "utf-8"
+                self.headers["content-type"] = headers.assemble_content_type(*ct)
+                enc = "utf8"
+                encoded = text.encode(enc, "replace" if six.PY2 else "surrogateescape")
+            self._text_cache = CachedDecode(encoded, enc, True, text)
         self.content = self._text_cache.encoded
 
-    def decode(self):
+    text = property(get_text, set_text)
+
+    def decode(self, strict=True):
         """
         Decodes body based on the current Content-Encoding header, then
         removes the header. If there is no Content-Encoding header, no
         action is taken.
 
         Raises:
-            ValueError, when the content-encoding is invalid.
+            ValueError, when the content-encoding is invalid and strict is True.
         """
-        self.raw_content = self.content
+        self.raw_content = self.get_content(strict)
         self.headers.pop("content-encoding", None)
 
     def encode(self, e):
diff --git a/netlib/wsgi.py b/netlib/wsgi.py
index 2444f449..0def75b5 100644
--- a/netlib/wsgi.py
+++ b/netlib/wsgi.py
@@ -54,20 +54,20 @@ class WSGIAdaptor(object):
         self.app, self.domain, self.port, self.sversion = app, domain, port, sversion
 
     def make_environ(self, flow, errsoc, **extra):
+        """
+        Raises:
+            ValueError, if the content-encoding is invalid.
+        """
         path = strutils.native(flow.request.path, "latin-1")
         if '?' in path:
             path_info, query = strutils.native(path, "latin-1").split('?', 1)
         else:
             path_info = path
             query = ''
-        try:
-            content = flow.request.content
-        except ValueError:
-            content = flow.request.raw_content
         environ = {
             'wsgi.version': (1, 0),
             'wsgi.url_scheme': strutils.native(flow.request.scheme, "latin-1"),
-            'wsgi.input': BytesIO(content or b""),
+            'wsgi.input': BytesIO(flow.request.content or b""),
             'wsgi.errors': errsoc,
             'wsgi.multithread': True,
             'wsgi.multiprocess': False,
diff --git a/test/netlib/http/test_headers.py b/test/netlib/http/test_headers.py
index 51819b86..8462a5af 100644
--- a/test/netlib/http/test_headers.py
+++ b/test/netlib/http/test_headers.py
@@ -1,4 +1,4 @@
-from netlib.http import Headers, parse_content_type
+from netlib.http.headers import Headers, parse_content_type, assemble_content_type
 from netlib.tutils import raises
 
 
@@ -81,3 +81,10 @@ def test_parse_content_type():
 
     v = p("text/html; charset=UTF-8")
     assert v == ('text', 'html', {'charset': 'UTF-8'})
+
+
+def test_assemble_content_type():
+    p = assemble_content_type
+    assert p("text", "html", {}) == "text/html"
+    assert p("text", "html", {"charset": "utf8"}) == "text/html; charset=utf8"
+    assert p("text", "html", {"charset": "utf8", "foo": "bar"}) == "text/html; charset=utf8; foo=bar"
diff --git a/test/netlib/http/test_message.py b/test/netlib/http/test_message.py
index ed7d3da5..8b178e04 100644
--- a/test/netlib/http/test_message.py
+++ b/test/netlib/http/test_message.py
@@ -142,6 +142,9 @@ class TestMessageContentEncoding(object):
             r.content = b"bar"
             assert e.call_count == 1
 
+        with tutils.raises(TypeError):
+            r.content = u"foo"
+
     def test_unknown_ce(self):
         r = tresp()
         r.headers["content-encoding"] = "zopfli"
@@ -149,6 +152,7 @@ class TestMessageContentEncoding(object):
         with tutils.raises(ValueError):
             assert r.content
         assert r.headers["content-encoding"]
+        assert r.get_content(strict=False) == b"foo"
 
     def test_cannot_decode(self):
         r = tresp()
@@ -157,12 +161,25 @@ class TestMessageContentEncoding(object):
         with tutils.raises(ValueError):
             assert r.content
         assert r.headers["content-encoding"]
+        assert r.get_content(strict=False) == b"foo"
 
         with tutils.raises(ValueError):
             r.decode()
         assert r.raw_content == b"foo"
         assert "content-encoding" in r.headers
 
+        r.decode(strict=False)
+        assert r.content == b"foo"
+        assert "content-encoding" not in r.headers
+
+    def test_none(self):
+        r = tresp(content=None)
+        assert r.content is None
+        r.content = b"foo"
+        assert r.content is not None
+        r.content = None
+        assert r.content is None
+
     def test_cannot_encode(self):
         r = tresp()
         r.encode("gzip")
@@ -175,12 +192,17 @@ class TestMessageContentEncoding(object):
         assert "content-encoding" not in r.headers
         assert r.raw_content == b"foo"
 
+        with tutils.raises(ValueError):
+            r.encode("zopfli")
+        assert r.raw_content == b"foo"
+        assert "content-encoding" not in r.headers
+
 
 class TestMessageText(object):
     def test_simple(self):
-        r = tresp(content=b'\xc3\xbc')
-        assert r.raw_content == b"\xc3\xbc"
-        assert r.content == b"\xc3\xbc"
+        r = tresp(content=b'\xfc')
+        assert r.raw_content == b"\xfc"
+        assert r.content == b"\xfc"
         assert r.text == u"ü"
 
         r.encode("gzip")
@@ -189,8 +211,10 @@ class TestMessageText(object):
         assert r.text == u"ü"
 
         r.headers["content-type"] = "text/html; charset=latin1"
-        assert r.content == b"\xc3\xbc"
+        r.content = b"\xc3\xbc"
         assert r.text == u"Ã¼"
+        r.headers["content-type"] = "text/html; charset=utf8"
+        assert r.text == u"ü"
 
         r.encode("identity")
         r.raw_content = b"foo"
@@ -201,16 +225,29 @@ class TestMessageText(object):
             assert r.text
             assert e.call_count == 0
 
+    def test_guess_json(self):
+        r = tresp(content=b'"\xc3\xbc"')
+        r.headers["content-type"] = "application/json"
+        assert r.text == u'"ü"'
+
+    def test_none(self):
+        r = tresp(content=None)
+        assert r.text is None
+        r.text = u"foo"
+        assert r.text is not None
+        r.text = None
+        assert r.text is None
+
     def test_modify(self):
         r = tresp()
 
         r.text = u"ü"
-        assert r.raw_content == b"\xc3\xbc"
+        assert r.raw_content == b"\xfc"
 
-        r.headers["content-type"] = "text/html; charset=latin1"
+        r.headers["content-type"] = "text/html; charset=utf8"
         r.text = u"ü"
-        assert r.raw_content == b"\xfc"
-        assert r.headers["content-length"] == "1"
+        assert r.raw_content == b"\xc3\xbc"
+        assert r.headers["content-length"] == "2"
 
         r.encode("identity")
         with mock.patch("netlib.encoding.encode") as e:
@@ -224,12 +261,18 @@ class TestMessageText(object):
         r = tresp()
         r.headers["content-type"] = "text/html; charset=wtf"
         r.raw_content = b"foo"
-        assert r.text == u"foo"
+        with tutils.raises(ValueError):
+            assert r.text == u"foo"
+        assert r.get_text(strict=False) == u"foo"
 
     def test_cannot_decode(self):
         r = tresp()
+        r.headers["content-type"] = "text/html; charset=utf8"
         r.raw_content = b"\xFF"
-        assert r.text == u'\ufffd' if six.PY2 else '\udcff'
+        with tutils.raises(ValueError):
+            assert r.text
+
+        assert r.get_text(strict=False) == (u'\ufffd' if six.PY2 else '\udcff')
 
     def test_cannot_encode(self):
         r = tresp()
@@ -237,9 +280,19 @@ class TestMessageText(object):
         assert "content-type" not in r.headers
         assert r.raw_content is None
 
-        r.headers["content-type"] = "text/html; charset=latin1"
+        r.headers["content-type"] = "text/html; charset=latin1; foo=bar"
         r.text = u"☃"
-        assert r.headers["content-type"] == "text/html; charset=utf-8"
+        assert r.headers["content-type"] == "text/html; charset=utf-8; foo=bar"
+        assert r.raw_content == b'\xe2\x98\x83'
+
+        r.headers["content-type"] = "gibberish"
+        r.text = u"☃"
+        assert r.headers["content-type"] == "text/plain; charset=utf-8"
+        assert r.raw_content == b'\xe2\x98\x83'
+
+        del r.headers["content-type"]
+        r.text = u"☃"
+        assert r.headers["content-type"] == "text/plain; charset=utf-8"
         assert r.raw_content == b'\xe2\x98\x83'
 
         r.headers["content-type"] = "text/html; charset=latin1"
author	Maximilian Hils <git@maximilianhils.com>	2016-07-15 22:50:33 -0700
committer	Maximilian Hils <git@maximilianhils.com>	2016-07-15 22:50:33 -0700
commit	a3c7c84d49c3e6563e7f37ef60c989f99ed96788 (patch)
tree	578d596bb630f44cf0f6a26078d1070977f8a53d
parent	ca9de786fd7ed3edf7a485f7c019ac83d5abfc7f (diff)
download	mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.tar.gz mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.tar.bz2 mitmproxy-a3c7c84d49c3e6563e7f37ef60c989f99ed96788.zip