about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- libmproxy/console/contentview.py 67
-rw-r--r-- libmproxy/utils.py 45
-rw-r--r-- test/test_console_contentview.py 23
-rw-r--r-- test/test_utils.py 58
-rw-r--r-- todo 1
5 files changed, 77 insertions, 117 deletions
diff --git a/libmproxy/console/contentview.py b/libmproxy/console/contentview.py
index 0d725c9d..02394c6f 100644
--- a/libmproxy/console/contentview.py
+++ b/libmproxy/console/contentview.py
@@ -2,11 +2,12 @@ import re, cStringIO
import urwid
from PIL import Image
from PIL.ExifTags import TAGS
+import lxml.html, lxml.etree
import common
from .. import utils, encoding, flow
from ..contrib import jsbeautifier
-VIEW_CUTOFF = 1024*20
+VIEW_CUTOFF = 1024*200
VIEW_AUTO = 0
VIEW_JSON = 1
@@ -17,6 +18,7 @@ VIEW_JAVASCRIPT = 5
VIEW_IMAGE = 6
VIEW_RAW = 7
VIEW_HEX = 8
+VIEW_HTML = 9
VIEW_NAMES = {
VIEW_AUTO: "Auto",
@@ -28,35 +30,38 @@ VIEW_NAMES = {
VIEW_IMAGE: "Image",
VIEW_RAW: "Raw",
VIEW_HEX: "Hex",
+ VIEW_HTML: "HTML",
}
VIEW_PROMPT = (
("auto detect", "a"),
- ("hex", "h"),
+ ("hex", "e"),
+ ("html", "h"),
("image", "i"),
("javascript", "j"),
("json", "s"),
("raw", "r"),
("multipart", "m"),
("urlencoded", "u"),
- ("xmlish", "x"),
+ ("xml", "x"),
)
VIEW_SHORTCUTS = {
"a": VIEW_AUTO,
+ "x": VIEW_XML,
+ "h": VIEW_HTML,
"i": VIEW_IMAGE,
"j": VIEW_JAVASCRIPT,
"s": VIEW_JSON,
"u": VIEW_URLENCODED,
"m": VIEW_MULTIPART,
- "x": VIEW_XML,
"r": VIEW_RAW,
- "h": VIEW_HEX,
+ "e": VIEW_HEX,
}
CONTENT_TYPES_MAP = {
- "text/html": VIEW_XML,
+ "text/html": VIEW_HTML,
"application/json": VIEW_JSON,
"text/xml": VIEW_XML,
"multipart/form-data": VIEW_MULTIPART,
@@ -116,9 +121,34 @@ def view_hex(hdrs, content):
return "Hex", txt
-def view_xmlish(hdrs, content):
+def view_xml(hdrs, content):
+ parser = lxml.etree.XMLParser(remove_blank_text=True, resolve_entities=False, strip_cdata=False, recover=False)
+ try:
+ document = lxml.etree.fromstring(content, parser)
+ except lxml.etree.XMLSyntaxError, v:
+ print v
+ return None
+ docinfo = document.getroottree().docinfo
+
+ prev = []
+ p = document.getroottree().getroot().getprevious()
+ while p is not None:
+ prev.insert(
+ 0,
+ lxml.etree.tostring(p)
+ )
+ p = p.getprevious()
+
+ s = lxml.etree.tostring(
+ document,
+ pretty_print=True,
+ xml_declaration=True,
+ doctype=docinfo.doctype + "\n".join(prev),
+ encoding = docinfo.encoding
+ )
+
txt = []
- for i in utils.pretty_xmlish(content[:VIEW_CUTOFF]):
+ for i in s[:VIEW_CUTOFF].strip().split("\n"):
txt.append(
urwid.Text(("text", i)),
)
@@ -126,6 +156,22 @@ def view_xmlish(hdrs, content):
return "XML-like data", txt
+def view_html(hdrs, content):
+ if utils.isXML(content):
+ parser = lxml.etree.HTMLParser(strip_cdata=True, remove_blank_text=True)
+ d = lxml.html.fromstring(content, parser=parser)
+ docinfo = d.getroottree().docinfo
+ s = lxml.etree.tostring(d, pretty_print=True, doctype=docinfo.doctype)
+
+ txt = []
+ for i in s[:VIEW_CUTOFF].strip().split("\n"):
+ txt.append(
+ urwid.Text(("text", i)),
+ )
+ trailer(len(content), txt)
+ return "HTML", txt
+
+
def view_json(hdrs, content):
lines = utils.pretty_json(content)
if lines:
@@ -229,7 +275,8 @@ def view_image(hdrs, content):
PRETTY_FUNCTION_MAP = {
- VIEW_XML: view_xmlish,
+ VIEW_XML: view_xml,
+ VIEW_HTML: view_html,
VIEW_JSON: view_json,
VIEW_URLENCODED: view_urlencoded,
VIEW_MULTIPART: view_multipart,
@@ -274,7 +321,7 @@ def get_content_view(viewmode, hdrItems, content):
if not ret:
viewmode = VIEW_RAW
ret = view_raw(hdrs, content)
- msg.append("Fallback to Raw")
+ msg.append("Couldn't parse: falling back to Raw")
else:
msg.append(ret[0])
return " ".join(msg), ret[1]
diff --git a/libmproxy/utils.py b/libmproxy/utils.py
index b4e317c5..d8345399 100644
--- a/libmproxy/utils.py
+++ b/libmproxy/utils.py
@@ -72,51 +72,6 @@ def cleanBin(s, fixspacing=False):
return "".join(parts)
-TAG = r"""
- <\s*
- (?!\s*[!"])
- (?P<close>\s*\/)?
- (?P<name>\w+)
- (
- [^'"\t >]+ |
- "[^\"]*"['\"]* |
- '[^']*'['\"]* |
- \s+
- )*
- (?P<selfcont>\s*\/\s*)?
- \s*>
- """
-UNI = set(["br", "hr", "img", "input", "area", "link"])
-INDENT = " "*4
-def pretty_xmlish(s):
- """
- A robust pretty-printer for XML-ish data.
- Returns a list of lines.
- """
- s = cleanBin(s)
- data, offset, indent, prev = [], 0, 0, None
- for i in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE):
- start, end = i.span()
- name = i.group("name")
- if start > offset:
- txt = []
- for x in textwrap.dedent(s[offset:start]).split("\n"):
- if x.strip():
- txt.append(indent*INDENT + x)
- data.extend(txt)
- if i.group("close") and not (name in UNI and name==prev):
- indent = max(indent - 1, 0)
- data.append(indent*INDENT + i.group().strip())
- offset = end
- if not any([i.group("close"), i.group("selfcont"), name in UNI]):
- indent += 1
- prev = name
- trail = s[offset:]
- if trail.strip():
- data.append(s[offset:])
- return data
-
-
def pretty_json(s):
try:
p = json.loads(s)
diff --git a/test/test_console_contentview.py b/test/test_console_contentview.py
index babe59ea..cf2ab1e5 100644
--- a/test/test_console_contentview.py
+++ b/test/test_console_contentview.py
@@ -57,15 +57,32 @@ class uContentView(libpry.AutoTree):
assert cv.view_urlencoded([], d)
assert not cv.view_urlencoded([], "foo")
+ def test_view_html(self):
+ s = "<html><br><br></br><p>one</p></html>"
+ assert cv.view_html([], s)
+
+ s = "gobbledygook"
+ assert not cv.view_html([], s)
+
def test_view_json(self):
cv.VIEW_CUTOFF = 100
assert cv.view_json([], "{}")
assert not cv.view_urlencoded([], "{")
assert cv.view_json([], "[" + ",".join(["0"]*cv.VIEW_CUTOFF) + "]")
- def test_view_xmlish(self):
- assert cv.view_xmlish([], "<foo></foo>")
- assert cv.view_xmlish([], "<foo>")
+ def test_view_xml(self):
+ #assert cv.view_xml([], "<foo></foo>")
+ #assert not cv.view_xml([], "<foo>")
+
+ s = """<?xml version="1.0" encoding="UTF-8"?>
+ <?xml-stylesheet title="XSL_formatting"?>
+ <rss
+ xmlns:media="http://search.yahoo.com/mrss/"
+ xmlns:atom="http://www.w3.org/2005/Atom"
+ version="2.0">
+ </rss>
+ """
+ print cv.view_xml([], s)
def test_view_raw(self):
assert cv.view_raw([], "foo")
diff --git a/test/test_utils.py b/test/test_utils.py
index e445614a..f279ce65 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -58,63 +58,6 @@ class uData(libpry.AutoTree):
libpry.raises("does not exist", utils.pkg_data.path, "nonexistent")
-
-class upretty_xmlish(libpry.AutoTree):
- def test_tagre(self):
- def f(s):
- return re.search(utils.TAG, s, re.VERBOSE|re.MULTILINE)
- assert f(r"<body>")
- assert f(r"<body/>")
- assert f(r"< body/>")
- assert f(r"< body/ >")
- assert f(r"< body / >")
- assert f(r"<foo a=b>")
- assert f(r"<foo a='b'>")
- assert f(r"<foo a='b\"'>")
- assert f(r'<a b=(a.b) href="foo">')
- assert f('<td width=25%>')
- assert f('<form name="search" action="/search.php" method="get" accept-charset="utf-8" class="search">')
- assert f('<img src="gif" width="125" height="16" alt=&quot;&quot; />')
-
-
- def test_all(self):
- def isbalanced(ret):
- # The last tag should have no indent
- assert ret[-1].strip() == ret[-1]
-
- s = "<html><br><br></br><p>one</p></html>"
- ret = utils.pretty_xmlish(s)
- isbalanced(ret)
-
- s = r"""
-<body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="document.f.q.focus();if(document.images)new Image().src='/images/srpr/nav_logo27.png'" ><textarea id=csi style=display:none></textarea></body>
- """
- isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
-
- s = r"""
- <a href="http://foo.com" target="">
- <img src="http://foo.gif" alt="bar" height="25" width="132">
- </a>
- """
- isbalanced(utils.pretty_xmlish(textwrap.dedent(s)))
-
- s = r"""
- <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"
- \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">
- <html></html>
- """
- ret = utils.pretty_xmlish(textwrap.dedent(s))
- isbalanced(ret)
-
- s = "<html><br/><p>one</p></html>"
- ret = utils.pretty_xmlish(s)
- assert len(ret) == 6
- isbalanced(ret)
-
- s = "gobbledygook"
- assert utils.pretty_xmlish(s) == ["gobbledygook"]
-
-
class upretty_json(libpry.AutoTree):
def test_one(self):
s = json.dumps({"foo": 1})
@@ -242,7 +185,6 @@ tests = [
uhexdump(),
upretty_size(),
uData(),
- upretty_xmlish(),
upretty_json(),
u_urldecode(),
udel_all(),
diff --git a/todo b/todo
index c19548be..7cc9f0d1 100644
--- a/todo
+++ b/todo
@@ -4,7 +4,6 @@ of these and need some pointers.
Targeted for 0.9:
- Upstream proxy support.
- - Improve worst-case performance problem with XML-ish indenter
- Follow mode to keep most recent flow in view
- Rewrite the core to be asynchronous. I've done some research, and
although it's a bit of a bloated monster, it looks like Twisted is the way