diff options
Diffstat (limited to 'libmproxy')
-rw-r--r-- | libmproxy/console/contentview.py | 67 | ||||
-rw-r--r-- | libmproxy/utils.py | 45 |
2 files changed, 57 insertions, 55 deletions
diff --git a/libmproxy/console/contentview.py b/libmproxy/console/contentview.py index 0d725c9d..02394c6f 100644 --- a/libmproxy/console/contentview.py +++ b/libmproxy/console/contentview.py @@ -2,11 +2,12 @@ import re, cStringIO import urwid from PIL import Image from PIL.ExifTags import TAGS +import lxml.html, lxml.etree import common from .. import utils, encoding, flow from ..contrib import jsbeautifier -VIEW_CUTOFF = 1024*20 +VIEW_CUTOFF = 1024*200 VIEW_AUTO = 0 VIEW_JSON = 1 @@ -17,6 +18,7 @@ VIEW_JAVASCRIPT = 5 VIEW_IMAGE = 6 VIEW_RAW = 7 VIEW_HEX = 8 +VIEW_HTML = 9 VIEW_NAMES = { VIEW_AUTO: "Auto", @@ -28,35 +30,38 @@ VIEW_NAMES = { VIEW_IMAGE: "Image", VIEW_RAW: "Raw", VIEW_HEX: "Hex", + VIEW_HTML: "HTML", } VIEW_PROMPT = ( ("auto detect", "a"), - ("hex", "h"), + ("hex", "e"), + ("html", "h"), ("image", "i"), ("javascript", "j"), ("json", "s"), ("raw", "r"), ("multipart", "m"), ("urlencoded", "u"), - ("xmlish", "x"), + ("xml", "x"), ) VIEW_SHORTCUTS = { "a": VIEW_AUTO, + "x": VIEW_XML, + "h": VIEW_HTML, "i": VIEW_IMAGE, "j": VIEW_JAVASCRIPT, "s": VIEW_JSON, "u": VIEW_URLENCODED, "m": VIEW_MULTIPART, - "x": VIEW_XML, "r": VIEW_RAW, - "h": VIEW_HEX, + "e": VIEW_HEX, } CONTENT_TYPES_MAP = { - "text/html": VIEW_XML, + "text/html": VIEW_HTML, "application/json": VIEW_JSON, "text/xml": VIEW_XML, "multipart/form-data": VIEW_MULTIPART, @@ -116,9 +121,34 @@ def view_hex(hdrs, content): return "Hex", txt -def view_xmlish(hdrs, content): +def view_xml(hdrs, content): + parser = lxml.etree.XMLParser(remove_blank_text=True, resolve_entities=False, strip_cdata=False, recover=False) + try: + document = lxml.etree.fromstring(content, parser) + except lxml.etree.XMLSyntaxError, v: + print v + return None + docinfo = document.getroottree().docinfo + + prev = [] + p = document.getroottree().getroot().getprevious() + while p is not None: + prev.insert( + 0, + lxml.etree.tostring(p) + ) + p = p.getprevious() + + s = lxml.etree.tostring( + document, + pretty_print=True, + xml_declaration=True, + doctype=docinfo.doctype + "\n".join(prev), + encoding = docinfo.encoding + ) + txt = [] - for i in utils.pretty_xmlish(content[:VIEW_CUTOFF]): + for i in s[:VIEW_CUTOFF].strip().split("\n"): txt.append( urwid.Text(("text", i)), ) @@ -126,6 +156,22 @@ def view_xmlish(hdrs, content): return "XML-like data", txt +def view_html(hdrs, content): + if utils.isXML(content): + parser = lxml.etree.HTMLParser(strip_cdata=True, remove_blank_text=True) + d = lxml.html.fromstring(content, parser=parser) + docinfo = d.getroottree().docinfo + s = lxml.etree.tostring(d, pretty_print=True, doctype=docinfo.doctype) + + txt = [] + for i in s[:VIEW_CUTOFF].strip().split("\n"): + txt.append( + urwid.Text(("text", i)), + ) + trailer(len(content), txt) + return "HTML", txt + + def view_json(hdrs, content): lines = utils.pretty_json(content) if lines: @@ -229,7 +275,8 @@ def view_image(hdrs, content): PRETTY_FUNCTION_MAP = { - VIEW_XML: view_xmlish, + VIEW_XML: view_xml, + VIEW_HTML: view_html, VIEW_JSON: view_json, VIEW_URLENCODED: view_urlencoded, VIEW_MULTIPART: view_multipart, @@ -274,7 +321,7 @@ def get_content_view(viewmode, hdrItems, content): if not ret: viewmode = VIEW_RAW ret = view_raw(hdrs, content) - msg.append("Fallback to Raw") + msg.append("Couldn't parse: falling back to Raw") else: msg.append(ret[0]) return " ".join(msg), ret[1] diff --git a/libmproxy/utils.py b/libmproxy/utils.py index b4e317c5..d8345399 100644 --- a/libmproxy/utils.py +++ b/libmproxy/utils.py @@ -72,51 +72,6 @@ def cleanBin(s, fixspacing=False): return "".join(parts) -TAG = r""" - <\s* - (?!\s*[!"]) - (?P<close>\s*\/)? - (?P<name>\w+) - ( - [^'"\t >]+ | - "[^\"]*"['\"]* | - '[^']*'['\"]* | - \s+ - )* - (?P<selfcont>\s*\/\s*)? - \s*> - """ -UNI = set(["br", "hr", "img", "input", "area", "link"]) -INDENT = " "*4 -def pretty_xmlish(s): - """ - A robust pretty-printer for XML-ish data. - Returns a list of lines. - """ - s = cleanBin(s) - data, offset, indent, prev = [], 0, 0, None - for i in re.finditer(TAG, s, re.VERBOSE|re.MULTILINE): - start, end = i.span() - name = i.group("name") - if start > offset: - txt = [] - for x in textwrap.dedent(s[offset:start]).split("\n"): - if x.strip(): - txt.append(indent*INDENT + x) - data.extend(txt) - if i.group("close") and not (name in UNI and name==prev): - indent = max(indent - 1, 0) - data.append(indent*INDENT + i.group().strip()) - offset = end - if not any([i.group("close"), i.group("selfcont"), name in UNI]): - indent += 1 - prev = name - trail = s[offset:] - if trail.strip(): - data.append(s[offset:]) - return data - - def pretty_json(s): try: p = json.loads(s) |