Skip to content

Commit 4b4ba95

Browse files
views: best escaping yet of xml data
1 parent 171abe6 commit 4b4ba95

File tree

2 files changed

+61
-38
lines changed

2 files changed

+61
-38
lines changed

Evtx/Views.py

Lines changed: 61 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1616
# See the License for the specific language governing permissions and
1717
# limitations under the License.
18+
import re
19+
import xml.sax.saxutils
20+
1821
import six
1922

2023
import Evtx.Nodes as e_nodes
21-
import xml.sax.saxutils
2224

2325

2426
XML_HEADER = "<?xml version=\"1.1\" encoding=\"utf-8\" standalone=\"yes\" ?>\n"
@@ -29,45 +31,66 @@ def __init__(self, msg):
2931
super(UnexpectedElementException, self).__init__(msg)
3032

3133

32-
try:
33-
# unfortunately no support yet in six.
34-
# py3
35-
from html import escape as html_escape
36-
except ImportError:
37-
# py2
38-
from cgi import escape as html_escape
34+
# ref: https://www.w3.org/TR/xml11/#charsets
35+
RESTRICTED_CHARS = re.compile('[\x01-\x08\x0B\x0C\x0E-\x1F\x7F-\x84\x86-\x9F]')
3936

4037

41-
CHAR_TAB = 0x9
42-
CHAR_NL = 0xA
43-
CHAR_CR = 0xD
38+
def escape_attr(s):
39+
'''
40+
escape the given string such that it can be placed in an XML attribute, like:
4441
45-
VALID_WHITESPACE = (CHAR_TAB, CHAR_NL, CHAR_CR)
42+
<foo bar='$value'>
4643
47-
import re
48-
# ref: https://www.w3.org/TR/xml11/#charsets
49-
RESTRICTED_CHARS = re.compile('[\x01-\x08\x0B\x0C\x0E-\x1F\x7F-\x84\x86-\x9F]')
44+
Args:
45+
s (str): the string to escape.
5046
47+
Returns:
48+
str: the escaped string.
49+
'''
50+
esc = xml.sax.saxutils.quoteattr(s)
51+
esc = esc.encode('ascii', 'xmlcharrefreplace').decode('ascii')
52+
esc = RESTRICTED_CHARS.sub('', esc)
53+
return esc
54+
55+
56+
def escape_value(s):
57+
'''
58+
escape the given string such that it can be placed in an XML value location, like:
59+
60+
<foo>
61+
$value
62+
</foo>
63+
64+
Args:
65+
s (str): the string to escape.
5166
52-
def escape(s):
53-
esc = html_escape(s)
67+
Returns:
68+
str: the escaped string.
69+
'''
70+
esc = xml.sax.saxutils.escape(s)
5471
esc = esc.encode('ascii', 'xmlcharrefreplace').decode('ascii')
5572
esc = RESTRICTED_CHARS.sub('', esc)
5673
return esc
5774

58-
out = []
59-
for c in s:
60-
# ref: http://www.asciitable.com/index/asciifull.gif
61-
if ord(c) < 0x20 and c not in VALID_WHITESPACE:
62-
c = '&#x%04x;' % (ord(c))
63-
out.append(c)
6475

65-
return ''.join(out)
76+
# ref: https://www.w3.org/TR/xml/#NT-NameStartChar
77+
# but we are going to require an even stricter subset.
78+
NAME_PATTERN = re.compile('[a-zA-Z_][a-zA-Z_\-]*')
6679

6780

68-
def to_xml_string(s):
69-
s = xml.sax.saxutils.escape(s, {'"': '&quot;'})
70-
return escape(s)
81+
def validate_name(s):
82+
'''
83+
ensure the given name can be used as an XML name, such as a tag or attribute name.
84+
85+
Args:
86+
s (str): the string to validate.
87+
88+
Raises:
89+
RuntimeError: if the string is not suitable to be an XML name.
90+
'''
91+
if not NAME_PATTERN.match(s):
92+
raise RuntimeError('invalid xml name: %s' % (s))
93+
return s
7194

7295

7396
def render_root_node_with_subs(root_node, subs):
@@ -90,16 +113,17 @@ def rec(node, acc):
90113
for child in node.children():
91114
if isinstance(child, e_nodes.AttributeNode):
92115
acc.append(" ")
93-
acc.append(to_xml_string(child.attribute_name().string()))
116+
acc.append(validate_name(child.attribute_name().string()))
94117
acc.append("=\"")
95118
# TODO: should use xml.sax.saxutils.quoteattr here
119+
# but to do so, we'd need to ensure we're not double-quoting this value.
96120
rec(child.attribute_value(), acc)
97121
acc.append("\"")
98122
acc.append(">")
99123
for child in node.children():
100124
rec(child, acc)
101125
acc.append("</")
102-
acc.append(to_xml_string(node.tag_name()))
126+
acc.append(validate_name(node.tag_name()))
103127
acc.append(">\n")
104128
elif isinstance(node, e_nodes.CloseStartElementNode):
105129
pass # intended
@@ -108,19 +132,20 @@ def rec(node, acc):
108132
elif isinstance(node, e_nodes.CloseElementNode):
109133
pass # intended
110134
elif isinstance(node, e_nodes.ValueNode):
111-
acc.append(to_xml_string(node.children()[0].string()))
135+
acc.append(escape_value(node.children()[0].string()))
112136
elif isinstance(node, e_nodes.AttributeNode):
113137
pass # intended
114138
elif isinstance(node, e_nodes.CDataSectionNode):
115139
acc.append("<![CDATA[")
116-
acc.append(to_xml_string(node.cdata()))
140+
# TODO: is this correct escaping???
141+
acc.append(escape_value(node.cdata()))
117142
acc.append("]]>")
118143
elif isinstance(node, e_nodes.EntityReferenceNode):
119-
acc.append(to_xml_string(node.entity_reference()))
144+
acc.append(escape_value(node.entity_reference()))
120145
elif isinstance(node, e_nodes.ProcessingInstructionTargetNode):
121-
acc.append(to_xml_string(node.processing_instruction_target()))
146+
acc.append(escape_value(node.processing_instruction_target()))
122147
elif isinstance(node, e_nodes.ProcessingInstructionDataNode):
123-
acc.append(to_xml_string(node.string()))
148+
acc.append(escape_value(node.string()))
124149
elif isinstance(node, e_nodes.TemplateInstanceNode):
125150
raise UnexpectedElementException("TemplateInstanceNode")
126151
elif isinstance(node, e_nodes.NormalSubstitutionNode):
@@ -129,7 +154,7 @@ def rec(node, acc):
129154
if isinstance(sub, e_nodes.BXmlTypeNode):
130155
sub = render_root_node(sub.root())
131156
else:
132-
sub = to_xml_string(sub.string())
157+
sub = escape_value(sub.string())
133158

134159
acc.append(sub)
135160
elif isinstance(node, e_nodes.ConditionalSubstitutionNode):
@@ -138,7 +163,7 @@ def rec(node, acc):
138163
if isinstance(sub, e_nodes.BXmlTypeNode):
139164
sub = render_root_node(sub.root())
140165
else:
141-
sub = to_xml_string(sub.string())
166+
sub = escape_value(sub.string())
142167

143168
acc.append(sub)
144169
elif isinstance(node, e_nodes.StreamStartNode):

tests/test_records.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,11 @@
44

55
import Evtx.Evtx as evtx
66
import Evtx.Nodes as e_nodes
7-
import Evtx.Views as e_views
87

98
from fixtures import *
109

1110
try:
1211
import lxml
13-
import lxml.etree
1412
no_lxml = False
1513
except:
1614
no_lxml = True

0 commit comments

Comments
 (0)