• R/O
  • HTTP
  • SSH
  • HTTPS

htmltree: 提交

メインリポジトリ


Commit MetaInfo

修订版: a2bce2c0e3c0c859dde1da06cfc9592942175dd8 (tree)
时间: 2011-08-25 18:31:26
作者: Hiromichi MATSUSHIMA <hirom@offi...>
Committer: Hiromichi MATSUSHIMA

Log Message

fix: attribute string isn't encoded

更改概述

差异

--- a/htmltree.py
+++ b/htmltree.py
@@ -62,6 +62,8 @@ class HTMLRenderer(Renderer):
6262 self._recursive(child, texts)
6363 elif elem.is_decl():
6464 texts.append("<!" + elem.name + ">")
65+ elif elem.is_comment():
66+ texts.append("<!--" + elem.name + "-->")
6567
6668
6769 class TEXTRenderer(Renderer):
@@ -97,6 +99,7 @@ class HTMLElement(list):
9799 TAG = 100
98100 TEXT = 200
99101 DECL = 300
102+ COMMENT = 400
100103
101104 def __init__(self, type, name="", attrs={}):
102105 """
@@ -126,6 +129,8 @@ class HTMLElement(list):
126129 return "<TAG:%s %s>" % (self.name, self._attrs2str())
127130 elif self.type == HTMLElement.DECL:
128131 return "<DECL:'%s'>" % self.name
132+ elif self.type == HTMLElement.COMMENT:
133+ return "<COMMENT:'%s'>" % self.name
129134 elif self.type == HTMLElement.TEXT:
130135 return "<TEXT:'%s'>" % self._text
131136 else:
@@ -212,6 +217,18 @@ class HTMLElement(list):
212217 for i in self:
213218 i._r_get_elements_by_name(name, buf)
214219
220+ def get_comments(self):
221+ buf = []
222+ for i in self:
223+ i._r_get_comments(buf)
224+ return buf
225+
226+ def _r_get_comments(self, buf):
227+ if self.is_comment():
228+ buf.append(self)
229+ for i in self:
230+ i._r_get_comments(buf)
231+
215232 def get_element_by_id(self, id):
216233 for i in self:
217234 if "id" in i.attrs and i.attrs["id"] == id:
@@ -310,6 +327,9 @@ class HTMLElement(list):
310327 def is_decl(self):
311328 return self.type == HTMLElement.DECL
312329
330+ def is_comment(self):
331+ return self.type == HTMLElement.COMMENT
332+
313333 def is_descendant(self, tagname):
314334 p = self.parent()
315335 while p != None:
@@ -430,13 +450,32 @@ class HTMLTree(HTMLParser.HTMLParser):
430450 r = self.root()
431451 self._r_validate(self, e)
432452
453+ # tools
454+ def _text_encoder(self, text):
455+ # text encode check and convert.
456+ # if charset is given, convert text to unicode type.
457+ val = ""
458+ if self.charset:
459+ try:
460+ val = unicode(text, self.charset)
461+ except TypeError:
462+ # self.charset is utf-8.
463+ val = text
464+ else:
465+ # treat as unicode input
466+ val = text
467+ return val
468+
469+ def _attr_encoder(self, attrs):
470+ return [(k, self._text_encoder(v)) for (k, v) in attrs]
471+
433472 # Handlers
434473 def handle_starttag(self, tag, attrs):
435474 # some tags treat as start-end tag.
436475 if tag in self.UNCLOSABLE_TAGS:
437476 return self.handle_startendtag(tag, attrs)
438477
439- elem = HTMLElement(HTMLElement.TAG, tag, attrs)
478+ elem = HTMLElement(HTMLElement.TAG, tag, self._attr_encoder(attrs))
440479
441480 if self._option & HTMLTree.USE_VALIDATE > 0:
442481 # try validation (experimental)
@@ -456,7 +495,7 @@ class HTMLTree(HTMLParser.HTMLParser):
456495 self._cursor = self._cursor.parent()
457496
458497 def handle_startendtag(self, tag, attrs):
459- elem = HTMLElement(HTMLElement.TAG, tag, attrs)
498+ elem = HTMLElement(HTMLElement.TAG, tag, self._attr_encoder(attrs))
460499 elem._parent = self._cursor
461500 self._cursor.append(elem)
462501
@@ -468,17 +507,9 @@ class HTMLTree(HTMLParser.HTMLParser):
468507 elem = HTMLElement(HTMLElement.TEXT)
469508 elem._parent = self._cursor
470509
471- # text encode check and convert.
472- # if charset is given, convert text to unicode type.
473- if self.charset:
474- try:
475- elem._text = unicode(data, self.charset)
476- except TypeError:
477- # self.charset is utf-8.
478- elem._text = data
479- else:
480- # treat as unicode input
481- elem._text = data
510+ # encode text to utf-8
511+ elem._text = self._text_encoder(data)
512+
482513 self._cursor.append(elem)
483514
484515 def handle_entityref(self, name):
@@ -494,6 +525,11 @@ class HTMLTree(HTMLParser.HTMLParser):
494525 elem._parent = self._cursor
495526 self._cursor.append(elem)
496527
528+ def handle_comment(self, data):
529+ elem = HTMLElement(HTMLElement.COMMENT, data)
530+ elem._parent = self._cursor
531+ self._cursor.append(elem)
532+
497533 # Accessor
498534 def root(self):
499535 return self._htmlroot
Show on old repository browser