• R/O
  • SSH

提交

标签
No Tags

Frequently used words (click to add to your profile)

javac++androidlinuxc#windowsobjective-ccocoa誰得qtpythonphprubygameguibathyscaphec計画中(planning stage)翻訳omegatframeworktwitterdomtestvb.netdirectxゲームエンジンbtronarduinopreviewer

Mercurial Test 02


Commit MetaInfo

修订版: 4db780ccc4b48ee647d6975cf1bc304bc884d90b (tree)
时间: 2013-06-11 18:51:57
作者: hylom <hylom@user...>
Committer: hylom

Log Message

initial commit for htmltree

更改概述

差异

diff -r 000000000000 -r 4db780ccc4b4 .hgignore
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore Tue Jun 11 18:51:57 2013 +0900
@@ -0,0 +1,2 @@
1+.*~
2+
diff -r 000000000000 -r 4db780ccc4b4 README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md Tue Jun 11 18:51:57 2013 +0900
@@ -0,0 +1,5 @@
1+htmltree - HTML parsing and query library for Python
2+========
3+
4+
5+
diff -r 000000000000 -r 4db780ccc4b4 htmltree/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/htmltree/__init__.py Tue Jun 11 18:51:57 2013 +0900
@@ -0,0 +1,16 @@
1+# Copyright (c) 2011 hylom <hylomm at gmail.com>
2+# All rights reserved.
3+#
4+# This module is released under BSD License.
5+# http://www.opensource.org/licenses/bsd-license.php
6+#
7+"""htmltree module
8+
9+Parse and convert HTML to tree-style object
10+"""
11+#__all__ = ["htmltree"]
12+
13+import htmltree
14+from htmltree import parse
15+
16+version = (0, 2, 1)
diff -r 000000000000 -r 4db780ccc4b4 htmltree/htmltree.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/htmltree/htmltree.py Tue Jun 11 18:51:57 2013 +0900
@@ -0,0 +1,542 @@
1+# htmltree.py by hylom
2+# -*- coding: utf-8 -*-
3+
4+"""htmltree.py - HTML Element-Tree Builder
5+by hylom <hylomm@@single_at_mark@@gmail.com>
6+"""
7+
8+import HTMLParser
9+import re
10+
class HTMLElementError(Exception):
    """Error concerning a specific HTML element.

    Attributes:
    msg -- description of the problem
    elem -- the element the problem was found in
    """

    def __init__(self, msg, elem):
        Exception.__init__(self, msg, elem)
        self.msg = msg
        self.elem = elem

    def __repr__(self):
        return "HTML Element Error: %s in %s" % (self.msg, self.elem)

    # Without this, str(exc) and tracebacks show the raw argument tuple
    # instead of the formatted message.
    __str__ = __repr__
class Renderer(object):
    """HTMLElement Render base class."""

    def attrs2str(self, elem):
        """Serialize elem.attrs as HTML attribute text.

        Arguments:
        elem -- object whose 'attrs' mapping holds attribute name/value
                pairs; a value of None means a valueless attribute

        Returns "" when there are no attributes; otherwise every
        attribute preceded by a single space, e.g. " href='x' checked".
        """
        # The empty first item yields the leading space after join().
        parts = [""]
        for name, value in elem.attrs.items():
            if value is None:
                parts.append(name)
            elif "'" not in value:
                parts.append("%s='%s'" % (name, value))
            else:
                # The value holds a single quote, so it must be wrapped in
                # double quotes; any double quote inside it is escaped so
                # it cannot terminate the attribute early.
                parts.append('%s="%s"' % (name, value.replace('"', "&quot;")))
        return " ".join(parts)
33+
class HTMLRenderer(Renderer):
    """Render HTMLElement as HTML."""
    # TODO: check tags not need to close more strict...
    # "input" is listed because HTMLTree.UNCLOSABLE_TAGS parses it as a
    # childless start-end tag; rendering "</input>" for it would emit a
    # closing tag the source never had.
    UNCLOSABLE_TAGS = ["br", "link", "meta", "img", "input"]

    def render_inner(self, elem):
        """Return the HTML of elem's children, without elem's own tag."""
        texts = []
        for child in elem:
            self._recursive(child, texts)
        return "".join(texts)

    def render(self, elem):
        """Return the HTML of elem itself, including its subtree."""
        texts = []
        self._recursive(elem, texts)
        return "".join(texts)

    def _recursive(self, elem, texts):
        """Append the HTML fragments for elem and its subtree to texts."""
        if elem.is_tag():
            texts.append("<" + elem.name + self.attrs2str(elem) + ">")
            for child in elem:
                self._recursive(child, texts)
            if elem.name not in self.UNCLOSABLE_TAGS:
                texts.append("</" + elem.name + ">")
        elif elem.is_text():
            # Empty text nodes contribute nothing.
            if elem.text():
                texts.append(elem.text())
        elif elem.is_root():
            # The root has no markup of its own; only its children render.
            for child in elem:
                self._recursive(child, texts)
        elif elem.is_decl():
            # For declarations the 'name' attribute carries the body.
            texts.append("<!" + elem.name + ">")
        elif elem.is_comment():
            # For comments the 'name' attribute carries the comment text.
            texts.append("<!--" + elem.name + "-->")
67+
68+
class TEXTRenderer(Renderer):
    """Render HTMLElement as TEXT."""
    # TODO: check tags not need to close more strict...
    UNCLOSABLE_TAGS = ["br", "link", "meta", "img"]

    def render_inner(self, elem):
        """Return the concatenated text found below elem."""
        fragments = []
        for node in elem:
            self._recursive(node, fragments)
        return "".join(fragments)

    def render(self, elem):
        """Return the concatenated text of elem and its subtree."""
        fragments = []
        self._recursive(elem, fragments)
        return "".join(fragments)

    def _recursive(self, elem, texts):
        """Collect the non-empty text of elem's subtree into texts."""
        if elem.is_tag() or elem.is_root():
            # Tags and the root contribute only through their children.
            for node in elem:
                self._recursive(node, texts)
            return
        if elem.is_text():
            content = elem.text()
            if content:
                texts.append(elem.text())
        # Declarations and comments produce no text.
95+
class HTMLElement(list):
    """HTML element object to use as tree nodes.

    An element is a list of its child elements, so iteration, len() and
    indexing operate on the children.  Elements compare and hash by
    identity, never by their contents.
    """
    ROOT = 0
    TAG = 100
    TEXT = 200
    DECL = 300
    COMMENT = 400

    def __init__(self, type, name="", attrs={}):
        """
        create HTMLElement object.

        Arguments:
        type -- element type. HTMLElement.(ROOT|TAG|TEXT|DECL|COMMENT)
        name -- element name (default: "")
        attrs -- dict (or iterable of pairs) of attributes (default:{})

        Example:
        attr = dict(href="http://example.com/", target="_blank")
        e = HTMLElement(HTMLElement.TAG, "a", attr)
        # 'e' means <a href="http://example.com/" target="_blank">
        """
        # The shared default dict is harmless: it is copied, never stored.
        self.type = type
        self.name = name
        self.attrs = dict(attrs)
        self._text = ""
        self._parent = None
        self._next_elem = None
        self._prev_elem = None

    def __repr__(self):
        if self.type == HTMLElement.TAG:
            return "<TAG:%s %s>" % (self.name, self._attrs2str())
        elif self.type == HTMLElement.DECL:
            return "<DECL:'%s'>" % self.name
        elif self.type == HTMLElement.COMMENT:
            return "<COMMENT:'%s'>" % self.name
        elif self.type == HTMLElement.TEXT:
            return "<TEXT:'%s'>" % self._text
        elif self.type == HTMLElement.ROOT:
            return "<ROOT>"
        else:
            return "<UNKNOWN>"

    # Identity semantics: list.remove() and 'in' must find this exact
    # node, not another node that happens to have equal children.
    def __eq__(self, other):
        return self is other

    def __ne__(self, other):
        return self is not other

    def __hash__(self):
        return id(self)

    def _attrs2str(self):
        """Return the attributes as "name='value'" items joined by spaces."""
        return " ".join(
            name if value is None else "%s='%s'" % (name, value)
            for name, value in self.attrs.items()
        )

    # basic acquisition functions
    def get_attribute(self, attr, default=None):
        """returns given attribute's value."""
        return self.attrs.get(attr, default)

    def attr(self, attr, default=None):
        """returns given attribute's value."""
        return self.attrs.get(attr, default)

    def has_attribute(self, attr):
        """returns True if element has "attr" attribute."""
        return attr in self.attrs

    def text(self):
        """returns the text stored on this element ("" unless set)."""
        return self._text

    def inner_html(self):
        "returns inner html"
        rn = HTMLRenderer()
        return rn.render_inner(self)

    def inner_text(self):
        "returns inner text"
        rn = TEXTRenderer()
        return rn.render_inner(self)

    def get_classes(self):
        "returns the whitespace-separated class names as a list"
        attr = self.get_attribute('class')
        if attr is None:
            return []
        return attr.split()

    # navigation functions
    def parent(self):
        """returns tag's parent element."""
        return self._parent

    def next(self):
        """returns tag's next element."""
        return self._next_elem

    def prev(self):
        """returns tag's previous element."""
        return self._prev_elem

    def next_tag(self):
        """returns the nearest following sibling that is a tag, or None."""
        candidate = self.next()
        while candidate is not None and not candidate.is_tag():
            candidate = candidate.next()
        return candidate

    def prev_tag(self):
        """returns the nearest preceding sibling that is a tag, or None."""
        candidate = self.prev()
        while candidate is not None and not candidate.is_tag():
            candidate = candidate.prev()
        return candidate

    # basic query functions
    def get_elements_by_name(self, name):
        """returns all descendants named 'name', in document order."""
        buf = []
        for child in self:
            child._r_get_elements_by_name(name, buf)
        return buf

    def _r_get_elements_by_name(self, name, buf):
        if self.name == name:
            buf.append(self)
        for child in self:
            child._r_get_elements_by_name(name, buf)

    def get_comments(self):
        """returns all descendant comment elements, in document order."""
        buf = []
        for child in self:
            child._r_get_comments(buf)
        return buf

    def _r_get_comments(self, buf):
        if self.is_comment():
            buf.append(self)
        for child in self:
            child._r_get_comments(buf)

    def get_element_by_id(self, id):
        """returns the first descendant whose "id" attribute equals 'id',
        or None when there is no such element."""
        for child in self:
            if "id" in child.attrs and child.attrs["id"] == id:
                return child
            found = child.get_element_by_id(id)
            if found is not None:
                return found
        return None

    def get_elements_by_class(self, cls):
        """returns all descendants having class 'cls', in document order."""
        buf = []
        for child in self:
            child._r_get_elements_by_class(cls, buf)
        return buf

    def _r_get_elements_by_class(self, cls, buf):
        if cls in self.get_classes():
            buf.append(self)
        for child in self:
            child._r_get_elements_by_class(cls, buf)

    def get_elements(self, name, attrs):
        """returns descendants named 'name' whose attributes match 'attrs'.

        Every key of 'attrs' must equal the element's attribute value; a
        missing attribute is compared as "".
        """
        return [
            elem for elem in self.get_elements_by_name(name)
            if all(elem.get_attribute(key, "") == attrs[key] for key in attrs)
        ]

    # manipulation functions
    def append_tag(self, tag, attrs):
        """Create a TAG element, append it as last child and return it.

        The new element is linked to this element as its parent and to
        the former last child as its sibling, so navigation and delete()
        work on it.
        """
        elem = HTMLElement(HTMLElement.TAG, tag, attrs)
        elem._parent = self
        if len(self) > 0:
            last = self[-1]
            last._next_elem = elem
            elem._prev_elem = last
        self.append(elem)
        return elem

    def remove_element(self, elem):
        """Detach 'elem' from its own parent (see delete())."""
        elem.delete()

    def delete(self):
        """Detach this element from its parent.

        The former neighbours are linked to each other so next()/prev()
        never return the removed element.  Does nothing when the element
        has no parent.
        """
        parent = self._parent
        if parent is None:
            return
        parent.remove(self)
        before, after = self._prev_elem, self._next_elem
        if before is not None:
            before._next_elem = after
        if after is not None:
            after._prev_elem = before
        self._parent = None
        self._prev_elem = None
        self._next_elem = None

    # query functions
    # TODO: this function is under implementing...
    def select(self, expr):
        """Return descendants matching a space-separated selector.

        Each term is "#id", ".class" or a tag name; every term is
        searched below the matches of the previous one.  Returns a list
        without duplicates, [] for an empty expression.
        """
        terms = expr.strip().split()
        if len(terms) == 0:
            return []
        # Start from this element itself so that its direct children are
        # candidates for the first term.
        results = [self]
        for pat in terms:
            matched = []
            seen = set()
            for elem in results:
                for hit in self._select_pattern(pat, elem):
                    # Nested matches of the previous term can reach the
                    # same descendant more than once.
                    if id(hit) not in seen:
                        seen.add(id(hit))
                        matched.append(hit)
            results = matched
        return results

    def _select_pattern(self, pat, elem):
        """Return the descendants of 'elem' matching one selector term."""
        if pat[0] == "#":
            results = [elem.get_element_by_id(pat[1:])]
        elif pat[0] == ".":
            results = elem.get_elements_by_class(pat[1:])
        else:
            results = elem.get_elements_by_name(pat)
        # Only a failed id lookup (None) is dropped.  A truthiness test
        # would also drop every matched element that has no children,
        # because an element is a list.
        return [x for x in results if x is not None]

    def select_1st(self, expr):
        """Return the first match of select(expr), or None."""
        r = self.select(expr)
        if len(r) == 0:
            return None
        return r[0]

    def select_by_name2(self, term1, term2):
        """Return elements named 'term2' found below elements named
        'term1'."""
        buf = []
        for elem in self.get_elements_by_name(term1):
            buf.extend(elem.get_elements_by_name(term2))
        return buf

    # is_* functions
    def is_text(self):
        return self.type == HTMLElement.TEXT

    def is_tag(self):
        return self.type == HTMLElement.TAG

    def is_root(self):
        return self.type == HTMLElement.ROOT

    def is_decl(self):
        return self.type == HTMLElement.DECL

    def is_comment(self):
        return self.type == HTMLElement.COMMENT

    def is_descendant(self, tagname):
        """Return the nearest ancestor named 'tagname', or False."""
        p = self.parent()
        while p is not None:
            if p.name == tagname:
                return p
            p = p.parent()
        return False

    # mmmh....
    def trace_back(self, tag):
        """ regexp string => list of ancestor names matching it,
        nearest ancestor first"""
        rex = re.compile(tag)
        result = []
        p = self.parent()
        while p is not None:
            if rex.search(p.name):
                result.append(p.name)
            p = p.parent()
        return result
360+
class HTMLTreeError(Exception):
    """Error raised when HTML cannot be parsed into a tree.

    Attributes:
    msg -- description of the problem
    lineno -- line of the input where the problem was reported
    offset -- character offset within that line
    """

    def __init__(self, msg, lineno, offset):
        Exception.__init__(self, msg, lineno, offset)
        self.msg = msg
        self.lineno = lineno
        self.offset = offset

    def __repr__(self):
        # %s instead of %d: the position may be None when it is unknown,
        # and formatting the error must never raise itself.
        return "HTML Parse Error: %s , line: %s, char: %s" % (
            self.msg, self.lineno, self.offset)

    # Without this, str(exc) and tracebacks show the raw argument tuple
    # instead of the formatted message.
    __str__ = __repr__
370+
371+
def parse(data, charset=None, option=0):
    """Parse HTML and return the HTMLTree object that was built from it.

    Arguments:
    data -- HTML to parse
    charset -- charset of HTML (default: None)
    option -- option flags handed on to HTMLTree.parse (default: 0)
    """
    builder = HTMLTree()
    builder.parse(data, charset, option)
    return builder
377+
378+
class HTMLTree(HTMLParser.HTMLParser):
    """HTML Tree Builder

    Feeds HTML through HTMLParser and assembles the reported tags, text,
    declarations and comments into a tree of HTMLElement objects that is
    available from root().
    """
    # Option flags for parse(); combine with "|".
    USE_VALIDATE = 0x0001   # close an open <li> when another <li> starts

    IGNORE_BLANK = 0x0010   # store whitespace-only text as ""
    # The flags below are declared but not referenced in this class.
    TRUNC_BLANK = 0x0020
    JOIN_TEXT = 0x0040

    TRUNC_BR = 0x0100
    # TODO: check tags not need to close more strict...
    UNCLOSABLE_TAGS = ["br", "link", "meta", "img", "input"]

    def __init__(self):
        "Constructor"
        HTMLParser.HTMLParser.__init__(self)

    def parse(self, data, charset=None, option=0):
        """
        Parse given HTML.

        Arguments:
        data -- HTML to parse
        charset -- charset of HTML (default: None)
        option -- option (default: 0, meaning none)

        When no charset is given, it is looked up in the document's
        <meta http-equiv="Content-Type"> element; if one is found the
        document is parsed a second time so text is decoded with it.

        Raises HTMLTreeError when HTMLParser reports a parse error.
        """
        self.charset = charset
        self._option = option
        self._start_pass()
        try:
            self.feed(data)
        except HTMLParser.HTMLParseError as e:
            raise HTMLTreeError("HTML parse error: " + e.msg,
                                e.lineno, e.offset)

        # if charset is not given, detect charset
        if self.charset is None:
            self.charset = self._detect_charset()
            if self.charset:
                # Parse again from a clean parser state so the second
                # pass does not continue from input buffered in the first.
                self._start_pass()
                self.feed(data)

        self._finalize()

    def _start_pass(self):
        """Reset the parser state and begin a new, empty tree."""
        self.reset()
        self._htmlroot = HTMLElement(HTMLElement.ROOT)
        self._cursor = self._htmlroot

    def _detect_charset(self):
        """Return the charset named by the first Content-Type meta element
        that declares one, or None."""
        for meta in self.root().get_elements_by_name("meta"):
            equiv = meta.attrs.get("http-equiv", None) or ""
            if equiv.lower() != "content-type":
                continue
            ctype = meta.attrs.get("content", "") or ""
            m = re.search(r"charset=([^;]+)", ctype)
            if m:
                return m.group(1).strip()
        return None

    def _finalize(self):
        """Set the next/previous sibling links of the whole tree."""
        self._r_finalize(self.root())

    def _r_finalize(self, elem):
        if elem.is_text():
            return

        # Link every pair of adjacent children to each other; the first
        # child keeps no previous and the last child no next element.
        for left, right in zip(elem, elem[1:]):
            left._next_elem = right
            right._prev_elem = left

        for sub_elem in elem:
            self._r_finalize(sub_elem)

    def validate(self):
        """Not implemented.

        The previous body referenced an undefined variable and a method
        that does not exist, so it could only fail with NameError; the
        failure is now explicit.
        """
        raise NotImplementedError("HTMLTree.validate is not implemented")

    # tools
    def _text_encoder(self, text):
        """Decode 'text' with self.charset when it is a byte string.

        Text that is already unicode, None (valueless attributes) and
        all text when no charset is set are returned unchanged.
        """
        if self.charset and isinstance(text, bytes):
            return text.decode(self.charset)
        return text

    def _attr_encoder(self, attrs):
        """Return 'attrs' (name/value pairs) with every value decoded."""
        return [(k, self._text_encoder(v)) for (k, v) in attrs]

    # Handlers
    def handle_starttag(self, tag, attrs):
        # some tags treat as start-end tag.
        if tag in self.UNCLOSABLE_TAGS:
            return self.handle_startendtag(tag, attrs)

        elem = HTMLElement(HTMLElement.TAG, tag, self._attr_encoder(attrs))

        if self._option & HTMLTree.USE_VALIDATE > 0:
            # try validation (experimental)
            # an <li> directly inside an open <li> closes that one first
            if tag == "li" and self._cursor.name == "li":
                self.handle_endtag("li")
            # end of validation

        elem._parent = self._cursor
        self._cursor.append(elem)
        self._cursor = elem

    def handle_endtag(self, tag):
        # some tags treat as start-end tag.
        if tag in self.UNCLOSABLE_TAGS:
            return

        parent = self._cursor.parent()
        # A surplus end tag at root level has nothing to close; stay on
        # the root instead of losing the cursor and crashing later.
        if parent is not None:
            self._cursor = parent

    def handle_startendtag(self, tag, attrs):
        elem = HTMLElement(HTMLElement.TAG, tag, self._attr_encoder(attrs))
        elem._parent = self._cursor
        self._cursor.append(elem)

    def handle_data(self, data):
        if self._option & HTMLTree.IGNORE_BLANK > 0:
            if re.search(r"^\s*$", data):
                data = ""

        elem = HTMLElement(HTMLElement.TEXT)
        elem._parent = self._cursor

        # decode the text with the document charset, if one is set
        elem._text = self._text_encoder(data)

        self._cursor.append(elem)

    def handle_entityref(self, name):
        # entity references are kept verbatim as text
        data = "&" + name + ";"
        self.handle_data(data)

    def handle_charref(self, ref):
        # character references are kept verbatim as text
        data = "&#" + ref + ";"
        self.handle_data(data)

    def handle_decl(self, decl):
        elem = HTMLElement(HTMLElement.DECL, decl)
        elem._parent = self._cursor
        self._cursor.append(elem)

    def handle_comment(self, data):
        elem = HTMLElement(HTMLElement.COMMENT, data)
        elem._parent = self._cursor
        self._cursor.append(elem)

    # Accessor
    def root(self):
        """Return the root HTMLElement of the most recently parsed tree."""
        return self._htmlroot
diff -r 000000000000 -r 4db780ccc4b4 test/htmltree
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/htmltree Tue Jun 11 18:51:57 2013 +0900
@@ -0,0 +1,1 @@
1+../
\ No newline at end of file
diff -r 000000000000 -r 4db780ccc4b4 test/sample.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/sample.html Tue Jun 11 18:51:57 2013 +0900
@@ -0,0 +1,15 @@
1+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
2+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
3+<head>
4+<title>htmltree.py sample html</title>
5+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
6+<link rel="stylesheet" type="text/css" href="main.css">
7+</head>
8+<body id="htmltree" class="content">
9+<div class="main-column">
10+<h1 id="header1">This is htmltree</h1>
11+foo bar hoge hoge
12+</div>
13+</didy>
14+</html>
15+
diff -r 000000000000 -r 4db780ccc4b4 test/test_base.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_base.py Tue Jun 11 18:51:57 2013 +0900
@@ -0,0 +1,54 @@
1+#!/usr/bin/env python
2+
3+from htmltree import htmltree
4+import sys
5+import re
6+
# Smoke test: parse the HTML file named on the command line and print the
# results of the basic query, render and delete operations.
try:
    input_path = sys.argv[1]
except IndexError:
    sys.exit("%s <input_html>" % sys.argv[0])

# The context manager closes the file even when read() fails.
with open(input_path, "r") as f:
    html_text = f.read()

tree = htmltree.HTMLTree()
tree.parse(html_text)

r = tree.root()

# Children of every <title> below <head>.
for title in r.select_by_name2("head", "title"):
    for item in title:
        print(item)
        print(item.text())

rn = htmltree.HTMLRenderer()
print(rn.render(r))

elem = r.get_element_by_id("htmltree")
print(elem.inner_html())

elem = r.get_element_by_id("header1")
print(elem.inner_html())

elem = r.select_1st("#header1")
print(elem.inner_html())

print("test_remove")
attrs = {
    "rel": "stylesheet",
    "type": "text/css",
    "href": "main.css",
}
# Remove every matching <link> and show the remaining document.
for elem in r.get_elements("link", attrs):
    elem.delete()
print(r.inner_html())

elem = r.select_1st("#htmltree")
elem.delete()
print(r.inner_html())
54+