修订版 | 22 (tree) |
---|---|
时间 | 2011-05-30 16:51:57 |
作者 | linuxchecker |
improved wikipedia_parse
@@ -27,19 +27,28 @@ | ||
27 | 27 | id_node = xml_object.xpath('//car:id', 'car' => MEDIAWIKI); |
28 | 28 | id = id_node[0].children[0].text; |
29 | 29 | return(false) if id == "1" or id == "2"; |
30 | +# title = id_node[0].children[0].title; | |
31 | +# return(false) if title =~ /^ファイル:/ | |
32 | +#p id_node[0].children[0]; | |
30 | 33 | text_node = xml_object.xpath('//car:text', 'car' => MEDIAWIKI); |
34 | +#p text_node; | |
35 | + return(false) if !text_node.children[0]; | |
31 | 36 | text_contents = CGI.unescapeHTML(text_node.children[0].to_xml(:encoding => 'UTF-8')); |
37 | +#p text_contents; | |
38 | + text_contents.gsub!(/<!--[^(\-\->)]*-->/,""); # unvisible comments | |
39 | + text_contents.gsub!(/<br\/><small><\/small><span ><\/span><sub><\/sub>/,''); # unvisible tag | |
32 | 40 | text_contents.gsub!(/\[\[\w{2,3}:[^\]]+\]\]/,''); # link other language |
33 | - text_contents.gsub!(/\n{2,}+/,"\n"); # multiple cariage return | |
34 | - text_contents.gsub!(/<!--[^(\-\->)]*-->/,""); # unvisible comments | |
35 | 41 | text_contents.gsub!(/style="[^"]+"/,""); # CSS |
36 | 42 | text_contents.gsub!(/<ref>\{\{PDFlink\|\[\S+\s(.+)\]\}\}(\{\{.+\}\})?([^<]+)<\/ref>/,'\1'); # PDF |
37 | 43 | text_contents.gsub!(/<ref>\[http\S+\s([^\]]+)\](\{\{.+\}\})?([^<]*)<\/ref>/,"<<\\1>>"); # uri reference |
38 | 44 | text_contents.gsub!(/\[http\S+\]/,''); # link |
39 | -#p text_contents; | |
45 | + text_contents.gsub!(/(style)|(class)|(width)|(align)|(colspan)|(rowspan)\s=\s"[^"]+"/,''); # ignore style | |
46 | + text_contents.gsub!(/\n{2,}/,"\n"); # multiple cariage return | |
47 | + return(false) if text_contents.scan(/$/).size()==1; | |
40 | 48 | fp = File.open("#{MEDIA_DIR}/#{id}.txt","w"); |
41 | 49 | # puts "#####################################################################\n" |
42 | -# puts(text_contents) if text_contents; | |
50 | +# p text_contents if text_contents; | |
51 | +# puts text_contents if text_contents; | |
43 | 52 | fp.puts(text_contents) if text_contents; |
44 | 53 | fp.close(); |
45 | 54 | return(true); |