• R/O
  • SSH
  • HTTPS

nls: 提交


Commit MetaInfo

修订版: 22 (tree)
时间: 2011-05-30 16:51:57
作者: linuxchecker

Log Message

improved wikipedia_parse

更改概述

差异

--- bin/wikipedia_parse.rb (revision 21)
+++ bin/wikipedia_parse.rb (revision 22)
@@ -27,19 +27,28 @@
2727 id_node = xml_object.xpath('//car:id', 'car' => MEDIAWIKI);
2828 id = id_node[0].children[0].text;
2929 return(false) if id == "1" or id == "2";
30+# title = id_node[0].children[0].title;
31+# return(false) if title =~ /^ファイル:/
32+#p id_node[0].children[0];
3033 text_node = xml_object.xpath('//car:text', 'car' => MEDIAWIKI);
34+#p text_node;
35+ return(false) if !text_node.children[0];
3136 text_contents = CGI.unescapeHTML(text_node.children[0].to_xml(:encoding => 'UTF-8'));
37+#p text_contents;
38+ text_contents.gsub!(/<!--[^(\-\->)]*-->/,""); # unvisible comments
39+ text_contents.gsub!(/<br\/><small><\/small><span ><\/span><sub><\/sub>/,''); # unvisible tag
3240 text_contents.gsub!(/\[\[\w{2,3}:[^\]]+\]\]/,''); # link other language
33- text_contents.gsub!(/\n{2,}+/,"\n"); # multiple cariage return
34- text_contents.gsub!(/<!--[^(\-\->)]*-->/,""); # unvisible comments
3541 text_contents.gsub!(/style="[^"]+"/,""); # CSS
3642 text_contents.gsub!(/<ref>\{\{PDFlink\|\[\S+\s(.+)\]\}\}(\{\{.+\}\})?([^<]+)<\/ref>/,'\1'); # PDF
3743 text_contents.gsub!(/<ref>\[http\S+\s([^\]]+)\](\{\{.+\}\})?([^<]*)<\/ref>/,"<<\\1>>"); # uri reference
3844 text_contents.gsub!(/\[http\S+\]/,''); # link
39-#p text_contents;
45+ text_contents.gsub!(/(style)|(class)|(width)|(align)|(colspan)|(rowspan)\s=\s"[^"]+"/,''); # ignore style
46+ text_contents.gsub!(/\n{2,}/,"\n"); # multiple cariage return
47+ return(false) if text_contents.scan(/$/).size()==1;
4048 fp = File.open("#{MEDIA_DIR}/#{id}.txt","w");
4149 # puts "#####################################################################\n"
42-# puts(text_contents) if text_contents;
50+# p text_contents if text_contents;
51+# puts text_contents if text_contents;
4352 fp.puts(text_contents) if text_contents;
4453 fp.close();
4554 return(true);
Show on old repository browser