| Revision | 21 (tree) |
|---|---|
| Date | 2011-05-24 20:31:21 |
| Author | linuxchecker |
Improved Wikipedia content extraction
```diff
@@ -59,6 +59,6 @@
         }catch(Exception e){
             e.printStackTrace();
         }
-        return(1d-Math.exp(-val));
+        return(1d-Math.exp(-0.5*(val)));
     }
 }
```
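For orientation (not part of the commit): a minimal Ruby sketch of how the added 0.5 factor changes the normalized score, assuming `val` is a non-negative raw distance mapped into [0, 1).

```ruby
# Hypothetical comparison of the old and new normalization curves.
old_norm = lambda { |val| 1.0 - Math.exp(-val) }        # before this revision
new_norm = lambda { |val| 1.0 - Math.exp(-0.5 * val) }  # after this revision

[0.1, 0.5, 1.0, 2.0, 5.0].each do |val|
  printf("val=%-4.1f  old=%.3f  new=%.3f\n", val, old_norm.call(val), new_norm.call(val))
end
# The 0.5 factor flattens the curve, so mid-range distances saturate
# toward 1.0 more slowly.
```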
```diff
@@ -42,7 +42,6 @@
 public class NldSearcher extends NldCore {
 
     static RAMDirectory RAM_INDEX_DIR;
-    static public String IPADIC;
     static public NldSearcher search = new NldSearcher();
     static IndexReader reader;
     static IndexSearcher searcher;
```
```diff
@@ -1,6 +1,7 @@
 #!/usr/local/bin/ruby
 # coding: UTF-8
 require 'nokogiri';
+require 'cgi';
 class Wikipedia_parse
   MEDIAWIKI = 'http://www.mediawiki.org/xml/export-0.5/';
   MEDIA_DIR = 'data';
@@ -17,7 +18,7 @@
       if item.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT \
       and item.name == "page" then
         outer_xml = Nokogiri::XML(item.outer_xml);
-        i += 1 if writer(outer_xml);
+        i += 1 if writer(outer_xml);
       end;
       break if i>=max;
     }
@@ -27,9 +28,19 @@
     id = id_node[0].children[0].text;
     return(false) if id == "1" or id == "2";
     text_node = xml_object.xpath('//car:text', 'car' => MEDIAWIKI);
-    text_contents = text_node.children[0];
+    text_contents = CGI.unescapeHTML(text_node.children[0].to_xml(:encoding => 'UTF-8'));
+    text_contents.gsub!(/\[\[\w{2,3}:[^\]]+\]\]/,'');   # drop links to other-language editions
+    text_contents.gsub!(/\n{2,}+/,"\n");                # collapse multiple carriage returns
+    text_contents.gsub!(/<!--[^(\-\->)]*-->/,"");       # strip invisible comments
+    text_contents.gsub!(/style="[^"]+"/,"");            # strip inline CSS
+    text_contents.gsub!(/<ref>\{\{PDFlink\|\[\S+\s(.+)\]\}\}(\{\{.+\}\})?([^<]+)<\/ref>/,'\1');  # PDF link references
+    text_contents.gsub!(/<ref>\[http\S+\s([^\]]+)\](\{\{.+\}\})?([^<]*)<\/ref>/,"<<\\1>>");      # URI references
+    text_contents.gsub!(/\[http\S+\]/,'');              # remove bare external links
+#p text_contents;
     fp = File.open("#{MEDIA_DIR}/#{id}.txt","w");
-    fp.puts(text_contents.to_xml(:encoding => 'UTF-8')) if text_contents;
+# puts "#####################################################################\n"
+# puts(text_contents) if text_contents;
+    fp.puts(text_contents) if text_contents;
     fp.close();
     return(true);
   end;
```
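To see what the new cleanup pipeline does, here is a self-contained Ruby sketch; the sample markup and the subset of substitutions are illustrative only, not taken from the repository's data.

```ruby
require 'cgi'

sample = CGI.unescapeHTML(<<'WIKI')
Example article text. [[en:Example]]
<!-- editors: please keep this section short -->
See <ref>[http://example.org/page an external reference] for detail</ref>.
Stray link [http://example.org/other] here.


Next paragraph after blank lines.
WIKI

sample.gsub!(/\[\[\w{2,3}:[^\]]+\]\]/, '')   # drop inter-language links
sample.gsub!(/\n{2,}/, "\n")                 # collapse runs of blank lines
sample.gsub!(/<!--[^(\-\->)]*-->/, '')       # strip hidden comments
sample.gsub!(/<ref>\[http\S+\s([^\]]+)\](\{\{.+\}\})?([^<]*)<\/ref>/, "<<\\1>>")  # keep only the reference title
sample.gsub!(/\[http\S+\]/, '')              # remove bare external links
puts sample
```

Running it prints the text with the inter-language link, the comment, and the bare URL removed, and the `<ref>` element reduced to `<<an external reference>>`.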
```diff
@@ -106,3 +106,14 @@
 
 doc.add(new Field("path", f.getPath(), Field.Store.NO, Field.Index.NOT_ANALYZED));
 
+writer=org.apache.lucene.index.IndexWriter@1aa57fb
+adding ../../data/267365.txt
+Optimizing...
+22829755 total milliseconds
+
+real 380m31.397s
+user 127m44.471s
+sys 7m45.093s
+
+* Loader log to image map
+ grep , tmp1.log |awk 'BEGIN{i=1;j=0;FS=","}{ val[i,j]=$3; val[j,i]=$3; i+=1; if(i==20){j+=1; i=j+1;} }END{for(k=0;k<20;++k){ printf("%d,",k); for(l=0;l<20;++l) {printf("%.3f",val[k,l]); if(l!=19) printf(",");}; printf("\n");}}'
\ No newline at end of file
```
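A hedged Ruby equivalent of the appended awk one-liner, assuming `tmp1.log` holds one comma-separated record per document pair with the distance in the third field, and that 20 documents were loaded:

```ruby
N   = 20
val = Array.new(N) { Array.new(N, 0.0) }   # diagonal stays 0.000, as in the awk output

i, j = 1, 0
File.foreach('tmp1.log') do |line|
  next unless line.include?(',')           # same filter as `grep , tmp1.log`
  break if i >= N                          # upper triangle is full
  d = line.split(',')[2].to_f              # third field: pairwise distance
  val[i][j] = val[j][i] = d                # fill both triangles of the symmetric map
  i += 1
  if i == N                                # move on to the next column of the triangle
    j += 1
    i = j + 1
  end
end

val.each_with_index do |row, k|
  puts(([k] + row.map { |v| format('%.3f', v) }).join(','))
end
```

The printed rows have the same layout as the awk output (row index followed by 20 comma-separated values), ready to be fed to whatever renders the image map.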