• R/O
  • SSH
  • HTTPS

nls: 提交


Commit MetaInfo

修订版21 (tree)
时间2011-05-24 20:31:21
作者linuxchecker

Log Message

Improved Wikipedia content extraction

更改概述

差异

--- src/org/mathsci/distance/NldCalc.java (revision 20)
+++ src/org/mathsci/distance/NldCalc.java (revision 21)
@@ -59,6 +59,6 @@
5959 }catch(Exception e){
6060 e.printStackTrace();
6161 }
62- return(1d-Math.exp(-val));
62+ return(1d-Math.exp(-0.5*(val)));
6363 }
6464 }
--- src/org/mathsci/distance/NldSearcher.java (revision 20)
+++ src/org/mathsci/distance/NldSearcher.java (revision 21)
@@ -42,7 +42,6 @@
4242 public class NldSearcher extends NldCore {
4343
4444 static RAMDirectory RAM_INDEX_DIR;
45- static public String IPADIC;
4645 static public NldSearcher search = new NldSearcher();
4746 static IndexReader reader;
4847 static IndexSearcher searcher;
--- bin/wikipedia_parse.rb (revision 20)
+++ bin/wikipedia_parse.rb (revision 21)
@@ -1,6 +1,7 @@
11 #!/usr/local/bin/ruby
22 # coding: UTF-8
33 require 'nokogiri';
4+require 'cgi';
45 class Wikipedia_parse
56 MEDIAWIKI = 'http://www.mediawiki.org/xml/export-0.5/';
67 MEDIA_DIR = 'data';
@@ -17,7 +18,7 @@
1718 if item.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT \
1819 and item.name == "page" then
1920 outer_xml = Nokogiri::XML(item.outer_xml);
20- i += 1 if writer(outer_xml);
21+ i += 1 if writer(outer_xml);
2122 end;
2223 break if i>=max;
2324 }
@@ -27,9 +28,19 @@
2728 id = id_node[0].children[0].text;
2829 return(false) if id == "1" or id == "2";
2930 text_node = xml_object.xpath('//car:text', 'car' => MEDIAWIKI);
30- text_contents = text_node.children[0];
31+ text_contents = CGI.unescapeHTML(text_node.children[0].to_xml(:encoding => 'UTF-8'));
32+ text_contents.gsub!(/\[\[\w{2,3}:[^\]]+\]\]/,''); # link other language
33+ text_contents.gsub!(/\n{2,}+/,"\n"); # multiple cariage return
34+ text_contents.gsub!(/<!--[^(\-\->)]*-->/,""); # unvisible comments
35+ text_contents.gsub!(/style="[^"]+"/,""); # CSS
36+ text_contents.gsub!(/<ref>\{\{PDFlink\|\[\S+\s(.+)\]\}\}(\{\{.+\}\})?([^<]+)<\/ref>/,'\1'); # PDF
37+ text_contents.gsub!(/<ref>\[http\S+\s([^\]]+)\](\{\{.+\}\})?([^<]*)<\/ref>/,"<<\\1>>"); # uri reference
38+ text_contents.gsub!(/\[http\S+\]/,''); # link
39+#p text_contents;
3140 fp = File.open("#{MEDIA_DIR}/#{id}.txt","w");
32- fp.puts(text_contents.to_xml(:encoding => 'UTF-8')) if text_contents;
41+# puts "#####################################################################\n"
42+# puts(text_contents) if text_contents;
43+ fp.puts(text_contents) if text_contents;
3344 fp.close();
3445 return(true);
3546 end;
--- README.txt (revision 20)
+++ README.txt (revision 21)
@@ -106,3 +106,14 @@
106106
107107 doc.add(new Field("path", f.getPath(), Field.Store.NO, Field.Index.NOT_ANALYZED));
108108
109+writer=org.apache.lucene.index.IndexWriter@1aa57fb
110+adding ../../data/267365.txt
111+Optimizing...
112+22829755 total milliseconds
113+
114+real 380m31.397s
115+user 127m44.471s
116+sys 7m45.093s
117+
118+* Loader log to image map
119+ grep , tmp1.log |awk 'BEGIN{i=1;j=0;FS=","}{ val[i,j]=$3; val[j,i]=$3; i+=1; if(i==20){j+=1; i=j+1;} }END{for(k=0;k<20;++k){ printf("%d,",k); for(l=0;l<20;++l) {printf("%.3f",val[k,l]); if(l!=19) printf(",");}; printf("\n");}}'
\ No newline at end of file
Show on old repository browser