• R/O
  • SSH
  • HTTPS

nls: 提交


Commit MetaInfo

修订版: 24 (tree)
时间: 2011-07-12 10:06:42
作者: tullio

Log Message

introduced cache in loader mode
introduced new distance formula
introduced code to skip REDIRECT pages in the Wikipedia parser

更改概述

差异

--- src/org/mathsci/distance/NldCalc.java (revision 23)
+++ src/org/mathsci/distance/NldCalc.java (revision 24)
@@ -12,20 +12,23 @@
1212 }
1313 public double computeScore(String[] args) throws UnsupportedEncodingException
1414 {
15- int x, y, xy;
15+ Integer x, y, xy, hits;
1616 double lx,ly,lxy;
1717 double m = max_pages;
18- double val = 0d;
18+ double val = 0d, val2 = 0d;
1919 Date start_time = new Date();
2020 PrintStream out = new PrintStream(System.out, true, "UTF-8");
21+ double alpha=1.0;
2122 try{
2223
2324 if(timewatch) System.out.println("time="+(new Date().getTime()-start_time.getTime())+"<BR />");
2425 NldSearcher s = new NldSearcher();
2526 if(timewatch) System.out.println("time="+(new Date().getTime()-start_time.getTime())+"<BR />");
26- x = s.searcher(args[1]);
27+ if((x=HitCache.get(args[1]))==null){ x = s.searcher(args[1]); HitCache.put(args[1], x);}
28+ else System.out.println("cache hit for "+args[1]+"["+x+"]");
2729 if(timewatch) System.out.println("time="+(new Date().getTime()-start_time.getTime())+"<BR />");
28- y = s.searcher(args[2]);
30+ if((y=HitCache.get(args[2]))==null){ y = s.searcher(args[2]); HitCache.put(args[2], y);}
31+ else System.out.println("cache hit for "+args[2]+"["+y+"]");
2932 if(timewatch) System.out.println("time="+(new Date().getTime()-start_time.getTime())+"<BR />");
3033
3134 if(y==0)
@@ -50,18 +53,48 @@
5053 if(timewatch) out.println("time="+(new Date().getTime()-start_time.getTime())+"<BR />");
5154 lx = Math.log(x);
5255 ly = Math.log(y);
53- lxy = Math.log(xy);
56+ // if(xy==0)
57+ // {
58+ // lxy = Math.max(lx,ly)-Math.log(m)+Math.min(lx,ly);
59+ // xy = Math.min(x,y);
60+ // }
61+ // else
62+ lxy = Math.log(xy);
5463
5564 if(debug_level>1)
5665 {
5766 System.out.println((new String(args[1].getBytes(),"UTF-8"))+" <> "+args[2]);
5867 }
59- val = (Math.max(lx,ly)-lxy)/(Math.log(m)-Math.min(lx,ly));
68+ if(xy==0)
69+ val = 1;
70+ else
71+ val = (Math.max(lx,ly)-lxy)/(Math.log(m)-Math.min(lx,ly));
72+System.out.println(x+","+y+","+xy);
73+if(xy==0)
74+ alpha=m;
75+else
76+ alpha=Math.log(Math.min(x,y)/xy);
77+System.out.println(alpha);
78+val2=(1d-Math.exp((-alpha*(Math.max(lx,ly)-lxy)/(Math.log(m)-Math.min(lx,ly)))));
79+
80+//alpha=Math.abs(x-y)/(double)Math.max(x,y);
81+//alpha=1d/(xy/(double)(x+y)*2.0);
82+//alpha=1d-xy/(double)Math.max(x,y);
83+//alpha=(double)xy/Math.min(x,y);
84+//alpha=(double)xy/Math.max(x,y);
85+//alpha=1.0*(double)Math.min(x,y)/Math.max(x,y);
86+//val2 = (Math.max(lx,ly)-lxy)/(Math.log(m*alpha)-Math.min(lx,ly));
6087 }
6188 s.close();
6289 }catch(Exception e){
6390 e.printStackTrace();
6491 }
65- return(1d-Math.exp(-0.2*(val)));
92+ // return(1d-Math.exp(-0.2*(val)));
93+ // System.out.println("new NLD1="+(1d-alpha*Math.exp((val))));
94+ // System.out.println("NLD,"+args[1]+","+args[2]+","+(1d-Math.exp(-alpha*(val))));
95+ // System.out.println("new NLD3="+((1d-Math.exp(-alpha*(val)))+1d-alpha*Math.exp((val)))/2.0);
96+ System.out.println("NLD "+(val2));
97+ // System.out.println("new NLD5="+(1d-Math.exp((val2))));
98+ return(val);
6699 }
67100 }
--- src/org/mathsci/distance/NldCore.java (revision 23)
+++ src/org/mathsci/distance/NldCore.java (revision 24)
@@ -12,6 +12,7 @@
1212 static int debug_level;
1313 static public boolean timewatch = false;
1414 static public long start_time = 0;
15+ static public HashMap<String, Integer> HitCache = new HashMap<String, Integer>();
1516 protected NldCore()
1617 {
1718 try
--- src/org/mathsci/distance/NldSearcher.java (revision 23)
+++ src/org/mathsci/distance/NldSearcher.java (revision 24)
@@ -23,7 +23,9 @@
2323 import java.io.InputStreamReader;
2424
2525 import org.apache.lucene.analysis.Analyzer;
26-//import org.apache.lucene.analysis.standard.StandardAnalyzer;
26+import org.apache.lucene.analysis.standard.StandardAnalyzer;
27+import org.apache.lucene.analysis.SimpleAnalyzer;
28+import org.apache.lucene.analysis.WhitespaceAnalyzer;
2729 import net.reduls.igo.Tagger;
2830 import net.reduls.igo.analysis.ipadic.IpadicAnalyzer;
2931 import org.apache.lucene.document.Document;
@@ -60,8 +62,10 @@
6062
6163 reader = IndexReader.open(dir, true);
6264 searcher = new IndexSearcher(reader);
63-// Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
64- analyzer = new IpadicAnalyzer(new Tagger(IPADIC));
65+ // analyzer = new SimpleAnalyzer(Version.LUCENE_31);
66+ // analyzer = new StandardAnalyzer(Version.LUCENE_31);
67+ analyzer = new IpadicAnalyzer(new Tagger(IPADIC));
68+ // analyzer = new WhitespaceAnalyzer(Version.LUCENE_31);
6569
6670 }catch(IOException e)
6771 {
@@ -89,9 +93,10 @@
8993 }
9094
9195 QueryParser parser = new QueryParser(Version.LUCENE_31, field, analyzer);
96+ parser.setDefaultOperator(QueryParser.AND_OPERATOR);
9297 String line = in.readLine();
9398 line = line.trim();
94- Query query = parser.parse(line);
99+ Query query = parser.parse("+"+line);
95100 out.println("Searching for: " + query.toString(field)+"</ BR>");
96101 int numHitsTotal = doStreamingSearch(searcher, query);
97102 return(numHitsTotal);
--- src/org/mathsci/distance/NldCommand.java (revision 23)
+++ src/org/mathsci/distance/NldCommand.java (revision 24)
@@ -68,14 +68,17 @@
6868 }
6969 String[] word = new String[3];
7070 int len = stringBuffer.size();
71- for(int i = 0;i<len-1;++i)
71+// for(int i = 0;i<len-1;++i)
72+ for(int i = 0;i<len;++i)
7273 { word[1] = stringBuffer.get(i);
73- for(int j=i+1; j<len;++j)
74+// for(int j=i; j<len;++j)
75+ for(int j=0; j<len;++j)
7476 {
7577 word[2] = stringBuffer.get(j);
7678 try{
7779 double val = ncalc.computeScore(word);
78- System.out.println(word[1]+","+word[2]+","+val);
80+ System.out.println("NGD,"+word[1]+","+word[2]+","+val);
81+ System.out.println("NGD "+val);
7982 }catch(UnsupportedEncodingException e)
8083 {
8184 e.printStackTrace();
--- bin/wikipedia_parse.rb (revision 23)
+++ bin/wikipedia_parse.rb (revision 24)
@@ -1,4 +1,4 @@
1-#!/usr/local/bin/ruby
1+#!/usr/local/bin/ruby19
22 # coding: UTF-8
33 require 'nokogiri';
44 require 'cgi';
@@ -44,7 +44,7 @@
4444 text_contents.gsub!(/\[http\S+\]/,''); # link
4545 text_contents.gsub!(/(style)|(class)|(width)|(align)|(colspan)|(rowspan)\s=\s"[^"]+"/,''); # ignore style
4646 text_contents.gsub!(/\n{2,}/,"\n"); # multiple cariage return
47- return(false) if text_contents.scan(/$/).size()==1;
47+ return(false) if text_contents.scan(/$/).size()==1 || text_contents =~ /^#REDIRECT/;
4848 fp = File.open("#{MEDIA_DIR}/#{id}.txt","w");
4949 # puts "#####################################################################\n"
5050 # p text_contents if text_contents;
Show on old repository browser