version 8.2.0
@@ -0,0 +1,211 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Fess Solr plugin (suggestion tokenizers for Solr). -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>jp.sf.fess</groupId>
  <artifactId>fess-solr-plugin</artifactId>
  <version>8.2.0</version>
  <packaging>jar</packaging>
  <name>fess-solr-plugin</name>
  <url>http://fess.codelibs.org/</url>
  <inceptionYear>2009</inceptionYear>
  <licenses>
    <license>
      <name>The Apache Software License, Version 2.0</name>
      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
      <distribution>repo</distribution>
    </license>
  </licenses>
  <organization>
    <name>CodeLibs</name>
    <url>http://www.codelibs.org/</url>
  </organization>
  <developers>
    <developer>
      <id>shinsuke</id>
      <name>Shinsuke Sugaya</name>
      <email>shinsuke_at_yahoo.co.jp</email>
      <url>http://d.hatena.ne.jp/shinsuke_sugaya/</url>
    </developer>
  </developers>
  <issueManagement>
    <url><![CDATA[http://sourceforge.jp/projects/fess/ticket/]]></url>
  </issueManagement>
  <!-- Artifacts and site are deployed over FTP; this requires the wagon-ftp
       build extension declared in <build><extensions> below. -->
  <distributionManagement>
    <repository>
      <id>codelibs-repository</id>
      <url>ftp://maven.codelibs.org/home/codelibs/maven/</url>
    </repository>
    <site>
      <id>codelibs-site</id>
      <url>ftp://maven.codelibs.org/home/codelibs/fess/projects/fess-solr-plugin/</url>
    </site>
  </distributionManagement>
  <scm>
    <connection>scm:svn:http://svn.sourceforge.jp/svnroot/fess/fess-solr-plugin/trunk</connection>
    <developerConnection>scm:svn:https://svn.sourceforge.jp/svnroot/fess/fess-solr-plugin/trunk</developerConnection>
    <url>http://sourceforge.jp/projects/fess/svn/view/</url>
  </scm>
  <mailingLists>
    <mailingList>
      <name>User List</name>
      <subscribe>http://lists.sourceforge.jp/mailman/listinfo/fess-user</subscribe>
      <unsubscribe>http://lists.sourceforge.jp/mailman/listinfo/fess-user</unsubscribe>
      <post>fess-user@lists.sourceforge.jp</post>
      <archive>http://sourceforge.jp/projects/fess/lists/archive/user/</archive>
    </mailingList>
  </mailingLists>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>
  <build>
    <plugins>
      <!-- Target Java 6 bytecode; sources are UTF-8 throughout. -->
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <!-- Attach a -sources jar during package. -->
      <plugin>
        <artifactId>maven-source-plugin</artifactId>
        <executions>
          <execution>
            <id>source-jar</id>
            <phase>package</phase>
            <goals>
              <goal>jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <!-- Javadoc is generated during the site phase and cross-links the
           JDK/Seasar/S2Robot API docs. -->
      <plugin>
        <artifactId>maven-javadoc-plugin</artifactId>
        <configuration>
          <encoding>UTF-8</encoding>
          <docencoding>UTF-8</docencoding>
          <charset>UTF-8</charset>
          <links>
            <link>http://docs.oracle.com/javase/6/docs/api/</link>
            <link>http://docs.oracle.com/javaee/6/api/</link>
            <link>http://aopalliance.sourceforge.net/doc/</link>
            <link>http://www.csg.is.titech.ac.jp/~chiba/javassist/html/</link>
            <link>http://www.junit.org/junit/javadoc/4.3/</link>
            <link>http://www.junit.org/junit/javadoc/3.8.1/</link>
            <link>http://s2container.seasar.org/2.4/s2-framework/ja/apidocs/</link>
            <link>http://s2container.seasar.org/2.4/s2-extension/ja/apidocs/</link>
            <link>http://s2container.seasar.org/2.4/s2-tiger/ja/apidocs/</link>
            <link>http://s2robot.sandbox.seasar.org/apidocs/</link>
          </links>
        </configuration>
        <executions>
          <execution>
            <phase>site</phase>
            <goals>
              <goal>javadoc</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <!-- Enforces the Apache license header on all Java sources. -->
      <plugin>
        <groupId>com.mycila.maven-license-plugin</groupId>
        <artifactId>maven-license-plugin</artifactId>
        <version>1.5.0</version>
        <configuration>
          <header>${basedir}/src/etc/header.txt</header>
          <includes>
            <include>src/**/*.java</include>
          </includes>
          <encoding>UTF-8</encoding>
          <headerDefinitions>
            <headerDefinition>${basedir}/src/etc/header-definition.xml</headerDefinition>
          </headerDefinitions>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-site-plugin</artifactId>
        <configuration>
          <locales>en,ja</locales>
          <inputEncoding>UTF-8</inputEncoding>
          <outputEncoding>UTF-8</outputEncoding>
        </configuration>
      </plugin>
    </plugins>
    <!-- wagon-ftp enables the ftp:// URLs used in <distributionManagement>. -->
    <extensions>
      <extension>
        <groupId>org.apache.maven.wagon</groupId>
        <artifactId>wagon-ftp</artifactId>
        <version>1.0-beta-6</version>
      </extension>
    </extensions>
  </build>
  <pluginRepositories>
    <pluginRepository>
      <id>maven.seasar.org</id>
      <name>The Seasar Foundation Maven2 Repository</name>
      <url>http://maven.seasar.org/maven2/</url>
    </pluginRepository>
    <pluginRepository>
      <id>maven-snapshot.seasar.org</id>
      <name>The Seasar Foundation Maven2 Repository</name>
      <url>http://maven.seasar.org/maven2-snapshot/</url>
    </pluginRepository>
    <pluginRepository>
      <name>oss.sonatype.org</name>
      <id>oss.sonatype.org</id>
      <url>http://oss.sonatype.org/content/groups/public/</url>
    </pluginRepository>
  </pluginRepositories>
  <repositories>
    <repository>
      <id>codelibs.org</id>
      <name>CodeLibs Repository</name>
      <url>http://maven.codelibs.org/</url>
    </repository>
    <repository>
      <id>maven.seasar.org</id>
      <name>The Seasar Foundation Maven2 Repository</name>
      <url>http://maven.seasar.org/maven2/</url>
    </repository>
    <repository>
      <id>maven-snapshot.seasar.org</id>
      <name>The Seasar Foundation Maven2 Repository</name>
      <url>http://maven.seasar.org/maven2-snapshot/</url>
    </repository>
    <repository>
      <id>orangesignal.sourceforge.jp</id>
      <name>OrangeSignal Repository</name>
      <url>http://orangesignal.sourceforge.jp/maven2/</url>
    </repository>
  </repositories>
  <dependencies>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-core</artifactId>
      <version>4.4.0</version>
      <exclusions>
        <!-- NOTE(review): commons-io is normally published under the
             commons-io:commons-io coordinates, not org.apache.commons:commons-io.
             Verify this exclusion actually matches the transitive artifact it is
             meant to suppress; as written it may exclude nothing. -->
        <exclusion>
          <groupId>org.apache.commons</groupId>
          <artifactId>commons-io</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.noggit</groupId>
      <artifactId>noggit</artifactId>
      <version>0.5</version>
    </dependency>
    <dependency>
      <groupId>jp.sf.fess</groupId>
      <artifactId>fess-suggest</artifactId>
      <version>1.0.2</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>
@@ -0,0 +1,381 @@ | ||
1 | +/* | |
2 | + * Copyright 2009-2013 the Fess Project and the Others. | |
3 | + * | |
4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | + * you may not use this file except in compliance with the License. | |
6 | + * You may obtain a copy of the License at | |
7 | + * | |
8 | + * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | + * | |
10 | + * Unless required by applicable law or agreed to in writing, software | |
11 | + * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
13 | + * either express or implied. See the License for the specific language | |
14 | + * governing permissions and limitations under the License. | |
15 | + */ | |
16 | + | |
17 | +package jp.sf.fess.solr.plugin.analysis; | |
18 | + | |
19 | +import java.io.IOException; | |
20 | +import java.io.Reader; | |
21 | +import java.io.StringReader; | |
22 | +import java.util.ArrayList; | |
23 | +import java.util.HashMap; | |
24 | +import java.util.List; | |
25 | +import java.util.Map; | |
26 | + | |
27 | +import jp.sf.fess.suggest.converter.SuggestConverter; | |
28 | + | |
29 | +import org.apache.commons.io.IOUtils; | |
30 | +import org.apache.lucene.analysis.TokenStream; | |
31 | +import org.apache.lucene.analysis.Tokenizer; | |
32 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
33 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; | |
34 | +import org.apache.lucene.analysis.ja.dict.UserDictionary; | |
35 | +import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute; | |
36 | +import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute; | |
37 | +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
38 | +import org.slf4j.Logger; | |
39 | +import org.slf4j.LoggerFactory; | |
40 | + | |
41 | +public class SuggestTextTokenizer extends Tokenizer { | |
42 | + private static final Logger logger = LoggerFactory | |
43 | + .getLogger(SuggestTextTokenizer.class); | |
44 | + | |
45 | + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | |
46 | + | |
47 | + private String inputStr = ""; | |
48 | + | |
49 | + private int offset = 0; | |
50 | + | |
51 | + private int readingOffset = 0; | |
52 | + | |
53 | + private final List<String> termListByKuromoji = new ArrayList<String>(); | |
54 | + | |
55 | + private final List<String> readingList = new ArrayList<String>(); | |
56 | + | |
57 | + private final List<String> partOfSpeechList = new ArrayList<String>(); | |
58 | + | |
59 | + private final List<String> suggestStringList = new ArrayList<String>(); | |
60 | + | |
61 | + private final UserDictionary userDictionary; | |
62 | + | |
63 | + private final boolean discardPunctuation; | |
64 | + | |
65 | + private final Mode tokenizerMode; | |
66 | + | |
67 | + private final String wordSeparator; | |
68 | + | |
69 | + private final TermChecker termChecker; | |
70 | + | |
71 | + private final List<SuggestConverter> preConverterList; | |
72 | + | |
73 | + private final List<SuggestConverter> converterList; | |
74 | + | |
75 | + private final int maxLength; | |
76 | + | |
77 | + public SuggestTextTokenizer(final Reader input, final int bufferSize, | |
78 | + final UserDictionary userDictionaryPara, | |
79 | + final boolean discardPunctuationPara, final Mode modePara, | |
80 | + final TermChecker termChecker, | |
81 | + final List<SuggestConverter> preconverterList, | |
82 | + final List<SuggestConverter> converterList, | |
83 | + final String wordSeparator, final int maxLength) { | |
84 | + super(input); | |
85 | + | |
86 | + userDictionary = userDictionaryPara; | |
87 | + discardPunctuation = discardPunctuationPara; | |
88 | + tokenizerMode = modePara; | |
89 | + termAtt.resizeBuffer(bufferSize); | |
90 | + this.wordSeparator = wordSeparator; | |
91 | + this.termChecker = termChecker; | |
92 | + preConverterList = preconverterList; | |
93 | + this.converterList = converterList; | |
94 | + this.maxLength = maxLength; | |
95 | + | |
96 | + initialize(); | |
97 | + } | |
98 | + | |
99 | + public void initialize() { | |
100 | + termListByKuromoji.clear(); | |
101 | + partOfSpeechList.clear(); | |
102 | + readingList.clear(); | |
103 | + suggestStringList.clear(); | |
104 | + offset = 0; | |
105 | + readingOffset = 0; | |
106 | + inputStr = ""; | |
107 | + | |
108 | + try { | |
109 | + String s = IOUtils.toString(input); | |
110 | + if (s != null && s.length() > 0) { | |
111 | + if (maxLength > 0 && s.length() > maxLength) { | |
112 | + s = truncateInput(s); | |
113 | + } | |
114 | + inputStr = s; | |
115 | + for (final SuggestConverter converter : preConverterList) { | |
116 | + inputStr = converter.convert(inputStr); | |
117 | + } | |
118 | + } | |
119 | + } catch (final IOException e) { | |
120 | + } | |
121 | + | |
122 | + final Reader rd = new StringReader(inputStr); | |
123 | + | |
124 | + TokenStream stream = null; | |
125 | + | |
126 | + try { | |
127 | + stream = new JapaneseTokenizer(rd, userDictionary, | |
128 | + discardPunctuation, tokenizerMode); | |
129 | + | |
130 | + stream.reset(); | |
131 | + while (stream.incrementToken()) { | |
132 | + final CharTermAttribute att = stream | |
133 | + .getAttribute(CharTermAttribute.class); | |
134 | + termListByKuromoji.add(att.toString()); | |
135 | + | |
136 | + final PartOfSpeechAttribute psAtt = stream | |
137 | + .getAttribute(PartOfSpeechAttribute.class); | |
138 | + final String pos = psAtt.getPartOfSpeech(); | |
139 | + partOfSpeechList.add(pos); | |
140 | + | |
141 | + final ReadingAttribute rdAttr = stream | |
142 | + .getAttribute(ReadingAttribute.class); | |
143 | + | |
144 | + String reading; | |
145 | + if (rdAttr.getReading() != null) { | |
146 | + reading = rdAttr.getReading(); | |
147 | + } else { | |
148 | + reading = att.toString(); | |
149 | + } | |
150 | + | |
151 | + for (final SuggestConverter converter : converterList) { | |
152 | + reading = converter.convert(reading); | |
153 | + } | |
154 | + readingList.add(reading); | |
155 | + | |
156 | + } | |
157 | + } catch (final Exception e) { | |
158 | + logger.warn("JapaneseTokenizer stream error", e); | |
159 | + } finally { | |
160 | + try { | |
161 | + input.reset(); | |
162 | + } catch (final Exception e) { | |
163 | + } | |
164 | + try { | |
165 | + stream.end(); | |
166 | + } catch (final Exception e) { | |
167 | + } | |
168 | + try { | |
169 | + rd.close(); | |
170 | + } catch (final Exception e) { | |
171 | + } | |
172 | + } | |
173 | + } | |
174 | + | |
175 | + private String truncateInput(final String s) { | |
176 | + int pos = maxLength; | |
177 | + while (pos > 0) { | |
178 | + final int ch = s.codePointAt(pos); | |
179 | + if (!Character.isLetterOrDigit(ch)) { | |
180 | + break; | |
181 | + } | |
182 | + pos--; | |
183 | + } | |
184 | + if (pos == 0) { | |
185 | + pos = maxLength; | |
186 | + } | |
187 | + | |
188 | + return s.substring(0, pos); | |
189 | + } | |
190 | + | |
191 | + @Override | |
192 | + public boolean incrementToken() throws IOException { | |
193 | + | |
194 | + if (offset < termListByKuromoji.size()) { | |
195 | + while (partOfSpeechList.get(offset).indexOf("名詞") == -1) { | |
196 | + offset++; | |
197 | + if (offset >= termListByKuromoji.size()) { | |
198 | + break; | |
199 | + } | |
200 | + } | |
201 | + } | |
202 | + | |
203 | + if (offset < termListByKuromoji.size()) { | |
204 | + termAtt.setEmpty(); | |
205 | + termAtt.append(termListByKuromoji.get(offset)); | |
206 | + suggestStringList.add(convertSuggestString( | |
207 | + termListByKuromoji.get(offset), readingList.get(offset))); | |
208 | + offset++; | |
209 | + } else { | |
210 | + | |
211 | + int tmpOffset = offset - termListByKuromoji.size(); | |
212 | + boolean readingFlg = false; | |
213 | + if (tmpOffset < termListByKuromoji.size()) { | |
214 | + StringBuilder buffer = null; | |
215 | + StringBuilder readingBuf = null; | |
216 | + int end = 1; | |
217 | + | |
218 | + for (; tmpOffset < partOfSpeechList.size(); tmpOffset++) { | |
219 | + buffer = new StringBuilder(); | |
220 | + readingBuf = new StringBuilder(); | |
221 | + if (termChecker.check(partOfSpeechList.get(tmpOffset), | |
222 | + termListByKuromoji.get(tmpOffset), "start")) { | |
223 | + buffer.append(termListByKuromoji.get(tmpOffset)); | |
224 | + readingBuf.append(readingList.get(tmpOffset)); | |
225 | + | |
226 | + for (int i = 1; tmpOffset + i < partOfSpeechList.size(); i++) { | |
227 | + if (termChecker.check( | |
228 | + partOfSpeechList.get(tmpOffset + i), | |
229 | + termListByKuromoji.get(tmpOffset + i), | |
230 | + "middle")) { | |
231 | + if (inputStr | |
232 | + .indexOf(buffer.toString() | |
233 | + + termListByKuromoji | |
234 | + .get(tmpOffset + i)) != -1) { | |
235 | + buffer.append(termListByKuromoji | |
236 | + .get(tmpOffset + i)); | |
237 | + readingBuf.append(readingList.get(tmpOffset | |
238 | + + i)); | |
239 | + end++; | |
240 | + } else { | |
241 | + break; | |
242 | + } | |
243 | + } else { | |
244 | + break; | |
245 | + } | |
246 | + } | |
247 | + if (end > 1) { | |
248 | + break; | |
249 | + } | |
250 | + } | |
251 | + } | |
252 | + | |
253 | + if (buffer != null | |
254 | + && tmpOffset < partOfSpeechList.size() | |
255 | + && buffer.length() > termListByKuromoji.get(tmpOffset) | |
256 | + .length()) { | |
257 | + termAtt.setEmpty(); | |
258 | + termAtt.append(buffer.toString()); | |
259 | + suggestStringList.add(convertSuggestString( | |
260 | + buffer.toString(), readingBuf.toString())); | |
261 | + | |
262 | + } else { | |
263 | + readingFlg = true; | |
264 | + } | |
265 | + offset = tmpOffset + termListByKuromoji.size() + end; | |
266 | + } else { | |
267 | + readingFlg = true; | |
268 | + } | |
269 | + | |
270 | + if (readingFlg) { | |
271 | + if (readingOffset < suggestStringList.size()) { | |
272 | + termAtt.setEmpty(); | |
273 | + termAtt.append(suggestStringList.get(readingOffset)); | |
274 | + readingOffset++; | |
275 | + } else { | |
276 | + return false; | |
277 | + } | |
278 | + } | |
279 | + | |
280 | + } | |
281 | + return true; | |
282 | + } | |
283 | + | |
284 | + @Override | |
285 | + public void reset() throws IOException { | |
286 | + super.reset(); | |
287 | + initialize(); | |
288 | + } | |
289 | + | |
290 | + private String convertSuggestString(final String term, final String reading) { | |
291 | + String suggestString; | |
292 | + if (reading != null && reading.length() > 0) { | |
293 | + suggestString = reading + wordSeparator + term; | |
294 | + } else { | |
295 | + suggestString = term; | |
296 | + } | |
297 | + | |
298 | + return suggestString; | |
299 | + } | |
300 | + | |
301 | + public static class TermChecker { | |
302 | + private final Map<String, Map<String, List<String>>> paramMap = new HashMap<String, Map<String, List<String>>>( | |
303 | + 2); | |
304 | + | |
305 | + public TermChecker() { | |
306 | + final Map<String, List<String>> startParamMap = new HashMap<String, List<String>>( | |
307 | + 3); | |
308 | + startParamMap.put("includePartOfSpeech", new ArrayList<String>()); | |
309 | + startParamMap.put("excludePartOfSpeech", new ArrayList<String>()); | |
310 | + startParamMap.put("includeCharTerm", new ArrayList<String>()); | |
311 | + paramMap.put("start", startParamMap); | |
312 | + final Map<String, List<String>> middleParamMap = new HashMap<String, List<String>>( | |
313 | + 3); | |
314 | + middleParamMap.put("includePartOfSpeech", new ArrayList<String>()); | |
315 | + middleParamMap.put("excludePartOfSpeech", new ArrayList<String>()); | |
316 | + middleParamMap.put("includeCharTerm", new ArrayList<String>()); | |
317 | + paramMap.put("middle", middleParamMap); | |
318 | + } | |
319 | + | |
320 | + public void includePartOfSpeech(final String mode, final String value) { | |
321 | + updateParam(mode, "includePartOfSpeech", value); | |
322 | + } | |
323 | + | |
324 | + public void excludePartOfSpeech(final String mode, final String value) { | |
325 | + updateParam(mode, "excludePartOfSpeech", value); | |
326 | + } | |
327 | + | |
328 | + public void includeCharTerm(final String mode, final String value) { | |
329 | + updateParam(mode, "includeCharTerm", value); | |
330 | + } | |
331 | + | |
332 | + private void updateParam(final String mode, final String target, | |
333 | + final String value) { | |
334 | + final Map<String, List<String>> modeParamMap = paramMap.get(mode); | |
335 | + if (modeParamMap != null) { | |
336 | + final List<String> list = modeParamMap.get(target); | |
337 | + if (list != null) { | |
338 | + list.add(value); | |
339 | + } | |
340 | + } | |
341 | + } | |
342 | + | |
343 | + public boolean check(final String partOfSpeech, | |
344 | + final String termByKuromoji, final String mode) { | |
345 | + final Map<String, List<String>> modeParamMap = paramMap.get(mode); | |
346 | + final List<String> includePartOfSpeechList = modeParamMap | |
347 | + .get("includePartOfSpeech"); | |
348 | + final List<String> excludePartOfSpeechList = modeParamMap | |
349 | + .get("excludePartOfSpeech"); | |
350 | + final List<String> includeCharTermList = modeParamMap | |
351 | + .get("includeCharTerm"); | |
352 | + | |
353 | + boolean ret = false; | |
354 | + for (int i = 0; i < includePartOfSpeechList.size(); i++) { | |
355 | + if (partOfSpeech.indexOf(includePartOfSpeechList.get(i)) != -1) { | |
356 | + boolean isNg = false; | |
357 | + for (int j = 0; j < excludePartOfSpeechList.size(); j++) { | |
358 | + if (partOfSpeech | |
359 | + .indexOf(excludePartOfSpeechList.get(j)) != -1) { | |
360 | + isNg = true; | |
361 | + } | |
362 | + } | |
363 | + if (!isNg) { | |
364 | + ret = true; | |
365 | + break; | |
366 | + } | |
367 | + } | |
368 | + } | |
369 | + | |
370 | + if (!ret) { | |
371 | + for (int i = 0; i < includeCharTermList.size(); i++) { | |
372 | + if (termByKuromoji.equals(includeCharTermList.get(i))) { | |
373 | + ret = true; | |
374 | + break; | |
375 | + } | |
376 | + } | |
377 | + } | |
378 | + return ret; | |
379 | + } | |
380 | + } | |
381 | +} |
@@ -0,0 +1,192 @@ | ||
1 | +/* | |
2 | + * Copyright 2009-2013 the Fess Project and the Others. | |
3 | + * | |
4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | + * you may not use this file except in compliance with the License. | |
6 | + * You may obtain a copy of the License at | |
7 | + * | |
8 | + * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | + * | |
10 | + * Unless required by applicable law or agreed to in writing, software | |
11 | + * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
13 | + * either express or implied. See the License for the specific language | |
14 | + * governing permissions and limitations under the License. | |
15 | + */ | |
16 | + | |
17 | +package jp.sf.fess.solr.plugin.analysis; | |
18 | + | |
19 | +import java.io.InputStream; | |
20 | +import java.io.InputStreamReader; | |
21 | +import java.io.Reader; | |
22 | +import java.nio.charset.Charset; | |
23 | +import java.nio.charset.CharsetDecoder; | |
24 | +import java.nio.charset.CodingErrorAction; | |
25 | +import java.util.List; | |
26 | +import java.util.Locale; | |
27 | +import java.util.Map; | |
28 | + | |
29 | +import jp.sf.fess.solr.plugin.analysis.SuggestTextTokenizer.TermChecker; | |
30 | +import jp.sf.fess.solr.plugin.suggest.SuggestConverterCreator; | |
31 | +import jp.sf.fess.suggest.converter.SuggestConverter; | |
32 | + | |
33 | +import org.apache.lucene.analysis.Tokenizer; | |
34 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
35 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; | |
36 | +import org.apache.lucene.analysis.ja.dict.UserDictionary; | |
37 | +import org.apache.lucene.analysis.util.ResourceLoader; | |
38 | +import org.apache.lucene.analysis.util.ResourceLoaderAware; | |
39 | +import org.apache.lucene.analysis.util.TokenizerFactory; | |
40 | +import org.apache.lucene.util.AttributeSource.AttributeFactory; | |
41 | +import org.apache.lucene.util.IOUtils; | |
42 | +import org.slf4j.Logger; | |
43 | +import org.slf4j.LoggerFactory; | |
44 | + | |
45 | +public class SuggestTextTokenizerFactory extends TokenizerFactory implements | |
46 | + ResourceLoaderAware { | |
47 | + | |
48 | + private static final Logger logger = LoggerFactory | |
49 | + .getLogger(SuggestTextTokenizerFactory.class); | |
50 | + | |
51 | + private static final String MODE = "mode"; | |
52 | + | |
53 | + private static final String USER_DICT_PATH = "userDictionary"; | |
54 | + | |
55 | + private static final String USER_DICT_ENCODING = "userDictionaryEncoding"; | |
56 | + | |
57 | + private static final String BUFFER_SIZE = "bufferSize"; | |
58 | + | |
59 | + private static final String WORD_SEPARATOR = "wordSeparator"; | |
60 | + | |
61 | + private static final String INCLUDE_CHAR_TERM = "includeCharTerm"; | |
62 | + | |
63 | + private static final String EXCLUDE_PART_OF_SPEECH = "excludePartOfSpeech"; | |
64 | + | |
65 | + private static final String INCLUDE_PART_OF_SPEECH = "includePartOfSpeech"; | |
66 | + | |
67 | + private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option | |
68 | + | |
69 | + private static final String MAX_LENGTH = "maxLength"; | |
70 | + | |
71 | + private UserDictionary userDictionary; | |
72 | + | |
73 | + private final Mode mode; | |
74 | + | |
75 | + private final String userDictionaryPath; | |
76 | + | |
77 | + private final String userDictionaryEncoding; | |
78 | + | |
79 | + private final boolean discardPunctuation; | |
80 | + | |
81 | + private final int bufferSize; | |
82 | + | |
83 | + private final String wordSeparator; | |
84 | + | |
85 | + private final TermChecker termChecker; | |
86 | + | |
87 | + private final List<SuggestConverter> preConverterList; | |
88 | + | |
89 | + private final List<SuggestConverter> converterList; | |
90 | + | |
91 | + private final int maxLength; | |
92 | + | |
93 | + public SuggestTextTokenizerFactory(final Map<String, String> args) { | |
94 | + super(args); | |
95 | + | |
96 | + mode = getMode(args); | |
97 | + userDictionaryPath = args.get(USER_DICT_PATH); | |
98 | + userDictionaryEncoding = args.get(USER_DICT_ENCODING); | |
99 | + bufferSize = getInt(args, BUFFER_SIZE, 256); | |
100 | + wordSeparator = get(args, WORD_SEPARATOR, "_SP_"); | |
101 | + discardPunctuation = getBoolean(args, DISCARD_PUNCTUATION, true); | |
102 | + maxLength = getInt(args, MAX_LENGTH, 0); | |
103 | + | |
104 | + termChecker = new TermChecker(); | |
105 | + // ex. start:名詞,middle:動詞 | |
106 | + final String includePartOfSpeech = args.get(INCLUDE_PART_OF_SPEECH); | |
107 | + if (includePartOfSpeech != null) { | |
108 | + for (String text : includePartOfSpeech.split(",")) { | |
109 | + text = text.trim(); | |
110 | + if (text.length() > 0) { | |
111 | + final String[] values = text.split(":"); | |
112 | + if (values.length == 2) { | |
113 | + termChecker.includePartOfSpeech(values[0].trim(), | |
114 | + values[1].trim()); | |
115 | + } | |
116 | + } | |
117 | + } | |
118 | + } | |
119 | + final String excludePartOfSpeech = args.get(EXCLUDE_PART_OF_SPEECH); | |
120 | + if (excludePartOfSpeech != null) { | |
121 | + for (String text : excludePartOfSpeech.split(",")) { | |
122 | + text = text.trim(); | |
123 | + if (text.length() > 0) { | |
124 | + final String[] values = text.split(":"); | |
125 | + if (values.length == 2) { | |
126 | + termChecker.excludePartOfSpeech(values[0].trim(), | |
127 | + values[1].trim()); | |
128 | + } | |
129 | + } | |
130 | + } | |
131 | + } | |
132 | + final String includeCharTerm = args.get(INCLUDE_CHAR_TERM); | |
133 | + if (includeCharTerm != null) { | |
134 | + for (String text : includeCharTerm.split(",")) { | |
135 | + text = text.trim(); | |
136 | + if (text.length() > 0) { | |
137 | + final String[] values = text.split(":"); | |
138 | + if (values.length == 2) { | |
139 | + termChecker.includeCharTerm(values[0].trim(), | |
140 | + values[1].trim()); | |
141 | + } | |
142 | + } | |
143 | + } | |
144 | + } | |
145 | + preConverterList = SuggestConverterCreator.create(args | |
146 | + .get("preConverters")); | |
147 | + converterList = SuggestConverterCreator.create(args.get("converters")); | |
148 | + } | |
149 | + | |
150 | + @Override | |
151 | + public Tokenizer create(final AttributeFactory factory, final Reader input) { | |
152 | + return new SuggestTextTokenizer(input, bufferSize, userDictionary, | |
153 | + discardPunctuation, mode, termChecker, preConverterList, | |
154 | + converterList, wordSeparator, maxLength); | |
155 | + } | |
156 | + | |
157 | + @Override | |
158 | + public void inform(final ResourceLoader loader) { | |
159 | + try { | |
160 | + | |
161 | + if (userDictionaryPath != null) { | |
162 | + final InputStream stream = loader | |
163 | + .openResource(userDictionaryPath); | |
164 | + String encoding = userDictionaryEncoding; | |
165 | + if (encoding == null) { | |
166 | + encoding = IOUtils.UTF_8; | |
167 | + } | |
168 | + final CharsetDecoder decoder = Charset.forName(encoding) | |
169 | + .newDecoder() | |
170 | + .onMalformedInput(CodingErrorAction.REPORT) | |
171 | + .onUnmappableCharacter(CodingErrorAction.REPORT); | |
172 | + final Reader reader = new InputStreamReader(stream, decoder); | |
173 | + userDictionary = new UserDictionary(reader); | |
174 | + } else { | |
175 | + userDictionary = null; | |
176 | + } | |
177 | + | |
178 | + } catch (final Exception e) { | |
179 | + logger.warn("Initialization failed.", e); | |
180 | + } | |
181 | + } | |
182 | + | |
183 | + private Mode getMode(final Map<String, String> args) { | |
184 | + final String modeArg = args.get(MODE); | |
185 | + if (modeArg != null) { | |
186 | + return Mode.valueOf(modeArg.toUpperCase(Locale.ROOT)); | |
187 | + } else { | |
188 | + return JapaneseTokenizer.Mode.NORMAL; | |
189 | + } | |
190 | + } | |
191 | + | |
192 | +} |
@@ -0,0 +1,139 @@ | ||
1 | +/* | |
2 | + * Copyright 2009-2013 the Fess Project and the Others. | |
3 | + * | |
4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | + * you may not use this file except in compliance with the License. | |
6 | + * You may obtain a copy of the License at | |
7 | + * | |
8 | + * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | + * | |
10 | + * Unless required by applicable law or agreed to in writing, software | |
11 | + * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
13 | + * either express or implied. See the License for the specific language | |
14 | + * governing permissions and limitations under the License. | |
15 | + */ | |
16 | + | |
17 | +package jp.sf.fess.solr.plugin.analysis; | |
18 | + | |
19 | +import java.io.InputStream; | |
20 | +import java.io.InputStreamReader; | |
21 | +import java.io.Reader; | |
22 | +import java.nio.charset.Charset; | |
23 | +import java.nio.charset.CharsetDecoder; | |
24 | +import java.nio.charset.CodingErrorAction; | |
25 | +import java.util.List; | |
26 | +import java.util.Locale; | |
27 | +import java.util.Map; | |
28 | + | |
29 | +import jp.sf.fess.solr.plugin.suggest.SuggestConverterCreator; | |
30 | +import jp.sf.fess.suggest.converter.SuggestConverter; | |
31 | + | |
32 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
33 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; | |
34 | +import org.apache.lucene.analysis.ja.dict.UserDictionary; | |
35 | +import org.apache.lucene.analysis.util.ResourceLoader; | |
36 | +import org.apache.lucene.analysis.util.ResourceLoaderAware; | |
37 | +import org.apache.lucene.analysis.util.TokenizerFactory; | |
38 | +import org.apache.lucene.util.AttributeSource.AttributeFactory; | |
39 | +import org.apache.lucene.util.IOUtils; | |
40 | +import org.slf4j.Logger; | |
41 | +import org.slf4j.LoggerFactory; | |
42 | + | |
43 | +public class SuggestStringTokenizerFactory extends TokenizerFactory implements | |
44 | + ResourceLoaderAware { | |
45 | + | |
46 | + private static final Logger logger = LoggerFactory | |
47 | + .getLogger(SuggestStringTokenizerFactory.class); | |
48 | + | |
49 | + private static final String MODE = "mode"; | |
50 | + | |
51 | + private static final String USER_DICT_PATH = "userDictionary"; | |
52 | + | |
53 | + private static final String USER_DICT_ENCODING = "userDictionaryEncoding"; | |
54 | + | |
55 | + private static final String BUFFER_SIZE = "bufferSize"; | |
56 | + | |
57 | + private static final String WORD_SEPARATOR = "wordSeparator"; | |
58 | + | |
59 | + private static final String DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option | |
60 | + | |
61 | + private UserDictionary userDictionary; | |
62 | + | |
63 | + private final Mode mode; | |
64 | + | |
65 | + private final String userDictionaryPath; | |
66 | + | |
67 | + private final String userDictionaryEncoding; | |
68 | + | |
69 | + private final boolean discardPunctuation; | |
70 | + | |
71 | + private final int bufferSize; | |
72 | + | |
73 | + private String wordSeparator; | |
74 | + | |
75 | + private final List<SuggestConverter> preConverterList; | |
76 | + | |
77 | + private final List<SuggestConverter> converterList; | |
78 | + | |
79 | + public SuggestStringTokenizerFactory(final Map<String, String> args) { | |
80 | + super(args); | |
81 | + | |
82 | + mode = getMode(args); | |
83 | + userDictionaryPath = args.get(USER_DICT_PATH); | |
84 | + userDictionaryEncoding = args.get(USER_DICT_ENCODING); | |
85 | + bufferSize = getInt(args, BUFFER_SIZE, 256); | |
86 | + discardPunctuation = getBoolean(args, DISCARD_PUNCTUATION, true); | |
87 | + wordSeparator = args.get(WORD_SEPARATOR); | |
88 | + if (wordSeparator == null) { | |
89 | + wordSeparator = "_SP_"; | |
90 | + } | |
91 | + | |
92 | + preConverterList = SuggestConverterCreator.create(args | |
93 | + .get("preConverters")); | |
94 | + converterList = SuggestConverterCreator.create(args.get("converters")); | |
95 | + } | |
96 | + | |
97 | + @Override | |
98 | + public void inform(final ResourceLoader loader) { | |
99 | + try { | |
100 | + if (userDictionaryPath != null) { | |
101 | + final InputStream stream = loader | |
102 | + .openResource(userDictionaryPath); | |
103 | + String encoding = userDictionaryEncoding; | |
104 | + if (encoding == null) { | |
105 | + encoding = IOUtils.UTF_8; | |
106 | + } | |
107 | + final CharsetDecoder decoder = Charset.forName(encoding) | |
108 | + .newDecoder() | |
109 | + .onMalformedInput(CodingErrorAction.REPORT) | |
110 | + .onUnmappableCharacter(CodingErrorAction.REPORT); | |
111 | + final Reader reader = new InputStreamReader(stream, decoder); | |
112 | + userDictionary = new UserDictionary(reader); | |
113 | + } else { | |
114 | + userDictionary = null; | |
115 | + } | |
116 | + | |
117 | + } catch (final Exception e) { | |
118 | + logger.warn("Initialization failed.", e); | |
119 | + } | |
120 | + } | |
121 | + | |
122 | + @Override | |
123 | + public SuggestStringTokenizer create(final AttributeFactory factory, | |
124 | + final Reader input) { | |
125 | + return new SuggestStringTokenizer(input, bufferSize, userDictionary, | |
126 | + discardPunctuation, mode, preConverterList, converterList, | |
127 | + wordSeparator); | |
128 | + } | |
129 | + | |
130 | + private Mode getMode(final Map<String, String> args) { | |
131 | + final String modeArg = args.get(MODE); | |
132 | + if (modeArg != null) { | |
133 | + return Mode.valueOf(modeArg.toUpperCase(Locale.ROOT)); | |
134 | + } else { | |
135 | + return JapaneseTokenizer.Mode.NORMAL; | |
136 | + } | |
137 | + } | |
138 | + | |
139 | +} |
@@ -0,0 +1,220 @@ | ||
1 | +/* | |
2 | + * Copyright 2009-2013 the Fess Project and the Others. | |
3 | + * | |
4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | + * you may not use this file except in compliance with the License. | |
6 | + * You may obtain a copy of the License at | |
7 | + * | |
8 | + * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | + * | |
10 | + * Unless required by applicable law or agreed to in writing, software | |
11 | + * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
13 | + * either express or implied. See the License for the specific language | |
14 | + * governing permissions and limitations under the License. | |
15 | + */ | |
16 | + | |
17 | +package jp.sf.fess.solr.plugin.analysis; | |
18 | + | |
19 | +import java.io.IOException; | |
20 | +import java.io.Reader; | |
21 | +import java.io.StringReader; | |
22 | +import java.util.ArrayList; | |
23 | +import java.util.List; | |
24 | + | |
25 | +import jp.sf.fess.suggest.converter.SuggestConverter; | |
26 | + | |
27 | +import org.apache.commons.io.IOUtils; | |
28 | +import org.apache.lucene.analysis.TokenStream; | |
29 | +import org.apache.lucene.analysis.Tokenizer; | |
30 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer; | |
31 | +import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode; | |
32 | +import org.apache.lucene.analysis.ja.dict.UserDictionary; | |
33 | +import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute; | |
34 | +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
35 | +import org.slf4j.Logger; | |
36 | +import org.slf4j.LoggerFactory; | |
37 | + | |
38 | +import com.ibm.icu.text.Transliterator; | |
39 | + | |
40 | +public class SuggestStringTokenizer extends Tokenizer { | |
41 | + private static final Logger logger = LoggerFactory | |
42 | + .getLogger(SuggestStringTokenizer.class); | |
43 | + | |
44 | + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); | |
45 | + | |
46 | + private int offset = 0; | |
47 | + | |
48 | + private final List<String> termListByKuromoji = new ArrayList<String>(); | |
49 | + | |
50 | + private final List<String> readingList = new ArrayList<String>(); | |
51 | + | |
52 | + private String[] titleArray = null; | |
53 | + | |
54 | + private final UserDictionary userDictionary; | |
55 | + | |
56 | + private final boolean discardPunctuation; | |
57 | + | |
58 | + private final Mode tokenizerMode; | |
59 | + | |
60 | + private final String wordSeparator; | |
61 | + | |
62 | + private final List<SuggestConverter> preConverterList; | |
63 | + | |
64 | + private final List<SuggestConverter> converterList; | |
65 | + | |
66 | + public SuggestStringTokenizer(final Reader input, final int bufferSize, | |
67 | + final UserDictionary userDictionaryPara, | |
68 | + final boolean discardPunctuationPara, final Mode modePara, | |
69 | + final List<SuggestConverter> preconverterList, | |
70 | + final List<SuggestConverter> converterList, | |
71 | + final String wordSeparator) { | |
72 | + super(input); | |
73 | + | |
74 | + userDictionary = userDictionaryPara; | |
75 | + discardPunctuation = discardPunctuationPara; | |
76 | + tokenizerMode = modePara; | |
77 | + termAtt.resizeBuffer(bufferSize); | |
78 | + this.wordSeparator = wordSeparator; | |
79 | + preConverterList = preconverterList; | |
80 | + this.converterList = converterList; | |
81 | + | |
82 | + initialize(); | |
83 | + } | |
84 | + | |
85 | + public void initialize() { | |
86 | + termListByKuromoji.clear(); | |
87 | + readingList.clear(); | |
88 | + titleArray = null; | |
89 | + offset = 0; | |
90 | + String inputStr = ""; | |
91 | + | |
92 | + try { | |
93 | + final String s = IOUtils.toString(input); | |
94 | + if (s != null && s.length() > 0) { | |
95 | + inputStr = s; | |
96 | + for (final SuggestConverter converter : preConverterList) { | |
97 | + inputStr = converter.convert(inputStr); | |
98 | + } | |
99 | + titleArray = inputStr.split("\\$\\{and\\}"); | |
100 | + inputStr = inputStr.replace("${and}", " "); | |
101 | + } | |
102 | + } catch (final IOException e) { | |
103 | + } | |
104 | + | |
105 | + final Reader rd = new StringReader(inputStr); | |
106 | + | |
107 | + TokenStream stream = null; | |
108 | + | |
109 | + try { | |
110 | + stream = new JapaneseTokenizer(rd, userDictionary, | |
111 | + discardPunctuation, tokenizerMode); | |
112 | + | |
113 | + stream.reset(); | |
114 | + while (stream.incrementToken()) { | |
115 | + final CharTermAttribute att = stream | |
116 | + .getAttribute(CharTermAttribute.class); | |
117 | + termListByKuromoji.add(att.toString()); | |
118 | + | |
119 | + final ReadingAttribute rdAttr = stream | |
120 | + .getAttribute(ReadingAttribute.class); | |
121 | + | |
122 | + String reading; | |
123 | + if (rdAttr.getReading() != null) { | |
124 | + reading = rdAttr.getReading(); | |
125 | + } else { | |
126 | + reading = att.toString(); | |
127 | + } | |
128 | + | |
129 | + for (final SuggestConverter converter : converterList) { | |
130 | + reading = converter.convert(reading); | |
131 | + } | |
132 | + readingList.add(reading); | |
133 | + | |
134 | + } | |
135 | + | |
136 | + } catch (final Exception e) { | |
137 | + logger.warn("JapaneseTokenizer stream error", e); | |
138 | + } finally { | |
139 | + try { | |
140 | + input.reset(); | |
141 | + } catch (final Exception e) { | |
142 | + } | |
143 | + try { | |
144 | + stream.end(); | |
145 | + } catch (final Exception e) { | |
146 | + } | |
147 | + try { | |
148 | + rd.close(); | |
149 | + } catch (final Exception e) { | |
150 | + } | |
151 | + } | |
152 | + } | |
153 | + | |
154 | + @Override | |
155 | + public boolean incrementToken() throws IOException { | |
156 | + if (titleArray == null || offset >= titleArray.length) { | |
157 | + return false; | |
158 | + } | |
159 | + | |
160 | + termAtt.setEmpty(); | |
161 | + termAtt.append(convertSuggestString(titleArray[offset], | |
162 | + getReading(titleArray[offset]))); | |
163 | + offset++; | |
164 | + return true; | |
165 | + } | |
166 | + | |
167 | + @Override | |
168 | + public void reset() throws IOException { | |
169 | + super.reset(); | |
170 | + initialize(); | |
171 | + } | |
172 | + | |
173 | + private String convertSuggestString(final String term, final String reading) { | |
174 | + String suggestString; | |
175 | + if (reading != null && reading.length() > 0) { | |
176 | + suggestString = reading + wordSeparator + term; | |
177 | + } else { | |
178 | + suggestString = term; | |
179 | + } | |
180 | + | |
181 | + return suggestString; | |
182 | + } | |
183 | + | |
184 | + private String getReading(final String s) { | |
185 | + | |
186 | + final StringBuilder buf = new StringBuilder(); | |
187 | + | |
188 | + for (int i = 0; i < s.length(); i++) { | |
189 | + String term = ""; | |
190 | + int length = 0; | |
191 | + | |
192 | + for (int j = 0; j < termListByKuromoji.size(); j++) { | |
193 | + final String tmpStr = termListByKuromoji.get(j); | |
194 | + if (s.substring(i).indexOf(tmpStr) == 0 | |
195 | + && tmpStr.length() > term.length()) { | |
196 | + term = readingList.get(j); | |
197 | + length = tmpStr.length(); | |
198 | + } | |
199 | + } | |
200 | + if (term.length() > 0) { | |
201 | + buf.append(term); | |
202 | + i += length - 1; | |
203 | + } else { | |
204 | + char c = s.charAt(i); | |
205 | + | |
206 | + c = Transliterator.getInstance("Hiragana-Katakana") | |
207 | + .transliterate(String.valueOf(c)).charAt(0); | |
208 | + | |
209 | + buf.append(c); | |
210 | + } | |
211 | + } | |
212 | + | |
213 | + String reading = buf.toString(); | |
214 | + for (final SuggestConverter converter : converterList) { | |
215 | + reading = converter.convert(reading); | |
216 | + } | |
217 | + | |
218 | + return reading; | |
219 | + } | |
220 | +} |
@@ -0,0 +1,151 @@ | ||
1 | +/* | |
2 | + * Copyright 2009-2013 the Fess Project and the Others. | |
3 | + * | |
4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | + * you may not use this file except in compliance with the License. | |
6 | + * You may obtain a copy of the License at | |
7 | + * | |
8 | + * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | + * | |
10 | + * Unless required by applicable law or agreed to in writing, software | |
11 | + * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
13 | + * either express or implied. See the License for the specific language | |
14 | + * governing permissions and limitations under the License. | |
15 | + */ | |
16 | + | |
17 | +package jp.sf.fess.solr.plugin.suggest; | |
18 | + | |
19 | +import java.io.IOException; | |
20 | +import java.lang.reflect.Constructor; | |
21 | +import java.util.ArrayList; | |
22 | +import java.util.Collections; | |
23 | +import java.util.List; | |
24 | +import java.util.Map; | |
25 | + | |
26 | +import jp.sf.fess.suggest.converter.SuggestConverter; | |
27 | + | |
28 | +import org.apache.commons.lang.StringUtils; | |
29 | +import org.noggit.ObjectBuilder; | |
30 | +import org.slf4j.Logger; | |
31 | +import org.slf4j.LoggerFactory; | |
32 | + | |
33 | +public class SuggestConverterCreator { | |
34 | + private static final Logger logger = LoggerFactory | |
35 | + .getLogger(SuggestConverterCreator.class); | |
36 | + | |
37 | + protected SuggestConverterCreator() { | |
38 | + // nothing | |
39 | + } | |
40 | + | |
41 | + public static List<SuggestConverter> create(final String val) { | |
42 | + if (StringUtils.isBlank(val)) { | |
43 | + return Collections.emptyList(); | |
44 | + } | |
45 | + | |
46 | + try { | |
47 | + final Object obj = ObjectBuilder.fromJSON(val); | |
48 | + if (obj instanceof List<?>) { | |
49 | + | |
50 | + final List<SuggestConverter> converterList = new ArrayList<SuggestConverter>(); | |
51 | + for (final Object map : (List<Object>) obj) { | |
52 | + if (map instanceof Map<?, ?>) { | |
53 | + try { | |
54 | + final Map<Object, Object> dataMap = (Map<Object, Object>) map; | |
55 | + final String className = (String) dataMap | |
56 | + .get("class"); | |
57 | + final Class<SuggestConverter> clazz = (Class<SuggestConverter>) Class | |
58 | + .forName(className); | |
59 | + final List<?> constructorArgs = (List<?>) dataMap | |
60 | + .get("args"); | |
61 | + SuggestConverter converter; | |
62 | + if (constructorArgs == null | |
63 | + || constructorArgs.isEmpty()) { | |
64 | + converter = clazz.newInstance(); | |
65 | + } else { | |
66 | + final List<Class<?>> classList = new ArrayList<Class<?>>( | |
67 | + constructorArgs.size()); | |
68 | + for (final Object arg : constructorArgs) { | |
69 | + classList.add(getArgClass(arg)); | |
70 | + } | |
71 | + final Constructor<SuggestConverter> constructor = clazz | |
72 | + .getConstructor(classList | |
73 | + .toArray(new Class<?>[constructorArgs | |
74 | + .size()])); | |
75 | + converter = constructor | |
76 | + .newInstance(constructorArgs | |
77 | + .toArray(new Object[constructorArgs | |
78 | + .size()])); | |
79 | + } | |
80 | + updateInstance(dataMap, clazz, converter); | |
81 | + converterList.add(converter); | |
82 | + } catch (final Exception e) { | |
83 | + logger.warn("Could not create a converter.", e); | |
84 | + } | |
85 | + } else { | |
86 | + logger.info("Data for a converter should be an object: " | |
87 | + + map.toString()); | |
88 | + } | |
89 | + } | |
90 | + return converterList; | |
91 | + } else { | |
92 | + logger.info("Could not create a converter list from " + val); | |
93 | + } | |
94 | + } catch (final IOException e) { | |
95 | + logger.warn("Failed to parse " + val, e); | |
96 | + } | |
97 | + | |
98 | + return Collections.emptyList(); | |
99 | + } | |
100 | + | |
101 | + private static void updateInstance(final Map<Object, Object> dataMap, | |
102 | + final Class<SuggestConverter> clazz, | |
103 | + final SuggestConverter converter) { | |
104 | + if (clazz == null) { | |
105 | + logger.warn("class is null. data:" + dataMap + ", converter: " | |
106 | + + converter); | |
107 | + return; | |
108 | + } | |
109 | + final List<?> methodList = (List<?>) dataMap.get("method"); | |
110 | + if (methodList != null && !methodList.isEmpty()) { | |
111 | + for (final Object obj : methodList) { | |
112 | + try { | |
113 | + if (obj instanceof Map<?, ?>) { | |
114 | + final Map<Object, Object> paramMap = (Map<Object, Object>) obj; | |
115 | + final String methodName = (String) paramMap.get("name"); | |
116 | + final List<?> methodArgs = (List<?>) paramMap | |
117 | + .get("args"); | |
118 | + final Class<?>[] argClasses; | |
119 | + if (methodArgs == null || methodArgs.isEmpty()) { | |
120 | + argClasses = null; | |
121 | + } else { | |
122 | + final List<Class<?>> classList = new ArrayList<Class<?>>( | |
123 | + methodArgs.size()); | |
124 | + for (final Object arg : methodArgs) { | |
125 | + classList.add(getArgClass(arg)); | |
126 | + } | |
127 | + argClasses = classList | |
128 | + .toArray(new Class<?>[classList.size()]); | |
129 | + } | |
130 | + clazz.getMethod(methodName, argClasses) | |
131 | + .invoke(converter, | |
132 | + methodArgs | |
133 | + .toArray(new Object[methodArgs | |
134 | + .size()])); | |
135 | + } | |
136 | + } catch (final Exception e) { | |
137 | + logger.warn("Failed to invoke: " + obj.toString(), e); | |
138 | + } | |
139 | + } | |
140 | + } | |
141 | + | |
142 | + } | |
143 | + | |
144 | + private static Class<? extends Object> getArgClass(final Object arg) { | |
145 | + final Class<? extends Object> clazz = arg.getClass(); | |
146 | + if (clazz.equals(ArrayList.class)) { | |
147 | + return List.class; | |
148 | + } | |
149 | + return clazz; | |
150 | + } | |
151 | +} |
@@ -0,0 +1,101 @@ | ||
1 | +package jp.sf.fess.solr.plugin.search; | |
2 | + | |
3 | +import java.io.IOException; | |
4 | +import java.util.HashSet; | |
5 | +import java.util.Map; | |
6 | +import java.util.Set; | |
7 | + | |
8 | +import org.apache.commons.lang.StringUtils; | |
9 | +import org.apache.lucene.document.Document; | |
10 | +import org.apache.lucene.index.AtomicReaderContext; | |
11 | +import org.apache.lucene.queries.function.FunctionValues; | |
12 | +import org.apache.lucene.queries.function.ValueSource; | |
13 | +import org.apache.lucene.queries.function.docvalues.IntDocValues; | |
14 | +import org.apache.lucene.search.IndexSearcher; | |
15 | +import org.apache.solr.search.FunctionQParser; | |
16 | +import org.apache.solr.search.SyntaxError; | |
17 | +import org.apache.solr.search.ValueSourceParser; | |
18 | + | |
19 | +public class WordFreqValueSourceParser extends ValueSourceParser { | |
20 | + | |
21 | + @Override | |
22 | + public ValueSource parse(final FunctionQParser fp) throws SyntaxError { | |
23 | + final String field = fp.parseArg(); | |
24 | + final String word = fp.parseArg(); | |
25 | + final boolean normalized = !"false".equals(fp.parseArg()); | |
26 | + return new WordFreqValueSource(field, word, normalized); | |
27 | + } | |
28 | + | |
29 | + public static class WordFreqValueSource extends ValueSource { | |
30 | + protected final String field; | |
31 | + | |
32 | + protected final String word; | |
33 | + | |
34 | + protected final boolean normalized; | |
35 | + | |
36 | + public WordFreqValueSource(final String field, final String word, | |
37 | + final boolean normalized) { | |
38 | + this.field = field; | |
39 | + this.word = normalized ? normalize(word) : word; | |
40 | + this.normalized = normalized; | |
41 | + } | |
42 | + | |
43 | + public String name() { | |
44 | + return "wordfreq"; | |
45 | + } | |
46 | + | |
47 | + protected String normalize(final String value) { | |
48 | + return value.toLowerCase(); | |
49 | + } | |
50 | + | |
51 | + @Override | |
52 | + public FunctionValues getValues(final Map context, | |
53 | + final AtomicReaderContext readerContext) throws IOException { | |
54 | + return new IntDocValues(this) { | |
55 | + @Override | |
56 | + public int intVal(final int docId) { | |
57 | + final IndexSearcher searcher = (IndexSearcher) context | |
58 | + .get("searcher"); | |
59 | + final Set<String> fieldSet = new HashSet<String>(); | |
60 | + fieldSet.add(field); | |
61 | + try { | |
62 | + final Document doc = searcher.doc(docId, fieldSet); | |
63 | + if (doc != null) { | |
64 | + String value = doc.get(field); | |
65 | + if (normalized) { | |
66 | + value = normalize(value); | |
67 | + } | |
68 | + return StringUtils.countMatches(value, word); | |
69 | + } | |
70 | + } catch (final IOException e) { | |
71 | + // ignore | |
72 | + } | |
73 | + return 0; | |
74 | + } | |
75 | + }; | |
76 | + } | |
77 | + | |
78 | + @Override | |
79 | + public boolean equals(final Object o) { | |
80 | + if (this.getClass() != o.getClass()) { | |
81 | + return false; | |
82 | + } | |
83 | + final WordFreqValueSource other = (WordFreqValueSource) o; | |
84 | + return field.equals(other.field) && word.equals(other.word) | |
85 | + && normalized == other.normalized; | |
86 | + | |
87 | + } | |
88 | + | |
89 | + @Override | |
90 | + public int hashCode() { | |
91 | + return (field + word).hashCode() + (normalized ? 1231 : 1237); | |
92 | + | |
93 | + } | |
94 | + | |
95 | + @Override | |
96 | + public String description() { | |
97 | + return name() + '(' + field + ',' + word + ')'; | |
98 | + } | |
99 | + | |
100 | + } | |
101 | +} |
@@ -0,0 +1,86 @@ | ||
1 | +/* | |
2 | + * Copyright 2009-2013 the Fess Project and the Others. | |
3 | + * | |
4 | + * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | + * you may not use this file except in compliance with the License. | |
6 | + * You may obtain a copy of the License at | |
7 | + * | |
8 | + * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | + * | |
10 | + * Unless required by applicable law or agreed to in writing, software | |
11 | + * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
13 | + * either express or implied. See the License for the specific language | |
14 | + * governing permissions and limitations under the License. | |
15 | + */ | |
16 | + | |
17 | +package jp.sf.fess.solr.plugin; | |
18 | + | |
19 | +import static org.hamcrest.core.Is.is; | |
20 | +import static org.junit.Assert.assertThat; | |
21 | + | |
22 | +import java.util.List; | |
23 | + | |
24 | +import jp.sf.fess.solr.plugin.suggest.SuggestConverterCreator; | |
25 | +import jp.sf.fess.suggest.converter.SuggestConverter; | |
26 | + | |
27 | +import org.junit.Test; | |
28 | + | |
29 | +public class SuggestConverterCreatorTest { | |
30 | + @Test | |
31 | + public void createTwoInstance() { | |
32 | + final String text = "[" | |
33 | + + // | |
34 | + "{\"class\":\"jp.sf.fess.suggest.converter.SymbolConverter\"," | |
35 | + + "\"method\":[{\"name\":\"addSymbol\",\"args\":[[\"A\"]]}]}" | |
36 | + + "," | |
37 | + + // | |
38 | + "{\"class\":\"jp.sf.fess.suggest.converter.SymbolConverter\",\"args\":[\"B\",\"E\"]," | |
39 | + + "\"method\":[{\"name\":\"addSymbol\",\"args\":[[\"X\"]]},{\"name\":\"addSymbol\",\"args\":[[\"Y\"]]}]}" | |
40 | + + "," | |
41 | + + // | |
42 | + "{\"class\":\"jp.sf.fess.suggest.converter.ReplaceConverter\"," | |
43 | + + "\"method\":[{\"name\":\"addReplaceString\",\"args\":[\"x\",\"X\"]},{\"name\":\"addReplaceString\",\"args\":[\"y\",\"Y\"]}]}" | |
44 | + + // | |
45 | + "]"; | |
46 | + final List<SuggestConverter> list = SuggestConverterCreator | |
47 | + .create(text); | |
48 | + assertThat(list.size(), is(3)); | |
49 | + assertThat(list.get(0).getClass().getName(), | |
50 | + is("jp.sf.fess.suggest.converter.SymbolConverter")); | |
51 | + assertThat(list.get(0).convert("abcABC"), is("abc__ID0__BC")); | |
52 | + assertThat(list.get(1).getClass().getName(), | |
53 | + is("jp.sf.fess.suggest.converter.SymbolConverter")); | |
54 | + assertThat(list.get(1).convert("xyzXYZ"), is("xyzB0EB1EZ")); | |
55 | + assertThat(list.get(2).getClass().getName(), | |
56 | + is("jp.sf.fess.suggest.converter.ReplaceConverter")); | |
57 | + assertThat(list.get(2).convert("xyzXYZ"), is("XYzXYZ")); | |
58 | + | |
59 | + } | |
60 | + | |
61 | + @Test | |
62 | + public void createOneInstance() { | |
63 | + final String text = "[" | |
64 | + + // | |
65 | + "{\"class\":\"jp.sf.fess.suggest.converter.ICUConverter\",\"args\":[\"Fullwidth-Halfwidth\"]}" | |
66 | + + // | |
67 | + "]"; | |
68 | + final List<SuggestConverter> list = SuggestConverterCreator | |
69 | + .create(text); | |
70 | + assertThat(list.size(), is(1)); | |
71 | + assertThat(list.get(0).getClass().getName(), | |
72 | + is("jp.sf.fess.suggest.converter.ICUConverter")); | |
73 | + | |
74 | + } | |
75 | + | |
76 | + @Test | |
77 | + public void createEmpty() { | |
78 | + List<SuggestConverter> list; | |
79 | + | |
80 | + list = SuggestConverterCreator.create(""); | |
81 | + assertThat(list.size(), is(0)); | |
82 | + | |
83 | + list = SuggestConverterCreator.create(null); | |
84 | + assertThat(list.size(), is(0)); | |
85 | + } | |
86 | +} |
@@ -0,0 +1,14 @@ | ||
1 | +Copyright 2009-${year} the Fess Project and the Others. | |
2 | + | |
3 | +Licensed under the Apache License, Version 2.0 (the "License"); | |
4 | +you may not use this file except in compliance with the License. | |
5 | +You may obtain a copy of the License at | |
6 | + | |
7 | + http://www.apache.org/licenses/LICENSE-2.0 | |
8 | + | |
9 | +Unless required by applicable law or agreed to in writing, software | |
10 | +distributed under the License is distributed on an "AS IS" BASIS, | |
11 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, | |
12 | +either express or implied. See the License for the specific language | |
13 | +governing permissions and limitations under the License. | |
14 | + |
@@ -0,0 +1,13 @@ | ||
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<additionalHeaders> | |
3 | + <javadoc_style> | |
4 | + <firstLine>/*</firstLine> | |
5 | + <beforeEachLine> * </beforeEachLine> | |
6 | + <endLine> */</endLine> | |
7 | + <!--skipLine></skipLine--> | |
8 | + <firstLineDetectionPattern>(\s|\t)*/\*.*$</firstLineDetectionPattern> | |
9 | + <lastLineDetectionPattern>.*\*/(\s|\t)*$</lastLineDetectionPattern> | |
10 | + <allowBlankLines>false</allowBlankLines> | |
11 | + <isMultiline>true</isMultiline> | |
12 | + </javadoc_style> | |
13 | +</additionalHeaders> |