the-algorithm/src/java/com/twitter/search/common/schema/AnalyzerFactory.java

143 lines
5.0 KiB
Java

package com.twitter.search.common.schema;
import java.io.Reader;
import java.text.ParseException;
import java.util.Map;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.fa.PersianCharFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import com.twitter.search.common.schema.thriftjava.ThriftAnalyzer;
import com.twitter.search.common.schema.thriftjava.ThriftClassInstantiater;
import com.twitter.search.common.schema.thriftjava.ThriftCustomAnalyzer;
public class AnalyzerFactory {
private static final Logger LOG = LoggerFactory.getLogger(AnalyzerFactory.class);
private static final String MATCH_VERSION_ARG_NAME = "matchVersion";
private static final String STANDARD_ANALYZER = "StandardAnalyzer";
private static final String WHITESPACE_ANALYZER = "WhitespaceAnalyzer";
private static final String SEARCH_WHITESPACE_ANALYZER = "SearchWhitespaceAnalyzer";
private static final String HTML_STRIP_CHAR_FILTER = "HTMLStripCharFilter";
private static final String PERSIAN_CHAR_FILTER = "PersianCharFilter";
/**
* Return a Lucene Analyzer based on the given ThriftAnalyzer.
*/
public Analyzer getAnalyzer(ThriftAnalyzer analyzer) {
if (analyzer.isSetAnalyzer()) {
return resolveAnalyzerClass(analyzer.getAnalyzer());
} else if (analyzer.isSetCustomAnalyzer()) {
return buildCustomAnalyzer(analyzer.getCustomAnalyzer());
}
return new SearchWhitespaceAnalyzer();
}
private Analyzer resolveAnalyzerClass(ThriftClassInstantiater classDef) {
Map<String, String> params = classDef.getParams();
Version matchVersion = Version.LUCENE_8_5_2;
String matchVersionName = getArg(params, MATCH_VERSION_ARG_NAME);
if (matchVersionName != null) {
try {
matchVersion = Version.parse(matchVersionName);
} catch (ParseException e) {
// ignore and use default version
LOG.warn("Unable to parse match version: " + matchVersionName
+ ". Will use default version of 8.5.2.");
}
}
if (classDef.getClassName().equals(STANDARD_ANALYZER)) {
String stopwords = getArg(params, "stopwords");
if (stopwords != null) {
CharArraySet stopwordSet = new CharArraySet(
Lists.newLinkedList(Splitter.on(",").split(stopwords)),
false);
return new StandardAnalyzer(stopwordSet);
} else {
return new StandardAnalyzer();
}
} else if (classDef.getClassName().equals(WHITESPACE_ANALYZER)) {
return new WhitespaceAnalyzer();
} else if (classDef.getClassName().equals(SEARCH_WHITESPACE_ANALYZER)) {
return new SearchWhitespaceAnalyzer();
}
return null;
}
private Analyzer buildCustomAnalyzer(final ThriftCustomAnalyzer customAnalyzer) {
return new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer tokenizer = resolveTokenizerClass(customAnalyzer.getTokenizer());
TokenStream filter = tokenizer;
if (customAnalyzer.isSetFilters()) {
for (ThriftClassInstantiater filterClass : customAnalyzer.getFilters()) {
filter = resolveTokenFilterClass(filterClass, filter);
}
}
return new TokenStreamComponents(tokenizer, filter);
}
};
}
private Tokenizer resolveTokenizerClass(ThriftClassInstantiater classDef) {
return null;
}
private TokenStream resolveTokenFilterClass(ThriftClassInstantiater classDef, TokenStream input) {
return null;
}
private CharFilter resolveCharFilterClass(ThriftClassInstantiater classDef, Reader input) {
if (classDef.getClassName().equals(HTML_STRIP_CHAR_FILTER)) {
String escapedTags = getArg(classDef.getParams(), "excapedTags");
if (escapedTags != null) {
return new HTMLStripCharFilter(input, Sets.newHashSet(Splitter.on(",").split(escapedTags)));
} else {
return new HTMLStripCharFilter(input);
}
} else if (classDef.getClassName().equals(PERSIAN_CHAR_FILTER)) {
return new PersianCharFilter(input);
}
throw new ClassNotSupportedException("CharFilter", classDef);
}
private String getArg(Map<String, String> args, String arg) {
if (args == null) {
return null;
}
return args.get(arg);
}
public final class ClassNotSupportedException extends RuntimeException {
private ClassNotSupportedException(String type, ThriftClassInstantiater classDef) {
super(type + " class with name " + classDef.getClassName() + " currently not supported.");
}
}
}