diff --git a/docs/elasticsearch.txt b/docs/elasticsearch.txt index 380b9af5ce..6160bfd201 100644 --- a/docs/elasticsearch.txt +++ b/docs/elasticsearch.txt @@ -13,6 +13,7 @@ JanusGraph supports https://www.elastic.co/[Elasticsearch] as an index backend. * *TTL*: Supports automatically expiring indexed elements. * *Collections*: Supports indexing SET and LIST cardinality properties. * *Temporal*: Nanosecond granularity temporal indexing. +* *Custom Analyzer*: Choose to use a custom analyzer Please see <> for details on what versions of ES will work with JanusGraph. @@ -189,4 +190,4 @@ For additional suggestions on how to increase write performance in Elasticsearch ==== Further Reading -* Please refer to the https://www.elastic.co[Elasticsearch homepage] and available documentation for more information on Elasticsearch and how to setup an Elasticsearch cluster. +* Please refer to the https://www.elastic.co[Elasticsearch homepage] and available documentation for more information on Elasticsearch and how to setup an Elasticsearch cluster. \ No newline at end of file diff --git a/docs/solr.txt b/docs/solr.txt index e3deaf81ef..a8c2c9ae21 100644 --- a/docs/solr.txt +++ b/docs/solr.txt @@ -12,6 +12,7 @@ JanusGraph supports http://lucene.apache.org/solr/[Solr] as an index backend. H * *Numeric Range*: Supports all numeric comparisons in `Compare`. * *TTL*: Supports automatically expiring indexed elements. * *Temporal*: Millisecond granularity temporal indexing. +* *Custom Analyzer*: Choose to use a custom analyzer Please see <> for details on what versions of Solr will work with JanusGraph. 
diff --git a/docs/textsearch.txt b/docs/textsearch.txt index 5c3e6b952b..81ced59537 100644 --- a/docs/textsearch.txt +++ b/docs/textsearch.txt @@ -129,3 +129,38 @@ Instead of individually adjusting the field mapping for every key added to a mix However, this approach has two limitations: 1) The user has to ensure that the property key names are valid field names for the indexing backend and 2) renaming the property key will NOT rename the field name in the index which can lead to naming collisions that the user has to be aware of and avoid. Note, that individual field mappings as described above can be used to overwrite the default name for a particular key. + + +==== Custom Analyzer + +By default, JanusGraph will use the default analyzer from the indexing backend for properties with Mapping.TEXT, and no analyzer for properties with Mapping.STRING. If one wants to use another analyzer, it can be explicitly specified through a parameter: ParameterType.TEXT_ANALYZER for Mapping.TEXT and ParameterType.STRING_ANALYZER for Mapping.STRING. + +===== For Elasticsearch + +The name of the analyzer must be set as the parameter value. 
+ +[source, gremlin] +mgmt = graph.openManagement() +string = mgmt.makePropertyKey('string').dataType(String.class).make() +text = mgmt.makePropertyKey('text').dataType(String.class).make() +textString = mgmt.makePropertyKey('textString').dataType(String.class).make() +mgmt.buildIndex('string', Vertex.class).addKey(string, Mapping.STRING.asParameter(), Parameter.of(ParameterType.STRING_ANALYZER.getName(), 'standard')).buildMixedIndex("search") +mgmt.buildIndex('text', Vertex.class).addKey(text, Mapping.TEXT.asParameter(), Parameter.of(ParameterType.TEXT_ANALYZER.getName(), 'english')).buildMixedIndex("search") +mgmt.buildIndex('textString', Vertex.class).addKey(textString, Mapping.TEXTSTRING.asParameter(), Parameter.of(ParameterType.STRING_ANALYZER.getName(), 'standard'), Parameter.of(ParameterType.TEXT_ANALYZER.getName(), 'english')).buildMixedIndex("search") +mgmt.commit() + +With these settings, JanusGraph will use the 'standard' analyzer for property key 'string' and the 'english' analyzer for property key 'text'. For property key 'textString', the 'english' analyzer is applied to the text field and the 'standard' analyzer to the string field. + +===== For Solr + +The class of the tokenizer must be set as the parameter value. + +[source, gremlin] +mgmt = graph.openManagement() +string = mgmt.makePropertyKey('string').dataType(String.class).make() +text = mgmt.makePropertyKey('text').dataType(String.class).make() +mgmt.buildIndex('string', Vertex.class).addKey(string, Mapping.STRING.asParameter(), Parameter.of(ParameterType.STRING_ANALYZER.getName(), 'org.apache.lucene.analysis.standard.StandardTokenizer')).buildMixedIndex("search") +mgmt.buildIndex('text', Vertex.class).addKey(text, Mapping.TEXT.asParameter(), Parameter.of(ParameterType.TEXT_ANALYZER.getName(), 'org.apache.lucene.analysis.core.WhitespaceTokenizer')).buildMixedIndex("search") +mgmt.commit() + +With these settings, JanusGraph will use the 'standard' tokenizer for property key 'string' and the 'whitespace' tokenizer for property key 'text'. 
\ No newline at end of file diff --git a/janusgraph-core/src/main/java/org/janusgraph/diskstorage/indexing/IndexFeatures.java b/janusgraph-core/src/main/java/org/janusgraph/diskstorage/indexing/IndexFeatures.java index a43bfdb2b0..d56c0cf5a7 100644 --- a/janusgraph-core/src/main/java/org/janusgraph/diskstorage/indexing/IndexFeatures.java +++ b/janusgraph-core/src/main/java/org/janusgraph/diskstorage/indexing/IndexFeatures.java @@ -35,11 +35,12 @@ public class IndexFeatures { private final ImmutableSet supportedStringMappings; private final String wildcardField; private final boolean supportsNanoseconds; - private ImmutableSet supportedCardinaities; + private final boolean supportsCustomAnalyzer; + private ImmutableSet supportedCardinalities; public IndexFeatures(boolean supportsDocumentTTL, Mapping defaultMap, - ImmutableSet supportedMap, String wildcardField, ImmutableSet supportedCardinaities, boolean supportsNanoseconds) { + ImmutableSet supportedMap, String wildcardField, ImmutableSet supportedCardinaities, boolean supportsNanoseconds, boolean supportCustomAnalyzer) { Preconditions.checkArgument(defaultMap!=null || defaultMap!=Mapping.DEFAULT); Preconditions.checkArgument(supportedMap!=null && !supportedMap.isEmpty() @@ -48,8 +49,9 @@ public IndexFeatures(boolean supportsDocumentTTL, this.defaultStringMapping = defaultMap; this.supportedStringMappings = supportedMap; this.wildcardField = wildcardField; - this.supportedCardinaities = supportedCardinaities; + this.supportedCardinalities = supportedCardinaities; this.supportsNanoseconds = supportsNanoseconds; + this.supportsCustomAnalyzer = supportCustomAnalyzer; } public boolean supportsDocumentTTL() { @@ -69,12 +71,16 @@ public String getWildcardField() { } public boolean supportsCardinality(Cardinality cardinality) { - return supportedCardinaities.contains(cardinality); + return supportedCardinalities.contains(cardinality); } public boolean supportsNanoseconds() { return supportsNanoseconds; } + + public 
boolean supportsCustomAnalyzer() { + return supportsCustomAnalyzer; + } public static class Builder { @@ -84,6 +90,7 @@ public static class Builder { private Set supportedCardinalities = Sets.newHashSet(); private String wildcardField = "*"; private boolean supportsNanoseconds; + private boolean supportsCustomAnalyzer; public Builder supportsDocumentTTL() { supportsDocumentTTL=true; @@ -114,10 +121,15 @@ public Builder supportsNanoseconds() { supportsNanoseconds = true; return this; } + + public Builder supportsCustomAnalyzer() { + supportsCustomAnalyzer = true; + return this; + } public IndexFeatures build() { return new IndexFeatures(supportsDocumentTTL, defaultStringMapping, - ImmutableSet.copyOf(supportedMappings), wildcardField, ImmutableSet.copyOf(supportedCardinalities), supportsNanoseconds); + ImmutableSet.copyOf(supportedMappings), wildcardField, ImmutableSet.copyOf(supportedCardinalities), supportsNanoseconds, supportsCustomAnalyzer); } diff --git a/janusgraph-core/src/main/java/org/janusgraph/graphdb/types/ParameterType.java b/janusgraph-core/src/main/java/org/janusgraph/graphdb/types/ParameterType.java index 667acc9fbe..10af72a0ee 100644 --- a/janusgraph-core/src/main/java/org/janusgraph/graphdb/types/ParameterType.java +++ b/janusgraph-core/src/main/java/org/janusgraph/graphdb/types/ParameterType.java @@ -35,8 +35,15 @@ public enum ParameterType { INDEX_GEO_MAX_LEVELS("index-geo-max-levels"), /** Distance error percent used to determine precision in spatial prefix tree where applicable. 
**/ - INDEX_GEO_DIST_ERROR_PCT("index-geo-dist-error-pct"); - + INDEX_GEO_DIST_ERROR_PCT("index-geo-dist-error-pct"), + + /** Analyzer for String Type with mapping STRING**/ + STRING_ANALYZER("string-analyzer"), + + /** Analyzer for String Type with mapping TEXT**/ + TEXT_ANALYZER("text-analyzer"), + ; + private final String name; private ParameterType(String name) { diff --git a/janusgraph-es/src/main/java/org/janusgraph/diskstorage/es/ElasticSearchIndex.java b/janusgraph-es/src/main/java/org/janusgraph/diskstorage/es/ElasticSearchIndex.java index 6157f90f12..5ba241c844 100644 --- a/janusgraph-es/src/main/java/org/janusgraph/diskstorage/es/ElasticSearchIndex.java +++ b/janusgraph-es/src/main/java/org/janusgraph/diskstorage/es/ElasticSearchIndex.java @@ -106,6 +106,12 @@ public class ElasticSearchIndex implements IndexProvider { private static final String STRING_MAPPING_SUFFIX = "__STRING"; + private static final String NOT_ANALYZED = "not_analyzed"; + + private static final String ANALYZER = "analyzer"; + + private static final String INDEX = "index"; + public static final ConfigNamespace ELASTICSEARCH_NS = new ConfigNamespace(INDEX_NS, "elasticsearch", "Elasticsearch index configuration"); @@ -163,7 +169,7 @@ public class ElasticSearchIndex implements IndexProvider { new ConfigNamespace(ES_CREATE_NS, "ext", "Overrides for arbitrary settings applied at index creation", true); private static final IndexFeatures ES_FEATURES = new IndexFeatures.Builder() - .setDefaultStringMapping(Mapping.TEXT).supportedStringMappings(Mapping.TEXT, Mapping.TEXTSTRING, Mapping.STRING).setWildcardField("_all").supportsCardinality(Cardinality.SINGLE).supportsCardinality(Cardinality.LIST).supportsCardinality(Cardinality.SET).supportsNanoseconds().build(); + .setDefaultStringMapping(Mapping.TEXT).supportedStringMappings(Mapping.TEXT, Mapping.TEXTSTRING, 
Mapping.STRING).setWildcardField("_all").supportsCardinality(Cardinality.SINGLE).supportsCardinality(Cardinality.LIST).supportsCardinality(Cardinality.SET).supportsNanoseconds().supportsCustomAnalyzer().build(); public static final int HOST_PORT_DEFAULT = 9200; @@ -228,7 +234,6 @@ private void checkForOrCreateIndex(Configuration config) throws IOException { ElasticSearchSetup.applySettingsFromJanusGraphConf(settings, config, ES_CREATE_EXTRAS_NS); settings.put("index.max_result_window", Integer.MAX_VALUE); - client.createIndex(indexName, settings.build()); try { @@ -298,19 +303,34 @@ public void register(String store, String key, KeyInformation information, BaseT if (map==Mapping.DEFAULT) map=Mapping.TEXT; log.debug("Registering string type for {} with mapping {}", key, map); mapping.field("type", "string"); + String stringAnalyzer = (String) ParameterType.STRING_ANALYZER.findParameter(information.getParameters(), null); + String textAnalyzer = (String) ParameterType.TEXT_ANALYZER.findParameter(information.getParameters(), null); switch (map) { case STRING: - mapping.field("index","not_analyzed"); + if (stringAnalyzer != null) { + mapping.field(ANALYZER, stringAnalyzer); + } else { + mapping.field(INDEX, NOT_ANALYZED); + } break; case TEXT: - //default, do nothing + if (textAnalyzer != null) { + mapping.field(ANALYZER, textAnalyzer); + } break; case TEXTSTRING: + if (textAnalyzer != null) { + mapping.field(ANALYZER, textAnalyzer); + } mapping.endObject(); //add string mapping mapping.startObject(getDualMappingName(key)); mapping.field("type", "string"); - mapping.field("index","not_analyzed"); + if (stringAnalyzer != null) { + mapping.field(ANALYZER, stringAnalyzer); + } else { + mapping.field(INDEX, NOT_ANALYZED); + } break; default: throw new AssertionError("Unexpected mapping: "+map); } @@ -359,7 +379,7 @@ public void register(String store, String key, KeyInformation information, BaseT } else if (dataType == UUID.class) { log.debug("Registering uuid type for {}", 
key); mapping.field("type", "string"); - mapping.field("index","not_analyzed"); + mapping.field(INDEX, NOT_ANALYZED); } mapping.endObject().endObject().endObject(); @@ -687,28 +707,20 @@ public QueryBuilder getFilter(Condition condition, KeyInformation.StoreRetrie throw new IllegalArgumentException("String mapped string values do not support CONTAINS queries: " + janusgraphPredicate); if (map==Mapping.TEXTSTRING && !janusgraphPredicate.toString().startsWith("CONTAINS")) fieldName = getDualMappingName(key); - - if (janusgraphPredicate == Text.CONTAINS) { - value = ((String) value).toLowerCase(); - BoolQueryBuilder b = QueryBuilders.boolQuery(); - for (String term : Text.tokenize((String)value)) { - b.must(QueryBuilders.termQuery(fieldName, term)); - } - return b; + if (janusgraphPredicate == Text.CONTAINS || janusgraphPredicate == Cmp.EQUAL) { + return QueryBuilders.matchQuery(fieldName, value).operator(Operator.AND); } else if (janusgraphPredicate == Text.CONTAINS_PREFIX) { - value = ((String) value).toLowerCase(); + value = ParameterType.TEXT_ANALYZER.findParameter(informations.get(key).getParameters(), null)!=null?((String) value):((String) value).toLowerCase(); return QueryBuilders.prefixQuery(fieldName, (String) value); } else if (janusgraphPredicate == Text.CONTAINS_REGEX) { - value = ((String) value).toLowerCase(); + value = ParameterType.TEXT_ANALYZER.findParameter(informations.get(key).getParameters(), null)!=null?((String) value):((String) value).toLowerCase(); return QueryBuilders.regexpQuery(fieldName, (String) value); } else if (janusgraphPredicate == Text.PREFIX) { return QueryBuilders.prefixQuery(fieldName, (String) value); } else if (janusgraphPredicate == Text.REGEX) { return QueryBuilders.regexpQuery(fieldName, (String) value); - } else if (janusgraphPredicate == Cmp.EQUAL) { - return QueryBuilders.termQuery(fieldName, (String) value); } else if (janusgraphPredicate == Cmp.NOT_EQUAL) { - return 
QueryBuilders.boolQuery().mustNot(QueryBuilders.termQuery(fieldName, (String) value)); + return QueryBuilders.boolQuery().mustNot(QueryBuilders.matchQuery(fieldName, value).operator(Operator.AND)); } else if (janusgraphPredicate == Text.FUZZY || janusgraphPredicate == Text.CONTAINS_FUZZY){ return QueryBuilders.matchQuery(fieldName, (String) value).fuzziness(Fuzziness.AUTO).operator(Operator.AND); } else @@ -835,9 +847,7 @@ public QueryBuilder getFilter(Condition condition, KeyInformation.StoreRetrie @Override public List query(IndexQuery query, KeyInformation.IndexRetriever informations, BaseTransaction tx) throws BackendException { ElasticSearchRequest sr = new ElasticSearchRequest(); - - sr.setQuery(QueryBuilders.matchAllQuery()); - sr.setPostFilter(getFilter(query.getCondition(),informations.get(query.getStore()))); + sr.setQuery(getFilter(query.getCondition(),informations.get(query.getStore()))); if (!query.getOrder().isEmpty()) { List orders = query.getOrder(); for (int i = 0; i < orders.size(); i++) { diff --git a/janusgraph-es/src/test/java/org/janusgraph/diskstorage/es/ElasticSearchConfigTest.java b/janusgraph-es/src/test/java/org/janusgraph/diskstorage/es/ElasticSearchConfigTest.java index 6a457417e9..3d23341ec7 100644 --- a/janusgraph-es/src/test/java/org/janusgraph/diskstorage/es/ElasticSearchConfigTest.java +++ b/janusgraph-es/src/test/java/org/janusgraph/diskstorage/es/ElasticSearchConfigTest.java @@ -132,7 +132,7 @@ private void simpleWriteAndQuery(IndexProvider idx) throws BackendException, Int final Duration maxWrite = Duration.ofMillis(2000L); final String storeName = "jvmlocal_test_store"; - final KeyInformation.IndexRetriever indexRetriever = IndexProviderTest.getIndexRetriever(IndexProviderTest.getMapping(idx.getFeatures())); + final KeyInformation.IndexRetriever indexRetriever = IndexProviderTest.getIndexRetriever(IndexProviderTest.getMapping(idx.getFeatures(), "standard", "keyword")); BaseTransactionConfig txConfig = 
StandardBaseTransactionConfig.of(TimestampProviders.MILLI); IndexTransaction itx = new IndexTransaction(idx, indexRetriever, txConfig, maxWrite); diff --git a/janusgraph-es/src/test/java/org/janusgraph/diskstorage/es/ElasticSearchIndexTest.java b/janusgraph-es/src/test/java/org/janusgraph/diskstorage/es/ElasticSearchIndexTest.java index 661f6aed0c..24a090c8f9 100644 --- a/janusgraph-es/src/test/java/org/janusgraph/diskstorage/es/ElasticSearchIndexTest.java +++ b/janusgraph-es/src/test/java/org/janusgraph/diskstorage/es/ElasticSearchIndexTest.java @@ -17,6 +17,7 @@ import com.google.common.base.Throwables; import com.google.common.collect.HashMultimap; import com.google.common.collect.Multimap; + import org.apache.commons.lang.RandomStringUtils; import org.janusgraph.core.Cardinality; import org.janusgraph.core.JanusGraphException; @@ -75,6 +76,16 @@ public boolean supportsLuceneStyleQueries() { return true; } + @Override + public String getEnglishAnalyzerName() { + return "english"; + } + + @Override + public String getKeywordAnalyzerName() { + return "keyword"; + } + public Configuration getESTestConfig() { final String index = "es"; ModifiableConfiguration config = GraphDatabaseConfiguration.buildGraphConfiguration(); @@ -192,5 +203,4 @@ public void testUpdateAdditionWithLongString() throws Exception { assertEquals(0, tx.query(new IndexQuery("vertex", PredicateCondition.of(TEXT, Text.CONTAINS, "bob"))).size()); assertEquals(1, tx.query(new IndexQuery("vertex", PredicateCondition.of(TEXT, Text.CONTAINS, "world"))).size()); } - } diff --git a/janusgraph-lucene/src/test/java/org/janusgraph/diskstorage/lucene/LuceneIndexTest.java b/janusgraph-lucene/src/test/java/org/janusgraph/diskstorage/lucene/LuceneIndexTest.java index 3f1ad9713a..541f383ef3 100644 --- a/janusgraph-lucene/src/test/java/org/janusgraph/diskstorage/lucene/LuceneIndexTest.java +++ b/janusgraph-lucene/src/test/java/org/janusgraph/diskstorage/lucene/LuceneIndexTest.java @@ -61,6 +61,16 @@ public 
boolean supportsLuceneStyleQueries() { return false; } + @Override + public String getEnglishAnalyzerName() { + return null; + } + + @Override + public String getKeywordAnalyzerName() { + return null; + } + public static final Configuration getLocalLuceneTestConfig() { final String index = "lucene"; ModifiableConfiguration config = GraphDatabaseConfiguration.buildGraphConfiguration(); diff --git a/janusgraph-solr/src/main/java/org/janusgraph/diskstorage/solr/SolrIndex.java b/janusgraph-solr/src/main/java/org/janusgraph/diskstorage/solr/SolrIndex.java index b665a47d73..630ecd7234 100644 --- a/janusgraph-solr/src/main/java/org/janusgraph/diskstorage/solr/SolrIndex.java +++ b/janusgraph-solr/src/main/java/org/janusgraph/diskstorage/solr/SolrIndex.java @@ -36,8 +36,12 @@ import org.janusgraph.graphdb.query.condition.*; import org.janusgraph.graphdb.types.ParameterType; +import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.http.client.HttpClient; +import org.apache.lucene.analysis.CachingTokenFilter; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; import org.apache.solr.client.solrj.*; import org.apache.solr.client.solrj.impl.*; import org.apache.solr.client.solrj.request.CollectionAdminRequest; @@ -54,11 +58,12 @@ import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.zookeeper.KeeperException; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.io.StringReader; +import java.lang.reflect.Constructor; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.time.Instant; @@ -163,7 +168,7 @@ public static Mode parse(String mode) { private static final IndexFeatures SOLR_FEATURES = new IndexFeatures.Builder().supportsDocumentTTL() - .setDefaultStringMapping(Mapping.TEXT).supportedStringMappings(Mapping.TEXT, 
Mapping.STRING).supportsCardinality(Cardinality.SINGLE).build(); + .setDefaultStringMapping(Mapping.TEXT).supportedStringMappings(Mapping.TEXT, Mapping.STRING).supportsCardinality(Cardinality.SINGLE).supportsCustomAnalyzer().build(); private static Map SPATIAL_PREDICATES = spatialPredicates(); @@ -240,6 +245,7 @@ private String getKeyFieldId(String collection) { * @param tx enclosing transaction * @throws org.janusgraph.diskstorage.BackendException */ + @SuppressWarnings("unchecked") @Override public void register(String store, String key, KeyInformation information, BaseTransaction tx) throws BackendException { if (mode==Mode.CLOUD) { @@ -257,6 +263,27 @@ public void register(String store, String key, KeyInformation information, BaseT } } //Since all data types must be defined in the schema.xml, pre-registering a type does not work + //But we check Analyse feature + String analyzer = (String) ParameterType.STRING_ANALYZER.findParameter(information.getParameters(), null); + if (analyzer != null) { + //If the key have a tokenizer, we try to get it by reflextion + try { + ((Constructor) ClassLoader.getSystemClassLoader().loadClass(analyzer) + .getConstructor()).newInstance(); + } catch (ReflectiveOperationException e) { + throw new PermanentBackendException(e.getMessage(),e); + } + } + analyzer = (String) ParameterType.TEXT_ANALYZER.findParameter(information.getParameters(), null); + if (analyzer != null) { + //If the key have a tokenizer, we try to get it by reflextion + try { + ((Constructor) ClassLoader.getSystemClassLoader().loadClass(analyzer) + .getConstructor()).newInstance(); + } catch (ReflectiveOperationException e) { + throw new PermanentBackendException(e.getMessage(),e); + } + } } @Override @@ -554,31 +581,18 @@ public String buildQueryFilter(Condition condition, KeyInform //Special case if (janusgraphPredicate == Text.CONTAINS) { - //e.g. 
- if terms tomorrow and world were supplied, and fq=text:(tomorrow world) - //sample data set would return 2 documents: one where text = Tomorrow is the World, - //and the second where text = Hello World. Hence, we are decomposing the query string - //and building an AND query explicitly because we need AND semantics - value = ((String) value).toLowerCase(); - List terms = Text.tokenize((String) value); - - if (terms.isEmpty()) { - return ""; - } else if (terms.size() == 1) { - return (key + ":(" + escapeValue(terms.get(0)) + ")"); - } else { - And andTerms = new And(); - for (String term : terms) { - andTerms.add(new PredicateCondition(key, janusgraphPredicate, term)); - } - return buildQueryFilter(andTerms, informations); - } - } - if (janusgraphPredicate == Text.PREFIX || janusgraphPredicate == Text.CONTAINS_PREFIX) { + return tokenize(informations, value, key, janusgraphPredicate, (String) ParameterType.TEXT_ANALYZER.findParameter(informations.get(key).getParameters(), null)); + } else if (janusgraphPredicate == Text.PREFIX || janusgraphPredicate == Text.CONTAINS_PREFIX) { return (key + ":" + escapeValue(value) + "*"); } else if (janusgraphPredicate == Text.REGEX || janusgraphPredicate == Text.CONTAINS_REGEX) { return (key + ":/" + value + "/"); } else if (janusgraphPredicate == Cmp.EQUAL) { - return (key + ":\"" + escapeValue(value) + "\""); + String tokenizer = (String) ParameterType.STRING_ANALYZER.findParameter(informations.get(key).getParameters(), null); + if(tokenizer != null){ + return tokenize(informations, value, key, janusgraphPredicate,tokenizer); + } else { + return (key + ":\"" + escapeValue(value) + "\""); + } } else if (janusgraphPredicate == Cmp.NOT_EQUAL) { return ("-" + key + ":\"" + escapeValue(value) + "\""); } else if (janusgraphPredicate == Text.FUZZY || janusgraphPredicate == Text.CONTAINS_FUZZY) { @@ -692,6 +706,49 @@ public String buildQueryFilter(Condition condition, KeyInform } } + private String 
tokenize(KeyInformation.StoreRetriever informations, Object value, String key, + JanusGraphPredicate janusgraphPredicate, String tokenizer) { + List terms; + if(tokenizer != null){ + terms = customTokenize(tokenizer, (String) value); + } else { + terms = Text.tokenize((String) value); + } + if (terms.isEmpty()) { + return ""; + } else if (terms.size() == 1) { + return (key + ":(" + escapeValue(terms.get(0)) + ")"); + } else { + And andTerms = new And(); + for (String term : terms) { + andTerms.add(new PredicateCondition(key, janusgraphPredicate, term)); + } + return buildQueryFilter(andTerms, informations); + } + } + + @SuppressWarnings("unchecked") + private List customTokenize(String tokenizerClass, String value){ + CachingTokenFilter stream = null; + try { + List terms = new ArrayList<>(); + Tokenizer tokenizer = ((Constructor) ClassLoader.getSystemClassLoader().loadClass(tokenizerClass) + .getConstructor()).newInstance(); + tokenizer.setReader(new StringReader(value)); + stream = new CachingTokenFilter(tokenizer); + TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); + stream.reset(); + while (stream.incrementToken()) { + terms.add(termAtt.getBytesRef().utf8ToString()); + } + return terms; + } catch ( ReflectiveOperationException | IOException e) { + throw new IllegalArgumentException(e.getMessage(),e); + } finally { + IOUtils.closeQuietly(stream); + } + } + private String toIsoDate(Date value) { TimeZone tz = TimeZone.getTimeZone("UTC"); DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"); diff --git a/janusgraph-solr/src/test/java/org/janusgraph/diskstorage/solr/SolrIndexTest.java b/janusgraph-solr/src/test/java/org/janusgraph/diskstorage/solr/SolrIndexTest.java index 11282770b3..77ef2170f8 100644 --- a/janusgraph-solr/src/test/java/org/janusgraph/diskstorage/solr/SolrIndexTest.java +++ b/janusgraph-solr/src/test/java/org/janusgraph/diskstorage/solr/SolrIndexTest.java @@ -14,6 +14,8 @@ package 
org.janusgraph.diskstorage.solr; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.janusgraph.core.Cardinality; import org.janusgraph.core.attribute.Cmp; import org.janusgraph.core.attribute.Geo; @@ -64,6 +66,16 @@ public boolean supportsLuceneStyleQueries() { return true; } + @Override + public String getEnglishAnalyzerName() { + return WhitespaceTokenizer.class.getName(); + } + + @Override + public String getKeywordAnalyzerName() { + return KeywordTokenizer.class.getName(); + } + private Configuration getLocalSolrTestConfig() { final String index = "solr"; ModifiableConfiguration config = GraphDatabaseConfiguration.buildGraphConfiguration(); diff --git a/janusgraph-solr/src/test/resources/solr/core-template/schema.xml b/janusgraph-solr/src/test/resources/solr/core-template/schema.xml index 726187aceb..eb6201ed13 100644 --- a/janusgraph-solr/src/test/resources/solr/core-template/schema.xml +++ b/janusgraph-solr/src/test/resources/solr/core-template/schema.xml @@ -478,6 +478,13 @@ + + + + + + + @@ -556,6 +563,10 @@ + + + + diff --git a/janusgraph-test/src/main/java/org/janusgraph/diskstorage/indexing/IndexProviderTest.java b/janusgraph-test/src/main/java/org/janusgraph/diskstorage/indexing/IndexProviderTest.java index 964b8e7e15..e2594890ac 100644 --- a/janusgraph-test/src/main/java/org/janusgraph/diskstorage/indexing/IndexProviderTest.java +++ b/janusgraph-test/src/main/java/org/janusgraph/diskstorage/indexing/IndexProviderTest.java @@ -29,6 +29,7 @@ import org.janusgraph.diskstorage.util.time.TimestampProviders; import org.janusgraph.graphdb.query.JanusGraphPredicate; import org.janusgraph.graphdb.query.condition.*; +import org.janusgraph.graphdb.types.ParameterType; import org.janusgraph.testutil.RandomGenerator; import org.junit.After; @@ -57,7 +58,8 @@ public abstract class IndexProviderTest { protected Map allKeys; protected KeyInformation.IndexRetriever indexRetriever; - public 
static final String TEXT = "text", TIME = "time", WEIGHT = "weight", LOCATION = "location", BOUNDARY = "boundary", NAME = "name", PHONE_LIST = "phone_list", PHONE_SET = "phone_set", DATE = "date"; + public static final String TEXT = "text", TIME = "time", WEIGHT = "weight", LOCATION = "location", BOUNDARY = "boundary", NAME = "name", PHONE_LIST = "phone_list", PHONE_SET = "phone_set", DATE = "date", STRING="string", + /* STRING, ANALYZED, FULL_TEXT and KEYWORD are the keys that getMapping registers with explicit analyzer parameters */ ANALYZED="analyzed", FULL_TEXT="full_text", KEYWORD="keyword"; public static StandardKeyInformation of(Class clazz, Cardinality cardinality, Parameter... paras) { return new StandardKeyInformation(clazz, cardinality, paras); @@ -84,28 +86,38 @@ public KeyInformation get(String key) { }; } - public static final Map getMapping(final IndexFeatures indexFeatures) { + /** Builds the map from key name to key information used by the index tests. Text-mapped keys use Mapping.TEXT when the index supports it and Mapping.TEXTSTRING otherwise; string-mapped keys use Mapping.STRING when supported and Mapping.TEXTSTRING otherwise. PHONE_LIST and PHONE_SET are registered only when the index supports LIST and SET cardinality, and FULL_TEXT only when it supports Mapping.TEXTSTRING. @param indexFeatures features of the index under test @param englishAnalyzerName analyzer name attached to STRING (as STRING_ANALYZER), ANALYZED (as TEXT_ANALYZER) and FULL_TEXT (as both) @param keywordAnalyzerName analyzer name attached to KEYWORD (as TEXT_ANALYZER) @return a new map holding the key information for every registered key @throws IllegalArgumentException via Preconditions.checkArgument when the index supports neither TEXTSTRING nor both TEXT and STRING */ public static final Map getMapping(final IndexFeatures indexFeatures, final String englishAnalyzerName, final String keywordAnalyzerName) { Preconditions.checkArgument(indexFeatures.supportsStringMapping(Mapping.TEXTSTRING) || (indexFeatures.supportsStringMapping(Mapping.TEXT) && indexFeatures.supportsStringMapping(Mapping.STRING)), "Index must support string and text mapping"); + Parameter textParameter = indexFeatures.supportsStringMapping(Mapping.TEXT) ? Mapping.TEXT.asParameter() : Mapping.TEXTSTRING.asParameter(); + Parameter stringParameter = indexFeatures.supportsStringMapping(Mapping.STRING) ? 
Mapping.STRING.asParameter() : Mapping.TEXTSTRING.asParameter(); return new HashMap() {{ - put(TEXT,new StandardKeyInformation(String.class, Cardinality.SINGLE, new Parameter("mapping", - indexFeatures.supportsStringMapping(Mapping.TEXT)?Mapping.TEXT:Mapping.TEXTSTRING))); + put(TEXT,new StandardKeyInformation(String.class, Cardinality.SINGLE,textParameter)); put(TIME,new StandardKeyInformation(Long.class, Cardinality.SINGLE)); - put(WEIGHT,new StandardKeyInformation(Double.class, Cardinality.SINGLE, new Parameter("mapping",Mapping.DEFAULT))); + put(WEIGHT,new StandardKeyInformation(Double.class, Cardinality.SINGLE, Mapping.DEFAULT.asParameter())); put(LOCATION,new StandardKeyInformation(Geoshape.class, Cardinality.SINGLE)); - put(BOUNDARY,new StandardKeyInformation(Geoshape.class, Cardinality.SINGLE, new Parameter("mapping",Mapping.PREFIX_TREE))); - put(NAME,new StandardKeyInformation(String.class, Cardinality.SINGLE, new Parameter("mapping", - indexFeatures.supportsStringMapping(Mapping.STRING)?Mapping.STRING:Mapping.TEXTSTRING))); + put(BOUNDARY,new StandardKeyInformation(Geoshape.class, Cardinality.SINGLE, Mapping.PREFIX_TREE.asParameter())); + put(NAME,new StandardKeyInformation(String.class, Cardinality.SINGLE,stringParameter)); if(indexFeatures.supportsCardinality(Cardinality.LIST)) { - put(PHONE_LIST, new StandardKeyInformation(String.class, Cardinality.LIST, new Parameter("mapping", - indexFeatures.supportsStringMapping(Mapping.STRING) ? Mapping.STRING : Mapping.TEXTSTRING))); + put(PHONE_LIST, new StandardKeyInformation(String.class, Cardinality.LIST, stringParameter)); } if(indexFeatures.supportsCardinality(Cardinality.SET)) { - put(PHONE_SET, new StandardKeyInformation(String.class, Cardinality.SET, new Parameter("mapping", - indexFeatures.supportsStringMapping(Mapping.STRING) ? 
Mapping.STRING : Mapping.TEXTSTRING))); + put(PHONE_SET, new StandardKeyInformation(String.class, Cardinality.SET, stringParameter)); } put(DATE,new StandardKeyInformation(Instant.class, Cardinality.SINGLE)); + /* the remaining keys carry an explicit analyzer parameter next to their mapping parameter */ put(STRING, new StandardKeyInformation(String.class, Cardinality.SINGLE, stringParameter, + new Parameter(ParameterType.STRING_ANALYZER.getName(), englishAnalyzerName))); + put(ANALYZED, new StandardKeyInformation(String.class, Cardinality.SINGLE, textParameter, + new Parameter(ParameterType.TEXT_ANALYZER.getName(), englishAnalyzerName))); + if(indexFeatures.supportsStringMapping(Mapping.TEXTSTRING)){ + put(FULL_TEXT, new StandardKeyInformation(String.class, Cardinality.SINGLE, + Mapping.TEXTSTRING.asParameter(), + new Parameter(ParameterType.STRING_ANALYZER.getName(), englishAnalyzerName), + new Parameter(ParameterType.TEXT_ANALYZER.getName(), englishAnalyzerName))); + } + put(KEYWORD, new StandardKeyInformation(String.class, Cardinality.SINGLE, textParameter, + new Parameter(ParameterType.TEXT_ANALYZER.getName(), keywordAnalyzerName))); }}; } @@ -113,6 +125,10 @@ public static final Map getMapping(final IndexFeatures in public abstract boolean supportsLuceneStyleQueries(); + /** Analyzer name supplied by the concrete test class; open() passes it to getMapping as englishAnalyzerName. */ public abstract String getEnglishAnalyzerName(); + + /** Analyzer name supplied by the concrete test class; open() passes it to getMapping as keywordAnalyzerName. */ public abstract String getKeywordAnalyzerName(); + @Before public void setUp() throws Exception { index = openIndex(); @@ -124,7 +140,7 @@ public void setUp() throws Exception { public void open() throws BackendException { index = openIndex(); indexFeatures = index.getFeatures(); - allKeys = getMapping(indexFeatures); + allKeys = getMapping(indexFeatures, getEnglishAnalyzerName(), getKeywordAnalyzerName()); indexRetriever = getIndexRetriever(allKeys); newTx(); @@ -872,6 +888,76 @@ public void run(IndexTransaction tx) { checkResult(new IndexQuery(defStore, PredicateCondition.of(TEXT, Text.CONTAINS, "brown")),null); } + /** + * Indexes one document whose STRING, ANALYZED, KEYWORD and (when TEXTSTRING is supported) FULL_TEXT fields all hold "Tom and Jerry", then checks that each query issued against those fields returns exactly one hit. Returns without checking anything when the index does not report custom analyzer support. + * @throws Exception if indexing or querying fails + */ + @Test + public void testCustomAnalyzer() throws Exception { 
+ /* nothing to verify on an index that does not report custom analyzer support */ if (!indexFeatures.supportsCustomAnalyzer()) + return; + String store = "vertex"; + initialize(store); + Multimap initialDoc = HashMultimap.create(); + + /* the same value goes into every analyzed field of the single test document */ initialDoc.put(STRING, "Tom and Jerry"); + initialDoc.put(ANALYZED, "Tom and Jerry"); + if(indexFeatures.supportsStringMapping(Mapping.TEXTSTRING)) + initialDoc.put(FULL_TEXT, "Tom and Jerry"); + initialDoc.put(KEYWORD, "Tom and Jerry"); + add(store, "docId", initialDoc, true); + clopen(); + + /* STRING key: each equality, prefix and regex query below must return exactly one hit. NOTE(review): the hits for "Tom Jerry", "Tom or Jerry" and the lower-case "jerr" presumably depend on the configured analyzer dropping stop words and lower-casing tokens; confirm per backend. */ IndexQuery query = new IndexQuery(store, PredicateCondition.of(STRING, Cmp.EQUAL, "Tom and Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(STRING, Cmp.EQUAL, "Tom Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(STRING, Cmp.EQUAL, "Tom or Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(STRING, Text.PREFIX, "jerr")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(STRING, Text.REGEX, "jer.*")); + assertEquals(query.toString(), 1, tx.query(query).size()); + /* ANALYZED key: the same inputs through the CONTAINS family of predicates, again one hit each */ query = new IndexQuery(store, PredicateCondition.of(ANALYZED, Text.CONTAINS, "Tom and Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(ANALYZED, Text.CONTAINS, "Tom Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(ANALYZED, Text.CONTAINS, "Tom or Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(ANALYZED, Text.CONTAINS_PREFIX, "jerr")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(ANALYZED, Text.CONTAINS_REGEX, "jer.*")); + assertEquals(query.toString(), 1, tx.query(query).size()); + 
/* FULL_TEXT was only indexed when TEXTSTRING is supported; it is queried with both the string-style and the CONTAINS-style predicates */ if(indexFeatures.supportsStringMapping(Mapping.TEXTSTRING)){ + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Cmp.EQUAL, "Tom and Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Cmp.EQUAL, "Tom Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Cmp.EQUAL, "Tom or Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Text.PREFIX, "jerr")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Text.REGEX, "jer.*")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Text.CONTAINS, "Tom and Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Text.CONTAINS, "Tom Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Text.CONTAINS, "Tom or Jerry")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Text.CONTAINS_PREFIX, "jerr")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(FULL_TEXT, Text.CONTAINS_REGEX, "jer.*")); + assertEquals(query.toString(), 1, tx.query(query).size()); + } + + /* KEYWORD key: the prefix and regex here keep the indexed capitalisation. NOTE(review): presumably because the keyword analyzer keeps the value as one unmodified token; confirm per backend. */ query = new IndexQuery(store, PredicateCondition.of(KEYWORD, Text.CONTAINS_PREFIX, "Tom")); + assertEquals(query.toString(), 1, tx.query(query).size()); + query = new IndexQuery(store, PredicateCondition.of(KEYWORD, Text.CONTAINS_REGEX, ".*Jer.*")); + assertEquals(query.toString(), 1, tx.query(query).size()); + + } + /* 
================================================================================== HELPER METHODS ==================================================================================*/