Skip to content

Commit ae0a15d

Browse files
fix: align the handling of regular expressions between hbase & bigtable (#4430)
* fix: align the handling of regular expressions between hbase & bigtable When using RegexStringComparator, HBase will try to match a substring of the target. While bigtable requires a full match. For example, given the target "abcd", in HBase the regex "[bc]*" will match the string. However Bigtable will not. Bigtable will require the regex to also match a & d. This PR addresses this discrepancy by surrounding the user provided regex with a zero or more wildcards. ie transform "[bc]*" into ".*[bc]*.*" Change-Id: I7404bddd4c9c9d907d57225d99f7062c4407fc35 * avoid wrapping empty strings in wildcards Change-Id: I60325d415f24fe6dbd02fa84978047e5759128cf * update unit tests Change-Id: I617e968dc0d2600952f2e06d1789b6e9e9539e6b * remove stray debug statement Change-Id: Ie07e3c5d4beb03af93af8dbfa98847010f90457d * fix typo in regex test and re-order the test cases to make it easier to catch the typo Change-Id: I7bdbf601984e32f0f4b3703e19f5f61fc476967d
1 parent 21ea0b6 commit ae0a15d

File tree

10 files changed

+152
-23
lines changed

10 files changed

+152
-23
lines changed

bigtable-client-core-parent/bigtable-hbase-integration-tests-common/src/test/java/com/google/cloud/bigtable/hbase/AbstractTestFilters.java

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import static com.google.cloud.bigtable.hbase.test_env.SharedTestEnvRule.COLUMN_FAMILY;
1919
import static com.google.cloud.bigtable.hbase.test_env.SharedTestEnvRule.COLUMN_FAMILY2;
2020
import static com.google.common.truth.Truth.assertThat;
21+
import static com.google.common.truth.Truth.assertWithMessage;
2122
import static org.junit.Assert.assertArrayEquals;
2223
import static org.junit.Assert.assertEquals;
2324
import static org.junit.Assert.assertNotNull;
@@ -32,6 +33,7 @@
3233
import java.util.List;
3334
import java.util.Set;
3435
import java.util.TreeSet;
36+
import java.util.UUID;
3537
import org.apache.commons.lang.RandomStringUtils;
3638
import org.apache.hadoop.hbase.Cell;
3739
import org.apache.hadoop.hbase.CellUtil;
@@ -2297,6 +2299,97 @@ protected final void assertMatchingRow(Result result, byte[] key) {
22972299
actualKey);
22982300
}
22992301

2302+
/**
2303+
* Bigtable & HBase have different semantics for regexes - HBase matches are unachored and allow
2304+
* for the regex to match anywhere in the string. While Bigtable requires a full string match.
2305+
*/
2306+
@Test
2307+
public void testPartialRegexMatches() throws Exception {
2308+
Table table = getDefaultTable();
2309+
String keyRoot = UUID.randomUUID().toString();
2310+
String key = "prefix-" + keyRoot + "-suffix";
2311+
String family = new String(COLUMN_FAMILY);
2312+
String familyRoot = family.substring(3, family.length() - 3);
2313+
String qualifierRoot = "my-qualifier";
2314+
String qualifier = "prefix-" + qualifierRoot + "-suffix";
2315+
String valueRoot = "my-value";
2316+
String value = "prefix-" + valueRoot + "-suffix";
2317+
2318+
table.put(
2319+
new Put(key.getBytes()).addColumn(COLUMN_FAMILY, qualifier.getBytes(), value.getBytes()));
2320+
2321+
// Partial regexes that match the root of the value with single char matchers to make sure it
2322+
// doesnt use a literal optimization
2323+
List<Filter> filters =
2324+
Arrays.asList(
2325+
new RowFilter(CompareOp.EQUAL, new RegexStringComparator("." + keyRoot + ".")),
2326+
new FamilyFilter(CompareOp.EQUAL, new RegexStringComparator("." + familyRoot + ".")),
2327+
new QualifierFilter(
2328+
CompareOp.EQUAL, new RegexStringComparator("." + qualifierRoot + ".")),
2329+
new ValueFilter(CompareOp.EQUAL, new RegexStringComparator("." + valueRoot + ".")));
2330+
2331+
for (Filter filter : filters) {
2332+
try (ResultScanner scanner =
2333+
table.getScanner(new Scan().withStartRow(key.getBytes()).setLimit(1).setFilter(filter))) {
2334+
assertWithMessage("Filter with partial regex: " + filter + " yielded no results")
2335+
.that(scanner)
2336+
.isNotEmpty();
2337+
}
2338+
}
2339+
2340+
// Make sure that explicitly anchored regexes still work
2341+
List<Filter> anchoredRegex =
2342+
Arrays.asList(
2343+
new RowFilter(CompareOp.EQUAL, new RegexStringComparator("^" + key)),
2344+
new RowFilter(CompareOp.EQUAL, new RegexStringComparator(key + "$")),
2345+
new RowFilter(CompareOp.EQUAL, new RegexStringComparator("^" + key + "$")),
2346+
new FamilyFilter(CompareOp.EQUAL, new RegexStringComparator("^" + family)),
2347+
new FamilyFilter(CompareOp.EQUAL, new RegexStringComparator(family + "$")),
2348+
new FamilyFilter(CompareOp.EQUAL, new RegexStringComparator("^" + family + "$")),
2349+
new QualifierFilter(CompareOp.EQUAL, new RegexStringComparator("^" + qualifier + "$")),
2350+
new QualifierFilter(CompareOp.EQUAL, new RegexStringComparator("^" + qualifier)),
2351+
new QualifierFilter(CompareOp.EQUAL, new RegexStringComparator("^" + qualifier)),
2352+
new ValueFilter(CompareOp.EQUAL, new RegexStringComparator("^" + value + "$")),
2353+
new ValueFilter(CompareOp.EQUAL, new RegexStringComparator("^" + value)),
2354+
new ValueFilter(CompareOp.EQUAL, new RegexStringComparator(value + "$")));
2355+
2356+
for (Filter filter : anchoredRegex) {
2357+
try (ResultScanner scanner =
2358+
table.getScanner(new Scan().withStartRow(key.getBytes()).setLimit(1).setFilter(filter))) {
2359+
assertWithMessage("Filter with full regex: " + filter + " yielded no results")
2360+
.that(scanner)
2361+
.isNotEmpty();
2362+
}
2363+
}
2364+
2365+
// Make sure that explicitly anchored partial regexes dont overmatch
2366+
List<Filter> anchoredPartialRegex =
2367+
Arrays.asList(
2368+
new RowFilter(CompareOp.EQUAL, new RegexStringComparator("^" + keyRoot + "$")),
2369+
new RowFilter(CompareOp.EQUAL, new RegexStringComparator("^" + keyRoot)),
2370+
new RowFilter(CompareOp.EQUAL, new RegexStringComparator(keyRoot + "$")),
2371+
new FamilyFilter(CompareOp.EQUAL, new RegexStringComparator("^" + familyRoot + " $")),
2372+
new FamilyFilter(CompareOp.EQUAL, new RegexStringComparator("^" + familyRoot)),
2373+
new FamilyFilter(CompareOp.EQUAL, new RegexStringComparator(familyRoot + " $")),
2374+
new QualifierFilter(
2375+
CompareOp.EQUAL, new RegexStringComparator("^" + qualifierRoot + "$")),
2376+
new QualifierFilter(CompareOp.EQUAL, new RegexStringComparator("^" + qualifierRoot)),
2377+
new QualifierFilter(CompareOp.EQUAL, new RegexStringComparator(qualifierRoot + "$")),
2378+
new ValueFilter(CompareOp.EQUAL, new RegexStringComparator("^" + valueRoot + "$")),
2379+
new ValueFilter(CompareOp.EQUAL, new RegexStringComparator("^" + valueRoot)),
2380+
new ValueFilter(CompareOp.EQUAL, new RegexStringComparator(valueRoot + "$")));
2381+
2382+
for (Filter filter : anchoredPartialRegex) {
2383+
try (ResultScanner scanner =
2384+
table.getScanner(new Scan().withStartRow(key.getBytes()).setLimit(1).setFilter(filter))) {
2385+
assertWithMessage(
2386+
"Filter with anchored partial regex: " + filter + " yielded unpexpected results")
2387+
.that(scanner)
2388+
.isEmpty();
2389+
}
2390+
}
2391+
}
2392+
23002393
private final String toFuzzyKeyString(byte[] bytes) {
23012394
StringBuilder sb = new StringBuilder("[");
23022395
String prefix = "";

bigtable-client-core-parent/bigtable-hbase/src/main/java/com/google/cloud/bigtable/hbase/adapters/filters/FamilyFilterAdapter.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,11 @@ public Filter adapt(FilterAdapterContext context, FamilyFilter filter) throws IO
5555
throw new IllegalStateException("Comparator cannot be null");
5656
} else if (comparator instanceof RegexStringComparator) {
5757
family = Bytes.toString(comparator.getValue());
58+
// HBase regex matching is unanchored, while Bigtable requires a full string match
59+
// To align the two, surround the user regex with wildcards
60+
if (!family.isEmpty()) {
61+
family = ".*" + family + ".*";
62+
}
5863
} else if (comparator instanceof BinaryComparator) {
5964
ByteString quotedRegularExpression =
6065
ReaderExpressionHelper.quoteRegularExpression(comparator.getValue());

bigtable-client-core-parent/bigtable-hbase/src/main/java/com/google/cloud/bigtable/hbase/adapters/filters/QualifierFilterAdapter.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@ private Filter adaptBinaryComparator(
113113
private static Filter adaptRegexStringComparator(
114114
CompareOp compareOp, RegexStringComparator comparator) {
115115
String pattern = FilterAdapterHelper.extractRegexPattern(comparator);
116+
// HBase regex matching is unanchored, while Bigtable requires a full string match
117+
// To align the two, surround the user regex with wildcards
118+
if (!pattern.isEmpty()) {
119+
pattern = "\\C*" + pattern + "\\C*";
120+
}
116121
switch (compareOp) {
117122
case EQUAL:
118123
return FILTERS.qualifier().regex(pattern);

bigtable-client-core-parent/bigtable-hbase/src/main/java/com/google/cloud/bigtable/hbase/adapters/filters/RowFilterAdapter.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,17 @@ public Filter adapt(FilterAdapterContext context, org.apache.hadoop.hbase.filter
5858
if (comparator == null) {
5959
throw new IllegalStateException("Comparator cannot be null");
6060
} else if (comparator instanceof RegexStringComparator) {
61-
regexValue = ByteString.copyFrom(comparator.getValue());
61+
// HBase regex matching is unanchored, while Bigtable requires a full string match
62+
// To align the two, surround the user regex with wildcards
63+
if (comparator.getValue().length > 0) {
64+
regexValue =
65+
ByteString.copyFromUtf8("\\C*")
66+
.concat(ByteString.copyFrom(comparator.getValue()))
67+
.concat(ByteString.copyFromUtf8("\\C*"));
68+
} else {
69+
regexValue = ByteString.EMPTY;
70+
}
71+
6272
} else if (comparator instanceof BinaryComparator) {
6373
regexValue = ReaderExpressionHelper.quoteRegularExpression(comparator.getValue());
6474
} else {

bigtable-client-core-parent/bigtable-hbase/src/main/java/com/google/cloud/bigtable/hbase/adapters/filters/ValueFilterAdapter.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,11 @@ private Filter adaptRegexStringComparator(CompareOp compareOp, RegexStringCompar
116116
String pattern = FilterAdapterHelper.extractRegexPattern(comparator);
117117
switch (compareOp) {
118118
case EQUAL:
119+
// HBase regex matching is unanchored, while Bigtable requires a full string match
120+
// To align the two, surround the user regex with wildcards
121+
if (!pattern.isEmpty()) {
122+
pattern = "\\C*" + pattern + "\\C*";
123+
}
119124
return FILTERS.value().regex(pattern);
120125
// No-ops are always filtered out.
121126
// See:

bigtable-client-core-parent/bigtable-hbase/src/test/java/com/google/cloud/bigtable/hbase/adapters/filters/TestFamilyFilterAdapter.java

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
*/
1616
package com.google.cloud.bigtable.hbase.adapters.filters;
1717

18+
import static com.google.common.truth.Truth.assertThat;
19+
1820
import com.google.bigtable.v2.RowFilter;
1921
import com.google.cloud.bigtable.hbase.adapters.read.ReaderExpressionHelper;
2022
import com.google.protobuf.ByteString;
@@ -45,13 +47,16 @@ public void before() {
4547

4648
@Test
4749
public void testAdapt_RegexAndEquals() throws IOException {
48-
String regexp = "^.*hello world.*$";
50+
String regexp = "^hello world[?!]$";
4951
RegexStringComparator comparator = new RegexStringComparator(regexp);
5052
org.apache.hadoop.hbase.filter.RowFilter filter =
5153
new org.apache.hadoop.hbase.filter.RowFilter(CompareFilter.CompareOp.EQUAL, comparator);
52-
Assert.assertEquals(
53-
RowFilter.newBuilder().setRowKeyRegexFilter(ByteString.copyFrom(regexp.getBytes())).build(),
54-
adapter.adapt(context, filter).toProto());
54+
assertThat(adapter.adapt(context, filter).toProto())
55+
.isEqualTo(
56+
RowFilter.newBuilder()
57+
// user regex needs to be wrapped in wildcards to
58+
.setRowKeyRegexFilter(ByteString.copyFrom(("\\C*" + regexp + "\\C*").getBytes()))
59+
.build());
5560
}
5661

5762
@Test
@@ -74,9 +79,11 @@ public void testAdapt_EmptyRegex() throws IOException {
7479
RegexStringComparator comparator = new RegexStringComparator(regexp);
7580
org.apache.hadoop.hbase.filter.RowFilter filter =
7681
new org.apache.hadoop.hbase.filter.RowFilter(CompareFilter.CompareOp.EQUAL, comparator);
77-
Assert.assertEquals(
78-
RowFilter.newBuilder().setRowKeyRegexFilter(ByteString.copyFrom(regexp.getBytes())).build(),
79-
adapter.adapt(context, filter).toProto());
82+
assertThat(adapter.adapt(context, filter).toProto())
83+
.isEqualTo(
84+
RowFilter.newBuilder()
85+
.setRowKeyRegexFilter(ByteString.copyFrom(regexp.getBytes()))
86+
.build());
8087
}
8188

8289
@Test

bigtable-client-core-parent/bigtable-hbase/src/test/java/com/google/cloud/bigtable/hbase/adapters/filters/TestQualifierFilterAdapter.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import static com.google.cloud.bigtable.data.v2.models.Filters.FILTERS;
1919

2020
import com.google.cloud.bigtable.data.v2.models.Filters;
21+
import com.google.common.truth.Truth;
2122
import com.google.protobuf.ByteString;
2223
import java.io.IOException;
2324
import org.apache.hadoop.hbase.client.Scan;
@@ -28,7 +29,6 @@
2829
import org.apache.hadoop.hbase.filter.QualifierFilter;
2930
import org.apache.hadoop.hbase.filter.RegexStringComparator;
3031
import org.apache.hadoop.hbase.util.Bytes;
31-
import org.junit.Assert;
3232
import org.junit.Test;
3333
import org.junit.runner.RunWith;
3434
import org.junit.runners.JUnit4;
@@ -51,7 +51,7 @@ private void assertAdaptedForm(
5151
throws IOException {
5252
QualifierFilter filter = new QualifierFilter(op, comparable);
5353
Filters.Filter actualFilter = adapter.adapt(scanWithOnFamilyScanContext, filter);
54-
Assert.assertEquals(expectedFilter.toProto(), actualFilter.toProto());
54+
Truth.assertThat(actualFilter.toProto()).isEqualTo(expectedFilter.toProto());
5555
}
5656

5757
@Test
@@ -112,6 +112,6 @@ public void testRegexQualifierFilter() throws IOException {
112112
assertAdaptedForm(
113113
new RegexStringComparator(pattern),
114114
CompareOp.EQUAL,
115-
FILTERS.qualifier().regex(ByteString.copyFromUtf8(pattern)));
115+
FILTERS.qualifier().regex(ByteString.copyFromUtf8("\\C*" + pattern + "\\C*")));
116116
}
117117
}

bigtable-client-core-parent/bigtable-hbase/src/test/java/com/google/cloud/bigtable/hbase/adapters/filters/TestRowFilterAdapter.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
package com.google.cloud.bigtable.hbase.adapters.filters;
1717

1818
import static com.google.cloud.bigtable.data.v2.models.Filters.FILTERS;
19+
import static com.google.common.truth.Truth.assertThat;
1920

2021
import com.google.cloud.bigtable.hbase.adapters.read.ReaderExpressionHelper;
2122
import com.google.protobuf.ByteString;
@@ -50,8 +51,8 @@ public void testAdapt_RegexAndEquals() throws IOException {
5051
RegexStringComparator comparator = new RegexStringComparator(regexp);
5152
org.apache.hadoop.hbase.filter.RowFilter filter =
5253
new org.apache.hadoop.hbase.filter.RowFilter(CompareFilter.CompareOp.EQUAL, comparator);
53-
Assert.assertEquals(
54-
FILTERS.key().regex(regexp).toProto(), adapter.adapt(context, filter).toProto());
54+
assertThat(adapter.adapt(context, filter).toProto())
55+
.isEqualTo(FILTERS.key().regex("\\C*" + regexp + "\\C*").toProto());
5556
}
5657

5758
@Test

bigtable-client-core-parent/bigtable-hbase/src/test/java/com/google/cloud/bigtable/hbase/adapters/filters/TestValueFilterAdapter.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
package com.google.cloud.bigtable.hbase.adapters.filters;
1717

1818
import static com.google.cloud.bigtable.data.v2.models.Filters.FILTERS;
19+
import static com.google.common.truth.Truth.assertThat;
1920

2021
import com.google.cloud.bigtable.data.v2.models.Filters;
2122
import com.google.cloud.bigtable.hbase.adapters.read.ReaderExpressionHelper;
@@ -29,7 +30,6 @@
2930
import org.apache.hadoop.hbase.filter.RegexStringComparator;
3031
import org.apache.hadoop.hbase.filter.ValueFilter;
3132
import org.apache.hadoop.hbase.util.Bytes;
32-
import org.junit.Assert;
3333
import org.junit.Test;
3434
import org.junit.runner.RunWith;
3535
import org.junit.runners.JUnit4;
@@ -110,14 +110,16 @@ public void testNotEqualValueFilter() throws IOException {
110110
public void testRegexValueFilter() throws IOException {
111111
String pattern = "Foo\\d+";
112112
assertAdaptedForm(
113-
new RegexStringComparator(pattern), CompareOp.EQUAL, FILTERS.value().regex(pattern));
113+
new RegexStringComparator(pattern),
114+
CompareOp.EQUAL,
115+
FILTERS.value().regex("\\C*" + pattern + "\\C*"));
114116
}
115117

116118
private void assertAdaptedForm(
117119
ByteArrayComparable comparable, CompareFilter.CompareOp op, Filters.Filter expectedFilter)
118120
throws IOException {
119121
ValueFilter filter = new ValueFilter(op, comparable);
120122
Filters.Filter actualFilter = adapter.adapt(emptyScanContext, filter);
121-
Assert.assertEquals(expectedFilter.toProto(), actualFilter.toProto());
123+
assertThat(actualFilter.toProto()).isEqualTo(expectedFilter.toProto());
122124
}
123125
}

bigtable-client-core-parent/bigtable-hbase/src/test/java/com/google/cloud/bigtable/hbase/adapters/read/TestGetAdapter.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
package com.google.cloud.bigtable.hbase.adapters.read;
1717

1818
import static com.google.cloud.bigtable.data.v2.models.Filters.FILTERS;
19+
import static com.google.common.truth.Truth.assertThat;
1920

2021
import com.google.cloud.bigtable.data.v2.internal.RequestContext;
2122
import com.google.cloud.bigtable.data.v2.models.Filters;
@@ -182,12 +183,12 @@ public void testBuildFilter() throws IOException {
182183

183184
Filters.Filter actualFilter = getAdapter.buildFilter(get);
184185

185-
Assert.assertEquals(
186-
FILTERS
187-
.chain()
188-
.filter(FILTERS.limit().cellsPerColumn(5))
189-
.filter(FILTERS.family().regex("a"))
190-
.toProto(),
191-
actualFilter.toProto());
186+
assertThat(actualFilter.toProto())
187+
.isEqualTo(
188+
FILTERS
189+
.chain()
190+
.filter(FILTERS.limit().cellsPerColumn(5))
191+
.filter(FILTERS.family().regex(".*a.*"))
192+
.toProto());
192193
}
193194
}

0 commit comments

Comments
 (0)