202 lines
6.4 KiB
Java
202 lines
6.4 KiB
Java
package com.twitter.search.common.relevance.features;
|
|
|
|
import java.nio.ByteBuffer;
|
|
import java.util.Arrays;
|
|
|
|
import com.google.common.base.Preconditions;
|
|
|
|
/**
|
|
* A TweetIntegerShingleSignature object consists of 4 bytes, each representing the signature of
|
|
* a status text sample. The signature bytes are sorted in ascending order and compacted to an
|
|
* integer in big endian for serialization.
|
|
*
|
|
* Fuzzy matching of two TweetIntegerShingleSignature objects is met when the number of matching
|
|
* bytes between the two is equal to or above 3.
|
|
*/
|
|
public class TweetIntegerShingleSignature {
|
|
public static final int NUM_SHINGLES = Integer.SIZE / Byte.SIZE;
|
|
public static final int DEFAULT_NO_SIGNATURE = 0;
|
|
public static final TweetIntegerShingleSignature NO_SIGNATURE_HANDLE =
|
|
deserialize(DEFAULT_NO_SIGNATURE);
|
|
public static final int DEFAULT_MIN_SHINGLES_MATCH = 3;
|
|
private final int minShinglesMatch;
|
|
|
|
private final byte[] shingles;
|
|
private final int signature; // redundant information, for easier comparison.
|
|
|
|
/**
|
|
* Construct from a byte array.
|
|
*/
|
|
public TweetIntegerShingleSignature(byte[] shingles, int minShinglesMatch) {
|
|
Preconditions.checkArgument(shingles.length == NUM_SHINGLES);
|
|
this.shingles = shingles;
|
|
// sort to byte's natural ascending order
|
|
Arrays.sort(this.shingles);
|
|
this.minShinglesMatch = minShinglesMatch;
|
|
this.signature = serializeInternal(shingles);
|
|
}
|
|
|
|
/**
|
|
* Construct from a byte array.
|
|
*/
|
|
public TweetIntegerShingleSignature(byte[] shingles) {
|
|
this(shingles, DEFAULT_MIN_SHINGLES_MATCH);
|
|
}
|
|
|
|
/**
|
|
* Construct from a serialized integer signature.
|
|
*/
|
|
public TweetIntegerShingleSignature(int signature, int minShinglesMatch) {
|
|
this.shingles = deserializeInternal(signature);
|
|
// sort to byte's natural ascending order
|
|
Arrays.sort(this.shingles);
|
|
this.minShinglesMatch = minShinglesMatch;
|
|
// now store the sorted shingles into signature field, may be different from what passed in.
|
|
this.signature = serializeInternal(shingles);
|
|
}
|
|
|
|
/**
|
|
* Construct from a serialized integer signature.
|
|
*/
|
|
public TweetIntegerShingleSignature(int signature) {
|
|
this(signature, DEFAULT_MIN_SHINGLES_MATCH);
|
|
}
|
|
|
|
/**
|
|
* Used by ingester to generate signature.
|
|
* Raw signatures are in byte arrays per sample, and can be more or less
|
|
* than what is asked for.
|
|
*
|
|
* @param rawSignature
|
|
*/
|
|
public TweetIntegerShingleSignature(Iterable<byte[]> rawSignature) {
|
|
byte[] condensedSignature = new byte[NUM_SHINGLES];
|
|
int i = 0;
|
|
for (byte[] signatureItem : rawSignature) {
|
|
condensedSignature[i++] = signatureItem[0];
|
|
if (i == NUM_SHINGLES) {
|
|
break;
|
|
}
|
|
}
|
|
this.shingles = condensedSignature;
|
|
Arrays.sort(this.shingles);
|
|
this.minShinglesMatch = DEFAULT_MIN_SHINGLES_MATCH;
|
|
this.signature = serializeInternal(shingles);
|
|
}
|
|
|
|
/**
|
|
* When used in a hashtable for dup detection, take the first byte of each signature for fast
|
|
* pass for majority case of no fuzzy matching. For top queries, this optimization losses about
|
|
* only 4% of all fuzzy matches.
|
|
*
|
|
* @return most significant byte of this signature as its hashcode.
|
|
*/
|
|
@Override
|
|
public int hashCode() {
|
|
return shingles[0] & 0xFF;
|
|
}
|
|
|
|
/**
|
|
* Perform fuzzy matching between two TweetIntegerShingleSignature objects.
|
|
*
|
|
* @param other TweetIntegerShingleSignature object to perform fuzzy match against
|
|
* @return true if at least minMatch number of bytes match
|
|
*/
|
|
@Override
|
|
public boolean equals(Object other) {
|
|
if (this == other) {
|
|
return true;
|
|
}
|
|
if (other == null) {
|
|
return false;
|
|
}
|
|
if (getClass() != other.getClass()) {
|
|
return false;
|
|
}
|
|
|
|
final TweetIntegerShingleSignature otherSignatureInteger = (TweetIntegerShingleSignature) other;
|
|
|
|
int otherSignature = otherSignatureInteger.serialize();
|
|
if (signature == otherSignature) {
|
|
// Both serialized signature is the same
|
|
return true;
|
|
} else if (signature != DEFAULT_NO_SIGNATURE && otherSignature != DEFAULT_NO_SIGNATURE) {
|
|
// Neither is NO_SIGNATURE, need to compare shingles.
|
|
byte[] otherShingles = otherSignatureInteger.getShingles();
|
|
int numberMatchesNeeded = minShinglesMatch;
|
|
// expect bytes are in ascending sorted order
|
|
int i = 0;
|
|
int j = 0;
|
|
while (((numberMatchesNeeded <= (NUM_SHINGLES - i)) // early termination for i
|
|
|| (numberMatchesNeeded <= (NUM_SHINGLES - j))) // early termination j
|
|
&& (i < NUM_SHINGLES) && (j < NUM_SHINGLES)) {
|
|
if (shingles[i] == otherShingles[j]) {
|
|
if (shingles[i] != 0) { // we only consider two shingles equal if they are non zero
|
|
numberMatchesNeeded--;
|
|
if (numberMatchesNeeded == 0) {
|
|
return true;
|
|
}
|
|
}
|
|
i++;
|
|
j++;
|
|
} else if (shingles[i] < otherShingles[j]) {
|
|
i++;
|
|
} else if (shingles[i] > otherShingles[j]) {
|
|
j++;
|
|
}
|
|
}
|
|
}
|
|
// One is NO_SIGNATURE and one is not.
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Returns the sorted array of signature bytes.
|
|
*/
|
|
public byte[] getShingles() {
|
|
return shingles;
|
|
}
|
|
|
|
/**
|
|
* Serialize 4 sorted signature bytes into an integer in big endian order.
|
|
*
|
|
* @return compacted int signature
|
|
*/
|
|
private static int serializeInternal(byte[] shingles) {
|
|
ByteBuffer byteBuffer = ByteBuffer.allocate(NUM_SHINGLES);
|
|
byteBuffer.put(shingles, 0, NUM_SHINGLES);
|
|
return byteBuffer.getInt(0);
|
|
}
|
|
|
|
/**
|
|
* Deserialize an integer into a 4-byte array.
|
|
* @param signature The signature integer.
|
|
* @return A byte array with 4 elements.
|
|
*/
|
|
private static byte[] deserializeInternal(int signature) {
|
|
return ByteBuffer.allocate(NUM_SHINGLES).putInt(signature).array();
|
|
}
|
|
|
|
public int serialize() {
|
|
return signature;
|
|
}
|
|
|
|
public static boolean isFuzzyMatch(int signature1, int signature2) {
|
|
return TweetIntegerShingleSignature.deserialize(signature1).equals(
|
|
TweetIntegerShingleSignature.deserialize(signature2));
|
|
}
|
|
|
|
public static TweetIntegerShingleSignature deserialize(int signature) {
|
|
return new TweetIntegerShingleSignature(signature);
|
|
}
|
|
|
|
public static TweetIntegerShingleSignature deserialize(int signature, int minMatchSingles) {
|
|
return new TweetIntegerShingleSignature(signature, minMatchSingles);
|
|
}
|
|
|
|
@Override
|
|
public String toString() {
|
|
return String.format("%d %d %d %d", shingles[0], shingles[1], shingles[2], shingles[3]);
|
|
}
|
|
}
|