the-algorithm/src/java/com/twitter/search/earlybird/common/userupdates/UserScrubGeoMap.java
twitter-team ef4c5eb65e Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
2023-03-31 17:36:31 -05:00

101 lines
3.9 KiB
Java

package com.twitter.search.earlybird.common.userupdates;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchCustomGauge;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent;
/**
* Map of users who have actioned to delete location data from their tweets. UserID's are mapped
* to the maxTweetId that will eventually be scrubbed from the index (userId -> maxTweetId).
*
* ConcurrentHashMap is thread safe without synchronizing the whole map. Reads can happen very fast
* while writes are done with a lock. This is ideal since many Earlybird Searcher threads could
* be reading from the map at once, whereas we will only be adding to the map via kafka.
*
* This map is checked against to filter out tweets that should not be returned to geo queries.
* See: go/realtime-geo-filtering
*/
public class UserScrubGeoMap {
// The number of geo events that contain a user ID already present in the map. This count is used
// to verify the number of users in the map against the number of events consumed from kafka.
private static final SearchCounter USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT =
SearchCounter.export("user_scrub_geo_event_existing_user_count");
public static final SearchTimerStats USER_SCRUB_GEO_EVENT_LAG_STAT =
SearchTimerStats.export("user_scrub_geo_event_lag",
TimeUnit.MILLISECONDS,
false,
true);
private ConcurrentHashMap<Long, Long> map;
public UserScrubGeoMap() {
map = new ConcurrentHashMap<>();
SearchCustomGauge.export("num_users_in_geo_map", this::getNumUsersInMap);
}
/**
* Ensure that the max_tweet_id in the userScrubGeoEvent is greater than the one already stored
* in the map for the given user id (if any) before updating the entry for this user.
* This will protect Earlybirds from potential issues where out of date UserScrubGeoEvents
* appear in the incoming Kafka stream.
*
* @param userScrubGeoEvent
*/
public void indexUserScrubGeoEvent(UserScrubGeoEvent userScrubGeoEvent) {
long userId = userScrubGeoEvent.getUser_id();
long newMaxTweetId = userScrubGeoEvent.getMax_tweet_id();
long oldMaxTweetId = map.getOrDefault(userId, 0L);
if (map.containsKey(userId)) {
USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT.increment();
}
map.put(userId, Math.max(oldMaxTweetId, newMaxTweetId));
USER_SCRUB_GEO_EVENT_LAG_STAT.timerIncrement(computeEventLag(newMaxTweetId));
}
/**
* A tweet is geo scrubbed if it is older than the max tweet id that is scrubbed for the tweet's
* author.
* If there is no entry for the tweet's author in the map, then the tweet is not geo scrubbed.
*
* @param tweetId
* @param fromUserId
* @return
*/
public boolean isTweetGeoScrubbed(long tweetId, long fromUserId) {
return tweetId <= map.getOrDefault(fromUserId, 0L);
}
/**
* The lag (in milliseconds) from when a UserScrubGeoEvent is created, until it is applied to the
* UserScrubGeoMap. Take the maxTweetId found in the current event and convert it to a timestamp.
* The maxTweetId will give us a timestamp closest to when Tweetypie processes macaw-geo requests.
*
* @param maxTweetId
* @return
*/
private long computeEventLag(long maxTweetId) {
long eventCreatedAtTime = SnowflakeIdParser.getTimestampFromTweetId(maxTweetId);
return System.currentTimeMillis() - eventCreatedAtTime;
}
public long getNumUsersInMap() {
return map.size();
}
public ConcurrentHashMap<Long, Long> getMap() {
return map;
}
public boolean isEmpty() {
return map.isEmpty();
}
public boolean isSet(long userId) {
return map.containsKey(userId);
}
}