fix: Errors during shard-publishing should not kill the instance
[demos/kafka/chat] / src / main / java / de / juplo / kafka / chat / backend / implementation / kafka / DataChannel.java
1 package de.juplo.kafka.chat.backend.implementation.kafka;
2
3 import de.juplo.kafka.chat.backend.domain.*;
4 import de.juplo.kafka.chat.backend.domain.exceptions.LoadInProgressException;
5 import de.juplo.kafka.chat.backend.domain.exceptions.ShardNotOwnedException;
6 import de.juplo.kafka.chat.backend.implementation.kafka.messages.AbstractMessageTo;
7 import de.juplo.kafka.chat.backend.implementation.kafka.messages.data.EventChatMessageReceivedTo;
8 import lombok.Getter;
9 import lombok.extern.slf4j.Slf4j;
10 import org.apache.kafka.clients.consumer.*;
11 import org.apache.kafka.clients.producer.Producer;
12 import org.apache.kafka.clients.producer.ProducerRecord;
13 import org.apache.kafka.common.TopicPartition;
14 import org.apache.kafka.common.errors.WakeupException;
15 import reactor.core.publisher.Mono;
16
17 import java.time.*;
18 import java.util.*;
19 import java.util.stream.IntStream;
20
21
22 @Slf4j
23 public class DataChannel implements Runnable, ConsumerRebalanceListener
24 {
25   private final String instanceId;
26   private final String topic;
27   private final Producer<String, AbstractMessageTo> producer;
28   private final Consumer<String, AbstractMessageTo> consumer;
29   private final ZoneId zoneId;
30   private final int numShards;
31   private final Duration pollingInterval;
32   private final int bufferSize;
33   private final Clock clock;
34   private final boolean[] isShardOwned;
35   private final long[] currentOffset;
36   private final long[] nextOffset;
37   private final Map<UUID, ChatRoomData>[] chatRoomData;
38   private final InfoChannel infoChannel;
39   private final ShardingPublisherStrategy shardingPublisherStrategy;
40
41   private boolean running;
42   @Getter
43   private volatile boolean loadInProgress;
44
45
46   public DataChannel(
47     String instanceId,
48     String topic,
49     Producer<String, AbstractMessageTo> producer,
50     Consumer<String, AbstractMessageTo> dataChannelConsumer,
51     ZoneId zoneId,
52     int numShards,
53     Duration pollingInterval,
54     int bufferSize,
55     Clock clock,
56     InfoChannel infoChannel,
57     ShardingPublisherStrategy shardingPublisherStrategy)
58   {
59     log.debug(
60         "{}: Creating DataChannel for topic {} with {} partitions",
61         instanceId,
62         topic,
63         numShards);
64     this.instanceId = instanceId;
65     this.topic = topic;
66     this.consumer = dataChannelConsumer;
67     this.producer = producer;
68     this.zoneId = zoneId;
69     this.numShards = numShards;
70     this.pollingInterval = pollingInterval;
71     this.bufferSize = bufferSize;
72     this.clock = clock;
73     this.isShardOwned = new boolean[numShards];
74     this.currentOffset = new long[numShards];
75     this.nextOffset = new long[numShards];
76     this.chatRoomData = new Map[numShards];
77     IntStream
78         .range(0, numShards)
79         .forEach(shard -> this.chatRoomData[shard] = new HashMap<>());
80     this.infoChannel = infoChannel;
81     this.shardingPublisherStrategy = shardingPublisherStrategy;
82   }
83
84
85
86   Mono<Message> sendChatMessage(
87       UUID chatRoomId,
88       Message.MessageKey key,
89       LocalDateTime timestamp,
90       String text)
91   {
92     ZonedDateTime zdt = ZonedDateTime.of(timestamp, zoneId);
93     return Mono.create(sink ->
94     {
95       ProducerRecord<String, AbstractMessageTo> record =
96           new ProducerRecord<>(
97               topic,
98               null,
99               zdt.toEpochSecond(),
100               chatRoomId.toString(),
101               EventChatMessageReceivedTo.of(key.getUsername(), key.getMessageId(), text));
102
103       producer.send(record, ((metadata, exception) ->
104       {
105         if (exception == null)
106         {
107           // On successful send
108           Message message = new Message(key, metadata.offset(), timestamp, text);
109           log.info("Successfully send message {}", message);
110           sink.success(message);
111         }
112         else
113         {
114           // On send-failure
115           log.error(
116               "Could not send message for chat-room={}, key={}, timestamp={}, text={}: {}",
117               chatRoomId,
118               key,
119               timestamp,
120               text,
121               exception);
122           sink.error(exception);
123         }
124       }));
125     });
126   }
127
128   @Override
129   public void onPartitionsAssigned(Collection<TopicPartition> partitions)
130   {
131     log.info("Newly assigned partitions! Pausing normal operations...");
132     loadInProgress = true;
133
134     consumer.endOffsets(partitions).forEach((topicPartition, currentOffset) ->
135     {
136       int partition = topicPartition.partition();
137       isShardOwned[partition] =  true;
138       this.currentOffset[partition] = currentOffset;
139
140       log.info(
141           "Partition assigned: {} - loading messages: next={} -> current={}",
142           partition,
143           nextOffset[partition],
144           currentOffset);
145
146       consumer.seek(topicPartition, nextOffset[partition]);
147       infoChannel.sendShardAssignedEvent(partition);
148       shardingPublisherStrategy
149           .publishOwnership(partition)
150           .doOnSuccess(instanceId -> log.info(
151               "Successfully published instance {} as owner of shard {}",
152               instanceId,
153               partition))
154           .doOnError(throwable -> log.error(
155               "Could not publish instance {} as owner of shard {}: {}",
156               instanceId,
157               partition,
158               throwable.toString()))
159           .onErrorComplete()
160           .block();
161     });
162
163     consumer.resume(partitions);
164   }
165
166   @Override
167   public void onPartitionsRevoked(Collection<TopicPartition> partitions)
168   {
169     partitions.forEach(topicPartition ->
170     {
171       int partition = topicPartition.partition();
172       isShardOwned[partition] = false;
173       nextOffset[partition] = consumer.position(topicPartition);
174       log.info("Partition revoked: {} - next={}", partition, nextOffset[partition]);
175       infoChannel.sendShardRevokedEvent(partition);
176     });
177   }
178
179   @Override
180   public void onPartitionsLost(Collection<TopicPartition> partitions)
181   {
182     log.warn("Lost partitions: {}, partitions");
183     // TODO: Muss auf den Verlust anders reagiert werden?
184     onPartitionsRevoked(partitions);
185   }
186
187   @Override
188   public void run()
189   {
190     running = true;
191
192     while (running)
193     {
194       try
195       {
196         ConsumerRecords<String, AbstractMessageTo> records = consumer.poll(pollingInterval);
197         log.info("Fetched {} messages", records.count());
198
199         if (loadInProgress)
200         {
201           loadChatRoomData(records);
202
203           if (isLoadingCompleted())
204           {
205             log.info("Loading of messages completed! Pausing all owned partitions...");
206             pauseAllOwnedPartions();
207             log.info("Resuming normal operations...");
208             loadInProgress = false;
209           }
210         }
211         else
212         {
213           if (!records.isEmpty())
214           {
215             throw new IllegalStateException("All owned partitions should be paused, when no load is in progress!");
216           }
217         }
218       }
219       catch (WakeupException e)
220       {
221         log.info("Received WakeupException, exiting!");
222         running = false;
223       }
224     }
225
226     log.info("Exiting normally");
227   }
228
229   private void loadChatRoomData(ConsumerRecords<String, AbstractMessageTo> records)
230   {
231     for (ConsumerRecord<String, AbstractMessageTo> record : records)
232     {
233       UUID chatRoomId = UUID.fromString(record.key());
234
235       switch (record.value().getType())
236       {
237         case EVENT_CHATMESSAGE_RECEIVED:
238           Instant instant = Instant.ofEpochSecond(record.timestamp());
239           LocalDateTime timestamp = LocalDateTime.ofInstant(instant, zoneId);
240           loadChatMessage(
241               chatRoomId,
242               timestamp,
243               record.offset(),
244               (EventChatMessageReceivedTo) record.value(),
245               record.partition());
246           break;
247
248         default:
249           log.debug(
250               "Ignoring message for chat-room {} with offset {}: {}",
251               chatRoomId,
252               record.offset(),
253               record.value());
254       }
255
256       nextOffset[record.partition()] = record.offset() + 1;
257     }
258   }
259
260   private void loadChatMessage(
261       UUID chatRoomId,
262       LocalDateTime timestamp,
263       long offset,
264       EventChatMessageReceivedTo chatMessageTo,
265       int partition)
266   {
267     Message.MessageKey key = Message.MessageKey.of(chatMessageTo.getUser(), chatMessageTo.getId());
268     Message message = new Message(key, offset, timestamp, chatMessageTo.getText());
269
270     ChatRoomData chatRoomData = this
271         .chatRoomData[partition]
272         .computeIfAbsent(chatRoomId, this::computeChatRoomData);
273     KafkaChatMessageService kafkaChatRoomService =
274         (KafkaChatMessageService) chatRoomData.getChatRoomService();
275
276     log.debug(
277         "Loaded message from partition={} at offset={}: {}",
278         partition,
279         offset,
280         message);
281     kafkaChatRoomService.persistMessage(message);
282   }
283
284   private boolean isLoadingCompleted()
285   {
286     return IntStream
287         .range(0, numShards)
288         .filter(shard -> isShardOwned[shard])
289         .allMatch(shard ->
290         {
291           TopicPartition partition = new TopicPartition(topic, shard);
292           long position = consumer.position(partition);
293           return position >= currentOffset[shard];
294         });
295   }
296
297   private void pauseAllOwnedPartions()
298   {
299     consumer.pause(IntStream
300         .range(0, numShards)
301         .filter(shard -> isShardOwned[shard])
302         .mapToObj(shard -> new TopicPartition(topic, shard))
303         .toList());
304   }
305
306
307   int[] getOwnedShards()
308   {
309     return IntStream
310         .range(0, numShards)
311         .filter(shard -> isShardOwned[shard])
312         .toArray();
313   }
314
315   Mono<ChatRoomData> getChatRoomData(int shard, UUID id)
316   {
317     if (loadInProgress)
318     {
319       return Mono.error(new LoadInProgressException());
320     }
321
322     if (!isShardOwned[shard])
323     {
324       return Mono.error(new ShardNotOwnedException(instanceId, shard));
325     }
326
327     return infoChannel
328         .getChatRoomInfo(id)
329         .map(chatRoomInfo ->
330             chatRoomData[shard].computeIfAbsent(id, this::computeChatRoomData));
331   }
332
333   private ChatRoomData computeChatRoomData(UUID chatRoomId)
334   {
335     log.info("Creating ChatRoom {} with buffer-size {}", chatRoomId, bufferSize);
336     KafkaChatMessageService service = new KafkaChatMessageService(this, chatRoomId);
337     return new ChatRoomData(clock, service, bufferSize);
338   }
339
340   ConsumerGroupMetadata getConsumerGroupMetadata()
341   {
342     return consumer.groupMetadata();
343   }
344 }