mirror of
https://github.com/element-hq/dendrite.git
synced 2025-09-13 12:52:24 +03:00
Adjust roomserver locks, don't unsubscribe if new event is inflight (#3588)
This should fix #3484 -- at the very least, it has resolved the issues we've had on our instance. I've extended the lock so it surrounds the unsubscribe as well as a new check if the latest sequential ID seen by the ephemeral thread is newer than the sequential ID seen by the durable thread. This solves a race where the unsubscribe happened while a new message was inflight, and so the message was not handled. This is a fix for a race condition that has been pretty unreliable to reproduce manually, so I don't know if there's a good way to add a reliable automated test for it. If you have any ideas I'm open to it. ### Pull Request Checklist <!-- Please read https://matrix-org.github.io/dendrite/development/contributing before submitting your pull request --> * [x] I have added Go unit tests or [Complement integration tests](https://github.com/matrix-org/complement) for this PR _or_ I have justified why this PR doesn't need tests * [x] Pull request includes a [sign off below](https://element-hq.github.io/dendrite/development/contributing#sign-off) _or_ I have already signed off privately Signed-off-by: `Vivianne Langdon <puttabutta@gmail.com>` --------- Signed-off-by: Vivianne Langdon <puttabutta@gmail.com> Co-authored-by: Kegan Dougal <7190048+kegsay@users.noreply.github.com>
This commit is contained in:
parent
df748c5eae
commit
f24688af11
1 changed files with 28 additions and 6 deletions
|
@ -101,9 +101,12 @@ type worker struct {
|
||||||
roomID string
|
roomID string
|
||||||
subscription *nats.Subscription
|
subscription *nats.Subscription
|
||||||
sentryHub *sentry.Hub
|
sentryHub *sentry.Hub
|
||||||
|
ephemeralSeq uint64
|
||||||
|
// last seq we fully processed
|
||||||
|
durableSeq uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *Inputer) startWorkerForRoom(roomID string) {
|
func (r *Inputer) startWorkerForRoom(roomID string, seq uint64) {
|
||||||
v, loaded := r.workers.LoadOrStore(roomID, &worker{
|
v, loaded := r.workers.LoadOrStore(roomID, &worker{
|
||||||
r: r,
|
r: r,
|
||||||
roomID: roomID,
|
roomID: roomID,
|
||||||
|
@ -112,6 +115,9 @@ func (r *Inputer) startWorkerForRoom(roomID string) {
|
||||||
w := v.(*worker)
|
w := v.(*worker)
|
||||||
w.Lock()
|
w.Lock()
|
||||||
defer w.Unlock()
|
defer w.Unlock()
|
||||||
|
|
||||||
|
w.ephemeralSeq = seq
|
||||||
|
|
||||||
if !loaded || w.subscription == nil {
|
if !loaded || w.subscription == nil {
|
||||||
streamName := r.Cfg.Matrix.JetStream.Prefixed(jetstream.InputRoomEvent)
|
streamName := r.Cfg.Matrix.JetStream.Prefixed(jetstream.InputRoomEvent)
|
||||||
consumer := r.Cfg.Matrix.JetStream.Prefixed("RoomInput" + jetstream.Tokenise(w.roomID))
|
consumer := r.Cfg.Matrix.JetStream.Prefixed("RoomInput" + jetstream.Tokenise(w.roomID))
|
||||||
|
@ -226,7 +232,8 @@ func (r *Inputer) Start() error {
|
||||||
"", // This is blank because we specified it in BindStream.
|
"", // This is blank because we specified it in BindStream.
|
||||||
func(m *nats.Msg) {
|
func(m *nats.Msg) {
|
||||||
roomID := m.Header.Get(jetstream.RoomID)
|
roomID := m.Header.Get(jetstream.RoomID)
|
||||||
r.startWorkerForRoom(roomID)
|
meta, _ := m.Metadata()
|
||||||
|
r.startWorkerForRoom(roomID, meta.Sequence.Stream)
|
||||||
_ = m.Ack()
|
_ = m.Ack()
|
||||||
},
|
},
|
||||||
nats.HeadersOnly(),
|
nats.HeadersOnly(),
|
||||||
|
@ -278,7 +285,19 @@ func (w *worker) _next() {
|
||||||
if len(msgs) != 1 {
|
if len(msgs) != 1 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
case context.DeadlineExceeded, context.Canceled:
|
||||||
|
// The context exceeded, so we've been waiting for more than a
|
||||||
|
// minute for activity in this room. At this point we will shut
|
||||||
|
// down the subscriber to free up resources. It'll get started
|
||||||
|
// again if new activity happens.
|
||||||
|
w.Lock()
|
||||||
|
defer w.Unlock()
|
||||||
|
// inside the lock, let's check if the ephemeral consumer saw something new!
|
||||||
|
// If so, we do have new messages after all, they just came at a bad time.
|
||||||
|
if w.ephemeralSeq > w.durableSeq {
|
||||||
|
w.Act(nil, w._next)
|
||||||
|
return
|
||||||
|
}
|
||||||
case nats.ErrConsumerDeleted, nats.ErrConsumerNotFound:
|
case nats.ErrConsumerDeleted, nats.ErrConsumerNotFound:
|
||||||
// The consumer is gone, therefore it's reached the inactivity
|
// The consumer is gone, therefore it's reached the inactivity
|
||||||
// threshold. Clean up and stop processing at this point, if a
|
// threshold. Clean up and stop processing at this point, if a
|
||||||
|
@ -287,9 +306,7 @@ func (w *worker) _next() {
|
||||||
if err = w.subscription.Unsubscribe(); err != nil {
|
if err = w.subscription.Unsubscribe(); err != nil {
|
||||||
logrus.WithError(err).Errorf("Failed to unsubscribe to stream for room %q", w.roomID)
|
logrus.WithError(err).Errorf("Failed to unsubscribe to stream for room %q", w.roomID)
|
||||||
}
|
}
|
||||||
w.Lock()
|
|
||||||
w.subscription = nil
|
w.subscription = nil
|
||||||
w.Unlock()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
@ -297,11 +314,13 @@ func (w *worker) _next() {
|
||||||
// from the queue. In which case, we'll shut down the subscriber
|
// from the queue. In which case, we'll shut down the subscriber
|
||||||
// and wait to be notified about new room activity again. Maybe
|
// and wait to be notified about new room activity again. Maybe
|
||||||
// the problem will be corrected by then.
|
// the problem will be corrected by then.
|
||||||
|
// atomically clear the subscription and unsubscribe
|
||||||
|
w.Lock()
|
||||||
|
|
||||||
logrus.WithError(err).Errorf("Failed to get next stream message for room %q", w.roomID)
|
logrus.WithError(err).Errorf("Failed to get next stream message for room %q", w.roomID)
|
||||||
if err = w.subscription.Unsubscribe(); err != nil {
|
if err = w.subscription.Unsubscribe(); err != nil {
|
||||||
logrus.WithError(err).Errorf("Failed to unsubscribe to stream for room %q", w.roomID)
|
logrus.WithError(err).Errorf("Failed to unsubscribe to stream for room %q", w.roomID)
|
||||||
}
|
}
|
||||||
w.Lock()
|
|
||||||
w.subscription = nil
|
w.subscription = nil
|
||||||
w.Unlock()
|
w.Unlock()
|
||||||
return
|
return
|
||||||
|
@ -314,6 +333,9 @@ func (w *worker) _next() {
|
||||||
// fails then we'll terminate the message — this notifies NATS that
|
// fails then we'll terminate the message — this notifies NATS that
|
||||||
// we are done with the message and never want to see it again.
|
// we are done with the message and never want to see it again.
|
||||||
msg := msgs[0]
|
msg := msgs[0]
|
||||||
|
meta, _ := msg.Metadata()
|
||||||
|
w.durableSeq = meta.Sequence.Stream
|
||||||
|
|
||||||
var inputRoomEvent api.InputRoomEvent
|
var inputRoomEvent api.InputRoomEvent
|
||||||
if err = json.Unmarshal(msg.Data, &inputRoomEvent); err != nil {
|
if err = json.Unmarshal(msg.Data, &inputRoomEvent); err != nil {
|
||||||
// using AckWait here makes the call synchronous; 5 seconds is the default value used by NATS
|
// using AckWait here makes the call synchronous; 5 seconds is the default value used by NATS
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue