Adjust roomserver locks, don't unsubscribe if new event is inflight (#3588)

This should fix #3484 -- at the very least, it has resolved the issues
we've had on our instance. I've extended the lock so it surrounds the
unsubscribe as well as a new check if the latest sequential ID seen by
the ephemeral thread is newer than the sequential ID seen by the durable
thread. This solves a race where the unsubscribe happened while a new
message was inflight, and so the message was not handled.

This is a fix for a race condition that has been pretty unreliable to
reproduce manually, so I don't know if there's a good way to add a
reliable automated test for it. If you have any ideas I'm open to it.

### Pull Request Checklist

<!-- Please read
https://matrix-org.github.io/dendrite/development/contributing before
submitting your pull request -->

* [x] I have added Go unit tests or [Complement integration
tests](https://github.com/matrix-org/complement) for this PR _or_ I have
justified why this PR doesn't need tests
* [x] Pull request includes a [sign off
below](https://element-hq.github.io/dendrite/development/contributing#sign-off)
_or_ I have already signed off privately

Signed-off-by: `Vivianne Langdon <puttabutta@gmail.com>`

---------

Signed-off-by: Vivianne Langdon <puttabutta@gmail.com>
Co-authored-by: Kegan Dougal <7190048+kegsay@users.noreply.github.com>
This commit is contained in:
Vivianne 2025-08-12 09:06:42 -04:00 committed by GitHub
parent df748c5eae
commit f24688af11
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -101,9 +101,12 @@ type worker struct {
roomID string roomID string
subscription *nats.Subscription subscription *nats.Subscription
sentryHub *sentry.Hub sentryHub *sentry.Hub
ephemeralSeq uint64
// last seq we fully processed
durableSeq uint64
} }
func (r *Inputer) startWorkerForRoom(roomID string) { func (r *Inputer) startWorkerForRoom(roomID string, seq uint64) {
v, loaded := r.workers.LoadOrStore(roomID, &worker{ v, loaded := r.workers.LoadOrStore(roomID, &worker{
r: r, r: r,
roomID: roomID, roomID: roomID,
@ -112,6 +115,9 @@ func (r *Inputer) startWorkerForRoom(roomID string) {
w := v.(*worker) w := v.(*worker)
w.Lock() w.Lock()
defer w.Unlock() defer w.Unlock()
w.ephemeralSeq = seq
if !loaded || w.subscription == nil { if !loaded || w.subscription == nil {
streamName := r.Cfg.Matrix.JetStream.Prefixed(jetstream.InputRoomEvent) streamName := r.Cfg.Matrix.JetStream.Prefixed(jetstream.InputRoomEvent)
consumer := r.Cfg.Matrix.JetStream.Prefixed("RoomInput" + jetstream.Tokenise(w.roomID)) consumer := r.Cfg.Matrix.JetStream.Prefixed("RoomInput" + jetstream.Tokenise(w.roomID))
@ -226,7 +232,8 @@ func (r *Inputer) Start() error {
"", // This is blank because we specified it in BindStream. "", // This is blank because we specified it in BindStream.
func(m *nats.Msg) { func(m *nats.Msg) {
roomID := m.Header.Get(jetstream.RoomID) roomID := m.Header.Get(jetstream.RoomID)
r.startWorkerForRoom(roomID) meta, _ := m.Metadata()
r.startWorkerForRoom(roomID, meta.Sequence.Stream)
_ = m.Ack() _ = m.Ack()
}, },
nats.HeadersOnly(), nats.HeadersOnly(),
@ -278,7 +285,19 @@ func (w *worker) _next() {
if len(msgs) != 1 { if len(msgs) != 1 {
return return
} }
case context.DeadlineExceeded, context.Canceled:
// The context exceeded, so we've been waiting for more than a
// minute for activity in this room. At this point we will shut
// down the subscriber to free up resources. It'll get started
// again if new activity happens.
w.Lock()
defer w.Unlock()
// inside the lock, let's check if the ephemeral consumer saw something new!
// If so, we do have new messages after all, they just came at a bad time.
if w.ephemeralSeq > w.durableSeq {
w.Act(nil, w._next)
return
}
case nats.ErrConsumerDeleted, nats.ErrConsumerNotFound: case nats.ErrConsumerDeleted, nats.ErrConsumerNotFound:
// The consumer is gone, therefore it's reached the inactivity // The consumer is gone, therefore it's reached the inactivity
// threshold. Clean up and stop processing at this point, if a // threshold. Clean up and stop processing at this point, if a
@ -287,9 +306,7 @@ func (w *worker) _next() {
if err = w.subscription.Unsubscribe(); err != nil { if err = w.subscription.Unsubscribe(); err != nil {
logrus.WithError(err).Errorf("Failed to unsubscribe to stream for room %q", w.roomID) logrus.WithError(err).Errorf("Failed to unsubscribe to stream for room %q", w.roomID)
} }
w.Lock()
w.subscription = nil w.subscription = nil
w.Unlock()
return return
default: default:
@ -297,11 +314,13 @@ func (w *worker) _next() {
// from the queue. In which case, we'll shut down the subscriber // from the queue. In which case, we'll shut down the subscriber
// and wait to be notified about new room activity again. Maybe // and wait to be notified about new room activity again. Maybe
// the problem will be corrected by then. // the problem will be corrected by then.
// atomically clear the subscription and unsubscribe
w.Lock()
logrus.WithError(err).Errorf("Failed to get next stream message for room %q", w.roomID) logrus.WithError(err).Errorf("Failed to get next stream message for room %q", w.roomID)
if err = w.subscription.Unsubscribe(); err != nil { if err = w.subscription.Unsubscribe(); err != nil {
logrus.WithError(err).Errorf("Failed to unsubscribe to stream for room %q", w.roomID) logrus.WithError(err).Errorf("Failed to unsubscribe to stream for room %q", w.roomID)
} }
w.Lock()
w.subscription = nil w.subscription = nil
w.Unlock() w.Unlock()
return return
@ -314,6 +333,9 @@ func (w *worker) _next() {
// fails then we'll terminate the message — this notifies NATS that // fails then we'll terminate the message — this notifies NATS that
// we are done with the message and never want to see it again. // we are done with the message and never want to see it again.
msg := msgs[0] msg := msgs[0]
meta, _ := msg.Metadata()
w.durableSeq = meta.Sequence.Stream
var inputRoomEvent api.InputRoomEvent var inputRoomEvent api.InputRoomEvent
if err = json.Unmarshal(msg.Data, &inputRoomEvent); err != nil { if err = json.Unmarshal(msg.Data, &inputRoomEvent); err != nil {
// using AckWait here makes the call synchronous; 5 seconds is the default value used by NATS // using AckWait here makes the call synchronous; 5 seconds is the default value used by NATS