Use AckExplicitPolicy instead of AckAllPolicy (#3288)

Fixes https://github.com/matrix-org/dendrite/issues/3240 and potentially
a root cause for state resets.

While testing, I've had added some more debug logging:
```
time="2023-12-16T18:13:11.319458084Z" level=warning msg="already processed event" event_id="$qFYMl_F2vb1N0yxmvlFAMhqhGhLKq4kA-o_YCQKH7tQ" kind=KindNew times=2
time="2023-12-16T18:13:14.537389126Z" level=warning msg="already processed event" event_id="$EU-LTsKErT6Mt1k12-p_3xOHfiLaK6gtwVDlZ35lSuo" kind=KindNew times=5
time="2023-12-16T18:13:16.789551206Z" level=warning msg="already processed event" event_id="$dIPuAfTL5x0VyG873LKPslQeljCSxFT1WKxUtjIMUGE" kind=KindNew times=5
time="2023-12-16T18:13:17.383838767Z" level=warning msg="already processed event" event_id="$7noSZiCkzerpkz_UBO3iatpRnaOiPx-3IXc0GPDQVGE" kind=KindNew times=2
time="2023-12-16T18:13:22.091946597Z" level=warning msg="already processed event" event_id="$3Lvo3Wbi2ol9-nNbQ93N-E2MuGQCJZo5397KkFH-W6E" kind=KindNew times=1
time="2023-12-16T18:13:23.026417446Z" level=warning msg="already processed event" event_id="$lj1xS46zsLBCChhKOLJEG-bu7z-_pq9i_Y2DUIjzGy4" kind=KindNew times=4
```

So we did receive the same event over and over again. Given they are
`KindNew`, we don't short circuit if we already processed them, which
potentially caused the state to be calculated with a now wrong state
snapshot.

Also fixes the back pressure metric. We now correctly increment the
counter once we sent the message to NATS and decrement it once we
actually processed an event.
This commit is contained in:
Till 2023-12-19 08:25:47 +01:00 committed by GitHub
parent d65449c782
commit f93d1c4790
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 158 additions and 47 deletions

View file

@ -12,7 +12,9 @@ import (
"github.com/matrix-org/dendrite/internal/eventutil"
"github.com/matrix-org/dendrite/internal/httputil"
"github.com/matrix-org/dendrite/internal/sqlutil"
"github.com/matrix-org/dendrite/roomserver/internal/input"
"github.com/matrix-org/gomatrixserverlib/spec"
"github.com/nats-io/nats.go"
"github.com/stretchr/testify/assert"
"github.com/tidwall/gjson"
@ -1231,3 +1233,54 @@ func TestNewServerACLs(t *testing.T) {
assert.Equal(t, false, banned)
})
}
// Validate that changing the AckPolicy/AckWait of room consumers
// results in their recreation
func TestRoomConsumerRecreation(t *testing.T) {
alice := test.NewUser(t)
room := test.NewRoom(t, alice)
// As this is DB unrelated, just use SQLite
cfg, processCtx, closeDB := testrig.CreateConfig(t, test.DBTypeSQLite)
defer closeDB()
cm := sqlutil.NewConnectionManager(processCtx, cfg.Global.DatabaseOptions)
natsInstance := &jetstream.NATSInstance{}
// Prepare a stream and consumer using the old configuration
jsCtx, _ := natsInstance.Prepare(processCtx, &cfg.Global.JetStream)
streamName := cfg.Global.JetStream.Prefixed(jetstream.InputRoomEvent)
consumer := cfg.Global.JetStream.Prefixed("RoomInput" + jetstream.Tokenise(room.ID))
subject := cfg.Global.JetStream.Prefixed(jetstream.InputRoomEventSubj(room.ID))
consumerConfig := &nats.ConsumerConfig{
Durable: consumer,
AckPolicy: nats.AckAllPolicy,
DeliverPolicy: nats.DeliverAllPolicy,
FilterSubject: subject,
AckWait: (time.Minute * 2) + (time.Second * 10),
InactiveThreshold: time.Hour * 24,
}
// Create the consumer with the old config
_, err := jsCtx.AddConsumer(streamName, consumerConfig)
assert.NoError(t, err)
caches := caching.NewRistrettoCache(128*1024*1024, time.Hour, caching.DisableMetrics)
// start JetStream listeners
rsAPI := roomserver.NewInternalAPI(processCtx, cfg, cm, natsInstance, caches, caching.DisableMetrics)
rsAPI.SetFederationAPI(nil, nil)
// let the RS create the events, this also recreates the Consumers
err = api.SendEvents(context.Background(), rsAPI, api.KindNew, room.Events(), "test", "test", "test", nil, false)
assert.NoError(t, err)
// Validate that AckPolicy and AckWait has changed
info, err := jsCtx.ConsumerInfo(streamName, consumer)
assert.NoError(t, err)
assert.Equal(t, nats.AckExplicitPolicy, info.Config.AckPolicy)
wantAckWait := input.MaximumMissingProcessingTime + (time.Second * 10)
assert.Equal(t, wantAckWait, info.Config.AckWait)
}