From 81982c717bbb55c36306ede938dc189f01f85592 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Tim=20D=C3=BCsterhus?= Date: Tue, 6 Apr 2021 16:24:49 +0200 Subject: [PATCH] Attempt to prevent deadlocks involving marking all notifications as read MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The previous implementation, UPDATEing all rows with a specific userID, needed to create pretty coarse locks on the `userID` INDEX. Specifically it also created gap locks, preventing concurrent *creation* of INDEX records that would have been matched. My current understanding of MySQL's locking behavior is that the `confirmTime` being part of the INDEX is what caused the issue: The `userID` INDEX includes the columns userID, eventID, objectID and confirmTime. Now consider the following: Thread 1: Marks all notifications for userID = A as read. Thread 2: Marks objects X, Y and Z for userID A as read. Thread 1 will lock all existing notifications for userID = A as well as all insertions into the `userID` INDEX with that specific userID. This includes marking notifications as read, because this will delete the old index record and insert a new index record with an updated confirmTime. Thread 2 will lock the specific notifications for userID = A as well as all insertions into the `userID` INDEX with that specific userID and objectIDs. This includes marking notifications as read, because this will delete the old index record and insert a new index record with an updated confirmTime. Now consider the following timeline: T1: Locks the gaps for userID = A. T2: Locks the gaps for userID = A, objects X, Y, Z. These locks are allowed to coexist: > Gap locks in InnoDB are “purely inhibitive”, which means that their only > purpose is to prevent other transactions from inserting to the gap. (https://dev.mysql.com/doc/refman/8.0/en/innodb-locking.html) T1: Wants to UPDATE the confirmTime for userID = A, object X. -> This is prevented by the gap lock held by T2, thus T1 waits. T2: Wants to UPDATE the confirmTime for userID = A, object X. -> This is prevented by the gap lock held by T1, thus T2 waits. Now we have a deadlock. As the current query needs to UPDATE a large number of rows it is fairly slow, holding the locks for a long time and also needing to update the rows one-by-one. This gives other threads enough opportunity to run in-between, wreaking havoc. Fix this issue by first selecting the exact notifications we need to mark as read using a simple SELECT. This SELECT should not be able to deadlock with concurrent write statements. Afterwards we update the notifications based off a condition matching specific rows within the PRIMARY KEY. As these must match existing rows only, no gap locks will be needed, thus reducing the chance to block concurrent threads. Additionally we only need to update a very small number of rows (should be less than 50 in the vast majority of cases), reducing the time spent in the query, further closing the window for concurrent requests and possibly making the process faster due to less rows being updated (and thus needing to be written to disk). --- .../UserNotificationAction.class.php | 47 +++++++++++++++---- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/wcfsetup/install/files/lib/data/user/notification/UserNotificationAction.class.php b/wcfsetup/install/files/lib/data/user/notification/UserNotificationAction.class.php index f0eacae03e..10e9d218cd 100644 --- a/wcfsetup/install/files/lib/data/user/notification/UserNotificationAction.class.php +++ b/wcfsetup/install/files/lib/data/user/notification/UserNotificationAction.class.php @@ -1,6 +1,7 @@ prepareStatement($sql); $statement->execute([ + WCF::getUser()->userID, + 0, TIME_NOW, - WCF::getUser()->userID ]); + $notificationIDs = $statement->fetchAll(\PDO::FETCH_COLUMN); + + // Step 2) Mark the notifications as read. + $condition = new PreparedStatementConditionBuilder(); + $condition->add('notificationID IN (?)', [$notificationIDs]); + + $sql = "UPDATE wcf".WCF_N."_user_notification + SET confirmTime = ? + {$condition}"; + $statement = WCF::getDB()->prepareStatement($sql); + $statement->execute(\array_merge( + [TIME_NOW], + $condition->getParameters() + )); + + // Step 3) Delete notification_to_user assignments (mimic legacy notification system) + + // This conditions technically is not required, because notificationIDs are unique. + // As this is not enforced at the database layer we play safe until this legacy table + // finally is removed. + $condition->add('userID = ?', [WCF::getUser()->userID]); - // delete notification_to_user assignments (mimic legacy notification system) $sql = "DELETE FROM wcf".WCF_N."_user_notification_to_user - WHERE userID = ?"; + {$condition}"; $statement = WCF::getDB()->prepareStatement($sql); - $statement->execute([WCF::getUser()->userID]); + $statement->execute($condition->getParameters()); - // reset notification count + // Step 4) Clear cached values. UserStorageHandler::getInstance()->reset([WCF::getUser()->userID], 'userNotificationCount'); return [ -- 2.20.1