|
72 | 72 | logger = logging.getLogger(__name__) |
73 | 73 |
|
74 | 74 |
|
75 | | -# These values are used in the `enqueus_event` and `_do_fetch` methods to |
| 75 | +# These values are used in the `enqueue_event` and `_fetch_loop` methods to |
76 | 76 | # control how we batch/bulk fetch events from the database. |
77 | 77 | # The values are plucked out of thin air to make initial sync run faster |
78 | 78 | # on jki.re |
@@ -602,7 +602,7 @@ async def _get_events_from_cache_or_db( |
602 | 602 | # already due to `_get_events_from_db`). |
603 | 603 | fetching_deferred: ObservableDeferred[ |
604 | 604 | Dict[str, _EventCacheEntry] |
605 | | - ] = ObservableDeferred(defer.Deferred()) |
| 605 | + ] = ObservableDeferred(defer.Deferred(), consumeErrors=True) |
606 | 606 | for event_id in missing_events_ids: |
607 | 607 | self._current_event_fetches[event_id] = fetching_deferred |
608 | 608 |
|
@@ -736,35 +736,118 @@ async def get_stripped_room_state_from_event_context( |
736 | 736 | for e in state_to_include.values() |
737 | 737 | ] |
738 | 738 |
|
739 | | - def _do_fetch(self, conn: Connection) -> None: |
| 739 | + def _maybe_start_fetch_thread(self) -> None: |
| 740 | + """Starts an event fetch thread if we are not yet at the maximum number.""" |
| 741 | + with self._event_fetch_lock: |
| 742 | + if ( |
| 743 | + self._event_fetch_list |
| 744 | + and self._event_fetch_ongoing < EVENT_QUEUE_THREADS |
| 745 | + ): |
| 746 | + self._event_fetch_ongoing += 1 |
| 747 | + event_fetch_ongoing_gauge.set(self._event_fetch_ongoing) |
| 748 | + # `_event_fetch_ongoing` is decremented in `_fetch_thread`. |
| 749 | + should_start = True |
| 750 | + else: |
| 751 | + should_start = False |
| 752 | + |
| 753 | + if should_start: |
| 754 | + run_as_background_process("fetch_events", self._fetch_thread) |
| 755 | + |
| 756 | + async def _fetch_thread(self) -> None: |
| 757 | + """Services requests for events from `_event_fetch_list`.""" |
| 758 | + exc = None |
| 759 | + try: |
| 760 | + await self.db_pool.runWithConnection(self._fetch_loop) |
| 761 | + except BaseException as e: |
| 762 | + exc = e |
| 763 | + raise |
| 764 | + finally: |
| 765 | + should_restart = False |
| 766 | + event_fetches_to_fail = [] |
| 767 | + with self._event_fetch_lock: |
| 768 | + self._event_fetch_ongoing -= 1 |
| 769 | + event_fetch_ongoing_gauge.set(self._event_fetch_ongoing) |
| 770 | + |
| 771 | + # There may still be work remaining in `_event_fetch_list` if we |
| 772 | + # failed, or it was added in between us deciding to exit and |
| 773 | + # decrementing `_event_fetch_ongoing`. |
| 774 | + if self._event_fetch_list: |
| 775 | + if exc is None: |
| 776 | + # We decided to exit, but then some more work was added |
| 777 | + # before `_event_fetch_ongoing` was decremented. |
| 778 | + # If a new event fetch thread was not started, we should |
| 779 | + # restart ourselves since the remaining event fetch threads |
| 780 | + # may take a while to get around to the new work. |
| 781 | + # |
| 782 | + # Unfortunately it is not possible to tell whether a new |
| 783 | + # event fetch thread was started, so we restart |
| 784 | + # unconditionally. If we are unlucky, we will end up with |
| 785 | + # an idle fetch thread, but it will time out after |
| 786 | + # `EVENT_QUEUE_ITERATIONS * EVENT_QUEUE_TIMEOUT_S` seconds |
| 787 | + # in any case. |
| 788 | + # |
| 789 | + # Note that multiple fetch threads may run down this path at |
| 790 | + # the same time. |
| 791 | + should_restart = True |
| 792 | + elif isinstance(exc, Exception): |
| 793 | + if self._event_fetch_ongoing == 0: |
| 794 | + # We were the last remaining fetcher and failed. |
| 795 | + # Fail any outstanding fetches since no one else will |
| 796 | + # handle them. |
| 797 | + event_fetches_to_fail = self._event_fetch_list |
| 798 | + self._event_fetch_list = [] |
| 799 | + else: |
| 800 | + # We weren't the last remaining fetcher, so another |
| 801 | + # fetcher will pick up the work. This will either happen |
| 802 | + # after their existing work, however long that takes, |
| 803 | + # or after at most `EVENT_QUEUE_TIMEOUT_S` seconds if |
| 804 | + # they are idle. |
| 805 | + pass |
| 806 | + else: |
| 807 | + # The exception is a `SystemExit`, `KeyboardInterrupt` or |
| 808 | + # `GeneratorExit`. Don't try to do anything clever here. |
| 809 | + pass |
| 810 | + |
| 811 | + if should_restart: |
| 812 | + # We exited cleanly but noticed more work. |
| 813 | + self._maybe_start_fetch_thread() |
| 814 | + |
| 815 | + if event_fetches_to_fail: |
| 816 | + # We were the last remaining fetcher and failed. |
| 817 | + # Fail any outstanding fetches since no one else will handle them. |
| 818 | + assert exc is not None |
| 819 | + with PreserveLoggingContext(): |
| 820 | + for _, deferred in event_fetches_to_fail: |
| 821 | + deferred.errback(exc) |
| 822 | + |
| 823 | + def _fetch_loop(self, conn: Connection) -> None: |
740 | 824 | """Takes a database connection and waits for requests for events from |
741 | 825 | the _event_fetch_list queue. |
742 | 826 | """ |
743 | | - try: |
744 | | - i = 0 |
745 | | - while True: |
746 | | - with self._event_fetch_lock: |
747 | | - event_list = self._event_fetch_list |
748 | | - self._event_fetch_list = [] |
749 | | - |
750 | | - if not event_list: |
751 | | - single_threaded = self.database_engine.single_threaded |
752 | | - if ( |
753 | | - not self.USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING |
754 | | - or single_threaded |
755 | | - or i > EVENT_QUEUE_ITERATIONS |
756 | | - ): |
757 | | - break |
758 | | - else: |
759 | | - self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S) |
760 | | - i += 1 |
761 | | - continue |
762 | | - i = 0 |
| 827 | + i = 0 |
| 828 | + while True: |
| 829 | + with self._event_fetch_lock: |
| 830 | + event_list = self._event_fetch_list |
| 831 | + self._event_fetch_list = [] |
| 832 | + |
| 833 | + if not event_list: |
| 834 | + # There are no requests waiting. If we haven't yet reached the |
| 835 | + # maximum iteration limit, wait for some more requests to turn up. |
| 836 | + # Otherwise, bail out. |
| 837 | + single_threaded = self.database_engine.single_threaded |
| 838 | + if ( |
| 839 | + not self.USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING |
| 840 | + or single_threaded |
| 841 | + or i > EVENT_QUEUE_ITERATIONS |
| 842 | + ): |
| 843 | + return |
| 844 | + |
| 845 | + self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S) |
| 846 | + i += 1 |
| 847 | + continue |
| 848 | + i = 0 |
763 | 849 |
|
764 | | - self._fetch_event_list(conn, event_list) |
765 | | - finally: |
766 | | - self._event_fetch_ongoing -= 1 |
767 | | - event_fetch_ongoing_gauge.set(self._event_fetch_ongoing) |
| 850 | + self._fetch_event_list(conn, event_list) |
768 | 851 |
|
769 | 852 | def _fetch_event_list( |
770 | 853 | self, conn: Connection, event_list: List[Tuple[List[str], defer.Deferred]] |
@@ -806,9 +889,7 @@ def fire(): |
806 | 889 | # We only want to resolve deferreds from the main thread |
807 | 890 | def fire(evs, exc): |
808 | 891 | for _, d in evs: |
809 | | - if not d.called: |
810 | | - with PreserveLoggingContext(): |
811 | | - d.errback(exc) |
| 892 | + d.errback(exc) |
812 | 893 |
|
813 | 894 | with PreserveLoggingContext(): |
814 | 895 | self.hs.get_reactor().callFromThread(fire, event_list, e) |
@@ -983,20 +1064,9 @@ async def _enqueue_events(self, events: Iterable[str]) -> Dict[str, _EventRow]: |
983 | 1064 | events_d = defer.Deferred() |
984 | 1065 | with self._event_fetch_lock: |
985 | 1066 | self._event_fetch_list.append((events, events_d)) |
986 | | - |
987 | 1067 | self._event_fetch_lock.notify() |
988 | 1068 |
|
989 | | - if self._event_fetch_ongoing < EVENT_QUEUE_THREADS: |
990 | | - self._event_fetch_ongoing += 1 |
991 | | - event_fetch_ongoing_gauge.set(self._event_fetch_ongoing) |
992 | | - should_start = True |
993 | | - else: |
994 | | - should_start = False |
995 | | - |
996 | | - if should_start: |
997 | | - run_as_background_process( |
998 | | - "fetch_events", self.db_pool.runWithConnection, self._do_fetch |
999 | | - ) |
| 1069 | + self._maybe_start_fetch_thread() |
1000 | 1070 |
|
1001 | 1071 | logger.debug("Loading %d events: %s", len(events), events) |
1002 | 1072 | with PreserveLoggingContext(): |
|
0 commit comments