/*
 * Copyright (c) 2011 Your File System Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* A reimplementation of the rx_event handler using red/black trees
 *
 * The first rx_event implementation used a simple sorted queue of all
 * events, which led to O(n^2) performance, where n is the number of
 * outstanding events. This was found to scale poorly, so was replaced.
 *
 * The second implementation used a set of per-second buckets to store
 * events. Each bucket (referred to as an epoch in the code) stored all
 * of the events which expired in that second. However, on modern networks,
 * where round-trip times are measured in milliseconds, most connections
 * will have events expiring within the next second, so the problem recurs.
 *
 * This new implementation uses red/black trees to store a sorted list of
 * events. Red/black trees are guaranteed to have no worse than O(log N)
 * insertion, and are commonly used in timer applications.
 */

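/* Illustrative usage sketch (not part of the original implementation): a
 * caller posts an event for some absolute time, and later either lets it
 * fire or cancels it. The handler name (exampleTimeout), the somePointer
 * argument and the clock_Addmsec helper are assumptions for illustration
 * only.
 *
 *     struct clock now, when;
 *     struct rxevent *ev;
 *
 *     clock_GetTime(&now);
 *     when = now;
 *     clock_Addmsec(&when, 500);    -- fire roughly 500ms from now
 *     ev = rxevent_Post(&when, &now, exampleTimeout, somePointer, NULL, 0);
 *     ...
 *     if (ev)
 *         rxevent_Cancel(&ev);      -- also drops the caller's reference
 */
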
#include <afsconfig.h>
#include <afs/param.h>

#ifdef KERNEL
# include "afs/sysincludes.h"
# include "afsincludes.h"
#else
# include <roken.h>
#endif

#include <afs/opr.h>
#include <opr/queue.h>
#include <opr/rbtree.h>

#include "rx.h"
#include "rx_atomic.h"
#include "rx_call.h"
#include "rx_globals.h"

struct rxevent {
    struct opr_queue q;           /* List linkage: events sharing this expiry
                                   * time, or the free list */
    struct opr_rbtree_node node;  /* Our position in the event tree */
    struct clock eventTime;       /* Absolute time at which this event fires */
    struct rxevent *next;
    rx_atomic_t refcnt;           /* References held; freed when this hits 0 */
    int handled;                  /* Set once fired or cancelled */
    void (*func)(struct rxevent *, void *, void *, int);
    void *arg;
    void *arg1;
    int arg2;
};

struct malloclist {
    void *mem;
    int size;
    struct malloclist *next;
};

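/* Module-private state: freeEvents tracks unused event structures and the
 * memory blocks they were carved from; eventTree holds pending events in a
 * red/black tree, with 'first' caching the earliest; eventSchedule records
 * when events were last processed, when the event thread next needs to run,
 * and the wakeup callback registered at initialisation time. */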
static struct {
    afs_kmutex_t lock;
    struct opr_queue list;
    struct malloclist *mallocs;
} freeEvents;

static struct {
    afs_kmutex_t lock;
    struct opr_rbtree head;
    struct rxevent *first;
} eventTree;

static struct {
    afs_kmutex_t lock;
    struct clock last;
    struct clock next;
    void (*func)(void);
    int raised;
} eventSchedule;

static int allocUnit = 10;

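/* Grab an event structure from the free list, allocating a fresh batch of
 * allocUnit structures (and recording the allocation in freeEvents.mallocs so
 * shutdown_rxevent can release it) if the list is empty. The returned event
 * is zeroed and starts with a single reference. */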
static struct rxevent *
rxevent_alloc(void) {
    struct rxevent *evlist;
    struct rxevent *ev;
    struct malloclist *mrec;
    int i;

    MUTEX_ENTER(&freeEvents.lock);
    if (opr_queue_IsEmpty(&freeEvents.list)) {
        MUTEX_EXIT(&freeEvents.lock);

#if defined(AFS_AIX32_ENV) && defined(KERNEL)
        ev = rxi_Alloc(sizeof(struct rxevent));
#else
        evlist = osi_Alloc(sizeof(struct rxevent) * allocUnit);
        mrec = osi_Alloc(sizeof(struct malloclist));

        mrec->mem = evlist;
        mrec->size = sizeof(struct rxevent) * allocUnit;

        MUTEX_ENTER(&freeEvents.lock);
        for (i = 1; i < allocUnit; i++) {
            opr_queue_Append(&freeEvents.list, &evlist[i].q);
        }
        mrec->next = freeEvents.mallocs;
        freeEvents.mallocs = mrec;
        MUTEX_EXIT(&freeEvents.lock);
#endif
        ev = &evlist[0];
    } else {
        ev = opr_queue_First(&freeEvents.list, struct rxevent, q);
        opr_queue_Remove(&ev->q);
        MUTEX_EXIT(&freeEvents.lock);
    }

    memset(ev, 0, sizeof(struct rxevent));
    rx_atomic_set(&ev->refcnt, 1);

    return ev;
}

static void
rxevent_free(struct rxevent *ev) {
    MUTEX_ENTER(&freeEvents.lock);
    opr_queue_Prepend(&freeEvents.list, &ev->q);
    MUTEX_EXIT(&freeEvents.lock);
}

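/* Reference counting: rxevent_Post creates an event holding one reference on
 * behalf of the event tree and hands a second reference back to the caller.
 * rxevent_put drops a reference and returns the structure to the free list
 * once the count reaches zero; the tree's reference is released when the
 * event fires or is cancelled, and the caller's reference is released by
 * rxevent_Put() or rxevent_Cancel(). */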
static_inline void
rxevent_put(struct rxevent *ev) {
    if (rx_atomic_dec_and_read(&ev->refcnt) == 0) {
        rxevent_free(ev);
    }
}

void
rxevent_Put(struct rxevent **ev)
{
    rxevent_put(*ev);
    *ev = NULL;
}

static_inline struct rxevent *
rxevent_get(struct rxevent *ev) {
    rx_atomic_inc(&ev->refcnt);
    return ev;
}

struct rxevent *
rxevent_Get(struct rxevent *ev) {
    return rxevent_get(ev);
}

/* Called if the time now is older than the last time we recorded running an
 * event. This test catches machines where the system time has been set
 * backwards, and avoids RX completely stalling when timers fail to fire.
 *
 * Take the difference between now and the last event time, and subtract that
 * from the timing of every event on the system. This does a relatively slow
 * walk of the complete eventTree, but time-travel will hopefully be a pretty
 * rare occurrence.
 *
 * This can only safely be called from the event thread, as it plays with the
 * schedule directly.
 */
static void
adjustTimes(void)
{
    struct opr_rbtree_node *node;
    struct clock adjTime, now;

    MUTEX_ENTER(&eventTree.lock);
    /* Time adjustment is expensive, make absolutely certain that we have
     * to do it, by getting an up to date time to base our decision on
     * once we've acquired the relevant locks.
     */
    clock_GetTime(&now);
    if (!clock_Lt(&now, &eventSchedule.last))
        goto out;

    adjTime = eventSchedule.last;
    clock_Zero(&eventSchedule.last);

    clock_Sub(&adjTime, &now);

    /* If there are no events in the tree, then there's nothing to adjust */
    if (eventTree.first == NULL)
        goto out;

    node = opr_rbtree_first(&eventTree.head);
    while (node) {
        struct rxevent *event = opr_containerof(node, struct rxevent, node);

        clock_Sub(&event->eventTime, &adjTime);
        node = opr_rbtree_next(node);
    }
    eventSchedule.next = eventTree.first->eventTime;

 out:
    MUTEX_EXIT(&eventTree.lock);
}

static int initialised = 0;
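/* One-time initialisation of the event package. nEvents, if non-zero,
 * replaces the default batch size (allocUnit) used when growing the free
 * list; scheduler is the callback invoked from rxevent_Post whenever a newly
 * posted event becomes the earliest pending event and the event thread needs
 * to be woken. */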
void
rxevent_Init(int nEvents, void (*scheduler)(void))
{
    if (initialised)
        return;

    initialised = 1;

    clock_Init();
    MUTEX_INIT(&eventTree.lock, "event tree lock", MUTEX_DEFAULT, 0);
    opr_rbtree_init(&eventTree.head);

    MUTEX_INIT(&freeEvents.lock, "free events lock", MUTEX_DEFAULT, 0);
    opr_queue_Init(&freeEvents.list);
    freeEvents.mallocs = NULL;

    if (nEvents)
        allocUnit = nEvents;

    clock_Zero(&eventSchedule.next);
    clock_Zero(&eventSchedule.last);
    eventSchedule.raised = 0;
    eventSchedule.func = scheduler;
}

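/* Schedule func to be called as (*func)(event, arg, arg1, arg2) at the
 * absolute time 'when'; 'now' is the caller's view of the current time and is
 * used to detect a clock that has stepped backwards. Events expiring at
 * exactly the same time are chained on a list hung off a single tree node.
 * Returns a reference to the new event, which the caller must later release
 * with rxevent_Cancel() or rxevent_Put(). */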
struct rxevent *
rxevent_Post(struct clock *when, struct clock *now,
             void (*func) (struct rxevent *, void *, void *, int),
             void *arg, void *arg1, int arg2)
{
    struct rxevent *ev, *event;
    struct opr_rbtree_node **childptr, *parent = NULL;

    ev = rxevent_alloc();
    ev->eventTime = *when;
    ev->func = func;
    ev->arg = arg;
    ev->arg1 = arg1;
    ev->arg2 = arg2;

    if (clock_Lt(now, &eventSchedule.last))
        adjustTimes();

    MUTEX_ENTER(&eventTree.lock);

    /* Work out where in the tree we'll be storing this */
    childptr = &eventTree.head.root;

    while (*childptr) {
        event = opr_containerof((*childptr), struct rxevent, node);

        parent = *childptr;
        if (clock_Lt(when, &event->eventTime))
            childptr = &(*childptr)->left;
        else if (clock_Gt(when, &event->eventTime))
            childptr = &(*childptr)->right;
        else {
            opr_queue_Append(&event->q, &ev->q);
            goto out;
        }
    }
    opr_queue_Init(&ev->q);
    opr_rbtree_insert(&eventTree.head, parent, childptr, &ev->node);

    if (eventTree.first == NULL ||
        clock_Lt(when, &(eventTree.first->eventTime))) {
        eventTree.first = ev;
        eventSchedule.raised = 1;
        clock_Zero(&eventSchedule.next);
        MUTEX_EXIT(&eventTree.lock);
        if (eventSchedule.func != NULL)
            (*eventSchedule.func)();
        return rxevent_get(ev);
    }

 out:
    MUTEX_EXIT(&eventTree.lock);
    return rxevent_get(ev);
}

/* We're going to remove ev from the tree, so set the first pointer to the
 * next event after it */
static_inline void
resetFirst(struct rxevent *ev)
{
    struct opr_rbtree_node *next = opr_rbtree_next(&ev->node);
    if (next)
        eventTree.first = opr_containerof(next, struct rxevent, node);
    else
        eventTree.first = NULL;
}

/*!
 * Cancel an event
 *
 * Cancels the event pointed to by evp. Returns true if the event has
 * been successfully cancelled, or false if the event has already fired.
 */

int
rxevent_Cancel(struct rxevent **evp)
{
    struct rxevent *event;
    int cancelled = 0;

    if (!evp || !*evp)
        return 0;

    event = *evp;

    MUTEX_ENTER(&eventTree.lock);

    if (!event->handled) {
        /* We're a node on the red/black tree. If our list is non-empty,
         * then swap the first element in the list into our place,
         * promoting it to the list head */
        if (event->node.parent == NULL
            && eventTree.head.root != &event->node) {
            /* Not in the rbtree, therefore must be a list element */
            opr_queue_Remove(&event->q);
        } else {
            if (!opr_queue_IsEmpty(&event->q)) {
                struct rxevent *next;

                next = opr_queue_First(&event->q, struct rxevent, q);
                opr_queue_Remove(&next->q); /* Remove ourselves from list */
                if (event->q.prev == &event->q) {
                    next->q.prev = next->q.next = &next->q;
                } else {
                    next->q = event->q;
                    next->q.prev->next = &next->q;
                    next->q.next->prev = &next->q;
                }

                opr_rbtree_replace(&eventTree.head, &event->node,
                                   &next->node);

                if (eventTree.first == event)
                    eventTree.first = next;

            } else {
                if (eventTree.first == event)
                    resetFirst(event);

                opr_rbtree_remove(&eventTree.head, &event->node);
            }
        }
        event->handled = 1;
        rxevent_put(event); /* Dispose of eventTree reference */
        cancelled = 1;
    }

    MUTEX_EXIT(&eventTree.lock);

    *evp = NULL;
    rxevent_put(event); /* Dispose of caller's reference */

    return cancelled;
}

/* Process all events which have expired. If events remain, then the relative
 * time until the next event is returned in the parameter 'wait', and the
 * function returns 1. If no events currently remain, the function returns 0.
 *
 * If the current time is older than that of the last event processed, then we
 * assume that time has gone backwards (for example, due to a system time
 * reset). When this happens, all events in the current queue are rescheduled,
 * using the difference between the current time and the last event time as a
 * delta.
 */

int
rxevent_RaiseEvents(struct clock *wait)
{
    struct clock now;
    struct rxevent *event;
    int ret;

    clock_GetTime(&now);

    /* Check for time going backwards */
    if (clock_Lt(&now, &eventSchedule.last))
        adjustTimes();
    eventSchedule.last = now;

    MUTEX_ENTER(&eventTree.lock);
    /* Lock our event tree */
    while (eventTree.first != NULL
           && clock_Lt(&eventTree.first->eventTime, &now)) {

        /* Grab the next node, either in the event's list, or in the tree node
         * itself, and remove it from the event tree */
        event = eventTree.first;
        if (!opr_queue_IsEmpty(&event->q)) {
            event = opr_queue_Last(&event->q, struct rxevent, q);
            opr_queue_Remove(&event->q);
        } else {
            resetFirst(event);
            opr_rbtree_remove(&eventTree.head, &event->node);
        }
        event->handled = 1;
        MUTEX_EXIT(&eventTree.lock);

        /* Fire the event, then free the structure */
        event->func(event, event->arg, event->arg1, event->arg2);
        rxevent_put(event);

        MUTEX_ENTER(&eventTree.lock);
    }

    /* Figure out when we next need to be scheduled */
    if (eventTree.first != NULL) {
        *wait = eventSchedule.next = eventTree.first->eventTime;
        ret = eventSchedule.raised = 1;
        clock_Sub(wait, &now);
    } else {
        ret = eventSchedule.raised = 0;
    }

    MUTEX_EXIT(&eventTree.lock);

    return ret;
}

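/* Tear down the event package: destroy its locks and hand back the batch
 * allocations recorded by rxevent_alloc in freeEvents.mallocs. */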
void
shutdown_rxevent(void)
{
    struct malloclist *mrec, *nmrec;

    if (!initialised) {
        return;
    }
    MUTEX_DESTROY(&eventTree.lock);

#if !defined(AFS_AIX32_ENV) || !defined(KERNEL)
    MUTEX_DESTROY(&freeEvents.lock);
    mrec = freeEvents.mallocs;
    while (mrec) {
        nmrec = mrec->next;
        osi_Free(mrec->mem, mrec->size);
        osi_Free(mrec, sizeof(struct malloclist));
        mrec = nmrec;
    }
    mrec = NULL;
#endif
}