/*
 * Copyright (c) 2011 Your File System Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* A reimplementation of the rx_event handler using red/black trees
 *
 * The first rx_event implementation used a simple sorted queue of all
 * events, which led to O(n^2) performance, where n is the number of
 * outstanding events. This was found to scale poorly, so was replaced.
 *
 * The second implementation used a set of per-second buckets to store
 * events. Each bucket (referred to as an epoch in the code) stored all
 * of the events which expired in that second. However, on modern networks,
 * where round-trip times are measured in milliseconds, most connections
 * will have events expiring within the next second, so the problem recurs.
 *
 * This new implementation uses red/black trees to store a sorted list of
 * events. Red/black trees are guaranteed to have no worse than O(log N)
 * insertion, and are commonly used in timer applications.
 */
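
/* Usage sketch (illustrative only: myEventFunc, myArg and the half-second
 * delay are made up, and this assumes the clock_Addmsec() helper from the
 * rx clock package):
 *
 *	static void
 *	myEventFunc(struct rxevent *ev, void *arg, void *arg1, int arg2)
 *	{
 *	    ... handle the timeout ...
 *	}
 *
 *	struct clock now, when;
 *	struct rxevent *ev;
 *
 *	clock_GetTime(&now);
 *	when = now;
 *	clock_Addmsec(&when, 500);
 *	ev = rxevent_Post(&when, &now, myEventFunc, myArg, NULL, 0);
 *
 * The caller keeps the returned reference for as long as it may need to
 * cancel the event, then drops it with rxevent_Put(&ev), or implicitly
 * via rxevent_Cancel(&ev). */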

#include <afsconfig.h>
#include <afs/param.h>

#ifdef KERNEL
# include "afs/sysincludes.h"
# include "afsincludes.h"
#else
# include <roken.h>
#endif

#include <afs/opr.h>
#include <opr/queue.h>
#include <opr/rbtree.h>

#include "rx.h"
#include "rx_atomic.h"
#include "rx_call.h"
#include "rx_globals.h"

struct rxevent {
    struct opr_queue q;		/* Chain of events with the same expiry time */
    struct opr_rbtree_node node;	/* Our node in the event tree */
    struct clock eventTime;	/* Absolute time at which the event fires */
    struct rxevent *next;
    rx_atomic_t refcnt;		/* Freed when this drops to zero */
    int handled;		/* Set once the event has fired or been cancelled */
    void (*func)(struct rxevent *, void *, void *, int);
    void *arg;
    void *arg1;
    int arg2;
};

struct malloclist {
    void *mem;
    int size;
    struct malloclist *next;
};

/* Pool of free event structures, and the allocations backing them */
static struct {
    afs_kmutex_t lock;
    struct opr_queue list;
    struct malloclist *mallocs;
} freeEvents;

/* All pending events, sorted by expiry time; 'first' caches the earliest */
static struct {
    afs_kmutex_t lock;
    struct opr_rbtree head;
    struct rxevent *first;
} eventTree;

/* Scheduling state: when events last ran, when we next need to run, and
 * the wakeup hook registered by rxevent_Init() */
static struct {
    afs_kmutex_t lock;
    struct clock last;
    struct clock next;
    void (*func)(void);
    int raised;
} eventSchedule;

static int allocUnit = 10;
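
/* Lock ordering note (inferred from the code below): eventTree.lock may be
 * held while freeEvents.lock is taken (rxevent_Cancel() calls rxevent_put()
 * with the tree lock held), so freeEvents.lock must never be held across
 * anything which acquires eventTree.lock. */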

static struct rxevent *
rxevent_alloc(void) {
    struct rxevent *evlist;
    struct rxevent *ev;
    struct malloclist *mrec;
    int i;

    MUTEX_ENTER(&freeEvents.lock);
    if (opr_queue_IsEmpty(&freeEvents.list)) {
	MUTEX_EXIT(&freeEvents.lock);

	/* Allocate a new block of events, without holding the lock across
	 * the allocation */
#if defined(AFS_AIX32_ENV) && defined(KERNEL)
	ev = rxi_Alloc(sizeof(struct rxevent));
#else
	evlist = osi_Alloc(sizeof(struct rxevent) * allocUnit);
	mrec = osi_Alloc(sizeof(struct malloclist));

	mrec->mem = evlist;
	mrec->size = sizeof(struct rxevent) * allocUnit;

	/* Keep the first event for ourselves, chain the rest onto the free
	 * list, and record the allocation so it can be freed at shutdown */
	MUTEX_ENTER(&freeEvents.lock);
	for (i = 1; i < allocUnit; i++) {
	    opr_queue_Append(&freeEvents.list, &evlist[i].q);
	}
	mrec->next = freeEvents.mallocs;
	freeEvents.mallocs = mrec;
	MUTEX_EXIT(&freeEvents.lock);

	ev = &evlist[0];
#endif
    } else {
	ev = opr_queue_First(&freeEvents.list, struct rxevent, q);
	opr_queue_Remove(&ev->q);
	MUTEX_EXIT(&freeEvents.lock);
    }

    memset(ev, 0, sizeof(struct rxevent));
    rx_atomic_set(&ev->refcnt, 1);

    return ev;
}

/* Return an event to the free pool */
static void
rxevent_free(struct rxevent *ev) {
    MUTEX_ENTER(&freeEvents.lock);
    opr_queue_Prepend(&freeEvents.list, &ev->q);
    MUTEX_EXIT(&freeEvents.lock);
}

/* Drop a reference; the event is freed when the last reference goes */
static_inline void
rxevent_put(struct rxevent *ev) {
    if (rx_atomic_dec_and_read(&ev->refcnt) == 0) {
	rxevent_free(ev);
    }
}

void
rxevent_Put(struct rxevent **ev)
{
    rxevent_put(*ev);
    *ev = NULL;
}

/* Take an additional reference on an event */
static_inline struct rxevent *
rxevent_get(struct rxevent *ev) {
    rx_atomic_inc(&ev->refcnt);
    return ev;
}

struct rxevent *
rxevent_Get(struct rxevent *ev) {
    return rxevent_get(ev);
}

/* Called if the time now is older than the last time we recorded running an
 * event. This test catches machines where the system time has been set
 * backwards, and avoids RX completely stalling when timers fail to fire.
 *
 * Take the difference between now and the last event time, and subtract that
 * from the timing of every event on the system. This does a relatively slow
 * walk of the complete eventTree, but time-travel will hopefully be a pretty
 * rare occurrence.
 *
 * This can only safely be called from the event thread, as it plays with the
 * schedule directly.
 */
static void
adjustTimes(void)
{
    struct opr_rbtree_node *node;
    struct clock adjTime, now;

    MUTEX_ENTER(&eventTree.lock);
    /* Time adjustment is expensive, make absolutely certain that we have
     * to do it, by getting an up to date time to base our decision on
     * once we've acquired the relevant locks.
     */
    clock_GetTime(&now);
    if (!clock_Lt(&now, &eventSchedule.last))
	goto out;

    adjTime = eventSchedule.last;
    clock_Zero(&eventSchedule.last);

    clock_Sub(&adjTime, &now);

    /* If there are no events in the tree, then there's nothing to adjust */
    if (eventTree.first == NULL)
	goto out;

    node = opr_rbtree_first(&eventTree.head);
    while (node) {
	struct rxevent *event = opr_containerof(node, struct rxevent, node);

	clock_Sub(&event->eventTime, &adjTime);
	node = opr_rbtree_next(node);
    }
    eventSchedule.next = eventTree.first->eventTime;

out:
    MUTEX_EXIT(&eventTree.lock);
}
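
/* Worked example: if eventSchedule.last is 100.500000s and the clock now
 * reads 98.200000s, the clock has jumped back by 2.300000s. adjustTimes()
 * computes adjTime = 2.3s and pulls every pending event's eventTime back
 * by that amount, so events keep their relative spacing and still fire in
 * the intended order. */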

static int initialised = 0;

void
rxevent_Init(int nEvents, void (*scheduler)(void))
{
    if (initialised)
	return;

    initialised = 1;

    clock_Init();
    MUTEX_INIT(&eventTree.lock, "event tree lock", MUTEX_DEFAULT, 0);
    opr_rbtree_init(&eventTree.head);

    MUTEX_INIT(&freeEvents.lock, "free events lock", MUTEX_DEFAULT, 0);
    opr_queue_Init(&freeEvents.list);
    freeEvents.mallocs = NULL;

    if (nEvents)
	allocUnit = nEvents;

    clock_Zero(&eventSchedule.next);
    clock_Zero(&eventSchedule.last);
    eventSchedule.raised = 0;
    eventSchedule.func = scheduler;
}

struct rxevent *
rxevent_Post(struct clock *when, struct clock *now,
	     void (*func) (struct rxevent *, void *, void *, int),
	     void *arg, void *arg1, int arg2)
{
    struct rxevent *ev, *event;
    struct opr_rbtree_node **childptr, *parent = NULL;

    ev = rxevent_alloc();
    ev->eventTime = *when;
    ev->func = func;
    ev->arg = arg;
    ev->arg1 = arg1;
    ev->arg2 = arg2;

    if (clock_Lt(now, &eventSchedule.last))
	adjustTimes();

    MUTEX_ENTER(&eventTree.lock);

    /* Work out where in the tree we'll be storing this */
    childptr = &eventTree.head.root;

    while (*childptr) {
	event = opr_containerof((*childptr), struct rxevent, node);

	parent = *childptr;
	if (clock_Lt(when, &event->eventTime))
	    childptr = &(*childptr)->left;
	else if (clock_Gt(when, &event->eventTime))
	    childptr = &(*childptr)->right;
	else {
	    /* An event already exists at exactly this time; chain the new
	     * event onto its queue rather than adding a duplicate node */
	    opr_queue_Append(&event->q, &ev->q);
	    goto out;
	}
    }
    opr_queue_Init(&ev->q);
    opr_rbtree_insert(&eventTree.head, parent, childptr, &ev->node);

    /* If the new event is now the earliest, poke the scheduler so that it
     * recomputes its wakeup time */
    if (eventTree.first == NULL ||
	clock_Lt(when, &(eventTree.first->eventTime))) {
	eventTree.first = ev;
	eventSchedule.raised = 1;
	clock_Zero(&eventSchedule.next);
	MUTEX_EXIT(&eventTree.lock);
	if (eventSchedule.func != NULL)
	    (*eventSchedule.func)();
	return rxevent_get(ev);
    }

out:
    MUTEX_EXIT(&eventTree.lock);
    return rxevent_get(ev);
}
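
/* Reference counting contract (as implemented above): rxevent_alloc() sets
 * refcnt to 1, which becomes the event tree's reference, and the
 * rxevent_get() on return takes a second reference for the caller. The
 * tree's reference is dropped when the event fires or is cancelled; the
 * caller's reference must be dropped with rxevent_Put(), or implicitly by
 * rxevent_Cancel(). */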

/* We're going to remove ev from the tree, so set the first pointer to the
 * next event after it */
static_inline void
resetFirst(struct rxevent *ev)
{
    struct opr_rbtree_node *next = opr_rbtree_next(&ev->node);
    if (next)
	eventTree.first = opr_containerof(next, struct rxevent, node);
    else
	eventTree.first = NULL;
}

/*!
 * Cancel an event
 *
 * Cancels the event pointed to by evp. Returns true if the event has
 * been successfully cancelled, or false if the event has already fired.
 */

int
rxevent_Cancel(struct rxevent **evp)
{
    struct rxevent *event;
    int cancelled = 0;

    if (!evp || !*evp)
	return 0;

    event = *evp;

    MUTEX_ENTER(&eventTree.lock);

    if (!event->handled) {
	/* If we're a node on the red/black tree and our list is non-empty,
	 * then swap the first element in the list into our place,
	 * promoting it to the list head */
	if (event->node.parent == NULL
	    && eventTree.head.root != &event->node) {
	    /* Not in the rbtree, therefore must be a list element */
	    opr_queue_Remove(&event->q);
	} else {
	    if (!opr_queue_IsEmpty(&event->q)) {
		struct rxevent *next;

		next = opr_queue_First(&event->q, struct rxevent, q);
		opr_queue_Remove(&next->q); /* Detach 'next' from the list */
		if (event->q.prev == &event->q) {
		    /* 'next' was the only list element; its list is now
		     * empty */
		    next->q.prev = next->q.next = &next->q;
		} else {
		    /* Splice the rest of the list across to 'next' */
		    next->q = event->q;
		    next->q.prev->next = &next->q;
		    next->q.next->prev = &next->q;
		}

		opr_rbtree_replace(&eventTree.head, &event->node,
				   &next->node);

		if (eventTree.first == event)
		    eventTree.first = next;

	    } else {
		if (eventTree.first == event)
		    resetFirst(event);

		opr_rbtree_remove(&eventTree.head, &event->node);
	    }
	}
	event->handled = 1;
	rxevent_put(event);	/* Dispose of the event tree's reference */
	cancelled = 1;
    }

    MUTEX_EXIT(&eventTree.lock);

    *evp = NULL;
    rxevent_put(event);		/* Dispose of the caller's reference */

    return cancelled;
}
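
/* Typical caller pattern (a sketch; 'resendEvent' is an illustrative field
 * name, not part of this API): store the reference returned by
 * rxevent_Post() and cancel through it. rxevent_Cancel() NULLs the pointer
 * and drops the caller's reference whether or not the cancel succeeds, so
 * no separate rxevent_Put() is needed:
 *
 *	if (!rxevent_Cancel(&call->resendEvent)) {
 *	    ... too late: the event has already fired or is firing ...
 *	}
 */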

/* Process all events which have expired. If events remain, then the relative
 * time until the next event is returned in the parameter 'wait', and the
 * function returns 1. If no events currently remain, the function returns 0.
 *
 * If the current time is older than that of the last event processed, then we
 * assume that time has gone backwards (for example, due to a system time
 * reset). When this happens, all events in the current queue are rescheduled,
 * using the difference between the current time and the last event time as a
 * delta.
 */

int
rxevent_RaiseEvents(struct clock *wait)
{
    struct clock now;
    struct rxevent *event;
    int ret;

    clock_GetTime(&now);

    /* Check for time going backwards */
    if (clock_Lt(&now, &eventSchedule.last))
	adjustTimes();
    eventSchedule.last = now;

    MUTEX_ENTER(&eventTree.lock);
    /* Process expired events, dropping the lock while each handler runs */
    while (eventTree.first != NULL
	   && clock_Lt(&eventTree.first->eventTime, &now)) {

	/* Grab the next node, either in the event's list, or in the tree node
	 * itself, and remove it from the event tree */
	event = eventTree.first;
	if (!opr_queue_IsEmpty(&event->q)) {
	    /* Fire chained events before the tree node itself, so that the
	     * node (and its queue) stays in the tree until the list drains */
	    event = opr_queue_Last(&event->q, struct rxevent, q);
	    opr_queue_Remove(&event->q);
	} else {
	    resetFirst(event);
	    opr_rbtree_remove(&eventTree.head, &event->node);
	}
	event->handled = 1;
	MUTEX_EXIT(&eventTree.lock);

	/* Fire the event, then drop the event tree's reference */
	event->func(event, event->arg, event->arg1, event->arg2);
	rxevent_put(event);

	MUTEX_ENTER(&eventTree.lock);
    }

    /* Figure out when we next need to be scheduled */
    if (eventTree.first != NULL) {
	*wait = eventSchedule.next = eventTree.first->eventTime;
	ret = eventSchedule.raised = 1;
	clock_Sub(wait, &now);
    } else {
	ret = eventSchedule.raised = 0;
    }

    MUTEX_EXIT(&eventTree.lock);

    return ret;
}
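
/* A minimal driver loop (sketch only; the real event thread also wakes up
 * when the scheduler hook registered with rxevent_Init() is called):
 *
 *	struct clock wait;
 *
 *	while (running) {
 *	    if (rxevent_RaiseEvents(&wait)) {
 *	        ... sleep for at most 'wait', or until the hook fires ...
 *	    } else {
 *	        ... sleep until the scheduler hook fires ...
 *	    }
 *	}
 */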

void
shutdown_rxevent(void)
{
    struct malloclist *mrec, *nmrec;

    if (!initialised) {
	return;
    }
    MUTEX_DESTROY(&eventTree.lock);

#if !defined(AFS_AIX32_ENV) || !defined(KERNEL)
    MUTEX_DESTROY(&freeEvents.lock);
    mrec = freeEvents.mallocs;
    while (mrec) {
	nmrec = mrec->next;
	osi_Free(mrec->mem, mrec->size);
	osi_Free(mrec, sizeof(struct malloclist));
	mrec = nmrec;
    }
    mrec = NULL;
#endif
}