| /* | 
 |  * Copyright 2000-2003 Niels Provos <provos@citi.umich.edu> | 
 |  * All rights reserved. | 
 |  * | 
 |  * Redistribution and use in source and binary forms, with or without | 
 |  * modification, are permitted provided that the following conditions | 
 |  * are met: | 
 |  * 1. Redistributions of source code must retain the above copyright | 
 |  *    notice, this list of conditions and the following disclaimer. | 
 |  * 2. Redistributions in binary form must reproduce the above copyright | 
 |  *    notice, this list of conditions and the following disclaimer in the | 
 |  *    documentation and/or other materials provided with the distribution. | 
 |  * 3. The name of the author may not be used to endorse or promote products | 
 |  *    derived from this software without specific prior written permission. | 
 |  * | 
 |  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR | 
 |  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES | 
 |  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. | 
 |  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, | 
 |  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT | 
 |  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | 
 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | 
 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF | 
 |  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 
 |  */ | 
 | #ifdef HAVE_CONFIG_H | 
 | #include "config.h" | 
 | #endif | 
 |  | 
 | #include <stdint.h> | 
 | #include <sys/types.h> | 
 | #include <sys/resource.h> | 
 | #ifdef HAVE_SYS_TIME_H | 
 | #include <sys/time.h> | 
 | #else | 
 | #include <sys/_libevent_time.h> | 
 | #endif | 
 | #include <sys/queue.h> | 
 | #include <sys/epoll.h> | 
 | #include <signal.h> | 
 | #include <stdio.h> | 
 | #include <stdlib.h> | 
 | #include <string.h> | 
 | #include <unistd.h> | 
 | #include <errno.h> | 
 | #ifdef HAVE_FCNTL_H | 
 | #include <fcntl.h> | 
 | #endif | 
 |  | 
 | #include "event.h" | 
 | #include "event-internal.h" | 
 | #include "evsignal.h" | 
 | #include "log.h" | 
 |  | 
 | /* due to limitations in the epoll interface, we need to keep track of | 
 |  * all file descriptors outself. | 
 |  */ | 
 | struct evepoll { | 
 | 	struct event *evread; | 
 | 	struct event *evwrite; | 
 | }; | 
 |  | 
 | struct epollop { | 
 | 	struct evepoll *fds; | 
 | 	int nfds; | 
 | 	struct epoll_event *events; | 
 | 	int nevents; | 
 | 	int epfd; | 
 | }; | 
 |  | 
 | static void *epoll_init	(struct event_base *); | 
 | static int epoll_add	(void *, struct event *); | 
 | static int epoll_del	(void *, struct event *); | 
 | static int epoll_dispatch	(struct event_base *, void *, struct timeval *); | 
 | static void epoll_dealloc	(struct event_base *, void *); | 
 |  | 
 | const struct eventop epollops = { | 
 | 	"epoll", | 
 | 	epoll_init, | 
 | 	epoll_add, | 
 | 	epoll_del, | 
 | 	epoll_dispatch, | 
 | 	epoll_dealloc, | 
 | 	1 /* need reinit */ | 
 | }; | 
 |  | 
 | #ifdef HAVE_SETFD | 
 | #define FD_CLOSEONEXEC(x) do { \ | 
 |         if (fcntl(x, F_SETFD, 1) == -1) \ | 
 |                 event_warn("fcntl(%d, F_SETFD)", x); \ | 
 | } while (0) | 
 | #else | 
 | #define FD_CLOSEONEXEC(x) | 
 | #endif | 
 |  | 
 | /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout | 
 |  * values bigger than (LONG_MAX - 999ULL)/HZ.  HZ in the wild can be | 
 |  * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the | 
 |  * largest number of msec we can support here is 2147482.  Let's | 
 |  * round that down by 47 seconds. | 
 |  */ | 
 | #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) | 
 |  | 
 | #define INITIAL_NFILES 32 | 
 | #define INITIAL_NEVENTS 32 | 
 | #define MAX_NEVENTS 4096 | 
 |  | 
 | static void * | 
 | epoll_init(struct event_base *base) | 
 | { | 
 | 	int epfd; | 
 | 	struct epollop *epollop; | 
 |  | 
 | 	/* Disable epollueue when this environment variable is set */ | 
 | 	if (evutil_getenv("EVENT_NOEPOLL")) | 
 | 		return (NULL); | 
 |  | 
 | 	/* Initalize the kernel queue */ | 
 | 	if ((epfd = epoll_create(32000)) == -1) { | 
 | 		if (errno != ENOSYS) | 
 | 			event_warn("epoll_create"); | 
 | 		return (NULL); | 
 | 	} | 
 |  | 
 | 	FD_CLOSEONEXEC(epfd); | 
 |  | 
 | 	if (!(epollop = calloc(1, sizeof(struct epollop)))) | 
 | 		return (NULL); | 
 |  | 
 | 	epollop->epfd = epfd; | 
 |  | 
 | 	/* Initalize fields */ | 
 | 	epollop->events = malloc(INITIAL_NEVENTS * sizeof(struct epoll_event)); | 
 | 	if (epollop->events == NULL) { | 
 | 		free(epollop); | 
 | 		return (NULL); | 
 | 	} | 
 | 	epollop->nevents = INITIAL_NEVENTS; | 
 |  | 
 | 	epollop->fds = calloc(INITIAL_NFILES, sizeof(struct evepoll)); | 
 | 	if (epollop->fds == NULL) { | 
 | 		free(epollop->events); | 
 | 		free(epollop); | 
 | 		return (NULL); | 
 | 	} | 
 | 	epollop->nfds = INITIAL_NFILES; | 
 |  | 
 | 	evsignal_init(base); | 
 |  | 
 | 	return (epollop); | 
 | } | 
 |  | 
 | static int | 
 | epoll_recalc(struct event_base *base, void *arg, int max) | 
 | { | 
 | 	struct epollop *epollop = arg; | 
 |  | 
 | 	if (max >= epollop->nfds) { | 
 | 		struct evepoll *fds; | 
 | 		int nfds; | 
 |  | 
 | 		nfds = epollop->nfds; | 
 | 		while (nfds <= max) | 
 | 			nfds <<= 1; | 
 |  | 
 | 		fds = realloc(epollop->fds, nfds * sizeof(struct evepoll)); | 
 | 		if (fds == NULL) { | 
 | 			event_warn("realloc"); | 
 | 			return (-1); | 
 | 		} | 
 | 		epollop->fds = fds; | 
 | 		memset(fds + epollop->nfds, 0, | 
 | 		    (nfds - epollop->nfds) * sizeof(struct evepoll)); | 
 | 		epollop->nfds = nfds; | 
 | 	} | 
 |  | 
 | 	return (0); | 
 | } | 
 |  | 
 | static int | 
 | epoll_dispatch(struct event_base *base, void *arg, struct timeval *tv) | 
 | { | 
 | 	struct epollop *epollop = arg; | 
 | 	struct epoll_event *events = epollop->events; | 
 | 	struct evepoll *evep; | 
 | 	int i, res, timeout = -1; | 
 |  | 
 | 	if (tv != NULL) | 
 | 		timeout = tv->tv_sec * 1000 + (tv->tv_usec + 999) / 1000; | 
 |  | 
 | 	if (timeout > MAX_EPOLL_TIMEOUT_MSEC) { | 
 | 		/* Linux kernels can wait forever if the timeout is too big; | 
 | 		 * see comment on MAX_EPOLL_TIMEOUT_MSEC. */ | 
 | 		timeout = MAX_EPOLL_TIMEOUT_MSEC; | 
 | 	} | 
 |  | 
 | 	res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); | 
 |  | 
 | 	if (res == -1) { | 
 | 		if (errno != EINTR) { | 
 | 			event_warn("epoll_wait"); | 
 | 			return (-1); | 
 | 		} | 
 |  | 
 | 		evsignal_process(base); | 
 | 		return (0); | 
 | 	} else if (base->sig.evsignal_caught) { | 
 | 		evsignal_process(base); | 
 | 	} | 
 |  | 
 | 	event_debug(("%s: epoll_wait reports %d", __func__, res)); | 
 |  | 
 | 	for (i = 0; i < res; i++) { | 
 | 		int what = events[i].events; | 
 | 		struct event *evread = NULL, *evwrite = NULL; | 
 | 		int fd = events[i].data.fd; | 
 |  | 
 | 		if (fd < 0 || fd >= epollop->nfds) | 
 | 			continue; | 
 | 		evep = &epollop->fds[fd]; | 
 |  | 
 | 		if (what & (EPOLLHUP|EPOLLERR)) { | 
 | 			evread = evep->evread; | 
 | 			evwrite = evep->evwrite; | 
 | 		} else { | 
 | 			if (what & EPOLLIN) { | 
 | 				evread = evep->evread; | 
 | 			} | 
 |  | 
 | 			if (what & EPOLLOUT) { | 
 | 				evwrite = evep->evwrite; | 
 | 			} | 
 | 		} | 
 |  | 
 | 		if (!(evread||evwrite)) | 
 | 			continue; | 
 |  | 
 | 		if (evread != NULL) | 
 | 			event_active(evread, EV_READ, 1); | 
 | 		if (evwrite != NULL) | 
 | 			event_active(evwrite, EV_WRITE, 1); | 
 | 	} | 
 |  | 
 | 	if (res == epollop->nevents && epollop->nevents < MAX_NEVENTS) { | 
 | 		/* We used all of the event space this time.  We should | 
 | 		   be ready for more events next time. */ | 
 | 		int new_nevents = epollop->nevents * 2; | 
 | 		struct epoll_event *new_events; | 
 |  | 
 | 		new_events = realloc(epollop->events, | 
 | 		    new_nevents * sizeof(struct epoll_event)); | 
 | 		if (new_events) { | 
 | 			epollop->events = new_events; | 
 | 			epollop->nevents = new_nevents; | 
 | 		} | 
 | 	} | 
 |  | 
 | 	return (0); | 
 | } | 
 |  | 
 |  | 
 | static int | 
 | epoll_add(void *arg, struct event *ev) | 
 | { | 
 | 	struct epollop *epollop = arg; | 
 | 	struct epoll_event epev = {0, {0}}; | 
 | 	struct evepoll *evep; | 
 | 	int fd, op, events; | 
 |  | 
 | 	if (ev->ev_events & EV_SIGNAL) | 
 | 		return (evsignal_add(ev)); | 
 |  | 
 | 	fd = ev->ev_fd; | 
 | 	if (fd >= epollop->nfds) { | 
 | 		/* Extent the file descriptor array as necessary */ | 
 | 		if (epoll_recalc(ev->ev_base, epollop, fd) == -1) | 
 | 			return (-1); | 
 | 	} | 
 | 	evep = &epollop->fds[fd]; | 
 | 	op = EPOLL_CTL_ADD; | 
 | 	events = 0; | 
 | 	if (evep->evread != NULL) { | 
 | 		events |= EPOLLIN; | 
 | 		op = EPOLL_CTL_MOD; | 
 | 	} | 
 | 	if (evep->evwrite != NULL) { | 
 | 		events |= EPOLLOUT; | 
 | 		op = EPOLL_CTL_MOD; | 
 | 	} | 
 |  | 
 | 	if (ev->ev_events & EV_READ) | 
 | 		events |= EPOLLIN; | 
 | 	if (ev->ev_events & EV_WRITE) | 
 | 		events |= EPOLLOUT; | 
 |  | 
 | 	epev.data.fd = fd; | 
 | 	epev.events = events; | 
 | 	if (epoll_ctl(epollop->epfd, op, ev->ev_fd, &epev) == -1) | 
 | 			return (-1); | 
 |  | 
 | 	/* Update events responsible */ | 
 | 	if (ev->ev_events & EV_READ) | 
 | 		evep->evread = ev; | 
 | 	if (ev->ev_events & EV_WRITE) | 
 | 		evep->evwrite = ev; | 
 |  | 
 | 	return (0); | 
 | } | 
 |  | 
 | static int | 
 | epoll_del(void *arg, struct event *ev) | 
 | { | 
 | 	struct epollop *epollop = arg; | 
 | 	struct epoll_event epev = {0, {0}}; | 
 | 	struct evepoll *evep; | 
 | 	int fd, events, op; | 
 | 	int needwritedelete = 1, needreaddelete = 1; | 
 |  | 
 | 	if (ev->ev_events & EV_SIGNAL) | 
 | 		return (evsignal_del(ev)); | 
 |  | 
 | 	fd = ev->ev_fd; | 
 | 	if (fd >= epollop->nfds) | 
 | 		return (0); | 
 | 	evep = &epollop->fds[fd]; | 
 |  | 
 | 	op = EPOLL_CTL_DEL; | 
 | 	events = 0; | 
 |  | 
 | 	if (ev->ev_events & EV_READ) | 
 | 		events |= EPOLLIN; | 
 | 	if (ev->ev_events & EV_WRITE) | 
 | 		events |= EPOLLOUT; | 
 |  | 
 | 	if ((events & (EPOLLIN|EPOLLOUT)) != (EPOLLIN|EPOLLOUT)) { | 
 | 		if ((events & EPOLLIN) && evep->evwrite != NULL) { | 
 | 			needwritedelete = 0; | 
 | 			events = EPOLLOUT; | 
 | 			op = EPOLL_CTL_MOD; | 
 | 		} else if ((events & EPOLLOUT) && evep->evread != NULL) { | 
 | 			needreaddelete = 0; | 
 | 			events = EPOLLIN; | 
 | 			op = EPOLL_CTL_MOD; | 
 | 		} | 
 | 	} | 
 |  | 
 | 	epev.events = events; | 
 | 	epev.data.fd = fd; | 
 |  | 
 | 	if (needreaddelete) | 
 | 		evep->evread = NULL; | 
 | 	if (needwritedelete) | 
 | 		evep->evwrite = NULL; | 
 |  | 
 | 	if (epoll_ctl(epollop->epfd, op, fd, &epev) == -1) | 
 | 		return (-1); | 
 |  | 
 | 	return (0); | 
 | } | 
 |  | 
 | static void | 
 | epoll_dealloc(struct event_base *base, void *arg) | 
 | { | 
 | 	struct epollop *epollop = arg; | 
 |  | 
 | 	evsignal_dealloc(base); | 
 | 	if (epollop->fds) | 
 | 		free(epollop->fds); | 
 | 	if (epollop->events) | 
 | 		free(epollop->events); | 
 | 	if (epollop->epfd >= 0) | 
 | 		close(epollop->epfd); | 
 |  | 
 | 	memset(epollop, 0, sizeof(struct epollop)); | 
 | 	free(epollop); | 
 | } |