epoll patch - take 2

Shachar Shemesh wine-patches at shemesh.biz
Tue Aug 31 15:13:43 CDT 2004


I made it as simple as can be this time around. We just use glibc, and 
que sera sera. If anything doesn't support it, they will just have to 
make do with poll.

This patch also skips the need to seek the end of the pending free list. 
The last node is saved in a static var, so that linking the free list to 
the end of the pending free is done with one simple step.

ChangeLog:
Shachar Shemesh <winecode at shemesh.biz>
configure.ac
configure
include/config.h.in

    * Detect whether the sys/epoll.h header is available

server/fd.c

    * Change the type of the poll_users list to also keep track of the
      state of the node. Change the fd pointer to a union for better
      implementation of free list without casts.
    * Switch to relying solely on the free list to determine where our
      next free node is at.
    * Switch the freelist to use integer offsets into poll_users, rather
      than pointers. The list can now be realloced without being
      invalidated.
    * Implement epoll waiting over the same infrastructures

-- 
Shachar Shemesh
Lingnu Open Source Consulting ltd.
http://www.lingnu.com/

-------------- next part --------------
Index: configure
===================================================================
RCS file: /home/sun/sources/cvs/wine/configure,v
retrieving revision 1.592
diff -u -r1.592 configure
--- configure	24 Aug 2004 21:00:15 -0000	1.592
+++ configure	31 Aug 2004 19:44:58 -0000
@@ -6776,6 +6776,79 @@
 fi
 
 
+echo "$as_me:$LINENO: checking for epoll_create in -lepoll" >&5
+echo $ECHO_N "checking for epoll_create in -lepoll... $ECHO_C" >&6
+if test "${ac_cv_lib_epoll_epoll_create+set}" = set; then
+  echo $ECHO_N "(cached) $ECHO_C" >&6
+else
+  ac_check_lib_save_LIBS=$LIBS
+LIBS="-lepoll  $LIBS"
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+
+/* Override any gcc2 internal prototype to avoid an error.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+/* We use char because int might match the return type of a gcc2
+   builtin and then its argument prototype would still apply.  */
+char epoll_create ();
+int
+main ()
+{
+epoll_create ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
+  (eval $ac_link) 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } &&
+	 { ac_try='test -z "$ac_c_werror_flag"			 || test ! -s conftest.err'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; } &&
+	 { ac_try='test -s conftest$ac_exeext'
+  { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_lib_epoll_epoll_create=yes
+else
+  echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ac_cv_lib_epoll_epoll_create=no
+fi
+rm -f conftest.err conftest.$ac_objext \
+      conftest$ac_exeext conftest.$ac_ext
+LIBS=$ac_check_lib_save_LIBS
+fi
+echo "$as_me:$LINENO: result: $ac_cv_lib_epoll_epoll_create" >&5
+echo "${ECHO_T}$ac_cv_lib_epoll_epoll_create" >&6
+if test $ac_cv_lib_epoll_epoll_create = yes; then
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LIBEPOLL 1
+_ACEOF
+
+  LIBS="-lepoll $LIBS"
+
+fi
+
+
 echo "$as_me:$LINENO: checking for res_9_init in -lresolv" >&5
 echo $ECHO_N "checking for res_9_init in -lresolv... $ECHO_C" >&6
 if test "${ac_cv_lib_resolv_res_9_init+set}" = set; then
@@ -15960,6 +16033,7 @@
 
 
 
+
 for ac_func in \
 	_lwp_create \
 	_lwp_self \
@@ -15972,6 +16046,7 @@
 	_vsnprintf \
 	chsize \
 	clone \
+	epoll_create \
 	finite \
 	fpclass \
 	fstatfs \
@@ -16207,6 +16282,7 @@
 
 
 
+
 for ac_header in \
 	arpa/inet.h \
 	arpa/nameser.h \
@@ -16269,6 +16345,7 @@
 	sys/msg.h \
 	sys/param.h \
 	sys/poll.h \
+	sys/epoll.h \
 	sys/ptrace.h \
 	sys/reg.h \
 	sys/scsiio.h \
Index: configure.ac
===================================================================
RCS file: /home/sun/sources/cvs/wine/configure.ac,v
retrieving revision 1.303
diff -u -r1.303 configure.ac
--- configure.ac	24 Aug 2004 21:00:15 -0000	1.303
+++ configure.ac	31 Aug 2004 19:42:49 -0000
@@ -132,6 +132,8 @@
 AC_CHECK_LIB(xpg4,_xpg4_setrunelocale)
 dnl Check for -lpoll for Mac OS X/Darwin
 AC_CHECK_LIB(poll,poll)
+dnl Check for -lepoll
+AC_CHECK_LIB(epoll,epoll_create)
 dnl Check for -lresolv for Mac OS X/Darwin
 AC_CHECK_LIB(resolv,res_9_init)
 dnl Check for -lpthread
@@ -1044,6 +1046,7 @@
 	_vsnprintf \
 	chsize \
 	clone \
+	epoll_create \
 	finite \
 	fpclass \
 	fstatfs \
@@ -1156,6 +1159,7 @@
 	sys/msg.h \
 	sys/param.h \
 	sys/poll.h \
+	sys/epoll.h \
 	sys/ptrace.h \
 	sys/reg.h \
 	sys/scsiio.h \
Index: include/config.h.in
===================================================================
RCS file: /home/sun/sources/cvs/wine/include/config.h.in,v
retrieving revision 1.193
diff -u -r1.193 config.h.in
--- include/config.h.in	4 Aug 2004 19:10:26 -0000	1.193
+++ include/config.h.in	31 Aug 2004 19:44:59 -0000
@@ -80,6 +80,9 @@
 /* Define to 1 if you have the <elf.h> header file. */
 #undef HAVE_ELF_H
 
+/* Define to 1 if you have the `epoll_create' function. */
+#undef HAVE_EPOLL_CREATE
+
 /* Define to 1 if you have the `finite' function. */
 #undef HAVE_FINITE
 
@@ -248,6 +251,9 @@
 /* Define if you have the curses library (-lcurses) */
 #undef HAVE_LIBCURSES
 
+/* Define to 1 if you have the `epoll' library (-lepoll). */
+#undef HAVE_LIBEPOLL
+
 /* Define to 1 if you have the `i386' library (-li386). */
 #undef HAVE_LIBI386
 
@@ -608,6 +614,9 @@
 /* Define to 1 if you have the <sys/elf32.h> header file. */
 #undef HAVE_SYS_ELF32_H
 
+/* Define to 1 if you have the <sys/epoll.h> header file. */
+#undef HAVE_SYS_EPOLL_H
+
 /* Define to 1 if you have the <sys/errno.h> header file. */
 #undef HAVE_SYS_ERRNO_H
 
Index: server/fd.c
===================================================================
RCS file: /home/sun/sources/cvs/wine/server/fd.c,v
retrieving revision 1.22
diff -u -r1.22 fd.c
--- server/fd.c	18 Aug 2004 00:04:58 -0000	1.22
+++ server/fd.c	31 Aug 2004 19:51:51 -0000
@@ -2,6 +2,7 @@
  * Server-side file descriptor management
  *
  * Copyright (C) 2000, 2003 Alexandre Julliard
+ * Copyright (C) 2004 Shachar Shemesh for Lingnu Open Source Consulting ltd.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -33,6 +34,10 @@
 #ifdef HAVE_SYS_POLL_H
 #include <sys/poll.h>
 #endif
+#ifdef HAVE_SYS_EPOLL_H
+#include <stdint.h>
+#include <sys/epoll.h>
+#endif
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
@@ -255,51 +260,145 @@
 
 
 /****************************************************************/
-/* poll support */
+/* poll and epoll support */
 
-static struct fd **poll_users;              /* users array */
-static struct pollfd *pollfd;               /* poll fd array */
-static int nb_users;                        /* count of array entries actually in use */
+struct fdtrack {
+    enum { FDT_FREE, FDT_UNWATCHED, FDT_WATCHED, FDT_ERROR } state;
+    union {
+        int nextfree;
+        struct fd *fd;
+    } data;
+};
+
+/* Vars used by both */
 static int active_users;                    /* current number of active users */
 static int allocated_users;                 /* count of allocated entries in the array */
-static struct fd **freelist;                /* list of free entries in the array */
+static struct fdtrack *poll_users;          /* users array */
+static int freelist;                        /* list of free entries in the array. End of list marker is -1 */
+
+/* Vars used by poll */
+static struct pollfd *pollfd;               /* poll fd array */
+static int nb_users;                        /* Maximal index of array entry actually in use */
+
+/* Vars used by epoll */
+static int epoll_fd=-1;                     /* File descriptor for the epoll */
+static const int EPOLL_SIZE=1024;           /* "Maximal size" passed to epoll_create. This is an optimization, not a real maximum */
+static struct epoll_event *epoll_revents;   /* Array for event results */
+static int epoll_revents_size;              /* Number of elements in epoll_revents */
+static int in_epoll;                        /* Marks whether we are currently processing the epoll results */
+static int closependinglist;                /* List of users in close pending state. End of list marker is -1 */
+static int closependinglistlast;            /* Last element of the pending list */
+
+#if !defined(HAVE_SYS_EPOLL_H) || !defined(HAVE_EPOLL_CREATE)
+/* epoll not available - make sure that the server still compiles and runs without it */
+
+static inline int epoll_create( int size )
+{
+    errno=ENOSYS;
+    return -1;
+}
+
+static inline int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)
+{
+    errno=ENOSYS;
+    return -1;
+}
+
+static inline int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout)
+{
+    errno=ENOSYS;
+    return -1;
+}
+
+/* The following definitions are incorrect ON PURPOSE. They are only meant to make code that will never be reached pass
+ * compilation.
+ */
+struct epoll_event {
+    unsigned int events;
+    struct { unsigned int u32; } data;
+};
+
+enum { EPOLL_CTL_DEL, EPOLL_CTL_MOD, EPOLL_CTL_ADD };
+
+#endif
+
+/* Initialize a new area in the poll_users array to be free.
+ * Does not check input parameters - don't call with increment=0 */
+static void increase_poll_users_size( int increment )
+{
+    struct fdtrack *new_poll_users=realloc(poll_users, sizeof(struct fdtrack)*(increment+allocated_users));
+    struct pollfd *new_pollfd=(epoll_fd>=0)?NULL:realloc(pollfd, sizeof(struct pollfd)*(increment+allocated_users));
+
+    if( new_poll_users!=NULL && ( epoll_fd>=0 || new_pollfd!=NULL ))
+    {
+        int oldend=allocated_users;
+        int old_freelist=freelist;
+        
+        assert(old_freelist==-1); /* This funciton would never be called unless the free list is empty */
+        
+        poll_users=new_poll_users;
+        pollfd=new_pollfd;
+        allocated_users+=increment;
+        freelist=oldend;
+        
+        while( increment>0 )
+        {
+            poll_users[oldend].state=FDT_FREE;
+            poll_users[oldend].data.nextfree=oldend+1;
+
+            if( epoll_fd<0 )
+            {
+                pollfd[oldend].fd=-1;
+                pollfd[oldend].events=0;
+                pollfd[oldend].revents=0;
+            }
+
+            oldend++;
+            increment--;
+        }
+    
+        assert(poll_users[oldend-1].state==FDT_FREE); /* If this assert fails, someone probably called us with a zero increment */
+        poll_users[oldend-1].data.nextfree=old_freelist;
+    }
+    else if( new_poll_users!=NULL )
+    {
+        /* The first realloc succeeded, the second one failed. Make sure the list is consistant. Not much else you can do */
+        poll_users=new_poll_users;
+    }
+}
 
 /* add a user in the poll array and return its index, or -1 on failure */
 static int add_poll_user( struct fd *fd )
 {
     int ret;
-    if (freelist)
+
+    /* Find next free entry in list */
+    if( freelist==-1 )
     {
-        ret = freelist - poll_users;
-        freelist = (struct fd **)poll_users[ret];
+        increase_poll_users_size(allocated_users);
     }
-    else
+
+    ret=freelist;
+
+    if( ret!=-1 )
     {
-        if (nb_users == allocated_users)
-        {
-            struct fd **newusers;
-            struct pollfd *newpoll;
-            int new_count = allocated_users ? (allocated_users + allocated_users / 2) : 16;
-            if (!(newusers = realloc( poll_users, new_count * sizeof(*poll_users) ))) return -1;
-            if (!(newpoll = realloc( pollfd, new_count * sizeof(*pollfd) )))
-            {
-                if (allocated_users)
-                    poll_users = newusers;
-                else
-                    free( newusers );
-                return -1;
-            }
-            poll_users = newusers;
-            pollfd = newpoll;
-            allocated_users = new_count;
-        }
-        ret = nb_users++;
-    }
-    pollfd[ret].fd = -1;
-    pollfd[ret].events = 0;
-    pollfd[ret].revents = 0;
-    poll_users[ret] = fd;
-    active_users++;
+        assert(poll_users[ret].state==FDT_FREE);
+        assert(epoll_fd>=0 || pollfd[ret].fd==-1);
+        assert(epoll_fd>=0 || pollfd[ret].events==0);
+        assert(epoll_fd>=0 || pollfd[ret].revents==0);
+
+        /* Mark the node as allocated */
+        freelist=poll_users[ret].data.nextfree;
+        assert(poll_users[ret].state==FDT_FREE);
+        poll_users[ret].state=FDT_UNWATCHED;
+        poll_users[ret].data.fd = fd;
+        active_users++;
+
+        if( epoll_fd<0 && ret>=nb_users )
+            nb_users=ret+1;
+    }
+    /* If the above "if" fails, it means there is no memory for the new entry */
+
     return ret;
 }
 
@@ -307,17 +406,71 @@
 static void remove_poll_user( struct fd *fd, int user )
 {
     assert( user >= 0 );
-    assert( poll_users[user] == fd );
-    pollfd[user].fd = -1;
-    pollfd[user].events = 0;
-    pollfd[user].revents = 0;
-    poll_users[user] = (struct fd *)freelist;
-    freelist = &poll_users[user];
+    assert( poll_users[user].data.fd==fd && fd->poll_index==user );
+    assert( poll_users[user].state!=FDT_WATCHED || fd->unix_fd!=-1 );
+    /* If this fails it means that there is a file descriptor still being watched by us, that is no longer recorded in the fd struct.
+     * This is not a problem for poll, and is not a problem for epoll IF the file descriptor was indeed closed. */
+    
+    if( epoll_fd>=0 )
+    {
+        if( fd->unix_fd!=-1 )
+        {
+            struct epoll_event event; /* This struct needn't actually contain anything meaningful, as we are merely calling DEL */
+            epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd->unix_fd, &event);
+            /* XXX - May need to check return code, but there is really nothing to do on failure.
+             * In any case, failing with errno=ENOENT is ok.
+             */
+        }
+    }
+    else
+    {
+        pollfd[user].fd = -1;
+        pollfd[user].events = 0;
+        pollfd[user].revents = 0;
+    }
+    
+    /* Mark the entry for reuse */
+    poll_users[user].state=FDT_FREE;
+    if( in_epoll )
+    {
+        /* This junction prevents a race where an epoll returned several fds, and this fd is one of them. Before epoll gets to handling
+         * this fd, however, a previous fd asks to release this fd. As a result, epoll might call a handler on a released fd. epoll checks
+         * the state to avoid this race.
+         * However, it may still happen that the same epoll handling loop may release this fd, and then reallocate for a totally different
+         * purpose. This will foil the state checking safty measure.
+         *
+         * To prevent this, fds freed when epoll is still processing results are
+         * put on a pending list, and are only added to the free list after epoll finishes processing.
+         *
+         * Notice when changing this comment - there are two comments in main_loop that refer to this one. Make sure these are updated as well.
+         */
+        if( closependinglist==-1 )
+            closependinglistlast=user;
+        poll_users[user].data.nextfree=closependinglist;
+        closependinglist=user;
+    }
+    else
+    {
+        poll_users[user].data.nextfree=freelist;
+        freelist=user;
+    }
     active_users--;
 }
 
+/* initialize the fd related state */
+void init_fd()
+{
+    /* If this function fails, epoll_fd will be -1, which will signal
+     * the rest of the functions to use poll instead */
+    epoll_fd=epoll_create(EPOLL_SIZE);
+
+    freelist=-1;
+    closependinglist=-1;
+    closependinglistlast=-1;
+    increase_poll_users_size(32);
+}
 
-/* server main poll() loop */
+/* server main poll() or epoll() loop */
 void main_loop(void)
 {
     int ret;
@@ -341,16 +494,70 @@
             }
             if (!active_users) break;  /* last user removed by a timeout */
         }
-        ret = poll( pollfd, nb_users, diff );
-        if (ret > 0)
+        if( epoll_fd>=0 )
+        {
+            /* if the event handlers added a lot of new events, we may want to increase the returned
+             * events space */
+            if( epoll_revents_size<active_users )
+            {
+                /* If epoll_revents_size is below active_users, there is a (very slim) potential for starvation.
+                 * It is not, strictly speaking, an error though. As such, if the allocation fails, we just go on
+                 * with the smaller array size. */
+                struct epoll_event *new_revents=malloc(sizeof(struct epoll_event)*active_users);
+                if( new_revents!=NULL )
+                {
+                    free(epoll_revents);
+                    epoll_revents=new_revents;
+                    epoll_revents_size=active_users;
+                }
+            }
+            
+            in_epoll=1;
+            ret = epoll_wait(epoll_fd, epoll_revents, epoll_revents_size, diff );
+            if( ret>=0 )
+            {
+                while( (ret--)>0 )
+                {
+                    /* This "if" is preventing a race. The race is documented in a comment inside the "remove_poll_user" function */
+                    if( poll_users[epoll_revents[ret].data.u32].state==FDT_WATCHED )
+                        fd_poll_event(poll_users[epoll_revents[ret].data.u32].data.fd, epoll_revents[ret].events);
+                }
+            }
+            else
+            {
+                if( errno!=EINTR )
+                    perror("wineserver: main_loop epoll");
+                else if( debug_level )
+                    perror("wineserver: main_loop epoll (nothing to worry about)");
+            }
+            in_epoll=0;
+
+            /* Free the close pending users.
+             * Link the current free list to the end of the pending list, and transfer to the free list.
+             * The race prevented by these actions is documented in a comment in the "remove_poll_user" function
+             */
+            if(closependinglist!=-1)
+            {
+                assert(poll_users[closependinglistlast].state==FDT_FREE &&
+                        poll_users[closependinglistlast].data.nextfree==-1);
+
+                poll_users[closependinglistlast].data.nextfree=freelist;
+                freelist=closependinglist;
+                closependinglist=-1;
+            }
+        }
+        else
         {
+            /* No epoll support - use poll */
             int i;
-            for (i = 0; i < nb_users; i++)
+
+            ret = poll( pollfd, nb_users, diff );
+            for (i = 0; ret>0 && i < nb_users; i++)
             {
                 if (pollfd[i].revents)
                 {
-                    fd_poll_event( poll_users[i], pollfd[i].revents );
-                    if (!--ret) break;
+                    fd_poll_event( poll_users[i].data.fd, pollfd[i].revents );
+                    --ret;
                 }
             }
         }
@@ -835,18 +1042,48 @@
 /* set the events that select waits for on this fd */
 void set_fd_events( struct fd *fd, int events )
 {
-    int user = fd->poll_index;
-    assert( poll_users[user] == fd );
-    if (events == -1)  /* stop waiting on this fd completely */
+    int user=fd->poll_index;
+    assert(poll_users[user].state!=FDT_FREE);
+    assert( poll_users[user].data.fd == fd );
+
+    if( epoll_fd>=0 )
     {
-        pollfd[user].fd = -1;
-        pollfd[user].events = POLLERR;
-        pollfd[user].revents = 0;
+        struct epoll_event event;
+        event.events=events;
+        event.data.u32=user;
+
+        assert(fd->unix_fd>=0);
+
+        if( events != -1 && poll_users[user].state!=FDT_ERROR )
+        {
+            int epoll_res;
+
+            epoll_res=epoll_ctl( epoll_fd, (poll_users[user].state==FDT_WATCHED)?EPOLL_CTL_MOD:EPOLL_CTL_ADD, fd->unix_fd, &event );
+            /* FIXME: Need to handle failure here */
+            poll_users[user].state=FDT_WATCHED;
+        }
+        else if( events==-1 )
+        {
+            /* flag fd as erred */
+            epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd->unix_fd, &event );
+            poll_users[user].state=FDT_ERROR;
+        }
     }
-    else if (pollfd[user].fd != -1 || !pollfd[user].events)
+    else
     {
-        pollfd[user].fd = fd->unix_fd;
-        pollfd[user].events = events;
+        if (events == -1)  /* stop waiting on this fd completely */
+        {
+            pollfd[user].fd = -1;
+            pollfd[user].events = POLLERR;
+            pollfd[user].revents = 0;
+            poll_users[user].state=FDT_ERROR;
+        }
+        else if (poll_users[user].state!=FDT_ERROR)
+        {
+            pollfd[user].fd = fd->unix_fd;
+            pollfd[user].events = events;
+            poll_users[user].state = FDT_WATCHED;
+        }
     }
 }
 
Index: server/file.h
===================================================================
RCS file: /home/sun/sources/cvs/wine/server/file.h,v
retrieving revision 1.17
diff -u -r1.17 file.h
--- server/file.h	18 Aug 2004 00:04:58 -0000	1.17
+++ server/file.h	27 Aug 2004 11:42:58 -0000
@@ -44,6 +44,7 @@
 
 /* file descriptor functions */
 
+extern void init_fd();
 extern struct fd *alloc_fd( const struct fd_ops *fd_user_ops, struct object *user );
 extern struct fd *open_fd( struct fd *fd, const char *name, int flags, mode_t *mode,
                            unsigned int access, unsigned int sharing, unsigned int options );
Index: server/main.c
===================================================================
RCS file: /home/sun/sources/cvs/wine/server/main.c,v
retrieving revision 1.31
diff -u -r1.31 main.c
--- server/main.c	26 Mar 2003 01:32:18 -0000	1.31
+++ server/main.c	27 Aug 2004 11:42:58 -0000
@@ -122,6 +122,7 @@
     signal( SIGTERM, sigterm_handler );
     signal( SIGABRT, sigterm_handler );
 
+    init_fd();
     sock_init();
     open_master_socket();
     sync_namespace = create_namespace( 37, TRUE );


More information about the wine-patches mailing list