Almost final epoll patch

Shachar Shemesh wine-devel at shemesh.biz
Fri Aug 27 07:54:05 CDT 2004


Hi list,

Attached is the epoll patch. All problems described in my previous patch 
sent should be solved here. The only reason I'm not sending this to 
wine-patches instead of here is that I have not yet had a chance to 
check it on my client's huge system, and so I cannot say for sure that 
all problems are truly and utterly resolved.

In any case, I would welcome any and all comments about it. This patch 
is against CVS head, and should compile even if epoll.h is not 
available, as well as on old glibc's where epoll.h required stdint.h as 
a prerequisite, and the library's implementation did not have glibc.

             Shachar

-- 
Shachar Shemesh
Lingnu Open Source Consulting ltd.
http://www.lingnu.com/

-------------- next part --------------
Index: configure
===================================================================
RCS file: /home/sun/sources/cvs/wine/configure,v
retrieving revision 1.592
diff -u -r1.592 configure
--- configure	24 Aug 2004 21:00:15 -0000	1.592
+++ configure	27 Aug 2004 11:43:39 -0000
@@ -16207,6 +16207,7 @@
 
 
 
+
 for ac_header in \
 	arpa/inet.h \
 	arpa/nameser.h \
@@ -16269,6 +16270,7 @@
 	sys/msg.h \
 	sys/param.h \
 	sys/poll.h \
+	sys/epoll.h \
 	sys/ptrace.h \
 	sys/reg.h \
 	sys/scsiio.h \
Index: configure.ac
===================================================================
RCS file: /home/sun/sources/cvs/wine/configure.ac,v
retrieving revision 1.303
diff -u -r1.303 configure.ac
--- configure.ac	24 Aug 2004 21:00:15 -0000	1.303
+++ configure.ac	27 Aug 2004 11:42:58 -0000
@@ -1156,6 +1156,7 @@
 	sys/msg.h \
 	sys/param.h \
 	sys/poll.h \
+	sys/epoll.h \
 	sys/ptrace.h \
 	sys/reg.h \
 	sys/scsiio.h \
Index: include/config.h.in
===================================================================
RCS file: /home/sun/sources/cvs/wine/include/config.h.in,v
retrieving revision 1.193
diff -u -r1.193 config.h.in
--- include/config.h.in	4 Aug 2004 19:10:26 -0000	1.193
+++ include/config.h.in	27 Aug 2004 11:42:58 -0000
@@ -608,6 +608,9 @@
 /* Define to 1 if you have the <sys/elf32.h> header file. */
 #undef HAVE_SYS_ELF32_H
 
+/* Define to 1 if you have the <sys/epoll.h> header file. */
+#undef HAVE_SYS_EPOLL_H
+
 /* Define to 1 if you have the <sys/errno.h> header file. */
 #undef HAVE_SYS_ERRNO_H
 
Index: server/fd.c
===================================================================
RCS file: /home/sun/sources/cvs/wine/server/fd.c,v
retrieving revision 1.22
diff -u -r1.22 fd.c
--- server/fd.c	18 Aug 2004 00:04:58 -0000	1.22
+++ server/fd.c	27 Aug 2004 11:47:12 -0000
@@ -33,6 +33,13 @@
 #ifdef HAVE_SYS_POLL_H
 #include <sys/poll.h>
 #endif
+#ifdef HAVE_SYS_EPOLL_H
+#include <stdint.h>
+#include <sys/epoll.h>
+#endif
+#ifdef HAVE_SYS_SYSCALL_H
+#include <sys/syscall.h>
+#endif
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>
@@ -255,51 +262,173 @@
 
 
 /****************************************************************/
-/* poll support */
+/* poll and epoll support */
 
-static struct fd **poll_users;              /* users array */
-static struct pollfd *pollfd;               /* poll fd array */
-static int nb_users;                        /* count of array entries actually in use */
+struct fdtrack {
+    enum { FDT_FREE, FDT_UNWATCHED, FDT_WATCHED, FDT_ERROR } state;
+    union {
+        int nextfree;
+        struct fd *fd;
+    } data;
+};
+
+/* Vars used by both */
 static int active_users;                    /* current number of active users */
 static int allocated_users;                 /* count of allocated entries in the array */
-static struct fd **freelist;                /* list of free entries in the array */
+static struct fdtrack *poll_users;          /* users array */
+static int freelist;                        /* list of free entries in the array. End of list marker is -1 */
+
+/* Vars used by poll */
+static struct pollfd *pollfd;               /* poll fd array */
+static int nb_users;                        /* Maximal index of array entry actually in use */
+
+/* Vars used by epoll */
+static int epoll_fd=-1;                     /* File descriptor for the epoll */
+static const int EPOLL_SIZE=1024;           /* "Maximal size" passed to epoll_create. This is an optimization, not a real maximum */
+static struct epoll_event *epoll_revents;   /* Array for event results */
+static int epoll_revents_size;              /* Number of elements in epoll_revents */
+static int in_epoll;                        /* Marks whether we are currently processing the epoll results */
+static int closependinglist;                /* List of users in close pending state. End of list marker is -1 */
+
+#if defined(HAVE_SYS_EPOLL_H) && (HAVE_SYS_EPOLL_H)
+/* To prevent wine from not loading on non-epoll supporting platforms,
+ * we need to dynamically call the epoll functions */
+#ifndef __NR_epoll_create
+#define __NR_epoll_create	254
+#endif
+
+#ifndef __NR_epoll_ctl
+#define __NR_epoll_ctl		255
+#endif
+
+#ifndef __NR_epoll_wait
+#define __NR_epoll_wait		256
+#endif
+
+static inline int _epoll_create( int size )
+{
+    return syscall(__NR_epoll_create, size);
+}
+
+static inline int _epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)
+{
+    return syscall(__NR_epoll_ctl, epfd, op, fd, event);
+}
+
+static inline int _epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout)
+{
+    return syscall(__NR_epoll_wait, epfd, events, maxevents, timeout);
+}
+#else
+/* epoll not available - make sure that the server still compiles and runs without it */
+
+static inline int _epoll_create( int size )
+{
+    errno=ENOSYS;
+    return -1;
+}
+
+static inline int _epoll_ctl(int epfd, int op, int fd, struct epoll_event *event)
+{
+    errno=ENOSYS;
+    return -1;
+}
+
+static inline int _epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout)
+{
+    errno=ENOSYS;
+    return -1;
+}
+
+/* The following definitions are incorrect ON PURPOSE. They are only meant to make code that will never be reached pass
+ * compilation.
+ */
+struct epoll_event {
+    unsigned int events;
+    struct { unsigned int u32; } data;
+};
+
+enum { EPOLL_CTL_DEL, EPOLL_CTL_MOD, EPOLL_CTL_ADD };
+
+#endif
+
+/* Initialize a new area in the poll_users array to be free.
+ * Does not check input parameters - don't call with increment=0 */
+static void increase_poll_users_size( int increment )
+{
+    struct fdtrack *new_poll_users=realloc(poll_users, sizeof(struct fdtrack)*(increment+allocated_users));
+    struct pollfd *new_pollfd=(epoll_fd>=0)?NULL:realloc(pollfd, sizeof(struct pollfd)*(increment+allocated_users));
+
+    if( new_poll_users!=NULL && ( epoll_fd>=0 || new_pollfd!=NULL ))
+    {
+        int oldend=allocated_users;
+        int old_freelist=freelist;
+        
+        assert(old_freelist==-1); /* This funciton would never be called unless the free list is empty */
+        
+        poll_users=new_poll_users;
+        pollfd=new_pollfd;
+        allocated_users+=increment;
+        freelist=oldend;
+        
+        while( increment>0 )
+        {
+            poll_users[oldend].state=FDT_FREE;
+            poll_users[oldend].data.nextfree=oldend+1;
+
+            if( epoll_fd<0 )
+            {
+                pollfd[oldend].fd=-1;
+                pollfd[oldend].events=0;
+                pollfd[oldend].revents=0;
+            }
+
+            oldend++;
+            increment--;
+        }
+    
+        assert(poll_users[oldend-1].state==FDT_FREE); /* If this assert fails, someone probably called us with a zero increment */
+        poll_users[oldend-1].data.nextfree=old_freelist;
+    }
+    else if( new_poll_users!=NULL )
+    {
+        /* The first realloc succeeded, the second one failed. Make sure the list is consistant. Not much else you can do */
+        poll_users=new_poll_users;
+    }
+}
 
 /* add a user in the poll array and return its index, or -1 on failure */
 static int add_poll_user( struct fd *fd )
 {
     int ret;
-    if (freelist)
+
+    /* Find next free entry in list */
+    if( freelist==-1 )
     {
-        ret = freelist - poll_users;
-        freelist = (struct fd **)poll_users[ret];
+        increase_poll_users_size(allocated_users);
     }
-    else
+
+    ret=freelist;
+
+    if( ret!=-1 )
     {
-        if (nb_users == allocated_users)
-        {
-            struct fd **newusers;
-            struct pollfd *newpoll;
-            int new_count = allocated_users ? (allocated_users + allocated_users / 2) : 16;
-            if (!(newusers = realloc( poll_users, new_count * sizeof(*poll_users) ))) return -1;
-            if (!(newpoll = realloc( pollfd, new_count * sizeof(*pollfd) )))
-            {
-                if (allocated_users)
-                    poll_users = newusers;
-                else
-                    free( newusers );
-                return -1;
-            }
-            poll_users = newusers;
-            pollfd = newpoll;
-            allocated_users = new_count;
-        }
-        ret = nb_users++;
-    }
-    pollfd[ret].fd = -1;
-    pollfd[ret].events = 0;
-    pollfd[ret].revents = 0;
-    poll_users[ret] = fd;
-    active_users++;
+        assert(poll_users[ret].state==FDT_FREE);
+        assert(epoll_fd>=0 || pollfd[ret].fd==-1);
+        assert(epoll_fd>=0 || pollfd[ret].events==0);
+        assert(epoll_fd>=0 || pollfd[ret].revents==0);
+
+        /* Mark the node as allocated */
+        freelist=poll_users[ret].data.nextfree;
+        assert(poll_users[ret].state==FDT_FREE);
+        poll_users[ret].state=FDT_UNWATCHED;
+        poll_users[ret].data.fd = fd;
+        active_users++;
+
+        if( epoll_fd<0 && ret>=nb_users )
+            nb_users=ret+1;
+    }
+    /* If the above "if" fails, it means there is no memory for the new entry */
+
     return ret;
 }
 
@@ -307,17 +436,68 @@
 static void remove_poll_user( struct fd *fd, int user )
 {
     assert( user >= 0 );
-    assert( poll_users[user] == fd );
-    pollfd[user].fd = -1;
-    pollfd[user].events = 0;
-    pollfd[user].revents = 0;
-    poll_users[user] = (struct fd *)freelist;
-    freelist = &poll_users[user];
+    assert( poll_users[user].data.fd==fd && fd->poll_index==user );
+    assert( poll_users[user].state!=FDT_WATCHED || fd->unix_fd!=-1 );
+    /* If this fails it means that there is a file descriptor still being watched by us, that is no longer recorded in the fd struct.
+     * This is not a problem for poll, and is not a problem for epoll IF the file descriptor was indeed closed. */
+    
+    if( epoll_fd>=0 )
+    {
+        if( fd->unix_fd!=-1 )
+        {
+            struct epoll_event event; /* This struct needn't actually contain anything meaningful, as we are merely calling DEL */
+            _epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd->unix_fd, &event);
+            /* XXX - May need to check return code, but there is really nothing to do on failure.
+             * In any case, failing with errno=ENOENT is ok.
+             */
+        }
+    }
+    else
+    {
+        pollfd[user].fd = -1;
+        pollfd[user].events = 0;
+        pollfd[user].revents = 0;
+    }
+    
+    /* Mark the entry for reuse */
+    poll_users[user].state=FDT_FREE;
+    if( in_epoll )
+    {
+        /* This junction prevents a race where an epoll returned several fds, and this fd is one of them. Before epoll gets to handling
+         * this fd, however, a previous fd asks to release this fd. As a result, epoll might call a handler on a released fd. epoll checks
+         * the state to avoid this race.
+         * However, it may still happen that the same epoll handling loop may release this fd, and then reallocate for a totally different
+         * purpose. This will foil the state checking safty measure.
+         *
+         * To prevent this, fds freed when epoll is still processing results are
+         * put on a pending list, and are only added to the free list after epoll finishes processing.
+         *
+         * Notice when changing this comment - there are two comments in main_loop that refer to this one. Make sure these are updated as well.
+         */
+        poll_users[user].data.nextfree=closependinglist;
+        closependinglist=user;
+    }
+    else
+    {
+        poll_users[user].data.nextfree=freelist;
+        freelist=user;
+    }
     active_users--;
 }
 
+/* initialize the fd related state */
+void init_fd()
+{
+    /* If this function fails, epoll_fd will be -1, which will signal
+     * the rest of the functions to use poll instead */
+    epoll_fd=_epoll_create(EPOLL_SIZE);
+
+    freelist=-1;
+    closependinglist=-1;
+    increase_poll_users_size(32);
+}
 
-/* server main poll() loop */
+/* server main poll() or epoll() loop */
 void main_loop(void)
 {
     int ret;
@@ -341,16 +521,79 @@
             }
             if (!active_users) break;  /* last user removed by a timeout */
         }
-        ret = poll( pollfd, nb_users, diff );
-        if (ret > 0)
+        if( epoll_fd>=0 )
         {
+            /* if the event handlers added a lot of new events, we may want to increase the returned
+             * events space */
+            if( epoll_revents_size<active_users )
+            {
+                /* If epoll_revents_size is below active_users, there is a (very slim) potential for starvation.
+                 * It is not, strictly speaking, an error though. As such, if the allocation fails, we just go on
+                 * with the smaller array size. */
+                struct epoll_event *new_revents=malloc(sizeof(struct epoll_event)*active_users);
+                if( new_revents!=NULL )
+                {
+                    free(epoll_revents);
+                    epoll_revents=new_revents;
+                    epoll_revents_size=active_users;
+                }
+            }
+            
+            in_epoll=1;
+            ret = _epoll_wait(epoll_fd, epoll_revents, epoll_revents_size, diff );
+            if( ret>=0 )
+            {
+                while( (ret--)>0 )
+                {
+                    /* This "if" is preventing a race. The race is documented in a comment inside the "remove_poll_user" function */
+                    if( poll_users[epoll_revents[ret].data.u32].state==FDT_WATCHED )
+                        fd_poll_event(poll_users[epoll_revents[ret].data.u32].data.fd, epoll_revents[ret].events);
+                }
+            }
+            else
+            {
+                if( errno!=EINTR )
+                    perror("wineserver: main_loop epoll");
+                else if( debug_level )
+                    perror("wineserver: main_loop epoll (nothing to worry about)");
+            }
+            in_epoll=0;
+
+            /* Free the close pending users.
+             * Link the current free list to the end of the pending list, and transfer to the free list.
+             * The race prevented by these actions is documented in a comment in the "remove_poll_user" function
+             */
+            if(closependinglist!=-1)
+            {
+                int nextfree, currentfree;
+                
+                currentfree=closependinglist;
+                
+                assert(poll_users[currentfree].state==FDT_FREE);
+                
+                while( (nextfree=poll_users[currentfree].data.nextfree)!=-1 )
+                {
+                    assert(poll_users[nextfree].state==FDT_FREE);
+                    currentfree=nextfree;
+                }
+                
+                poll_users[currentfree].data.nextfree=freelist;
+                freelist=closependinglist;
+                closependinglist=-1;
+            }
+        }
+        else
+        {
+            /* No epoll support - use poll */
             int i;
-            for (i = 0; i < nb_users; i++)
+
+            ret = poll( pollfd, nb_users, diff );
+            for (i = 0; ret>0 && i < nb_users; i++)
             {
                 if (pollfd[i].revents)
                 {
-                    fd_poll_event( poll_users[i], pollfd[i].revents );
-                    if (!--ret) break;
+                    fd_poll_event( poll_users[i].data.fd, pollfd[i].revents );
+                    --ret;
                 }
             }
         }
@@ -835,18 +1078,48 @@
 /* set the events that select waits for on this fd */
 void set_fd_events( struct fd *fd, int events )
 {
-    int user = fd->poll_index;
-    assert( poll_users[user] == fd );
-    if (events == -1)  /* stop waiting on this fd completely */
+    int user=fd->poll_index;
+    assert(poll_users[user].state!=FDT_FREE);
+    assert( poll_users[user].data.fd == fd );
+
+    if( epoll_fd>=0 )
     {
-        pollfd[user].fd = -1;
-        pollfd[user].events = POLLERR;
-        pollfd[user].revents = 0;
+        struct epoll_event event;
+        event.events=events;
+        event.data.u32=user;
+
+        assert(fd->unix_fd>=0);
+
+        if( events != -1 && poll_users[user].state!=FDT_ERROR )
+        {
+            int epoll_res;
+
+            epoll_res=_epoll_ctl( epoll_fd, (poll_users[user].state==FDT_WATCHED)?EPOLL_CTL_MOD:EPOLL_CTL_ADD, fd->unix_fd, &event );
+            /* FIXME: Need to handle failure here */
+            poll_users[user].state=FDT_WATCHED;
+        }
+        else if( events==-1 )
+        {
+            /* flag fd as erred */
+            _epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd->unix_fd, &event );
+            poll_users[user].state=FDT_ERROR;
+        }
     }
-    else if (pollfd[user].fd != -1 || !pollfd[user].events)
+    else
     {
-        pollfd[user].fd = fd->unix_fd;
-        pollfd[user].events = events;
+        if (events == -1)  /* stop waiting on this fd completely */
+        {
+            pollfd[user].fd = -1;
+            pollfd[user].events = POLLERR;
+            pollfd[user].revents = 0;
+            poll_users[user].state=FDT_ERROR;
+        }
+        else if (poll_users[user].state!=FDT_ERROR)
+        {
+            pollfd[user].fd = fd->unix_fd;
+            pollfd[user].events = events;
+            poll_users[user].state = FDT_WATCHED;
+        }
     }
 }
 
Index: server/file.h
===================================================================
RCS file: /home/sun/sources/cvs/wine/server/file.h,v
retrieving revision 1.17
diff -u -r1.17 file.h
--- server/file.h	18 Aug 2004 00:04:58 -0000	1.17
+++ server/file.h	27 Aug 2004 11:42:58 -0000
@@ -44,6 +44,7 @@
 
 /* file descriptor functions */
 
+extern void init_fd();
 extern struct fd *alloc_fd( const struct fd_ops *fd_user_ops, struct object *user );
 extern struct fd *open_fd( struct fd *fd, const char *name, int flags, mode_t *mode,
                            unsigned int access, unsigned int sharing, unsigned int options );
Index: server/main.c
===================================================================
RCS file: /home/sun/sources/cvs/wine/server/main.c,v
retrieving revision 1.31
diff -u -r1.31 main.c
--- server/main.c	26 Mar 2003 01:32:18 -0000	1.31
+++ server/main.c	27 Aug 2004 11:42:58 -0000
@@ -122,6 +122,7 @@
     signal( SIGTERM, sigterm_handler );
     signal( SIGABRT, sigterm_handler );
 
+    init_fd();
     sock_init();
     open_master_socket();
     sync_namespace = create_namespace( 37, TRUE );


More information about the wine-devel mailing list