對于較多數量的文件描述符的監聽無論是select還是poll系統調用都顯得捉襟見肘,poll每次都需要將所有的文件描述符復制到內核,內核本身不會對這些文件描述符加以保存,這樣的設計就導致了poll的效率的低下。
而epoll則對此做了相應的改進,不是epoll_wait的時候才傳入fd,而是通過epoll_ctl把所有fd傳入內核,再一起”wait”,這就省掉了不必要的重復拷貝。
其次,在 epoll_wait時,也不是把current輪流地加入fd對應的設備等待隊列,而是在設備等待隊列醒來時調用一個回調函數(當然,這就需要“喚醒回調”機制),把產生事件的fd歸入一個鏈表,然后返回這個鏈表上的fd。另外,epoll機制實現了自己特有的文件系統eventpoll filesystem。
epoll初始化
當系統啟動時,epoll會進行初始化操作:
/*
 * eventpoll_init - one-time boot initialization of the epoll subsystem,
 * registered below via fs_initcall().  Sets up the global mutex, the
 * safe-wakeup helper state, and the slab caches from which per-watch
 * objects are allocated later.
 */
static int __init eventpoll_init(void)
{
mutex_init(&epmutex);
/* Initialize the structure used to perform safe poll wait head wake ups */
ep_poll_safewake_init(&psw);
/* Allocates slab cache used to allocate "struct epitem" items */
/* One "struct epitem" per monitored fd; SLAB_PANIC makes boot fail on OOM. */
epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
NULL);
/* Allocates slab cache used to allocate "struct eppoll_entry" */
pwq_cache = kmem_cache_create("eventpoll_pwq",
sizeof(struct eppoll_entry), 0,
EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
return 0;
}
fs_initcall(eventpoll_init);
上面的代碼實現一些數據結構的初始化,通過fs/eventpoll.c中的注釋可以看出,有三種類型的鎖機制使用場景:
1.epmutex(mutex):用戶關閉文件描述符,但是沒有調用EPOLL_CTL_DEL
2.ep->mtx(mutex):用戶態與內核態的轉換可能會睡眠
3.ep->lock(spinlock):內核態與具體設備中斷過程中的轉換,poll回調
接下來就是使用slab分配器動態分配內存。第一個結構的用途是:當系統中添加一個fd時,就創建一個epitem結構體,它是內核管理epoll的基本數據結構。
內核數據結構
epoll在內核主要維護了兩個數據結構eventpoll與epitem,其中eventpoll表示每個epoll實例本身,epitem表示的是每一個IO所對應的事件。
/*
 * struct epitem - one instance per (file, fd) pair monitored by an epoll
 * set.  Created on EPOLL_CTL_ADD and linked into the owning eventpoll's
 * red-black tree.
 *
 * Fix vs. the quoted original: the comments after nwait, pwqlist, fllink
 * and event were never terminated (the one after `event` even swallowed
 * the closing `};`), so the snippet could not parse.  All comment
 * terminators are restored below; no declaration is changed.
 */
struct epitem {
/* RB tree node used to link this structure to the eventpoll RB tree */
struct rb_node rbn;
/* List header used to link this structure to the eventpoll ready list
 * (eventpoll.rdllist). */
struct list_head rdllink;
/*
 * Works together "struct eventpoll"->ovflist in keeping the
 * single linked chain of items.
 */
struct epitem *next;
/* The file descriptor information this item refers to: fd + struct file,
 * used as the key of the red-black tree. */
struct epoll_filefd ffd;
/* Number of active wait queues attached to poll operations; set to -1
 * by ep_ptable_queue_proc() on allocation failure. */
int nwait;
/* List containing poll wait queues.  Plays the role select/poll's
 * poll_table plays: one entry per wait queue the monitored file
 * registered (a single file may use several wait queues). */
struct list_head pwqlist;
/* The "container" of this item: the owning eventpoll instance
 * (many epitems belong to one eventpoll). */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list;
 * the file's f_ep_links chains every epitem watching that file. */
struct list_head fllink;
/* The structure that describes the interested events and the source fd,
 * i.e. the user-space epoll_event passed to epoll_ctl(). */
struct epoll_event event;
};
而每個epoll fd對應的主要數據結構為:
/*
 * struct eventpoll - per-epoll-instance state, one per epoll fd.
 * Allocated by epoll_create() (via ep_alloc) and stored in the epoll
 * file's private_data.
 */
struct eventpoll {
/* Protects access to this structure */
spinlock_t lock; /* spinlock, also taken from the poll callback (IRQ
                  * context), mainly guards the ready list */
/*
 * This mutex is used to ensure that files are not removed
 * while epoll is using them. This is held during the event
 * collection loop, the file cleanup path, the epoll file exit
 * code and the ctl operations.
 */
struct mutex mtx; /* keeps monitored files from being removed while in use */
/* Wait queue used by sys_epoll_wait() */
wait_queue_head_t wq; /* tasks sleeping in sys_epoll_wait() wait here */
/* Wait queue used by file->poll() */
wait_queue_head_t poll_wait; /* used when the epoll fd itself is polled
                              * (note: original comment said "file->epoll()",
                              * the operation is file->poll()) */
/* List of ready file descriptors */
struct list_head rdllist; /* ready list: epitems with pending events */
/* RB tree root used to store monitored fd structs */
struct rb_root rbr; /* root of the red-black tree of all monitored fds */
/*
 * This is a single linked list that chains all the "struct epitem" that
 * happened while transferring ready events to userspace w/out
 * holding ->lock.
 */
struct epitem *ovflist; /* overflow list: fds whose events fire while
                         * ready events are being copied to user space */
};
函數調用關系
epoll_create
每個eventpoll通過epoll_create()創建:
/*
 * sys_epoll_create - create a new epoll instance.
 * @size: historical hint; only validated to be > 0, otherwise unused here.
 *
 * Allocates a struct eventpoll and wraps it in an anonymous-inode file;
 * returns the new file descriptor, or a negative error code.
 *
 * Fix vs. the quoted original: both DNPRINTK format strings had lost the
 * backslash of their trailing "\n" ("(%d)n"); restored below.
 */
asmlinkage long sys_epoll_create(int size)
{
int error, fd = -1;
struct eventpoll *ep;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
current, size));
/*
 * Sanity check on the size parameter, and create the internal data
 * structure ( "struct eventpoll" ).
 */
error = -EINVAL;
/* Allocate and initialize the eventpoll structure. */
if (size <= 0 || (error = ep_alloc(&ep)) < 0) {
fd = error;
goto error_return;
}
/*
 * Creates all the items needed to setup an eventpoll file. That is,
 * a file structure and a free file descriptor.
 */
/*
 * anon_inode_getfd() builds a new struct file — epoll behaves like an
 * anonymous file backed by no real filesystem — and stores ep in
 * file->private_data, where sys_epoll_ctl() later retrieves it.
 */
fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep);
if (fd < 0)
ep_free(ep);
error_return:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
current, size, fd));
return fd;
}
epoll_ctl
/*
 * sys_epoll_ctl - add, modify or remove a watch on an epoll instance.
 * @epfd:  epoll fd returned by epoll_create()
 * @op:    EPOLL_CTL_ADD / EPOLL_CTL_MOD / EPOLL_CTL_DEL
 * @fd:    target file descriptor
 * @event: user-space event description (unused for DEL)
 *
 * Excerpt: the MOD/DEL cases and the error-unwind paths are elided ("...").
 * Fix vs. the quoted original: the DNPRINTK format string had lost the
 * backslash of its trailing "\n"; restored below.
 */
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
struct epoll_event __user *event)
{
int error;
struct file *file, *tfile;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
current, epfd, op, fd, event));
error = -EFAULT;
/* Validate the arguments and copy *event from user space into epds. */
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
/* Get the "struct file *" for the eventpoll file */
error = -EBADF;
file = fget(epfd); /* file object of the epoll fd itself */
if (!file)
goto error_return;
/* Get the "struct file *" for the target file */
tfile = fget(fd); /* file object of the target fd */
if (!tfile)
goto error_fput;
/* The target file descriptor must support poll */
error = -EPERM;
if (!tfile->f_op || !tfile->f_op->poll)
goto error_tgt_fput;
...
/*
 * At this point it is safe to assume that the "private_data" contains
 * our own data structure.
 */
ep = file->private_data; /* stored by anon_inode_getfd() in epoll_create() */
mutex_lock(&ep->mtx);
/*
 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
 * above, we can be sure to be able to use the item looked up by
 * ep_find() till we release the mutex.
 */
epi = ep_find(ep, tfile, fd); /* RB-tree lookup, detects duplicate adds */
switch (op) {
case EPOLL_CTL_ADD: /* register a new monitored fd */
if (!epi) {
epds.events |= POLLERR | POLLHUP; /* POLLERR and POLLHUP are always reported */
error = ep_insert(ep, &epds, tfile, fd); /* insert the fd's epitem into the RB tree */
} else /* fd already present in the RB tree: duplicate add */
error = -EEXIST;
break;
...
}
...
return error;
}
其中ep_insert的實現如下:
/*
 * ep_insert - register a new (file, fd) pair with an epoll instance.
 * Called from sys_epoll_ctl(EPOLL_CTL_ADD) with ep->mtx held.
 *
 * Excerpt: the error-unwind labels after the return are elided ("...").
 * Fix vs. the quoted original: the DNPRINTK format string had lost the
 * backslash of its trailing "\n"; restored below.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd)
{
int error, revents, pwake = 0;
unsigned long flags;
struct epitem *epi;
struct ep_pqueue epq;
error = -ENOMEM;
/* Allocate one epitem to track this monitored fd. */
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
goto error_return;
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
/* Initialize the poll table using the queue callback */
epq.epi = epi;
/* Install ep_ptable_queue_proc as the poll_wait() callback. */
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
 * Attach the item to the poll hooks and get current event bits.
 * We can safely use the file* here because its usage count has
 * been increased by the caller of this function. Note that after
 * this operation completes, the poll callback can start hitting
 * the new item.
 */
/*
 * f_op->poll() both reads the file's current event bits into revents
 * and, via poll_wait(), invokes ep_ptable_queue_proc() to hook our
 * callback into the file's wait queue.  For a socket f_op is
 * socket_file_ops whose poll is sock_poll(); a TCP socket reaches
 * tcp_poll(), which calls sock_poll_wait() and thus epq.pt.qproc.
 */
revents = tfile->f_op->poll(tfile, &epq.pt);
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_ep_lock);
list_add_tail(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_ep_lock);
/*
 * Add the current item to the RB tree. All RB tree operations are
 * protected by "mtx", and ep_insert() is called with "mtx" held.
 */
ep_rbtree_insert(ep, epi);
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
/* If the file is already "ready" we drop it inside the ready list */
/*
 * revents & event->events: the file already has an event the caller
 * asked for.  !ep_is_linked(): the epitem is not yet on the ready
 * list (ep_is_linked() tests list membership).  If both hold, queue
 * the epitem now and wake anyone sleeping in epoll_wait().
 */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
/* Notify waiting tasks that events are available */
/* waitqueue_active() returns nonzero iff somebody is waiting;
 * wake one task sleeping in epoll_wait(). */
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
/* Somebody polls the epoll fd itself (nested epoll); record it and
 * defer the wake-up until ep->lock is released. */
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
/* Wake tasks waiting on the eventpoll file itself. */
ep_poll_safewake(&psw, &ep->poll_wait);
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
current, ep, tfile, fd));
return 0;
...
}
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);和revents = tfile->f_op->poll(tfile, &epq.pt);這兩個函數將ep_ptable_queue_proc注冊到epq.pt中的qproc。
/*
 * poll_table - callback container handed to f_op->poll(); poll_wait()
 * invokes qproc to register on a wait queue.  For epoll, qproc is set
 * to ep_ptable_queue_proc by init_poll_funcptr().
 */
typedef struct poll_table_struct {
poll_queue_proc qproc; /* wait-queue registration callback */
unsigned long key;
}poll_table;
執行f_op->poll(tfile, &epq.pt)時,XXX_poll(tfile, &epq.pt)函數會執行poll_wait(),poll_wait()會調用epq.pt.qproc函數,即ep_ptable_queue_proc。
更多Linux內核視頻教程文檔資料免費領取后臺私信【內核】自行獲取。
內核學習網站:
Linux內核源碼/內存調優/文件系統/進程管理/設備驅動/網絡協議棧-學習視頻教程-騰訊課堂
ep_ptable_queue_proc函數如下:
/*當poll醒來時就回調用該函數,在文件操作中的poll函數中調用,將epoll的回調函數加入到目標文件的喚醒隊列中。如果監視的文件是套接字,參數whead則是sock結構的sk_sleep成員的地址*/
/*
 * ep_ptable_queue_proc - poll_table callback run from poll_wait() inside
 * the monitored file's f_op->poll().  Allocates an eppoll_entry, points
 * it at the epitem, and hooks ep_poll_callback() into the file's wake-up
 * queue @whead.  If the monitored file is a socket, @whead is the
 * address of the sock structure's sk_sleep member.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
poll_table *pt)
{
/* Recover the epitem from the enclosing struct ep_pqueue via pt. */
struct epitem *epi = ep_item_from_epqueue(pt);
struct eppoll_entry *pwq;
if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
/* Arm the wait entry so wake-ups invoke ep_poll_callback(). */
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
pwq->whead = whead;
pwq->base = epi;
add_wait_queue(whead, &pwq->wait);
list_add_tail(&pwq->llink, &epi->pwqlist);
epi->nwait++;
} else {
/* We have to signal that an error occurred */
/* Allocation failed (or nwait was already negative from an earlier
 * error): mark the epitem broken with nwait = -1. */
epi->nwait = -1;
}
}
其中struct eppoll_entry定義如下:
/*
 * eppoll_entry - glue object connecting an epitem to one wait queue of
 * the monitored file; this is what actually sits on the device's wait
 * queue and carries the ep_poll_callback() hook.
 */
struct eppoll_entry {
struct list_head llink;   /* linked into epitem->pwqlist */
struct epitem *base;      /* the epitem this entry belongs to */
wait_queue_t wait;        /* wait entry whose func is ep_poll_callback */
wait_queue_head_t *whead; /* the wait queue head we are registered on */
};
ep_ptable_queue_proc 函數完成 epitem 加入到特定文件的wait隊列任務。
ep_ptable_queue_proc有三個參數:
struct file *file; 該fd對應的文件對象
wait_queue_head_t *whead; 該fd對應的設備等待隊列(同select中的mydev->wait_address)
poll_table *pt; f_op->poll(tfile, &epq.pt)中的epq.pt
在ep_ptable_queue_proc函數中,引入了另外一個非常重要的數據結構eppoll_entry。eppoll_entry主要完成epitem和epitem事件發生時的callback(ep_poll_callback)函數之間的關聯。首先將eppoll_entry的whead指向fd的設備等待隊列(同select中的wait_address),然后初始化eppoll_entry的base變量指向epitem,最后通過add_wait_queue將eppoll_entry掛載到fd的設備等待隊列上。完成這個動作后,eppoll_entry已經被掛載到fd的設備等待隊列。
由于ep_ptable_queue_proc函數為等待隊列設置了ep_poll_callback回調函數,所以當設備硬件數據到來時,硬件中斷處理函數喚醒該等待隊列上等待的進程的同時,就會調用喚醒回調函數ep_poll_callback。
/*
 * ep_poll_callback - wake function installed on the monitored file's
 * wait queue by ep_ptable_queue_proc().  Runs when the file signals an
 * event (e.g. from the device's interrupt path): moves the epitem onto
 * the instance's ready list and wakes tasks sleeping in epoll_wait().
 * (Excerpt: part of the function body is elided with "...".)
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
spin_lock_irqsave(&ep->lock, flags);
/*
 * If the event mask does not contain any poll(2) event, we consider the
 * descriptor to be disabled. This condition is likely the effect of the
 * EPOLLONESHOT bit that disables the descriptor when an event is received,
 * until the next EPOLL_CTL_MOD will be issued.
 */
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto out_unlock;
...
/* If this file is already in the ready list we exit soon */
if (ep_is_linked(&epi->rdllink))
goto is_linked;
/* Queue the fd's epitem on the epoll instance's ready list. */
list_add_tail(&epi->rdllink, &ep->rdllist);
is_linked:
/*
 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 * wait list.
 */
/* Wake tasks sleeping in epoll_wait(); they return before their
 * timeout expires. */
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&psw, &ep->poll_wait);
return 1;
}
epoll_wait
epoll_wait實現如下:
/*
 * sys_epoll_wait - wait for events on an epoll instance.
 * @epfd:      epoll file descriptor
 * @events:    user buffer receiving the ready events
 * @maxevents: capacity of @events; must be in (0, EP_MAX_EVENTS]
 * @timeout:   milliseconds; negative means wait forever
 *
 * Validates the arguments, resolves epfd to its eventpoll instance and
 * delegates the actual waiting to ep_poll().
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, int timeout)
{
int error;
struct file *file;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
return -EINVAL;
/* Verify that the area passed by the user is writeable */
/* Check that the user-supplied events buffer is writable
 * (see __range_not_ok()). */
if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
error = -EFAULT;
goto error_return;
}
/* Get the "struct file *" for the eventpoll file */
/* The file instance was created back in epoll_create(). */
error = -EBADF;
file = fget(epfd);
if (!file)
goto error_return;
/*
 * We have to check that the file structure underneath the fd
 * the user passed to us _is_ an eventpoll file.
 */
/* is_file_epoll() checks f_op against eventpoll_fops; anything else
 * yields -EINVAL. */
error = -EINVAL;
if (!is_file_epoll(file))
goto error_fput;
/*
 * At this point it is safe to assume that the "private_data" contains
 * our own data structure.
 */
ep = file->private_data;
/* Time to fish for events ... */
error = ep_poll(ep, events, maxevents, timeout);
error_fput:
fput(file);
error_return:
return error;
}
ep_poll
epoll_wait調用ep_poll,ep_poll實現如下:
/*
 * ep_poll - core of epoll_wait(): sleep until the ready list is
 * non-empty, a signal arrives, or the timeout expires, then transfer
 * ready events to user space.  Returns the number of events delivered,
 * or a negative error code (-EINTR on signal).
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
int res, eavail;
unsigned long flags;
long jtimeout;
wait_queue_t wait;
/*
 * Calculate the timeout by checking for the "infinite" value ( -1 )
 * and the overflow condition. The passed timeout is in milliseconds,
 * that why (t * HZ) / 1000.
 */
/* Convert milliseconds to jiffies; the added 999 (= 1000 - 1) makes
 * the division round up. */
jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;
retry:
spin_lock_irqsave(&ep->lock, flags);
res = 0;
if (list_empty(&ep->rdllist)) {
/*
 * We don't have any available event to return to the caller.
 * We need to sleep here, and we will be wake up by
 * ep_poll_callback() when events will become available.
 */
init_waitqueue_entry(&wait, current); /* put current on the wait entry */
wait.flags |= WQ_FLAG_EXCLUSIVE;
/* Queue ourselves on the eventpoll wait queue until an event
 * arrives, the timeout elapses, or a signal is delivered. */
__add_wait_queue(&ep->wq, &wait);
for (;;) {
/*
 * We don't want to sleep if the ep_poll_callback() sends us
 * a wakeup in between. That's why we set the task state
 * to TASK_INTERRUPTIBLE before doing the checks.
 */
/* TASK_INTERRUPTIBLE so ep_poll_callback() and signals can wake us. */
set_current_state(TASK_INTERRUPTIBLE);
/* Stop looping once events are ready or the timeout has run out. */
if (!list_empty(&ep->rdllist) || !jtimeout)
break;
/* A pending signal aborts the wait with -EINTR. */
if (signal_pending(current)) {
res = -EINTR;
break;
}
spin_unlock_irqrestore(&ep->lock, flags);
/*
 * Yield the CPU until ep_poll_callback() wakes us or the timeout
 * expires; schedule_timeout() returns the remaining jiffies.
 * The task sleeps here; when a monitored file becomes ready,
 * ep_poll_callback() wakes the tasks queued on ep->wq.
 */
jtimeout = schedule_timeout(jtimeout);
spin_lock_irqsave(&ep->lock, flags);
}
__remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
/* Is it worth to try to dig for events ? */
/*
 * ep->ovflist temporarily collects fds whose events fire while ready
 * events are being copied to user space.  So either a non-empty
 * ep->rdllist or an ep->ovflist different from EP_UNACTIVE_PTR may
 * mean files are ready.  ovflist != EP_UNACTIVE_PTR covers two cases:
 * NULL (a transfer is in progress, readiness not guaranteed) and
 * non-NULL (files are definitely ready).  See ep_send_events().
 */
eavail = !list_empty(&ep->rdllist);
spin_unlock_irqrestore(&ep->lock, flags);
/*
 * Try to transfer events to user space. In case we get 0 events and
 * there's still timeout left over, we go trying again in search of
 * more luck.
 */
/* If we were not interrupted, events looked available but none were
 * delivered (another task may have consumed them) and time remains,
 * go back and wait again. */
if (!res && eavail &&
!(res = ep_send_events(ep, events, maxevents)) && jtimeout)
goto retry;
/* Return the number of delivered events or an error code. */
return res;
}
ep_send_events()函數向用戶空間發送就緒事件。
ep_send_events()函數將用戶傳入的內存簡單封裝到ep_send_events_data結構中,然后調用ep_scan_ready_list()將就緒隊列中的事件傳入用戶空間的內存。 用戶空間訪問這個結果,進行處理。