编辑
2025-01-22
工作知识
0
请注意,本文编写于 135 天前,最后修改于 135 天前,其中某些信息可能已经过时。

目录

一、vdso的初始化
二、vdso插入用户内存空间
三、程序使用vdso
3.1 vdso的数据vvar
3.2 vvar的数据更新
四、总结

根据vDSO--什么是vDSO的实验,我们知道了啥是vDSO,本文基于内核的实现,简单介绍一下vDSO的内核原理。

一、vdso的初始化

在代码arch/arm64/kernel/vdso.c中,我们可以看到vdso的初始化如下:

static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { [AA64_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, .mremap = vvar_mremap, }, [AA64_MAP_VDSO] = { .name = "[vdso]", .mremap = vdso_mremap, }, }; static int __init vdso_init(void) { vdso_info[VDSO_ABI_AA64].dm = &aarch64_vdso_maps[AA64_MAP_VVAR]; vdso_info[VDSO_ABI_AA64].cm = &aarch64_vdso_maps[AA64_MAP_VDSO]; return __vdso_init(VDSO_ABI_AA64); } arch_initcall(vdso_init);

可以看到vdso默认通过arch_initcall拉起来,然后,默认初始化了两个特殊页映射的结构体aarch64_vdso_maps,我们关注__vdso_init如下

static int __vdso_init(enum vdso_abi abi) { int i; struct page **vdso_pagelist; unsigned long pfn; if (memcmp(vdso_info[abi].vdso_code_start, "\177ELF", 4)) { pr_err("vDSO is not a valid ELF object!\n"); return -EINVAL; } vdso_info[abi].vdso_pages = ( vdso_info[abi].vdso_code_end - vdso_info[abi].vdso_code_start) >> PAGE_SHIFT; vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages, sizeof(struct page *), GFP_KERNEL); if (vdso_pagelist == NULL) return -ENOMEM; /* Grab the vDSO code pages. */ pfn = sym_to_pfn(vdso_info[abi].vdso_code_start); for (i = 0; i < vdso_info[abi].vdso_pages; i++) vdso_pagelist[i] = pfn_to_page(pfn + i); vdso_info[abi].cm->pages = vdso_pagelist; return 0; }

这里先计算了vdso的代码所占的页数,然后用kcalloc申请了一个struct page指针数组(注意申请的不是页本身),接着通过sym_to_pfn把vdso代码的起始地址转换成页帧号,最后通过pfn_to_page逐页找到对应的struct page,并保存到cm->pages中。

这里我们完成了vdso的整个初始化过程

二、vdso插入用户内存空间

首先我们留意到一个函数:

arch_setup_additional_pages

此时我们关注fs/binfmt_elf.c的如下函数

static int load_elf_binary(struct linux_binprm *bprm)

它有如下代码:

#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES retval = arch_setup_additional_pages(bprm, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */

这里就清楚了,当我们执行一个elf文件的时候,会通过load_elf_binary来解析elf,在这个过程中,会调用arch_setup_additional_pages将vdso安插在用户的内存空间布局中。

主要操作如下:

ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_PFNMAP, vdso_info[abi].dm); ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_info[abi].cm);

这里和进程maps对应上了,如下:

7f93766000-7f93768000 r--p 00000000 00:00 0 [vvar] 7f93768000-7f93769000 r-xp 00000000 00:00 0 [vdso]

三、程序使用vdso

根据上面的信息,我们知道了vdso的初始化,以及vdso在elf加载的时候默认map到程序内存空间,但是具体地,我们还需要知道vdso是如何优化syscall调用的,首先,我们来看如下图:

image.png 这里以gettimeofday为例,我们需要先关注链接脚本文件vdso.lds.S

SECTIONS { PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); #ifdef CONFIG_TIME_NS PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); #endif . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } /* * Discard .note.gnu.property sections which are unused and have * different alignment requirement from vDSO note sections. */ /DISCARD/ : { *(.note.GNU-stack .note.gnu.property) } .note : { *(.note.*) } :text :note . = ALIGN(16); .text : { *(.text*) } :text =0xd503201f PROVIDE (__etext = .); PROVIDE (_etext = .); PROVIDE (etext = .); .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) } :text _end = .; PROVIDE(end = .); /DISCARD/ : { *(.data .data.* .gnu.linkonce.d.* .sdata*) *(.bss .sbss .dynbss .dynsbss) } }

此时我们查看导出符号

VERSION { LINUX_2.6.39 { global: __kernel_rt_sigreturn; __kernel_gettimeofday; __kernel_clock_gettime; __kernel_clock_getres; local: *; }; }

这里我们知道,用户想要调用gettimeofday时,实际上调用的是vdso中实现的__kernel_gettimeofday,我们追踪此函数的实现:

arch/arm64/kernel/vdso/vgettimeofday.c

int __kernel_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday(tv, tz); }

然后我们找到`__cvdso_gettimeofday`的实现在:

lib/vdso/gettimeofday.c

static __maybe_unused int __cvdso_gettimeofday_data(const struct vdso_data *vd, struct __kernel_old_timeval *tv, struct timezone *tz) { if (likely(tv != NULL)) { struct __kernel_timespec ts; if (do_hres(&vd[CS_HRES_COARSE], CLOCK_REALTIME, &ts)) return gettimeofday_fallback(tv, tz); tv->tv_sec = ts.tv_sec; tv->tv_usec = (u32)ts.tv_nsec / NSEC_PER_USEC; } if (unlikely(tz != NULL)) { if (IS_ENABLED(CONFIG_TIME_NS) && vd->clock_mode == VDSO_CLOCKMODE_TIMENS) vd = __arch_get_timens_vdso_data(); tz->tz_minuteswest = vd[CS_HRES_COARSE].tz_minuteswest; tz->tz_dsttime = vd[CS_HRES_COARSE].tz_dsttime; } return 0; } static __maybe_unused int __cvdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { return __cvdso_gettimeofday_data(__arch_get_vdso_data(), tv, tz); }

这里我们关注函数do_hres,其实现如下:

路径:lib/vdso/gettimeofday.c
ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); ts->tv_nsec = ns;

这里直接给ts赋值即可,如果vdso的实现失效(do_hres返回非0),则通过gettimeofday_fallback切回syscall,如下:

arch/arm64/include/asm/vdso/gettimeofday.h
static __always_inline int gettimeofday_fallback(struct __kernel_old_timeval *_tv, struct timezone *_tz) { register struct timezone *tz asm("x1") = _tz; register struct __kernel_old_timeval *tv asm("x0") = _tv; register long ret asm ("x0"); register long nr asm("x8") = __NR_gettimeofday; asm volatile( " svc #0\n" : "=r" (ret) : "r" (tv), "r" (tz), "r" (nr) : "memory"); return ret; }

这里就存在一个疑问点,我们直接赋值的数据从哪里来。

3.1 vdso的数据vvar

我们已经知道了vdso中的代码是直接去取数据的,我们稍微留意一下就知道这个数据来自:

__cvdso_gettimeofday_data(__arch_get_vdso_data(), tv, tz);

也就是

static __always_inline const struct vdso_data *__arch_get_vdso_data(void) { return _vdso_data; }

也就是

/* * The vDSO data page. */ static union { struct vdso_data data[CS_BASES]; u8 page[PAGE_SIZE]; } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data;

这里可以知道了,这个数据来源于vvar,但是数据是如何更新的呢?

3.2 vvar的数据更新

对于gettimeofday函数的实现,我们需要关注timekeeping的核心函数timekeeping_update,代码位置如下:

kernel/time/timekeeping.c

我们关心这句话

update_vsyscall(tk);

其实现在如下:

void update_vsyscall(struct timekeeper *tk) { struct vdso_data *vdata = __arch_get_k_vdso_data(); struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata[CS_HRES_COARSE].clock_mode = clock_mode; vdata[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). * Note: No need to have a second copy. */ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high reolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); }

这里一目了然,vdso_ts指向的就是vdata中的成员basetime[],vdata的结构体如下:

struct vdso_data { u32 seq; s32 clock_mode; u64 cycle_last; u64 mask; u32 mult; u32 shift; union { struct vdso_timestamp basetime[VDSO_BASES]; struct timens_offset offset[VDSO_BASES]; }; s32 tz_minuteswest; s32 tz_dsttime; u32 hrtimer_res; u32 __unused; struct arch_vdso_data arch_data; };

所以数据存放在vvar区域:内核定义了struct vdso_data这个数据结构,并在update_vsyscall中更新它,而vdso的代码在用户态直接读取vvar区域中的数据并赋值返回,也就避免了系统调用。

四、总结

至此,我们从内核的角度了解到了vdso的实现原理,相当于内核直接实现了一段代码,作为动态链接库映射到每个进程的地址空间中运行,这样就避免了syscall带来的性能问题。