The Linux kernel provides the building blocks that LXC-style containers are built on. Inside a container, the two key mechanisms are namespaces and cgroups; here we start with cgroups to get a basic understanding of what they do.
Control groups (cgroups) are a kernel mechanism for limiting and accounting physical resources such as CPU, memory, and I/O. Only cgroup v1 is discussed here, not v2.
cgroups is organized into subsystems (controllers), each controlling one class of resources. The main ones are:
cpuset - binds the tasks in a cgroup to specific CPUs and memory nodes;
cpu - controls the tasks' access to CPU time via the scheduler;
cpuacct - generates CPU usage reports for all tasks in a cgroup;
io - limits read and write operations on block devices;
memory - limits the memory used by the tasks in a cgroup;
devices - restricts which devices the tasks in a cgroup may access;
freezer - allows the tasks in a cgroup to be suspended and resumed;
net_cls - tags network packets generated by tasks in a cgroup;
net_prio - provides a way to change network traffic priority per interface for a cgroup;
perf_event - gives access to performance events for a cgroup;
hugetlb - enables huge-page support for a cgroup;
pid - limits the number of processes in a cgroup.
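On a cgroup v1 system each of these controllers is mounted as its own hierarchy under /sys/fs/cgroup. A quick way to check which hierarchies are mounted (the output below is typical and abridged, shown only for illustration):

mount -t cgroup
cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset)
cgroup on /sys/fs/cgroup/cpu,cpuacct type cgroup (rw,nosuid,nodev,noexec,relatime,cpu,cpuacct)
cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,memory)
cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio)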
To verify this in practice, we test the CPU, memory, and I/O controllers.
There are two main ways to limit CPU usage: restricting which cores a task may run on (cpuset), and restricting how much CPU time it may consume (cpu/cpuacct).
The CPU test program is as follows:
int main(void)
{
        for (;;)
                ;
        return 0;
}

gcc /root/test_cpu.c -o /root/test_cpu
To restrict which CPU cores the task may run on:
Configure cpuset:
mkdir /sys/fs/cgroup/cpuset/test/
echo 7 > /sys/fs/cgroup/cpuset/test/cpuset.cpus
/root/test_cpu &
5233
echo 5233 > /sys/fs/cgroup/cpuset/test/tasks
taskset -p 5233
To limit CPU bandwidth through the cpu,cpuacct hierarchy:
cpu.cfs_quota_us / cpu.cfs_period_us gives the fraction of CPU time the group is allowed.

mkdir /sys/fs/cgroup/cpu,cpuacct/test
echo 20000 > /sys/fs/cgroup/cpu,cpuacct/test/cpu.cfs_quota_us
echo 100000 > /sys/fs/cgroup/cpu,cpuacct/test/cpu.cfs_period_us
/root/test_cpu &
8359
echo 8359 > /sys/fs/cgroup/cpu,cpuacct/test/tasks
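Whether the quota is actually throttling the task can be checked through the group's cpu.stat file, which counts elapsed periods and how often the group was throttled (the numbers below are only illustrative):

cat /sys/fs/cgroup/cpu,cpuacct/test/cpu.stat
nr_periods 1523
nr_throttled 1489
throttled_time 118734250000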
With systemd, the equivalent is:
systemd-run --unit=test --scope --slice=test /root/test_cpu &
systemd-run --unit=test --scope --slice=test -p CPUQuota=20% /root/test_cpu
The memory test program is as follows:
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int main(void)
{
        unsigned total = 0, alloc_size = 1024 * 1024 * 100; /* 100 MiB */
        int *p = NULL;

        while (1) {
                if (NULL == (p = (int *)malloc(alloc_size))) {
                        printf("malloc failed!\n");
                        return 0;
                }
                memset(p, 0xff, alloc_size);
                total += alloc_size;
                printf("malloc size: %uM\n", total / 1024 / 1024);
                sleep(1);
        }
        return 0;
}

gcc /root/test_mem.c -o /root/test_mem
Verification:
mkdir /sys/fs/cgroup/memory/test/
echo $(expr 1024 '*' 1024 '*' 500) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
cgexec -g memory:test /root/test_mem
malloc size: 100M
malloc size: 200M
malloc size: 300M
malloc size: 400M
Killed
With systemd:
systemd-run --unit=test --scope --slice=test /root/test_mem
systemctl set-property test.scope MemoryLimit=500M
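Whether the property landed on the scope can be confirmed with systemctl show (the value shown is 500M in bytes; output is illustrative):

systemctl show test.scope -p MemoryLimit
MemoryLimit=524288000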
The result:
memory: usage 512000kB, limit 512000kB, failcnt 156
memory+swap: usage 512000kB, limit 9007199254740988kB, failcnt 0
Memory cgroup out of memory: Killed process 3330 (test_mem) total-vm:563012kB, anon-rss:510700kB, file-rss:14808kB, shmem-rss:0kB, UID:0 pgtables:1112kB oom_score_adj:0
For I/O, we demonstrate limiting the write bandwidth:
mkdir /sys/fs/cgroup/blkio/test
echo "179:0 `expr 1000 '*' 1000`" > /sys/fs/cgroup/blkio/test/blkio.throttle.write_bps_device
cgexec -g blkio:test time dd if=/dev/zero count=1 bs=10M of=/root/test.img conv=fdatasync
1+0 records in
1+0 records out
10485760 bytes (10 MB, 10 MiB) copied, 10.0687 s, 1.0 MB/s
With systemd:
systemd-run --unit=test --scope --slice=test -p "IOWriteBandwidthMax=/dev/mmcblk0p6 1M" time dd if=/dev/zero count=1 bs=10M of=/root/test.img conv=fdatasync
Running scope as unit: test.scope
1+0 records in
1+0 records out
10485760 bytes (10 MB, 10 MiB) copied, 10.4876 s, 1000 kB/s
The examples above can be understood from the following call chains in the kernel.
Starting from start_kernel:
start_kernel ---- init/main.c
  cgroup_init_early
    cgroup_init_subsys
  cgroup_init --- init/main.c
    cgroup_init_subsys
    WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
    WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
      cgroup_add_cftypes
        cgroup_init_cftypes
        cgroup_apply_cftypes
          cgroup_addrm_files
            cgroup_add_file
              __kernfs_create_file   (creates the kernfs/sysfs file)
struct cgroup_subsys {
        struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
        int (*css_online)(struct cgroup_subsys_state *css);
        void (*css_offline)(struct cgroup_subsys_state *css);
        void (*css_released)(struct cgroup_subsys_state *css);
        void (*css_free)(struct cgroup_subsys_state *css);
        void (*css_reset)(struct cgroup_subsys_state *css);
        void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
        int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css);
        int (*can_attach)(struct cgroup_taskset *tset);
        void (*cancel_attach)(struct cgroup_taskset *tset);
        void (*attach)(struct cgroup_taskset *tset);
        void (*post_attach)(void);
        int (*can_fork)(struct task_struct *task, struct css_set *cset);
        void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
        void (*fork)(struct task_struct *task);
        void (*exit)(struct task_struct *task);
        void (*release)(struct task_struct *task);
        void (*bind)(struct cgroup_subsys_state *root_css);
};
cgroup_init_cftypes
  cgroup_kf_ops

static struct kernfs_ops cgroup_kf_ops = {
        .atomic_write_len = PAGE_SIZE,
        .open             = cgroup_file_open,
        .release          = cgroup_file_release,
        .write            = cgroup_file_write,
        .poll             = cgroup_file_poll,
        .seq_start        = cgroup_seqfile_start,
        .seq_next         = cgroup_seqfile_next,
        .seq_stop         = cgroup_seqfile_stop,
        .seq_show         = cgroup_seqfile_show,
};
cpuset_cgrp_subsys
  struct cftype legacy_files[]
    .name = "cpus",
    .write = cpuset_write_resmask,
      update_cpumask
        cpumask_and
cpumask_and is the helper that computes the bitwise AND of two CPU masks; writing to cpuset.cpus ultimately ends up as a mask operation on the cgroup's allowed CPUs.
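Conceptually this is just a bitwise AND over a CPU bitmap: the requested CPUs are intersected with the CPUs that are actually usable. A minimal user-space sketch of the idea (this is not kernel code; the masks and values are made up purely for illustration):

#include <stdio.h>

int main(void)
{
        unsigned long requested = 1UL << 7;               /* "echo 7 > cpuset.cpus"   */
        unsigned long available = 0xff;                    /* assume CPUs 0-7 present  */
        unsigned long effective = requested & available;   /* what cpumask_and computes */

        for (int cpu = 0; cpu < 8; cpu++)
                if (effective & (1UL << cpu))
                        printf("task may run on CPU %d\n", cpu);
        return 0;
}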
struct cgroup_subsys cpuset_cgrp_subsys = {
        .css_alloc      = cpuset_css_alloc,
        .css_online     = cpuset_css_online,
        .css_offline    = cpuset_css_offline,
        .css_free       = cpuset_css_free,
        .can_attach     = cpuset_can_attach,
        .cancel_attach  = cpuset_cancel_attach,
        .attach         = cpuset_attach,
        .post_attach    = cpuset_post_attach,
        .bind           = cpuset_bind,
        .fork           = cpuset_fork,
        .legacy_cftypes = legacy_files,
        .dfl_cftypes    = dfl_files,
        .early_init     = true,
        .threaded       = true,
};
cpu_cgrp_subsys (mounted together with cpuacct as the "cpu,cpuacct" hierarchy)
  struct cftype cpu_legacy_files[]
    .name = "cfs_quota_us",
    .write_s64 = cpu_cfs_quota_write_s64,
      tg_set_cfs_quota
        tg_set_cfs_bandwidth
          unthrottle_cfs_rq
    .name = "cfs_period_us",
    .write_u64 = cpu_cfs_period_write_u64,
      tg_set_cfs_period
        tg_set_cfs_bandwidth
          unthrottle_cfs_rq
tg_set_cfs_bandwidth and unthrottle_cfs_rq are where CPU bandwidth control is actually applied, i.e. how much CPU time the group may consume within each period.
struct cgroup_subsys cpuacct_cgrp_subsys = {
        .css_alloc      = cpuacct_css_alloc,
        .css_free       = cpuacct_css_free,
        .legacy_cftypes = files,
        .early_init     = true,
};
memory_cgrp_subsys
  struct cftype mem_cgroup_legacy_files[]
    .name = "limit_in_bytes",
    .write = mem_cgroup_write,
      mem_cgroup_resize_max
        page_counter_set_max
page_counter_set_max sets the maximum number of pages the cgroup may use; the byte limit written from user space is converted into a page count.
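As a rough illustration of that conversion (assuming 4 KiB pages; the real conversion happens inside the kernel, this is only arithmetic):

#include <stdio.h>

int main(void)
{
        unsigned long limit_bytes = 500UL * 1024 * 1024;  /* memory.limit_in_bytes = 500M */
        unsigned long page_size   = 4096;                 /* assumed PAGE_SIZE            */

        /* 524288000 / 4096 = 128000 pages */
        printf("page_counter max = %lu pages\n", limit_bytes / page_size);
        return 0;
}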
struct cgroup_subsys memory_cgrp_subsys = {
        .css_alloc      = mem_cgroup_css_alloc,
        .css_online     = mem_cgroup_css_online,
        .css_offline    = mem_cgroup_css_offline,
        .css_released   = mem_cgroup_css_released,
        .css_free       = mem_cgroup_css_free,
        .css_reset      = mem_cgroup_css_reset,
        .can_attach     = mem_cgroup_can_attach,
        .cancel_attach  = mem_cgroup_cancel_attach,
        .post_attach    = mem_cgroup_move_task,
        .bind           = mem_cgroup_bind,
        .dfl_cftypes    = memory_files,
        .legacy_cftypes = mem_cgroup_legacy_files,
        .early_init     = 0,
};
blkcg_policy_register
  if (pol->legacy_cftypes)
          WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes));

module_init(throtl_init);
  blkcg_policy_register(&blkcg_policy_throtl);

static struct blkcg_policy blkcg_policy_throtl = {
        .dfl_cftypes    = throtl_files,
        .legacy_cftypes = throtl_legacy_files,
        .pd_alloc_fn    = throtl_pd_alloc,
        .pd_init_fn     = throtl_pd_init,
        .pd_online_fn   = throtl_pd_online,
        .pd_offline_fn  = throtl_pd_offline,
        .pd_free_fn     = throtl_pd_free,
};

struct cftype throtl_legacy_files[]
  .name = "throttle.write_bps_device",
  .write = tg_set_conf_u64,
    tg_set_conf
      tg_conf_updated
        tg_bps_limit
          tg->bps[rw][td->limit_index];
throtl_grp is the structure that holds the I/O throttling parameters; the value written from user space ends up in tg->bps and caps the bytes-per-second rate.
struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc       = blkcg_css_alloc,
        .css_online      = blkcg_css_online,
        .css_offline     = blkcg_css_offline,
        .css_free        = blkcg_css_free,
        .can_attach      = blkcg_can_attach,
        .css_rstat_flush = blkcg_rstat_flush,
        .bind            = blkcg_bind,
        .dfl_cftypes     = blkcg_files,
        .legacy_cftypes  = blkcg_legacy_files,
        .legacy_name     = "blkio",
        .exit            = blkcg_exit,
#ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
         * together on the default hierarchy so that the owner cgroup can
         * be retrieved from writeback pages.
         */
        .depends_on      = 1 << memory_cgrp_id,
#endif
};

struct throtl_grp {
        /* internally used bytes per second rate limits */
        uint64_t bps[2][LIMIT_CNT];
        ......
}
That concludes this quick tour of cgroups. cgroups are the essential mechanism for resource isolation in Linux containers.
bpftrace is a high-level tracing language for eBPF that makes it easy to run eBPF programs directly. Building on the earlier eBPF introduction, this article walks through basic bpftrace usage.
To get a full picture of bpftrace, it is worth browsing the upstream repository:
https://github.com/bpftrace/bpftrace
For an overview of the framework, see the following diagram.
Don't be put off by the diagram; it conveys two main points:
The colored boxes are the main functional blocks of the Linux system, and the arrows pointing into a box indicate the probe types available for it.
Using these probe types, bpftrace can attach to the kernel and trace it.
The repository's README.md contains many bpftrace examples. Rather than repeating those, this article builds its own example around do_sys_openat2; the README one-liners are worth trying on your own.
For continuity with the earlier articles, we again use do_sys_openat2. First, list the probes bpftrace supports for it:
# bpftrace -l '*do_sys_openat2'
kprobe:do_sys_openat2
As shown, bpftrace supports observing do_sys_openat2 through a kprobe.
# bpftrace -e 'kprobe:do_sys_openat2 { printf("PID=%d COMM=%s\n", pid, comm )}'
Attaching 1 probe...
PID=313 COMM=systemd-journal
PID=313 COMM=systemd-journal
PID=313 COMM=systemd-journal
PID=313 COMM=systemd-journal
PID=313 COMM=systemd-journal
This achieves the same effect as the raw kprobe-based tracing shown earlier.
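bpftrace can also aggregate in the kernel instead of printing every event. For example, a one-liner that counts openat2 calls per process in a map (press Ctrl-C to stop; bpftrace prints the map automatically on exit):

# bpftrace -e 'kprobe:do_sys_openat2 { @opens[comm] = count(); }'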
Before diving into eBPF-based kernel observability, it helps to know the earlier BPF tooling. For BPF there is the BCC project, short for BPF Compiler Collection, which gathers a large set of BPF-based debugging tools.
bcc is a collection of BPF tools that makes it very convenient to track down performance problems in the operating system; its upstream repository is:
https://github.com/iovisor/bcc
The tools it contains are shown in the following diagram.
Don't be intimidated by the diagram at first sight; a quick explanation makes it easy to read.
The colored boxes are the main functional blocks of a Linux system, and each arrow pointing into a box is a tool used to observe and debug that block.
Two points follow from this:
If a box is unfamiliar, that is really an operating-system knowledge gap; brush up on the concepts first, or come back once they make sense.
If the words on the arrows are unfamiliar, no problem at all; try the tools one by one and they quickly become clear.
From this diagram, I have picked out the tools I personally find most useful and demonstrate them below.
To use the bcc tool set, two things are needed:
the kernel headers
the bcc tool package

2.1 Kernel headers

How to extract the headers for a given kernel has been covered in an earlier article (16 - kernel headers).
With the header archive at hand, the following is all that is needed:
mkdir /usr/lib/modules/$(uname -r) && cd /usr/lib/modules/$(uname -r)
tar xvzf ~/kernel-header.tar.gz -C /usr/lib/modules/$(uname -r)
mv kernel-header build
Of course, if the distribution's standard header package is available, simply installing it is enough; no further explanation needed.
For the bcc tools themselves, install the corresponding package:
apt install bpfcc-tools
Trace openat events system-wide:
# opensnoop-bpfcc
PID    COMM              FD  ERR PATH
309    systemd-journal   47    0 /proc/400/comm
309    systemd-journal   47    0 /proc/400/cmdline
309    systemd-journal   47    0 /proc/400/status
Trace process exit events:
# exitsnoop-bpfcc
PCOMM            PID     PPID    TID     AGE(s)  EXIT_CODE
preload          5586    640     5586    0.00    0
Detect potential deadlocks:
# deadlock-bpfcc $(pidof kylin-nm) --binary /usr/lib/aarch64-linux-gnu/libpthread.so.0
Tracing... Hit Ctrl-C to end.
Trace outstanding memory allocations:
# memleak-bpfcc -p $(pidof kylin-nm)
Attaching to pid 2444, Ctrl+C to quit.
[14:48:56] Top 10 stacks with outstanding allocations:
        8 bytes in 1 allocations from stack
                dbus_malloc+0x28 [libdbus-1.so.3.19.11]
                [unknown] [libdbus-1.so.3.19.11]
                dbus_message_copy+0xe0 [libdbus-1.so.3.19.11]
                [unknown] [libQt5DBus.so.5.12.8]
                [unknown] [libQt5DBus.so.5.12.8]
                [unknown] [libQt5DBus.so.5.12.8]
                [unknown] [libQt5DBus.so.5.12.8]
                [unknown] [libQt5DBus.so.5.12.8]
                [unknown] [libQt5DBus.so.5.12.8]
                QObject::event(QEvent*)+0x200 [libQt5Core.so.5.12.8]
                QCoreApplication::notifyInternal2(QObject*, QEvent*)+0x154 [libQt5Core.so.5.12.8]
                QCoreApplicationPrivate::sendPostedEvents(QObject*, int, QThreadData*)+0x168 [libQt5Core.so.5.12.8]
                [unknown] [libQt5Core.so.5.12.8]
                g_main_context_dispatch+0x274 [libglib-2.0.so.0.6400.6]
                [unknown] [libglib-2.0.so.0.6400.6]
                g_main_context_iteration+0x34 [libglib-2.0.so.0.6400.6]
                QEventDispatcherGlib::processEvents(QFlags<QEventLoop::ProcessEventsFlag>)+0x54 [libQt5Core.so.5.12.8]
                QEventLoop::exec(QFlags<QEventLoop::ProcessEventsFlag>)+0xf8 [libQt5Core.so.5.12.8]
See what commands other users are typing in bash:
# /usr/sbin/bashreadline-bpfcc
TIME      PID    COMMAND
14:56:38  3372   bash
14:56:40  22469  history
14:56:51  22469  dmesg
Show page cache hits and misses, plus the buffer and cache sizes:
# cachestat-bpfcc
    HITS   MISSES  DIRTIES HITRATIO   BUFFERS_MB  CACHED_MB
       3        0        0  100.00%           48        849
       1        0        0  100.00%           48        849
       3        0        0  100.00%           48        849
    4070        0       10  100.00%           48        849
    2867        0       84  100.00%           48        849
Show the distribution of on-CPU time:
# cpudist-bpfcc
Tracing on-CPU time... Hit Ctrl-C to end.
     usecs               : count     distribution
         0 -> 1          : 140      |*****                                   |
         2 -> 3          : 113      |****                                    |
         4 -> 7          : 229      |********                                |
         8 -> 15         : 619      |***********************                 |
        16 -> 31         : 1009     |**************************************  |
        32 -> 63         : 1051     |****************************************|
        64 -> 127        : 441      |****************                        |
       128 -> 255        : 351      |*************                           |
       256 -> 511        : 257      |*********                               |
       512 -> 1023       : 459      |*****************                       |
      1024 -> 2047       : 294      |***********                             |
      2048 -> 4095       : 284      |**********                              |
      4096 -> 8191       : 379      |**************                          |
      8192 -> 16383      : 363      |*************                           |
     16384 -> 32767      : 278      |**********                              |
Show dcache (directory entry cache) lookups per process:
# /usr/sbin/dcsnoop-bpfcc
TIME(s)     PID    COMM             T FILE
1.763159    311    systemd-journal  M display
1.763206    311    systemd-journal  M display
1.763218    311    systemd-journal  M log-extra-fields:dbus.service
3.762189    311    systemd-journal  M display
3.762222    311    systemd-journal  M display
3.762432    311    systemd-journal  M log-extra-fields:dbus.service
4.759580    416    NetworkManager   M timestamps.ILLFY2
Show, per open file, how often and how much it is read and written (this is the filetop-bpfcc tool):
15:59:07 loadavg: 0.06 0.08 0.09 1/656 17653

TID     COMM             READS  WRITES R_Kb    W_Kb    T FILE
17653   clear            2      0      60      0       R xterm
17346   filetop-bpfcc    2      0      15      0       R loadavg
17653   clear            2      0      0       0       R libmali.so.1.9.0
17653   clear            1      0      0       0       R libXau.so.6.0.0
17653   clear            1      0      0       0       R libX11.so.6.3.0
17653   clear            1      0      0       0       R libgcc_s.so.1
17653   clear            1      0      0       0       R libEGL.so.1
17653   clear            1      0      0       0       R libgbm.so.1
17653   clear            1      0      0       0       R libstdc++.so.6.0.28
17653   clear            1      0      0       0       R libGLESv2.so.2
Show per-process last-level cache hit rates:
~# llcstat-bpfcc
Running for 10 seconds or hit Ctrl-C to end.
^C
PID      NAME             CPU     REFERENCE         MISS    HIT%
415      dbus-daemon      1           20700         9400  54.59%
720      kworker/0:4H     0           19600         9800  50.00%
2351     QThread          2           75300         2700  96.41%
38823    sh               7           33500        12600  62.39%
14       rcuos/0          3           60500         1600  97.36%
53       rcuos/6          0           22700          300  98.68%
2384     QThread          0           39100         2300  94.12%
2822     QThread          3           72400         2900  95.99%
2487     ukui-tablet-des  2           92600         7500  91.90%
As you can see, bcc collects many useful BPF-based tracing tools; hopefully they are helpful.
Any machine depends on memory to run. Normally we would not suspect the memory hardware itself, but in the RAS (Reliability, Availability, Serviceability) domain, not suspecting it would be a mistake: memory can fail in many ways, from large-area damage to single-bit flips. This article does not discuss large-area damage, which is an unrecoverable fault; instead it looks at the case where a single-bit flip corrupts data, and at the hardware and software countermeasures on aarch64 chips.
Parity checking has been used since very early device bus communication on microcontrollers (SPI and the like), so most readers will have seen it before; here is a quick recap.
Parity adds one extra check bit to a group of data bits. The check bit is derived from the number of 1s in the data: if the count of 1s is odd the parity bit is 1, if it is even the parity bit is 0 (even parity).
If a single bit flips during transmission, the recomputed parity no longer matches the parity bit, so the error can be detected.
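A tiny sketch of the idea in C: compute the parity bit for a byte, flip one bit, and check again; any single-bit flip makes the check fail (illustrative code only, not how hardware implements it):

#include <stdio.h>

/* parity bit = 1 if the data contains an odd number of 1s */
static int parity(unsigned char v)
{
        int ones = 0;
        for (int i = 0; i < 8; i++)
                ones += (v >> i) & 1;
        return ones & 1;
}

int main(void)
{
        unsigned char data = 0xA5;      /* 1010 0101 -> four 1s -> parity 0 */
        int p = parity(data);

        data ^= 1 << 3;                 /* simulate a single-bit flip       */
        printf("check %s\n",
               parity(data) == p ? "passed" : "failed: bit flip detected");
        return 0;
}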
ECC memory (Error-Correcting Code memory) goes a step further. Parity can detect some errors in simple data transfers, but it cannot recover from them; ECC is a hardware technique that can actually correct bit-flip errors, and memory chips aimed at this class of system generally support it. Several error-correcting codes are used for this; the classic example is a SECDED Hamming code (single-error correct, double-error detect).
Having seen the common hardware-level corrections for memory, we also need to know how software handles and reports these ECC errors.
On the Arm architecture, the software side mainly involves two aspects, described below.
Arm defines a dedicated concept for this class of memory error: ESB (Error Synchronization Barrier), which can record synchronous memory errors.
The Arm specification describes ESB as follows.
In short, ESB acts as an error synchronization barrier in the Arm specification: the deferred error is recorded in the special register DISR (Deferred Interrupt Status Register) and can be retrieved from EL1 or EL2.
ESB only records this state when the RAS extension is implemented; otherwise the instruction executes as a NOP.
By default, ECC/parity errors on Arm are delivered to Linux through the MM fault path. The flow is as follows.
First, look at the exception vector table:
SYM_CODE_START(vectors)
        kernel_ventry   1, sync_invalid                 // Synchronous EL1t
        kernel_ventry   1, irq_invalid                  // IRQ EL1t
        kernel_ventry   1, fiq_invalid                  // FIQ EL1t
        kernel_ventry   1, error_invalid                // Error EL1t

        kernel_ventry   1, sync                         // Synchronous EL1h
        kernel_ventry   1, irq                          // IRQ EL1h
        kernel_ventry   1, fiq_invalid                  // FIQ EL1h
        kernel_ventry   1, error                        // Error EL1h

        kernel_ventry   0, sync                         // Synchronous 64-bit EL0
        kernel_ventry   0, irq                          // IRQ 64-bit EL0
        kernel_ventry   0, fiq_invalid                  // FIQ 64-bit EL0
        kernel_ventry   0, error                        // Error 64-bit EL0

#ifdef CONFIG_COMPAT
        kernel_ventry   0, sync_compat, 32              // Synchronous 32-bit EL0
        kernel_ventry   0, irq_compat, 32               // IRQ 32-bit EL0
        kernel_ventry   0, fiq_invalid_compat, 32       // FIQ 32-bit EL0
        kernel_ventry   0, error_compat, 32             // Error 32-bit EL0
#else
        kernel_ventry   0, sync_invalid, 32             // Synchronous 32-bit EL0
        kernel_ventry   0, irq_invalid, 32              // IRQ 32-bit EL0
        kernel_ventry   0, fiq_invalid, 32              // FIQ 32-bit EL0
        kernel_ventry   0, error_invalid, 32            // Error 32-bit EL0
#endif
SYM_CODE_END(vectors)
We take the EL0 sync exception as the example, since a synchronous memory error is delivered through the sync vector:
kernel_ventry 0, sync // Synchronous 64-bit EL0
The corresponding entry code is:
SYM_CODE_START_LOCAL_NOALIGN(el0_sync)
        kernel_entry 0
        mov     x0, sp
        bl      el0_sync_handler
        b       ret_to_user
SYM_CODE_END(el0_sync)
This jumps to el0_sync_handler, which is implemented as:
asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs)
{
        unsigned long esr = read_sysreg(esr_el1);

        switch (ESR_ELx_EC(esr)) {
        case ESR_ELx_EC_SVC64:
                el0_svc(regs);
                break;
        case ESR_ELx_EC_DABT_LOW:
                el0_da(regs, esr);
                break;
        case ESR_ELx_EC_IABT_LOW:
                el0_ia(regs, esr);
                break;
        case ESR_ELx_EC_FP_ASIMD:
                el0_fpsimd_acc(regs, esr);
                break;
        case ESR_ELx_EC_SVE:
                el0_sve_acc(regs, esr);
                break;
        case ESR_ELx_EC_FP_EXC64:
                el0_fpsimd_exc(regs, esr);
                break;
        case ESR_ELx_EC_SYS64:
        case ESR_ELx_EC_WFx:
                el0_sys(regs, esr);
                break;
        case ESR_ELx_EC_SP_ALIGN:
                el0_sp(regs, esr);
                break;
        case ESR_ELx_EC_PC_ALIGN:
                el0_pc(regs, esr);
                break;
        case ESR_ELx_EC_UNKNOWN:
                el0_undef(regs);
                break;
        case ESR_ELx_EC_BTI:
                el0_bti(regs);
                break;
        case ESR_ELx_EC_BREAKPT_LOW:
        case ESR_ELx_EC_SOFTSTP_LOW:
        case ESR_ELx_EC_WATCHPT_LOW:
        case ESR_ELx_EC_BRK64:
                el0_dbg(regs, esr);
                break;
        case ESR_ELx_EC_FPAC:
                el0_fpac(regs, esr);
                break;
        default:
                el0_inv(regs, esr);
        }
}
We care about the data abort case:
case ESR_ELx_EC_DABT_LOW: el0_da(regs, esr); break;
el0_da is implemented as:
static void noinstr el0_da(struct pt_regs *regs, unsigned long esr)
{
        unsigned long far = read_sysreg(far_el1);

        enter_from_user_mode();
        local_daif_restore(DAIF_PROCCTX);
        do_mem_abort(far, esr, regs);
}
Next, look at do_mem_abort:
void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
{
        const struct fault_info *inf = esr_to_fault_info(esr);
        unsigned long addr = untagged_addr(far);

        if (!inf->fn(far, esr, regs))
                return;

        if (!user_mode(regs)) {
                pr_alert("Unhandled fault at 0x%016lx\n", addr);
                trace_android_rvh_do_mem_abort(regs, esr, addr, inf->name);
                mem_abort_decode(esr);
                show_pte(addr);
        }

        /*
         * At this point we have an unrecognized fault type whose tag bits may
         * have been defined as UNKNOWN. Therefore we only expose the untagged
         * address to the signal handler.
         */
        arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
}
Note the helper esr_to_fault_info, which uses the fault status code in ESR to index a table:
static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
{
        return fault_info + (esr & ESR_ELx_FSC);
}
So the key data structure is the fault_info array:
static const struct fault_info fault_info[] = {
        { do_bad,               SIGKILL, SI_KERNEL,   "ttbr address size fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "level 1 address size fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "level 2 address size fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "level 3 address size fault" },
        { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" },
        { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
        { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
        { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 8" },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 12" },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR, "level 1 permission fault" },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
        { do_sea,               SIGBUS,  BUS_OBJERR,  "synchronous external abort" },
        { do_tag_check_fault,   SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 18" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 19" },
        { do_sea,               SIGKILL, SI_KERNEL,   "level 0 (translation table walk)" },
        { do_sea,               SIGKILL, SI_KERNEL,   "level 1 (translation table walk)" },
        { do_sea,               SIGKILL, SI_KERNEL,   "level 2 (translation table walk)" },
        { do_sea,               SIGKILL, SI_KERNEL,   "level 3 (translation table walk)" },
        { do_sea,               SIGBUS,  BUS_OBJERR,  "synchronous parity or ECC error" },  // Reserved when RAS is implemented
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 25" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 26" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 27" },
        { do_sea,               SIGKILL, SI_KERNEL,   "level 0 synchronous parity error (translation table walk)" },   // Reserved when RAS is implemented
        { do_sea,               SIGKILL, SI_KERNEL,   "level 1 synchronous parity error (translation table walk)" },   // Reserved when RAS is implemented
        { do_sea,               SIGKILL, SI_KERNEL,   "level 2 synchronous parity error (translation table walk)" },   // Reserved when RAS is implemented
        { do_sea,               SIGKILL, SI_KERNEL,   "level 3 synchronous parity error (translation table walk)" },   // Reserved when RAS is implemented
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 32" },
        { do_alignment_fault,   SIGBUS,  BUS_ADRALN,  "alignment fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 34" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 35" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 36" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 37" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 38" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 39" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 40" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 41" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 42" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 43" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 44" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 45" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 46" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 47" },
        { do_bad,               SIGKILL, SI_KERNEL,   "TLB conflict abort" },
        { do_bad,               SIGKILL, SI_KERNEL,   "Unsupported atomic hardware update fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 50" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 51" },
        { do_bad,               SIGKILL, SI_KERNEL,   "implementation fault (lockdown abort)" },
        { do_bad,               SIGBUS,  BUS_OBJERR,  "implementation fault (unsupported exclusive)" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 54" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 55" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 56" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 57" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 58" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 59" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 60" },
        { do_bad,               SIGKILL, SI_KERNEL,   "section domain fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "page domain fault" },
        { do_bad,               SIGKILL, SI_KERNEL,   "unknown 63" },
};
The entry we care about is the ECC/parity one:
{ do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" }, // Reserved when RAS is implemented
So a typical ECC/parity error is handled by do_sea in software. From here we want to know what the kernel does once it has taken the error, so look at arm64_notify_die:
void arm64_notify_die(const char *str, struct pt_regs *regs,
                      int signo, int sicode, unsigned long far,
                      int err)
{
        if (user_mode(regs)) {
                WARN_ON(regs != current_pt_regs());
                current->thread.fault_address = 0;
                current->thread.fault_code = err;

                arm64_force_sig_fault(signo, sicode, far, str);
        } else {
                die(str, regs, err);
        }
}
Here user space and kernel space are handled differently.
For user space, arm64_force_sig_fault is called, which delivers the SIGBUS error to the task:
void arm64_force_sig_fault(int signo, int code, unsigned long far,
                           const char *str)
{
        arm64_show_signal(signo, str);
        if (signo == SIGKILL)
                force_sig(SIGKILL);
        else
                force_sig_fault(signo, code, (void __user *)far);
}
force_sig_fault is already the core of the signal delivery path, so we stop the analysis here.
For kernel space, die() is called, which produces an oops directly; if panic_on_oops is set, the system panics.
void die(const char *str, struct pt_regs *regs, int err)
{
        oops_exit();

        if (in_interrupt())
                panic("%s: Fatal exception in interrupt", str);
        if (panic_on_oops)
                panic("%s: Fatal exception", str);
}
To sum up: when an ECC error occurs, the aarch64 core takes a synchronous exception. Taking EL0 as the example, the exception vector leads to do_sea, and depending on where the faulting access happened, either the user-space process is terminated with a bus error (SIGBUS) or, for a kernel-space fault, an oops is raised.
SDEI is a software interface defined by the Arm architecture; the full name, Software Delegated Exception Interface, says it all. Its model is that the non-secure OS registers callbacks for events,
while the dispatcher described in the spec is implemented in the secure world. The flow is as follows.
SDEI defines a series of interactions, shown below.
This describes the hand-shake around an SDEI handler.
We focus on the entry trampoline:
SYM_CODE_START(__sdei_asm_entry_trampoline)
        mrs     x4, ttbr1_el1
        tbz     x4, #USER_ASID_BIT, 1f

        tramp_map_kernel tmp=x4
        isb
        mov     x4, xzr

        /*
         * Use reg->interrupted_regs.addr_limit to remember whether to unmap
         * the kernel on exit.
         */
1:      str     x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)]

        tramp_data_read_var     x4, __sdei_asm_handler
        br      x4
SYM_CODE_END(__sdei_asm_entry_trampoline)
The handler it jumps to is implemented as follows:
/* * Software Delegated Exception entry point. * * x0: Event number * x1: struct sdei_registered_event argument from registration time. * x2: interrupted PC * x3: interrupted PSTATE * x4: maybe clobbered by the trampoline * * Firmware has preserved x0->x17 for us, we must save/restore the rest to * follow SMC-CC. We save (or retrieve) all the registers as the handler may * want them. */ SYM_CODE_START(__sdei_asm_handler) stp x2, x3, [x1, #SDEI_EVENT_INTREGS + S_PC] stp x4, x5, [x1, #SDEI_EVENT_INTREGS + 16 * 2] stp x6, x7, [x1, #SDEI_EVENT_INTREGS + 16 * 3] stp x8, x9, [x1, #SDEI_EVENT_INTREGS + 16 * 4] stp x10, x11, [x1, #SDEI_EVENT_INTREGS + 16 * 5] stp x12, x13, [x1, #SDEI_EVENT_INTREGS + 16 * 6] stp x14, x15, [x1, #SDEI_EVENT_INTREGS + 16 * 7] stp x16, x17, [x1, #SDEI_EVENT_INTREGS + 16 * 8] stp x18, x19, [x1, #SDEI_EVENT_INTREGS + 16 * 9] stp x20, x21, [x1, #SDEI_EVENT_INTREGS + 16 * 10] stp x22, x23, [x1, #SDEI_EVENT_INTREGS + 16 * 11] stp x24, x25, [x1, #SDEI_EVENT_INTREGS + 16 * 12] stp x26, x27, [x1, #SDEI_EVENT_INTREGS + 16 * 13] stp x28, x29, [x1, #SDEI_EVENT_INTREGS + 16 * 14] mov x4, sp stp lr, x4, [x1, #SDEI_EVENT_INTREGS + S_LR] mov x19, x1 /* Store the registered-event for crash_smp_send_stop() */ ldrb w4, [x19, #SDEI_EVENT_PRIORITY] cbnz w4, 1f adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6 b 2f 1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6 2: str x19, [x5] #ifdef CONFIG_VMAP_STACK /* * entry.S may have been using sp as a scratch register, find whether * this is a normal or critical event and switch to the appropriate * stack for this CPU. */ cbnz w4, 1f ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6 b 2f 1: ldr_this_cpu dst=x5, sym=sdei_stack_critical_ptr, tmp=x6 2: mov x6, #SDEI_STACK_SIZE add x5, x5, x6 mov sp, x5 #endif #ifdef CONFIG_SHADOW_CALL_STACK /* Use a separate shadow call stack for normal and critical events */ cbnz w4, 3f ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal_ptr, tmp=x6 b 4f 3: ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical_ptr, tmp=x6 4: #endif /* * We may have interrupted userspace, or a guest, or exit-from or * return-to either of these. We can't trust sp_el0, restore it. */ mrs x28, sp_el0 ldr_this_cpu dst=x0, sym=__entry_task, tmp=x1 msr sp_el0, x0 /* If we interrupted the kernel point to the previous stack/frame. */ and x0, x3, #0xc mrs x1, CurrentEL cmp x0, x1 csel x29, x29, xzr, eq // fp, or zero csel x4, x2, xzr, eq // elr, or zero stp x29, x4, [sp, #-16]! mov x29, sp add x0, x19, #SDEI_EVENT_INTREGS mov x1, x19 bl __sdei_handler msr sp_el0, x28 /* restore regs >x17 that we clobbered */ mov x4, x19 // keep x4 for __sdei_asm_exit_trampoline ldp x28, x29, [x4, #SDEI_EVENT_INTREGS + 16 * 14] ldp x18, x19, [x4, #SDEI_EVENT_INTREGS + 16 * 9] ldp lr, x1, [x4, #SDEI_EVENT_INTREGS + S_LR] mov sp, x1 mov x1, x0 // address to complete_and_resume /* x0 = (x0 <= 1) ? 
EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */ cmp x0, #1 mov_q x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE mov_q x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME csel x0, x2, x3, ls ldr_l x2, sdei_exit_mode /* Clear the registered-event seen by crash_smp_send_stop() */ ldrb w3, [x4, #SDEI_EVENT_PRIORITY] cbnz w3, 1f adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6 b 2f 1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6 2: str xzr, [x5] alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 sdei_handler_exit exit_mode=x2 alternative_else_nop_endif #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline, tmp=x3 br x5 #endif SYM_CODE_END(__sdei_asm_handler) NOKPROBE(__sdei_asm_handler)
Within this handler we follow the call:
bl __sdei_handler
Its implementation:
asmlinkage noinstr unsigned long __sdei_handler(struct pt_regs *regs,
                                                struct sdei_registered_event *arg)
{
        unsigned long ret;

        arm64_enter_nmi(regs);

        ret = _sdei_handler(regs, arg);

        arm64_exit_nmi(regs);

        return ret;
}
_sdei_handler processes the event according to the SDEI event-handler protocol:
static __kprobes unsigned long _sdei_handler(struct pt_regs *regs,
                                             struct sdei_registered_event *arg)
{
        u32 mode;
        int i, err = 0;
        int clobbered_registers = 4;
        u64 elr = read_sysreg(elr_el1);
        u32 kernel_mode = read_sysreg(CurrentEL) | 1;   /* +SPSel */
        unsigned long vbar = read_sysreg(vbar_el1);

        if (arm64_kernel_unmapped_at_el0())
                clobbered_registers++;

        /* Retrieve the missing registers values */
        for (i = 0; i < clobbered_registers; i++) {
                /* from within the handler, this call always succeeds */
                sdei_api_event_context(i, &regs->regs[i]);
        }

        /*
         * We didn't take an exception to get here, set PAN. UAO will be cleared
         * by sdei_event_handler()s force_uaccess_begin() call.
         */
        __uaccess_enable_hw_pan();

        err = sdei_event_handler(regs, arg);
        if (err)
                return SDEI_EV_FAILED;

        if (elr != read_sysreg(elr_el1)) {
                /*
                 * We took a synchronous exception from the SDEI handler.
                 * This could deadlock, and if you interrupt KVM it will
                 * hyp-panic instead.
                 */
                pr_warn("unsafe: exception during handler\n");
        }

        mode = regs->pstate & (PSR_MODE32_BIT | PSR_MODE_MASK);

        /*
         * If we interrupted the kernel with interrupts masked, we always go
         * back to wherever we came from.
         */
        if (mode == kernel_mode && !interrupts_enabled(regs))
                return SDEI_EV_HANDLED;

        /*
         * Otherwise, we pretend this was an IRQ. This lets user space tasks
         * receive signals before we return to them, and KVM to invoke it's
         * world switch to do the same.
         *
         * See DDI0487B.a Table D1-7 'Vector offsets from vector table base
         * address'.
         */
        if (mode == kernel_mode)
                return vbar + 0x280;
        else if (mode & PSR_MODE32_BIT)
                return vbar + 0x680;

        return vbar + 0x480;
}
The interesting call here is sdei_event_handler; at this point we are in the firmware driver registered via ACPI/FDT:
int sdei_event_handler(struct pt_regs *regs,
                       struct sdei_registered_event *arg)
{
        int err;
        mm_segment_t orig_addr_limit;
        u32 event_num = arg->event_num;

        /*
         * Save restore 'fs'.
         * The architecture's entry code save/restores 'fs' when taking an
         * exception from the kernel. This ensures addr_limit isn't inherited
         * if you interrupted something that allowed the uaccess routines to
         * access kernel memory.
         * Do the same here because this doesn't come via the same entry code.
         */
        orig_addr_limit = force_uaccess_begin();

        err = arg->callback(event_num, regs, arg->callback_arg);
        if (err)
                pr_err_ratelimited("event %u on CPU %u failed with error: %d\n",
                                   event_num, smp_processor_id(), err);

        force_uaccess_end(orig_addr_limit);

        return err;
}
NOKPROBE_SYMBOL(sdei_event_handler);
From here on, the flow follows exactly the interaction defined by SDEI.
While going through this code and learning about SDEI, I came across the rasdaemon project, a user-space daemon whose purpose is to catch common RAS-domain errors, including the single-bit memory flips discussed here.
rasdaemon can count memory errors and make the statistics available to the user.
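rasdaemon ships with the ras-mc-ctl helper, which queries the errors the daemon has recorded; as a hedged illustration of the kind of queries it supports (the actual output depends entirely on the platform):

ras-mc-ctl --summary
ras-mc-ctl --errors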
Unfortunately this part is not implemented for aarch64, but on AMD we can see the following implementation:
parse_amd_smca_event--->decode_smca_error
Taking one of the descriptor tables in smca_mce_descs as an example:
static const char * const smca_smu2_mce_desc[] = {
        "High SRAM ECC or parity error",
        "Low SRAM ECC or parity error",
        "Data Cache Bank A ECC or parity error",
        "Data Cache Bank B ECC or parity error",
        "Data Tag Cache Bank A ECC or parity error",
        "Data Tag Cache Bank B ECC or parity error",
        "Instruction Cache Bank A ECC or parity error",
        "Instruction Cache Bank B ECC or parity error",
        "Instruction Tag Cache Bank A ECC or parity error",
        "Instruction Tag Cache Bank B ECC or parity error",
        "System Hub Read Buffer ECC or parity error",
        "PHY RAS ECC Error",
        [12 ... 57] = "Reserved",
        "A correctable error from a GFX Sub-IP",
        "A fatal error from a GFX Sub-IP",
        "Reserved",
        "Reserved",
        "A poison error from a GFX Sub-IP",
        "Reserved",
};
As you can see, these descriptors cover both ECC and parity errors.
Since I do not have a suitable machine at hand, I have not verified rasdaemon in practice.
In one project, a customer's USB flash drive would hang the kernel hard when plugged into a USB 3.0 port, stuck in the classic blk_update_request path. We knew this had to be a synchronization problem: synchronous I/O must go through the USB controller to perform the real I/O and land on the usb-storage device. The problem itself was simple enough: the USB controller driver was at fault, and the root cause was the hardware.
But a colleague asked: if it is a synchronization primitive, is it a spinlock, a mutex, or something else? I could not give the right answer on the spot. And a second question: the issue was fixed, but what did I actually learn from it? Not USB, and not the file system either. What I could really take away was the synchronization primitives, or bluntly, the locks. So, to understand locks, this article introduces the ABBA deadlock scenario.
Starting from that project issue, this article branches out into ABBA deadlocks: it answers how the different lock types behave on the surface, and gives a simple introduction to the classic ABBA problem. Nothing more than that.
There are four main synchronization primitives relevant here; detailed introductions can be found in many books:
Spinlock: spins while contended; the critical section must not sleep.
Mutex: may sleep while contended.
Semaphore: behaves like a mutex here.
Completion: waits for something to complete.
Each of these has variants. Apart from the completion, all of them can produce an ABBA deadlock when used carelessly. We therefore only discuss the three basic locks above and ignore the variants, such as seqlocks, rwlocks, rw-semaphores, counting semaphores, and so on.
What is an ABBA deadlock? See the figure below.
Across two threads, thread1 takes lock A and then tries lock B, while thread2 has taken lock B and tries lock A. At that point neither thread can get the second lock, they block each other forever, and that is the classic ABBA deadlock model.
The ABBA idea is simple, but real-world ABBA scenarios in code are much more complicated. Here we use the simplest possible driver to demonstrate the problem; the full code is:
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/delay.h>

static DEFINE_SPINLOCK(spinlock_a);
static DEFINE_SPINLOCK(spinlock_b);
static DEFINE_MUTEX(mutex_a);
static DEFINE_MUTEX(mutex_b);
static DEFINE_SEMAPHORE(semaphore_a);
static DEFINE_SEMAPHORE(semaphore_b);

static int testsuite = 0;
module_param(testsuite, int, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP);
MODULE_PARM_DESC(testsuite, "Test Lock Suilte");

static struct task_struct *thread1, *thread2;

static int thread_1(void *arg)
{
        char *name = (char *)arg;

        pr_info("Thread %s starting...\n", name);
        while (!kthread_should_stop()) {
                schedule_timeout(msecs_to_jiffies(1000));

                if (testsuite == 0x1) {
                        spin_lock(&spinlock_a);
                        pr_info("Thread %s hold spinlock_a trying spinlock_b \n", name);
                        spin_lock(&spinlock_b);
                        pr_info("%s:Do something...\n", name);
                        spin_unlock(&spinlock_b);
                        spin_unlock(&spinlock_a);
                        pr_info("Thread %s unlock\n", name);
                }

                if (testsuite == 0x2) {
                        mutex_lock(&mutex_a);
                        pr_info("Thread %s hold mutex_a trying mutex_b\n", name);
                        mutex_lock(&mutex_b);
                        pr_info("%s:Do something...\n", name);
                        mutex_unlock(&mutex_b);
                        mutex_unlock(&mutex_a);
                }

                if (testsuite == 0x3) {
                        down(&semaphore_a);
                        pr_info("Thread %s hold semaphore_a trying semaphore_b\n", name);
                        down(&semaphore_b);
                        pr_info("%s:Do something...\n", name);
                        up(&semaphore_b);
                        up(&semaphore_a);
                }
        }
        return 0;
}

static int thread_2(void *arg)
{
        char *name = (char *)arg;

        pr_info("Thread %s starting...\n", name);
        while (!kthread_should_stop()) {
                schedule_timeout(msecs_to_jiffies(1000));

                if (testsuite == 0x1) {
                        spin_lock(&spinlock_b);
                        pr_info("Thread %s hold spinlock_b trying spinlock_a \n", name);
                        spin_lock(&spinlock_a);
                        pr_info("%s:Do something...\n", name);
                        spin_unlock(&spinlock_a);
                        spin_unlock(&spinlock_b);
                        pr_info("Thread %s unlock\n", name);
                }

                if (testsuite == 0x2) {
                        mutex_lock(&mutex_b);
                        pr_info("Thread %s hold mutex_b trying mutex_a\n", name);
                        mutex_lock(&mutex_a);
                        pr_info("%s:Do something...\n", name);
                        mutex_unlock(&mutex_a);
                        mutex_unlock(&mutex_b);
                }

                if (testsuite == 0x3) {
                        down(&semaphore_b);
                        pr_info("Thread %s hold semaphore_b trying semaphore_a\n", name);
                        down(&semaphore_a);
                        pr_info("%s:Do something...\n", name);
                        up(&semaphore_a);
                        up(&semaphore_b);
                }
        }
        return 0;
}

static void start_test(void)
{
        thread1 = kthread_run(thread_1, "Thread-1", "spinlock_thread1");
        thread2 = kthread_run(thread_2, "Thread-2", "spinlock_thread2");
        return;
}

static int __init test_init(void)
{
        start_test();
        return 0;
}

static void __exit test_exit(void)
{
        kthread_stop(thread1);
        kthread_stop(thread2);
        return;
}

module_init(test_init);
module_exit(test_exit);

MODULE_AUTHOR("tangfeng <tangfeng@kylinos.cn>");
MODULE_DESCRIPTION("Test spinlock/mutex/semaphore");
MODULE_LICENSE("GPL");
For the spinlock version of ABBA, the two threads do the following.
thread1:
spin_lock(&spinlock_a);
pr_info("Thread %s hold spinlock_a trying spinlock_b \n", name);
spin_lock(&spinlock_b);
pr_info("%s:Do something...\n", name);
spin_unlock(&spinlock_b);
spin_unlock(&spinlock_a);
pr_info("Thread %s unlock\n", name);
thread2:
spin_lock(&spinlock_b);
pr_info("Thread %s hold spinlock_b trying spinlock_a \n", name);
spin_lock(&spinlock_a);
pr_info("%s:Do something...\n", name);
spin_unlock(&spinlock_a);
spin_unlock(&spinlock_b);
pr_info("Thread %s unlock\n", name);
Loading and running the module produces the following log:
[  247.149092] test: module is from the staging directory, the quality is unknown, you have been warned.
[  247.150668] Thread Thread-1 starting...
[  247.150736] Thread Thread-1 hold spinlock_a trying spinlock_b
[  247.150740] Thread-1:Do something...
[  247.150744] Thread Thread-1 unlock
[  247.150748] Thread Thread-2 starting...
[  247.150753] Thread Thread-2 hold spinlock_b trying spinlock_a
[  247.150755] Thread Thread-1 hold spinlock_a trying spinlock_b
[  307.143419] rcu: INFO: rcu_sched self-detected stall on CPU
[  307.143446] rcu:     0-....: (17996 ticks this GP) idle=6a2/1/0x4000000000000002 softirq=4790/4790 fqs=5944 last_accelerate: 0000/04c6 dyntick_enabled: 0
[  307.143453]  (t=18000 jiffies g=14105 q=7019)
[  307.143458] Task dump for CPU 0:
[  307.143464] task:spinlock_thread state:R  running task     stack:    0 pid: 7795 ppid:     2 flags:0x0000000a
[  307.143474] Call trace:
[  307.143484]  dump_backtrace+0x0/0x1e8
[  307.143491]  show_stack+0x1c/0x28
[  307.143498]  sched_show_task+0x154/0x178
[  307.143505]  dump_cpu_task+0x48/0x54
[  307.143511]  rcu_dump_cpu_stacks+0xbc/0xfc
[  307.143517]  rcu_sched_clock_irq+0x8b0/0x9d8
[  307.143524]  update_process_times+0x64/0xa0
[  307.143530]  tick_sched_handle.isra.0+0x38/0x58
[  307.143535]  tick_sched_timer+0x50/0xa0
[  307.143540]  __hrtimer_run_queues+0x148/0x2d8
[  307.143546]  hrtimer_interrupt+0xec/0x240
[  307.143553]  arch_timer_handler_phys+0x38/0x48
[  307.143559]  handle_percpu_devid_irq+0x8c/0x210
[  307.143565]  __handle_domain_irq+0x78/0xd8
[  307.143571]  gic_handle_irq+0x88/0x2d8
[  307.143576]  el1_irq+0xc8/0x180
[  307.143582]  queued_spin_lock_slowpath+0x128/0x3b0
[  307.143587]  do_raw_spin_lock+0xd4/0x130
[  307.143594]  _raw_spin_lock+0x14/0x20
[  307.143605]  thread_1+0x174/0x248 [test]
[  307.143612]  kthread+0x100/0x130
[  307.143618]  ret_from_fork+0x10/0x18
[  307.143622] Task dump for CPU 3:
[  307.143626] task:spinlock_thread state:R  running task     stack:    0 pid: 7796 ppid:     2 flags:0x0000000a
[  307.143635] Call trace:
[  307.143640]  __switch_to+0xe4/0x138
[  307.143645]  0xffffffc00d77be00
[  307.143651]  kthread+0x100/0x130
[  307.143656]  ret_from_fork+0x10/0x18
[  308.190088] rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 0-... 3-... } 18312 jiffies s: 753 root: 0x9/.
[  308.190098] rcu: blocking rcu_node structures:
[  308.190101] Task dump for CPU 0:
[  308.190103] task:spinlock_thread state:R  running task     stack:    0 pid: 7795 ppid:     2 flags:0x0000000a
[  308.190107] Call trace:
[  308.190112]  __switch_to+0xe4/0x138
[  308.190115]  0xffffffc00d73be00
[  308.190118]  kthread+0x100/0x130
[  308.190121]  ret_from_fork+0x10/0x18
[  308.190123] Task dump for CPU 3:
[  308.190124] task:spinlock_thread state:R  running task     stack:    0 pid: 7796 ppid:     2 flags:0x0000000a
[  308.190128] Call trace:
[  308.190130]  __switch_to+0xe4/0x138
[  308.190132]  0xffffffc00d77be00
[  308.190134]  kthread+0x100/0x130
[  308.190137]  ret_from_fork+0x10/0x18
From the log we can see:
[  247.150753] Thread Thread-2 hold spinlock_b trying spinlock_a
[  247.150755] Thread Thread-1 hold spinlock_a trying spinlock_b
thread2 took spinlock_b first and then tried spinlock_a, while thread1 had taken spinlock_a and was trying spinlock_b. That is the typical ABBA deadlock state.
Note the following kernel configuration:
zcat /proc/config.gz | grep CONFIG_RCU_CPU_STALL_TIMEOUT
CONFIG_RCU_CPU_STALL_TIMEOUT=60
So it is the RCU CPU stall detector that catches the spinlock ABBA deadlock: the two CPUs spin forever and never reach a quiescent state.
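The standard fix is just as simple: both threads must take the locks in the same global order, so the circular wait can never form. A minimal sketch against the module above, changing only thread_2 (identifiers reuse the ones defined in the test module):

/* thread_2, fixed: acquire spinlock_a before spinlock_b, the same order as thread_1 */
spin_lock(&spinlock_a);
spin_lock(&spinlock_b);
pr_info("%s:Do something...\n", name);
spin_unlock(&spinlock_b);
spin_unlock(&spinlock_a);

With a single agreed order (A before B), one thread may have to wait, but the ABBA cycle can no longer occur; the same rule applies unchanged to the mutex and semaphore variants below.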
The mutex version of the two threads looks like this.
thread1:
mutex_lock(&mutex_a);
pr_info("Thread %s hold mutex_a trying mutex_b\n", name);
mutex_lock(&mutex_b);
pr_info("%s:Do something...\n", name);
mutex_unlock(&mutex_b);
mutex_unlock(&mutex_a);
thread2:
mutex_lock(&mutex_b);
pr_info("Thread %s hold mutex_b trying mutex_a\n", name);
mutex_lock(&mutex_a);
pr_info("%s:Do something...\n", name);
mutex_unlock(&mutex_a);
mutex_unlock(&mutex_b);
Loading and running gives the following log:
[  111.601145] Thread Thread-1 starting...
[  111.601222] Thread Thread-1 hold mutex_a trying mutex_b
[  111.601226] Thread-1:Do something...
[  111.601230] Thread Thread-1 hold mutex_a trying mutex_b
[  111.601234] Thread-1:Do something...
[  111.601239] Thread Thread-1 hold mutex_a trying mutex_b
[  111.601244] Thread Thread-2 starting...
[  111.601248] Thread Thread-2 hold mutex_b trying mutex_a
thread1 locked mutex_a and then tried mutex_b; meanwhile thread2 started, locked mutex_b, and tried mutex_a, completing the ABBA deadlock.
Note, however, that this time the RCU stall detector does not fire: a mutex sleeps when it loses the race, so the system as a whole keeps running normally. To detect deadlocks on sleeping locks we need the hung-task detector, where the kernel starts the khungtaskd thread to watch for blocked tasks, enabled with:
CONFIG_DETECT_HUNG_TASK=y
CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120
With hung-task support enabled, the khungtaskd kernel thread is running by default:
root@kylin:~# ps -ax | grep hungtaskd
     71 ?        S      0:00 [khungtaskd]
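The timeout compiled in above can also be read and tuned at run time through procfs (the same knob the warning in the log below refers to); the values here are only an example:

cat /proc/sys/kernel/hung_task_timeout_secs
120
echo 60 > /proc/sys/kernel/hung_task_timeout_secs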
Now the mutex ABBA deadlock shows up with full stack traces:
[  248.461569] INFO: task spinlock_thread:4537 blocked for more than 122 seconds.
[  248.461598]       Tainted: G         C        5.10.198 #20
[  248.461603] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  248.461609] task:spinlock_thread state:D stack:    0 pid: 4537 ppid:     2 flags:0x00000008
[  248.461618] Call trace:
[  248.461629]  __switch_to+0xe4/0x138
[  248.461637]  __schedule+0x2b4/0x818
[  248.461643]  schedule+0x4c/0xd0
[  248.461649]  schedule_preempt_disabled+0x14/0x20
[  248.461655]  __mutex_lock.isra.0+0x184/0x588
[  248.461660]  __mutex_lock_slowpath+0x18/0x20
[  248.461665]  mutex_lock+0x7c/0x88
[  248.461676]  thread_1+0x1f4/0x220 [test]
[  248.461682]  kthread+0x100/0x130
[  248.461688]  ret_from_fork+0x10/0x18
[  248.461693] INFO: task spinlock_thread:4538 blocked for more than 122 seconds.
[  248.461698]       Tainted: G         C        5.10.198 #20
[  248.461702] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  248.461706] task:spinlock_thread state:D stack:    0 pid: 4538 ppid:     2 flags:0x00000008
[  248.461712] Call trace:
[  248.461718]  __switch_to+0xe4/0x138
[  248.461723]  __schedule+0x2b4/0x818
[  248.461728]  schedule+0x4c/0xd0
[  248.461733]  schedule_preempt_disabled+0x14/0x20
[  248.461738]  __mutex_lock.isra.0+0x184/0x588
[  248.461744]  __mutex_lock_slowpath+0x18/0x20
[  248.461754]  mutex_lock+0x7c/0x88
[  248.461761]  thread_2+0xec/0x180 [test]
[  248.461766]  kthread+0x100/0x130
[  248.461771]  ret_from_fork+0x10/0x18
The semaphore version of the two threads is as follows.
thread1:
down(&semaphore_a);
pr_info("Thread %s hold semaphore_a trying semaphore_b\n", name);
down(&semaphore_b);
pr_info("%s:Do something...\n", name);
up(&semaphore_b);
up(&semaphore_a);
thread2:
down(&semaphore_b);
pr_info("Thread %s hold semaphore_b trying semaphore_a\n", name);
down(&semaphore_a);
pr_info("%s:Do something...\n", name);
up(&semaphore_a);
up(&semaphore_b);
Loading and running gives:
[   41.624824] Thread Thread-1 starting...
[   41.625040] Thread Thread-1 hold semaphore_a trying semaphore_b
[   41.625044] Thread-1:Do something...
[   41.625050] Thread Thread-2 starting...
[   41.625051] Thread Thread-1 hold semaphore_a trying semaphore_b
[   41.625055] Thread-1:Do something...
[   41.625076] Thread Thread-1 hold semaphore_a trying semaphore_b
[   41.625130] Thread Thread-2 hold semaphore_b trying semaphore_a
[  248.458566] INFO: task spinlock_thread:3817 blocked for more than 122 seconds.
[  248.458597]       Tainted: G         C        5.10.198 #20
[  248.458602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  248.458607] task:spinlock_thread state:D stack:    0 pid: 3817 ppid:     2 flags:0x00000008
[  248.458616] Call trace:
[  248.458627]  __switch_to+0xe4/0x138
[  248.458635]  __schedule+0x2b4/0x818
[  248.458640]  schedule+0x4c/0xd0
[  248.458646]  schedule_timeout+0x290/0x2f0
[  248.458652]  __down+0x74/0xd0
[  248.458659]  down+0x50/0x68
[  248.458670]  thread_1+0xb4/0x220 [test]
[  248.458677]  kthread+0x100/0x130
[  248.458683]  ret_from_fork+0x10/0x18
[  248.458688] INFO: task spinlock_thread:3818 blocked for more than 122 seconds.
[  248.458693]       Tainted: G         C        5.10.198 #20
[  248.458696] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  248.458701] task:spinlock_thread state:D stack:    0 pid: 3818 ppid:     2 flags:0x00000008
[  248.458707] Call trace:
[  248.458712]  __switch_to+0xe4/0x138
[  248.458718]  __schedule+0x2b4/0x818
[  248.458723]  schedule+0x4c/0xd0
[  248.458728]  schedule_timeout+0x290/0x2f0
[  248.458733]  __down+0x74/0xd0
[  248.458738]  down+0x50/0x68
[  248.458745]  thread_2+0x140/0x180 [test]
[  248.458751]  kthread+0x100/0x130
[  248.458756]  ret_from_fork+0x10/0x18
Because the semaphore implementation is similar to the mutex, hungtask detects the semaphore ABBA case as well.
The above shows how the problem manifests. To actually debug locks, the kernel's LOCKDEP should be enabled; the main options are:
CONFIG_LOCK_STAT=y
CONFIG_PROVE_LOCKING=y
CONFIG_DEBUG_LOCKDEP=y
With lockdep enabled, the log describes the ABBA dependency much more clearly, for example:
[  371.125970] ======================================================
[  371.132162] [ INFO: possible circular locking dependency detected ]
[  371.138445] 4.9.88 #2 Tainted: G           O
[  371.142987] -------------------------------------------------------
[  371.149265] kworker/0:2/104 is trying to acquire lock:
..............................
For further inspection, the lockdep state can be read directly:
cat /proc/lockdep
This lists the addresses of all lock classes in the system:
all lock classes:
0000000044beef8b ....: logbuf_lock
00000000c80448bf ....: (console_sem).lock
00000000700ad619 ....: console_lock
00000000ef505732 ....: cgroup_mutex
0000000042291e92 ....: console_owner_lock
000000002e29cf8c ....: console_owner
.....................
Lockdep statistics are also available:
cat /proc/lockdep_stats
 lock-classes:                         1851 [max: 8191]
 direct dependencies:                     0 [max: 32768]
 indirect dependencies:                   0
 all direct dependencies:                 0
 in-hardirq chains:                       0
....................