A: BUG现象
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | *****:[cpu6] soft lockup - now [16002397s],touch_timestamp[16002307s]!*********************** BUG: soft lockup - CPU#6 stuck for 90s! [swapper:0] Modules linked in: kgdboe Cpu 6 $ 0 : 0000000000000000 ffffffff81590030 ffffffff81132400 0000000000000001 $ 4 : 0000000000000001 a800000031514000 ffffffff81596100 ffffffffffff00fe $ 8 : 0000000000000000 a8000000315cfed0 0000000000000018 0000000000000001 $12 : 0000000000000000 0000000000008c00 a8000000315cc000 0000000000000000 $16 : ffffffff81597a20 0000000000000040 ffffffff81590000 96f513832dea2706 $20 : 2dea27065bd44e0c 5bd44e0cb7a89c19 b7a89c196f513832 6f513832dea27065 $24 : 0000000000000002 ffffffff81106e40 $28 : a800000031514000 a800000031517fc0 a89c196f513832de ffffffff81134b64 Hi : 0000000000000000 Lo : 0000000000000000 epc : ffffffff81132420 r4k_wait+0x20/0x40 Not tainted ra : ffffffff81134b64 cpu_idle+0x7c/0xb8 Status: 10008ce3 KX SX UX KERNEL EXL IE Cause : 40808000 PrId : 000d0408 (Cavium Octeon) ... ************************************************************************************** |
B: BUG重现步骤&现场分析
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | 1: The "cpu sched clock" is at "jiffies A" prior to a call to kgdb_handle_exception(). 2: The debugger waits in kgdb_handle_exception() for 80 seconds; on exit, touch_softlockup_watchdog() is called. 3: The value of jiffies was not updated while in kgdb because interrupts were disabled, so the touch_timestamp of the softlockup watchdog is still "jiffies A" in the first timer interrupt after resuming from kgdb_handle_exception(). 4: jiffies is then updated to "jiffies B" on the tick_do_timer_cpu, so "jiffies B" = "jiffies A" + 80 seconds. 5: Because ("jiffies B" - "jiffies A") >= 60 seconds, the softlockup warning is triggered. |
C: BUG触发原因
While in kgdb, jiffies was not updated because interrupts were disabled.
D: BUG解决方法
在退出kgdb的时候设置个标志位表明要更新jiffies.
由于系统中只能有一个cpu在更新jiffies,所以其它的CPU只能先关闭softlockup功能,在等待那个CPU更新完jiffies后,
再开启,具体实现请看patch.
E: others:
由于这个patch比较ugly,加之目前kgdb还处于比较尴尬的地位,所以并没有做提交到mainline的打算.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 | Patch: --- include/linux/sched.h | 4 ++++ include/linux/tick.h | 4 ++++ kernel/kgdb.c | 15 +++++++++++++++ kernel/softlockup.c | 26 +++++++++++++++++++------- kernel/time/tick-common.c | 5 +++++ kernel/time/tick-sched.c | 26 ++++++++++++++++++++++++++ 6 files changed, 73 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b42c488..57e2e2d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -297,6 +297,7 @@ extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); +extern void softlockup_update_jiffies(void); extern unsigned int softlockup_panic; extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; @@ -318,6 +319,9 @@ static inline void touch_softlockup_watchdog_sync(void) static inline void touch_all_softlockup_watchdogs(void) { } +static inline void softlockup_update_jiffies(void) +{ +} #endif diff --git a/include/linux/tick.h b/include/linux/tick.h index 8cf8cfe..93c9ff7 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -69,6 +69,7 @@ struct tick_sched { extern void __init tick_init(void); 
extern int tick_is_oneshot_available(void); extern struct tick_device *tick_get_device(int cpu); +extern int get_tick_do_timer_cpu(void); # ifdef CONFIG_HIGH_RES_TIMERS extern int tick_init_highres(void); @@ -96,9 +97,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void); extern void tick_clock_notify(void); extern int tick_check_oneshot_change(int allow_nohz); extern struct tick_sched *tick_get_tick_sched(int cpu); +extern int tick_update_jiffies(void); # else static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +static inline int tick_update_jiffies(void) { return 0; } # endif #else /* CONFIG_GENERIC_CLOCKEVENTS */ @@ -106,6 +109,7 @@ static inline void tick_init(void) { } static inline void tick_cancel_sched_timer(int cpu) { } static inline void tick_clock_notify(void) { } static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +static inline int get_tick_do_timer_cpu(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ # ifdef CONFIG_NO_HZ diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 235c3ff..bbe49bb 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -48,6 +48,7 @@ #include <linux/pid.h> #include <linux/smp.h> #include <linux/mm.h> +#include <linux/tick.h> #include <asm/cacheflush.h> #include <asm/byteorder.h> @@ -1565,6 +1566,12 @@ acquirelock: atomic_set(&cpu_in_kgdb[ks->cpu], 0); if (!kgdb_single_step) { + /* + * Set update jiffy flags before releasing + * the others cpu. + */ + softlockup_update_jiffies(); + for (i = NR_CPUS-1; i >= 0; i--) atomic_set(&passive_cpu_wait[i], 0); /* @@ -1585,6 +1592,14 @@ kgdb_restore: else kgdb_sstep_pid = 0; } + + /* + * update the jiffies value if the current cpu is the CPU + * which responsible for global tick when kgdb do single setp. 
+ */ + if (kgdb_single_step && get_tick_do_timer_cpu() == cpu) + softlockup_update_jiffies(); + /* Free kgdb_active */ atomic_set(&kgdb_active, -1); touch_softlockup_watchdog_sync(); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 8b24917..713f5e5 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -16,6 +16,7 @@ #include <linux/lockdep.h> #include <linux/notifier.h> #include <linux/module.h> +#include <linux/tick.h> #include <asm/irq_regs.h> @@ -79,11 +80,18 @@ void touch_softlockup_watchdog(void) EXPORT_SYMBOL(touch_softlockup_watchdog); static int softlock_touch_sync[NR_CPUS]; +atomic_t __read_mostly softlock_update_jiffies = ATOMIC_INIT(0); + + +void softlockup_update_jiffies(void) +{ + atomic_inc(&softlock_update_jiffies); +} void touch_softlockup_watchdog_sync(void) { - softlock_touch_sync[raw_smp_processor_id()] = 1; - __raw_get_cpu_var(touch_timestamp) = 0; + softlock_touch_sync[raw_smp_processor_id()] = 1; + __raw_get_cpu_var(touch_timestamp) = 0; } void touch_all_softlockup_watchdogs(void) @@ -118,11 +126,15 @@ void softlockup_tick(void) if (touch_timestamp == 0) { if (unlikely(softlock_touch_sync[this_cpu])) { - /* - * If the time stamp was touched atomically - * make sure the scheduler tick is up to date. - */ - softlock_touch_sync[this_cpu] = 0; + + /* make sure the jiffies is up to date. */ + if (unlikely(atomic_read(&softlock_update_jiffies))) { + if (tick_update_jiffies()) + return; + atomic_set(&softlock_update_jiffies, 0); + } + + /* make sure the scheduler tick is up to date. 
*/ sched_clock_tick(); } __touch_softlockup_watchdog(); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bad22e2..60e9bee 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -36,6 +36,11 @@ ktime_t tick_period; int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; DEFINE_SPINLOCK(tick_device_lock); +int get_tick_do_timer_cpu(void) +{ + return tick_do_timer_cpu; +} + /* * Debugging: see timer_list.c */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb02324..0898427 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -83,6 +83,32 @@ static void tick_do_update_jiffies64(ktime_t now) } /* + * tick_update_jiffies() - update the global jiffies + * + * If current CPU is the CPU which responsible for global tick, then + * do update the jiffies value. or it will do nothing, and return 1. + */ +int tick_update_jiffies(void) +{ + unsigned long flags; + int cpu = smp_processor_id(); + + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + tick_do_timer_cpu = cpu; + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu != cpu) + return 1; + + /* do update jiffies */ + local_irq_save(flags); + tick_do_update_jiffies64(ktime_get()); + local_irq_restore(flags); + + return 0; +} + +/* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) -- 1.6.0.4 |