kgdb抓虫日记 – kgdb 与 jiffies[softlockup]

/ 0评 / 0

A: BUG现象

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
 
*****:[cpu6] soft lockup - now [16002397s],touch_timestamp[16002307s]!*********************** 
BUG: soft lockup - CPU#6 stuck for 90s! [swapper:0]
Modules linked in: kgdboe
Cpu 6
$ 0   : 0000000000000000 ffffffff81590030 ffffffff81132400 0000000000000001
$ 4   : 0000000000000001 a800000031514000 ffffffff81596100 ffffffffffff00fe
$ 8   : 0000000000000000 a8000000315cfed0 0000000000000018 0000000000000001
$12   : 0000000000000000 0000000000008c00 a8000000315cc000 0000000000000000
$16   : ffffffff81597a20 0000000000000040 ffffffff81590000 96f513832dea2706
$20   : 2dea27065bd44e0c 5bd44e0cb7a89c19 b7a89c196f513832 6f513832dea27065
$24   : 0000000000000002 ffffffff81106e40                                  
$28   : a800000031514000 a800000031517fc0 a89c196f513832de ffffffff81134b64
Hi    : 0000000000000000
Lo    : 0000000000000000
epc   : ffffffff81132420 r4k_wait+0x20/0x40
    Not tainted
ra    : ffffffff81134b64 cpu_idle+0x7c/0xb8
Status: 10008ce3    KX SX UX KERNEL EXL IE 
Cause : 40808000
PrId  : 000d0408 (Cavium Octeon)
...
**************************************************************************************

B: BUG重现步骤&现场分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
1: The "cpu sched clock" is at "jiffies A" prior to the call to
kgdb_handle_exception().
 
2: The debugger waits in kgdb_handle_exception() for 80 seconds; on exit,
touch_softlockup_watchdog() is called.
 
3: The value of jiffies was not updated while in kgdb because interrupts
were disabled, so the touch_timestamp of the softlockup watchdog is
still "jiffies A" in the first timer interrupt after resuming from
kgdb_handle_exception().
 
4: jiffies is then updated to "jiffies B" on the tick_do_timer_cpu,
so "jiffies B" = "jiffies A" + 80 seconds.
 
5: Because ("jiffies B" - "jiffies A") >= 60 seconds, the softlockup
warning is tripped.

C: BUG触发原因

While the system is stopped in kgdb, jiffies is not updated because interrupts are disabled.

D: BUG解决方法

在退出kgdb的时候设置个标志位表明要更新jiffies.
由于系统中只能有一个cpu在更新jiffies,所以其它的CPU只能先关闭softlockup功能,在等待那个CPU更新完jiffies后,
再开启,具体实现请看patch.

E: others:

由于这个patch比较ugly,加之目前kgdb还处于比较尴尬的地位,所以并没有做提交到mainline的打算.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
Patch:
---
 include/linux/sched.h     |    4 ++++
 include/linux/tick.h      |    4 ++++
 kernel/kgdb.c             |   15 +++++++++++++++
 kernel/softlockup.c       |   26 +++++++++++++++++++-------
 kernel/time/tick-common.c |    5 +++++
 kernel/time/tick-sched.c  |   26 ++++++++++++++++++++++++++
 6 files changed, 73 insertions(+), 7 deletions(-)
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b42c488..57e2e2d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -297,6 +297,7 @@ extern void softlockup_tick(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog_sync(void);
 extern void touch_all_softlockup_watchdogs(void);
+extern void softlockup_update_jiffies(void);
 extern unsigned int  softlockup_panic;
 extern unsigned long sysctl_hung_task_check_count;
 extern unsigned long sysctl_hung_task_timeout_secs;
@@ -318,6 +319,9 @@ static inline void touch_softlockup_watchdog_sync(void)
 static inline void touch_all_softlockup_watchdogs(void)
 {
 }
+static inline void softlockup_update_jiffies(void)
+{
+}
 #endif
 
 
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 8cf8cfe..93c9ff7 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -69,6 +69,7 @@ struct tick_sched {
 extern void __init tick_init(void);
 extern int tick_is_oneshot_available(void);
 extern struct tick_device *tick_get_device(int cpu);
+extern int get_tick_do_timer_cpu(void);
 
 # ifdef CONFIG_HIGH_RES_TIMERS
 extern int tick_init_highres(void);
@@ -96,9 +97,11 @@ extern cpumask_t *tick_get_broadcast_oneshot_mask(void);
 extern void tick_clock_notify(void);
 extern int tick_check_oneshot_change(int allow_nohz);
 extern struct tick_sched *tick_get_tick_sched(int cpu);
+extern int tick_update_jiffies(void);
 # else
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline int tick_update_jiffies(void) { return 0; }
 # endif
 
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
@@ -106,6 +109,7 @@ static inline void tick_init(void) { }
 static inline void tick_cancel_sched_timer(int cpu) { }
 static inline void tick_clock_notify(void) { }
 static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+static inline int get_tick_do_timer_cpu(void) { return 0; }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 # ifdef CONFIG_NO_HZ
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 235c3ff..bbe49bb 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -48,6 +48,7 @@
 #include <linux/pid.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/tick.h>
 
 #include <asm/cacheflush.h>
 #include <asm/byteorder.h>
@@ -1565,6 +1566,12 @@ acquirelock:
        atomic_set(&cpu_in_kgdb[ks->cpu], 0);
 
        if (!kgdb_single_step) {
+               /*
+                * Set update jiffy flags before releasing
+                * the others cpu.
+                */
+               softlockup_update_jiffies();
+
                for (i = NR_CPUS-1; i >= 0; i--)
                        atomic_set(&passive_cpu_wait[i], 0);
                /*
@@ -1585,6 +1592,14 @@ kgdb_restore:
                else
                        kgdb_sstep_pid = 0;
        }
+
+       /*
+        * update the jiffies value if the current cpu is the CPU
+        * which responsible for global tick when kgdb do single setp.
+        */
+       if (kgdb_single_step && get_tick_do_timer_cpu() == cpu)
+               softlockup_update_jiffies();
+
        /* Free kgdb_active */
        atomic_set(&kgdb_active, -1);
        touch_softlockup_watchdog_sync();
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8b24917..713f5e5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
 #include <linux/lockdep.h>
 #include <linux/notifier.h>
 #include <linux/module.h>
+#include <linux/tick.h>
 
 #include <asm/irq_regs.h>
 
@@ -79,11 +80,18 @@ void touch_softlockup_watchdog(void)
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
 static int softlock_touch_sync[NR_CPUS];
+atomic_t __read_mostly softlock_update_jiffies = ATOMIC_INIT(0);
+
+
+void softlockup_update_jiffies(void)
+{
+       atomic_inc(&softlock_update_jiffies);
+}
 
 void touch_softlockup_watchdog_sync(void)
 {
-       softlock_touch_sync[raw_smp_processor_id()] = 1;
-       __raw_get_cpu_var(touch_timestamp) = 0;
+       softlock_touch_sync[raw_smp_processor_id()] = 1;
+       __raw_get_cpu_var(touch_timestamp) = 0;
 }
 
 void touch_all_softlockup_watchdogs(void)
@@ -118,11 +126,15 @@ void softlockup_tick(void)
 
        if (touch_timestamp == 0) {
                if (unlikely(softlock_touch_sync[this_cpu])) {
-                       /*
-                        * If the time stamp was touched atomically
-                        * make sure the scheduler tick is up to date.
-                        */
-                       softlock_touch_sync[this_cpu] = 0;
+
+                       /* make sure the jiffies is up to date. */
+                       if (unlikely(atomic_read(&softlock_update_jiffies))) {
+                               if (tick_update_jiffies())
+                                       return;
+                               atomic_set(&softlock_update_jiffies, 0);
+                       }
+
+                       /* make sure the scheduler tick is up to date. */
                        sched_clock_tick();
                }
                __touch_softlockup_watchdog();
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index bad22e2..60e9bee 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -36,6 +36,11 @@ ktime_t tick_period;
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 DEFINE_SPINLOCK(tick_device_lock);
 
+int get_tick_do_timer_cpu(void)
+{
+       return tick_do_timer_cpu;
+}
+
 /*
  * Debugging: see timer_list.c
  */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb02324..0898427 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -83,6 +83,32 @@ static void tick_do_update_jiffies64(ktime_t now)
 }
 
 /*
+ * tick_update_jiffies() - update the global jiffies
+ *
+ * If current CPU is the CPU which responsible for global tick, then
+ * do update the jiffies value. or it will do nothing, and return 1.
+ */
+int tick_update_jiffies(void)
+{
+       unsigned long flags;
+       int cpu = smp_processor_id();
+
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+               tick_do_timer_cpu = cpu;
+
+       /* Check, if the jiffies need an update */
+       if (tick_do_timer_cpu != cpu)
+               return 1;
+
+       /* do update jiffies */
+       local_irq_save(flags);
+       tick_do_update_jiffies64(ktime_get());
+       local_irq_restore(flags);
+
+       return 0;
+}
+
+/*
  * Initialize and return retrieve the jiffies update.
  */
 static ktime_t tick_init_jiffy_update(void)
-- 
1.6.0.4

发表评论

电子邮件地址不会被公开。 必填项已用*标注

此站点使用Akismet来减少垃圾评论。了解我们如何处理您的评论数据