Skip to content

Commit 32da32f

Browse files
PCIe AER printk ratelimiting backport
1 parent 255dc52 commit 32da32f

13 files changed

+1274
-0
lines changed
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
From 8aec28caf011788522c1fd2dea92959477016053 Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <[email protected]>
3+
Date: Thu, 22 May 2025 18:21:09 -0500
4+
Subject: [PATCH] PCI/AER: Factor COR/UNCOR error handling out from
5+
aer_isr_one_error()
6+
MIME-Version: 1.0
7+
Content-Type: text/plain; charset=UTF-8
8+
Content-Transfer-Encoding: 8bit
9+
10+
aer_isr_one_error() duplicates the Error Source ID logging and AER error
11+
processing for Correctable Errors and Uncorrectable Errors. Factor out the
12+
duplicated code to aer_isr_one_error_type().
13+
14+
aer_isr_one_error() doesn't need the struct aer_rpc pointer, so pass it the
15+
Root Port or RCEC pci_dev pointer instead.
16+
17+
Signed-off-by: Bjorn Helgaas <[email protected]>
18+
Reviewed-by: Ilpo Järvinen <[email protected]>
19+
Reviewed-by: Jonathan Cameron <[email protected]>
20+
Reviewed-by: Kuppuswamy Sathyanarayanan <[email protected]>
21+
Link: https://patch.msgid.link/[email protected]
22+
(cherry picked from commit 6fc4dae74afcf29ef82afbaaa9b082893871eda4)
23+
---
24+
drivers/pci/pcie/aer.c | 36 +++++++++++++++++++++++-------------
25+
1 file changed, 23 insertions(+), 13 deletions(-)
26+
27+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
28+
index 01234567..89abcdef 100644
29+
--- a/drivers/pci/pcie/aer.c
30+
+++ b/drivers/pci/pcie/aer.c
31+
@@ -1287,17 +1287,32 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info)
32+
}
33+
34+
/**
35+
- * aer_isr_one_error - consume an error detected by root port
36+
- * @rpc: pointer to the root port which holds an error
37+
+ * aer_isr_one_error_type - consume a Correctable or Uncorrectable Error
38+
+ * detected by Root Port or RCEC
39+
+ * @root: pointer to Root Port or RCEC that signaled AER interrupt
40+
+ * @info: pointer to AER error info
41+
+ */
42+
+static void aer_isr_one_error_type(struct pci_dev *root,
43+
+ struct aer_err_info *info)
44+
+{
45+
+ aer_print_port_info(root, info);
46+
+
47+
+ if (find_source_device(root, info))
48+
+ aer_process_err_devices(info);
49+
+}
50+
+
51+
+/**
52+
+ * aer_isr_one_error - consume error(s) signaled by an AER interrupt from
53+
+ * Root Port or RCEC
54+
+ * @root: pointer to Root Port or RCEC that signaled AER interrupt
55+
* @e_src: pointer to an error source
56+
*/
57+
-static void aer_isr_one_error(struct aer_rpc *rpc,
58+
+static void aer_isr_one_error(struct pci_dev *root,
59+
struct aer_err_source *e_src)
60+
{
61+
- struct pci_dev *pdev = rpc->rpd;
62+
struct aer_err_info e_info;
63+
64+
- pci_rootport_aer_stats_incr(pdev, e_src);
65+
+ pci_rootport_aer_stats_incr(root, e_src);
66+
67+
/*
68+
* There is a possibility that both correctable error and
69+
@@ -1312,10 +1327,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
70+
e_info.multi_error_valid = 1;
71+
else
72+
e_info.multi_error_valid = 0;
73+
- aer_print_port_info(pdev, &e_info);
74+
75+
- if (find_source_device(pdev, &e_info))
76+
- aer_process_err_devices(&e_info);
77+
+ aer_isr_one_error_type(root, &e_info);
78+
}
79+
80+
if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
81+
@@ -1332,10 +1345,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
82+
else
83+
e_info.multi_error_valid = 0;
84+
85+
- aer_print_port_info(pdev, &e_info);
86+
-
87+
- if (find_source_device(pdev, &e_info))
88+
- aer_process_err_devices(&e_info);
89+
+ aer_isr_one_error_type(root, &e_info);
90+
}
91+
}
92+
93+
@@ -1356,7 +1366,7 @@ static irqreturn_t aer_isr(int irq, void *context)
94+
return IRQ_NONE;
95+
96+
while (kfifo_get(&rpc->aer_fifo, &e_src))
97+
- aer_isr_one_error(rpc, &e_src);
98+
+ aer_isr_one_error(rpc->rpd, &e_src);
99+
return IRQ_HANDLED;
100+
}
101+
102+
--
103+
2.47.0
104+
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
From cf5770619326108794a72ca7b3500ad9e3aefa90 Mon Sep 17 00:00:00 2001
2+
From: Vernon Yang <[email protected]>
3+
Date: Fri, 5 Sep 2025 02:25:27 +0800
4+
Subject: [PATCH 1/2] PCI/AER: Fix NULL pointer access by aer_info
5+
6+
kzalloc(GFP_KERNEL) may return NULL, in which case any access to
7+
aer_info->xxx results in a kernel panic. Clear aer_cap and return on failure.
8+
9+
Signed-off-by: Vernon Yang <[email protected]>
10+
Signed-off-by: Bjorn Helgaas <[email protected]>
11+
Link: https://patch.msgid.link/[email protected]
12+
(cherry picked from commit 0a27bdb14b028fed30a10cec2f945c38cb5ca4fa)
13+
---
14+
drivers/pci/pcie/aer.c | 6 +++++-
15+
1 file changed, 5 insertions(+), 1 deletion(-)
16+
17+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
18+
index 01234567..89abcdef 100644
19+
--- a/drivers/pci/pcie/aer.c
20+
+++ b/drivers/pci/pcie/aer.c
21+
@@ -385,7 +385,11 @@ void pci_aer_init(struct pci_dev *dev)
22+
if (!dev->aer_cap)
23+
return;
24+
25+
- dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
26+
+ dev->aer_stats = kzalloc(sizeof(*dev->aer_stats), GFP_KERNEL);
27+
+ if (!dev->aer_stats) {
28+
+ dev->aer_cap = 0;
29+
+ return;
30+
+ }
31+
32+
ratelimit_state_init(&dev->aer_stats->correctable_ratelimit,
33+
DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
34+
--
35+
2.39.5 (Apple Git-154)
36+
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
From fed119ab131c202e7677ce0228d17f5cb74baa29 Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <[email protected]>
3+
Date: Thu, 22 May 2025 18:21:15 -0500
4+
Subject: [PATCH 1/8] PCI/AER: Simplify pci_print_aer()
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
Simplify pci_print_aer() by initializing the struct aer_err_info "info"
10+
with a designated initializer list (it was previously initialized with
11+
memset()) and using pci_name() instead of dev_name().
12+
13+
Signed-off-by: Bjorn Helgaas <[email protected]>
14+
Tested-by: Krzysztof Wilczyński <[email protected]>
15+
Reviewed-by: Ilpo Järvinen <[email protected]>
16+
Reviewed-by: Jonathan Cameron <[email protected]>
17+
Reviewed-by: Kuppuswamy Sathyanarayanan <[email protected]>
18+
Link: https://patch.msgid.link/[email protected]
19+
(cherry picked from commit ad9839137cf9fb0f0c2d531bd04bc4382e6f2de9)
20+
---
21+
drivers/pci/pcie/aer.c | 16 ++++++++--------
22+
1 file changed, 8 insertions(+), 8 deletions(-)
23+
24+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
25+
index 01234567..89abcdef 100644
26+
--- a/drivers/pci/pcie/aer.c
27+
+++ b/drivers/pci/pcie/aer.c
28+
@@ -774,7 +774,10 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
29+
{
30+
int layer, agent, tlp_header_valid = 0;
31+
u32 status, mask;
32+
- struct aer_err_info info;
33+
+ struct aer_err_info info = {
34+
+ .severity = aer_severity,
35+
+ .first_error = PCI_ERR_CAP_FEP(aer->cap_control),
36+
+ };
37+
38+
if (aer_severity == AER_CORRECTABLE) {
39+
status = aer->cor_status;
40+
@@ -785,14 +788,11 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
41+
tlp_header_valid = status & AER_LOG_TLP_MASKS;
42+
}
43+
44+
- layer = AER_GET_LAYER_ERROR(aer_severity, status);
45+
- agent = AER_GET_AGENT(aer_severity, status);
46+
-
47+
- memset(&info, 0, sizeof(info));
48+
- info.severity = aer_severity;
49+
info.status = status;
50+
info.mask = mask;
51+
- info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
52+
+
53+
+ layer = AER_GET_LAYER_ERROR(aer_severity, status);
54+
+ agent = AER_GET_AGENT(aer_severity, status);
55+
56+
pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
57+
__aer_print_error(dev, &info);
58+
@@ -806,7 +806,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
59+
if (tlp_header_valid)
60+
__print_tlp_header(dev, &aer->header_log);
61+
62+
- trace_aer_event(dev_name(&dev->dev), (status & ~mask),
63+
+ trace_aer_event(pci_name(dev), (status & ~mask),
64+
aer_severity, tlp_header_valid, &aer->header_log);
65+
}
66+
#endif
67+
--
68+
2.47.0
69+
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
From 23d8218139853dda49859e3041f17111cdc47400 Mon Sep 17 00:00:00 2001
2+
From: Breno Leitao <[email protected]>
3+
Date: Mon, 29 Sep 2025 02:15:47 -0700
4+
Subject: [PATCH 2/2] PCI/AER: Avoid NULL pointer dereference in
5+
aer_ratelimit()
6+
7+
When platform firmware supplies error information to the OS, e.g., via the
8+
ACPI APEI GHES mechanism, it may identify an error source device that
9+
doesn't advertise an AER Capability. For such a device dev->aer_info, which
10+
contains AER stats and ratelimiting data, is NULL.
11+
12+
pci_dev_aer_stats_incr() already checks dev->aer_info for NULL, but
13+
aer_ratelimit() did not, leading to NULL pointer dereferences like this one
14+
from the URL below:
15+
16+
{1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0
17+
{1}[Hardware Error]: event severity: corrected
18+
{1}[Hardware Error]: device_id: 0000:00:00.0
19+
{1}[Hardware Error]: vendor_id: 0x8086, device_id: 0x2020
20+
{1}[Hardware Error]: aer_cor_status: 0x00001000, aer_cor_mask: 0x00002000
21+
BUG: kernel NULL pointer dereference, address: 0000000000000264
22+
RIP: 0010:___ratelimit+0xc/0x1b0
23+
pci_print_aer+0x141/0x360
24+
aer_recover_work_func+0xb5/0x130
25+
26+
[8086:2020] is an Intel "Sky Lake-E DMI3 Registers" device that claims to
27+
be a Root Port but does not advertise an AER Capability.
28+
29+
Add a NULL check in aer_ratelimit() to avoid the NULL pointer dereference.
30+
Note that this also prevents ratelimiting these events from GHES.
31+
32+
Fixes: a57f2bfb4a5863 ("PCI/AER: Ratelimit correctable and non-fatal error logging")
33+
Link: https://lore.kernel.org/r/buduna6darbvwfg3aogl5kimyxkggu3n4romnmq6sozut6axeu@clnx7sfsy457/
34+
Signed-off-by: Breno Leitao <[email protected]>
35+
[bhelgaas: add crash details to commit log]
36+
Signed-off-by: Bjorn Helgaas <[email protected]>
37+
Reviewed-by: Kuppuswamy Sathyanarayanan <[email protected]>
38+
39+
Link: https://patch.msgid.link/[email protected]
40+
(cherry picked from commit deb2f228388ff3a9d0623e3b59a053e9235c341d)
41+
---
42+
drivers/pci/pcie/aer.c | 3 +++
43+
1 file changed, 3 insertions(+)
44+
45+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
46+
index 01234567..89abcdef 100644
47+
--- a/drivers/pci/pcie/aer.c
48+
+++ b/drivers/pci/pcie/aer.c
49+
@@ -800,6 +800,9 @@ static void __print_tlp_header(struct pci_dev *dev,
50+
51+
static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
52+
{
53+
+ if (!dev->aer_stats)
54+
+ return 1;
55+
+
56+
switch (severity) {
57+
case AER_NONFATAL:
58+
return __ratelimit(&dev->aer_stats->nonfatal_ratelimit);
59+
--
60+
2.39.5 (Apple Git-154)
61+
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
From 8bbcbe849d91b64da2850797482bf0915772898b Mon Sep 17 00:00:00 2001
2+
From: Bjorn Helgaas <[email protected]>
3+
Date: Thu, 22 May 2025 18:21:16 -0500
4+
Subject: [PATCH 2/8] PCI/AER: Update statistics before ratelimiting
5+
MIME-Version: 1.0
6+
Content-Type: text/plain; charset=UTF-8
7+
Content-Transfer-Encoding: 8bit
8+
9+
There are two AER logging entry points:
10+
11+
- aer_print_error() is used by DPC (dpc_process_error()) and native AER
12+
handling (aer_process_err_devices()).
13+
14+
- pci_print_aer() is used by GHES (aer_recover_work_func()) and CXL
15+
(cxl_handle_rdport_errors()).
16+
17+
Both use __aer_print_error() to print the AER error bits. Previously
18+
__aer_print_error() also incremented the AER statistics via
19+
pci_dev_aer_stats_incr().
20+
21+
Call pci_dev_aer_stats_incr() early in the entry points instead of in
22+
__aer_print_error() so we update the statistics even if the actual printing
23+
of error bits is rate limited by a future change.
24+
25+
Signed-off-by: Bjorn Helgaas <[email protected]>
26+
Tested-by: Krzysztof Wilczyński <[email protected]>
27+
Reviewed-by: Ilpo Järvinen <[email protected]>
28+
Reviewed-by: Kuppuswamy Sathyanarayanan <[email protected]>
29+
Reviewed-by: Jonathan Cameron <[email protected]>
30+
Link: https://patch.msgid.link/[email protected]
31+
(cherry picked from commit 88a7765e62b9e4c79c7ca2c7b749ae04f54a5668)
32+
---
33+
drivers/pci/pcie/aer.c | 5 ++++-
34+
1 file changed, 4 insertions(+), 1 deletion(-)
35+
36+
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
37+
index 01234567..89abcdef 100644
38+
--- a/drivers/pci/pcie/aer.c
39+
+++ b/drivers/pci/pcie/aer.c
40+
@@ -703,7 +703,6 @@ static void __aer_print_error(struct pci_dev *dev,
41+
pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
42+
info->first_error == i ? " (First)" : "");
43+
}
44+
- pci_dev_aer_stats_incr(dev, info);
45+
}
46+
47+
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
48+
@@ -712,6 +711,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
49+
int id = ((dev->bus->number << 8) | dev->devfn);
50+
const char *level;
51+
52+
+ pci_dev_aer_stats_incr(dev, info);
53+
+
54+
if (!info->status) {
55+
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
56+
aer_error_severity_string[info->severity]);
57+
@@ -791,6 +792,8 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
58+
info.status = status;
59+
info.mask = mask;
60+
61+
+ pci_dev_aer_stats_incr(dev, &info);
62+
+
63+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
64+
agent = AER_GET_AGENT(aer_severity, status);
65+
66+
--
67+
2.47.0
68+

0 commit comments

Comments
 (0)