Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
From 8aec28caf011788522c1fd2dea92959477016053 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <[email protected]>
Date: Thu, 22 May 2025 18:21:09 -0500
Subject: [PATCH] PCI/AER: Factor COR/UNCOR error handling out from
aer_isr_one_error()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

aer_isr_one_error() duplicates the Error Source ID logging and AER error
processing for Correctable Errors and Uncorrectable Errors. Factor out the
duplicated code to aer_isr_one_error_type().

aer_isr_one_error() doesn't need the struct aer_rpc pointer, so pass it the
Root Port or RCEC pci_dev pointer instead.

Signed-off-by: Bjorn Helgaas <[email protected]>
Reviewed-by: Ilpo Järvinen <[email protected]>
Reviewed-by: Jonathan Cameron <[email protected]>
Reviewed-by: Kuppuswamy Sathyanarayanan <[email protected]>
Link: https://patch.msgid.link/[email protected]
(cherry picked from commit 6fc4dae74afcf29ef82afbaaa9b082893871eda4)
---
drivers/pci/pcie/aer.c | 36 +++++++++++++++++++++++-------------
1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 01234567..89abcdef 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1287,17 +1287,32 @@ static inline void aer_process_err_devices(struct aer_err_info *e_info)
}

/**
- * aer_isr_one_error - consume an error detected by root port
- * @rpc: pointer to the root port which holds an error
+ * aer_isr_one_error_type - consume a Correctable or Uncorrectable Error
+ * detected by Root Port or RCEC
+ * @root: pointer to Root Port or RCEC that signaled AER interrupt
+ * @info: pointer to AER error info
+ */
+static void aer_isr_one_error_type(struct pci_dev *root,
+ struct aer_err_info *info)
+{
+ aer_print_port_info(root, info);
+
+ if (find_source_device(root, info))
+ aer_process_err_devices(info);
+}
+
+/**
+ * aer_isr_one_error - consume error(s) signaled by an AER interrupt from
+ * Root Port or RCEC
+ * @root: pointer to Root Port or RCEC that signaled AER interrupt
* @e_src: pointer to an error source
*/
-static void aer_isr_one_error(struct aer_rpc *rpc,
+static void aer_isr_one_error(struct pci_dev *root,
struct aer_err_source *e_src)
{
- struct pci_dev *pdev = rpc->rpd;
struct aer_err_info e_info;

- pci_rootport_aer_stats_incr(pdev, e_src);
+ pci_rootport_aer_stats_incr(root, e_src);

/*
* There is a possibility that both correctable error and
@@ -1312,10 +1327,8 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
e_info.multi_error_valid = 1;
else
e_info.multi_error_valid = 0;
- aer_print_port_info(pdev, &e_info);

- if (find_source_device(pdev, &e_info))
- aer_process_err_devices(&e_info);
+ aer_isr_one_error_type(root, &e_info);
}

if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
@@ -1332,10 +1345,7 @@ static void aer_isr_one_error(struct aer_rpc *rpc,
else
e_info.multi_error_valid = 0;

- aer_print_port_info(pdev, &e_info);
-
- if (find_source_device(pdev, &e_info))
- aer_process_err_devices(&e_info);
+ aer_isr_one_error_type(root, &e_info);
}
}

@@ -1356,7 +1366,7 @@ static irqreturn_t aer_isr(int irq, void *context)
return IRQ_NONE;

while (kfifo_get(&rpc->aer_fifo, &e_src))
- aer_isr_one_error(rpc, &e_src);
+ aer_isr_one_error(rpc->rpd, &e_src);
return IRQ_HANDLED;
}

--
2.47.0

Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
From cf5770619326108794a72ca7b3500ad9e3aefa90 Mon Sep 17 00:00:00 2001
From: Vernon Yang <[email protected]>
Date: Fri, 5 Sep 2025 02:25:27 +0800
Subject: [PATCH 1/2] PCI/AER: Fix NULL pointer access by aer_info

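kzalloc(GFP_KERNEL) may return NULL, in which case any access to
aer_info->xxx (dev->aer_stats->xxx in this backport) dereferences a NULL
pointer and panics the kernel. Check the allocation result and, on failure,
clear dev->aer_cap and return.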

Signed-off-by: Vernon Yang <[email protected]>
Signed-off-by: Bjorn Helgaas <[email protected]>
Link: https://patch.msgid.link/[email protected]
(cherry picked from commit 0a27bdb14b028fed30a10cec2f945c38cb5ca4fa)
---
drivers/pci/pcie/aer.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 01234567..89abcdef 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -385,7 +385,11 @@ void pci_aer_init(struct pci_dev *dev)
if (!dev->aer_cap)
return;

- dev->aer_stats = kzalloc(sizeof(struct aer_stats), GFP_KERNEL);
+ dev->aer_stats = kzalloc(sizeof(*dev->aer_stats), GFP_KERNEL);
+ if (!dev->aer_stats) {
+ dev->aer_cap = 0;
+ return;
+ }

ratelimit_state_init(&dev->aer_stats->correctable_ratelimit,
DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
--
2.39.5 (Apple Git-154)

69 changes: 69 additions & 0 deletions patches-sonic/0001-PCI-AER-Simplify-pci_print_aer.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
From fed119ab131c202e7677ce0228d17f5cb74baa29 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <[email protected]>
Date: Thu, 22 May 2025 18:21:15 -0500
Subject: [PATCH 1/8] PCI/AER: Simplify pci_print_aer()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Simplify pci_print_aer() (named cper_print_aer() in this tree) by
initializing the struct aer_err_info "info" with a designated initializer
list (it was previously initialized with memset()) and by using pci_name()
instead of dev_name().

Signed-off-by: Bjorn Helgaas <[email protected]>
Tested-by: Krzysztof Wilczyński <[email protected]>
Reviewed-by: Ilpo Järvinen <[email protected]>
Reviewed-by: Jonathan Cameron <[email protected]>
Reviewed-by: Kuppuswamy Sathyanarayanan <[email protected]>
Link: https://patch.msgid.link/[email protected]
(cherry picked from commit ad9839137cf9fb0f0c2d531bd04bc4382e6f2de9)
---
drivers/pci/pcie/aer.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 01234567..89abcdef 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -774,7 +774,10 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
{
int layer, agent, tlp_header_valid = 0;
u32 status, mask;
- struct aer_err_info info;
+ struct aer_err_info info = {
+ .severity = aer_severity,
+ .first_error = PCI_ERR_CAP_FEP(aer->cap_control),
+ };

if (aer_severity == AER_CORRECTABLE) {
status = aer->cor_status;
@@ -785,14 +788,11 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
tlp_header_valid = status & AER_LOG_TLP_MASKS;
}

- layer = AER_GET_LAYER_ERROR(aer_severity, status);
- agent = AER_GET_AGENT(aer_severity, status);
-
- memset(&info, 0, sizeof(info));
- info.severity = aer_severity;
info.status = status;
info.mask = mask;
- info.first_error = PCI_ERR_CAP_FEP(aer->cap_control);
+
+ layer = AER_GET_LAYER_ERROR(aer_severity, status);
+ agent = AER_GET_AGENT(aer_severity, status);

pci_err(dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask);
__aer_print_error(dev, &info);
@@ -806,7 +806,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
if (tlp_header_valid)
__print_tlp_header(dev, &aer->header_log);

- trace_aer_event(dev_name(&dev->dev), (status & ~mask),
+ trace_aer_event(pci_name(dev), (status & ~mask),
aer_severity, tlp_header_valid, &aer->header_log);
}
#endif
--
2.47.0

Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
From 23d8218139853dda49859e3041f17111cdc47400 Mon Sep 17 00:00:00 2001
From: Breno Leitao <[email protected]>
Date: Mon, 29 Sep 2025 02:15:47 -0700
Subject: [PATCH 2/2] PCI/AER: Avoid NULL pointer dereference in
aer_ratelimit()

When platform firmware supplies error information to the OS, e.g., via the
ACPI APEI GHES mechanism, it may identify an error source device that
doesn't advertise an AER Capability. For such a device, dev->aer_info
(dev->aer_stats in this backport), which contains AER stats and
ratelimiting data, is NULL.

pci_dev_aer_stats_incr() already checks dev->aer_info for NULL, but
aer_ratelimit() did not, leading to NULL pointer dereferences like this one
from the URL below:

{1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 0
{1}[Hardware Error]: event severity: corrected
{1}[Hardware Error]: device_id: 0000:00:00.0
{1}[Hardware Error]: vendor_id: 0x8086, device_id: 0x2020
{1}[Hardware Error]: aer_cor_status: 0x00001000, aer_cor_mask: 0x00002000
BUG: kernel NULL pointer dereference, address: 0000000000000264
RIP: 0010:___ratelimit+0xc/0x1b0
pci_print_aer+0x141/0x360
aer_recover_work_func+0xb5/0x130

[8086:2020] is an Intel "Sky Lake-E DMI3 Registers" device that claims to
be a Root Port but does not advertise an AER Capability.

Add a NULL check in aer_ratelimit() to avoid the NULL pointer dereference.
Note that this also prevents ratelimiting these events from GHES.

Fixes: a57f2bfb4a5863 ("PCI/AER: Ratelimit correctable and non-fatal error logging")
Link: https://lore.kernel.org/r/buduna6darbvwfg3aogl5kimyxkggu3n4romnmq6sozut6axeu@clnx7sfsy457/
Signed-off-by: Breno Leitao <[email protected]>
[bhelgaas: add crash details to commit log]
Signed-off-by: Bjorn Helgaas <[email protected]>
Reviewed-by: Kuppuswamy Sathyanarayanan <[email protected]>
Cc: [email protected]
Link: https://patch.msgid.link/[email protected]
(cherry picked from commit deb2f228388ff3a9d0623e3b59a053e9235c341d)
---
drivers/pci/pcie/aer.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 01234567..89abcdef 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -800,6 +800,9 @@ static void __print_tlp_header(struct pci_dev *dev,

static int aer_ratelimit(struct pci_dev *dev, unsigned int severity)
{
+ if (!dev->aer_stats)
+ return 1;
+
switch (severity) {
case AER_NONFATAL:
return __ratelimit(&dev->aer_stats->nonfatal_ratelimit);
--
2.39.5 (Apple Git-154)

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
From 8bbcbe849d91b64da2850797482bf0915772898b Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <[email protected]>
Date: Thu, 22 May 2025 18:21:16 -0500
Subject: [PATCH 2/8] PCI/AER: Update statistics before ratelimiting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are two AER logging entry points:

- aer_print_error() is used by DPC (dpc_process_error()) and native AER
handling (aer_process_err_devices()).

- pci_print_aer() is used by GHES (aer_recover_work_func()) and CXL
(cxl_handle_rdport_errors()).

Both use __aer_print_error() to print the AER error bits. Previously
__aer_print_error() also incremented the AER statistics via
pci_dev_aer_stats_incr().

Call pci_dev_aer_stats_incr() early in the entry points instead of in
__aer_print_error() so we update the statistics even if the actual printing
of error bits is rate limited by a future change.

Signed-off-by: Bjorn Helgaas <[email protected]>
Tested-by: Krzysztof Wilczyński <[email protected]>
Reviewed-by: Ilpo Järvinen <[email protected]>
Reviewed-by: Kuppuswamy Sathyanarayanan <[email protected]>
Reviewed-by: Jonathan Cameron <[email protected]>
Link: https://patch.msgid.link/[email protected]
(cherry picked from commit 88a7765e62b9e4c79c7ca2c7b749ae04f54a5668)
---
drivers/pci/pcie/aer.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 01234567..89abcdef 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -703,7 +703,6 @@ static void __aer_print_error(struct pci_dev *dev,
pci_printk(level, dev, " [%2d] %-22s%s\n", i, errmsg,
info->first_error == i ? " (First)" : "");
}
- pci_dev_aer_stats_incr(dev, info);
}

void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
@@ -712,6 +711,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
int id = ((dev->bus->number << 8) | dev->devfn);
const char *level;

+ pci_dev_aer_stats_incr(dev, info);
+
if (!info->status) {
pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, (Unregistered Agent ID)\n",
aer_error_severity_string[info->severity]);
@@ -791,6 +792,8 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
info.status = status;
info.mask = mask;

+ pci_dev_aer_stats_incr(dev, &info);
+
layer = AER_GET_LAYER_ERROR(aer_severity, status);
agent = AER_GET_AGENT(aer_severity, status);

--
2.47.0

Loading
Loading