From c0c61419714a4280aaeaa8694c95f17ed0dfb742 Mon Sep 17 00:00:00 2001 From: Adam Joseph Date: Tue, 13 Jun 2023 01:42:17 -0700 Subject: [PATCH] coreboot: kgpe-d16: do not enable hw monitor until kernel boots This commit adds a coreboot patch which causes kgpe-d16 to skip the PNP enumeration/assignment process for the "hardware monitor" (basically a temperature sensor and fan-speed controller) block on the Super I/O chip. I have found this patch solves the last remaining boot reliability problem I was having with my unattended kgpe-d16 machines. The commit message for this patch is below: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - The hardware monitor is one of the blocks within the w83667hg-a chip (there are many others). It is basically a bunch of ADCs (analog to digital converters) hooked up to voltage, current, and temperature sensors in various locations on the motherboard. This block has the ability to generate several different interrupts (SMI#, OVT#, etc) in response to thermal conditions. It appears to sometimes (about 10% of boot-ups, depending on temperature) spew erroneous alarm interrupts the instant you enable it, when doing so from within coreboot. This causes the w83667hg-a chip and the entire system to hang, and the watchdog cannot recover from this state because it is part of the w83667hg-a chip too. An even bigger problem is that the hardware monitor is initialized *before* the fans are brought up to full speed. So if the CPU is above the critical temperature it will remain there because the fans are in their default low-speed boot state. The chip just keeps getting hotter and hotter -- not enough to damage itself, but hot enough that it won't come down to an acceptable temperature with simple reboots and power-cycles; you have to leave the system off for a while. Since the fans aren't running while the system is off this takes quite a while (several minutes). 
It's a very fussy and fidgety process, and not something you want to walk a remote-hands guy at the datacenter through over the phone. To avoid this whole mess, let's simply not assign PNP resources to the hardware monitor from coreboot. Linux doesn't need these anyways; it communicates with the hardware using I2C. --- src/coreboot/default.nix | 1 + ...enable-hw-monitor-until-kernel-boots.patch | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 src/coreboot/patches/0002-kgpe-d16-do-not-enable-hw-monitor-until-kernel-boots.patch diff --git a/src/coreboot/default.nix b/src/coreboot/default.nix index 35adbc6..c837b51 100644 --- a/src/coreboot/default.nix +++ b/src/coreboot/default.nix @@ -108,6 +108,7 @@ stdenv.mkDerivation { ./patches/0003-kgpe-d16-bootblock.c-use-RTC_BOOT_BYTE-even-when-CON.patch ./patches/0004-superio-winbond-w83667hg-a-superio.c-do-not-use-get_.patch ./patches/0001-romstage-print-out-dimm-voltages.patch + ./patches/0002-kgpe-d16-do-not-enable-hw-monitor-until-kernel-boots.patch # am1i patches ./patches/0021-am1i-omit-amdfw.rom-completely-it-has-broken-address.patch diff --git a/src/coreboot/patches/0002-kgpe-d16-do-not-enable-hw-monitor-until-kernel-boots.patch b/src/coreboot/patches/0002-kgpe-d16-do-not-enable-hw-monitor-until-kernel-boots.patch new file mode 100644 index 0000000..677342f --- /dev/null +++ b/src/coreboot/patches/0002-kgpe-d16-do-not-enable-hw-monitor-until-kernel-boots.patch @@ -0,0 +1,53 @@ +From 9ab2d370ea17e86301d8de4134f4c0abf82b211e Mon Sep 17 00:00:00 2001 +From: Your Name +Date: Mon, 12 Jun 2023 02:29:45 -0700 +Subject: [PATCH 2/4] kgpe-d16: do not enable hw monitor until kernel boots + +The hardware monitor is one of the blocks within the w83667hg-a chip +(there are many others). It is basically a bunch of ADCs (analog to +digital converters) hooked up to voltage, current, and temperature +sensors in various locations on the motherboard. 
+ +This block has the ability to generate several different interrupts +(SMI#, OVT#, etc) in response to thermal conditions. It appears to +sometimes (about 10% of boot-ups, depending on temperature) spew +erroneous alarm interrupts the instant you enable it, when doing so +from within coreboot. This causes the w83667hg-a chip and the +entire system to hang, and the watchdog cannot recover from this +state because it is part of the w83667hg-a chip too. + +An even bigger problem is that the hardware monitor is initialized +*before* the fans are brought up to full speed. So if the CPU is +above the critical temperature it will remain there because the fans +are in their default low-speed boot state. The chip just keeps +getting hotter and hotter -- not enough to damage itself, but hot +enough that it won't come down to an acceptable temperature with +simple reboots and power-cycles; you have to leave the system off +for a while. Since the fans aren't running while the system is off +this takes quite a while (several minutes). It's a very fussy and +fidgety process, and not something you want to walk a remote-hands +guy at the datacenter through over the phone. + +To avoid this whole mess, let's simply not assign PNP resources to +the hardware monitor from coreboot. Linux doesn't need these +anyways; it communicates with the hardware using I2C. 
+--- + src/mainboard/asus/kgpe-d16/devicetree.cb | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/mainboard/asus/kgpe-d16/devicetree.cb b/src/mainboard/asus/kgpe-d16/devicetree.cb +index 3be328d1725..5725fce4a09 100644 +--- a/src/mainboard/asus/kgpe-d16/devicetree.cb ++++ b/src/mainboard/asus/kgpe-d16/devicetree.cb +@@ -206,7 +206,7 @@ chip northbridge/amd/amdfam10/root_complex # Root complex + device pnp 2e.209 off end # GPIO4 + device pnp 2e.309 off end # GPIO5 + device pnp 2e.a on end # ACPI +- device pnp 2e.b on # HW Monitor ++ device pnp 2e.b off # HW Monitor + io 0x60 = 0x290 + # IRQ purposefully not assigned to prevent lockups + end +-- +2.39.1 +