Leon M. Busch-George 98d325aaf8 ipq40xx: wpj428: panic on squashfs error to work around boot limbo
Apparently, a few ipq40xx devices have sporadic problems when reading the
flash over SPI. When that happens, the result of the faulty SPI read is
cached and it isn't re-attempted. Depending on when it happens, the router
either panics and reboots or is left in a partially broken state (an
application wont start).
The data on the flash is alright.

This wasn't the case with Openwrt with Linux < 5.x but I wasn't able to
work out which software change was responsible.

Github user karlpip created a patch for testing that disabled the cache
entirely and added logs. Typically, only one or two SPI operations fail at
a time:

  [689200.631152] spi-nor spi0.0: SPI transfer failed: -110
  [689200.631280] spi_master spi0: failed to transfer one message from queue
  [689200.635369] jffs2: Write of 68 bytes at 0x00ffccf4 failed. returned -110, retlen 0
  [689200.642014] jffs2: Not marking the space at 0x00ffccf4 as dirty because the flash driver returned retlen zero

Because reads aren't re-attempted, squashfs can't recover:

  [3171844.279235] SQUASHFS error: Failed to read block 0x2bb912: -5
  [3171844.279284] SQUASHFS error: Unable to read fragment cache entry [2bb912]
  [3171844.283980] SQUASHFS error: Unable to read page, block 2bb912, size 14e6c
  [3171844.291650] SQUASHFS error: Unable to read fragment cache entry [2bb912]
  [3171844.297831] SQUASHFS error: Unable to read page, block 2bb912, size 14e6c

I assume there to be some kind of underlying electrical problem because,
in my experience, this happens a lot more when PoE is used.

NoTengoBattery has made an in-depth investigation:
https://forum.openwrt.org/t/patch-squashfs-data-probably-corrupt/70480

.. and created a patch that evicts the page cache and retries reading:
https://github.com/NoTengoBattery/openwrt/blob/linksys-ea6350v3-mastertrack/target/linux/ipq40xx/patches-5.4/9996-fs_squashfs_improve_squashfs_error_resistance.patch

The patch also works well with the WPJ428 but NoTengoBattery didn't try to
upstream it ("This is not the solution that should be used").

In 2020, I tried and failed to create a working patch that prevents faulty pages to
be cached in the first place. Because I needed a solution, I backported
  "squashfs: add option to panic on errors " (10dde05b89980ef)
which has since become available in Openwrt.

The 'error=panic' option has been tested on a fleet of multiple hundred
WPJ428s over multiple years. Without this patch, devices regularly went
into 'limbo' on reboot or update and required a manual reboot.
Devices with this patch don't. I was initially concerned that the kernel
panic would leave devices with a real corrupted data but I haven't seen a
case of actual corruption since (outside of people turning off the power
during upgrades).

The WPJ428 is the only device I tested this patch on - others might also
benefit.

Reviewed-by: Robert Marko <robimarko@gmail.com>
Signed-off-by: Leon M. Busch-George <leon@georgemail.eu>
2023-09-24 18:55:35 +02:00

314 lines
5.9 KiB
Plaintext

/* Copyright (c) 2015, The Linux Foundation. All rights reserved.
* Copyright (c) 2017, Christian Mehlis <christian@m3hlis.de>
* Copyright (c) 2017-2018, Sven Eckelmann <sven.eckelmann@openmesh.com>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
#include "qcom-ipq4019.dtsi"
#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/input/input.h>
#include <dt-bindings/soc/qcom,tcsr.h>
/ {
model = "Compex WPJ428";
compatible = "compex,wpj428";
chosen {
/*
* There's a chance that SPI reads fail even though the data itself is alright.
* The read result is cached and squashfs can't recover.
* Just panic when that happens and hope that next time it doesn't.
*/
bootargs-append = " rootflags=errors=panic";
};
soc {
rng@22000 {
status = "okay";
};
mdio@90000 {
status = "okay";
pinctrl-0 = <&mdio_pins>;
pinctrl-names = "default";
reset-gpios = <&tlmm 59 GPIO_ACTIVE_LOW>;
reset-delay-us = <2000>;
};
tcsr@194b000 {
/* select hostmode */
compatible = "qcom,tcsr";
reg = <0x194b000 0x100>;
qcom,usb-hsphy-mode-select = <TCSR_USB_HSPHY_HOST_MODE>;
status = "okay";
};
tcsr@1949000 {
compatible = "qcom,tcsr";
reg = <0x1949000 0x100>;
qcom,wifi_glb_cfg = <TCSR_WIFI_GLB_CFG>;
};
ess_tcsr@1953000 {
compatible = "qcom,tcsr";
reg = <0x1953000 0x1000>;
qcom,ess-interface-select = <TCSR_ESS_PSGMII>;
};
tcsr@1957000 {
compatible = "qcom,tcsr";
reg = <0x1957000 0x100>;
qcom,wifi_noc_memtype_m0_m2 = <TCSR_WIFI_NOC_MEMTYPE_M0_M2>;
};
usb2: usb2@60f8800 {
status = "okay";
};
usb3: usb3@8af8800 {
status = "okay";
};
crypto@8e3a000 {
status = "okay";
};
watchdog@b017000 {
status = "okay";
};
};
keys {
compatible = "gpio-keys";
reset {
label = "reset";
gpios = <&tlmm 63 GPIO_ACTIVE_LOW>;
linux,code = <KEY_RESTART>;
};
};
aliases {
led-boot = &status;
led-failsafe = &status;
led-upgrade = &status;
};
leds {
compatible = "gpio-leds";
status: rss4 {
label = "green:rss4";
gpios = <&tlmm 5 GPIO_ACTIVE_HIGH>;
};
rss3 {
label = "green:rss3";
gpios = <&tlmm 4 GPIO_ACTIVE_HIGH>;
};
};
beeper: beeper {
compatible = "gpio-beeper";
gpios = <&tlmm 58 GPIO_ACTIVE_HIGH>;
};
};
&tlmm {
mdio_pins: mdio_pinmux {
mux_1 {
pins = "gpio53";
function = "mdio";
bias-pull-up;
};
mux_2 {
pins = "gpio52";
function = "mdc";
bias-pull-up;
};
};
serial_pins: serial_pinmux {
mux {
pins = "gpio60", "gpio61";
function = "blsp_uart0";
bias-disable;
};
};
spi_0_pins: spi_0_pinmux {
pin {
function = "blsp_spi0";
pins = "gpio55", "gpio56", "gpio57";
drive-strength = <12>;
bias-disable;
};
pin_cs {
function = "gpio";
pins = "gpio54";
drive-strength = <2>;
bias-disable;
output-high;
};
};
};
&blsp_dma {
status = "okay";
};
&blsp1_spi1 {
pinctrl-0 = <&spi_0_pins>;
pinctrl-names = "default";
status = "okay";
cs-gpios = <&tlmm 54 GPIO_ACTIVE_HIGH>;
m25p80@0 {
compatible = "jedec,spi-nor";
reg = <0>;
spi-max-frequency = <24000000>;
partitions {
compatible = "fixed-partitions";
#address-cells = <1>;
#size-cells = <1>;
partition0@0 {
label = "0:SBL1";
reg = <0x00000000 0x00040000>;
read-only;
};
partition1@40000 {
label = "0:MIBIB";
reg = <0x00040000 0x00020000>;
read-only;
};
partition2@60000 {
label = "0:QSEE";
reg = <0x00060000 0x00060000>;
read-only;
};
partition3@c0000 {
label = "0:CDT";
reg = <0x000c0000 0x00010000>;
read-only;
};
partition4@d0000 {
label = "0:DDRPARAMS";
reg = <0x000d0000 0x00010000>;
read-only;
};
partition5@e0000 {
label = "0:APPSBLENV"; /* uboot env*/
reg = <0x000e0000 0x00010000>;
read-only;
};
partition5@f0000 {
label = "0:APPSBL"; /* uboot */
reg = <0x000f0000 0x00080000>;
read-only;
};
partition5@170000 {
label = "0:ART";
reg = <0x00170000 0x00010000>;
read-only;
compatible = "nvmem-cells";
#address-cells = <1>;
#size-cells = <1>;
precal_art_1000: precal@1000 {
reg = <0x1000 0x2f20>;
};
precal_art_5000: precal@5000 {
reg = <0x5000 0x2f20>;
};
macaddr_art_e010: mac-address@e010 {
reg = <0xe010 0x6>;
};
macaddr_art_e018: mac-address@e018 {
reg = <0xe018 0x6>;
};
};
partition6@180000 {
compatible = "denx,fit";
label = "firmware";
reg = <0x00180000 0x01e80000>;
};
};
};
};
&blsp1_uart1 {
pinctrl-0 = <&serial_pins>;
pinctrl-names = "default";
status = "okay";
};
&cryptobam {
status = "okay";
};
&gmac {
status = "okay";
};
&switch {
status = "okay";
};
&swport4 {
status = "okay";
label = "lan1";
nvmem-cells = <&macaddr_art_e018>;
nvmem-cell-names = "mac-address";
};
&swport5 {
status = "okay";
label = "lan2";
nvmem-cells = <&macaddr_art_e010>;
nvmem-cell-names = "mac-address";
};
&usb3_ss_phy {
status = "okay";
};
&usb3_hs_phy {
status = "okay";
};
&usb2_hs_phy {
status = "okay";
};
&wifi0 {
status = "okay";
nvmem-cell-names = "pre-calibration";
nvmem-cells = <&precal_art_1000>;
};
&wifi1 {
status = "okay";
nvmem-cell-names = "pre-calibration";
nvmem-cells = <&precal_art_5000>;
};