dimm: Allow ECC error threshold to be configured
author Sepherosa Ziehau <sephe@dragonflybsd.org>
Thu, 14 May 2015 13:31:48 +0000 (21:31 +0800)
committer Sepherosa Ziehau <sephe@dragonflybsd.org>
Thu, 14 May 2015 13:31:48 +0000 (21:31 +0800)
sys/dev/misc/dimm/dimm.c
sys/dev/misc/dimm/dimm.h
sys/dev/misc/ecc/ecc_e5.c

index 18a3cca..a661c72 100644 (file)
@@ -47,6 +47,8 @@
 #define DIMM_TEMP_HIWAT_DEFAULT        85
 #define DIMM_TEMP_LOWAT_DEFAULT        75
 
+#define DIMM_ECC_THRESH_DEFAULT        5
+
 struct dimm_softc {
        TAILQ_ENTRY(dimm_softc) dimm_link;
        int                     dimm_node;
@@ -57,6 +59,7 @@ struct dimm_softc {
        int                     dimm_id;
        int                     dimm_ref;
        int                     dimm_ecc_cnt;
+       int                     dimm_ecc_thresh;
 
        struct ksensordev       dimm_sensdev;
        uint32_t                dimm_sens_taskflags;    /* DIMM_SENS_TF_ */
@@ -115,6 +118,7 @@ dimm_create(int node, int chan, int slot)
        sc->dimm_ref = 1;
        sc->dimm_temp_hiwat = DIMM_TEMP_HIWAT_DEFAULT;
        sc->dimm_temp_lowat = DIMM_TEMP_LOWAT_DEFAULT;
+       sc->dimm_ecc_thresh = DIMM_ECC_THRESH_DEFAULT;
 
        ksnprintf(sc->dimm_sensdev.xname, sizeof(sc->dimm_sensdev.xname),
            "dimm%d", sc->dimm_id);
@@ -150,6 +154,10 @@ dimm_create(int node, int chan, int slot)
                    "temp_lowat", CTLFLAG_RW, &sc->dimm_temp_lowat, 0,
                    "Cancel alarm once DIMM temperature is below this value "
                    "(unit: C)");
+               SYSCTL_ADD_INT(&sc->dimm_sysctl_ctx,
+                   SYSCTL_CHILDREN(sc->dimm_sysctl_tree), OID_AUTO,
+                   "ecc_thresh", CTLFLAG_RW, &sc->dimm_ecc_thresh, 0,
+                   "Raise alarm once number ECC errors go above this value");
        }
 
        if (after == NULL) {
@@ -207,6 +215,12 @@ dimm_set_temp_thresh(struct dimm_softc *sc, int hiwat, int lowat)
        sc->dimm_temp_lowat = lowat;
 }
 
+void
+dimm_set_ecc_thresh(struct dimm_softc *sc, int thresh)
+{
+       sc->dimm_ecc_thresh = thresh; /* ECC goes critical at cnt >= thresh */
+}
+
 void
 dimm_sensor_temp(struct dimm_softc *sc, struct ksensor *sens, int temp)
 {
@@ -246,6 +260,10 @@ dimm_sensor_ecc_set(struct dimm_softc *sc, struct ksensor *sens,
        enum sensor_status status;
 
        sc->dimm_ecc_cnt = ecc_cnt;
+
+       if (!crit && sc->dimm_ecc_cnt >= sc->dimm_ecc_thresh)
+               crit = TRUE;
+
        if (crit && (sc->dimm_sens_taskflags & DIMM_SENS_TF_ECC_CRIT) == 0) {
                char ecc_str[16], data[64];
 
index 9c56df5..69166f2 100644 (file)
@@ -47,6 +47,9 @@ void                  dimm_sensor_detach(struct dimm_softc *_sc,
 
 void                   dimm_set_temp_thresh(struct dimm_softc *_sc,
                            int _hiwat, int _lowat);
+void                   dimm_set_ecc_thresh(struct dimm_softc *_sc,
+                           int _thresh);
+
 void                   dimm_sensor_temp(struct dimm_softc *_sc,
                            struct ksensor *_sens, int _temp);
 void                   dimm_sensor_ecc_set(struct dimm_softc *_sc,
index 9bdd8da..86ef450 100644 (file)
@@ -318,24 +318,20 @@ ecc_e5_attach(device_t dev)
                return 0;
        }
 
-       if (bootverbose) {
-               for (rank = 0; rank < sc->ecc_rank_cnt; ++rank) {
-                       const struct ecc_e5_rank *rk = &sc->ecc_rank[rank];
-                       uint32_t thr, mask;
-                       int ofs;
-
-                       ofs = PCI_E5_IMC_ERROR_COR_ERR_TH(rank / 2);
-                       if (rank & 1)
-                               mask = PCI_E5_IMC_ERROR_COR_ERR_TH_HI;
-                       else
-                               mask = PCI_E5_IMC_ERROR_COR_ERR_TH_LO;
-
-                       thr = pci_read_config(sc->ecc_dev, ofs, 4);
-                       ecc_printf(sc, "DIMM%d rank%d, "
-                           "corrected error threshold %d\n",
-                           rk->rank_dimm, rk->rank_dimm_rank,
-                           __SHIFTOUT(thr, mask));
-               }
+       for (rank = 0; rank < sc->ecc_rank_cnt; ++rank) {
+               const struct ecc_e5_rank *rk = &sc->ecc_rank[rank];
+               uint32_t thr, mask;
+               int ofs;
+
+               ofs = PCI_E5_IMC_ERROR_COR_ERR_TH(rank / 2);
+               if (rank & 1)
+                       mask = PCI_E5_IMC_ERROR_COR_ERR_TH_HI;
+               else
+                       mask = PCI_E5_IMC_ERROR_COR_ERR_TH_LO;
+
+               thr = pci_read_config(sc->ecc_dev, ofs, 4);
+               dimm_set_ecc_thresh(rk->rank_dimm_sc->dimm_softc,
+                   __SHIFTOUT(thr, mask));
        }
 
        sensor_task_register(sc, ecc_e5_sensor_task, 1);