From 7ee0de79152d1f7788c347f64b52790d187699e0 Mon Sep 17 00:00:00 2001
From: Sepherosa Ziehau
Date: Thu, 14 May 2015 21:31:48 +0800
Subject: [PATCH] dimm: Allow ECC error threshold to be configured

---
 sys/dev/misc/dimm/dimm.c  | 18 ++++++++++++++++++
 sys/dev/misc/dimm/dimm.h  |  3 +++
 sys/dev/misc/ecc/ecc_e5.c | 32 ++++++++++++++------------------
 3 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/sys/dev/misc/dimm/dimm.c b/sys/dev/misc/dimm/dimm.c
index 18a3ccaf8e..a661c72c26 100644
--- a/sys/dev/misc/dimm/dimm.c
+++ b/sys/dev/misc/dimm/dimm.c
@@ -47,6 +47,8 @@
 #define DIMM_TEMP_HIWAT_DEFAULT	85
 #define DIMM_TEMP_LOWAT_DEFAULT	75
 
+#define DIMM_ECC_THRESH_DEFAULT	5
+
 struct dimm_softc {
 	TAILQ_ENTRY(dimm_softc) dimm_link;
 	int			dimm_node;
@@ -57,6 +59,7 @@ struct dimm_softc {
 	int			dimm_id;
 	int			dimm_ref;
 	int			dimm_ecc_cnt;
+	int			dimm_ecc_thresh;
 
 	struct ksensordev	dimm_sensdev;
 	uint32_t		dimm_sens_taskflags;	/* DIMM_SENS_TF_ */
@@ -115,6 +118,7 @@ dimm_create(int node, int chan, int slot)
 	sc->dimm_ref = 1;
 	sc->dimm_temp_hiwat = DIMM_TEMP_HIWAT_DEFAULT;
 	sc->dimm_temp_lowat = DIMM_TEMP_LOWAT_DEFAULT;
+	sc->dimm_ecc_thresh = DIMM_ECC_THRESH_DEFAULT;
 
 	ksnprintf(sc->dimm_sensdev.xname, sizeof(sc->dimm_sensdev.xname),
 	    "dimm%d", sc->dimm_id);
@@ -150,6 +154,10 @@ dimm_create(int node, int chan, int slot)
 		    "temp_lowat", CTLFLAG_RW, &sc->dimm_temp_lowat, 0,
 		    "Cancel alarm once DIMM temperature is below this value "
 		    "(unit: C)");
+		SYSCTL_ADD_INT(&sc->dimm_sysctl_ctx,
+		    SYSCTL_CHILDREN(sc->dimm_sysctl_tree), OID_AUTO,
+		    "ecc_thresh", CTLFLAG_RW, &sc->dimm_ecc_thresh, 0,
+		    "Raise alarm once number ECC errors go above this value");
 	}
 
 	if (after == NULL) {
@@ -207,6 +215,12 @@ dimm_set_temp_thresh(struct dimm_softc *sc, int hiwat, int lowat)
 	sc->dimm_temp_lowat = lowat;
 }
 
+void
+dimm_set_ecc_thresh(struct dimm_softc *sc, int thresh)
+{
+	sc->dimm_ecc_thresh = thresh;
+}
+
 void
 dimm_sensor_temp(struct dimm_softc *sc, struct ksensor *sens, int temp)
 {
@@ -246,6 +260,10 @@ dimm_sensor_ecc_set(struct dimm_softc *sc, struct ksensor *sens,
 	enum sensor_status status;
 
 	sc->dimm_ecc_cnt = ecc_cnt;
+
+	if (!crit && sc->dimm_ecc_cnt >= sc->dimm_ecc_thresh)
+		crit = TRUE;
+
 	if (crit && (sc->dimm_sens_taskflags & DIMM_SENS_TF_ECC_CRIT) == 0) {
 		char ecc_str[16], data[64];
 
diff --git a/sys/dev/misc/dimm/dimm.h b/sys/dev/misc/dimm/dimm.h
index 9c56df5d16..69166f2f45 100644
--- a/sys/dev/misc/dimm/dimm.h
+++ b/sys/dev/misc/dimm/dimm.h
@@ -47,6 +47,9 @@ void			dimm_sensor_detach(struct dimm_softc *_sc,
 void			dimm_set_temp_thresh(struct dimm_softc *_sc,
 			    int _hiwat, int _lowat);
 
+void			dimm_set_ecc_thresh(struct dimm_softc *_sc,
+			    int _thresh);
+
 void			dimm_sensor_temp(struct dimm_softc *_sc,
 			    struct ksensor *_sens, int _temp);
 void			dimm_sensor_ecc_set(struct dimm_softc *_sc,
diff --git a/sys/dev/misc/ecc/ecc_e5.c b/sys/dev/misc/ecc/ecc_e5.c
index 9bdd8dace4..86ef45085d 100644
--- a/sys/dev/misc/ecc/ecc_e5.c
+++ b/sys/dev/misc/ecc/ecc_e5.c
@@ -318,24 +318,20 @@ ecc_e5_attach(device_t dev)
 		return 0;
 	}
 
-	if (bootverbose) {
-		for (rank = 0; rank < sc->ecc_rank_cnt; ++rank) {
-			const struct ecc_e5_rank *rk = &sc->ecc_rank[rank];
-			uint32_t thr, mask;
-			int ofs;
-
-			ofs = PCI_E5_IMC_ERROR_COR_ERR_TH(rank / 2);
-			if (rank & 1)
-				mask = PCI_E5_IMC_ERROR_COR_ERR_TH_HI;
-			else
-				mask = PCI_E5_IMC_ERROR_COR_ERR_TH_LO;
-
-			thr = pci_read_config(sc->ecc_dev, ofs, 4);
-			ecc_printf(sc, "DIMM%d rank%d, "
-			    "corrected error threshold %d\n",
-			    rk->rank_dimm, rk->rank_dimm_rank,
-			    __SHIFTOUT(thr, mask));
-		}
+	for (rank = 0; rank < sc->ecc_rank_cnt; ++rank) {
+		const struct ecc_e5_rank *rk = &sc->ecc_rank[rank];
+		uint32_t thr, mask;
+		int ofs;
+
+		ofs = PCI_E5_IMC_ERROR_COR_ERR_TH(rank / 2);
+		if (rank & 1)
+			mask = PCI_E5_IMC_ERROR_COR_ERR_TH_HI;
+		else
+			mask = PCI_E5_IMC_ERROR_COR_ERR_TH_LO;
+
+		thr = pci_read_config(sc->ecc_dev, ofs, 4);
+		dimm_set_ecc_thresh(rk->rank_dimm_sc->dimm_softc,
+		    __SHIFTOUT(thr, mask));
 	}
 
 	sensor_task_register(sc, ecc_e5_sensor_task, 1);
-- 
2.41.0