From 254817bd7d6b589089830101b9a8c2df4a9c9e97 Mon Sep 17 00:00:00 2001 From: Sepherosa Ziehau Date: Sat, 14 Feb 2015 22:47:02 +0800 Subject: [PATCH] memtemp/e5: Send devctl notify, if DIMM is too hot, and set sensor status - There is no need to save DIMM external id in dimm softc; use a stack variable instead. - White space cleanup. --- sbin/devd/devd.conf.5 | 23 +++- sys/dev/misc/ecc/e5_imc_reg.h | 14 +++ sys/dev/powermng/memtemp/memtemp_e5.c | 149 +++++++++++++++++++++++--- 3 files changed, 173 insertions(+), 13 deletions(-) diff --git a/sbin/devd/devd.conf.5 b/sbin/devd/devd.conf.5 index 5baa1cf9de..0a33b46cd4 100644 --- a/sbin/devd/devd.conf.5 +++ b/sbin/devd/devd.conf.5 @@ -41,7 +41,7 @@ .\" ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS .\" SOFTWARE. .\" -.Dd February 22, 2013 +.Dd February 14, 2015 .Dt DEVD.CONF 5 .Os .Sh NAME @@ -244,6 +244,8 @@ Device name of parent bus. Device node path if one is created by the .Xr devfs 5 filesystem. +.It Li channel +Channel of the DIMM (memtemp). .It Li cisproduct CIS-product. .It Li cisvendor @@ -258,6 +260,8 @@ Device Class (USB) Device Sub-class (USB) .It Li device-name Name of attached/detached device. +.It Li dimm +DIMM id within the channel and node (memtemp). .It Li endpoints Endpoint count (USB) .It Li function @@ -284,6 +288,8 @@ Parent device Hub port number (USB) .It Li product Product ID (pccard/USB). +.It Li node +Node of the DIMM (memtemp). .It Li release Hardware revision (USB) .It Li sernum @@ -406,6 +412,20 @@ Notification that the CPU core has reached critical temperature. String containing the temperature of the core that has become too hot. .El .El +.It Li memtemp +Events related to the +.Xr memtemp 4 +device. +.Bl -tag -width ".Sy Subsystem" -compact +.It Sy Subsystem +.It Li Thermal +Notification that the DIMM has reached critical temperature. 
+.Bl -tag -width ".Ar temperature" -compact +.It Sy Type +.It Ar temperature +String containing the temperature of the DIMM that has become too hot. +.El +.El .Pp .It Li kern Events related to the kernel. @@ -546,6 +566,7 @@ The installed has many additional examples. .Sh SEE ALSO .Xr coretemp 4 , +.Xr memtemp 4 , .Xr devfs 5 , .Xr re_format 7 , .Xr devd 8 diff --git a/sys/dev/misc/ecc/e5_imc_reg.h b/sys/dev/misc/ecc/e5_imc_reg.h index 7946492b98..3672f4bd7a 100644 --- a/sys/dev/misc/ecc/e5_imc_reg.h +++ b/sys/dev/misc/ecc/e5_imc_reg.h @@ -180,7 +180,21 @@ #define PCIFUNC_E5V3_IMC1_THERMAL_CHN1 1 #define PCI_E5V3_IMC1_THERMAL_CHN1_DID_ID 0x2fd1 /* Thermal regs */ +#define PCI_E5_IMC_THERMAL_DIMM_TEMP_TH(dimm) (0x120 + ((dimm) * 4)) +#define PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPHI __BITS(16, 23) +#define PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPMID __BITS(8, 15) +#define PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPLO __BITS(0, 7) +#define PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPMIN 32 /* [MIN, MAX) */ +#define PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPMAX 128 +#define PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_DISABLE 255 #define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT(dimm) (0x150 + ((dimm) * 4)) +#define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPHI __BIT(28) +#define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPMID __BIT(27) +#define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPLO __BIT(26) +#define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPOEMLO __BIT(25) +#define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPOEMHI __BIT(24) #define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMP __BITS(0, 7) +#define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPMIN 0 /* [MIN, MAX) */ +#define PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPMAX 127 #endif /* !_E5_IMC_REG_H_ */ diff --git a/sys/dev/powermng/memtemp/memtemp_e5.c b/sys/dev/powermng/memtemp/memtemp_e5.c index 05a56729be..16a335f927 100644 --- a/sys/dev/powermng/memtemp/memtemp_e5.c +++ b/sys/dev/powermng/memtemp/memtemp_e5.c @@ -52,6 +52,9 @@ #include #include +#define MEMTEMP_E5_DIMM_TEMP_HIWAT 85 /* spec default TEMPLO */ 
+#define MEMTEMP_E5_DIMM_TEMP_STEP 5 /* spec TEMPLO/MID/HI step */ + struct memtemp_e5_softc; struct memtemp_e5_dimm { @@ -60,9 +63,13 @@ struct memtemp_e5_dimm { struct ksensor dimm_sensor; struct memtemp_e5_softc *dimm_parent; int dimm_id; - int dimm_extid; + int dimm_temp_hiwat; + int dimm_temp_lowat; + int dimm_flags; }; +#define MEMTEMP_E5_DIMM_FLAG_CRIT 0x1 + struct memtemp_e5_softc { device_t temp_dev; const struct e5_imc_chan *temp_chan; @@ -74,6 +81,8 @@ static int memtemp_e5_probe(device_t); static int memtemp_e5_attach(device_t); static int memtemp_e5_detach(device_t); +static int memtemp_e5_tempth_adjust(int); +static void memtemp_e5_tempth_str(int, char *, int); static void memtemp_e5_sensor_task(void *); #define MEMTEMP_E5_CHAN(v, imc, c, c_ext) \ @@ -90,7 +99,7 @@ static void memtemp_e5_sensor_task(void *); #define MEMTEMP_E5_CHAN_IMC0_V3(c) MEMTEMP_E5_CHAN(3, 0, c, c) #define MEMTEMP_E5_CHAN_IMC1_V3(c, c_ext) \ MEMTEMP_E5_CHAN(3, 1, c, c_ext) -#define MEMTEMP_E5_CHAN_END E5_IMC_CHAN_END +#define MEMTEMP_E5_CHAN_END E5_IMC_CHAN_END static const struct e5_imc_chan memtemp_e5_chans[] = { MEMTEMP_E5_CHAN_V2(0), @@ -170,6 +179,28 @@ memtemp_e5_probe(device_t dev) return ENXIO; } +static int +memtemp_e5_tempth_adjust(int temp) +{ + if (temp == PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_DISABLE) + return 0; + else if (temp < PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPMIN || + temp >= PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPMAX) + return -1; + return temp; +} + +static void +memtemp_e5_tempth_str(int temp, char *temp_str, int temp_strlen) +{ + if (temp < 0) + strlcpy(temp_str, "reserved", temp_strlen); + else if (temp == 0) + strlcpy(temp_str, "disabled", temp_strlen); + else + ksnprintf(temp_str, temp_strlen, "%dC", temp); +} + static int memtemp_e5_attach(device_t dev) { @@ -180,10 +211,12 @@ memtemp_e5_attach(device_t dev) TAILQ_INIT(&sc->temp_dimm); for (dimm = 0; dimm < PCI_E5_IMC_CHN_DIMM_MAX; ++dimm) { + char temp_lostr[16], temp_midstr[16], temp_histr[16]; struct 
memtemp_e5_dimm *dimm_sc; - uint32_t dimmmtr; + int dimm_extid, temp_lo, temp_mid, temp_hi; + uint32_t dimmmtr, temp_th; - dimmmtr = IMC_CTAD_READ_4(sc->temp_dev, sc->temp_chan, + dimmmtr = IMC_CTAD_READ_4(dev, sc->temp_chan, PCI_E5_IMC_CTAD_DIMMMTR(dimm)); if ((dimmmtr & PCI_E5_IMC_CTAD_DIMMMTR_DIMM_POP) == 0) @@ -193,24 +226,70 @@ memtemp_e5_attach(device_t dev) M_WAITOK | M_ZERO); dimm_sc->dimm_id = dimm; dimm_sc->dimm_parent = sc; - dimm_sc->dimm_extid = + + dimm_extid = (sc->temp_node * PCI_E5_IMC_CHN_MAX * PCI_E5_IMC_CHN_DIMM_MAX) + (sc->temp_chan->chan_ext * PCI_E5_IMC_CHN_DIMM_MAX) + dimm; - ksnprintf(dimm_sc->dimm_sensordev.xname, sizeof(dimm_sc->dimm_sensordev.xname), - "dimm%d", dimm_sc->dimm_extid); + "dimm%d", dimm_extid); dimm_sc->dimm_sensor.type = SENSOR_TEMP; sensor_attach(&dimm_sc->dimm_sensordev, &dimm_sc->dimm_sensor); if (sensor_task_register(dimm_sc, memtemp_e5_sensor_task, 2)) { - device_printf(sc->temp_dev, "DIMM%d sensor task " - "register failed\n", dimm); + device_printf(dev, "DIMM%d sensor task register " + "failed\n", dimm); kfree(dimm_sc, M_DEVBUF); continue; } sensordev_install(&dimm_sc->dimm_sensordev); TAILQ_INSERT_TAIL(&sc->temp_dimm, dimm_sc, dimm_link); + + temp_th = pci_read_config(dev, + PCI_E5_IMC_THERMAL_DIMM_TEMP_TH(dimm), 4); + + temp_lo = __SHIFTOUT(temp_th, + PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPLO); + temp_lo = memtemp_e5_tempth_adjust(temp_lo); + memtemp_e5_tempth_str(temp_lo, temp_lostr, sizeof(temp_lostr)); + + temp_mid = __SHIFTOUT(temp_th, + PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPMID); + temp_mid = memtemp_e5_tempth_adjust(temp_mid); + memtemp_e5_tempth_str(temp_mid, temp_midstr, + sizeof(temp_midstr)); + + temp_hi = __SHIFTOUT(temp_th, + PCI_E5_IMC_THERMAL_DIMM_TEMP_TH_TEMPHI); + temp_hi = memtemp_e5_tempth_adjust(temp_hi); + memtemp_e5_tempth_str(temp_hi, temp_histr, sizeof(temp_histr)); + + /* + * NOTE: + * - TEMPHI initiates THRTCRIT. + * - TEMPMID initiates THRTHI, so it is also taken into + * consideration. 
+ * - Some BIOSes program temp_lo to a ridiculously low value, + * so ignore TEMPLO here. + */ + if (temp_mid <= 0) { + if (temp_hi <= 0) + dimm_sc->dimm_temp_hiwat = MEMTEMP_E5_DIMM_TEMP_HIWAT; + else + dimm_sc->dimm_temp_hiwat = temp_hi; + } else { + dimm_sc->dimm_temp_hiwat = temp_mid; + } + if (dimm_sc->dimm_temp_hiwat < MEMTEMP_E5_DIMM_TEMP_STEP) + dimm_sc->dimm_temp_hiwat = MEMTEMP_E5_DIMM_TEMP_HIWAT; + dimm_sc->dimm_temp_lowat = dimm_sc->dimm_temp_hiwat - + MEMTEMP_E5_DIMM_TEMP_STEP; + + device_printf(dev, "DIMM%d " + "temp_hi %s, temp_mid %s, temp_lo %s\n", dimm, + temp_histr, temp_midstr, temp_lostr); + device_printf(dev, "DIMM%d hiwat %dC, lowat %dC\n", dimm, + dimm_sc->dimm_temp_hiwat, dimm_sc->dimm_temp_lowat); } return 0; } @@ -237,13 +316,59 @@ memtemp_e5_sensor_task(void *xdimm_sc) { struct memtemp_e5_dimm *dimm_sc = xdimm_sc; struct ksensor *sensor = &dimm_sc->dimm_sensor; + device_t dev = dimm_sc->dimm_parent->temp_dev; + int dimm = dimm_sc->dimm_id; uint32_t val; - int temp; + int temp, reg; + + reg = PCI_E5_IMC_THERMAL_DIMMTEMPSTAT(dimm); + + val = pci_read_config(dev, reg, 4); + if (val & (PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPHI | + PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPMID | + PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPLO)) + pci_write_config(dev, reg, val, 4); - val = pci_read_config(dimm_sc->dimm_parent->temp_dev, - PCI_E5_IMC_THERMAL_DIMMTEMPSTAT(dimm_sc->dimm_id), 4); temp = __SHIFTOUT(val, PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMP); + if (temp < PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPMIN || + temp >= PCI_E5_IMC_THERMAL_DIMMTEMPSTAT_TEMPMAX) { + sensor->status = SENSOR_S_UNSPEC; + sensor->flags |= SENSOR_FINVALID; + sensor->value = 0; + return; + } + + /* + * Some BIOSes will always turn on TEMPMID, so we rely on + * our own hiwat/lowat to send the notification. 
+ */ + if (temp >= dimm_sc->dimm_temp_hiwat && + (dimm_sc->dimm_flags & MEMTEMP_E5_DIMM_FLAG_CRIT) == 0) { + int node, chan; + char temp_str[16], data[64]; + + node = dimm_sc->dimm_parent->temp_node; + chan = dimm_sc->dimm_parent->temp_chan->chan_ext; + + ksnprintf(temp_str, sizeof(temp_str), "%d", temp); + ksnprintf(data, sizeof(data), + "node=%d channel=%d dimm=%d", node, chan, dimm); + devctl_notify("memtemp", "Thermal", temp_str, data); + + device_printf(dev, "node%d channel%d DIMM%d " + "temperature (%dC) is too high (>= %d)\n", + node, chan, dimm, temp, dimm_sc->dimm_temp_hiwat); + + dimm_sc->dimm_flags |= MEMTEMP_E5_DIMM_FLAG_CRIT; + } else if ((dimm_sc->dimm_flags & MEMTEMP_E5_DIMM_FLAG_CRIT) && + temp < dimm_sc->dimm_temp_lowat) { + dimm_sc->dimm_flags &= ~MEMTEMP_E5_DIMM_FLAG_CRIT; + } + if (dimm_sc->dimm_flags & MEMTEMP_E5_DIMM_FLAG_CRIT) + sensor->status = SENSOR_S_CRIT; + else + sensor->status = SENSOR_S_OK; sensor->flags &= ~SENSOR_FINVALID; sensor->value = (temp * 1000000) + 273150000; } -- 2.41.0