dimm: Allow ECC error threshold to be configured
sys/dev/misc/ecc/ecc_e5.c
/*
 * Copyright (c) 2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Sepherosa Ziehau <sepherosa@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

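/*
 * ECC (correctable memory error) monitoring for the integrated memory
 * controllers (IMCs) of Intel Xeon E5 v2/v3 CPUs.  Each instance of this
 * driver attaches to one IMC channel's error-reporting PCI function and
 * exposes a per-DIMM ECC sensor through the dimm(4) framework.
 */
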
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bitops.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/sensors.h>

#include <bus/pci/pcivar.h>
#include <bus/pci/pcireg.h>
#include <bus/pci/pcibus.h>
#include <bus/pci/pci_cfgreg.h>
#include <bus/pci/pcib_private.h>

#include "pcib_if.h"

#include <dev/misc/dimm/dimm.h>
#include <dev/misc/ecc/e5_imc_reg.h>
#include <dev/misc/ecc/e5_imc_var.h>

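/*
 * Per-DIMM state: the dimm(4) softc created for a populated DIMM plus the
 * ECC sensor that is attached to it.
 */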
struct ecc_e5_dimm {
        TAILQ_ENTRY(ecc_e5_dimm) dimm_link;
        struct dimm_softc       *dimm_softc;
        struct ksensor          dimm_sensor;
};

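/*
 * Maps a channel-wide rank index back to its owner DIMM and to the rank
 * number within that DIMM.
 */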
struct ecc_e5_rank {
        struct ecc_e5_dimm *rank_dimm_sc;
        int             rank_dimm;      /* owner dimm */
        int             rank_dimm_rank; /* rank within the owner dimm */
};

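/*
 * Per-channel softc: the matched channel description, the node the IMC
 * belongs to, the rank map built at attach time and the list of
 * populated DIMMs.
 */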
struct ecc_e5_softc {
        device_t                ecc_dev;
        const struct e5_imc_chan *ecc_chan;
        int                     ecc_node;
        int                     ecc_rank_cnt;
        struct ecc_e5_rank      ecc_rank[PCI_E5_IMC_ERROR_RANK_MAX];
        TAILQ_HEAD(, ecc_e5_dimm) ecc_dimm;
};

#define ecc_printf(sc, fmt, arg...) \
        device_printf((sc)->ecc_dev, fmt , ##arg)

static int      ecc_e5_probe(device_t);
static int      ecc_e5_attach(device_t);
static int      ecc_e5_detach(device_t);
static void     ecc_e5_shutdown(device_t);

static void     ecc_e5_sensor_task(void *);

#define ECC_E5_CHAN(v, imc, c, c_ext)                           \
{                                                               \
        .did            = PCI_E5V##v##_IMC##imc##_ERROR_CHN##c##_DID_ID, \
        .slot           = PCISLOT_E5V##v##_IMC##imc##_ERROR_CHN##c, \
        .func           = PCIFUNC_E5V##v##_IMC##imc##_ERROR_CHN##c, \
        .desc           = "Intel E5 v" #v " ECC",               \
                                                                \
        E5_IMC_CHAN_FIELDS(v, imc, c, c_ext)                    \
}

#define ECC_E5_CHAN_V2(c)               ECC_E5_CHAN(2, 0, c, c)
#define ECC_E5_CHAN_IMC0_V3(c)          ECC_E5_CHAN(3, 0, c, c)
#define ECC_E5_CHAN_IMC1_V3(c, c_ext)   ECC_E5_CHAN(3, 1, c, c_ext)
#define ECC_E5_CHAN_END                 E5_IMC_CHAN_END

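/*
 * PCI DID/slot/function of the per-channel error devices this driver
 * matches.  E5 v2 exposes four channels on a single IMC; E5 v3 adds a
 * second IMC whose two channels are reported as channels 2 and 3.
 */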
static const struct e5_imc_chan ecc_e5_chans[] = {
        ECC_E5_CHAN_V2(0),
        ECC_E5_CHAN_V2(1),
        ECC_E5_CHAN_V2(2),
        ECC_E5_CHAN_V2(3),

        ECC_E5_CHAN_IMC0_V3(0),
        ECC_E5_CHAN_IMC0_V3(1),
        ECC_E5_CHAN_IMC0_V3(2),
        ECC_E5_CHAN_IMC0_V3(3),
        ECC_E5_CHAN_IMC1_V3(0, 2),      /* IMC1 chan0 -> channel2 */
        ECC_E5_CHAN_IMC1_V3(1, 3),      /* IMC1 chan1 -> channel3 */

        ECC_E5_CHAN_END
};

#undef ECC_E5_CHAN_END
#undef ECC_E5_CHAN_V2
#undef ECC_E5_CHAN

static device_method_t ecc_e5_methods[] = {
        /* Device interface */
        DEVMETHOD(device_probe,         ecc_e5_probe),
        DEVMETHOD(device_attach,        ecc_e5_attach),
        DEVMETHOD(device_detach,        ecc_e5_detach),
        DEVMETHOD(device_shutdown,      ecc_e5_shutdown),
        DEVMETHOD(device_suspend,       bus_generic_suspend),
        DEVMETHOD(device_resume,        bus_generic_resume),
        DEVMETHOD_END
};

static driver_t ecc_e5_driver = {
        "ecc",
        ecc_e5_methods,
        sizeof(struct ecc_e5_softc)
};
static devclass_t ecc_devclass;
DRIVER_MODULE(ecc_e5, pci, ecc_e5_driver, ecc_devclass, NULL, NULL);
MODULE_DEPEND(ecc_e5, pci, 1, 1, 1);
MODULE_DEPEND(ecc_e5, dimm, 1, 1, 1);

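/*
 * Match the PCI device against the channel table above and resolve the
 * node of the owning IMC; both are cached in the softc for attach.
 */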
static int
ecc_e5_probe(device_t dev)
{
        const struct e5_imc_chan *c;
        uint16_t vid, did;
        int slot, func;

        vid = pci_get_vendor(dev);
        if (vid != PCI_E5_IMC_VID_ID)
                return ENXIO;

        did = pci_get_device(dev);
        slot = pci_get_slot(dev);
        func = pci_get_function(dev);

        for (c = ecc_e5_chans; c->desc != NULL; ++c) {
                if (c->did == did && c->slot == slot && c->func == func) {
                        struct ecc_e5_softc *sc = device_get_softc(dev);
                        int node;

                        node = e5_imc_node_probe(dev, c);
                        if (node < 0)
                                break;

                        device_set_desc(dev, c->desc);

                        sc->ecc_chan = c;
                        sc->ecc_node = node;
                        return 0;
                }
        }
        return ENXIO;
}

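/*
 * Walk the channel's DIMM slots, decode the DIMMMTR register of each
 * populated one (rank count, device width, density), create a dimm(4)
 * device plus ECC sensor for it, and record which DIMM every rank index
 * belongs to.  If ECC is enabled, the per-rank correctable-error
 * thresholds are pushed to the dimm(4) layer and a sensor task is
 * registered to poll the error counters.
 */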
static int
ecc_e5_attach(device_t dev)
{
        struct ecc_e5_softc *sc = device_get_softc(dev);
        uint32_t mcmtr;
        int dimm, rank;

        TAILQ_INIT(&sc->ecc_dimm);
        sc->ecc_dev = dev;

        mcmtr = IMC_CPGC_READ_4(sc->ecc_dev, sc->ecc_chan,
            PCI_E5_IMC_CPGC_MCMTR);
        if (bootverbose) {
                if (sc->ecc_chan->ver == E5_IMC_CHAN_VER3 &&
                    (mcmtr & PCI_E5V3_IMC_CPGC_MCMTR_DDR4))
                        ecc_printf(sc, "DDR4\n");
                if (__SHIFTOUT(mcmtr, PCI_E5_IMC_CPGC_MCMTR_IMC_MODE) ==
                    PCI_E5_IMC_CPGC_MCMTR_IMC_MODE_DDR3) {
                        ecc_printf(sc, "native %s\n",
                            sc->ecc_chan->ver == E5_IMC_CHAN_VER2 ?
                            "DDR3" : "DDR");
                }
        }

        rank = 0;
        for (dimm = 0; dimm < PCI_E5_IMC_CHN_DIMM_MAX; ++dimm) {
                struct ecc_e5_dimm *dimm_sc;
                struct ksensor *sens;
                const char *width;
                uint32_t dimmmtr;
                int rank_cnt, r;
                int density;
                int val;

                dimmmtr = IMC_CTAD_READ_4(sc->ecc_dev, sc->ecc_chan,
                    PCI_E5_IMC_CTAD_DIMMMTR(dimm));

                if ((dimmmtr & PCI_E5_IMC_CTAD_DIMMMTR_DIMM_POP) == 0)
                        continue;

                val = __SHIFTOUT(dimmmtr, PCI_E5_IMC_CTAD_DIMMMTR_RANK_CNT);
                switch (val) {
                case PCI_E5_IMC_CTAD_DIMMMTR_RANK_CNT_SR:
                        rank_cnt = 1;
                        break;
                case PCI_E5_IMC_CTAD_DIMMMTR_RANK_CNT_DR:
                        rank_cnt = 2;
                        break;
                case PCI_E5_IMC_CTAD_DIMMMTR_RANK_CNT_QR:
                        rank_cnt = 4;
                        break;
                case PCI_E5V3_IMC_CTAD_DIMMMTR_RANK_CNT_8R:
                        if (sc->ecc_chan->ver >= E5_IMC_CHAN_VER3) {
                                rank_cnt = 8;
                                break;
                        }
                        /* FALL THROUGH */
                default:
                        ecc_printf(sc, "unknown rank count 0x%x\n", val);
                        return ENXIO;
                }

                val = __SHIFTOUT(dimmmtr, PCI_E5_IMC_CTAD_DIMMMTR_DDR3_WIDTH);
                switch (val) {
                case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_WIDTH_4:
                        width = "x4";
                        break;
                case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_WIDTH_8:
                        width = "x8";
                        break;
                case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_WIDTH_16:
                        width = "x16";
                        break;
                default:
                        ecc_printf(sc, "unknown ddr3 width 0x%x\n", val);
                        return ENXIO;
                }

                val = __SHIFTOUT(dimmmtr, PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY);
                switch (val) {
                case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY_2G:
                        density = 2;
                        break;
                case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY_4G:
                        density = 4;
                        break;
                case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY_8G:
                        density = 8;
                        break;
                case PCI_E5_IMC_CTAD_DIMMMTR_DDR3_DNSTY_1G:
                        if (sc->ecc_chan->ver < E5_IMC_CHAN_VER3) {
                                density = 1;
                                break;
                        }
                        /* FALL THROUGH */
                default:
                        ecc_printf(sc, "unknown ddr3 density 0x%x\n", val);
                        return ENXIO;
                }

                if (bootverbose) {
                        ecc_printf(sc, "DIMM%d %dGB, %d%s, density %dGB\n",
                            dimm, density * rank_cnt * 2,
                            rank_cnt, width, density);
                }

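                /*
                 * Create the dimm(4) companion device for this slot and
                 * attach an ECC sensor to it; the sensor starts out in
                 * the OK state with a count of zero.
                 */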
                dimm_sc = kmalloc(sizeof(*dimm_sc), M_DEVBUF,
                    M_WAITOK | M_ZERO);
                dimm_sc->dimm_softc =
                    dimm_create(sc->ecc_node, sc->ecc_chan->chan_ext, dimm);

                sens = &dimm_sc->dimm_sensor;
                ksnprintf(sens->desc, sizeof(sens->desc),
                    "node%d chan%d DIMM%d ecc",
                    sc->ecc_node, sc->ecc_chan->chan_ext, dimm);
                sens->type = SENSOR_ECC;
                sensor_set(sens, 0, SENSOR_S_OK);
                dimm_sensor_attach(dimm_sc->dimm_softc, sens);

                TAILQ_INSERT_TAIL(&sc->ecc_dimm, dimm_sc, dimm_link);

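                /*
                 * Remember, for each channel-wide rank index, which DIMM
                 * it lives on; the sensor task uses this map to charge
                 * correctable errors to the right DIMM sensor.
                 */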
                for (r = 0; r < rank_cnt; ++r) {
                        struct ecc_e5_rank *rk;

                        if (rank >= PCI_E5_IMC_ERROR_RANK_MAX) {
                                ecc_printf(sc, "too many ranks\n");
                                return ENXIO;
                        }
                        rk = &sc->ecc_rank[rank];

                        rk->rank_dimm_sc = dimm_sc;
                        rk->rank_dimm = dimm;
                        rk->rank_dimm_rank = r;

                        ++rank;
                }
        }
        sc->ecc_rank_cnt = rank;

        if ((mcmtr & PCI_E5_IMC_CPGC_MCMTR_ECC_EN) == 0) {
                ecc_printf(sc, "ECC is not enabled\n");
                return 0;
        }

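        /*
         * Each COR_ERR_TH register holds the correctable-error thresholds
         * for a pair of ranks (low/high halves).  Read the threshold
         * currently programmed for every rank and report it to the
         * dimm(4) layer, presumably so it can decide when the ECC sensor
         * should leave the OK state.
         */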
        for (rank = 0; rank < sc->ecc_rank_cnt; ++rank) {
                const struct ecc_e5_rank *rk = &sc->ecc_rank[rank];
                uint32_t thr, mask;
                int ofs;

                ofs = PCI_E5_IMC_ERROR_COR_ERR_TH(rank / 2);
                if (rank & 1)
                        mask = PCI_E5_IMC_ERROR_COR_ERR_TH_HI;
                else
                        mask = PCI_E5_IMC_ERROR_COR_ERR_TH_LO;

                thr = pci_read_config(sc->ecc_dev, ofs, 4);
                dimm_set_ecc_thresh(rk->rank_dimm_sc->dimm_softc,
                    __SHIFTOUT(thr, mask));
        }

        sensor_task_register(sc, ecc_e5_sensor_task, 1);
        return 0;
}

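/*
 * Polled by the sensors framework (registered above with a period of 1):
 * check which ranks have logged correctable errors, read the per-rank
 * error counters, feed the counts into the owning DIMM's ECC sensor and
 * finally write the status value back, which apparently clears the rank
 * bits that were set.
 */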
static void
ecc_e5_sensor_task(void *xsc)
{
        struct ecc_e5_softc *sc = xsc;
        uint32_t err_ranks, val;

        val = pci_read_config(sc->ecc_dev, PCI_E5_IMC_ERROR_COR_ERR_STAT, 4);

        err_ranks = (val & PCI_E5_IMC_ERROR_COR_ERR_STAT_RANKS);
        while (err_ranks != 0) {
                int rank;

                rank = ffs(err_ranks) - 1;
                err_ranks &= ~(1 << rank);

                if (rank < sc->ecc_rank_cnt) {
                        const struct ecc_e5_rank *rk = &sc->ecc_rank[rank];
                        struct ecc_e5_dimm *dimm_sc = rk->rank_dimm_sc;
                        uint32_t err, mask;
                        int ofs, ecc_cnt;

                        ofs = PCI_E5_IMC_ERROR_COR_ERR_CNT(rank / 2);
                        if (rank & 1)
                                mask = PCI_E5_IMC_ERROR_COR_ERR_CNT_HI;
                        else
                                mask = PCI_E5_IMC_ERROR_COR_ERR_CNT_LO;

                        err = pci_read_config(sc->ecc_dev, ofs, 4);
                        ecc_cnt = __SHIFTOUT(err, mask);

                        dimm_sensor_ecc_set(dimm_sc->dimm_softc,
                            &dimm_sc->dimm_sensor, ecc_cnt, TRUE);
                }
        }

        if (val & PCI_E5_IMC_ERROR_COR_ERR_STAT_RANKS) {
                pci_write_config(sc->ecc_dev, PCI_E5_IMC_ERROR_COR_ERR_STAT,
                    val, 4);
        }
}

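/*
 * Common teardown for detach and shutdown: stop the periodic sensor task.
 */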
static void
ecc_e5_stop(device_t dev)
{
        struct ecc_e5_softc *sc = device_get_softc(dev);

        sensor_task_unregister(sc);
}

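/*
 * Detach: stop polling, then detach the ECC sensors and destroy the
 * dimm(4) devices that were created at attach time.
 */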
static int
ecc_e5_detach(device_t dev)
{
        struct ecc_e5_softc *sc = device_get_softc(dev);
        struct ecc_e5_dimm *dimm_sc;

        ecc_e5_stop(dev);

        while ((dimm_sc = TAILQ_FIRST(&sc->ecc_dimm)) != NULL) {
                TAILQ_REMOVE(&sc->ecc_dimm, dimm_sc, dimm_link);
                dimm_sensor_detach(dimm_sc->dimm_softc, &dimm_sc->dimm_sensor);
                dimm_destroy(dimm_sc->dimm_softc);

                kfree(dimm_sc, M_DEVBUF);
        }
        return 0;
}

static void
ecc_e5_shutdown(device_t dev)
{
        ecc_e5_stop(dev);
}