mirror of
https://git.FreeBSD.org/src.git
synced 2024-12-11 09:50:12 +00:00
cxgbe(4): Changes to the fatal error handler.
* New error_flags that can be used from the error ithread and elsewhere without a synch_op.
* Stop the adapter immediately in t4_fatal_err but defer most of the rest of the handling to a task. The task is allowed to sleep, unlike the ithread. Remove async_event_task as it is no longer needed.
* Dump the devlog, CIMLA, and PCIE_FW exactly once on any fatal error involving the firmware or the CIM block. While here, dump some additional info (see dump_cim_regs) for these errors.
* If both reset_on_fatal_err and panic_on_fatal_err are set then attempt a reset first and do not panic the system if it is successful.

MFC after: 1 week
Sponsored by: Chelsio Communications
This commit is contained in:
parent
7e05fa3b44
commit
e9e7bc8250
@ -154,18 +154,21 @@ enum {
|
||||
};
|
||||
|
||||
enum {
|
||||
/* adapter flags */
|
||||
/* adapter flags. synch_op or adapter_lock. */
|
||||
FULL_INIT_DONE = (1 << 0),
|
||||
FW_OK = (1 << 1),
|
||||
CHK_MBOX_ACCESS = (1 << 2),
|
||||
MASTER_PF = (1 << 3),
|
||||
/* 1 << 4 is unused, was ADAP_SYSCTL_CTX */
|
||||
ADAP_ERR = (1 << 5),
|
||||
BUF_PACKING_OK = (1 << 6),
|
||||
IS_VF = (1 << 7),
|
||||
KERN_TLS_ON = (1 << 8), /* HW is configured for KERN_TLS */
|
||||
CXGBE_BUSY = (1 << 9),
|
||||
HW_OFF_LIMITS = (1 << 10), /* off limits to all except reset_thread */
|
||||
|
||||
/* adapter error_flags. reg_lock for HW_OFF_LIMITS, atomics for the rest. */
|
||||
ADAP_STOPPED = (1 << 0), /* Adapter has been stopped. */
|
||||
ADAP_FATAL_ERR = (1 << 1), /* Encountered a fatal error. */
|
||||
HW_OFF_LIMITS = (1 << 2), /* off limits to all except reset_thread */
|
||||
ADAP_CIM_ERR = (1 << 3), /* Error was related to FW/CIM. */
|
||||
|
||||
/* port flags */
|
||||
HAS_TRACEQ = (1 << 3),
|
||||
@ -906,7 +909,6 @@ struct adapter {
|
||||
int nrawf;
|
||||
|
||||
struct taskqueue *tq[MAX_NCHAN]; /* General purpose taskqueues */
|
||||
struct task async_event_task;
|
||||
struct port_info *port[MAX_NPORTS];
|
||||
uint8_t chan_map[MAX_NCHAN]; /* channel -> port */
|
||||
|
||||
@ -937,6 +939,7 @@ struct adapter {
|
||||
int active_ulds; /* ULDs activated on this adapter */
|
||||
int flags;
|
||||
int debug_flags;
|
||||
int error_flags; /* Used by error handler and live reset. */
|
||||
|
||||
char ifp_lockname[16];
|
||||
struct mtx ifp_lock;
|
||||
@ -993,6 +996,7 @@ struct adapter {
|
||||
struct mtx tc_lock;
|
||||
struct task tc_task;
|
||||
|
||||
struct task fatal_error_task;
|
||||
struct task reset_task;
|
||||
const void *reset_thread;
|
||||
int num_resets;
|
||||
@ -1091,7 +1095,9 @@ forwarding_intr_to_fwq(struct adapter *sc)
|
||||
static inline bool
|
||||
hw_off_limits(struct adapter *sc)
|
||||
{
|
||||
return (__predict_false(sc->flags & HW_OFF_LIMITS));
|
||||
int off_limits = atomic_load_int(&sc->error_flags) & HW_OFF_LIMITS;
|
||||
|
||||
return (__predict_false(off_limits != 0));
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
@ -1288,8 +1294,7 @@ void free_atid(struct adapter *, int);
|
||||
void release_tid(struct adapter *, int, struct sge_wrq *);
|
||||
int cxgbe_media_change(struct ifnet *);
|
||||
void cxgbe_media_status(struct ifnet *, struct ifmediareq *);
|
||||
bool t4_os_dump_cimla(struct adapter *, int, bool);
|
||||
void t4_os_dump_devlog(struct adapter *);
|
||||
void t4_os_cim_err(struct adapter *);
|
||||
|
||||
#ifdef KERN_TLS
|
||||
/* t4_kern_tls.c */
|
||||
|
@ -582,6 +582,7 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
|
||||
int size, void *rpl, bool sleep_ok, int timeout);
|
||||
int t4_wr_mbox_meat(struct adapter *adap, int mbox, const void *cmd, int size,
|
||||
void *rpl, bool sleep_ok);
|
||||
void t4_report_fw_error(struct adapter *adap);
|
||||
|
||||
static inline int t4_wr_mbox_timeout(struct adapter *adap, int mbox,
|
||||
const void *cmd, int size, void *rpl,
|
||||
@ -617,7 +618,7 @@ struct fw_filter_wr;
|
||||
void t4_intr_enable(struct adapter *adapter);
|
||||
void t4_intr_disable(struct adapter *adapter);
|
||||
void t4_intr_clear(struct adapter *adapter);
|
||||
int t4_slow_intr_handler(struct adapter *adapter, bool verbose);
|
||||
bool t4_slow_intr_handler(struct adapter *adapter, bool verbose);
|
||||
|
||||
int t4_hash_mac_addr(const u8 *addr);
|
||||
int t4_link_l1cfg(struct adapter *adap, unsigned int mbox, unsigned int port,
|
||||
|
@ -196,7 +196,7 @@ u32 t4_hw_pci_read_cfg4(adapter_t *adap, int reg)
|
||||
* If the firmware has indicated an error, print out the reason for
|
||||
* the firmware error.
|
||||
*/
|
||||
static void t4_report_fw_error(struct adapter *adap)
|
||||
void t4_report_fw_error(struct adapter *adap)
|
||||
{
|
||||
static const char *const reason[] = {
|
||||
"Crash", /* PCIE_FW_EVAL_CRASH */
|
||||
@ -212,11 +212,8 @@ static void t4_report_fw_error(struct adapter *adap)
|
||||
|
||||
pcie_fw = t4_read_reg(adap, A_PCIE_FW);
|
||||
if (pcie_fw & F_PCIE_FW_ERR) {
|
||||
adap->flags &= ~FW_OK;
|
||||
CH_ERR(adap, "firmware reports adapter error: %s (0x%08x)\n",
|
||||
reason[G_PCIE_FW_EVAL(pcie_fw)], pcie_fw);
|
||||
if (pcie_fw != 0xffffffff)
|
||||
t4_os_dump_devlog(adap);
|
||||
}
|
||||
}
|
||||
|
||||
@ -374,6 +371,12 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
|
||||
/*
|
||||
* Attempt to gain access to the mailbox.
|
||||
*/
|
||||
pcie_fw = 0;
|
||||
if (!(adap->flags & IS_VF)) {
|
||||
pcie_fw = t4_read_reg(adap, A_PCIE_FW);
|
||||
if (pcie_fw & F_PCIE_FW_ERR)
|
||||
goto failed;
|
||||
}
|
||||
for (i = 0; i < 4; i++) {
|
||||
ctl = t4_read_reg(adap, ctl_reg);
|
||||
v = G_MBOWNER(ctl);
|
||||
@ -385,7 +388,11 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
|
||||
* If we were unable to gain access, report the error to our caller.
|
||||
*/
|
||||
if (v != X_MBOWNER_PL) {
|
||||
t4_report_fw_error(adap);
|
||||
if (!(adap->flags & IS_VF)) {
|
||||
pcie_fw = t4_read_reg(adap, A_PCIE_FW);
|
||||
if (pcie_fw & F_PCIE_FW_ERR)
|
||||
goto failed;
|
||||
}
|
||||
ret = (v == X_MBOWNER_FW) ? -EBUSY : -ETIMEDOUT;
|
||||
return ret;
|
||||
}
|
||||
@ -436,7 +443,6 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
|
||||
* Loop waiting for the reply; bail out if we time out or the firmware
|
||||
* reports an error.
|
||||
*/
|
||||
pcie_fw = 0;
|
||||
for (i = 0; i < timeout; i += ms) {
|
||||
if (!(adap->flags & IS_VF)) {
|
||||
pcie_fw = t4_read_reg(adap, A_PCIE_FW);
|
||||
@ -494,15 +500,9 @@ int t4_wr_mbox_meat_timeout(struct adapter *adap, int mbox, const void *cmd,
|
||||
*(const u8 *)cmd, mbox, pcie_fw);
|
||||
CH_DUMP_MBOX(adap, mbox, 0, "cmdsent", cmd_rpl, true);
|
||||
CH_DUMP_MBOX(adap, mbox, data_reg, "current", NULL, true);
|
||||
|
||||
if (pcie_fw & F_PCIE_FW_ERR) {
|
||||
ret = -ENXIO;
|
||||
t4_report_fw_error(adap);
|
||||
} else {
|
||||
ret = -ETIMEDOUT;
|
||||
t4_os_dump_devlog(adap);
|
||||
}
|
||||
|
||||
failed:
|
||||
adap->flags &= ~FW_OK;
|
||||
ret = pcie_fw & F_PCIE_FW_ERR ? -ENXIO : -ETIMEDOUT;
|
||||
t4_fatal_err(adap, true);
|
||||
return ret;
|
||||
}
|
||||
@ -4464,10 +4464,6 @@ static bool sge_intr_handler(struct adapter *adap, int arg, bool verbose)
|
||||
*/
|
||||
static bool cim_intr_handler(struct adapter *adap, int arg, bool verbose)
|
||||
{
|
||||
static const struct intr_action cim_host_intr_actions[] = {
|
||||
{ F_TIMER0INT, 0, t4_os_dump_cimla },
|
||||
{ 0 },
|
||||
};
|
||||
static const struct intr_details cim_host_intr_details[] = {
|
||||
/* T6+ */
|
||||
{ F_PCIE2CIMINTFPARERR, "CIM IBQ PCIe interface parity error" },
|
||||
@ -4513,7 +4509,7 @@ static bool cim_intr_handler(struct adapter *adap, int arg, bool verbose)
|
||||
.fatal = 0x007fffe6,
|
||||
.flags = NONFATAL_IF_DISABLED,
|
||||
.details = cim_host_intr_details,
|
||||
.actions = cim_host_intr_actions,
|
||||
.actions = NULL,
|
||||
};
|
||||
static const struct intr_details cim_host_upacc_intr_details[] = {
|
||||
{ F_EEPROMWRINT, "CIM EEPROM came out of busy state" },
|
||||
@ -4578,10 +4574,6 @@ static bool cim_intr_handler(struct adapter *adap, int arg, bool verbose)
|
||||
u32 val, fw_err;
|
||||
bool fatal;
|
||||
|
||||
fw_err = t4_read_reg(adap, A_PCIE_FW);
|
||||
if (fw_err & F_PCIE_FW_ERR)
|
||||
t4_report_fw_error(adap);
|
||||
|
||||
/*
|
||||
* When the Firmware detects an internal error which normally wouldn't
|
||||
* raise a Host Interrupt, it forces a CIM Timer0 interrupt in order
|
||||
@ -4589,16 +4581,19 @@ static bool cim_intr_handler(struct adapter *adap, int arg, bool verbose)
|
||||
* Timer0 interrupt and don't see a Firmware Crash, ignore the Timer0
|
||||
* interrupt.
|
||||
*/
|
||||
fw_err = t4_read_reg(adap, A_PCIE_FW);
|
||||
val = t4_read_reg(adap, A_CIM_HOST_INT_CAUSE);
|
||||
if (val & F_TIMER0INT && (!(fw_err & F_PCIE_FW_ERR) ||
|
||||
G_PCIE_FW_EVAL(fw_err) != PCIE_FW_EVAL_CRASH)) {
|
||||
t4_write_reg(adap, A_CIM_HOST_INT_CAUSE, F_TIMER0INT);
|
||||
}
|
||||
|
||||
fatal = false;
|
||||
fatal = (fw_err & F_PCIE_FW_ERR) != 0;
|
||||
fatal |= t4_handle_intr(adap, &cim_host_intr_info, 0, verbose);
|
||||
fatal |= t4_handle_intr(adap, &cim_host_upacc_intr_info, 0, verbose);
|
||||
fatal |= t4_handle_intr(adap, &cim_pf_host_intr_info, 0, verbose);
|
||||
if (fatal)
|
||||
t4_os_cim_err(adap);
|
||||
|
||||
return (fatal);
|
||||
}
|
||||
@ -5297,7 +5292,7 @@ static bool plpl_intr_handler(struct adapter *adap, int arg, bool verbose)
|
||||
* The designation 'slow' is because it involves register reads, while
|
||||
* data interrupts typically don't involve any MMIOs.
|
||||
*/
|
||||
int t4_slow_intr_handler(struct adapter *adap, bool verbose)
|
||||
bool t4_slow_intr_handler(struct adapter *adap, bool verbose)
|
||||
{
|
||||
static const struct intr_details pl_intr_details[] = {
|
||||
{ F_MC1, "MC1" },
|
||||
@ -5376,7 +5371,6 @@ int t4_slow_intr_handler(struct adapter *adap, bool verbose)
|
||||
.details = pl_intr_details,
|
||||
.actions = pl_intr_action,
|
||||
};
|
||||
bool fatal;
|
||||
u32 perr;
|
||||
|
||||
perr = t4_read_reg(adap, pl_perr_cause.cause_reg);
|
||||
@ -5387,11 +5381,8 @@ int t4_slow_intr_handler(struct adapter *adap, bool verbose)
|
||||
if (verbose)
|
||||
perr |= t4_read_reg(adap, pl_intr_info.enable_reg);
|
||||
}
|
||||
fatal = t4_handle_intr(adap, &pl_intr_info, perr, verbose);
|
||||
if (fatal)
|
||||
t4_fatal_err(adap, false);
|
||||
|
||||
return (0);
|
||||
return (t4_handle_intr(adap, &pl_intr_info, perr, verbose));
|
||||
}
|
||||
|
||||
#define PF_INTR_MASK (F_PFSW | F_PFCIM)
|
||||
@ -7521,8 +7512,6 @@ int t4_fw_hello(struct adapter *adap, unsigned int mbox, unsigned int evt_mbox,
|
||||
if (ret != FW_SUCCESS) {
|
||||
if ((ret == -EBUSY || ret == -ETIMEDOUT) && retries-- > 0)
|
||||
goto retry;
|
||||
if (t4_read_reg(adap, A_PCIE_FW) & F_PCIE_FW_ERR)
|
||||
t4_report_fw_error(adap);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -854,7 +854,7 @@ static int hold_clip_addr(struct adapter *, struct t4_clip_addr *);
|
||||
static int release_clip_addr(struct adapter *, struct t4_clip_addr *);
|
||||
#ifdef TCP_OFFLOAD
|
||||
static int toe_capability(struct vi_info *, bool);
|
||||
static void t4_async_event(void *, int);
|
||||
static void t4_async_event(struct adapter *);
|
||||
#endif
|
||||
#ifdef KERN_TLS
|
||||
static int ktls_capability(struct adapter *, bool);
|
||||
@ -864,7 +864,11 @@ static int notify_siblings(device_t, int);
|
||||
static uint64_t vi_get_counter(struct ifnet *, ift_counter);
|
||||
static uint64_t cxgbe_get_counter(struct ifnet *, ift_counter);
|
||||
static void enable_vxlan_rx(struct adapter *);
|
||||
static void reset_adapter(void *, int);
|
||||
static void reset_adapter_task(void *, int);
|
||||
static void fatal_error_task(void *, int);
|
||||
static void dump_devlog(struct adapter *);
|
||||
static void dump_cim_regs(struct adapter *);
|
||||
static void dump_cimla(struct adapter *);
|
||||
|
||||
struct {
|
||||
uint16_t device;
|
||||
@ -1168,13 +1172,10 @@ t4_attach(device_t dev)
|
||||
|
||||
callout_init(&sc->ktls_tick, 1);
|
||||
|
||||
#ifdef TCP_OFFLOAD
|
||||
TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc);
|
||||
#endif
|
||||
|
||||
refcount_init(&sc->vxlan_refcount, 0);
|
||||
|
||||
TASK_INIT(&sc->reset_task, 0, reset_adapter, sc);
|
||||
TASK_INIT(&sc->reset_task, 0, reset_adapter_task, sc);
|
||||
TASK_INIT(&sc->fatal_error_task, 0, fatal_error_task, sc);
|
||||
|
||||
sc->ctrlq_oid = SYSCTL_ADD_NODE(&sc->ctx,
|
||||
SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "ctrlq",
|
||||
@ -1709,10 +1710,6 @@ t4_detach_common(device_t dev)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef TCP_OFFLOAD
|
||||
taskqueue_drain(taskqueue_thread, &sc->async_event_task);
|
||||
#endif
|
||||
|
||||
for (i = 0; i < sc->intr_count; i++)
|
||||
t4_free_irq(sc, &sc->irq[i]);
|
||||
|
||||
@ -1862,6 +1859,14 @@ ok_to_reset(struct adapter *sc)
|
||||
return (true);
|
||||
}
|
||||
|
||||
static inline int
|
||||
stop_adapter(struct adapter *sc)
|
||||
{
|
||||
if (atomic_testandset_int(&sc->error_flags, ilog2(ADAP_STOPPED)))
|
||||
return (1); /* Already stopped. */
|
||||
return (t4_shutdown_adapter(sc));
|
||||
}
|
||||
|
||||
static int
|
||||
t4_suspend(device_t dev)
|
||||
{
|
||||
@ -1897,7 +1902,7 @@ t4_suspend(device_t dev)
|
||||
}
|
||||
|
||||
/* No more DMA or interrupts. */
|
||||
t4_shutdown_adapter(sc);
|
||||
stop_adapter(sc);
|
||||
|
||||
/* Quiesce all activity. */
|
||||
for_each_port(sc, i) {
|
||||
@ -1973,12 +1978,11 @@ t4_suspend(device_t dev)
|
||||
|
||||
/* Mark the adapter totally off limits. */
|
||||
mtx_lock(&sc->reg_lock);
|
||||
sc->flags |= HW_OFF_LIMITS;
|
||||
atomic_set_int(&sc->error_flags, HW_OFF_LIMITS);
|
||||
sc->flags &= ~(FW_OK | MASTER_PF);
|
||||
sc->reset_thread = NULL;
|
||||
mtx_unlock(&sc->reg_lock);
|
||||
|
||||
sc->num_resets++;
|
||||
CH_ALERT(sc, "suspend completed.\n");
|
||||
done:
|
||||
end_synchronized_op(sc, 0);
|
||||
@ -2165,6 +2169,9 @@ t4_resume(device_t dev)
|
||||
goto done;
|
||||
}
|
||||
|
||||
/* Note that HW_OFF_LIMITS is cleared a bit later. */
|
||||
atomic_clear_int(&sc->error_flags, ADAP_FATAL_ERR | ADAP_STOPPED);
|
||||
|
||||
/* Restore memory window. */
|
||||
setup_memwin(sc);
|
||||
|
||||
@ -2173,7 +2180,7 @@ t4_resume(device_t dev)
|
||||
CH_ALERT(sc, "recovery mode on resume.\n");
|
||||
rc = 0;
|
||||
mtx_lock(&sc->reg_lock);
|
||||
sc->flags &= ~HW_OFF_LIMITS;
|
||||
atomic_clear_int(&sc->error_flags, HW_OFF_LIMITS);
|
||||
mtx_unlock(&sc->reg_lock);
|
||||
goto done;
|
||||
}
|
||||
@ -2242,7 +2249,7 @@ t4_resume(device_t dev)
|
||||
* this thread is still in the middle of a synchronized_op.
|
||||
*/
|
||||
mtx_lock(&sc->reg_lock);
|
||||
sc->flags &= ~HW_OFF_LIMITS;
|
||||
atomic_clear_int(&sc->error_flags, HW_OFF_LIMITS);
|
||||
mtx_unlock(&sc->reg_lock);
|
||||
|
||||
if (sc->flags & FULL_INIT_DONE) {
|
||||
@ -2357,17 +2364,16 @@ t4_reset_post(device_t dev, device_t child)
|
||||
return (0);
|
||||
}
|
||||
|
||||
static void
|
||||
reset_adapter(void *arg, int pending)
|
||||
static int
|
||||
reset_adapter(struct adapter *sc)
|
||||
{
|
||||
struct adapter *sc = arg;
|
||||
int rc;
|
||||
int rc, oldinc, error_flags;
|
||||
|
||||
CH_ALERT(sc, "reset requested.\n");
|
||||
|
||||
rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst1");
|
||||
if (rc != 0)
|
||||
return;
|
||||
return (EBUSY);
|
||||
|
||||
if (hw_off_limits(sc)) {
|
||||
CH_ERR(sc, "adapter is suspended, use resume (not reset).\n");
|
||||
@ -2383,17 +2389,41 @@ reset_adapter(void *arg, int pending)
|
||||
}
|
||||
|
||||
done:
|
||||
oldinc = sc->incarnation;
|
||||
end_synchronized_op(sc, 0);
|
||||
if (rc != 0)
|
||||
return; /* Error logged already. */
|
||||
return (rc); /* Error logged already. */
|
||||
|
||||
atomic_add_int(&sc->num_resets, 1);
|
||||
mtx_lock(&Giant);
|
||||
rc = BUS_RESET_CHILD(device_get_parent(sc->dev), sc->dev, 0);
|
||||
mtx_unlock(&Giant);
|
||||
if (rc != 0)
|
||||
CH_ERR(sc, "bus_reset_child failed: %d.\n", rc);
|
||||
else
|
||||
CH_ALERT(sc, "bus_reset_child succeeded.\n");
|
||||
else {
|
||||
rc = begin_synchronized_op(sc, NULL, SLEEP_OK, "t4rst2");
|
||||
if (rc != 0)
|
||||
return (EBUSY);
|
||||
error_flags = atomic_load_int(&sc->error_flags);
|
||||
if (sc->incarnation > oldinc && error_flags == 0) {
|
||||
CH_ALERT(sc, "bus_reset_child succeeded.\n");
|
||||
} else {
|
||||
CH_ERR(sc, "adapter did not reset properly, flags "
|
||||
"0x%08x, error_flags 0x%08x.\n", sc->flags,
|
||||
error_flags);
|
||||
rc = ENXIO;
|
||||
}
|
||||
end_synchronized_op(sc, 0);
|
||||
}
|
||||
|
||||
return (rc);
|
||||
}
|
||||
|
||||
static void
|
||||
reset_adapter_task(void *arg, int pending)
|
||||
{
|
||||
/* XXX: t4_async_event here? */
|
||||
reset_adapter(arg);
|
||||
}
|
||||
|
||||
static int
|
||||
@ -3489,35 +3519,63 @@ delayed_panic(void *arg)
|
||||
panic("%s: panic on fatal error", device_get_nameunit(sc->dev));
|
||||
}
|
||||
|
||||
void
|
||||
t4_fatal_err(struct adapter *sc, bool fw_error)
|
||||
static void
|
||||
fatal_error_task(void *arg, int pending)
|
||||
{
|
||||
struct adapter *sc = arg;
|
||||
int rc;
|
||||
|
||||
t4_shutdown_adapter(sc);
|
||||
log(LOG_ALERT, "%s: encountered fatal error, adapter stopped.\n",
|
||||
device_get_nameunit(sc->dev));
|
||||
if (fw_error) {
|
||||
if (sc->flags & CHK_MBOX_ACCESS)
|
||||
ASSERT_SYNCHRONIZED_OP(sc);
|
||||
sc->flags |= ADAP_ERR;
|
||||
} else {
|
||||
ADAPTER_LOCK(sc);
|
||||
sc->flags |= ADAP_ERR;
|
||||
ADAPTER_UNLOCK(sc);
|
||||
}
|
||||
#ifdef TCP_OFFLOAD
|
||||
taskqueue_enqueue(taskqueue_thread, &sc->async_event_task);
|
||||
t4_async_event(sc);
|
||||
#endif
|
||||
if (atomic_testandclear_int(&sc->error_flags, ilog2(ADAP_CIM_ERR))) {
|
||||
dump_cim_regs(sc);
|
||||
dump_cimla(sc);
|
||||
dump_devlog(sc);
|
||||
}
|
||||
|
||||
if (t4_reset_on_fatal_err) {
|
||||
CH_ALERT(sc, "resetting on fatal error.\n");
|
||||
rc = reset_adapter(sc);
|
||||
if (rc == 0 && t4_panic_on_fatal_err) {
|
||||
CH_ALERT(sc, "reset was successful, "
|
||||
"system will NOT panic.\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (t4_panic_on_fatal_err) {
|
||||
CH_ALERT(sc, "panicking on fatal error (after 30s).\n");
|
||||
callout_reset(&fatal_callout, hz * 30, delayed_panic, sc);
|
||||
} else if (t4_reset_on_fatal_err) {
|
||||
CH_ALERT(sc, "resetting on fatal error.\n");
|
||||
taskqueue_enqueue(reset_tq, &sc->reset_task);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
t4_fatal_err(struct adapter *sc, bool fw_error)
|
||||
{
|
||||
const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
|
||||
|
||||
stop_adapter(sc);
|
||||
if (atomic_testandset_int(&sc->error_flags, ilog2(ADAP_FATAL_ERR)))
|
||||
return;
|
||||
if (fw_error) {
|
||||
/*
|
||||
* We are here because of a firmware error/timeout and not
|
||||
* because of a hardware interrupt. It is possible (although
|
||||
* not very likely) that an error interrupt was also raised but
|
||||
* this thread ran first and inhibited t4_intr_err. We walk the
|
||||
* main INT_CAUSE registers here to make sure we haven't missed
|
||||
* anything interesting.
|
||||
*/
|
||||
t4_slow_intr_handler(sc, verbose);
|
||||
atomic_set_int(&sc->error_flags, ADAP_CIM_ERR);
|
||||
}
|
||||
t4_report_fw_error(sc);
|
||||
log(LOG_ALERT, "%s: encountered fatal error, adapter stopped (%d).\n",
|
||||
device_get_nameunit(sc->dev), fw_error);
|
||||
taskqueue_enqueue(reset_tq, &sc->fatal_error_task);
|
||||
}
|
||||
|
||||
void
|
||||
t4_add_adapter(struct adapter *sc)
|
||||
{
|
||||
@ -8923,24 +8981,44 @@ sysctl_cim_la(SYSCTL_HANDLER_ARGS)
|
||||
return (rc);
|
||||
}
|
||||
|
||||
bool
|
||||
t4_os_dump_cimla(struct adapter *sc, int arg, bool verbose)
|
||||
static void
|
||||
dump_cim_regs(struct adapter *sc)
|
||||
{
|
||||
log(LOG_DEBUG, "%s: CIM debug regs %08x %08x %08x %08x %08x\n",
|
||||
device_get_nameunit(sc->dev),
|
||||
t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA0),
|
||||
t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA1),
|
||||
t4_read_reg(sc, A_EDC_H_BIST_USER_WDATA2),
|
||||
t4_read_reg(sc, A_EDC_H_BIST_DATA_PATTERN),
|
||||
t4_read_reg(sc, A_EDC_H_BIST_STATUS_RDATA));
|
||||
}
|
||||
|
||||
static void
|
||||
dump_cimla(struct adapter *sc)
|
||||
{
|
||||
struct sbuf sb;
|
||||
int rc;
|
||||
|
||||
if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb)
|
||||
return (false);
|
||||
if (sbuf_new(&sb, NULL, 4096, SBUF_AUTOEXTEND) != &sb) {
|
||||
log(LOG_DEBUG, "%s: failed to generate CIM LA dump.\n",
|
||||
device_get_nameunit(sc->dev));
|
||||
return;
|
||||
}
|
||||
rc = sbuf_cim_la(sc, &sb, M_NOWAIT);
|
||||
if (rc == 0) {
|
||||
rc = sbuf_finish(&sb);
|
||||
if (rc == 0) {
|
||||
log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s",
|
||||
log(LOG_DEBUG, "%s: CIM LA dump follows.\n%s\n",
|
||||
device_get_nameunit(sc->dev), sbuf_data(&sb));
|
||||
}
|
||||
}
|
||||
sbuf_delete(&sb);
|
||||
return (false);
|
||||
}
|
||||
|
||||
void
|
||||
t4_os_cim_err(struct adapter *sc)
|
||||
{
|
||||
atomic_set_int(&sc->error_flags, ADAP_CIM_ERR);
|
||||
}
|
||||
|
||||
static int
|
||||
@ -9356,8 +9434,8 @@ sysctl_devlog(SYSCTL_HANDLER_ARGS)
|
||||
return (rc);
|
||||
}
|
||||
|
||||
void
|
||||
t4_os_dump_devlog(struct adapter *sc)
|
||||
static void
|
||||
dump_devlog(struct adapter *sc)
|
||||
{
|
||||
int rc;
|
||||
struct sbuf sb;
|
||||
@ -11014,14 +11092,14 @@ sysctl_reset(SYSCTL_HANDLER_ARGS)
|
||||
u_int val;
|
||||
int rc;
|
||||
|
||||
val = sc->num_resets;
|
||||
val = atomic_load_int(&sc->num_resets);
|
||||
rc = sysctl_handle_int(oidp, &val, 0, req);
|
||||
if (rc != 0 || req->newptr == NULL)
|
||||
return (rc);
|
||||
|
||||
if (val == 0) {
|
||||
/* Zero out the counter that tracks reset. */
|
||||
sc->num_resets = 0;
|
||||
atomic_store_int(&sc->num_resets, 0);
|
||||
return (0);
|
||||
}
|
||||
|
||||
@ -12486,10 +12564,9 @@ t4_deactivate_uld(struct adapter *sc, int id)
|
||||
}
|
||||
|
||||
static void
|
||||
t4_async_event(void *arg, int n)
|
||||
t4_async_event(struct adapter *sc)
|
||||
{
|
||||
struct uld_info *ui;
|
||||
struct adapter *sc = (struct adapter *)arg;
|
||||
|
||||
if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4async") != 0)
|
||||
return;
|
||||
|
@ -1304,7 +1304,7 @@ t4_intr_err(void *arg)
|
||||
uint32_t v;
|
||||
const bool verbose = (sc->debug_flags & DF_VERBOSE_SLOWINTR) != 0;
|
||||
|
||||
if (sc->flags & ADAP_ERR)
|
||||
if (atomic_load_int(&sc->error_flags) & ADAP_FATAL_ERR)
|
||||
return;
|
||||
|
||||
v = t4_read_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE));
|
||||
@ -1313,7 +1313,8 @@ t4_intr_err(void *arg)
|
||||
t4_write_reg(sc, MYPF_REG(A_PL_PF_INT_CAUSE), v);
|
||||
}
|
||||
|
||||
t4_slow_intr_handler(sc, verbose);
|
||||
if (t4_slow_intr_handler(sc, verbose))
|
||||
t4_fatal_err(sc, false);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -99,3 +99,8 @@ nodevice snd_cmi
|
||||
# Don't yet have hwpmc(4)
|
||||
nodevice hwpmc
|
||||
nooptions HWPMC_HOOKS
|
||||
|
||||
# riscv doesn't yet have atomic_testandset_int and atomic_testandclear_int.
|
||||
nodevice ccr
|
||||
nodevice cxgbe
|
||||
nodevice cxgbev
|
||||
|
Loading…
Reference in New Issue
Block a user