Skip to content

Commit

Permalink
rasdaemon: Add support for post-processing MCA errors
Browse files Browse the repository at this point in the history
Currently, the rasdaemon performs detailed error decoding of received
MCA errors on the system only while it is running, either as a daemon
or in the foreground.

As such, error decoding cannot be undertaken for any MCA errors received
while the rasdaemon wasn't running. Additionally, if error decoding
modules like edac_mce_amd have not been loaded either, error records in the
dmesg buffer might correspond to raw values in associated MSRs, compelling
users to undertake decoding manually. The scenario seems more plausible on
AMD systems with Scalable MCA (SMCA), given the plans in place to remove SMCA
Extended Error Descriptions from the edac_mce_amd module in an effort to
offload SMCA Error Decoding to the rasdaemon.

As such, add support to post-process and decode MCA Errors received on AMD
SMCA systems from raw MSR values. Support for post-processing and decoding
of MCA Errors received on CPUs of other vendors can be added in the future,
as needed.

Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
  • Loading branch information
Avadhut Naik committed Jun 30, 2023
1 parent 3414753 commit 4741d1c
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 10 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,16 @@ required):
# rasdaemon -f -r
```

To post-process and decode received MCA errors on AMD SMCA systems, run:

```
# rasdaemon -p --status <STATUS_reg> --ipid <IPID_reg> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM>
```

The Status and IPID register values (in hex) are mandatory. The `smca` flag,
together with `family` and `model`, is required when the errors being decoded
did not originate on the system running rasdaemon. The `bank` parameter is
optional.

You may also start it via systemd:

```
Expand Down
8 changes: 5 additions & 3 deletions mce-amd-smca.c
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};

static void amd_decode_errcode(struct mce_event *e)
void amd_decode_errcode(struct mce_event *e)
{

decode_amd_errcode(e);
Expand Down Expand Up @@ -782,7 +782,7 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
}

/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
Expand Down Expand Up @@ -827,7 +827,9 @@ static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
/* Only print the descriptor of valid extended error code */
if (xec < smca_mce_descs[bank_type].num_descs)
mce_snprintf(e->mcastatus_msg,
" %s.\n", smca_mce_descs[bank_type].descs[xec]);
"%s. Ext Err Code: %d",
smca_mce_descs[bank_type].descs[xec],
xec);

if (bank_type == SMCA_UMC && xec == 0) {
channel = find_umc_channel(e);
Expand Down
79 changes: 79 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,85 @@ static int add_event_handler(struct ras_events *ras, struct tep_handle *pevent,
return 0;
}

/*
 * Render a post-processed MCE record as comma-separated text in @s.
 *
 * @s:    trace sequence that receives the formatted text
 * @mce:  event to report; its timestamp field is overwritten with the
 *        current local time whenever localtime() succeeds
 * @priv: not consulted by this function
 *
 * The terminating newline is emitted only together with the error message,
 * i.e. when mce->error_msg is non-empty.
 *
 * Always returns 0.
 */
static int report_mce_offline(struct trace_seq *s,
			      struct mce_event *mce,
			      struct mce_priv *priv)
{
	time_t stamp = time(NULL);
	struct tm *local = localtime(&stamp);

	/* The record carries the time of decoding, taken from the wall clock */
	if (local != NULL) {
		strftime(mce->timestamp, sizeof(mce->timestamp),
			 "%Y-%m-%d %H:%M:%S %z", local);
	}
	trace_seq_printf(s, "%s,", mce->timestamp);

	/* Fall back to the raw bank number (hex) when no bank name is set */
	if (mce->bank_name[0] == '\0')
		trace_seq_printf(s, " bank=%x,", mce->bank);
	else
		trace_seq_printf(s, " %s,", mce->bank_name);

	/* Each remaining field is optional and skipped when empty */
	if (mce->mcastatus_msg[0] != '\0')
		trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg);

	if (mce->mcistatus_msg[0] != '\0')
		trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg);

	if (mce->mc_location[0] != '\0')
		trace_seq_printf(s, " Locn: %s,", mce->mc_location);

	if (mce->error_msg[0] != '\0')
		trace_seq_printf(s, " Error Msg: %s\n", mce->error_msg);

	return 0;
}

/*
 * Decode and print a single MCA error from raw register values
 * (post-processing mode).
 *
 * @event: raw status and IPID register values, the bank number and,
 *         when event->smca is set, the CPU family and model to assume.
 *
 * When event->smca is set the CPU is treated as CPU_AMD_SMCA with the given
 * family and model; otherwise the CPU is probed through detect_cpu().
 * Only CPU_AMD_SMCA is decoded here; for any other CPU type the record is
 * reported without decoded messages.
 *
 * Returns 0 on success, -ENOMEM when the working structures cannot be
 * allocated, and -EINVAL when CPU detection fails or when the status or
 * IPID value required for SMCA decoding is zero.
 */
int ras_offline_mc_event(struct ras_mc_offline_event *event)
{
	int rc = 0;
	struct trace_seq s;
	struct mce_event *mce = calloc(1, sizeof(*mce));
	struct mce_priv *priv = calloc(1, sizeof(*priv));

	/* Both structures are dereferenced below; bail out if either is missing */
	if (!mce || !priv) {
		rc = -ENOMEM;
		goto free_mem;
	}

	if (event->smca) {
		priv->cputype = CPU_AMD_SMCA;
		priv->family = event->family;
		priv->model = event->model;
	} else if (detect_cpu(priv)) {
		/*
		 * NOTE(review): register_mce_handler() frees
		 * mce->processor_flags after a detect_cpu() failure; confirm
		 * whether that field also needs releasing on this path.
		 */
		rc = -EINVAL;
		goto free_mem;
	}

	mce->status = event->status;
	mce->bank = event->bank;

	switch (priv->cputype) {
	case CPU_AMD_SMCA:
		mce->ipid = event->ipid;
		if (!mce->ipid || !mce->status) {
			/* A non-zero IPID means the status value is the missing one */
			printf("%s MSR required.\n", mce->ipid ? "Status" : "Ipid");
			rc = -EINVAL;
			goto free_mem;
		}
		decode_smca_error(mce, priv);
		amd_decode_errcode(mce);
		break;
	default:
		break;
	}

	/*
	 * The trace sequence is set up only once the input has been validated,
	 * so none of the error exits above has to tear it down.
	 */
	trace_seq_init(&s);
	report_mce_offline(&s, mce, priv);
	trace_seq_do_printf(&s);
	fflush(stdout);
	trace_seq_destroy(&s);

free_mem:
	/* free(NULL) is a no-op, so this is safe after a partial allocation */
	free(priv);
	free(mce);

	return rc;
}

int handle_ras_events(int record_events)
{
int rc, page_size, i;
Expand Down
1 change: 1 addition & 0 deletions ras-events.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ enum ghes_severity {

/* Function prototypes */
int toggle_ras_mc_event(int enable);
/*
 * Post-processing entry point: decode one MCA error from the raw register
 * values in @event and print the result. Returns 0 on success, a negative
 * errno value on failure.
 */
int ras_offline_mc_event(struct ras_mc_offline_event *event);
int handle_ras_events(int record_events);

#endif
11 changes: 4 additions & 7 deletions ras-mce-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,8 @@ static char *cputype_name[] = {
[CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};

static enum cputype select_intel_cputype(struct ras_events *ras)
static enum cputype select_intel_cputype(struct mce_priv *mce)
{
struct mce_priv *mce = ras->mce_priv;

if (mce->family == 15) {
if (mce->model == 6)
return CPU_TULSA;
Expand Down Expand Up @@ -140,9 +138,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
return mce->family == 6 ? CPU_P6OLD : CPU_GENERIC;
}

static int detect_cpu(struct ras_events *ras)
int detect_cpu(struct mce_priv *mce)
{
struct mce_priv *mce = ras->mce_priv;
FILE *f;
int ret = 0;
char *line = NULL;
Expand Down Expand Up @@ -221,7 +218,7 @@ static int detect_cpu(struct ras_events *ras)
}
goto ret;
} else if (!strcmp(mce->vendor,"GenuineIntel")) {
mce->cputype = select_intel_cputype(ras);
mce->cputype = select_intel_cputype(mce);
} else {
ret = EINVAL;
}
Expand All @@ -246,7 +243,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus)

mce = ras->mce_priv;

rc = detect_cpu(ras);
rc = detect_cpu(mce);
if (rc) {
if (mce->processor_flags)
free (mce->processor_flags);
Expand Down
6 changes: 6 additions & 0 deletions ras-mce-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_seq *s,
/* enables intel iMC logs */
int set_intel_imc_log(enum cputype cputype, unsigned ncpus);

/* Undertake AMD SMCA Error Decoding */
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
void amd_decode_errcode(struct mce_event *e);

/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);
void core2_decode_model(struct mce_event *e);
Expand Down Expand Up @@ -165,6 +169,8 @@ void decode_amd_errcode(struct mce_event *e);
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
#define MCG_STATUS_LMCE (1ULL<<3) /* local machine check signaled */

/*
 * Identify the CPU and record the result in @mce (e.g. mce->cputype).
 * Returns 0 on success and a non-zero value on failure.
 */
int detect_cpu(struct mce_priv *mce);

/* Those functions are defined on per-cpu vendor C files */
int parse_intel_event(struct ras_events *ras, struct mce_event *e);

Expand Down
9 changes: 9 additions & 0 deletions ras-record.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#define __RAS_RECORD_H

#include <stdint.h>
#include <stdbool.h>
#include "config.h"

#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
Expand All @@ -42,6 +43,14 @@ struct ras_mc_event {
const char *driver_detail;
};

/*
 * Raw description of one MCA error to be decoded in post-processing mode,
 * as supplied through the post-processing command-line options.
 */
struct ras_mc_offline_event {
	unsigned int family;	/* CPU family (--family) */
	unsigned int model;	/* CPU model (--model) */
	bool smca;		/* request AMD SMCA error decoding (--smca) */
	uint8_t bank;		/* bank number (--bank) */
	uint64_t ipid;		/* IPID register value, SMCA systems only (--ipid) */
	uint64_t status;	/* status register value (--status) */
};

struct ras_aer_event {
char timestamp[64];
const char *error_type;
Expand Down
76 changes: 76 additions & 0 deletions rasdaemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,20 @@ struct arguments {
int record_events;
int enable_ras;
int foreground;
int offline;
};

/*
 * argp keys for the post-processing options. Numbering begins at 0x100,
 * which keeps these keys clear of the single-character keys ('p', 'f', ...)
 * handled by parse_opt().
 */
enum OFFLINE_ARG_KEYS {
	SMCA		= 0x100,	/* --smca */
	MODEL		= 0x101,	/* --model */
	FAMILY		= 0x102,	/* --family */
	BANK_NUM	= 0x103,	/* --bank */
	IPID_REG	= 0x104,	/* --ipid */
	STATUS_REG	= 0x105,	/* --status */
};

/*
 * Post-processing input: filled in by parse_opt_offline() from the command
 * line and passed to ras_offline_mc_event() by main().
 */
struct ras_mc_offline_event event;

static error_t parse_opt(int k, char *arg, struct argp_state *state)
{
struct arguments *args = state->input;
Expand All @@ -62,6 +74,39 @@ static error_t parse_opt(int k, char *arg, struct argp_state *state)
case 'f':
args->foreground++;
break;
case 'p':
if (state->argc < 4)
argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR);
args->offline++;
break;
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}

static error_t parse_opt_offline(int key, char *arg,
struct argp_state *state)
{
switch (key) {
case SMCA:
event.smca = true;
break;
case MODEL:
event.model = strtoul(state->argv[state->next], NULL, 0);
break;
case FAMILY:
event.family = strtoul(state->argv[state->next], NULL, 0);
break;
case BANK_NUM:
event.bank = atoi(state->argv[state->next]);
break;
case IPID_REG:
event.ipid = strtoull(state->argv[state->next], NULL, 0);
break;
case STATUS_REG:
event.status = strtoull(state->argv[state->next], NULL, 0);
break;
default:
return ARGP_ERR_UNKNOWN;
}
Expand All @@ -74,13 +119,38 @@ int main(int argc, char *argv[])
{
struct arguments args;
int idx = -1;

const struct argp_option offline_options[] = {
{"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
{"model", MODEL, 0, 0, "CPU Model"},
{"family", FAMILY, 0, 0, "CPU Family"},
{"bank", BANK_NUM, 0, 0, "Bank Number"},
{"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"},
{"status", STATUS_REG, 0, 0, "Status Register"},
{0, 0, 0, 0, 0, 0},
};

struct argp offline_argp = {
.options = offline_options,
.parser = parse_opt_offline,
.doc = TOOL_DESCRIPTION,
.args_doc = ARGS_DOC,
};

struct argp_child offline_parser[] = {
{&offline_argp, 0, "Post-Processing Options:", 0},
{0, 0, 0, 0},
};

const struct argp_option options[] = {
{"enable", 'e', 0, 0, "enable RAS events and exit", 0},
{"disable", 'd', 0, 0, "disable RAS events and exit", 0},
#ifdef HAVE_SQLITE3
{"record", 'r', 0, 0, "record events via sqlite3", 0},
#endif
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
{"post-processing", 'p', 0, 0,
"Post-processing MCE's with raw register values"},

{ 0, 0, 0, 0, 0, 0 }
};
Expand All @@ -89,6 +159,7 @@ int main(int argc, char *argv[])
.parser = parse_opt,
.doc = TOOL_DESCRIPTION,
.args_doc = ARGS_DOC,
.children = offline_parser,

};
memset (&args, 0, sizeof(args));
Expand All @@ -111,6 +182,11 @@ int main(int argc, char *argv[])
return 0;
}

if (args.offline) {
ras_offline_mc_event(&event);
return 0;
}

openlog(TOOL_NAME, 0, LOG_DAEMON);
if (!args.foreground)
if (daemon(0,0))
Expand Down

0 comments on commit 4741d1c

Please sign in to comment.