From 509350457fcd021ee664199afcb8dbb9db0f81d6 Mon Sep 17 00:00:00 2001 From: rbaron Date: Tue, 6 Jun 2023 19:42:39 +0200 Subject: [PATCH] [ZigBee sample] Introduce debug counters In the effort to increse reliability of the Zigbee sample, #126 and #130 introduced handling for possibly rare, hard-to-debug events that may be occurring. While these changes seem to help in practive, we don't know exactly how often, if ever, they are triggered. This PR introduces debug_counters in hope to add some visilibty in these hard-to-debug, rare conditions. Calling `prst_debug_counters_increment("some_counter")` will increment a value stored in flash. These values are then dumped via logging when the device boots. Moving forward we can also consider exposing these values via a ZigBee cluster. --- .../nrf-connect/samples/zigbee/CMakeLists.txt | 2 + code/nrf-connect/samples/zigbee/prj.conf | 8 +- .../samples/zigbee/src/debug_counters.c | 131 ++++++++++++++++++ .../samples/zigbee/src/debug_counters.h | 20 +++ .../zigbee/src/double_reset_detector.c | 17 +-- .../nrf-connect/samples/zigbee/src/flash_fs.c | 26 ++++ .../nrf-connect/samples/zigbee/src/flash_fs.h | 9 ++ code/nrf-connect/samples/zigbee/src/main.c | 21 ++- .../samples/zigbee/src/restart_handler.c | 15 +- 9 files changed, 229 insertions(+), 20 deletions(-) create mode 100644 code/nrf-connect/samples/zigbee/src/debug_counters.c create mode 100644 code/nrf-connect/samples/zigbee/src/debug_counters.h create mode 100644 code/nrf-connect/samples/zigbee/src/flash_fs.c create mode 100644 code/nrf-connect/samples/zigbee/src/flash_fs.h diff --git a/code/nrf-connect/samples/zigbee/CMakeLists.txt b/code/nrf-connect/samples/zigbee/CMakeLists.txt index 9fac7f5..18e9b62 100644 --- a/code/nrf-connect/samples/zigbee/CMakeLists.txt +++ b/code/nrf-connect/samples/zigbee/CMakeLists.txt @@ -12,8 +12,10 @@ include_directories(src) target_sources(app PRIVATE src/main.c + src/debug_counters.c src/double_reset_detector.c src/factory_reset.c + src/flash_fs.c src/prst_zb_attrs.c src/prst_zb_soil_moisture_defs.c src/restart_handler.c diff --git a/code/nrf-connect/samples/zigbee/prj.conf b/code/nrf-connect/samples/zigbee/prj.conf index 5583b53..668d4e6 100644 --- a/code/nrf-connect/samples/zigbee/prj.conf +++ b/code/nrf-connect/samples/zigbee/prj.conf @@ -5,6 +5,9 @@ CONFIG_I2C=y CONFIG_ADC=y CONFIG_GPIO=y +CONFIG_USE_SEGGER_RTT=y +CONFIG_SEGGER_RTT_BUFFER_SIZE_UP=4096 + CONFIG_PM=y CONFIG_PM_DEVICE=y @@ -49,4 +52,7 @@ CONFIG_FLASH=y CONFIG_FLASH_MAP=y CONFIG_FLASH_PAGE_LAYOUT=y CONFIG_FILE_SYSTEM=y -CONFIG_FILE_SYSTEM_LITTLEFS=y \ No newline at end of file +CONFIG_FILE_SYSTEM_LITTLEFS=y + +# Uncomment for debug log level. +# CONFIG_LOG_DEFAULT_LEVEL=4 diff --git a/code/nrf-connect/samples/zigbee/src/debug_counters.c b/code/nrf-connect/samples/zigbee/src/debug_counters.c new file mode 100644 index 0000000..a75f1f3 --- /dev/null +++ b/code/nrf-connect/samples/zigbee/src/debug_counters.c @@ -0,0 +1,131 @@ +#include "debug_counters.h" + +#include +#include +#include +#include +#include + +LOG_MODULE_REGISTER(debug_counters, CONFIG_LOG_DEFAULT_LEVEL); + +static const char kDebugCountersDir[] = "/lfs/debug_counters"; + +#define PRST_MAX_COUNTER_NAME_LENGTH 64 +#define PRST_MAX_FILE_NAME_LENGTH (sizeof(kDebugCountersDir) + 1 + PRST_MAX_COUNTER_NAME_LENGTH) + +static int mk_filename(const char* counter_name, char* buff, size_t buff_size) { + RET_CHECK(strlen(counter_name) <= PRST_MAX_FILE_NAME_LENGTH, "Counter name too long"); + strcpy(buff, kDebugCountersDir); + strcat(buff, "/"); + strcat(buff, counter_name); + return 0; +} + +int prst_debug_counters_init() { + struct fs_dirent entry; + int err = fs_stat(kDebugCountersDir, &entry); + if (err == 0) { + LOG_DBG("Directory %s already exists", kDebugCountersDir); + return 0; + } else if (err == -ENOENT) { + LOG_DBG("Creating directory %s", kDebugCountersDir); + return fs_mkdir(kDebugCountersDir); + } + LOG_ERR("Unexpected error in fs_stat for %s: %d", kDebugCountersDir, err); + return err; +} + +int prst_debug_counters_increment(const char* counter_name) { + char filename[PRST_MAX_FILE_NAME_LENGTH]; + RET_IF_ERR(mk_filename(counter_name, filename, sizeof(filename))); + + LOG_DBG("Incrementing counter %s", filename); + + // Open the file. + struct fs_file_t file; + fs_file_t_init(&file); + RET_IF_ERR(fs_open(&file, filename, FS_O_CREATE | FS_O_RDWR)); + + // Read the current value. + prst_debug_counter_t value; + int n_read = fs_read(&file, &value, sizeof(value)); + if (n_read != sizeof(value)) { + LOG_WRN("fs_read returned %d, expected %d, assuming first access", n_read, sizeof(value)); + value = 0; + } + + // Increment the value. + value++; + + // Write back to file. + RET_CHECK(fs_seek(&file, 0, SEEK_SET) == 0, "Unable to seek"); + ssize_t written = fs_write(&file, &value, sizeof(value)); + if (written != sizeof(value)) { + LOG_ERR("fs_write returned %d, expected %d", written, sizeof(value)); + } + return fs_close(&file); +} + +int prst_debug_counters_get(const char* counter_name, prst_debug_counter_t* value) { + char filename[PRST_MAX_FILE_NAME_LENGTH]; + RET_IF_ERR(mk_filename(counter_name, filename, sizeof(filename))); + + LOG_DBG("Getting counter %s", filename); + + // Open the file. + struct fs_file_t file; + fs_file_t_init(&file); + RET_IF_ERR(fs_open(&file, filename, FS_O_CREATE | FS_O_READ)); + + // Read the current value. + int n_read = fs_read(&file, value, sizeof(value)); + if (n_read != sizeof(prst_debug_counter_t)) { + LOG_WRN("fs_read returned %d, expected %d, assuming first access", n_read, sizeof(prst_debug_counter_t)); + // return -1; + *value = 0; + } + + return fs_close(&file); +} + +int prst_debug_counters_get_all(prst_debug_counters_callback_t callback) { + LOG_DBG("Getting all counters from %s", kDebugCountersDir); + struct fs_dir_t dirp; + fs_dir_t_init(&dirp); + + static struct fs_dirent entry; + RET_IF_ERR(fs_opendir(&dirp, kDebugCountersDir)); + prst_debug_counter_t value; + for (;;) { + RET_IF_ERR(fs_readdir(&dirp, &entry)); + // End of directory; + if (entry.name[0] == 0) { + break; + } + LOG_DBG("Found %s", entry.name); + prst_debug_counters_get(entry.name, &value); + callback(entry.name, value); + } + return fs_closedir(&dirp); +} + +typedef struct { + struct k_work work; + const char* counter_name; +} counter_increment_work_t; + +static void increment_counter_callback(struct k_work* work) { + counter_increment_work_t* counter_work = CONTAINER_OF(work, counter_increment_work_t, work); + LOG_DBG("Incrementing counter callback for %s", counter_work->counter_name); + prst_debug_counters_increment(counter_work->counter_name); + free(counter_work); + LOG_DBG("Freed memory for %s", counter_work->counter_name); +} + +int prst_debug_counters_increment_async(const char* counter_name) { + counter_increment_work_t* work = malloc(sizeof(counter_increment_work_t)); + work->counter_name = counter_name; + k_work_init(&work->work, increment_counter_callback); + LOG_DBG("Scheduling increment of %s", counter_name); + return k_work_submit(&work->work); +} \ No newline at end of file diff --git a/code/nrf-connect/samples/zigbee/src/debug_counters.h b/code/nrf-connect/samples/zigbee/src/debug_counters.h new file mode 100644 index 0000000..3000102 --- /dev/null +++ b/code/nrf-connect/samples/zigbee/src/debug_counters.h @@ -0,0 +1,20 @@ +#ifndef _PRST_DEBUG_COUNTERS_H_ +#define _PRST_DEBUG_COUNTERS_H_ + +#include + +typedef uint32_t prst_debug_counter_t; + +typedef void (*prst_debug_counters_callback_t)(const char* counter_name, prst_debug_counter_t value); + +int prst_debug_counters_init(); + +int prst_debug_counters_increment(const char* counter_name); + +int prst_debug_counters_increment_async(const char* counter_name); + +int prst_debug_counters_get(const char* counter_name, prst_debug_counter_t* value); + +int prst_debug_counters_get_all(prst_debug_counters_callback_t callback); + +#endif // _PRST_DEBUG_COUNTERS_H_ \ No newline at end of file diff --git a/code/nrf-connect/samples/zigbee/src/double_reset_detector.c b/code/nrf-connect/samples/zigbee/src/double_reset_detector.c index 56e3bf6..eb2b1aa 100644 --- a/code/nrf-connect/samples/zigbee/src/double_reset_detector.c +++ b/code/nrf-connect/samples/zigbee/src/double_reset_detector.c @@ -9,15 +9,6 @@ LOG_MODULE_REGISTER(double_reset_detector, CONFIG_LOG_DEFAULT_LEVEL); -FS_LITTLEFS_DECLARE_DEFAULT_CONFIG(storage); - -static struct fs_mount_t lfs_storage_mnt = { - .type = FS_LITTLEFS, - .fs_data = &storage, - .storage_dev = (void *)FLASH_AREA_ID(storage), - .mnt_point = "/lfs", -}; - static const char *flag_filename = "/lfs/reset_flag"; static const char flag_prefix[] = "prst-rst-count"; @@ -38,12 +29,6 @@ void erase_flag_callback(struct k_work *work) { K_WORK_DELAYABLE_DEFINE(erase_flag_work, erase_flag_callback); int prst_detect_double_reset(prst_double_reset_callback_t on_double_reset) { - // TODO: if booting for the first time after a full flash erase, fs_mount will - // complain (via a LOG_ERR) and then automatically format the flash. It all works, - // but avoiding a scary red message would be ideal. Maybe somehow check if it's - // formatted before mounting? - RET_IF_ERR(fs_mount(&lfs_storage_mnt)); - fs_file_t_init(&flag_file); RET_IF_ERR(fs_open(&flag_file, flag_filename, FS_O_CREATE | FS_O_RDWR)); @@ -64,7 +49,7 @@ int prst_detect_double_reset(prst_double_reset_callback_t on_double_reset) { // Write the flag and erase it after some time. ssize_t written = fs_write(&flag_file, flag_prefix, sizeof(flag_prefix)); if (written != sizeof(flag_prefix)) { - LOG_ERR("s_write returned %d, expected %d", written, sizeof(flag_prefix)); + LOG_ERR("fs_write returned %d, expected %d", written, sizeof(flag_prefix)); return -1; } diff --git a/code/nrf-connect/samples/zigbee/src/flash_fs.c b/code/nrf-connect/samples/zigbee/src/flash_fs.c new file mode 100644 index 0000000..01e72f5 --- /dev/null +++ b/code/nrf-connect/samples/zigbee/src/flash_fs.c @@ -0,0 +1,26 @@ +#include "flash_fs.h" + +#include +#include +#include +#include + +LOG_MODULE_REGISTER(flash_fs, CONFIG_LOG_DEFAULT_LEVEL); + +FS_LITTLEFS_DECLARE_DEFAULT_CONFIG(storage); + +static struct fs_mount_t lfs_storage_mnt = { + .type = FS_LITTLEFS, + .fs_data = &storage, + .storage_dev = (void *)FLASH_AREA_ID(storage), + .mnt_point = "/lfs", +}; + +int prst_flash_fs_init() { + // TODO: if booting for the first time after a full flash erase, fs_mount will + // complain (via a LOG_ERR) and then automatically format the flash. It all works, + // but avoiding a scary red message would be ideal. Maybe somehow check if it's + // formatted before mounting? + RET_IF_ERR(fs_mount(&lfs_storage_mnt)); + return 0; +} \ No newline at end of file diff --git a/code/nrf-connect/samples/zigbee/src/flash_fs.h b/code/nrf-connect/samples/zigbee/src/flash_fs.h new file mode 100644 index 0000000..34e39a6 --- /dev/null +++ b/code/nrf-connect/samples/zigbee/src/flash_fs.h @@ -0,0 +1,9 @@ +#ifndef _PRST_FLASH_FS_H_ +#define _PRST_FLASH_FS_H_ + +#include + +// Initializes and mounts littlefs. +int prst_flash_fs_init(); + +#endif // _PRST_FLASH_FS_H_ \ No newline at end of file diff --git a/code/nrf-connect/samples/zigbee/src/main.c b/code/nrf-connect/samples/zigbee/src/main.c index 16ee57b..b1c1c42 100644 --- a/code/nrf-connect/samples/zigbee/src/main.c +++ b/code/nrf-connect/samples/zigbee/src/main.c @@ -15,7 +15,9 @@ #include #include +#include "debug_counters.h" #include "factory_reset.h" +#include "flash_fs.h" #include "prst_zb_attrs.h" #include "prst_zb_endpoint_defs.h" #include "prst_zb_soil_moisture_defs.h" @@ -131,13 +133,15 @@ void zboss_signal_handler(zb_bufid_t bufid) { LOG_DBG("Steering complete. Status: %d", status); if (status == RET_OK) { LOG_DBG("Steering successful. Status: %d", status); + prst_debug_counters_increment("steering_success"); prst_led_flash(/*times=*/3); k_timer_stop(&led_flashing_timer); prst_restart_watchdog_stop(); prst_led_off(); } else { LOG_DBG("Steering failed. Status: %d", status); - prst_led_flash(7); + prst_debug_counters_increment("steering_failure"); + prst_led_flash(/*times=*/7); prst_restart_watchdog_start(); k_timer_stop(&led_flashing_timer); // Power saving prst_led_off(); @@ -173,6 +177,7 @@ void zboss_signal_handler(zb_bufid_t bufid) { For details see: https://developer.nordicsemi.com/nRF_Connect_SDK/doc/latest/nrf/known_issues.html?v=v2-3-0 */ if (stack_initialised && !joining_signal_received) { + prst_debug_counters_increment("krknwk_12017_reset"); zb_reset(0); } } @@ -194,6 +199,7 @@ void update_sensors_cb(zb_uint8_t arg) { ZB_TIME_ONE_SECOND * CONFIG_PRST_ZB_SLEEP_DURATION_SEC); if (prst_sensors_read_all(&sensors)) { + prst_debug_counters_increment("sensors_read_error"); LOG_ERR("Unable to read sensors"); return; } @@ -235,10 +241,21 @@ void update_sensors_cb(zb_uint8_t arg) { &log_lux); } +void log_counter(const char *counter_name, prst_debug_counter_t value) { + LOG_INF("- %s: %d", counter_name, value); +} + int main(void) { RET_IF_ERR(prst_adc_init()); RET_IF_ERR(prst_led_init()); RET_IF_ERR(prst_button_init()); + RET_IF_ERR(prst_flash_fs_init()); + RET_IF_ERR(prst_debug_counters_init()); + + prst_debug_counters_increment("boot"); + + LOG_INF("Dumping debug counters:"); + prst_debug_counters_get_all(log_counter); RET_IF_ERR(prst_zb_factory_reset_check()); @@ -260,5 +277,7 @@ int main(void) { zigbee_enable(); zigbee_configure_sleepy_behavior(/*enable=*/true); + prst_debug_counters_increment("main_finish"); + return 0; } diff --git a/code/nrf-connect/samples/zigbee/src/restart_handler.c b/code/nrf-connect/samples/zigbee/src/restart_handler.c index b864f69..0a23df3 100644 --- a/code/nrf-connect/samples/zigbee/src/restart_handler.c +++ b/code/nrf-connect/samples/zigbee/src/restart_handler.c @@ -4,14 +4,25 @@ #include #include +#include "debug_counters.h" + LOG_MODULE_REGISTER(restart_handler, CONFIG_LOG_DEFAULT_LEVEL); -static void restart_network_steering_cb(struct k_timer *timer) { - LOG_DBG("Restart handler expired. Restarting network steering."); +void callback_work_handler(struct k_work *work) { + LOG_INF("Running restart callback_work_handler."); + prst_debug_counters_increment("steering_watchdog_restart"); // If the device is not commissioned, the rejoin procedure is started. user_input_indicate(); } +K_WORK_DEFINE(callback_work, callback_work_handler); + +// Runs in an ISR context. We offload the actual work to a workqueue. +static void restart_network_steering_cb(struct k_timer *timer) { + LOG_INF("Triggered restart_network_steering_cb. Offloading work."); + k_work_submit(&callback_work); +} + K_TIMER_DEFINE(restart_timer, restart_network_steering_cb, NULL); void prst_restart_watchdog_start() {