diff --git a/darshan-runtime/lib/darshan-config.c b/darshan-runtime/lib/darshan-config.c index 09d6ad51a..3a7ff4b16 100644 --- a/darshan-runtime/lib/darshan-config.c +++ b/darshan-runtime/lib/darshan-config.c @@ -437,6 +437,18 @@ void darshan_parse_config_env(struct darshan_config *cfg) } } } + envstr = getenv("DXT_ENABLE_STACK_TRACE"); + if(envstr) + { + struct dxt_trigger *trigger = malloc(sizeof(*trigger)); + if(trigger) + { + trigger->type = DXT_COLLECT_STACK_TRACE; + trigger->u.unaligned_io.thresh_pct = 0; + cfg->stack_trace_trigger = trigger; + } + } + if(getenv("DARSHAN_DUMP_CONFIG")) cfg->dump_config_flag = 1; if(getenv("DARSHAN_INTERNAL_TIMING")) diff --git a/darshan-runtime/lib/darshan-config.h b/darshan-runtime/lib/darshan-config.h index f3f7c212a..6c6b499a2 100644 --- a/darshan-runtime/lib/darshan-config.h +++ b/darshan-runtime/lib/darshan-config.h @@ -37,6 +37,7 @@ struct darshan_config char *rank_inclusions; struct dxt_trigger *small_io_trigger; struct dxt_trigger *unaligned_io_trigger; + struct dxt_trigger *stack_trace_trigger; int internal_timing_flag; int disable_shared_redux_flag; int dump_config_flag; diff --git a/darshan-runtime/lib/darshan-core.c b/darshan-runtime/lib/darshan-core.c index 1a1bda192..1729c185c 100644 --- a/darshan-runtime/lib/darshan-core.c +++ b/darshan-runtime/lib/darshan-core.c @@ -20,6 +20,7 @@ #endif #include #include +#include #include #include #include @@ -30,11 +31,13 @@ #include #include #include +#include #include #include #include #include #include +#include #ifdef HAVE_MPI #include @@ -71,6 +74,7 @@ static int parent_pid; static struct darshan_core_mnt_data mnt_data_array[DARSHAN_MAX_MNTS]; static int mnt_data_count = 0; +static char *exe_name = ""; #ifdef DARSHAN_BGQ extern void bgq_runtime_initialize(); #endif @@ -206,7 +210,6 @@ void darshan_core_initialize(int argc, char **argv) int ret; int i; struct timespec start_ts; - /* setup darshan runtime if darshan is enabled and hasn't been initialized already */ if 
(__darshan_core != NULL || getenv("DARSHAN_DISABLE")) return; @@ -325,7 +328,6 @@ void darshan_core_initialize(int argc, char **argv) /* collect information about command line and mounted file systems */ darshan_get_exe_and_mounts(init_core, argc, argv); - if(!darshan_should_instrument_app(init_core)) { /* do not instrument excluded applications */ @@ -394,6 +396,18 @@ void darshan_core_initialize(int argc, char **argv) darshan_core_fprintf(stderr, "darshan:init\t%d\t%f\n", nprocs, init_time); } + if(init_core->config.stack_trace_trigger){ + dxt_enable_stack_trace(); + } + + /* keep a private copy of the executable path (first token of the exe line): + * strtok() on log_exemnt_p would overwrite delimiters in the logged string */ + const char *exe_start = init_core->log_exemnt_p + strspn(init_core->log_exemnt_p, " \n"); + size_t exe_len = strcspn(exe_start, " \n"); + char *exe = exe_len ? calloc(1, exe_len + 1) : NULL; + if(exe) + exe_name = memcpy(exe, exe_start, exe_len); + return; } @@ -545,6 +559,7 @@ void darshan_core_shutdown(int write_log) /* get the log file name */ darshan_get_logfile_name(logfile_name, final_core); + if(strlen(logfile_name) == 0) { /* failed to generate log file name */ @@ -587,6 +602,7 @@ void darshan_core_shutdown(int write_log) if(final_core->config.unaligned_io_trigger) dxt_posix_apply_trace_filter(final_core->config.unaligned_io_trigger); + /* loop over globally used darshan modules and: * - get final output buffer * - compress (zlib) provided output buffer @@ -635,7 +651,7 @@ void darshan_core_shutdown(int write_log) mod_shared_recs[mod_shared_rec_cnt++] = shared_recs[j]; } } - + /* allow the module an opportunity to reduce shared files */ if(this_mod->mod_funcs.mod_redux_func && (mod_shared_rec_cnt > 0)) { @@ -657,6 +673,209 @@ void darshan_core_shutdown(int write_log) /* get the final output buffer */ this_mod->mod_funcs.mod_output_func(&mod_buf, &mod_buf_sz); } + +/* Code added by Hammad Ather (hather@lbl.gov) and Jean Luca Bez (jlbez@lbl.gov) */ +#ifdef HAVE_MPI + if(using_mpi) + { + if (i == DXT_POSIX_MOD) { + PMPI_Barrier(MPI_COMM_WORLD); + if (my_rank == 0 && final_core->config.stack_trace_trigger) { + FILE *fptr; + + typedef struct { + char address[32]; /* key */ + UT_hash_handle hh; /* makes this
structure hashable */ + } unique_stack_struct; + + unique_stack_struct *unique_mem_addr = NULL; + + for (int rank = 0; rank < nprocs; rank++) { + char stack_file_name_posix[50]; + sprintf(stack_file_name_posix, ".%d.darshan-posix", rank); + fptr = fopen(stack_file_name_posix, "r"); + if (fptr) { + char line[32]; + + while (fgets(line, sizeof(line), fptr)) { + line[strcspn(line, "\n")] = 0; + unique_stack_struct *d = NULL; + + HASH_FIND_STR(unique_mem_addr, line, d); + + if (!d) { + unique_stack_struct *e = (unique_stack_struct *) malloc(sizeof *e); + strcpy(e->address, line); + + HASH_ADD_STR(unique_mem_addr, address, e); + } + } + + fclose(fptr); + remove(stack_file_name_posix); + } + } + + unique_stack_struct *d = NULL; + char * exe_name = darshan_exe(); + int line_mappings_index = 0; + char address_line_mapping[4096] = {}; + + for (d = unique_mem_addr; d != NULL; d = (unique_stack_struct *)(d->hh.next)) { + FILE *fp; + char cmd[256]; + char *line = NULL; + size_t len = 0; + + char addr[32]; + sprintf(addr, "%s", d->address); + + char *const args[] = { "/usr/bin/addr2line", "-a", addr, "-e", exe_name, NULL }; + + int pipe_fd[2]; + pid_t child_pid; + int status; + + // Create a pipe to capture the command's output + if (pipe(pipe_fd) == -1) { + perror("pipe"); + // return 1; + } + int ret; + // Use posix_spawn to execute the command + posix_spawn_file_actions_t action; + posix_spawn_file_actions_init(&action); + posix_spawn_file_actions_addclose(&action, pipe_fd[0]); // Close the read end of the pipe + posix_spawn_file_actions_adddup2(&action, pipe_fd[1], STDOUT_FILENO); // Redirect stdout to the write end of the pipe + if (posix_spawn(&child_pid, "/usr/bin/addr2line", &action, NULL, args, NULL) == 0) { + + // Close the write end of the pipe in the parent process + close(pipe_fd[1]); + + // Read the output from the pipe + char buffer[4096]; + ssize_t bytes_read; + FILE* debug; + debug = fopen("/dev/null", "w"); + while ((bytes_read = read(pipe_fd[0], buffer, 
sizeof(buffer) - 1)) > 0) { + buffer[bytes_read] = '\0'; break; /* keep the first chunk, NUL-terminated for strtok */ + } + if (bytes_read <= 0) buffer[0] = '\0'; /* nothing was read */ + if (debug) fclose(debug); + close(pipe_fd[0]); waitpid(child_pid, &status, 0); /* release the pipe and reap addr2line */ + char * token = strtok(buffer, "\n") ? strtok(NULL, "\n") : NULL; + snprintf(cmd, sizeof(cmd), "0x%lx, %s\n", strtoul(buffer, NULL, 16), token ? token : "??"); + if (strlen(address_line_mapping) + strlen(cmd) < sizeof(address_line_mapping)) strcat(address_line_mapping, cmd); + } + HASH_DEL(unique_mem_addr, d); + } + + snprintf(final_core->log_hdr_p->posix_line_mapping, sizeof(final_core->log_hdr_p->posix_line_mapping), "%s", address_line_mapping); + } + } + else if (i == DXT_MPIIO_MOD) { + PMPI_Barrier(MPI_COMM_WORLD); + if (my_rank == 0 && final_core->config.stack_trace_trigger) { + FILE *fptr; + + typedef struct { + char address[32]; /* key */ + UT_hash_handle hh; /* makes this structure hashable */ + } unique_stack_struct; + + unique_stack_struct *unique_mem_addr = NULL; + + for (int rank = 0; rank < nprocs; rank++) { + char stack_file_name_mpiio[50]; + sprintf(stack_file_name_mpiio, ".%d.darshan-mpiio", rank); + fptr = fopen(stack_file_name_mpiio, "r"); + if (fptr) { + char line[32]; + + while (fgets(line, sizeof(line), fptr)) { + line[strcspn(line, "\n")] = 0; + unique_stack_struct *d = NULL; + + HASH_FIND_STR(unique_mem_addr, line, d); + + if (!d) { + unique_stack_struct *e = (unique_stack_struct *) malloc(sizeof *e); + if (!e) break; /* out of memory: stop reading this file */ + strcpy(e->address, line); + HASH_ADD_STR(unique_mem_addr, address, e); + } + } + + fclose(fptr); + remove(stack_file_name_mpiio); + } + } + + unique_stack_struct *d = NULL, *next = NULL; + + char * exe_name = darshan_exe(); + + int line_mappings_index = 0; + + char address_line_mapping[4096] = {}; + + for (d = unique_mem_addr; d != NULL; d = next) { + next = (unique_stack_struct *)(d->hh.next); /* read the link before HASH_DEL(d) */ + FILE *fp; + char cmd[256]; + char *line = NULL; + size_t len = 0; + + char addr[32]; + sprintf(addr, "%s", d->address); + + char *const args[] = { "/usr/bin/addr2line", "-a", addr, "-e", exe_name, NULL }; + + int pipe_fd[2]; + pid_t child_pid; + int status; + + if (pipe(pipe_fd) == -1) { + perror("pipe"); + continue; /* no pipe: skip this address */ + } + int ret; + // Use posix_spawn to execute the command + posix_spawn_file_actions_t action; +
posix_spawn_file_actions_init(&action); + posix_spawn_file_actions_addclose(&action, pipe_fd[0]); // Close the read end of the pipe + posix_spawn_file_actions_adddup2(&action, pipe_fd[1], STDOUT_FILENO); // Redirect stdout to the write end of the pipe + if (posix_spawn(&child_pid, "/usr/bin/addr2line", &action, NULL, args, NULL) == 0) { + // Close the write end of the pipe in the parent process + close(pipe_fd[1]); + + // Read the output from the pipe, leaving room for a terminating NUL + char buffer[4096]; + ssize_t bytes_read; + size_t used = 0; + while (used < sizeof(buffer) - 1 && + (bytes_read = read(pipe_fd[0], buffer + used, sizeof(buffer) - 1 - used)) > 0) { + used += (size_t)bytes_read; + } + buffer[used] = '\0'; close(pipe_fd[0]); + // Wait for the child process to complete + waitpid(child_pid, &status, 0); + + char * token = strtok(buffer, "\n") ? strtok(NULL, "\n") : NULL; + unsigned long number = strtoul(buffer, NULL, 16); + snprintf(cmd, sizeof(cmd), "0x%lx, %s\n", number, token ? token : "??"); + if (strlen(address_line_mapping) + strlen(cmd) < sizeof(address_line_mapping)) + strcat(address_line_mapping, cmd); + } else { close(pipe_fd[0]); close(pipe_fd[1]); } /* spawn failed: release both pipe ends */ + posix_spawn_file_actions_destroy(&action); + HASH_DEL(unique_mem_addr, d); + } + + snprintf(final_core->log_hdr_p->mpiio_line_mapping, sizeof(final_core->log_hdr_p->mpiio_line_mapping), "%s", address_line_mapping); + } + } + } +#endif /* append this module's data to the darshan log */ final_core->log_hdr_p->mod_map[i].off = gz_fp; @@ -671,7 +890,7 @@ void darshan_core_shutdown(int write_log) DARSHAN_CHECK_ERR(ret, "unable to write %s module data to log file %s", darshan_module_names[i], logfile_name); } - + if(internal_timing_flag) header1 = darshan_core_wtime_absolute(); ret = darshan_log_write_header(log_fh, final_core); @@ -999,6 +1218,8 @@ static void add_entry(char* buf, int* space_left, struct mntent* entry) * collects command line and list of mounted file systems into a string that * will be stored with the job-level metadata */ + + static void darshan_get_exe_and_mounts(struct darshan_core_runtime *core, int argc, char **argv) { @@ -2064,7 +2285,7 @@ void darshan_log_finalize(char *logfile_name, double start_log_time) /* set permissions on log file */ chmod(new_logfile_name, chmod_mode); free(new_logfile_name); - } + } }
return; @@ -2635,7 +2856,6 @@ void *darshan_core_register_record( __DARSHAN_CORE_UNLOCK(); return(NULL); } - /* check to see if this module has enough space to store a new record */ if(__darshan_core->mod_array[mod_id]->rec_mem_avail < rec_size) { @@ -2761,6 +2981,11 @@ void darshan_core_fprintf( return; } +char *darshan_exe() +{ + return exe_name; +} + /* * Local variables: * c-indent-level: 4 @@ -2769,3 +2994,4 @@ void darshan_core_fprintf( * * vim: ts=8 sts=4 sw=4 expandtab */ + diff --git a/darshan-runtime/lib/darshan-dxt.c b/darshan-runtime/lib/darshan-dxt.c index 38e9c5f24..b585c1b86 100644 --- a/darshan-runtime/lib/darshan-dxt.c +++ b/darshan-runtime/lib/darshan-dxt.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "utlist.h" #include "uthash.h" @@ -58,6 +59,9 @@ typedef int64_t off64_t; /* NOTE: when this size is exceeded, the buffer size is doubled */ #define IO_TRACE_BUF_SIZE 64 +#define STACK_TRACE_BUF_SIZE 60 + +bool isStackTrace = false; /* The dxt_file_record_ref structure maintains necessary runtime metadata * for the DXT file record (dxt_file_record structure, defined in * darshan-dxt-log-format.h) pointed to by 'file_rec'. 
This metadata @@ -257,13 +261,21 @@ void dxt_posix_write(darshan_record_id rec_id, int64_t offset, DXT_UNLOCK(); return; } - rec_ref->write_traces[file_rec->write_count].offset = offset; rec_ref->write_traces[file_rec->write_count].length = length; rec_ref->write_traces[file_rec->write_count].start_time = start_time; rec_ref->write_traces[file_rec->write_count].end_time = end_time; - file_rec->write_count += 1; + /* Code added by Hammad Ather (hather@lbl.gov) and Jean Luca Bez (jlbez@lbl.gov) */ + if (isStackTrace){ + int size = backtrace (rec_ref->write_traces[file_rec->write_count].address_array, STACK_TRACE_BUF_SIZE); + rec_ref->write_traces[file_rec->write_count].noStackTrace = 1; + rec_ref->write_traces[file_rec->write_count].size = size; + } + else + rec_ref->write_traces[file_rec->write_count].noStackTrace = 0; + + file_rec->write_count += 1; DXT_UNLOCK(); } @@ -307,8 +319,15 @@ void dxt_posix_read(darshan_record_id rec_id, int64_t offset, rec_ref->read_traces[file_rec->read_count].length = length; rec_ref->read_traces[file_rec->read_count].start_time = start_time; rec_ref->read_traces[file_rec->read_count].end_time = end_time; + /* Code added by Hammad Ather (hather@lbl.gov) and Jean Luca Bez (jlbez@lbl.gov) */ + if (isStackTrace){ + int size = backtrace (rec_ref->read_traces[file_rec->read_count].address_array , STACK_TRACE_BUF_SIZE); + rec_ref->read_traces[file_rec->read_count].noStackTrace = 1; + rec_ref->read_traces[file_rec->read_count].size = size; + } + else + rec_ref->read_traces[file_rec->read_count].noStackTrace = 0; file_rec->read_count += 1; - DXT_UNLOCK(); } @@ -338,7 +357,7 @@ void dxt_mpiio_write(darshan_record_id rec_id, int64_t offset, return; } } - + file_rec = rec_ref->file_rec; check_wr_trace_buf(rec_ref, DXT_MPIIO_MOD, dxt_mpiio_runtime); if(file_rec->write_count == rec_ref->write_available_buf) @@ -347,13 +366,21 @@ void dxt_mpiio_write(darshan_record_id rec_id, int64_t offset, DXT_UNLOCK(); return; } - + 
rec_ref->write_traces[file_rec->write_count].length = length; rec_ref->write_traces[file_rec->write_count].offset = offset; rec_ref->write_traces[file_rec->write_count].start_time = start_time; rec_ref->write_traces[file_rec->write_count].end_time = end_time; - file_rec->write_count += 1; + /* Code added by Hammad Ather (hather@lbl.gov) and Jean Luca Bez (jlbez@lbl.gov) */ + if (isStackTrace){ + int size = backtrace (rec_ref->write_traces[file_rec->write_count].address_array, STACK_TRACE_BUF_SIZE); + rec_ref->write_traces[file_rec->write_count].noStackTrace = 1; + rec_ref->write_traces[file_rec->write_count].size = size; + } + else + rec_ref->write_traces[file_rec->write_count].noStackTrace = 0; + file_rec->write_count += 1; DXT_UNLOCK(); } @@ -392,16 +419,28 @@ void dxt_mpiio_read(darshan_record_id rec_id, int64_t offset, DXT_UNLOCK(); return; } - + rec_ref->read_traces[file_rec->read_count].length = length; rec_ref->read_traces[file_rec->read_count].offset = offset; rec_ref->read_traces[file_rec->read_count].start_time = start_time; rec_ref->read_traces[file_rec->read_count].end_time = end_time; + /* Code added by Hammad Ather (hather@lbl.gov) and Jean Luca Bez (jlbez@lbl.gov) */ + if (isStackTrace){ + int size = backtrace (rec_ref->read_traces[file_rec->read_count].address_array , STACK_TRACE_BUF_SIZE); + rec_ref->read_traces[file_rec->read_count].noStackTrace = 1; + rec_ref->read_traces[file_rec->read_count].size = size; + } + else + rec_ref->read_traces[file_rec->read_count].noStackTrace = 0; file_rec->read_count += 1; - DXT_UNLOCK(); } +void dxt_enable_stack_trace () +{ + isStackTrace = true; +} + static void dxt_posix_filter_traces_iterator(void *rec_ref_p, void *user_ptr) { struct dxt_file_record_ref *psx_rec_ref, *mpiio_rec_ref; @@ -788,9 +827,66 @@ static void dxt_serialize_posix_records(void *rec_ref_p, void *user_ptr) record_write_count = file_rec->write_count; record_read_count = file_rec->read_count; + if (record_write_count == 0 && record_read_count 
== 0) return; + + /* Code added by Hammad Ather (hather@lbl.gov) and Jean Luca Bez (jlbez@lbl.gov) */ + if (isStackTrace){ + char stack_file_name[50]; + sprintf(stack_file_name, ".%d.darshan-posix", dxt_my_rank); + + FILE *fptr; + fptr = fopen(stack_file_name, "a+"); + + typedef struct { + void *address; /* key */ + UT_hash_handle hh; /* makes this structure hashable */ + } stack_struct; + + + char * exe_name = darshan_exe(); + for(int i = 0; i < record_write_count; i++){ + char **strings; + int size = rec_ref->write_traces[i].size; + strings = backtrace_symbols (rec_ref->write_traces[i].address_array, size); + if (strings != NULL) + { + for (int j = 0; j < size; j++){ + if (strstr(strings[j], exe_name) != NULL) { + char * token = strtok(strings[j], "["); + token = strtok(NULL, "["); + token = strtok(token, "]"); + /* keep the full address width; skip frames without a "[addr]" part */ + if (fptr && token) fprintf(fptr, "0x%lx\n", strtoul(token, NULL, 16)); + } + } + free(strings); + } + } + for(int i = 0; i < record_read_count; i++){ + char **strings; + int size = rec_ref->read_traces[i].size; + strings = backtrace_symbols (rec_ref->read_traces[i].address_array, size); + if (strings != NULL) + { + for (int j = 0; j < size; j++){ + if (strstr(strings[j], exe_name) != NULL) { + char * token = strtok(strings[j], "["); + token = strtok(NULL, "["); + token = strtok(token, "]"); + /* keep the full address width; skip frames without a "[addr]" part */ + if (fptr && token) fprintf(fptr, "0x%lx\n", strtoul(token, NULL, 16)); + } + } + free(strings); + } + } + + if (fptr) fclose(fptr); + } + /* * Buffer format: * dxt_file_record + write_traces + read_traces @@ -805,9 +901,10 @@ static void dxt_serialize_posix_records(void *rec_ref_p, void *user_ptr) memcpy(tmp_buf_ptr, (void *)file_rec, sizeof(struct dxt_file_record)); tmp_buf_ptr = (void *)(tmp_buf_ptr + sizeof(struct dxt_file_record)); - /*Copy write record */ + /*Copy write record */ memcpy(tmp_buf_ptr, (void *)(rec_ref->write_traces), record_write_count * sizeof(segment_info)); + tmp_buf_ptr = (void *)(tmp_buf_ptr + record_write_count * sizeof(segment_info)); @@
-817,6 +914,7 @@ static void dxt_serialize_posix_records(void *rec_ref_p, void *user_ptr) tmp_buf_ptr = (void *)(tmp_buf_ptr + record_read_count * sizeof(segment_info)); + //printf("%i\n", file_rec->base_rec.rank); dxt_posix_runtime->record_buf_size += record_size; } @@ -881,7 +979,63 @@ static void dxt_serialize_mpiio_records(void *rec_ref_p, void *user_ptr) record_read_count = file_rec->read_count; if (record_write_count == 0 && record_read_count == 0) return; + + /* Code added by Hammad Ather (hather@lbl.gov) and Jean Luca Bez (jlbez@lbl.gov) */ + if (isStackTrace){ + char stack_file_name[50]; + sprintf(stack_file_name, ".%d.darshan-mpiio", dxt_my_rank); + + FILE *fptr; + fptr = fopen(stack_file_name, "a+"); + + typedef struct { + void *address; /* key */ + UT_hash_handle hh; /* makes this structure hashable */ + } stack_struct; + + + char * exe_name = darshan_exe(); + for(int i = 0; i < record_write_count; i++){ + char **strings; + int size = rec_ref->write_traces[i].size; + strings = backtrace_symbols (rec_ref->write_traces[i].address_array, size); + if (strings != NULL) + { + for (int j = 0; j < size; j++){ + if (strstr(strings[j], exe_name) != NULL) { + char * token = strtok(strings[j], "["); + token = strtok(NULL, "["); + token = strtok(token, "]"); + /* keep the full address width; skip frames without a "[addr]" part */ + if (fptr && token) fprintf(fptr, "0x%lx\n", strtoul(token, NULL, 16)); + } + } + free(strings); + } + } + + for(int i = 0; i < record_read_count; i++){ + char **strings; + int size = rec_ref->read_traces[i].size; + strings = backtrace_symbols (rec_ref->read_traces[i].address_array, size); + if (strings != NULL) + { + for (int j = 0; j < size; j++){ + if (strstr(strings[j], exe_name) != NULL) { + char * token = strtok(strings[j], "["); + token = strtok(NULL, "["); + token = strtok(token, "]"); + /* keep the full address width; skip frames without a "[addr]" part */ + if (fptr && token) fprintf(fptr, "0x%lx\n", strtoul(token, NULL, 16)); + } + } + free(strings); + } + } + + if (fptr) fclose(fptr); + } /* * Buffer format: * dxt_file_record + write_traces + read_traces @@ -907,7
+1061,6 @@ static void dxt_serialize_mpiio_records(void *rec_ref_p, void *user_ptr) record_read_count * sizeof(segment_info)); tmp_buf_ptr = (void *)(tmp_buf_ptr + record_read_count * sizeof(segment_info)); - dxt_mpiio_runtime->record_buf_size += record_size; } @@ -963,3 +1116,4 @@ static void dxt_mpiio_cleanup() * * vim: ts=8 sts=4 sw=4 expandtab */ + diff --git a/darshan-runtime/lib/darshan-dxt.h b/darshan-runtime/lib/darshan-dxt.h index a03ce9608..812796b31 100644 --- a/darshan-runtime/lib/darshan-dxt.h +++ b/darshan-runtime/lib/darshan-dxt.h @@ -19,7 +19,8 @@ enum dxt_trigger_type { DXT_SMALL_IO_TRIGGER, - DXT_UNALIGNED_IO_TRIGGER + DXT_UNALIGNED_IO_TRIGGER, + DXT_COLLECT_STACK_TRACE }; struct dxt_trigger { @@ -36,6 +37,7 @@ struct dxt_trigger } u; }; + /* dxt_posix_runtime_initialize() * * DXT function exposed to POSIX module for initializing DXT-POSIX runtime. @@ -72,4 +74,6 @@ void dxt_mpiio_read(darshan_record_id rec_id, int64_t offset, void dxt_posix_apply_trace_filter(struct dxt_trigger *trigger); +void dxt_enable_stack_trace(); + #endif /* __DARSHAN_DXT_H */ diff --git a/darshan-runtime/lib/darshan-hdf5.c b/darshan-runtime/lib/darshan-hdf5.c index a654888bd..2b6fb95f7 100644 --- a/darshan-runtime/lib/darshan-hdf5.c +++ b/darshan-runtime/lib/darshan-hdf5.c @@ -2054,4 +2054,4 @@ static void hdf5_dataset_cleanup() * End: * * vim: ts=8 sts=4 sw=4 expandtab - */ + */ \ No newline at end of file diff --git a/darshan-runtime/lib/darshan.h b/darshan-runtime/lib/darshan.h index f29c36bdc..2807a0691 100644 --- a/darshan-runtime/lib/darshan.h +++ b/darshan-runtime/lib/darshan.h @@ -314,6 +314,15 @@ int darshan_core_register_module( void darshan_core_unregister_module( darshan_module_id mod_id); + +char *darshan_exe(); + +void set_posix_line_mapping( + char *mapping_array, bool isStackTrace); + +void set_mpiio_line_mapping( + char *mapping_array, bool isStackTrace); + /* darshan_instrument_fs_data() * * Allow file system-specific modules to instrument data for the 
file diff --git a/darshan-util/darshan-dxt-logutils.c b/darshan-util/darshan-dxt-logutils.c index 28eb50a52..cd64d0f6e 100644 --- a/darshan-util/darshan-dxt-logutils.c +++ b/darshan-util/darshan-dxt-logutils.c @@ -19,9 +19,12 @@ #include #include #include +#include #include "darshan-logutils.h" +#define STACK_TRACE_BUF_SIZE 60 + static int dxt_log_get_posix_file(darshan_fd fd, void** dxt_posix_buf_p); static int dxt_log_put_posix_file(darshan_fd fd, void* dxt_posix_buf); @@ -300,11 +303,12 @@ void dxt_log_print_posix_file(void *posix_file_rec, char *file_name, int64_t rank = file_rec->base_rec.rank; char *hostname = file_rec->hostname; + int64_t write_count = file_rec->write_count; int64_t read_count = file_rec->read_count; segment_info *io_trace = (segment_info *) ((void *)file_rec + sizeof(struct dxt_file_record)); - + /* Lustre File System */ struct darshan_lustre_record *rec; int lustreFS = !strcmp(fs_type, "lustre"); @@ -313,11 +317,15 @@ void dxt_log_print_posix_file(void *posix_file_rec, char *file_name, int64_t cur_offset; int print_count; int ost_idx; + bool isStackTrace = true; if (!lustre_rec_ref) { lustreFS = 0; } + if (io_trace->noStackTrace==0) + isStackTrace = false; + printf("\n# DXT, file_id: %" PRIu64 ", file_name: %s\n", f_id, file_name); printf("# DXT, rank: %" PRId64 ", hostname: %s\n", rank, hostname); printf("# DXT, write_count: %" PRId64 ", read_count: %" PRId64 "\n", @@ -339,11 +347,14 @@ void dxt_log_print_posix_file(void *posix_file_rec, char *file_name, } /* Print header */ - printf("# Module Rank Wt/Rd Segment Offset Length Start(s) End(s)"); + printf("# Module Rank Wt/Rd Segment Offset Length Start(s) End(s)"); if (lustreFS) { - printf(" [OST]"); + printf(" [OST]"); } + + if (isStackTrace) + printf(" Stack Memory Addresses"); printf("\n"); /* Print IO Traces information */ @@ -361,7 +372,7 @@ void dxt_log_print_posix_file(void *posix_file_rec, char *file_name, print_count = 0; while (cur_offset < offset + length) { - printf(" [%3" 
PRId64 "]", (rec->ost_ids)[ost_idx]); + printf(" [%3" PRId64 "]", (rec->ost_ids)[ost_idx]); cur_offset = (cur_offset / stripe_size + 1) * stripe_size; ost_idx = (ost_idx == stripe_count - 1) ? 0 : ost_idx + 1; @@ -372,6 +383,19 @@ void dxt_log_print_posix_file(void *posix_file_rec, char *file_name, } } + if (isStackTrace){ + bool first = true; + printf(" ["); + for (int j = 0; j < io_trace[i].size && j < STACK_TRACE_BUF_SIZE; j++) { /* only the recorded frames */ + if (io_trace[i].address_array[j]){ + if (first == false) /* separator before every entry but the first */ + printf(", "); + printf("%p", io_trace[i].address_array[j]); + first = false; + } + } + printf("]"); + } printf("\n"); } @@ -389,7 +413,7 @@ void dxt_log_print_posix_file(void *posix_file_rec, char *file_name, print_count = 0; while (cur_offset < offset + length) { - printf(" [%3" PRId64 "]", (rec->ost_ids)[ost_idx]); + printf(" [%3" PRId64 "]", (rec->ost_ids)[ost_idx]); cur_offset = (cur_offset / stripe_size + 1) * stripe_size; ost_idx = (ost_idx == stripe_count - 1) ? 0 : ost_idx + 1; @@ -400,6 +424,20 @@ void dxt_log_print_posix_file(void *posix_file_rec, char *file_name, } } + if (isStackTrace){ + bool first = true; + printf(" ["); + for (int j = 0; j < io_trace[i].size && j < STACK_TRACE_BUF_SIZE; j++) { /* only the recorded frames */ + if (io_trace[i].address_array[j]){ + if (first == false) /* separator before every entry but the first */ + printf(", "); + printf("%p", io_trace[i].address_array[j]); + first = false; + } + } + printf("]"); + } + printf("\n"); } return; @@ -426,6 +464,10 @@ void dxt_log_print_mpiio_file(void *mpiio_file_rec, char *file_name, segment_info *io_trace = (segment_info *) ((void *)file_rec + sizeof(struct dxt_file_record)); + + bool isStackTrace = true; + if (io_trace[0].noStackTrace == 0) + isStackTrace = false; printf("\n# DXT, file_id: %" PRIu64 ", file_name: %s\n", f_id, file_name); printf("# DXT, rank: %" PRId64 ", hostname: %s\n", rank, hostname); @@ -435,7 +477,10 @@ void dxt_log_print_mpiio_file(void *mpiio_file_rec, char *file_name, printf("# DXT, mnt_pt: %s, fs_type: %s\n", mnt_pt, fs_type); /*
Print header */ - printf("# Module Rank Wt/Rd Segment Offset Length Start(s) End(s)\n"); + if (isStackTrace) + printf("# Module Rank Wt/Rd Segment Offset Length Start(s) End(s) Stack Memory Addresses\n"); + else + printf("# Module Rank Wt/Rd Segment Offset Length Start(s) End(s)\n"); /* Print IO Traces information */ for (i = 0; i < write_count; i++) { @@ -444,7 +489,22 @@ void dxt_log_print_mpiio_file(void *mpiio_file_rec, char *file_name, start_time = io_trace[i].start_time; end_time = io_trace[i].end_time; - printf("%8s%8" PRId64 "%7s%9d%16" PRId64 "%16" PRId64 "%12.4f%12.4f\n", "X_MPIIO", rank, "write", i, offset, length, start_time, end_time); + printf("%8s%8" PRId64 "%7s%9d%16" PRId64 "%16" PRId64 "%12.4f%12.4f", "X_MPIIO", rank, "write", i, offset, length, start_time, end_time); + + if (isStackTrace){ + bool first = true; + printf(" ["); + for (int j = 0; j < io_trace[i].size && j < STACK_TRACE_BUF_SIZE; j++) { /* only the recorded frames */ + if (io_trace[i].address_array[j]){ + if (first == false) /* separator before every entry but the first */ + printf(", "); + printf("%p", io_trace[i].address_array[j]); + first = false; + } + } + printf("]"); + } + printf("\n"); } for (i = write_count; i < write_count + read_count; i++) { @@ -453,7 +513,22 @@ void dxt_log_print_mpiio_file(void *mpiio_file_rec, char *file_name, start_time = io_trace[i].start_time; end_time = io_trace[i].end_time; - printf("%8s%8" PRId64 "%7s%9d%16" PRId64 "%16" PRId64 "%12.4f%12.4f\n", "X_MPIIO", rank, "read", (int)(i - write_count), offset, length, start_time, end_time); + printf("%8s%8" PRId64 "%7s%9d%16" PRId64 "%16" PRId64 "%12.4f%12.4f", "X_MPIIO", rank, "read", (int)(i - write_count), offset, length, start_time, end_time); + + if (isStackTrace){ + bool first = true; + printf(" ["); + for (int j = 0; j < io_trace[i].size && j < STACK_TRACE_BUF_SIZE; j++) { /* only the recorded frames */ + if (io_trace[i].address_array[j]){ + if (first == false) /* separator before every entry but the first */ + printf(", "); + printf("%p", io_trace[i].address_array[j]); + first = false; + } + } + printf("]"); + } + printf("\n"); } return;
diff --git a/darshan-util/darshan-dxt-parser.c b/darshan-util/darshan-dxt-parser.c index 23007b92c..7c203a73a 100644 --- a/darshan-util/darshan-dxt-parser.c +++ b/darshan-util/darshan-dxt-parser.c @@ -138,6 +138,18 @@ int main(int argc, char **argv) printf("# metadata: %s = %s\n", key, value); } + if (strlen(fd->posix_line_mapping) != 0){ + printf("\n# DXT-POSIX address to line mapping\n"); + printf("# -------------------------------------------------------\n"); + printf("%s", fd->posix_line_mapping); + } + + if (strlen(fd->mpiio_line_mapping) != 0){ + printf("\n# DXT-MPIIO address to line mapping\n"); + printf("# -------------------------------------------------------\n"); + printf("%s", fd->mpiio_line_mapping); + } + /* print breakdown of each log file region's contribution to file size */ printf("\n# log file regions\n"); printf("# -------------------------------------------------------\n"); diff --git a/darshan-util/darshan-logutils.c b/darshan-util/darshan-logutils.c index 613d49f5d..3e315db5f 100644 --- a/darshan-util/darshan-logutils.c +++ b/darshan-util/darshan-logutils.c @@ -432,12 +432,14 @@ int darshan_log_get_exe(darshan_fd fd, char *buf) } /* exe string is located before the first line break */ + // printf("%s", state->exe_mnt_data); newline = strchr(state->exe_mnt_data, '\n'); /* copy over the exe string */ if(newline) memcpy(buf, state->exe_mnt_data, (newline - state->exe_mnt_data)); - + else + memcpy(buf, state->exe_mnt_data, strlen(state->exe_mnt_data)); return (0); } @@ -1085,7 +1087,7 @@ static int darshan_log_get_header(darshan_fd fd) int log_ver_maj, log_ver_min; int i; int ret; - + ret = darshan_log_seek(fd, 0); if(ret < 0) { @@ -1141,7 +1143,7 @@ static int darshan_log_get_header(darshan_fd fd) /* read uncompressed header from log file */ /* NOTE: header bumped from 16 to 64 modules at log ver 3.41 */ if(((log_ver_maj == 3) && (log_ver_min >= 41)) || (log_ver_maj > 3)) - { + { ret = darshan_log_read(fd, &header, sizeof(header)); if(ret != 
(int)sizeof(header)) { @@ -1164,6 +1166,8 @@ static int darshan_log_get_header(darshan_fd fd) struct darshan_log_map name_map; struct darshan_log_map mod_map[DARSHAN_MAX_MODS_3_00]; uint32_t mod_ver[DARSHAN_MAX_MODS_3_00]; + /* no line-mapping fields here: fd->job_map.off below is derived from + * sizeof(header_3_00), so this struct must keep the old header's size */ } header_3_00; /* read old header structure */ @@ -1173,7 +1177,6 @@ static int darshan_log_get_header(darshan_fd fd) fprintf(stderr, "Error: failed to read darshan log file header.\n"); return(-1); } - /* set new header structure */ memset(&header, 0, sizeof(header)); strncpy(header.version_string, header_3_00.version_string, 8); @@ -1184,7 +1187,10 @@ static int darshan_log_get_header(darshan_fd fd) (1 + DARSHAN_MAX_MODS_3_00) * sizeof(header_3_00.name_map)); memcpy(&header.mod_ver, &header_3_00.mod_ver, (DARSHAN_MAX_MODS_3_00) * sizeof(header_3_00.mod_ver[0])); - + /* the old header layout has no address-to-line mappings to convert: + * header.posix_line_mapping and header.mpiio_line_mapping keep the + * zeros written by the memset above, so they read as empty strings + * when they are copied into fd at the end of this function */ fd->job_map.off = sizeof(header_3_00); } @@ -1265,6 +1271,7 @@ static int darshan_log_get_header(darshan_fd fd) // zero out bits up to (and including) PNETCDF_VAR in shifted flags partial_flag_shift = (partial_flag_shift >> (DARSHAN_PNETCDF_VAR_MOD+1)) << (DARSHAN_PNETCDF_VAR_MOD+1); + // zero out PNETCDF_VAR and all bits higher than it in original flags fd->partial_flag = fd->partial_flag & ((1 << DARSHAN_PNETCDF_VAR_MOD) - 1); // combine original flags and shifted flags @@ -1299,6 +1306,10 @@ static int darshan_log_get_header(darshan_fd fd) fd->job_map.len = fd->name_map.off - fd->job_map.off; } + snprintf(fd->posix_line_mapping, sizeof(fd->posix_line_mapping), "%.*s", /* bounded, always terminated */ + (int)sizeof(header.posix_line_mapping), header.posix_line_mapping); + snprintf(fd->mpiio_line_mapping, sizeof(fd->mpiio_line_mapping), "%.*s", /* bounded, always terminated */ + (int)sizeof(header.mpiio_line_mapping), header.mpiio_line_mapping); return(0); } diff --git
a/darshan-util/darshan-logutils.h b/darshan-util/darshan-logutils.h index ee4033e11..031253001 100644 --- a/darshan-util/darshan-logutils.h +++ b/darshan-util/darshan-logutils.h @@ -40,7 +40,8 @@ struct darshan_fd_s struct darshan_log_map mod_map[DARSHAN_MAX_MODS]; /* module-specific log-format versions contained in log */ uint32_t mod_ver[DARSHAN_MAX_MODS]; - + char posix_line_mapping[4096]; + char mpiio_line_mapping[4096]; /* KEEP OUT -- remaining state hidden in logutils source */ struct darshan_fd_int_state *state; diff --git a/darshan-util/darshan-parser.c b/darshan-util/darshan-parser.c index 83c97eefc..a2a02fe4a 100644 --- a/darshan-util/darshan-parser.c +++ b/darshan-util/darshan-parser.c @@ -236,7 +236,6 @@ int main(int argc, char **argv) value++; printf("# metadata: %s = %s\n", key, value); } - /* print breakdown of each log file region's contribution to file size */ printf("\n# log file regions\n"); printf("# -------------------------------------------------------\n"); diff --git a/darshan-util/pydarshan/darshan/backend/api_def_c.py b/darshan-util/pydarshan/darshan/backend/api_def_c.py index 50ae9cae3..7a9179d96 100644 --- a/darshan-util/pydarshan/darshan/backend/api_def_c.py +++ b/darshan-util/pydarshan/darshan/backend/api_def_c.py @@ -32,6 +32,47 @@ struct darshan_file_category_counters category_counters[7]; }; +#define DARSHAN_MAX_MODS 64 + +struct darshan_log_map +{ + uint64_t off; + uint64_t len; +}; + +struct darshan_fd_int_state; + +/* darshan file descriptor definition */ +struct darshan_fd_s +{ + /* log file version */ + char version[8]; + /* flag indicating whether byte swapping needs to be + * performed on log file data + */ + int swap_flag; + /* bit-field indicating whether modules contain incomplete data */ + uint64_t partial_flag; + /* compression type used on log file */ + enum darshan_comp_type comp_type; + /* log file offset/length maps for each log file region */ + struct darshan_log_map job_map; + struct darshan_log_map name_map; + 
struct darshan_log_map mod_map[DARSHAN_MAX_MODS]; + /* module-specific log-format versions contained in log */ + uint32_t mod_ver[DARSHAN_MAX_MODS]; + char posix_line_mapping[4096]; + char mpiio_line_mapping[4096]; + /* KEEP OUT -- remaining state hidden in logutils source */ + struct darshan_fd_int_state *state; + + /* workaround to parse logs with slightly inconsistent heatmap bin + * counts as described in https://github.com/darshan-hpc/darshan/issues/941 + */ + int64_t first_heatmap_record_nbins; + double first_heatmap_record_bin_width_seconds; +}; + struct darshan_mnt_info { char mnt_type[3015]; @@ -63,6 +104,7 @@ /* from darshan-log-format.h */ typedef uint64_t darshan_record_id; +#define STACK_TRACE_BUF_SIZE 60 struct darshan_job { @@ -177,6 +219,9 @@ int64_t length; double start_time; double end_time; + void *address_array[STACK_TRACE_BUF_SIZE]; + int noStackTrace; + int size; } segment_info; /* counter names */ diff --git a/darshan-util/pydarshan/darshan/backend/cffi_backend.py b/darshan-util/pydarshan/darshan/backend/cffi_backend.py index bd7c338d8..074189d84 100644 --- a/darshan-util/pydarshan/darshan/backend/cffi_backend.py +++ b/darshan-util/pydarshan/darshan/backend/cffi_backend.py @@ -42,6 +42,8 @@ except: pass +flagPOSIX = False +flagMPIIO = False API_def_c = load_darshan_header(addins) ffi = cffi.FFI() @@ -52,6 +54,8 @@ check_version(ffi, libdutil) +logfilename = None + _mod_names = [ "NULL", @@ -93,7 +97,7 @@ def mod_name_to_idx(mod_name): "APMPI-PERF": "struct darshan_apmpi_perf_record **", } - +STACK_TRACE_BUF_SIZE = 60 def get_lib_version(): """ @@ -121,7 +125,9 @@ def log_open(filename): Return: log handle """ + b_fname = filename.encode() + logfilename = filename handle = libdutil.darshan_log_open(b_fname) log = {"handle": handle, 'modules': None, 'name_records': None} @@ -577,13 +583,11 @@ def log_get_dxt_record(log, mod_name, reads=True, writes=True, dtype='dict'): """ - modules = log_get_modules(log) if mod_name not in modules: return None 
mod_type = _structdefs[mod_name] #name_records = log_get_name_records(log) - rec = {} buf = ffi.new("void **") r = libdutil.darshan_log_get_record(log['handle'], modules[mod_name]['idx'], buf) @@ -605,11 +609,9 @@ def log_get_dxt_record(log, mod_name, reads=True, writes=True, dtype='dict'): rec['write_segments'] = [] rec['read_segments'] = [] - size_of = ffi.sizeof("struct dxt_file_record") segments = ffi.cast("struct segment_info *", buf[0] + size_of ) - for i in range(wcnt): seg = { "offset": segments[i].offset, @@ -617,6 +619,16 @@ def log_get_dxt_record(log, mod_name, reads=True, writes=True, dtype='dict'): "start_time": segments[i].start_time, "end_time": segments[i].end_time } + seg_array = [] + if not segments[i].noStackTrace == 0: + for j in range(STACK_TRACE_BUF_SIZE): + if (segments[i].address_array[j]): + addr = str(segments[i].address_array[j]) + addr = addr.split("'void *' ") + addr = addr[1].split(">") + seg_array.append(str(addr[0])) + seg["stack_memory_addresses"] = seg_array + rec['write_segments'].append(seg) @@ -628,6 +640,16 @@ def log_get_dxt_record(log, mod_name, reads=True, writes=True, dtype='dict'): "start_time": segments[i].start_time, "end_time": segments[i].end_time } + seg_array = [] + if not segments[i].noStackTrace == 0: + for j in range(STACK_TRACE_BUF_SIZE): + if (segments[i].address_array[j]): + addr = str(segments[i].address_array[j]) + addr = addr.split("'void *' ") + addr = addr[1].split(">") + seg_array.append(str(addr[0])) + seg["stack_memory_addresses"] = seg_array + rec['read_segments'].append(seg) @@ -635,6 +657,65 @@ def log_get_dxt_record(log, mod_name, reads=True, writes=True, dtype='dict'): rec['read_segments'] = pd.DataFrame(rec['read_segments']) rec['write_segments'] = pd.DataFrame(rec['write_segments']) + + size_of = ffi.sizeof("struct darshan_fd_s") + address_line_mapping = ffi.cast("struct darshan_fd_s *", log['handle']) + + global flagPOSIX + global flagMPIIO + if mod_name == 'DXT_POSIX': + if flagPOSIX == False: 
+ flagPOSIX = True + rec['address_line_mapping'] = [] + data = ffi.string(address_line_mapping.posix_line_mapping) + data = data.decode('utf-8') + data = data.split('\n') + for item in data: + if item: + item = item.split(",") + address = item[0] + + func_line = item[1] + func_line = func_line.split(":") + function_name = func_line[0] + line_number = func_line[1] + + mapping = { + "address": address, + "function_name": function_name, + "line_number": line_number + } + + rec['address_line_mapping'].append(mapping) + else: + rec['address_line_mapping'] = {} + elif mod_name == 'DXT_MPIIO': + if flagMPIIO == False: + flagMPIIO = True + rec['address_line_mapping'] = [] + data = ffi.string(address_line_mapping.mpiio_line_mapping) + data = data.decode('utf-8') + data = data.split('\n') + for item in data: + if item: + item = item.split(",") + address = item[0] + + func_line = item[1] + func_line = func_line.split(":") + function_name = func_line[0] + line_number = func_line[1] + + mapping = { + "address": address, + "function_name": function_name, + "line_number": line_number + } + + rec['address_line_mapping'].append(mapping) + else: + rec['address_line_mapping'] = {} + libdutil.darshan_free(buf[0]) return rec diff --git a/darshan-util/pydarshan/darshan/report.py b/darshan-util/pydarshan/darshan/report.py index 047e4d568..5676f71f2 100644 --- a/darshan-util/pydarshan/darshan/report.py +++ b/darshan-util/pydarshan/darshan/report.py @@ -268,6 +268,10 @@ def to_df(self, attach="default"): for rec in records: rec['read_segments'] = pd.DataFrame(rec['read_segments']) rec['write_segments'] = pd.DataFrame(rec['write_segments']) + if mod == 'DXT_POSIX': + rec['address_line_mapping'] = pd.DataFrame(rec['address_line_mapping']) + elif mod == 'DXT_MPIIO': + rec['address_line_mapping'] = pd.DataFrame(rec['address_line_mapping']) else: df_recs = pd.DataFrame.from_records(records) # generic records have counter and fcounter arrays to collect diff --git 
a/include/darshan-dxt-log-format.h b/include/darshan-dxt-log-format.h index b31fc9928..2468761d1 100644 --- a/include/darshan-dxt-log-format.h +++ b/include/darshan-dxt-log-format.h @@ -12,6 +12,9 @@ #define HOSTNAME_SIZE 64 +#define STACK_TRACE_BUF_SIZE 60 + +#include /* * DXT, the segment_info structure maintains detailed Segment IO tracing * information @@ -21,6 +24,9 @@ typedef struct segment_info { int64_t length; double start_time; double end_time; + void *address_array[STACK_TRACE_BUF_SIZE]; + int noStackTrace; + int size; } segment_info; #define X(a) a, diff --git a/include/darshan-log-format.h b/include/darshan-log-format.h index 4fbf37b53..e3fc9d97e 100644 --- a/include/darshan-log-format.h +++ b/include/darshan-log-format.h @@ -76,6 +76,8 @@ struct darshan_header struct darshan_log_map name_map; struct darshan_log_map mod_map[DARSHAN_MAX_MODS]; uint32_t mod_ver[DARSHAN_MAX_MODS]; + char posix_line_mapping[4096]; + char mpiio_line_mapping[4096]; }; /* job-level metadata stored for this application */