diff --git a/api_test/main.c b/api_test/main.c index 17e1582b4..431e22bbe 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -5,6 +5,7 @@ #define CMARK_NO_SHORT_NAMES #include "cmark.h" #include "node.h" +#include "parser.h" #include "harness.h" #include "cplusplus.h" @@ -90,7 +91,10 @@ static void accessors(test_batch_runner *runner) { "\n" "
html
\n" "\n" - "[link](url 'title')\n"; + "[link](url 'title')\n" + "\n" + "[foo]: /bar 'title'\n" + "\n"; cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT); @@ -140,6 +144,11 @@ static void accessors(test_batch_runner *runner) { cmark_node *string = cmark_node_first_child(link); STR_EQ(runner, cmark_node_get_literal(string), "link", "get_literal string"); + cmark_node *reference = cmark_node_next(paragraph); + STR_EQ(runner, cmark_node_get_url(reference), "/bar", "get_reference_url"); + STR_EQ(runner, cmark_node_get_title(reference), "title", "get_reference_title"); + STR_EQ(runner, cmark_node_get_label(reference), "foo", "get_reference_label"); + // Setters OK(runner, cmark_node_set_heading_level(heading, 3), "set_heading_level"); @@ -169,6 +178,10 @@ static void accessors(test_batch_runner *runner) { OK(runner, cmark_node_set_url(link, "URL"), "set_url"); OK(runner, cmark_node_set_title(link, "TITLE"), "set_title"); + OK(runner, cmark_node_set_url(reference, "URL"), "set_reference_url"); + OK(runner, cmark_node_set_title(reference, "TITLE"), "set_reference_title"); + OK(runner, cmark_node_set_label(reference, "LABEL"), "set_reference_label"); + OK(runner, cmark_node_set_literal(string, "prefix-LINK"), "set_literal string"); @@ -214,6 +227,7 @@ static void accessors(test_batch_runner *runner) { "get_fence_info error"); OK(runner, cmark_node_get_url(html) == NULL, "get_url error"); OK(runner, cmark_node_get_title(heading) == NULL, "get_title error"); + OK(runner, cmark_node_get_label(link) == NULL, "get_label error"); // Setter errors @@ -229,6 +243,7 @@ static void accessors(test_batch_runner *runner) { "set_fence_info error"); OK(runner, !cmark_node_set_url(html, "url"), "set_url error"); OK(runner, !cmark_node_set_title(heading, "title"), "set_title error"); + OK(runner, !cmark_node_set_label(link, "label"), "set_label error"); OK(runner, !cmark_node_set_heading_level(heading, 0), "set_heading_level too small"); @@ -883,6 
+898,41 @@ static void test_feed_across_line_ending(test_batch_runner *runner) { cmark_node_free(document); } +static cmark_node *S_parse_with_fake_total(bufsize_t fake_total, + const char *str, + cmark_err_type *err) { + cmark_parser *parser = cmark_parser_new(CMARK_OPT_DEFAULT); + parser->total_bytes = fake_total; + cmark_parser_feed(parser, str, strlen(str)); + cmark_node *doc = cmark_parser_finish(parser); + *err = cmark_parser_get_error(parser); + cmark_parser_free(parser); + return doc; +} + +static void test_bufsize_overflow(test_batch_runner *runner) { + cmark_node *doc; + cmark_err_type err; + + doc = S_parse_with_fake_total(BUFSIZE_MAX, "a", &err); + OK(runner, doc == NULL, "parse 1 byte after BUFSIZE_MAX bytes fails"); + INT_EQ(runner, err, CMARK_ERR_INPUT_TOO_LARGE, + "parse 1 byte after BUFSIZE_MAX bytes error code"); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 9, "0123456789", &err); + OK(runner, doc == NULL, "parse 10 byte after BUFSIZE_MAX-9 bytes fails"); + INT_EQ(runner, err, CMARK_ERR_INPUT_TOO_LARGE, + "parse 10 byte after BUFSIZE_MAX-9 bytes error code"); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 1, "a", &err); + OK(runner, doc != NULL, "parse 1 byte after BUFSIZE_MAX-1 bytes"); + cmark_node_free(doc); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 10, "0123456789", &err); + OK(runner, doc != NULL, "parse 10 byte after BUFSIZE_MAX-10 bytes"); + cmark_node_free(doc); +} + int main() { int retval; test_batch_runner *runner = test_batch_runner_new(); @@ -908,6 +958,7 @@ int main() { test_cplusplus(runner); test_safe(runner); test_feed_across_line_ending(runner); + test_bufsize_overflow(runner); test_print_summary(runner); retval = test_ok(runner) ? 
0 : 1; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f52ded6ca..b9220af0f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ set(HEADERS houdini.h cmark_ctype.h render.h + source_map.h ) set(LIBRARY_SOURCES cmark.c @@ -40,6 +41,7 @@ set(LIBRARY_SOURCES houdini_html_e.c houdini_html_u.c cmark_ctype.c + source_map.c ${HEADERS} ) @@ -54,6 +56,9 @@ include_directories(. ${CMAKE_CURRENT_BINARY_DIR}) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmark_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/cmark_version.h) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmark-format.in + ${CMAKE_CURRENT_BINARY_DIR}/cmark-format) + include (GenerateExportHeader) add_executable(${PROGRAM} ${PROGRAM_SOURCES}) diff --git a/src/blocks.c b/src/blocks.c index 99dd08265..4abe7d3c4 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -28,6 +28,10 @@ #define MIN(x, y) ((x < y) ? x : y) #endif +#ifndef MAX +#define MAX(x, y) ((x > y) ? x : y) +#endif + #define peek_at(i, n) (i)->data[n] static bool S_last_line_blank(const cmark_node *node) { @@ -92,7 +96,10 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) { parser->refmap = cmark_reference_map_new(mem); parser->root = document; parser->current = document; + parser->error_code = CMARK_ERR_NONE; + parser->total_bytes = 0; parser->line_number = 0; + parser->line_offset = 0; parser->offset = 0; parser->column = 0; parser->first_nonspace = 0; @@ -104,6 +111,9 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) { parser->options = options; parser->last_buffer_ended_with_cr = false; + if (options & CMARK_OPT_SOURCEPOS) + parser->source_map = source_map_new(mem); + return parser; } @@ -116,6 +126,7 @@ void cmark_parser_free(cmark_parser *parser) { cmark_mem *mem = parser->mem; cmark_strbuf_free(&parser->curline); cmark_strbuf_free(&parser->linebuf); + source_map_free(parser->source_map); cmark_reference_map_free(parser->refmap); mem->free(parser); } @@ -255,18 +266,28 @@ static 
cmark_node *finalize(cmark_parser *parser, cmark_node *b) { switch (S_type(b)) { case CMARK_NODE_PARAGRAPH: + source_map_start_cursor(parser->source_map, parser->last_paragraph_extent); while (cmark_strbuf_at(node_content, 0) == '[' && (pos = cmark_parse_reference_inline(parser->mem, node_content, - parser->refmap))) { - + parser->refmap, b, + parser->source_map))) { + source_map_start_cursor(parser->source_map, + source_map_get_cursor(parser->source_map)); cmark_strbuf_drop(node_content, pos); } + + while (parser->last_paragraph_extent != source_map_get_cursor(parser->source_map)) { + if (parser->last_paragraph_extent->node == b) { + parser->last_paragraph_extent->node = parser->root; + } + parser->last_paragraph_extent = parser->last_paragraph_extent->next; + } + if (is_blank(node_content, 0)) { // remove blank node (former reference def) cmark_node_free(b); } break; - case CMARK_NODE_CODE_BLOCK: if (!b->as.code.fenced) { // indented code remove_trailing_blank_lines(node_content); @@ -361,21 +382,36 @@ static cmark_node *add_child(cmark_parser *parser, cmark_node *parent, // Walk through node and all children, recursively, parsing // string content into inline content where appropriate. 
-static void process_inlines(cmark_mem *mem, cmark_node *root, - cmark_reference_map *refmap, int options) { - cmark_iter *iter = cmark_iter_new(root); +static void process_inlines(cmark_parser *parser) { + cmark_iter *iter = cmark_iter_new(parser->root); cmark_node *cur; cmark_event_type ev_type; + cmark_source_extent *cur_extent = NULL; + + cur_extent = source_map_get_head(parser->source_map); while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { cur = cmark_iter_get_node(iter); if (ev_type == CMARK_EVENT_ENTER) { if (contains_inlines(S_type(cur))) { - cmark_parse_inlines(mem, cur, refmap, options); + while (cur_extent && cur_extent->node != cur) { + cur_extent = source_map_stitch_extent(parser->source_map, cur_extent, parser->root, cur, parser->line_offset); + } + + if (parser->source_map) + assert(cur_extent); + + source_map_start_cursor(parser->source_map, cur_extent); + + cmark_parse_inlines(parser->mem, cur, parser->refmap, parser->options, parser->source_map, parser->line_offset); } } } + while (cur_extent) { + cur_extent = source_map_stitch_extent(parser->source_map, cur_extent, parser->root, NULL, parser->line_offset); + } + cmark_iter_free(iter); } @@ -482,7 +518,10 @@ static cmark_node *finalize_document(cmark_parser *parser) { } finalize(parser, parser->root); - process_inlines(parser->mem, parser->root, parser->refmap, parser->options); + + process_inlines(parser); + + assert(source_map_check(parser->source_map, parser->line_offset)); return parser->root; } @@ -524,8 +563,23 @@ void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len) { static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, size_t len, bool eof) { const unsigned char *end = buffer + len; + const unsigned char *skipped; static const uint8_t repl[] = {239, 191, 189}; + if (parser->error_code) { + return; + } + + // Limit maximum document size to BUFSIZE_MAX. This makes sure that we + // never create strbufs larger than BUFSIZE_MAX. 
Oversized input + // is rejected by recording CMARK_ERR_INPUT_TOO_LARGE in the parser; all + // further input is then ignored. + if (len > (size_t)(BUFSIZE_MAX - parser->total_bytes)) { + parser->error_code = CMARK_ERR_INPUT_TOO_LARGE; + return; + } + parser->total_bytes += (bufsize_t)len; + if (parser->last_buffer_ended_with_cr && *buffer == '\n') { // skip NL if last buffer ended with CR ; see #117 buffer++; @@ -534,6 +588,7 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, while (buffer < end) { const unsigned char *eol; bufsize_t chunk_len; + bufsize_t linebuf_size = 0; bool process = false; for (eol = buffer; eol < end; ++eol) { if (S_is_line_end_char(*eol)) { @@ -551,6 +606,7 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, chunk_len = (eol - buffer); if (process) { if (parser->linebuf.size > 0) { + linebuf_size = cmark_strbuf_len(&parser->linebuf); cmark_strbuf_put(&parser->linebuf, buffer, chunk_len); S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size); cmark_strbuf_clear(&parser->linebuf); @@ -569,6 +625,8 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, } buffer += chunk_len; + skipped = buffer; + if (buffer < end) { if (*buffer == '\0') { // skip over NULL @@ -584,6 +642,11 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, buffer++; } } + chunk_len += buffer - skipped; + chunk_len += linebuf_size; + + if (process) + parser->line_offset += chunk_len; } } @@ -643,11 +706,13 @@ static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) { // indicates a number of columns; otherwise, a number of bytes. // If advancing a certain number of columns partially consumes // a tab character, parser->partially_consumed_tab is set to true. 
-static void S_advance_offset(cmark_parser *parser, cmark_chunk *input, - bufsize_t count, bool columns) { +static void S_advance_offset(cmark_parser *parser, cmark_node *container, cmark_extent_type type, + cmark_chunk *input, bufsize_t count, bool columns) { char c; int chars_to_tab; int chars_to_advance; + int initial_pos = parser->offset + parser->line_offset; + while (count > 0 && (c = peek_at(input, parser->offset))) { if (c == '\t') { chars_to_tab = TAB_STOP - (parser->column % TAB_STOP); @@ -670,6 +735,8 @@ static void S_advance_offset(cmark_parser *parser, cmark_chunk *input, count -= 1; } } + + source_map_append_extent(parser->source_map, initial_pos, parser->offset + parser->line_offset, container, type); } static bool S_last_child_is_open(cmark_node *container) { @@ -677,7 +744,7 @@ static bool S_last_child_is_open(cmark_node *container) { (container->last_child->flags & CMARK_NODE__OPEN); } -static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) { +static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input, cmark_node *container) { bool res = false; bufsize_t matched = 0; @@ -685,10 +752,10 @@ static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) { parser->indent <= 3 && peek_at(input, parser->first_nonspace) == '>'; if (matched) { - S_advance_offset(parser, input, parser->indent + 1, true); + S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, parser->indent + 1, true); if (S_is_space_or_tab(peek_at(input, parser->offset))) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, 1, true); } res = true; @@ -702,7 +769,7 @@ static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input, if (parser->indent >= container->as.list.marker_offset + container->as.list.padding) { - S_advance_offset(parser, input, container->as.list.marker_offset + + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, 
container->as.list.marker_offset + container->as.list.padding, true); res = true; @@ -710,7 +777,7 @@ static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input, // if container->first_child is NULL, then the opening line // of the list item was blank after the list marker; in this // case, we are done with the list item. - S_advance_offset(parser, input, parser->first_nonspace - parser->offset, + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset, false); res = true; } @@ -724,10 +791,10 @@ static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input, if (!container->as.code.fenced) { // indented if (parser->indent >= CODE_INDENT) { - S_advance_offset(parser, input, CODE_INDENT, true); + S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, CODE_INDENT, true); res = true; } else if (parser->blank) { - S_advance_offset(parser, input, parser->first_nonspace - parser->offset, + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset, false); res = true; } @@ -743,14 +810,15 @@ static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input, // closing fence - and since we're at // the end of a line, we can stop processing it: *should_continue = false; - S_advance_offset(parser, input, matched, false); + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace, false); + S_advance_offset(parser, container, CMARK_EXTENT_CLOSER, input, parser->offset + matched, false); parser->current = finalize(parser, container); } else { // skip opt. 
spaces of fence parser->offset int i = container->as.code.fence_offset; while (i > 0 && S_is_space_or_tab(peek_at(input, parser->offset))) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, 1, true); i--; } res = true; @@ -807,7 +875,7 @@ static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input, switch (cont_type) { case CMARK_NODE_BLOCK_QUOTE: - if (!parse_block_quote_prefix(parser, input)) + if (!parse_block_quote_prefix(parser, input, container)) goto done; break; case CMARK_NODE_ITEM: @@ -867,29 +935,26 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, indented = parser->indent >= CODE_INDENT; if (!indented && peek_at(input, parser->first_nonspace) == '>') { + *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE, + parser->first_nonspace + 1); - bufsize_t blockquote_startpos = parser->first_nonspace; - - S_advance_offset(parser, input, + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, parser->first_nonspace + 1 - parser->offset, false); // optional following character if (S_is_space_or_tab(peek_at(input, parser->offset))) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true); } - *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE, - blockquote_startpos + 1); } else if (!indented && (matched = scan_atx_heading_start( input, parser->first_nonspace))) { bufsize_t hashpos; int level = 0; - bufsize_t heading_startpos = parser->first_nonspace; - S_advance_offset(parser, input, + *container = add_child(parser, *container, CMARK_NODE_HEADING, + parser->first_nonspace + 1); + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, parser->first_nonspace + matched - parser->offset, false); - *container = add_child(parser, *container, CMARK_NODE_HEADING, - heading_startpos + 1); hashpos = cmark_chunk_strchr(input, '#', parser->first_nonspace); @@ 
-911,7 +976,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, (*container)->as.code.fence_offset = (int8_t)(parser->first_nonspace - parser->offset); (*container)->as.code.info = cmark_chunk_literal(""); - S_advance_offset(parser, input, + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, parser->first_nonspace + matched - parser->offset, false); @@ -931,14 +996,14 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, (*container)->type = (uint16_t)CMARK_NODE_HEADING; (*container)->as.heading.level = lev; (*container)->as.heading.setext = true; - S_advance_offset(parser, input, input->len - 1 - parser->offset, false); + S_advance_offset(parser, *container, CMARK_EXTENT_CLOSER, input, input->len - 1 - parser->offset, false); } else if (!indented && !(cont_type == CMARK_NODE_PARAGRAPH && !all_matched) && (matched = scan_thematic_break(input, parser->first_nonspace))) { // it's only now that we know the line is not part of a setext heading: *container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK, parser->first_nonspace + 1); - S_advance_offset(parser, input, input->len - 1 - parser->offset, false); + S_advance_offset(parser, *container, CMARK_EXTENT_CONTENT, input, input->len - 1 - parser->offset, false); } else if ((!indented || cont_type == CMARK_NODE_LIST) && (matched = parse_list_marker( parser->mem, input, parser->first_nonspace, @@ -946,20 +1011,37 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, // Note that we can have new list items starting with >= 4 // spaces indent, as long as the list container is still open. 
+ cmark_node *list = NULL; + cmark_node *item = NULL; + cmark_source_extent *save_source_map_tail; int i = 0; + if (cont_type != CMARK_NODE_LIST || + !lists_match(&((*container)->as.list), data)) { + *container = add_child(parser, *container, CMARK_NODE_LIST, + parser->first_nonspace + 1); + list = *container; + + } + + // add the list item + *container = add_child(parser, *container, CMARK_NODE_ITEM, + parser->first_nonspace + 1); + item = *container; + // compute padding: - S_advance_offset(parser, input, + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, parser->first_nonspace + matched - parser->offset, false); save_partially_consumed_tab = parser->partially_consumed_tab; save_offset = parser->offset; save_column = parser->column; + save_source_map_tail = source_map_get_tail(parser->source_map); while (parser->column - save_column <= 5 && S_is_space_or_tab(peek_at(input, parser->offset))) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true); } i = parser->column - save_column; @@ -969,9 +1051,14 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, data->padding = matched + 1; parser->offset = save_offset; parser->column = save_column; + if (save_source_map_tail) { + cmark_source_extent *tmp_extent; + for (tmp_extent = save_source_map_tail->next; tmp_extent; tmp_extent = source_map_free_extent(parser->source_map, tmp_extent)); + } + parser->partially_consumed_tab = save_partially_consumed_tab; if (i > 0) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true); } } else { data->padding = matched + i; @@ -982,22 +1069,14 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, data->marker_offset = parser->indent; - if (cont_type != CMARK_NODE_LIST || - !lists_match(&((*container)->as.list), data)) { - *container = add_child(parser, *container, CMARK_NODE_LIST, - 
parser->first_nonspace + 1); - - memcpy(&((*container)->as.list), data, sizeof(*data)); - } - - // add the list item - *container = add_child(parser, *container, CMARK_NODE_ITEM, - parser->first_nonspace + 1); /* TODO: static */ - memcpy(&((*container)->as.list), data, sizeof(*data)); + if (list) + memcpy(&(list->as.list), data, sizeof(*data)); + if (item) + memcpy(&(item->as.list), data, sizeof(*data)); + parser->mem->free(data); } else if (indented && !maybe_lazy && !parser->blank) { - S_advance_offset(parser, input, CODE_INDENT, true); *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK, parser->offset + 1); (*container)->as.code.fenced = false; @@ -1006,6 +1085,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, (*container)->as.code.fence_offset = 0; (*container)->as.code.info = cmark_chunk_literal(""); + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, CODE_INDENT, true); } else { break; } @@ -1070,6 +1150,11 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container, } if (S_type(container) == CMARK_NODE_CODE_BLOCK) { + source_map_append_extent(parser->source_map, + parser->offset + parser->line_offset, + parser->line_offset + input->len, + container, + CMARK_EXTENT_CONTENT); add_line(container, input, parser); } else if (S_type(container) == CMARK_NODE_HTML_BLOCK) { add_line(container, input, parser); @@ -1106,26 +1191,48 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container, break; } + source_map_append_extent(parser->source_map, + parser->offset + parser->line_offset, + parser->line_offset + input->len, + container, + CMARK_EXTENT_CONTENT); + if (matches_end_condition) { container = finalize(parser, container); assert(parser->current != NULL); } } else if (parser->blank) { - // ??? 
do nothing + source_map_append_extent(parser->source_map, + parser->line_offset + parser->offset, + parser->line_offset + input->len, + container, + CMARK_EXTENT_BLANK); } else if (accepts_lines(S_type(container))) { + bufsize_t initial_len = input->len; + bool chopped = false; + if (S_type(container) == CMARK_NODE_HEADING && container->as.heading.setext == false) { chop_trailing_hashtags(input); + chopped = true; } - S_advance_offset(parser, input, parser->first_nonspace - parser->offset, + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset, false); add_line(container, input, parser); + + if (chopped) + source_map_append_extent(parser->source_map, + MAX(parser->line_offset + parser->offset, parser->line_offset + input->len), + parser->line_offset + initial_len, + container, + CMARK_EXTENT_CLOSER); } else { // create paragraph container for line container = add_child(parser, container, CMARK_NODE_PARAGRAPH, parser->first_nonspace + 1); - S_advance_offset(parser, input, parser->first_nonspace - parser->offset, + S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, parser->first_nonspace - parser->offset, false); + parser->last_paragraph_extent = source_map_get_tail(parser->source_map); add_line(container, input, parser); } @@ -1187,17 +1294,23 @@ static void S_process_line(cmark_parser *parser, const unsigned char *buffer, cmark_node *cmark_parser_finish(cmark_parser *parser) { if (parser->linebuf.size) { S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size); + parser->line_offset += parser->linebuf.size; cmark_strbuf_clear(&parser->linebuf); } + cmark_strbuf_clear(&parser->curline); + + if (parser->error_code) { + cmark_node_free(parser->root); + return NULL; + } + finalize_document(parser); if (parser->options & CMARK_OPT_NORMALIZE) { cmark_consolidate_text_nodes(parser->root); } - cmark_strbuf_free(&parser->curline); - #if CMARK_DEBUG_NODES if (cmark_node_check(parser->root, stderr)) { 
abort(); @@ -1205,3 +1318,32 @@ cmark_node *cmark_parser_finish(cmark_parser *parser) { #endif return parser->root; } + +cmark_source_extent * +cmark_parser_get_first_source_extent(cmark_parser *parser) +{ + return source_map_get_head(parser->source_map); +} + +cmark_err_type cmark_parser_get_error(cmark_parser *parser) { + return parser->error_code; +} + +const char *cmark_parser_get_error_message(cmark_parser *parser) { + const char *str = NULL; + + switch (parser->error_code) { + case CMARK_ERR_OUT_OF_MEMORY: + str = "Out of memory"; + break; + case CMARK_ERR_INPUT_TOO_LARGE: + str = "Input too large"; + break; + default: + str = "Unknown error"; + break; + } + + return str; +} + diff --git a/src/buffer.c b/src/buffer.c index a6754b64f..9a9e9adcc 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -33,6 +33,11 @@ void cmark_strbuf_init(cmark_mem *mem, cmark_strbuf *buf, } static CMARK_INLINE void S_strbuf_grow_by(cmark_strbuf *buf, bufsize_t add) { + // Safety check for overflow. + if (add > BUFSIZE_MAX - buf->size) { + fprintf(stderr, "Internal cmark_strbuf overflow"); + abort(); + } cmark_strbuf_grow(buf, buf->size + add); } @@ -42,18 +47,25 @@ void cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size) { if (target_size < buf->asize) return; - if (target_size > (bufsize_t)(INT32_MAX / 2)) - abort(); - - /* Oversize the buffer by 50% to guarantee amortized linear time - * complexity on append operations. */ - bufsize_t new_size = target_size + target_size / 2; - new_size += 1; - new_size = (new_size + 7) & ~7; + // Oversize the buffer by 50% to guarantee amortized linear time + // complexity on append operations. + bufsize_t add = target_size / 2; + // Account for terminating NUL byte. + add += 1; + // Round up to multiple of eight. + add = (add + 7) & ~7; + + // Check for overflow but allow an additional NUL byte. 
+ if (target_size + add > BUFSIZE_MAX + 1) { + target_size = BUFSIZE_MAX + 1; + } + else { + target_size += add; + } buf->ptr = (unsigned char *)buf->mem->realloc(buf->asize ? buf->ptr : NULL, - new_size); - buf->asize = new_size; + target_size); + buf->asize = target_size; } bufsize_t cmark_strbuf_len(const cmark_strbuf *buf) { return buf->size; } diff --git a/src/buffer.h b/src/buffer.h index e8780753f..7f31a74bb 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -13,8 +13,28 @@ extern "C" { #endif +#ifndef CMARK_HUGE_DOCS + +// Maximum strbuf size without terminating NUL byte. +#define BUFSIZE_MAX (INT32_MAX - 1) + typedef int32_t bufsize_t; +#else // CMARK_HUGE_DOCS + +// This is an untested proof of concept of how to handle multi-gigabyte +// documents on 64-bit platforms at the expense of internal struct sizes. + +#ifdef PTRDIFF_MAX + #define BUFSIZE_MAX (PTRDIFF_MAX - 1) +#else + #define BUFSIZE_MAX (ptrdiff_t)((size_t)-1 / 2) +#endif + +typedef ptrdiff_t bufsize_t; + +#endif // CMARK_HUGE_DOCS + typedef struct { cmark_mem *mem; unsigned char *ptr; diff --git a/src/cmark-format.in b/src/cmark-format.in new file mode 100755 index 000000000..b1e2f53ed --- /dev/null +++ b/src/cmark-format.in @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse + +HERE = "@CMAKE_CURRENT_SOURCE_DIR@" +sys.path.append(HERE) +sys.path.append(os.path.join(HERE, os.pardir, 'wrappers')) + +from remarkor import * +from wrapper import conf + +conf.set_library_path("@CMAKE_CURRENT_BINARY_DIR@") + +if __name__=='__main__': + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument('input') + arg_parser.add_argument('--width', type=int, default=80) + args = arg_parser.parse_args() + + remarkor = Remarkor.from_filename(args.input) + res = remarkor.remark(width=args.width) + sys.stdout.write(res) diff --git a/src/cmark.c b/src/cmark.c index 0d3bc1669..da93abe21 100644 --- a/src/cmark.c +++ b/src/cmark.c @@ -24,6 +24,11 @@ static void *xrealloc(void *ptr, 
size_t size) { return new_ptr; } +void cmark_default_mem_free(void *ptr) +{ + free(ptr); +} + cmark_mem DEFAULT_MEM_ALLOCATOR = {xcalloc, xrealloc, free}; char *cmark_markdown_to_html(const char *text, size_t len, int options) { @@ -31,6 +36,9 @@ char *cmark_markdown_to_html(const char *text, size_t len, int options) { char *result; doc = cmark_parse_document(text, len, options); + if (doc == NULL) { + return NULL; + } result = cmark_render_html(doc, options); cmark_node_free(doc); diff --git a/src/cmark.h b/src/cmark.h index 6ed7eb057..322ac9601 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -22,7 +22,7 @@ extern "C" { /** Convert 'text' (assumed to be a UTF-8 encoded string with length * 'len') from CommonMark Markdown to HTML, returning a null-terminated, * UTF-8-encoded string. It is the caller's responsibility - * to free the returned buffer. + * to free the returned buffer. Returns NULL on error. */ CMARK_EXPORT char *cmark_markdown_to_html(const char *text, size_t len, int options); @@ -45,9 +45,10 @@ typedef enum { CMARK_NODE_PARAGRAPH, CMARK_NODE_HEADING, CMARK_NODE_THEMATIC_BREAK, + CMARK_NODE_REFERENCE, CMARK_NODE_FIRST_BLOCK = CMARK_NODE_DOCUMENT, - CMARK_NODE_LAST_BLOCK = CMARK_NODE_THEMATIC_BREAK, + CMARK_NODE_LAST_BLOCK = CMARK_NODE_REFERENCE, /* Inline */ CMARK_NODE_TEXT, @@ -65,6 +66,21 @@ typedef enum { CMARK_NODE_LAST_INLINE = CMARK_NODE_IMAGE, } cmark_node_type; +typedef enum { + CMARK_EXTENT_NONE, + CMARK_EXTENT_OPENER, + CMARK_EXTENT_CLOSER, + CMARK_EXTENT_BLANK, + CMARK_EXTENT_CONTENT, + CMARK_EXTENT_PUNCTUATION, + CMARK_EXTENT_LINK_DESTINATION, + CMARK_EXTENT_LINK_TITLE, + CMARK_EXTENT_LINK_LABEL, + CMARK_EXTENT_REFERENCE_DESTINATION, + CMARK_EXTENT_REFERENCE_LABEL, + CMARK_EXTENT_REFERENCE_TITLE, +} cmark_extent_type; + /* For backwards compatibility: */ #define CMARK_NODE_HEADER CMARK_NODE_HEADING #define CMARK_NODE_HRULE CMARK_NODE_THEMATIC_BREAK @@ -83,9 +99,16 @@ typedef enum { CMARK_PAREN_DELIM } cmark_delim_type; +typedef enum { + 
CMARK_ERR_NONE, + CMARK_ERR_OUT_OF_MEMORY, + CMARK_ERR_INPUT_TOO_LARGE +} cmark_err_type; + typedef struct cmark_node cmark_node; typedef struct cmark_parser cmark_parser; typedef struct cmark_iter cmark_iter; +typedef struct cmark_source_extent cmark_source_extent; /** * ## Custom memory allocator support @@ -100,6 +123,11 @@ typedef struct cmark_mem { void (*free)(void *); } cmark_mem; +/** Convenience function for bindings. + */ +CMARK_EXPORT +void cmark_default_mem_free(void *ptr); + /** * ## Creating and Destroying Nodes */ @@ -333,26 +361,36 @@ CMARK_EXPORT const char *cmark_node_get_fence_info(cmark_node *node); */ CMARK_EXPORT int cmark_node_set_fence_info(cmark_node *node, const char *info); -/** Returns the URL of a link or image 'node', or an empty string +/** Returns the URL of a link, image or reference 'node', or an empty string if no URL is set. */ CMARK_EXPORT const char *cmark_node_get_url(cmark_node *node); -/** Sets the URL of a link or image 'node'. Returns 1 on success, +/** Sets the URL of a link, image or reference 'node'. Returns 1 on success, * 0 on failure. */ CMARK_EXPORT int cmark_node_set_url(cmark_node *node, const char *url); -/** Returns the title of a link or image 'node', or an empty +/** Returns the title of a link, image or reference 'node', or an empty string if no title is set. */ CMARK_EXPORT const char *cmark_node_get_title(cmark_node *node); -/** Sets the title of a link or image 'node'. Returns 1 on success, +/** Sets the title of a link, image or reference 'node'. Returns 1 on success, * 0 on failure. */ CMARK_EXPORT int cmark_node_set_title(cmark_node *node, const char *title); +/** Returns the label of a reference 'node', or an empty + string if no label is set. + */ +CMARK_EXPORT const char *cmark_node_get_label(cmark_node *node); + +/** Sets the label of a reference 'node'. Returns 1 on success, + * 0 on failure. 
+ */ +CMARK_EXPORT int cmark_node_set_label(cmark_node *node, const char *label); + /** Returns the literal "on enter" text for a custom 'node', or an empty string if no on_enter is set. */ @@ -467,20 +505,35 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem); CMARK_EXPORT void cmark_parser_free(cmark_parser *parser); +/** Return the error code after a failed operation. + */ +CMARK_EXPORT +cmark_err_type cmark_parser_get_error(cmark_parser *parser); + +/** Return a static, human-readable message for the parser's current error code. + */ +CMARK_EXPORT +const char *cmark_parser_get_error_message(cmark_parser *parser); + /** Feeds a string of length 'len' to 'parser'. */ CMARK_EXPORT void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len); -/** Finish parsing and return a pointer to a tree of nodes. +/** Finish parsing and return a pointer to a tree of nodes or NULL on error. */ CMARK_EXPORT cmark_node *cmark_parser_finish(cmark_parser *parser); +/** Return a pointer to the first extent of the parser's source map + */ +CMARK_EXPORT +cmark_source_extent *cmark_parser_get_first_source_extent(cmark_parser *parser); + /** Parse a CommonMark document in 'buffer' of length 'len'. * Returns a pointer to a tree of nodes. The memory allocated for * the node tree should be released using 'cmark_node_free' - * when it is no longer needed. + * when it is no longer needed. Returns NULL on error. */ CMARK_EXPORT cmark_node *cmark_parse_document(const char *buffer, size_t len, int options); @@ -488,10 +541,44 @@ cmark_node *cmark_parse_document(const char *buffer, size_t len, int options); /** Parse a CommonMark document in file 'f', returning a pointer to * a tree of nodes. The memory allocated for the node tree should be * released using 'cmark_node_free' when it is no longer needed. + * Returns NULL on error. 
*/ CMARK_EXPORT cmark_node *cmark_parse_file(FILE *f, int options); +/** + * ## Source map API + */ + +/* Return the index, in bytes, of the start of this extent */ +CMARK_EXPORT +size_t cmark_source_extent_get_start(cmark_source_extent *extent); + +/* Return the index, in bytes, of the stop of this extent. This + * index is exclusive: it is not included in the extent. */ +CMARK_EXPORT +size_t cmark_source_extent_get_stop(cmark_source_extent *extent); + +/* Return the extent immediately following 'extent' */ +CMARK_EXPORT +cmark_source_extent *cmark_source_extent_get_next(cmark_source_extent *extent); + +/* Return the extent immediately preceding 'extent' */ +CMARK_EXPORT +cmark_source_extent *cmark_source_extent_get_previous(cmark_source_extent *extent); + +/* Return the node 'extent' maps to */ +CMARK_EXPORT +cmark_node *cmark_source_extent_get_node(cmark_source_extent *extent); + +/* Return the type of 'extent' */ +CMARK_EXPORT +cmark_extent_type cmark_source_extent_get_type(cmark_source_extent *extent); + +/* Return a string representation of 'extent' */ +CMARK_EXPORT +const char *cmark_source_extent_get_type_string(cmark_source_extent *extent); + /** * ## Rendering */ diff --git a/src/commonmark.c b/src/commonmark.c index b8b182068..7479079ec 100644 --- a/src/commonmark.c +++ b/src/commonmark.c @@ -336,6 +336,9 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, OUT(cmark_node_get_literal(node), allow_wrap, NORMAL); break; + case CMARK_NODE_REFERENCE: + break; + case CMARK_NODE_LINEBREAK: if (!(CMARK_OPT_HARDBREAKS & options)) { LIT(" "); diff --git a/src/html.c b/src/html.c index a680e4a50..c10dcc478 100644 --- a/src/html.c +++ b/src/html.c @@ -217,6 +217,9 @@ static int S_render_node(cmark_node *node, cmark_event_type ev_type, } break; + case CMARK_NODE_REFERENCE: + break; + case CMARK_NODE_TEXT: escape_html(html, node->as.literal.data, node->as.literal.len); break; diff --git a/src/inlines.c b/src/inlines.c index 92e79c787..d2378b53f 100644 --- 
a/src/inlines.c +++ b/src/inlines.c @@ -13,6 +13,10 @@ #include "scanners.h" #include "inlines.h" +#ifndef MIN +#define MIN(x, y) ((x < y) ? x : y) +#endif + static const char *EMDASH = "\xE2\x80\x94"; static const char *ENDASH = "\xE2\x80\x93"; static const char *ELLIPSES = "\xE2\x80\xA6"; @@ -40,6 +44,7 @@ typedef struct delimiter { unsigned char delim_char; bool can_open; bool can_close; + cmark_source_extent *extent; } delimiter; typedef struct bracket { @@ -50,6 +55,7 @@ typedef struct bracket { bool image; bool active; bool bracket_after; + cmark_source_extent *extent; } bracket; typedef struct { @@ -61,6 +67,7 @@ typedef struct { bracket *last_bracket; bufsize_t backticks[MAXBACKTICKS + 1]; bool scanned_for_backticks; + cmark_source_map *source_map; } subject; static CMARK_INLINE bool S_is_line_end_char(char c) { @@ -73,7 +80,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, static int parse_inline(subject *subj, cmark_node *parent, int options); static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, - cmark_reference_map *refmap); + cmark_reference_map *refmap, cmark_source_map *source_map); static bufsize_t subject_find_special_char(subject *subj, int options); // Create an inline with a literal string value. 
@@ -149,7 +156,7 @@ static CMARK_INLINE cmark_node *make_autolink(cmark_mem *mem, cmark_chunk url, } static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, - cmark_reference_map *refmap) { + cmark_reference_map *refmap, cmark_source_map *source_map) { int i; e->mem = mem; e->input.data = buffer->ptr; @@ -159,6 +166,7 @@ static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, e->refmap = refmap; e->last_delim = NULL; e->last_bracket = NULL; + e->source_map = source_map; for (i = 0; i <= MAXBACKTICKS; i++) { e->backticks[i] = 0; } @@ -406,6 +414,7 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open, if (delim->previous != NULL) { delim->previous->next = delim; } + delim->extent = NULL; subj->last_delim = delim; } @@ -421,11 +430,12 @@ static void push_bracket(subject *subj, bool image, cmark_node *inl_text) { b->previous_delimiter = subj->last_delim; b->position = subj->pos; b->bracket_after = false; + b->extent = NULL; subj->last_bracket = b; } // Assumes the subject has a c at the current position. 
-static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { +static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart, bool *pushed) { bufsize_t numdelims; cmark_node *inl_text; bool can_open, can_close; @@ -446,6 +456,9 @@ static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { push_delimiter(subj, c, can_open, can_close, inl_text); + *pushed = true; + } else { + *pushed = false; } return inl_text; @@ -612,6 +625,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, bufsize_t opener_num_chars = opener_inl->as.literal.len; bufsize_t closer_num_chars = closer_inl->as.literal.len; cmark_node *tmp, *tmpnext, *emph; + cmark_source_extent *tmp_extent = NULL; // calculate the actual number of characters used from this closer if (closer_num_chars < 3 || opener_num_chars < 3) { @@ -647,9 +661,30 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, } cmark_node_insert_after(opener_inl, emph); + if (subj->source_map) { + tmp_extent = closer->extent->prev; + + source_map_insert_extent(subj->source_map, + opener->extent, + opener->extent->stop - use_delims, + opener->extent->stop, + emph, + CMARK_EXTENT_OPENER); + opener->extent->stop -= use_delims; + + source_map_insert_extent(subj->source_map, + tmp_extent, + closer->extent->start, + closer->extent->start + use_delims, + emph, + CMARK_EXTENT_CLOSER); + closer->extent->start += use_delims; + } + // if opener has 0 characters, remove it and its associated inline if (opener_num_chars == 0) { cmark_node_free(opener_inl); + source_map_free_extent(subj->source_map, opener->extent); remove_delimiter(subj, opener); } @@ -659,6 +694,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, cmark_node_free(closer_inl); // remove closer from list tmp_delim = closer->next; + source_map_free_extent(subj->source_map, closer->extent); remove_delimiter(subj, closer); 
closer = tmp_delim; } @@ -883,6 +919,8 @@ static cmark_node *handle_close_bracket(subject *subj) { int found_label; cmark_node *tmp, *tmpnext; bool is_image; + bool is_inline = false; + bool is_shortcut = false; advance(subj); // advance past ] initial_pos = subj->pos; @@ -933,6 +971,7 @@ static cmark_node *handle_close_bracket(subject *subj) { title = cmark_clean_title(subj->mem, &title_chunk); cmark_chunk_free(subj->mem, &url_chunk); cmark_chunk_free(subj->mem, &title_chunk); + is_inline = true; goto match; } else { @@ -955,6 +994,7 @@ static cmark_node *handle_close_bracket(subject *subj) { cmark_chunk_free(subj->mem, &raw_label); raw_label = cmark_chunk_dup(&subj->input, opener->position, initial_pos - opener->position - 1); + is_shortcut = true; found_label = true; } @@ -984,6 +1024,31 @@ static cmark_node *handle_close_bracket(subject *subj) { cmark_node_insert_before(opener->inl_text, inl); // Add link text: tmp = opener->inl_text->next; + + if (subj->source_map) { + assert(opener->extent); + + opener->extent->node = inl; + opener->extent->type = CMARK_EXTENT_OPENER; + } + + source_map_splice_extent(subj->source_map, initial_pos - 1, initial_pos, inl, CMARK_EXTENT_PUNCTUATION); + if (is_inline) { + source_map_splice_extent(subj->source_map, after_link_text_pos, starturl, inl, CMARK_EXTENT_PUNCTUATION); + source_map_splice_extent(subj->source_map, starturl, endurl, inl, CMARK_EXTENT_LINK_DESTINATION); + if (endtitle != starttitle) { + source_map_splice_extent(subj->source_map, endurl, starttitle, inl, CMARK_EXTENT_BLANK); + source_map_splice_extent(subj->source_map, starttitle, endtitle, inl, CMARK_EXTENT_LINK_TITLE); + source_map_splice_extent(subj->source_map, endtitle, subj->pos, inl, CMARK_EXTENT_PUNCTUATION); + } else { + source_map_splice_extent(subj->source_map, endurl, subj->pos, inl, CMARK_EXTENT_PUNCTUATION); + } + } else if (!is_shortcut) { + source_map_splice_extent(subj->source_map, initial_pos, initial_pos + 1, inl, CMARK_EXTENT_PUNCTUATION); + 
source_map_splice_extent(subj->source_map, initial_pos + 1, subj->pos - 1, inl, CMARK_EXTENT_LINK_LABEL); + source_map_splice_extent(subj->source_map, subj->pos - 1, subj->pos, inl, CMARK_EXTENT_PUNCTUATION); + } + while (tmp) { tmpnext = tmp->next; cmark_node_append_child(inl, tmp); @@ -1087,6 +1152,11 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { cmark_chunk contents; unsigned char c; bufsize_t endpos; + bufsize_t startpos = subj->pos; + bufsize_t trimmed_spaces = 0; + bool add_extent_to_last_bracket = false; + bool add_extent_to_last_delimiter = false; + c = peek_char(subj); if (c == 0) { return 0; @@ -1095,6 +1165,8 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { case '\r': case '\n': new_inl = handle_newline(subj); + if (new_inl->type == CMARK_NODE_LINEBREAK) + startpos -= 2; break; case '`': new_inl = handle_backticks(subj); @@ -1112,7 +1184,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { case '_': case '\'': case '"': - new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0); + new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0, &add_extent_to_last_delimiter); break; case '-': new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0); @@ -1124,6 +1196,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { advance(subj); new_inl = make_str(subj->mem, cmark_chunk_literal("[")); push_bracket(subj, false, new_inl); + add_extent_to_last_bracket = true; break; case ']': new_inl = handle_close_bracket(subj); @@ -1134,6 +1207,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { advance(subj); new_inl = make_str(subj->mem, cmark_chunk_literal("![")); push_bracket(subj, true, new_inl); + add_extent_to_last_bracket = true; } else { new_inl = make_str(subj->mem, cmark_chunk_literal("!")); } @@ -1145,12 +1219,24 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { // if we're at 
a newline, strip trailing spaces. if (S_is_line_end_char(peek_char(subj))) { + bufsize_t initial_size = contents.len; cmark_chunk_rtrim(&contents); + trimmed_spaces = initial_size - contents.len; } new_inl = make_str(subj->mem, contents); } + if (new_inl != NULL) { + cmark_source_extent *extent; + + extent = source_map_splice_extent(subj->source_map, startpos, subj->pos - trimmed_spaces, new_inl, CMARK_EXTENT_CONTENT); + + if (add_extent_to_last_bracket) + subj->last_bracket->extent = extent; + else if (add_extent_to_last_delimiter) + subj->last_delim->extent = extent; + cmark_node_append_child(parent, new_inl); } @@ -1159,9 +1245,11 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { // Parse inlines from parent's string_content, adding as children of parent. extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, - cmark_reference_map *refmap, int options) { + cmark_reference_map *refmap, int options, + cmark_source_map *source_map, bufsize_t total_length) { subject subj; - subject_from_buf(mem, &subj, &parent->content, refmap); + subject_from_buf(mem, &subj, &parent->content, refmap, source_map); + bufsize_t initial_len = subj.input.len; cmark_chunk_rtrim(&subj.input); while (!is_eof(&subj) && parse_inline(&subj, parent, options)) @@ -1175,6 +1263,14 @@ extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, while (subj.last_bracket) { pop_bracket(&subj); } + + if (source_map) + source_map_insert_extent(source_map, + source_map->cursor, + source_map->cursor->stop, + MIN(source_map->cursor->stop + initial_len - subj.input.len, total_length), + parent, + CMARK_EXTENT_BLANK); } // Parse zero or more space characters, including at most one newline. @@ -1190,65 +1286,106 @@ static void spnl(subject *subj) { // Return 0 if no reference found, otherwise position of subject // after reference is parsed. 
bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, - cmark_reference_map *refmap) { + cmark_reference_map *refmap, + cmark_node *container, + cmark_source_map *source_map) { subject subj; + cmark_node *reference = cmark_node_new(CMARK_NODE_REFERENCE); + cmark_reference *ref; cmark_chunk lab; cmark_chunk url; cmark_chunk title; bufsize_t matchlen = 0; - bufsize_t beforetitle; + bufsize_t starttitle, endtitle; + bufsize_t endlabel; + bufsize_t starturl, endurl; - subject_from_buf(mem, &subj, input, NULL); + subject_from_buf(mem, &subj, input, NULL, source_map); // parse label: if (!link_label(&subj, &lab) || lab.len == 0) - return 0; + goto nomatch; + + endlabel = subj.pos - 1; // colon: if (peek_char(&subj) == ':') { advance(&subj); } else { - return 0; + goto nomatch; } // parse link url: spnl(&subj); + starturl = subj.pos; matchlen = manual_scan_link_url(&subj.input, subj.pos); if (matchlen > 0) { url = cmark_chunk_dup(&subj.input, subj.pos, matchlen); subj.pos += matchlen; } else { - return 0; + goto nomatch; } // parse optional link_title - beforetitle = subj.pos; + endurl = subj.pos; spnl(&subj); + starttitle = subj.pos; matchlen = scan_link_title(&subj.input, subj.pos); if (matchlen) { title = cmark_chunk_dup(&subj.input, subj.pos, matchlen); subj.pos += matchlen; } else { - subj.pos = beforetitle; + subj.pos = endurl; + starttitle = endurl; + endtitle = endurl; title = cmark_chunk_literal(""); } + endtitle = subj.pos; + // parse final spaces and newline: skip_spaces(&subj); if (!skip_line_end(&subj)) { if (matchlen) { // try rewinding before title - subj.pos = beforetitle; + subj.pos = endurl; + starttitle = endurl; + endtitle = endurl; skip_spaces(&subj); + title = cmark_chunk_literal(""); if (!skip_line_end(&subj)) { - return 0; + goto nomatch; } } else { - return 0; + goto nomatch; } } // insert reference into refmap - cmark_reference_create(refmap, &lab, &url, &title); + ref = cmark_reference_create(refmap, &lab, &url, &title); + 
+ if (ref) { + cmark_chunk_set_cstr(mem, &reference->as.reference.label, (char *) ref->label); + cmark_chunk_set_cstr(mem, &reference->as.reference.url, cmark_chunk_to_cstr(mem, &ref->url)); + cmark_chunk_set_cstr(mem, &reference->as.reference.title, cmark_chunk_to_cstr(mem, &ref->title)); + cmark_node_insert_before(container, reference); + + cmark_reference_add(refmap, ref); + } + + // Mark the extents of the reference + source_map_splice_extent(source_map, 0, 1, reference, CMARK_EXTENT_OPENER); + source_map_splice_extent(source_map, 1, endlabel, reference, CMARK_EXTENT_REFERENCE_LABEL); + source_map_splice_extent(source_map, endlabel, endlabel + 2, reference, CMARK_EXTENT_PUNCTUATION); + source_map_splice_extent(source_map, endlabel + 2, starturl, reference, CMARK_EXTENT_BLANK); + source_map_splice_extent(source_map, starturl, endurl, reference, CMARK_EXTENT_REFERENCE_DESTINATION); + source_map_splice_extent(source_map, endurl, starttitle, reference, CMARK_EXTENT_BLANK); + source_map_splice_extent(source_map, starttitle, endtitle, reference, CMARK_EXTENT_REFERENCE_TITLE); + source_map_splice_extent(source_map, endtitle, subj.pos, reference, CMARK_EXTENT_BLANK); + return subj.pos; + +nomatch: + cmark_node_free(reference); + return 0; } diff --git a/src/inlines.h b/src/inlines.h index 52be76820..ee85b87de 100644 --- a/src/inlines.h +++ b/src/inlines.h @@ -1,6 +1,10 @@ #ifndef CMARK_INLINES_H #define CMARK_INLINES_H +#include "chunk.h" +#include "references.h" +#include "source_map.h" + #ifdef __cplusplus extern "C" { #endif @@ -9,10 +13,13 @@ cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url); cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title); void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, - cmark_reference_map *refmap, int options); + cmark_reference_map *refmap, int options, + cmark_source_map *source_map, bufsize_t total_length); bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, - 
cmark_reference_map *refmap); + cmark_reference_map *refmap, + cmark_node *container, + cmark_source_map *source_map); #ifdef __cplusplus } diff --git a/src/latex.c b/src/latex.c index e78c7d916..3dd5f0798 100644 --- a/src/latex.c +++ b/src/latex.c @@ -332,6 +332,9 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, } break; + case CMARK_NODE_REFERENCE: + break; + case CMARK_NODE_TEXT: OUT(cmark_node_get_literal(node), allow_wrap, NORMAL); break; diff --git a/src/main.c b/src/main.c index 42cd8b163..aeb81de4d 100644 --- a/src/main.c +++ b/src/main.c @@ -181,6 +181,11 @@ int main(int argc, char *argv[]) { document = cmark_parser_finish(parser); cmark_parser_free(parser); + if (document == NULL) { + fprintf(stderr, "%s", cmark_parser_get_error_message(parser)); + exit(1); + } + print_document(document, writer, options, width); cmark_node_free(document); diff --git a/src/man.c b/src/man.c index 1c76f68bb..9c2b0629a 100644 --- a/src/man.c +++ b/src/man.c @@ -82,6 +82,7 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, switch (node->type) { case CMARK_NODE_DOCUMENT: + case CMARK_NODE_REFERENCE: break; case CMARK_NODE_BLOCK_QUOTE: diff --git a/src/node.c b/src/node.c index e722acf90..44fc28afa 100644 --- a/src/node.c +++ b/src/node.c @@ -65,6 +65,9 @@ static bool S_can_contain(cmark_node *node, cmark_node *child) { case CMARK_NODE_CUSTOM_INLINE: return S_is_inline(child); + case CMARK_NODE_REFERENCE: + return false; + default: break; } @@ -123,6 +126,11 @@ static void S_free_nodes(cmark_node *e) { cmark_chunk_free(NODE_MEM(e), &e->as.link.url); cmark_chunk_free(NODE_MEM(e), &e->as.link.title); break; + case CMARK_NODE_REFERENCE: + cmark_chunk_free(NODE_MEM(e), &e->as.reference.url); + cmark_chunk_free(NODE_MEM(e), &e->as.reference.title); + cmark_chunk_free(NODE_MEM(e), &e->as.reference.label); + break; case CMARK_NODE_CUSTOM_BLOCK: case CMARK_NODE_CUSTOM_INLINE: cmark_chunk_free(NODE_MEM(e), &e->as.custom.on_enter); @@ -182,6 
+190,8 @@ const char *cmark_node_get_type_string(cmark_node *node) { return "paragraph"; case CMARK_NODE_HEADING: return "heading"; + case CMARK_NODE_REFERENCE: + return "reference"; case CMARK_NODE_THEMATIC_BREAK: return "thematic_break"; case CMARK_NODE_TEXT: @@ -486,6 +496,8 @@ const char *cmark_node_get_url(cmark_node *node) { case CMARK_NODE_LINK: case CMARK_NODE_IMAGE: return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.link.url); + case CMARK_NODE_REFERENCE: + return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.reference.url); default: break; } @@ -503,6 +515,9 @@ int cmark_node_set_url(cmark_node *node, const char *url) { case CMARK_NODE_IMAGE: cmark_chunk_set_cstr(NODE_MEM(node), &node->as.link.url, url); return 1; + case CMARK_NODE_REFERENCE: + cmark_chunk_set_cstr(NODE_MEM(node), &node->as.reference.url, url); + return 1; default: break; } @@ -519,6 +534,8 @@ const char *cmark_node_get_title(cmark_node *node) { case CMARK_NODE_LINK: case CMARK_NODE_IMAGE: return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.link.title); + case CMARK_NODE_REFERENCE: + return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.reference.title); default: break; } @@ -536,6 +553,40 @@ int cmark_node_set_title(cmark_node *node, const char *title) { case CMARK_NODE_IMAGE: cmark_chunk_set_cstr(NODE_MEM(node), &node->as.link.title, title); return 1; + case CMARK_NODE_REFERENCE: + cmark_chunk_set_cstr(NODE_MEM(node), &node->as.reference.title, title); + return 1; + default: + break; + } + + return 0; +} + +const char *cmark_node_get_label(cmark_node *node) { + if (node == NULL) { + return NULL; + } + + switch (node->type) { + case CMARK_NODE_REFERENCE: + return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.reference.label); + default: + break; + } + + return NULL; +} + +int cmark_node_set_label(cmark_node *node, const char *label) { + if (node == NULL) { + return 0; + } + + switch (node->type) { + case CMARK_NODE_REFERENCE: + cmark_chunk_set_cstr(NODE_MEM(node), 
&node->as.reference.label, label); + return 1; default: break; } diff --git a/src/node.h b/src/node.h index 65d857f0b..35bd6d4aa 100644 --- a/src/node.h +++ b/src/node.h @@ -41,6 +41,12 @@ typedef struct { cmark_chunk title; } cmark_link; +typedef struct { + cmark_chunk label; + cmark_chunk url; + cmark_chunk title; +} cmark_reference_node; + typedef struct { cmark_chunk on_enter; cmark_chunk on_exit; @@ -75,6 +81,7 @@ struct cmark_node { cmark_code code; cmark_heading heading; cmark_link link; + cmark_reference_node reference; cmark_custom custom; int html_block_type; } as; diff --git a/src/parser.h b/src/parser.h index 0c5033bd2..7b4fdbc9b 100644 --- a/src/parser.h +++ b/src/parser.h @@ -2,9 +2,11 @@ #define CMARK_AST_H #include +#include "cmark.h" #include "node.h" #include "buffer.h" #include "memory.h" +#include "source_map.h" #ifdef __cplusplus extern "C" { @@ -17,6 +19,8 @@ struct cmark_parser { struct cmark_reference_map *refmap; struct cmark_node *root; struct cmark_node *current; + cmark_err_type error_code; + bufsize_t total_bytes; int line_number; bufsize_t offset; bufsize_t column; @@ -27,9 +31,12 @@ struct cmark_parser { bool partially_consumed_tab; cmark_strbuf curline; bufsize_t last_line_length; + bufsize_t line_offset; cmark_strbuf linebuf; int options; bool last_buffer_ended_with_cr; + cmark_source_map *source_map; + cmark_source_extent *last_paragraph_extent; }; #ifdef __cplusplus diff --git a/src/references.c b/src/references.c index 89f2dc8cb..c9e5d06b0 100644 --- a/src/references.c +++ b/src/references.c @@ -53,7 +53,7 @@ static unsigned char *normalize_reference(cmark_mem *mem, cmark_chunk *ref) { return result; } -static void add_reference(cmark_reference_map *map, cmark_reference *ref) { +void cmark_reference_add(cmark_reference_map *map, cmark_reference *ref) { cmark_reference *t = ref->next = map->table[ref->hash % REFMAP_SIZE]; while (t) { @@ -68,14 +68,14 @@ static void add_reference(cmark_reference_map *map, cmark_reference *ref) { 
map->table[ref->hash % REFMAP_SIZE] = ref; } -void cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, +cmark_reference *cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, cmark_chunk *url, cmark_chunk *title) { cmark_reference *ref; unsigned char *reflabel = normalize_reference(map->mem, label); /* empty reference name, or composed from only whitespace */ if (reflabel == NULL) - return; + return NULL; ref = (cmark_reference *)map->mem->calloc(1, sizeof(*ref)); ref->label = reflabel; @@ -84,7 +84,7 @@ void cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, ref->title = cmark_clean_title(map->mem, title); ref->next = NULL; - add_reference(map, ref); + return ref; } // Returns reference if refmap contains a reference with matching diff --git a/src/references.h b/src/references.h index f075bbbd9..be174533b 100644 --- a/src/references.h +++ b/src/references.h @@ -31,8 +31,9 @@ cmark_reference_map *cmark_reference_map_new(cmark_mem *mem); void cmark_reference_map_free(cmark_reference_map *map); cmark_reference *cmark_reference_lookup(cmark_reference_map *map, cmark_chunk *label); -extern void cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, - cmark_chunk *url, cmark_chunk *title); +void cmark_reference_add(cmark_reference_map *map, cmark_reference *ref); +extern cmark_reference *cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, + cmark_chunk *url, cmark_chunk *title); #ifdef __cplusplus } diff --git a/src/remarkor.py b/src/remarkor.py new file mode 100644 index 000000000..a2b7784ff --- /dev/null +++ b/src/remarkor.py @@ -0,0 +1,462 @@ +from wrapper import * +import re +from collections import defaultdict + +class RemarkorError(LibcmarkError): + pass + +def pretty_print_extents(source_map): + for extent in source_map: + print ('%d-%d %s for %s' % (extent.start, extent.stop, extent.type, type(extent.node))) + +ESCAPE_REGEX = re.compile('^' # Start of the string + '(' # The 
potential problematic pattern + '[' # Any of these characters + '#' # A heading + '|>' # Or a blockquote + '|*|+|-' # Or an unordered list start + ']' # End of single characters + '|[0-9]+[.|)]' # Ordered list start + ')' # End of the problematic pattern + '(' + '[ ]+.*' + '|$)' + ) + +ESCAPE_THEMATIC_REGEX = re.compile('^' # Start of the string + '((\*\s*){3,}|(\-\s*){3,}|(_\s*){3,})' # Either '*' or '-' or '_' 3 times or more, ws allowed + '$' # Nothing else is allowed + ) + +ESCAPE_CODE_BLOCK_REGEX = re.compile('^' # Start of the string + '(`{3,}|~{3,})' # Either '`' or `~` 3 times or more + '[^`]*' # Anything but '`' + '$' # Nothing else is allowed + ) + +ESCAPE_SETEXT_REGEX = re.compile('^' # Start of the string + '(\-+|=+)' # Either '-' or '=' one time or more + '[ ]*' # Optionally followed by 0 or more whitespace characters + '$' # Nothing else is allowed + ) + +ESCAPE_REFERENCE_DEF_REGEX = re.compile('^' # Start of the string + '\[' # Opening '[' + '.*' # Anything + '\]' # Closing ']' + ':' # Literal ':' + '.*' # Consume the remainder + ) + +def build_reverse_source_map(source_map): + rmap = defaultdict(list) + for ex in source_map: + rmap[ex.node].append(ex) + return rmap + +class Remarkor: + def __init__(self, contents): + self.dump_context = None + if type(contents) == str: + self.source = contents.encode('utf8') + else: + assert type(contents) == bytes + self.source = contents + + def remark(self, width=80, validate=True): + self.__reset(width) + + self.__dump(self.root_node, '') + self.need_cr = 1 + self.__flush('', '') + + res = '\n'.join(self.result) + + if validate: + self.__validate(res) + + return res + + @staticmethod + def from_filename(filename): + with open(filename, 'rb') as _: + contents = _.read() + return Remarkor(contents) + + def __reset(self, width): + self.parser = Parser(options=Parser.OPT_SOURCEPOS) + self.parser.feed(self.source) + self.root_node = self.parser.finish() + self.source_map = self.parser.get_source_map() + self.rmap = 
build_reverse_source_map(self.source_map) + + # List of lines + self.result = [''] + # Number of new lines to insert before flushing new content + self.need_cr = 0 + # Whether to insert 1 or 2 new lines before the next item + self.in_tight_list = False + # Workaround for indented lists, which are not reliably breakable by + # any block (in particular indented code) + # FIXME: Ask why this case is even part of the spec, because afaiu it's just broken + self.break_out_of_list = False + # Maximum number of columns + self.width = width + # Whether flush operations can break lines + self.flush_can_break = True + # The offset in the last line to check escape from + self.last_line_content_offset = 0 + # If we break the line when rendering this node, escape the last character + self.escape_link_if_breaking = None + # Do not try to escape anything + self.no_escape = False + # Do not try to escape html blocks, type link + self.no_escape_html_block = False + + def __normalize_texts(self, node): + if type(node) == Text: + node.literal = ' '.join(node.literal.split()) + if not node.literal: + node.unlink() + for c in node: + self.__normalize_texts(c) + + def __strip_blanks(self, node): + if type(node) == SoftBreak: + node.insert_after(Text(literal=' ')) + node.unlink() + return None + elif type(node) == HtmlBlock: + if node.literal.strip() == "": + node.unlink() + for c in node: + self.__strip_blanks(c) + + # This method compares the result with the original AST, stripping + # all blank nodes, all html end-list workaround blocks, and + # consolidating and normalizing text nodes. 
+ def __validate(self, res): + parser = Parser() + parser.feed(res) + new_root_node = parser.finish() + + self.__strip_blanks(self.root_node) + self.__strip_blanks(new_root_node) + self.root_node.consolidate_text_nodes() + new_root_node.consolidate_text_nodes() + self.__normalize_texts(self.root_node) + self.__normalize_texts(new_root_node) + if self.root_node.to_xml() != new_root_node.to_xml(): + raise RemarkorError('Refactoring changed the AST !') + + def __utf8(self, start, stop): + return self.source[start:stop].decode('utf8') + + def __get_extent_utf8(self, extent): + if extent: + return self.__utf8(extent.start, extent.stop) + return '' + + def __get_closer_utf8(self, node): + for ex in reversed(self.rmap[node]): + if ex.type == ExtentType.CLOSER: + return self.__get_extent_utf8(ex) + return '' + return self.__get_extent_utf8(self.get_closer(node)) + + def __get_opener_utf8(self, node): + for ex in self.rmap[node]: + if ex.type == ExtentType.OPENER: + return self.__get_extent_utf8(ex) + return '' + + def __breakup_contents(self, node): + skip_next_ws = False + token = '' + extents = self.rmap[node] + + is_text = type(node) is Text + is_escaped = False + + if is_text: + while node.next: + node = node.next + if type(node) is not Text: + break + extents += self.rmap[node] + self.rmap[node] = [] + + def sanitize(token): + if is_text: + if type(node) is Link and re.match('.*\[.*\]$', token): + self.escape_link_if_breaking = node + return token + + for ex in extents: + if ex.type != ExtentType.CONTENT: + continue + for c in self.__utf8(ex.start, ex.stop): + if c == ' ' and not is_escaped: + if token: + yield token + token = '' + if not skip_next_ws: + yield ' ' + skip_next_ws = True + else: + token += c + skip_next_ws = False + if c == '\\': + is_escaped = not is_escaped + else: + is_escaped = False + if token: + yield sanitize(token) + + def __blankline(self): + self.need_cr = 2 + + def __cr(self): + self.need_cr = max(self.need_cr, 1) + + def 
__check_escape(self): + if self.no_escape: + self.no_escape = False + return + + prefix = self.result[-1][:self.last_line_content_offset] + unprefixed = self.result[-1][self.last_line_content_offset:] + m = re.match(ESCAPE_REGEX, unprefixed) + if (m): + try: + first_space = unprefixed.index(' ') + except ValueError: + first_space = len(unprefixed) + self.result[-1] = '%s%s\\%s' % (prefix, + unprefixed[0:first_space - 1], + unprefixed[first_space - 1:]) + return + + m = re.match(ESCAPE_THEMATIC_REGEX, unprefixed) + if (m): + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + return + + m = re.match(ESCAPE_CODE_BLOCK_REGEX, unprefixed) + if (m): + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + return + + m = re.match(ESCAPE_SETEXT_REGEX, unprefixed) + if (m): + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + return + + m = re.match(ESCAPE_REFERENCE_DEF_REGEX, unprefixed) + if (m): + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + return + + # FIXME: certainly very expensive, but as we make it so + # html inlines can never start a line, it is at least + # safe and correct + if not self.no_escape_html_block: + root_node = parse_document(unprefixed) + if type(root_node.first_child) in [HtmlBlock, Reference]: + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + self.no_escape_html_block = False + + def __check_prefix(self, prefix): + if not self.result[-1]: + self.result[-1] = prefix + self.last_line_content_offset = len(prefix) + + def __flush(self, prefix, utf8, escape_if_breaking=0): + if self.in_tight_list: + self.need_cr = min(self.need_cr, 1) + + while self.need_cr: + self.__check_prefix(prefix) + self.__check_escape() + self.result.append('') + self.need_cr -= 1 + + self.__check_prefix(prefix) + + if (utf8 and + self.flush_can_break and + len(self.result[-1]) > self.last_line_content_offset and + len(self.result[-1]) + len(utf8) >= self.width): + self.result[-1] = self.result[-1].rstrip(' ') + self.__check_escape() + if escape_if_breaking: + 
self.result[-1] = "%s\\%s" % (self.result[-1][:escape_if_breaking], + self.result[-1][escape_if_breaking:]) + self.result.append('') + self.__check_prefix(prefix) + if utf8 == ' ': + return + + self.result[-1] += utf8 + + def __dump(self, node, prefix): + old_in_tight_list = self.in_tight_list + old_break_out_of_list = self.break_out_of_list + old_flush_can_break = self.flush_can_break + + opener_utf8 = self.__get_opener_utf8(node).strip() + + if type(node) is BlockQuote: + self.__flush(prefix, opener_utf8 + ' ') + self.last_line_content_offset = len(opener_utf8 + ' ' + prefix) + prefix += opener_utf8 + ' ' + elif type(node) is Heading: + self.flush_can_break = False + if (opener_utf8): + self.__flush(prefix, opener_utf8 + ' ') + self.no_escape = True + elif type(node) is Item: + opener_utf8_with_blank = '' + last_stop = -1 + # Very awkward, see list item indentation tests + for ex in self.rmap[node]: + if last_stop != -1 and ex.start != last_stop: + break + last_stop = ex.stop + opener_utf8_with_blank += self.__get_extent_utf8(ex) + if len(opener_utf8_with_blank) > 4: + self.break_out_of_list = True + + self.__flush(prefix, opener_utf8 + ' ') + self.last_line_content_offset = len(opener_utf8 + ' ' + prefix) + # Only setting here to make sure the call to flush was made with + # the right tightness. 
+ self.in_tight_list = node.parent.tight + prefix += (len(opener_utf8) + 1) * ' ' + elif type(node) in [CodeBlock, HtmlBlock]: + self.flush_can_break = False + for ex in self.rmap[node]: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8.rstrip('\r\n')) + self.no_escape = True + # Make sure to prefix the next line + if utf8.endswith('\n'): + self.__cr() + self.flush_can_break = old_flush_can_break + self.__blankline() + elif type(node) is ThematicBreak: + self.flush_can_break = False + utf8 = ' '.join(self.__breakup_contents(node)).rstrip('\r\n') + self.__flush(prefix, utf8) + self.no_escape = True + # Make sure to prefix the next line + if utf8.endswith('\n'): + self.__cr() + self.flush_can_break = old_flush_can_break + self.__blankline() + elif type(node) is Reference: + self.flush_can_break = False + for ex in self.rmap[node]: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8) + if ex.type == ExtentType.OPENER: + self.no_escape = True + self.flush_can_break = old_flush_can_break + if type(node.next) is Reference: # Keep reference lists tight + self.__cr() + else: + self.__blankline() + elif type(node) is Text: + for word in self.__breakup_contents(node): + self.__flush(prefix, word) + elif type(node) is SoftBreak: + self.__flush(prefix, ' ') + elif type(node) is LineBreak: + self.flush_can_break = False + content = ''.join([self.__get_extent_utf8(ex).rstrip('\r\n') for ex in self.rmap[node]]) + # Keep the source hardbreak style + if '\\' in content: + self.__flush(prefix, content) + else: + self.__flush(prefix, ' ') + self.flush_can_break = old_flush_can_break + self.__cr() + elif type(node) in [Emph, Strong]: + self.__flush(prefix, opener_utf8) + if self.result[-1] == prefix + opener_utf8: + self.no_escape = True + self.flush_can_break = False + elif type(node) in [Link, Image]: + if self.escape_link_if_breaking == node: + self.__flush(prefix, opener_utf8, escape_if_breaking=-1) + else: + self.__flush(prefix, opener_utf8) + if 
self.result[-1] == prefix + opener_utf8: + self.no_escape = True + elif type(node) is HtmlInline: + self.flush_can_break = False + for ex in self.rmap[node]: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8.rstrip('\r\n')) + if utf8.endswith('\n'): + self.__cr() + self.flush_can_break = old_flush_can_break + elif type(node) is Code: + for ex in self.rmap[node]: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8.rstrip('\r\n')) + if utf8.endswith('\n'): + self.__cr() + + for child in node: + tmp_flush_can_break = self.flush_can_break + tmp_node = child + + # See __breakup_contents + while type(tmp_node) is Text and type(tmp_node.next) is Text: + tmp_node = tmp_node.next + + if type(tmp_node.next) is HtmlInline or type(tmp_node.previous) is HtmlInline: + self.flush_can_break = False + self.__dump(child, prefix) + self.flush_can_break = tmp_flush_can_break + + if type(node) in [Emph, Strong]: + self.__flush(prefix, self.__get_closer_utf8(node).rstrip('\r\n')) + self.flush_can_break = old_flush_can_break + elif type(node) is List: + self.in_tight_list = old_in_tight_list + if self.break_out_of_list: + self.__cr() + self.__flush(prefix, "") + self.no_escape = True + self.__cr() + self.break_out_of_list = old_break_out_of_list + elif type(node) is Heading: + for ex in self.rmap[node]: + if ex.type != ExtentType.OPENER: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8.rstrip('\r\n')) + self.no_escape = True + if utf8.endswith('\n'): + self.__cr() + self.flush_can_break = old_flush_can_break + elif type(node) in [Link, Image]: + for ex in self.rmap[node]: + if ex.type != ExtentType.OPENER: + self.flush_can_break = old_flush_can_break + utf8 = self.__get_extent_utf8(ex).strip(' \r\n') + if ex.type == ExtentType.PUNCTUATION and prev_extent.type == ExtentType.PUNCTUATION: + self.flush_can_break = False + elif ex.type == ExtentType.LINK_TITLE: + self.__flush(prefix, ' ') + if ex.type != ExtentType.BLANK: + self.__flush(prefix, utf8) + 
if ex.type == ExtentType.LINK_DESTINATION: + if self.result[-1] == prefix + utf8 and re.match('^<.*>$', utf8): + self.no_escape_html_block = True + prev_extent = ex + self.flush_can_break = old_flush_can_break + + if type(node) in [Paragraph, List, BlockQuote, Item, Heading, Document]: + self.__blankline() diff --git a/src/source_map.c b/src/source_map.c new file mode 100644 index 000000000..754c5bb6c --- /dev/null +++ b/src/source_map.c @@ -0,0 +1,344 @@ +#include + +#include "source_map.h" +/* Allocate a zero-initialized source map with mem->calloc and keep mem for later allocations and frees. */ +cmark_source_map * +source_map_new(cmark_mem *mem) +{ + cmark_source_map *res = (cmark_source_map *) mem->calloc(1, sizeof(cmark_source_map)); + res->mem = mem; + return res; +} +/* Free every extent of the map and then the map itself; a NULL map is ignored. */ +void +source_map_free(cmark_source_map *self) +{ + if (!self) + return; +/* The loop body is empty on purpose: source_map_free_extent releases the current extent and returns the next one. */ + cmark_source_extent *tmp; + for (tmp = self->head; tmp; tmp = source_map_free_extent(self, tmp)); + self->mem->free(self); +} +/* Append a new extent spanning start to stop for node at the tail of the list. Asserts that start <= stop and that the new extent does not begin before the current tail stops. Returns the new extent, or NULL when self is NULL. */ +cmark_source_extent * +source_map_append_extent(cmark_source_map *self, bufsize_t start, bufsize_t stop, cmark_node *node, cmark_extent_type type) +{ + if (!self) + return NULL; + + assert (start <= stop); + assert (!self->tail || self->tail->stop <= start); + + cmark_source_extent *res = (cmark_source_extent *) self->mem->calloc(1, sizeof(cmark_source_extent)); + + res->start = start; + res->stop = stop; + res->node = node; + res->type = type; + + res->next = NULL; + res->prev = self->tail; + + if (!self->head) + self->head = res; + else + self->tail->next = res; + + self->tail = res; + + return res; +} +/* Link a new extent right after previous, updating the tail when it becomes the last element. An empty range (start == stop) inserts nothing and returns previous. Returns NULL when self is NULL, otherwise the new extent. previous is dereferenced for any non-empty range. */ +cmark_source_extent * +source_map_insert_extent(cmark_source_map *self, cmark_source_extent *previous, + bufsize_t start, bufsize_t stop, cmark_node *node, cmark_extent_type type) +{ + if (!self) + return NULL; + + if (start == stop) + return previous; + + cmark_source_extent *extent = (cmark_source_extent *) self->mem->calloc(1, sizeof(cmark_source_extent)); + + extent->start = start; + extent->stop = stop; + extent->node = node; + extent->type = type; + extent->next = previous->next; + 
extent->prev = previous; + previous->next = extent; + + if (extent->next) + extent->next->prev = extent; + else + self->tail = extent; + + return extent; +} +/* Unlink extent from the list, repair head, tail, cursor and next_cursor if they pointed at it, free it with the map allocator and return the extent that followed it (NULL if it was last or if self is NULL). */ +cmark_source_extent * +source_map_free_extent(cmark_source_map *self, cmark_source_extent *extent) +{ + if (!self) + return NULL; + + cmark_source_extent *next = extent->next; + + if (extent->prev) + extent->prev->next = next; + + if (extent->next) + extent->next->prev = extent->prev; + + if (extent == self->tail) + self->tail = extent->prev; + + if (extent == self->head) + self->head = extent->next; + + if (extent == self->cursor) { + self->cursor = extent->prev; + } + + if (extent == self->next_cursor) { + self->next_cursor = extent->next; + } + + self->mem->free(extent); + + return next; +} +/* Drop the empty extents starting at extent, returning the following extent early if it belongs to target_node. Then fill the gap up to the next extent, or up to total_length when extent is last, with a CMARK_EXTENT_BLANK extent attributed to root. Returns the extent after the filler, or after extent when there was no gap; NULL when nothing follows or self is NULL. */ +cmark_source_extent * +source_map_stitch_extent(cmark_source_map *self, cmark_source_extent *extent, + cmark_node *root, cmark_node *target_node, bufsize_t total_length) +{ + cmark_source_extent *res; + + if (!self) + return NULL; + + while (extent->next && extent->start == extent->stop) { + extent = source_map_free_extent(self, extent); + if (extent->node == target_node) + return extent; + } + + if (extent->next) { + res = source_map_insert_extent(self, + extent, + extent->stop, + extent->next->start, + root, + CMARK_EXTENT_BLANK)->next; + } else { + res = source_map_insert_extent(self, + extent, + extent->stop, + total_length, + root, + CMARK_EXTENT_BLANK)->next; + } +/* Still empty here means the loop above stopped on it only because it had no successor at that point; release it now. */ + if (extent->start == extent->stop) + source_map_free_extent(self, extent); + + return res; +} +/* Insert an extent for node after the cursor; start and stop are shifted by self->cursor_offset before use. When the shifted range runs into next_cursor, only the part in front of it is inserted, the cursor is advanced past the extents in the way and the function recurses for the remainder. Returns the cursor after the last insertion, or NULL when self is NULL. */ +cmark_source_extent * +source_map_splice_extent(cmark_source_map *self, bufsize_t start, bufsize_t stop, + cmark_node *node, cmark_extent_type type) +{ + if (!self) + return NULL; + + if (!self->next_cursor) { + self->cursor = source_map_insert_extent(self, + self->cursor, + start + self->cursor_offset, + stop + self->cursor_offset, node, type); + + return self->cursor; + } else if (start + self->cursor_offset < self->next_cursor->start && + stop + 
self->cursor_offset <= self->next_cursor->start) { + self->cursor = source_map_insert_extent(self, + self->cursor, + start + self->cursor_offset, + stop + self->cursor_offset, node, type); + + return self->cursor; + } else if (start + self->cursor_offset < self->next_cursor->start) { + bufsize_t new_start = self->next_cursor->start - self->cursor_offset; + + self->cursor = source_map_insert_extent(self, + self->cursor, + start + self->cursor_offset, + self->next_cursor->start, + node, type); + + if (new_start == stop) + return self->cursor; + + start = new_start; + } +/* Advance the cursor past every following extent that begins at or before the shifted start, adding the length of each skipped extent to cursor_offset. */ + while (self->next_cursor && start + self->cursor_offset >= self->next_cursor->start) { + self->cursor_offset += self->next_cursor->stop - self->next_cursor->start; + self->cursor = self->cursor->next; + self->next_cursor = self->cursor->next; + } +/* Retry with the updated cursor state to place what is left of the range. */ + return source_map_splice_extent(self, start, stop, node, type); +} +/* Position the splice cursor on cursor, or on the head of the map when cursor is NULL, and record the extent after it and its stop offset as the initial cursor_offset. Returns false when self is NULL or there is no extent to start from, true otherwise. */ +bool +source_map_start_cursor(cmark_source_map *self, cmark_source_extent *cursor) +{ + if (!self) + return false; + + self->cursor = cursor ? 
cursor : self->head; + + if (!self->cursor) + return false; + + self->next_cursor = self->cursor->next; + self->cursor_offset = self->cursor->stop; + + return true; +} +/* Debug helper: print one line per extent with its start and stop offsets, node type string, extent type string and node address. Does nothing for a NULL map. */ +void +source_map_pretty_print(cmark_source_map *self) { + cmark_source_extent *tmp; + + if (!self) + return; + + for (tmp = self->head; tmp; tmp = tmp->next) { + printf ("%d:%d - %s, %s (%p)\n", tmp->start, tmp->stop, + cmark_node_get_type_string(tmp->node), + cmark_source_extent_get_type_string(tmp), + (void *) tmp->node); + } +} +/* Sanity check: extents must be contiguous starting at offset 0, none may be empty, and the last one must stop at total_length. A NULL map passes. */ +bool +source_map_check(cmark_source_map *self, bufsize_t total_length) +{ + bufsize_t last_stop = 0; + cmark_source_extent *tmp; + + if (!self) + return true; + + for (tmp = self->head; tmp; tmp = tmp->next) { + if (tmp->start != last_stop) { + return false; + } if (tmp->start == tmp->stop) + return false; + last_stop = tmp->stop; + } + + if (last_stop != total_length) + return false; + + return true; +} +/* Accessors for the cursor, head and tail of the map; each returns NULL when self is NULL. */ +cmark_source_extent * +source_map_get_cursor(cmark_source_map *self) +{ + if (!self) + return NULL; + + return self->cursor; +} + +cmark_source_extent * +source_map_get_head(cmark_source_map *self) { + if (!self) + return NULL; + + return self->head; +} + +cmark_source_extent * +source_map_get_tail(cmark_source_map *self) +{ + if (!self) + return NULL; + + return self->tail; +} +/* Field accessors for a single extent; extent is dereferenced without a NULL check. */ +size_t +cmark_source_extent_get_start(cmark_source_extent *extent) +{ + return extent->start; +} + +size_t +cmark_source_extent_get_stop(cmark_source_extent *extent) +{ + return extent->stop; +} + +cmark_node * +cmark_source_extent_get_node(cmark_source_extent *extent) +{ + return extent->node; +} + +cmark_source_extent * +cmark_source_extent_get_next(cmark_source_extent *extent) +{ + return extent->next; +} + +cmark_source_extent * +cmark_source_extent_get_previous(cmark_source_extent *extent) +{ + return extent->prev; +} + +cmark_extent_type +cmark_source_extent_get_type(cmark_source_extent *extent) +{ + return extent->type; +} +/* Map extent->type to a constant lowercase name; CMARK_EXTENT_NONE and any unlisted value yield the string unknown. */ +const char * 
+cmark_source_extent_get_type_string(cmark_source_extent *extent) +{ + switch (extent->type) { + case CMARK_EXTENT_NONE: + return "unknown"; + case CMARK_EXTENT_OPENER: + return "opener"; + case CMARK_EXTENT_CLOSER: + return "closer"; + case CMARK_EXTENT_BLANK: + return "blank"; + case CMARK_EXTENT_CONTENT: + return "content"; + case CMARK_EXTENT_PUNCTUATION: + return "punctuation"; + case CMARK_EXTENT_LINK_DESTINATION: + return "link_destination"; + case CMARK_EXTENT_LINK_TITLE: + return "link_title"; + case CMARK_EXTENT_LINK_LABEL: + return "link_label"; + case CMARK_EXTENT_REFERENCE_DESTINATION: + return "reference_destination"; + case CMARK_EXTENT_REFERENCE_LABEL: + return "reference_label"; + case CMARK_EXTENT_REFERENCE_TITLE: + return "reference_title"; + } + return "unknown"; +} diff --git a/src/source_map.h b/src/source_map.h new file mode 100644 index 000000000..de13f8ed7 --- /dev/null +++ b/src/source_map.h @@ -0,0 +1,74 @@ +#ifndef CMARK_SOURCE_MAP_H +#define CMARK_SOURCE_MAP_H + +#include "cmark.h" +#include "config.h" +#include "buffer.h" +/* Doubly linked list of source extents (head, tail) plus the state used while splicing: the current cursor, the extent after it, and the offset added to incoming positions. mem is the allocator used for the map and its extents. */ +typedef struct _cmark_source_map +{ + cmark_source_extent *head; + cmark_source_extent *tail; + cmark_source_extent *cursor; + cmark_source_extent *next_cursor; + bufsize_t cursor_offset; + cmark_mem *mem; +} cmark_source_map; +/* One extent of the source buffer: its start and stop offsets, the node and extent type it is attributed to, and its neighbours in the list. */ +struct cmark_source_extent +{ + bufsize_t start; + bufsize_t stop; + struct cmark_source_extent *next; + struct cmark_source_extent *prev; + cmark_node *node; + cmark_extent_type type; +}; + +cmark_source_map * source_map_new (cmark_mem *mem); + +void source_map_free (cmark_source_map *self); + +bool source_map_check (cmark_source_map *self, + bufsize_t total_length); + +void source_map_pretty_print (cmark_source_map *self); + +cmark_source_extent * source_map_append_extent(cmark_source_map *self, + bufsize_t start, + bufsize_t stop, + cmark_node *node, + cmark_extent_type type); + +cmark_source_extent * source_map_insert_extent(cmark_source_map *self, + cmark_source_extent *previous, + 
bufsize_t start, + bufsize_t stop, + cmark_node *node, + cmark_extent_type type); + +cmark_source_extent * source_map_free_extent (cmark_source_map *self, + cmark_source_extent *extent); + +cmark_source_extent * source_map_stitch_extent(cmark_source_map *self, + cmark_source_extent *extent, + cmark_node *root, + cmark_node *target_node, + bufsize_t total_length); + +cmark_source_extent * source_map_splice_extent(cmark_source_map *self, + bufsize_t start, + bufsize_t stop, + cmark_node *node, + cmark_extent_type type); + +cmark_source_extent * source_map_get_cursor (cmark_source_map *self); + +cmark_source_extent * source_map_get_head (cmark_source_map *self); + +cmark_source_extent * source_map_get_tail (cmark_source_map *self); + +bool source_map_start_cursor (cmark_source_map *self, + cmark_source_extent *cursor); + +#endif diff --git a/src/xml.c b/src/xml.c index 4898cd2e8..fcbd93db5 100644 --- a/src/xml.c +++ b/src/xml.c @@ -126,6 +126,17 @@ static int S_render_node(cmark_node *node, cmark_event_type ev_type, escape_xml(xml, node->as.link.title.data, node->as.link.title.len); cmark_strbuf_putc(xml, '"'); break; + case CMARK_NODE_REFERENCE: + cmark_strbuf_puts(xml, " label=\""); + escape_xml(xml, node->as.reference.label.data, node->as.reference.label.len); + cmark_strbuf_putc(xml, '"'); + cmark_strbuf_puts(xml, " destination=\""); + escape_xml(xml, node->as.reference.url.data, node->as.reference.url.len); + cmark_strbuf_putc(xml, '"'); + cmark_strbuf_puts(xml, " title=\""); + escape_xml(xml, node->as.reference.title.data, node->as.reference.title.len); + cmark_strbuf_putc(xml, '"'); + break; default: break; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6da3a6bac..5dfd2c2f5 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -78,3 +78,27 @@ ELSE(PYTHONINTERP_FOUND) ENDIF(PYTHONINTERP_FOUND) +if (PYTHON_BINDING_TESTS) + find_package(PythonInterp 3 REQUIRED) +else(PYTHON_BINDING_TESTS) + find_package(PythonInterp 3) 
+endif(PYTHON_BINDING_TESTS) + +IF (PYTHONINTERP_FOUND) + add_test(python3_bindings + ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_SOURCE_DIR}/test_cmark.py" + "${CMAKE_CURRENT_BINARY_DIR}/../src" + ) + + add_test(remarkor + ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_SOURCE_DIR}/test_remarkor.py" + "${CMAKE_CURRENT_BINARY_DIR}/../src" + "${CMAKE_CURRENT_SOURCE_DIR}/spec.txt" + ) +ELSE(PYTHONINTERP_FOUND) + message("\n*** A python 3 interpreter is required to run the python binding tests.\n") + add_test(skipping_python_binding_tests + echo "Skipping python binding tests, because no python 3 interpreter is available.") +ENDIF(PYTHONINTERP_FOUND) diff --git a/test/cmark.py b/test/cmark.py index 4be85a3b0..fd35d54bb 100644 --- a/test/cmark.py +++ b/test/cmark.py @@ -6,6 +6,8 @@ import platform import os +OPT_SOURCEPOS = 1 << 1 + def pipe_through_prog(prog, text): p1 = Popen(prog.split(), stdout=PIPE, stdin=PIPE, stderr=PIPE) [result, err] = p1.communicate(input=text.encode('utf-8')) @@ -29,7 +31,10 @@ def to_commonmark(lib, text): render_commonmark = lib.cmark_render_commonmark render_commonmark.restype = c_char_p render_commonmark.argtypes = [c_void_p, c_int, c_int] - node = parse_document(textbytes, textlen, 0) + # We want tests to go through the source map code + node = parse_document(textbytes, textlen, OPT_SOURCEPOS) + if node is None: + raise Exception("parse_document failed") result = render_commonmark(node, 0, 0).decode('utf-8') return [0, result, ''] diff --git a/test/test_cmark.py b/test/test_cmark.py new file mode 100644 index 000000000..e86e38bab --- /dev/null +++ b/test/test_cmark.py @@ -0,0 +1,517 @@ +# -*- coding: utf8 -*- + +from __future__ import unicode_literals + +import sys +import os +import unittest +import argparse + +here = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(here, os.pardir, 'wrappers')) +from wrapper import * + +class TestHighLevel(unittest.TestCase): + def test_markdown_to_html(self): + 
self.assertEqual(markdown_to_html('foo'), '

foo

\n') + + def test_parse_document(self): + doc = parse_document('foo') + self.assertEqual(type(doc), Document) + +class TestParser(unittest.TestCase): + def test_lifecycle(self): + parser = Parser() + del parser + + def test_feed(self): + parser = Parser() + parser.feed('‘') + + def test_finish(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + + def test_source_map(self): + parser = Parser(options=Parser.OPT_SOURCEPOS) + parser.feed('‘') + doc = parser.finish() + source_map = parser.get_source_map() + extents = [e for e in source_map] + self.assertEqual(len(extents), 1) + self.assertEqual(extents[0].type, ExtentType.CONTENT) + self.assertEqual(extents[0].start, 0) + self.assertEqual(extents[0].stop, 3) + + def test_render_html(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_html() + self.assertEqual(res, '

\n') + + def test_render_xml(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_xml() + self.assertEqual( + res, + '\n' + '\n' + '\n' + ' \n' + ' \n' + ' \n' + '\n') + + def test_render_commonmark(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_commonmark() + self.assertEqual(res, '‘\n') + + def test_render_man(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_man() + self.assertEqual( + res, + '.PP\n' + '\[oq]\n') + + def test_render_latex(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_latex() + self.assertEqual(res, '`\n') + +class TestNode(unittest.TestCase): + def test_type(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + self.assertEqual(type(doc), Document) + + def test_equal(self): + parser = Parser() + parser.feed('foo\n\nbar') + doc = parser.finish() + para_one = doc.first_child + para_two = doc.last_child + self.assertEqual(doc.last_child, para_one.next) + self.assertEqual(para_one != para_two, True) + + def test_first_child(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + child1 = doc.first_child + child2 = doc.first_child + self.assertEqual(child1, child2) + self.assertEqual((child1 != child2), False) + + def test_last_child(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + child1 = doc.first_child + child2 = doc.last_child + self.assertEqual(child1, child2) + self.assertEqual((child1 != child2), False) + + def test_next(self): + parser = Parser() + parser.feed('foo *bar*') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + text = para.first_child + self.assertEqual(type(text), Text) + emph = text.next + self.assertEqual(type(emph), Emph) + self.assertEqual(para.next, None) + + def test_previous(self): + parser = Parser() + parser.feed('foo *bar*') + doc = parser.finish() + para = doc.first_child + 
text = para.first_child + emph = text.next + self.assertEqual(emph.previous, text) + self.assertEqual(para.previous, None) + + def test_children(self): + parser = Parser() + parser.feed('foo *bar*') + doc = parser.finish() + para = doc.first_child + children = [c for c in para] + self.assertEqual(len(children), 2) + self.assertEqual(type(children[0]), Text) + self.assertEqual(type(children[1]), Emph) + + # Test unlinking while iterating + + children = [] + for c in para: + children.append(c) + c.unlink() + + self.assertEqual(len(children), 2) + self.assertEqual(type(children[0]), Text) + self.assertEqual(type(children[1]), Emph) + + def test_parent(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + para = doc.first_child + self.assertEqual(para.parent, doc) + + def test_new(self): + with self.assertRaises(NotImplementedError): + n = Node() + + def test_unlink(self): + parser = Parser() + parser.feed('foo *bar*') + doc = parser.finish() + para = doc.first_child + para.unlink() + self.assertEqual(doc.to_html(), '') + + def test_append_child(self): + parser = Parser() + parser.feed('') + doc = parser.finish() + doc.append_child(Paragraph()) + self.assertEqual(doc.to_html(), '

\n') + with self.assertRaises(LibcmarkError): + doc.append_child(Text(literal='foo')) + + def test_prepend_child(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + doc.prepend_child(Paragraph()) + self.assertEqual(doc.to_html(), '

\n

foo

\n') + with self.assertRaises(LibcmarkError): + doc.prepend_child(Text(literal='foo')) + + def test_insert_before(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + para = doc.first_child + para.insert_before(Paragraph()) + self.assertEqual(doc.to_html(), '

\n

foo

\n') + with self.assertRaises(LibcmarkError): + para.insert_before(Text(literal='foo')) + + def test_insert_after(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + para = doc.first_child + para.insert_after(Paragraph()) + self.assertEqual(doc.to_html(), '

foo

\n

\n') + with self.assertRaises(LibcmarkError): + para.insert_after(Text(literal='foo')) + + def test_consolidate_text_nodes(self): + parser = Parser() + parser.feed('foo **bar*') + doc = parser.finish() + self.assertEqual(len([c for c in doc.first_child]), 3) + doc.consolidate_text_nodes() + self.assertEqual(len([c for c in doc.first_child]), 2) + +class TestLiteral(unittest.TestCase): + def test_text(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + text = para.first_child + self.assertEqual(type(text), Text) + self.assertEqual(text.literal, 'foo') + text.literal = 'bar' + self.assertEqual(text.to_html(), 'bar') + +class TestDocument(unittest.TestCase): + def test_new(self): + doc = Document() + self.assertEqual(doc.to_html(), + '') + +class TestBlockQuote(unittest.TestCase): + def test_new(self): + bq = BlockQuote() + self.assertEqual(bq.to_html(), + '
\n
\n') + +class TestList(unittest.TestCase): + def test_new(self): + list_ = List() + self.assertEqual(list_.to_html(), + '
    \n
\n') + + def test_type(self): + parser = Parser() + parser.feed('* foo') + doc = parser.finish() + list_ = doc.first_child + self.assertEqual(type(list_), List) + self.assertEqual(list_.type, ListType.BULLET) + list_.type = ListType.ORDERED + self.assertEqual(doc.to_html(), + '
    \n' + '
  1. foo
  2. \n' + '
\n') + + def test_start(self): + parser = Parser() + parser.feed('2. foo') + doc = parser.finish() + list_ = doc.first_child + self.assertEqual(type(list_), List) + self.assertEqual(list_.start, 2) + list_.start = 1 + self.assertEqual(doc.to_commonmark(), + '1. foo\n') + with self.assertRaises(LibcmarkError): + list_.start = -1 + list_.type = ListType.BULLET + + def test_delim(self): + parser = Parser() + parser.feed('1. foo') + doc = parser.finish() + list_ = doc.first_child + self.assertEqual(type(list_), List) + self.assertEqual(list_.delim, '.') + list_.delim = ')' + self.assertEqual(doc.to_commonmark(), + '1) foo\n') + + def test_tight(self): + parser = Parser() + parser.feed('* foo\n' + '\n' + '* bar\n') + doc = parser.finish() + list_ = doc.first_child + self.assertEqual(type(list_), List) + self.assertEqual(list_.tight, False) + self.assertEqual(doc.to_commonmark(), + ' - foo\n' + '\n' + ' - bar\n') + + list_.tight = True + self.assertEqual(doc.to_commonmark(), + ' - foo\n' + ' - bar\n') + + with self.assertRaises(LibcmarkError): + list_.tight = 42 + +class TestItem(unittest.TestCase): + def test_new(self): + item = Item() + self.assertEqual(item.to_html(), + '
  • \n') + +class TestCodeBlock(unittest.TestCase): + def test_new(self): + cb = CodeBlock(literal='foo', fence_info='python') + self.assertEqual(cb.to_html(), + '
    foo
    \n') + + def test_fence_info(self): + parser = Parser() + parser.feed('``` markdown\n' + 'hello\n' + '```\n') + doc = parser.finish() + code_block = doc.first_child + self.assertEqual(type(code_block), CodeBlock) + self.assertEqual(code_block.fence_info, 'markdown') + code_block.fence_info = 'python' + self.assertEqual(doc.to_commonmark(), + '``` python\n' + 'hello\n' + '```\n') + +class TestHtmlBlock(unittest.TestCase): + def test_new(self): + hb = HtmlBlock(literal='

    foo

    ') + self.assertEqual(hb.to_html(), + '

    foo

    \n') + +class TestCustomBlock(unittest.TestCase): + def test_new(self): + cb = CustomBlock() + self.assertEqual(cb.to_html(), + '') + +class TestParagraph(unittest.TestCase): + def test_new(self): + para = Paragraph() + self.assertEqual(para.to_html(), + '

    \n') + +class TestHeading(unittest.TestCase): + def test_new(self): + heading = Heading(level=3) + self.assertEqual(heading.to_html(), + '

    \n') + + def test_level(self): + parser = Parser() + parser.feed('# foo') + doc = parser.finish() + heading = doc.first_child + self.assertEqual(type(heading), Heading) + self.assertEqual(heading.level, 1) + heading.level = 3 + self.assertEqual(heading.level, 3) + + self.assertEqual(doc.to_html(), + '

    foo

    \n') + + with self.assertRaises(LibcmarkError): + heading.level = 10 + +class TestThematicBreak(unittest.TestCase): + def test_new(self): + tb = ThematicBreak() + self.assertEqual(tb.to_html(), + '
    \n') + +class TestText(unittest.TestCase): + def test_new(self): + text = Text(literal='foo') + self.assertEqual(text.to_html(), + 'foo') + +class TestSoftBreak(unittest.TestCase): + def test_new(self): + sb = SoftBreak() + self.assertEqual(sb.to_html(), '\n') + self.assertEqual(sb.to_html(options=Parser.OPT_HARDBREAKS), + '
    \n') + self.assertEqual(sb.to_html(options=Parser.OPT_NOBREAKS), + ' ') + +class TestLineBreak(unittest.TestCase): + def test_new(self): + lb = LineBreak() + self.assertEqual(lb.to_html(), '
    \n') + +class TestCode(unittest.TestCase): + def test_new(self): + code = Code(literal='bar') + self.assertEqual(code.to_html(), 'bar') + +class TestHtmlInline(unittest.TestCase): + def test_new(self): + hi = HtmlInline(literal='baz') + self.assertEqual(hi.to_html(), 'baz') + +class TestCustomInline(unittest.TestCase): + def test_new(self): + ci = CustomInline() + self.assertEqual(ci.to_html(), + '') + +class TestEmph(unittest.TestCase): + def test_new(self): + emph = Emph() + self.assertEqual(emph.to_html(), + '') + +class TestStrong(unittest.TestCase): + def test_new(self): + strong = Strong() + self.assertEqual(strong.to_html(), + '') + +class TestLink(unittest.TestCase): + def test_new(self): + link = Link(url='http://foo.com', title='foo') + self.assertEqual(link.to_html(), + '') + + def test_url(self): + parser = Parser() + parser.feed('\n') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + link = para.first_child + self.assertEqual(type(link), Link) + self.assertEqual(link.url, 'http://foo.com') + link.url = 'http://bar.net' + # Yeah that's crappy behaviour but not our problem here + self.assertEqual(doc.to_commonmark(), + '[http://foo.com](http://bar.net)\n') + + def test_title(self): + parser = Parser() + parser.feed('\n') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + link = para.first_child + self.assertEqual(type(link), Link) + self.assertEqual(link.title, '') + link.title = 'foo' + self.assertEqual(doc.to_html(), + '

    http://foo.com

    \n') + +class TestImage(unittest.TestCase): + def test_new(self): + image = Image(url='http://foo.com', title='foo') + self.assertEqual(image.to_html(), + '') + + def test_url(self): + parser = Parser() + parser.feed('![image](image.com)\n') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + link = para.first_child + self.assertEqual(type(link), Image) + self.assertEqual(link.url, 'image.com') + link.url = 'http://bar.net' + self.assertEqual(doc.to_commonmark(), + '![image](http://bar.net)\n') + + def test_title(self): + parser = Parser() + parser.feed('![image](image.com "ze image")\n') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + image = para.first_child + self.assertEqual(type(image), Image) + self.assertEqual(image.title, 'ze image') + image.title = 'foo' + self.assertEqual(doc.to_html(), + '

    image

    \n') + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('libdir') + args = parser.parse_known_args() + conf.set_library_path(args[0].libdir) + unittest.main(argv=[sys.argv[0]] + args[1]) diff --git a/test/test_remarkor.py b/test/test_remarkor.py new file mode 100644 index 000000000..f4d495c68 --- /dev/null +++ b/test/test_remarkor.py @@ -0,0 +1,44 @@ +import unittest +import argparse +import os +import sys + +from spec_tests import get_tests + +here = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(here) +sys.path.append(os.path.join(here, os.pardir, 'src')) +sys.path.append(os.path.join(here, os.pardir, 'wrappers')) + +from remarkor import * + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('libdir') + parser.add_argument('specpath') + args = parser.parse_known_args() + conf.set_library_path(args[0].libdir) + SPEC_PATH = args[0].specpath + +class TestRemarkorMeta(type): + def __new__(mcs, name, bases, dict): + def gen_test(test_description): + def test(self): + remarkor = Remarkor(test_description['markdown']) + remarkor.remark(width=1, validate=True) + return test + + for t in get_tests(SPEC_PATH): + test_name = 'test_%s' % re.sub('\W|^(?=\d)','_', t['section']) + cnt = 1 + while '%s_%d' % (test_name, cnt) in dict: + cnt += 1 + test_name = '%s_%d' % (test_name, cnt) + dict[test_name] = gen_test(t) + return type.__new__(mcs, name, bases, dict) + +class TestRemarkor(unittest.TestCase, metaclass=TestRemarkorMeta): + pass + +if __name__=='__main__': + unittest.main(argv=[sys.argv[0]] + args[1]) diff --git a/wrappers/wrapper.py b/wrappers/wrapper.py old mode 100755 new mode 100644 index 98e7f2b46..048d33bf5 --- a/wrappers/wrapper.py +++ b/wrappers/wrapper.py @@ -1,37 +1,938 @@ -#!/usr/bin/env python +from __future__ import unicode_literals -# Example for using the shared library from python -# Will work with either python 2 or python 3 -# Requires cmark library to be installed - 
-from ctypes import CDLL, c_char_p, c_long +from ctypes import * import sys import platform -sysname = platform.system() +c_object_p = POINTER(c_void_p) -if sysname == 'Darwin': - libname = "libcmark.dylib" -elif sysname == 'Windows': - libname = "cmark.dll" +if sys.version_info[0] > 2: + def bytes_and_length(text): + if type(text) == str: + text = text.encode("utf8") + return text, len(text) else: - libname = "libcmark.so" -cmark = CDLL(libname) - -markdown = cmark.cmark_markdown_to_html -markdown.restype = c_char_p -markdown.argtypes = [c_char_p, c_long, c_long] - -opts = 0 # defaults - -def md2html(text): - if sys.version_info >= (3,0): - textbytes = text.encode('utf-8') - textlen = len(textbytes) - return markdown(textbytes, textlen, opts).decode('utf-8') - else: - textbytes = text - textlen = len(text) - return markdown(textbytes, textlen, opts) - -sys.stdout.write(md2html(sys.stdin.read())) + def bytes_and_length(text): + if type(text) == unicode: + text = text.encode("utf8") + return text, len(text) + +def unicode_from_char_p(res, fn, args): + ret = res.decode("utf8") + return ret + +class owned_char_p(c_void_p): + def __del__(self): + conf.lib.cmark_default_mem_free(self.value) + +def unicode_from_owned_char_p(res, fn, args): + ret = cast(res, c_char_p).value.decode("utf8") + return ret + +def boolean_from_result(res, fn, args): + return bool(res) + +def delim_from_int(res, fn, args): + if res == 0: + return '' + elif res == 1: + return '.' 
+ elif res == 2: + return ')' + +class BaseEnumeration(object): + def __init__(self, value): + if value >= len(self.__class__._kinds): + self.__class__._kinds += [None] * (value - len(self.__class__._kinds) + 1) + if self.__class__._kinds[value] is not None: + raise ValueError('{0} value {1} already loaded'.format( + str(self.__class__), value)) + self.value = value + self.__class__._kinds[value] = self + self.__class__._name_map = None + + def from_param(self): + return self.value + + @classmethod + def from_id(cls, id, fn, args): + if id >= len(cls._kinds) or cls._kinds[id] is None: + raise ValueError('Unknown template argument kind %d' % id) + return cls._kinds[id] + + @property + def name(self): + """Get the enumeration name of this cursor kind.""" + if self._name_map is None: + self._name_map = {} + for key, value in self.__class__.__dict__.items(): + if isinstance(value, self.__class__): + self._name_map[value] = key + return str(self._name_map[self]) + + def __repr__(self): + return '%s.%s' % (self.__class__.__name__, self.name,) + +class Parser(object): + OPT_DEFAULT = 0 + OPT_SOURCEPOS = 1 << 1 + OPT_HARDBREAKS = 1 << 2 + OPT_SAFE = 1 << 3 + OPT_NOBREAKS = 1 << 4 + OPT_NORMALIZE = 1 << 8 + OPT_VALIDATE_UTF8 = 1 << 9 + OPT_SMART = 1 << 10 + + def __init__(self, options=0): + self._parser = conf.lib.cmark_parser_new(options) + + def __del__(self): + conf.lib.cmark_parser_free(self._parser) + + def feed(self, text): + conf.lib.cmark_parser_feed(self._parser, *bytes_and_length(text)) + + def finish(self): + return conf.lib.cmark_parser_finish(self._parser) + + def get_source_map(self): + return conf.lib.cmark_parser_get_first_source_extent(self._parser) + +class LibcmarkError(Exception): + def __init__(self, message): + self.m = message + + def __str__(self): + return self.m + +class NodeType(BaseEnumeration): + _kinds = [] + _name_map = None + +# FIXME: a bit awkward to update, not sure what the best practice is +NodeType.NONE = NodeType(0) +NodeType.DOCUMENT 
= NodeType(1) +NodeType.BLOCK_QUOTE = NodeType(2) +NodeType.LIST = NodeType(3) +NodeType.ITEM = NodeType(4) +NodeType.CODE_BLOCK = NodeType(5) +NodeType.HTML_BLOCK = NodeType(6) +NodeType.CUSTOM_BLOCK = NodeType(7) +NodeType.PARAGRAPH = NodeType(8) +NodeType.HEADING = NodeType(9) +NodeType.THEMATIC_BREAK = NodeType(10) +NodeType.REFERENCE = NodeType(11) +NodeType.TEXT = NodeType(12) +NodeType.SOFTBREAK = NodeType(13) +NodeType.LINEBREAK = NodeType(14) +NodeType.CODE = NodeType(15) +NodeType.HTML_INLINE = NodeType(16) +NodeType.CUSTOM_INLINE = NodeType(17) +NodeType.EMPH = NodeType(18) +NodeType.STRONG = NodeType(19) +NodeType.LINK = NodeType(20) +NodeType.IMAGE = NodeType(21) + +class ListType(BaseEnumeration): + _kinds = [] + _name_map = None + +ListType.BULLET = ListType(1) +ListType.ORDERED = ListType(2) + +class Node(object): + __subclass_map = {} + + def __init__(self): + self._owned = False + raise NotImplementedError + + @staticmethod + def from_result(res, fn=None, args=None): + try: + res.contents + except ValueError: + return None + cls = Node.get_subclass_map()[conf.lib.cmark_node_get_type(res)] + + ret = cls.__new__(cls) + ret._node = res + ret._owned = False + return ret + + @classmethod + def get_subclass_map(cls): + if cls.__subclass_map: + return cls.__subclass_map + + res = {c._node_type: c for c in cls.__subclasses__()} + + for c in cls.__subclasses__(): + res.update(c.get_subclass_map()) + + return res + + def unlink(self): + conf.lib.cmark_node_unlink(self._node) + self._owned = True + + def append_child(self, child): + res = conf.lib.cmark_node_append_child(self._node, child._node) + if not res: + raise LibcmarkError("Can't append child %s to node %s" % (str(child), str(self))) + child._owned = False + + def prepend_child(self, child): + res = conf.lib.cmark_node_prepend_child(self._node, child._node) + if not res: + raise LibcmarkError("Can't prepend child %s to node %s" % (str(child), str(self))) + child._owned = False + + def 
insert_before(self, sibling): + res = conf.lib.cmark_node_insert_before(self._node, sibling._node) + if not res: + raise LibcmarkError("Can't insert sibling %s before node %s" % (str(sibling), str(self))) + sibling._owned = False + + def insert_after(self, sibling): + res = conf.lib.cmark_node_insert_after(self._node, sibling._node) + if not res: + raise LibcmarkError("Can't insert sibling %s after node %s" % (str(sibling), str(self))) + sibling._owned = False + + def consolidate_text_nodes(self): + conf.lib.cmark_consolidate_text_nodes(self._node) + + def to_html(self, options=Parser.OPT_DEFAULT): + return conf.lib.cmark_render_html(self._node, options) + + def to_xml(self, options=Parser.OPT_DEFAULT): + return conf.lib.cmark_render_xml(self._node, options) + + def to_commonmark(self, options=Parser.OPT_DEFAULT, width=0): + return conf.lib.cmark_render_commonmark(self._node, options, width) + + def to_man(self, options=Parser.OPT_DEFAULT, width=0): + return conf.lib.cmark_render_man(self._node, options, width) + + def to_latex(self, options=Parser.OPT_DEFAULT, width=0): + return conf.lib.cmark_render_latex(self._node, options, width) + + @property + def parent(self): + return conf.lib.cmark_node_parent(self._node) + + @property + def first_child(self): + return conf.lib.cmark_node_first_child(self._node) + + @property + def last_child(self): + return conf.lib.cmark_node_last_child(self._node) + + @property + def next(self): + return conf.lib.cmark_node_next(self._node) + + @property + def previous(self): + return conf.lib.cmark_node_previous(self._node) + + def __eq__(self, other): + if other is None: + return False + return addressof(self._node.contents) == addressof(other._node.contents) + + def __ne__(self, other): + if other is None: + return True + return addressof(self._node.contents) != addressof(other._node.contents) + + def __hash__(self): + return hash(addressof(self._node.contents)) + + def __del__(self): + if self._owned: + 
conf.lib.cmark_node_free(self._node) + + def __iter__(self): + cur = self.first_child + while (cur): + next_ = cur.next + yield cur + cur = next_ + +class Literal(Node): + _node_type = NodeType.NONE + + @property + def literal(self): + return conf.lib.cmark_node_get_literal(self._node) + + @literal.setter + def literal(self, value): + bytes_, _ = bytes_and_length(value) + if not conf.lib.cmark_node_set_literal(self._node, bytes_): + raise LibcmarkError("Invalid literal %s\n" % str(value)) + +class Document(Node): + _node_type = NodeType.DOCUMENT + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class BlockQuote(Node): + _node_type = NodeType.BLOCK_QUOTE + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class List(Node): + _node_type = NodeType.LIST + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + @property + def type(self): + return conf.lib.cmark_node_get_list_type(self._node) + + @type.setter + def type(self, type_): + if not conf.lib.cmark_node_set_list_type(self._node, type_.value): + raise LibcmarkError("Invalid type %s" % str(type_)) + + @property + def delim(self): + return conf.lib.cmark_node_get_list_delim(self._node) + + @delim.setter + def delim(self, value): + if value == '.': + delim_type = 1 + elif value == ')': + delim_type = 2 + else: + raise LibcmarkError('Invalid delim type %s' % str(value)) + + conf.lib.cmark_node_set_list_delim(self._node, delim_type) + + @property + def start(self): + return conf.lib.cmark_node_get_list_start(self._node) + + @start.setter + def start(self, value): + if not conf.lib.cmark_node_set_list_start(self._node, value): + raise LibcmarkError("Invalid list start %s\n" % str(value)) + + @property + def tight(self): + return conf.lib.cmark_node_get_list_tight(self._node) + + @tight.setter + def tight(self, value): + 
if value is True: + tightness = 1 + elif value is False: + tightness = 0 + else: + raise LibcmarkError("Invalid list tightness %s\n" % str(value)) + if not conf.lib.cmark_node_set_list_tight(self._node, tightness): + raise LibcmarkError("Invalid list tightness %s\n" % str(value)) + +class Item(Node): + _node_type = NodeType.ITEM + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class CodeBlock(Literal): + _node_type = NodeType.CODE_BLOCK + + def __init__(self, literal='', fence_info=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + self.fence_info = fence_info + + @property + def fence_info(self): + return conf.lib.cmark_node_get_fence_info(self._node) + + @fence_info.setter + def fence_info(self, value): + bytes_, _ = bytes_and_length(value) + if not conf.lib.cmark_node_set_fence_info(self._node, bytes_): + raise LibcmarkError("Invalid fence info %s\n" % str(value)) + +class HtmlBlock(Literal): + _node_type = NodeType.HTML_BLOCK + + def __init__(self, literal=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + + +class CustomBlock(Node): + _node_type = NodeType.CUSTOM_BLOCK + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class Paragraph(Node): + _node_type = NodeType.PARAGRAPH + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class Heading(Node): + _node_type = NodeType.HEADING + + def __init__(self, level=1): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self.level = level + self._owned = True + + @property + def level(self): + return int(conf.lib.cmark_node_get_heading_level(self._node)) + + @level.setter + def level(self, value): + res = 
conf.lib.cmark_node_set_heading_level(self._node, value) + if (res == 0): + raise LibcmarkError("Invalid heading level %s" % str(value)) + +class ThematicBreak(Node): + _node_type = NodeType.THEMATIC_BREAK + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class Reference(Node): + _node_type = NodeType.REFERENCE + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class Text(Literal): + _node_type = NodeType.TEXT + + def __init__(self, literal=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + + +class SoftBreak(Node): + _node_type = NodeType.SOFTBREAK + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class LineBreak(Node): + _node_type = NodeType.LINEBREAK + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class Code(Literal): + _node_type = NodeType.CODE + + def __init__(self, literal=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + + +class HtmlInline(Literal): + _node_type = NodeType.HTML_INLINE + + def __init__(self, literal=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + + +class CustomInline(Node): + _node_type = NodeType.CUSTOM_INLINE + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class Emph(Node): + _node_type = NodeType.EMPH + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class Strong(Node): + _node_type = NodeType.STRONG + + def __init__(self): + self._node = 
conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class Link(Node): + _node_type = NodeType.LINK + + def __init__(self, url='', title=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.url = url + self.title = title + + @property + def url(self): + return conf.lib.cmark_node_get_url(self._node) + + @url.setter + def url(self, value): + bytes_, _ = bytes_and_length(value) + if not conf.lib.cmark_node_set_url(self._node, bytes_): + raise LibcmarkError("Invalid url %s\n" % str(value)) + + @property + def title(self): + return conf.lib.cmark_node_get_title(self._node) + + @title.setter + def title(self, value): + bytes_, _ = bytes_and_length(value) + if not conf.lib.cmark_node_set_title(self._node, bytes_): + raise LibcmarkError("Invalid title %s\n" % str(value)) + +class Image(Link): + _node_type = NodeType.IMAGE + +class ExtentType(BaseEnumeration): + _kinds = [] + _name_map = None + +ExtentType.NONE = ExtentType(0) +ExtentType.OPENER = ExtentType(1) +ExtentType.CLOSER = ExtentType(2) +ExtentType.BLANK = ExtentType(3) +ExtentType.CONTENT = ExtentType(4) +ExtentType.PUNCTUATION = ExtentType(5) +ExtentType.LINK_DESTINATION = ExtentType(6) +ExtentType.LINK_TITLE = ExtentType(7) +ExtentType.LINK_LABEL = ExtentType(8) +ExtentType.REFERENCE_DESTINATION = ExtentType(9) +ExtentType.REFERENCE_LABEL = ExtentType(10) +ExtentType.REFERENCE_TITLE = ExtentType(11) + +class Extent(object): + @staticmethod + def from_result(res, fn=None, args=None): + ret = Extent() + ret._extent = res + return ret + + @property + def start(self): + return conf.lib.cmark_source_extent_get_start(self._extent) + + @property + def stop(self): + return conf.lib.cmark_source_extent_get_stop(self._extent) + + @property + def type(self): + return conf.lib.cmark_source_extent_get_type(self._extent) + + @property + def node(self): + return conf.lib.cmark_source_extent_get_node(self._extent) + +class SourceMap(object): + 
@staticmethod + def from_result(res, fn, args): + ret = SourceMap() + ret._root = res + return ret + + def __iter__(self): + cur = self._root + while (cur): + yield Extent.from_result(cur) + cur = conf.lib.cmark_source_extent_get_next(cur) + +def markdown_to_html(text, options=Parser.OPT_DEFAULT): + bytes_, length = bytes_and_length(text) + return conf.lib.cmark_markdown_to_html(bytes_, length, options) + +def parse_document(text, options=Parser.OPT_DEFAULT): + bytes_, length = bytes_and_length(text) + return conf.lib.cmark_parse_document(bytes_, length, options) + +functionList = [ + ("cmark_default_mem_free", + [c_void_p]), + ("cmark_markdown_to_html", + [c_char_p, c_long, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_parse_document", + [c_char_p, c_long, c_int], + c_object_p, + Node.from_result), + ("cmark_parser_new", + [c_int], + c_object_p), + ("cmark_parser_free", + [c_object_p]), + ("cmark_parser_feed", + [c_object_p, c_char_p, c_long]), + ("cmark_parser_finish", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_parser_get_first_source_extent", + [c_object_p], + c_object_p, + SourceMap.from_result), + ("cmark_source_extent_get_next", + [c_object_p], + c_object_p), + ("cmark_source_extent_get_start", + [c_object_p], + c_ulonglong), + ("cmark_source_extent_get_stop", + [c_object_p], + c_ulonglong), + ("cmark_source_extent_get_type", + [c_object_p], + c_int, + ExtentType.from_id), + ("cmark_source_extent_get_node", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_render_html", + [c_object_p, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_render_xml", + [c_object_p, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_render_commonmark", + [c_object_p, c_int, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_render_man", + [c_object_p, c_int, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_render_latex", + [c_object_p, c_int, c_int], + owned_char_p, + 
unicode_from_owned_char_p), + ("cmark_node_new", + [c_int], + c_object_p), + ("cmark_node_free", + [c_object_p]), + ("cmark_node_get_type", + [c_object_p], + c_int, + NodeType.from_id), + ("cmark_node_parent", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_first_child", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_last_child", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_next", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_previous", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_unlink", + [c_object_p]), + ("cmark_node_append_child", + [c_object_p, c_object_p], + c_int, + boolean_from_result), + ("cmark_node_prepend_child", + [c_object_p, c_object_p], + c_int, + boolean_from_result), + ("cmark_node_insert_before", + [c_object_p, c_object_p], + c_int, + boolean_from_result), + ("cmark_node_insert_after", + [c_object_p, c_object_p], + c_int, + boolean_from_result), + ("cmark_consolidate_text_nodes", + [c_object_p]), + ("cmark_node_get_literal", + [c_object_p], + c_char_p, + unicode_from_char_p), + ("cmark_node_set_literal", + [c_object_p, c_char_p], + c_int, + boolean_from_result), + ("cmark_node_get_heading_level", + [c_object_p], + c_int), + ("cmark_node_set_heading_level", + [c_object_p, c_int], + c_int, + boolean_from_result), + ("cmark_node_get_list_type", + [c_object_p], + c_int, + ListType.from_id), + ("cmark_node_set_list_type", + [c_object_p], + c_int, + boolean_from_result), + ("cmark_node_get_list_delim", + [c_object_p], + c_int, + delim_from_int), + ("cmark_node_set_list_delim", + [c_object_p, c_int], + c_int), + ("cmark_node_get_list_start", + [c_object_p], + c_int), + ("cmark_node_set_list_start", + [c_object_p, c_int], + c_int, + boolean_from_result), + ("cmark_node_get_list_tight", + [c_object_p], + c_int, + boolean_from_result), + ("cmark_node_set_list_tight", + [c_object_p, c_int], + c_int, + boolean_from_result), + ("cmark_node_get_fence_info", + 
[c_object_p], + c_char_p, + unicode_from_char_p), + ("cmark_node_set_fence_info", + [c_object_p, c_char_p], + c_int, + boolean_from_result), + ("cmark_node_get_url", + [c_object_p], + c_char_p, + unicode_from_char_p), + ("cmark_node_set_url", + [c_object_p, c_char_p], + c_int, + boolean_from_result), + ("cmark_node_get_title", + [c_object_p], + c_char_p, + unicode_from_char_p), + ("cmark_node_set_title", + [c_object_p, c_char_p], + c_int, + boolean_from_result), +] + +# Taken from clang.cindex +def register_function(lib, item, ignore_errors): + # A function may not exist, if these bindings are used with an older or + # incompatible version of libcmark.so. + try: + func = getattr(lib, item[0]) + except AttributeError as e: + msg = str(e) + ". Please ensure that your python bindings are "\ + "compatible with your libcmark version." + if ignore_errors: + return + raise LibcmarkError(msg) + + if len(item) >= 2: + func.argtypes = item[1] + + if len(item) >= 3: + func.restype = item[2] + + if len(item) == 4: + func.errcheck = item[3] + +def register_functions(lib, ignore_errors): + """Register function prototypes with a libccmark library instance. + + This must be called as part of library instantiation so Python knows how + to call out to the shared library. 
+ """ + + def register(item): + return register_function(lib, item, ignore_errors) + + for f in functionList: + register(f) + +class Config: + library_path = None + library_file = None + compatibility_check = True + loaded = False + lib_ = None + + @staticmethod + def set_library_path(path): + """Set the path in which to search for libcmark""" + if Config.loaded: + raise Exception("library path must be set before before using " \ + "any other functionalities in libcmark.") + + Config.library_path = path + + @staticmethod + def set_library_file(filename): + """Set the exact location of libcmark""" + if Config.loaded: + raise Exception("library file must be set before before using " \ + "any other functionalities in libcmark.") + + Config.library_file = filename + + @staticmethod + def set_compatibility_check(check_status): + """ Perform compatibility check when loading libcmark + + The python bindings are only tested and evaluated with the version of + libcmark they are provided with. To ensure correct behavior a (limited) + compatibility check is performed when loading the bindings. This check + will throw an exception, as soon as it fails. + + In case these bindings are used with an older version of libcmark, parts + that have been stable between releases may still work. Users of the + python bindings can disable the compatibility check. This will cause + the python bindings to load, even though they are written for a newer + version of libcmark. Failures now arise if unsupported or incompatible + features are accessed. The user is required to test themselves if the + features they are using are available and compatible between different + libcmark versions. 
+ """ + if Config.loaded: + raise Exception("compatibility_check must be set before before " \ + "using any other functionalities in libcmark.") + + Config.compatibility_check = check_status + + @property + def lib(self): + if self.lib_: + return self.lib_ + lib = self.get_cmark_library() + register_functions(lib, not Config.compatibility_check) + Config.loaded = True + self.lib_ = lib + return lib + + def get_filename(self): + if Config.library_file: + return Config.library_file + + import platform + name = platform.system() + + if name == 'Darwin': + file = 'libcmark.dylib' + elif name == 'Windows': + file = 'cmark.dll' + else: + file = 'libcmark.so' + + if Config.library_path: + file = Config.library_path + '/' + file + + return file + + def get_cmark_library(self): + try: + library = cdll.LoadLibrary(self.get_filename()) + except OSError as e: + msg = str(e) + "(%s). To provide a path to libcmark use " \ + "Config.set_library_path() or " \ + "Config.set_library_file()." % self.get_filename() + raise LibcmarkError(msg) + + return library + + def function_exists(self, name): + try: + getattr(self.lib, name) + except AttributeError: + return False + + return True + +conf = Config() + +__alla__ = [ + 'Parser', + 'LibcmarkError', + 'NodeType', + 'ListType', + 'Node', + 'Document', + 'BlockQuote', + 'List', + 'Item', + 'CodeBlock', + 'HtmlBlock', + 'CustomBlock', + 'Paragraph', + 'Heading', + 'ThematicBreak', + 'Text', + 'SoftBreak', + 'LineBreak', + 'Code', + 'HtmlInline', + 'CustomInline', + 'Emph', + 'Strong', + 'Link', + 'Image', + 'ExtentType', + 'Extent', + 'SourceMap', + 'markdown_to_html', + 'parse_document', + 'Config', + 'conf' +]