Skip to content

Commit 10589dc

Browse files
ArtUkrainskiy authored and bukka committed
Refactor traverse_for_entities for unescape_html_entities
Optimize scanning for '&' and ';' using memchr. Use memcpy instead of character-by-character copying. Closes GH-18092
1 parent d154c72 commit 10589dc

File tree

1 file changed

+114
-78
lines changed

1 file changed

+114
-78
lines changed

ext/standard/html.c

Lines changed: 114 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -809,112 +809,148 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809809
/* +2 is 1 because of rest (probably unnecessary), 1 because of terminating 0 */
810810
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
811811
static void traverse_for_entities(
812-
const char *old,
813-
size_t oldlen,
814-
zend_string *ret, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
815-
int all,
816-
int flags,
812+
const zend_string *input,
813+
zend_string *output, /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(ZSTR_LEN(input)) */
814+
const int all,
815+
const int flags,
817816
const entity_ht *inv_map,
818-
enum entity_charset charset)
817+
const enum entity_charset charset)
819818
{
820-
const char *p,
821-
*lim;
822-
char *q;
823-
int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
824-
825-
lim = old + oldlen; /* terminator address */
826-
assert(*lim == '\0');
827-
828-
for (p = old, q = ZSTR_VAL(ret); p < lim;) {
829-
unsigned code, code2 = 0;
830-
const char *next = NULL; /* when set, next > p, otherwise possible inf loop */
831-
832-
/* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
833-
* ASCII range byte can be part of a multi-byte sequence.
834-
* However, they start at 0x40, therefore if we find a 0x26 byte,
835-
* we're sure it represents the '&' character. */
819+
const char *current_ptr = ZSTR_VAL(input);
820+
const char *input_end = current_ptr + ZSTR_LEN(input); /* terminator address */
821+
char *output_ptr = ZSTR_VAL(output);
822+
const int doctype = flags & ENT_HTML_DOC_TYPE_MASK;
823+
824+
while (current_ptr < input_end) {
825+
const char *ampersand_ptr = memchr(current_ptr, '&', input_end - current_ptr);
826+
if (!ampersand_ptr) {
827+
const size_t tail_len = input_end - current_ptr;
828+
if (tail_len > 0) {
829+
memcpy(output_ptr, current_ptr, tail_len);
830+
output_ptr += tail_len;
831+
}
832+
break;
833+
}
836834

837-
/* assumes there are no single-char entities */
838-
if (p[0] != '&' || (p + 3 >= lim)) {
839-
*(q++) = *(p++);
840-
continue;
835+
/* Copy everything up to the found '&' */
836+
const size_t chunk_len = ampersand_ptr - current_ptr;
837+
if (chunk_len > 0) {
838+
memcpy(output_ptr, current_ptr, chunk_len);
839+
output_ptr += chunk_len;
841840
}
842841

843-
/* now p[3] is surely valid and is no terminator */
844-
845-
/* numerical entity */
846-
if (p[1] == '#') {
847-
next = &p[2];
848-
if (process_numeric_entity(&next, &code) == FAILURE)
849-
goto invalid_code;
850-
851-
/* If we're in htmlspecialchars_decode, we're only decoding entities
852-
* that represent &, <, >, " and '. Is this one of them? */
853-
if (!all && (code > 63U ||
854-
stage3_table_be_apos_00000[code].data.ent.entity == NULL))
855-
goto invalid_code;
856-
857-
/* are we allowed to decode this entity in this document type?
858-
* HTML 5 is the only that has a character that cannot be used in
859-
* a numeric entity but is allowed literally (U+000D). The
860-
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
861-
if (!unicode_cp_is_allowed(code, doctype) ||
862-
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D))
863-
goto invalid_code;
864-
} else {
865-
const char *start;
866-
size_t ent_len;
842+
/* Now current_ptr points to the '&' character. */
843+
current_ptr = ampersand_ptr;
867844

868-
next = &p[1];
869-
start = next;
845+
/* If there are fewer than 4 bytes remaining, there isn't enough for an entity -
846+
* copy '&' as a normal character. */
847+
if (input_end - current_ptr < 4) {
848+
const size_t remaining = input_end - current_ptr;
849+
memcpy(output_ptr, current_ptr, remaining);
850+
output_ptr += remaining;
851+
break;
852+
}
870853

871-
if (process_named_entity_html(&next, &start, &ent_len) == FAILURE)
872-
goto invalid_code;
854+
unsigned code = 0, code2 = 0;
855+
const char *entity_end_ptr = NULL;
873856

874-
if (resolve_named_entity_html(start, ent_len, inv_map, &code, &code2) == FAILURE) {
875-
if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start[0] == 'a'
876-
&& start[1] == 'p' && start[2] == 'o' && start[3] == 's') {
877-
/* uses html4 inv_map, which doesn't include apos;. This is a
878-
* hack to support it */
879-
code = (unsigned) '\'';
857+
if (current_ptr[1] == '#') {
858+
/* Processing numeric entity */
859+
const char *num_start = current_ptr + 2;
860+
entity_end_ptr = num_start;
861+
if (process_numeric_entity(&entity_end_ptr, &code) == FAILURE) {
862+
goto invalid_incomplete_entity;
863+
}
864+
if (!all && (code > 63U || stage3_table_be_apos_00000[code].data.ent.entity == NULL)) {
865+
/* If we're in htmlspecialchars_decode, we're only decoding entities
866+
* that represent &, <, >, " and '. Is this one of them? */
867+
goto invalid_incomplete_entity;
868+
} else if (!unicode_cp_is_allowed(code, doctype) ||
869+
(doctype == ENT_HTML_DOC_HTML5 && code == 0x0D)) {
870+
/* are we allowed to decode this entity in this document type?
871+
* HTML 5 is the only one that has a character that cannot be used in
872+
* a numeric entity but is allowed literally (U+000D). The
873+
* unoptimized version would be ... || !numeric_entity_is_allowed(code) */
874+
goto invalid_incomplete_entity;
875+
}
876+
} else {
877+
/* Processing named entity */
878+
const char *name_start = current_ptr + 1;
879+
/* Search for ';' */
880+
const size_t max_search_len = MIN(LONGEST_ENTITY_LENGTH + 1, input_end - name_start);
881+
const char *semi_colon_ptr = memchr(name_start, ';', max_search_len);
882+
if (!semi_colon_ptr) {
883+
goto invalid_incomplete_entity;
884+
} else {
885+
const size_t name_len = semi_colon_ptr - name_start;
886+
if (name_len == 0) {
887+
goto invalid_incomplete_entity;
880888
} else {
881-
goto invalid_code;
889+
if (resolve_named_entity_html(name_start, name_len, inv_map, &code, &code2) == FAILURE) {
890+
if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
891+
name_start[0] == 'a' && name_start[1] == 'p' &&
892+
name_start[2] == 'o' && name_start[3] == 's')
893+
{
894+
/* uses html4 inv_map, which doesn't include apos;. This is a
895+
* hack to support it */
896+
code = (unsigned)'\'';
897+
} else {
898+
goto invalid_incomplete_entity;
899+
}
900+
}
901+
entity_end_ptr = semi_colon_ptr;
882902
}
883903
}
884904
}
885905

886-
assert(*next == ';');
906+
/* At this stage the entity_end_ptr should be always set. */
907+
ZEND_ASSERT(entity_end_ptr != NULL);
887908

888-
if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
889-
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
890-
/* && code2 == '\0' always true for current maps */)
891-
goto invalid_code;
909+
/* Check if quotes are allowed for entities representing ' or " */
910+
if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE)) ||
911+
(code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE)))
912+
{
913+
goto invalid_complete_entity;
914+
}
892915

893916
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
894917
* the call is needed to ensure the codepoint <= U+00FF) */
895918
if (charset != cs_utf_8) {
896919
/* replace unicode code point */
897-
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0)
898-
goto invalid_code; /* not representable in target charset */
920+
if (map_from_unicode(code, charset, &code) == FAILURE || code2 != 0) {
921+
goto invalid_complete_entity;
922+
}
899923
}
900924

901-
q += write_octet_sequence((unsigned char*)q, charset, code);
925+
/* Write the parsed entity into the output buffer */
926+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code);
902927
if (code2) {
903-
q += write_octet_sequence((unsigned char*)q, charset, code2);
928+
output_ptr += write_octet_sequence((unsigned char*)output_ptr, charset, code2);
904929
}
930+
/* Move current_ptr past the semicolon */
931+
current_ptr = entity_end_ptr + 1;
932+
continue;
905933

906-
/* jump over the valid entity; may go beyond size of buffer; np */
907-
p = next + 1;
934+
invalid_incomplete_entity:
935+
/* If the entity is invalid at parse stage or entity_end_ptr was never found, copy '&' as normal */
936+
*output_ptr++ = *current_ptr++;
908937
continue;
909938

910-
invalid_code:
911-
for (; p < next; p++) {
912-
*(q++) = *p;
939+
invalid_complete_entity:
940+
/* If the entity became invalid after we found entity_end_ptr */
941+
if (entity_end_ptr) {
942+
const size_t len = entity_end_ptr - current_ptr;
943+
memcpy(output_ptr, current_ptr, len);
944+
output_ptr += len;
945+
current_ptr = entity_end_ptr;
946+
} else {
947+
*output_ptr++ = *current_ptr++;
913948
}
949+
continue;
914950
}
915951

916-
*q = '\0';
917-
ZSTR_LEN(ret) = (size_t)(q - ZSTR_VAL(ret));
952+
*output_ptr = '\0';
953+
ZSTR_LEN(output) = (size_t)(output_ptr - ZSTR_VAL(output));
918954
}
919955
/* }}} */
920956

@@ -999,7 +1035,7 @@ PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int fl
9991035
inverse_map = unescape_inverse_map(all, flags);
10001036

10011037
/* replace numeric entities */
1002-
traverse_for_entities(ZSTR_VAL(str), ZSTR_LEN(str), ret, all, flags, inverse_map, charset);
1038+
traverse_for_entities(str, ret, all, flags, inverse_map, charset);
10031039

10041040
return ret;
10051041
}

0 commit comments

Comments
 (0)