@@ -809,112 +809,148 @@ static inline size_t write_octet_sequence(unsigned char *buf, enum entity_charse
809
809
/* Size of the output buffer to allocate for an input of `oldlen` bytes:
 * the input length plus one fifth of it, plus 2 extra bytes -- one for the
 * "rest" (presumably the remainder dropped by the integer division; probably
 * unnecessary) and one for the terminating NUL.
 *
 * NOTE: there must be no whitespace between the macro name and its parameter
 * list, otherwise the preprocessor treats this as an object-like macro. */
#define TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(oldlen) ((oldlen) + (oldlen) / 5 + 2)
811
811
static void traverse_for_entities (
812
- const char * old ,
813
- size_t oldlen ,
814
- zend_string * ret , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
815
- int all ,
816
- int flags ,
812
+ const zend_string * input ,
813
+ zend_string * output , /* should have allocated TRAVERSE_FOR_ENTITIES_EXPAND_SIZE(olden) */
814
+ const int all ,
815
+ const int flags ,
817
816
const entity_ht * inv_map ,
818
- enum entity_charset charset )
817
+ const enum entity_charset charset )
819
818
{
820
- const char * p ,
821
- * lim ;
822
- char * q ;
823
- int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
824
-
825
- lim = old + oldlen ; /* terminator address */
826
- assert (* lim == '\0' );
827
-
828
- for (p = old , q = ZSTR_VAL (ret ); p < lim ;) {
829
- unsigned code , code2 = 0 ;
830
- const char * next = NULL ; /* when set, next > p, otherwise possible inf loop */
831
-
832
- /* Shift JIS, Big5 and HKSCS use multi-byte encodings where an
833
- * ASCII range byte can be part of a multi-byte sequence.
834
- * However, they start at 0x40, therefore if we find a 0x26 byte,
835
- * we're sure it represents the '&' character. */
819
+ const char * current_ptr = ZSTR_VAL (input );
820
+ const char * input_end = current_ptr + ZSTR_LEN (input ); /* terminator address */
821
+ char * output_ptr = ZSTR_VAL (output );
822
+ const int doctype = flags & ENT_HTML_DOC_TYPE_MASK ;
823
+
824
+ while (current_ptr < input_end ) {
825
+ const char * ampersand_ptr = memchr (current_ptr , '&' , input_end - current_ptr );
826
+ if (!ampersand_ptr ) {
827
+ const size_t tail_len = input_end - current_ptr ;
828
+ if (tail_len > 0 ) {
829
+ memcpy (output_ptr , current_ptr , tail_len );
830
+ output_ptr += tail_len ;
831
+ }
832
+ break ;
833
+ }
836
834
837
- /* assumes there are no single-char entities */
838
- if (p [0 ] != '&' || (p + 3 >= lim )) {
839
- * (q ++ ) = * (p ++ );
840
- continue ;
835
+ /* Copy everything up to the found '&' */
836
+ const size_t chunk_len = ampersand_ptr - current_ptr ;
837
+ if (chunk_len > 0 ) {
838
+ memcpy (output_ptr , current_ptr , chunk_len );
839
+ output_ptr += chunk_len ;
841
840
}
842
841
843
- /* now p[3] is surely valid and is no terminator */
844
-
845
- /* numerical entity */
846
- if (p [1 ] == '#' ) {
847
- next = & p [2 ];
848
- if (process_numeric_entity (& next , & code ) == FAILURE )
849
- goto invalid_code ;
850
-
851
- /* If we're in htmlspecialchars_decode, we're only decoding entities
852
- * that represent &, <, >, " and '. Is this one of them? */
853
- if (!all && (code > 63U ||
854
- stage3_table_be_apos_00000 [code ].data .ent .entity == NULL ))
855
- goto invalid_code ;
856
-
857
- /* are we allowed to decode this entity in this document type?
858
- * HTML 5 is the only that has a character that cannot be used in
859
- * a numeric entity but is allowed literally (U+000D). The
860
- * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
861
- if (!unicode_cp_is_allowed (code , doctype ) ||
862
- (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D ))
863
- goto invalid_code ;
864
- } else {
865
- const char * start ;
866
- size_t ent_len ;
842
+ /* Now current_ptr points to the '&' character. */
843
+ current_ptr = ampersand_ptr ;
867
844
868
- next = & p [1 ];
869
- start = next ;
845
+ /* If there are less than 4 bytes remaining, there isn't enough for an entity -
846
+ * copy '&' as a normal character. */
847
+ if (input_end - current_ptr < 4 ) {
848
+ const size_t remaining = input_end - current_ptr ;
849
+ memcpy (output_ptr , current_ptr , remaining );
850
+ output_ptr += remaining ;
851
+ break ;
852
+ }
870
853
871
- if ( process_named_entity_html ( & next , & start , & ent_len ) == FAILURE )
872
- goto invalid_code ;
854
+ unsigned code = 0 , code2 = 0 ;
855
+ const char * entity_end_ptr = NULL ;
873
856
874
- if (resolve_named_entity_html (start , ent_len , inv_map , & code , & code2 ) == FAILURE ) {
875
- if (doctype == ENT_HTML_DOC_XHTML && ent_len == 4 && start [0 ] == 'a'
876
- && start [1 ] == 'p' && start [2 ] == 'o' && start [3 ] == 's' ) {
877
- /* uses html4 inv_map, which doesn't include apos;. This is a
878
- * hack to support it */
879
- code = (unsigned ) '\'' ;
857
+ if (current_ptr [1 ] == '#' ) {
858
+ /* Processing numeric entity */
859
+ const char * num_start = current_ptr + 2 ;
860
+ entity_end_ptr = num_start ;
861
+ if (process_numeric_entity (& entity_end_ptr , & code ) == FAILURE ) {
862
+ goto invalid_incomplete_entity ;
863
+ }
864
+ if (!all && (code > 63U || stage3_table_be_apos_00000 [code ].data .ent .entity == NULL )) {
865
+ /* If we're in htmlspecialchars_decode, we're only decoding entities
866
+ * that represent &, <, >, " and '. Is this one of them? */
867
+ goto invalid_incomplete_entity ;
868
+ } else if (!unicode_cp_is_allowed (code , doctype ) ||
869
+ (doctype == ENT_HTML_DOC_HTML5 && code == 0x0D )) {
870
+ /* are we allowed to decode this entity in this document type?
871
+ * HTML 5 is the only that has a character that cannot be used in
872
+ * a numeric entity but is allowed literally (U+000D). The
873
+ * unoptimized version would be ... || !numeric_entity_is_allowed(code) */
874
+ goto invalid_incomplete_entity ;
875
+ }
876
+ } else {
877
+ /* Processing named entity */
878
+ const char * name_start = current_ptr + 1 ;
879
+ /* Search for ';' */
880
+ const size_t max_search_len = MIN (LONGEST_ENTITY_LENGTH + 1 , input_end - name_start );
881
+ const char * semi_colon_ptr = memchr (name_start , ';' , max_search_len );
882
+ if (!semi_colon_ptr ) {
883
+ goto invalid_incomplete_entity ;
884
+ } else {
885
+ const size_t name_len = semi_colon_ptr - name_start ;
886
+ if (name_len == 0 ) {
887
+ goto invalid_incomplete_entity ;
880
888
} else {
881
- goto invalid_code ;
889
+ if (resolve_named_entity_html (name_start , name_len , inv_map , & code , & code2 ) == FAILURE ) {
890
+ if (doctype == ENT_HTML_DOC_XHTML && name_len == 4 &&
891
+ name_start [0 ] == 'a' && name_start [1 ] == 'p' &&
892
+ name_start [2 ] == 'o' && name_start [3 ] == 's' )
893
+ {
894
+ /* uses html4 inv_map, which doesn't include apos;. This is a
895
+ * hack to support it */
896
+ code = (unsigned )'\'' ;
897
+ } else {
898
+ goto invalid_incomplete_entity ;
899
+ }
900
+ }
901
+ entity_end_ptr = semi_colon_ptr ;
882
902
}
883
903
}
884
904
}
885
905
886
- assert (* next == ';' );
906
+ /* At this stage the entity_end_ptr should be always set. */
907
+ ZEND_ASSERT (entity_end_ptr != NULL );
887
908
888
- if (((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
889
- (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
890
- /* && code2 == '\0' always true for current maps */ )
891
- goto invalid_code ;
909
+ /* Check if quotes are allowed for entities representing ' or " */
910
+ if ((code == '\'' && !(flags & ENT_HTML_QUOTE_SINGLE )) ||
911
+ (code == '"' && !(flags & ENT_HTML_QUOTE_DOUBLE )))
912
+ {
913
+ goto invalid_complete_entity ;
914
+ }
892
915
893
916
/* UTF-8 doesn't need mapping (ISO-8859-1 doesn't either, but
894
917
* the call is needed to ensure the codepoint <= U+00FF) */
895
918
if (charset != cs_utf_8 ) {
896
919
/* replace unicode code point */
897
- if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 )
898
- goto invalid_code ; /* not representable in target charset */
920
+ if (map_from_unicode (code , charset , & code ) == FAILURE || code2 != 0 ) {
921
+ goto invalid_complete_entity ;
922
+ }
899
923
}
900
924
901
- q += write_octet_sequence ((unsigned char * )q , charset , code );
925
+ /* Write the parsed entity into the output buffer */
926
+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code );
902
927
if (code2 ) {
903
- q += write_octet_sequence ((unsigned char * )q , charset , code2 );
928
+ output_ptr += write_octet_sequence ((unsigned char * )output_ptr , charset , code2 );
904
929
}
930
+ /* Move current_ptr past the semicolon */
931
+ current_ptr = entity_end_ptr + 1 ;
932
+ continue ;
905
933
906
- /* jump over the valid entity; may go beyond size of buffer; np */
907
- p = next + 1 ;
934
+ invalid_incomplete_entity :
935
+ /* If the entity is invalid at parse stage or entity_end_ptr was never found, copy '&' as normal */
936
+ * output_ptr ++ = * current_ptr ++ ;
908
937
continue ;
909
938
910
- invalid_code :
911
- for (; p < next ; p ++ ) {
912
- * (q ++ ) = * p ;
939
+ invalid_complete_entity :
940
+ /* If the entity became invalid after we found entity_end_ptr */
941
+ if (entity_end_ptr ) {
942
+ const size_t len = entity_end_ptr - current_ptr ;
943
+ memcpy (output_ptr , current_ptr , len );
944
+ output_ptr += len ;
945
+ current_ptr = entity_end_ptr ;
946
+ } else {
947
+ * output_ptr ++ = * current_ptr ++ ;
913
948
}
949
+ continue ;
914
950
}
915
951
916
- * q = '\0' ;
917
- ZSTR_LEN (ret ) = (size_t )(q - ZSTR_VAL (ret ));
952
+ * output_ptr = '\0' ;
953
+ ZSTR_LEN (output ) = (size_t )(output_ptr - ZSTR_VAL (output ));
918
954
}
919
955
/* }}} */
920
956
@@ -999,7 +1035,7 @@ PHPAPI zend_string *php_unescape_html_entities(zend_string *str, int all, int fl
999
1035
inverse_map = unescape_inverse_map (all , flags );
1000
1036
1001
1037
/* replace numeric entities */
1002
- traverse_for_entities (ZSTR_VAL ( str ), ZSTR_LEN ( str ) , ret , all , flags , inverse_map , charset );
1038
+ traverse_for_entities (str , ret , all , flags , inverse_map , charset );
1003
1039
1004
1040
return ret ;
1005
1041
}
0 commit comments