Skip to content

Commit de6af10

Browse files
authored
BlockMarkupUrlProcessor: Iterate over the attributes using names, not indices (#181)
@dmsnell [noted](WordPress/wordpress-importer#202 (comment)): > the passing around of the index with the `-1` sentinel value is hard for me to follow. did you consider setting something like `$this->currently_examined_attribute_name = null;`? This PR switches from to using indices to using a list of names. > or is the issue reentrancy in all of these functions? Block markup parser is not re-entrant. We should never need to process block markup that blows up the memory so I didn't bother implementing re-entrancy there. # Testing instructions CI
1 parent 6553d59 commit de6af10

File tree

2 files changed

+63
-35
lines changed

2 files changed

+63
-35
lines changed

components/DataLiberation/BlockMarkup/class-blockmarkupurlprocessor.php

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,22 @@ class BlockMarkupUrlProcessor extends BlockMarkupProcessor {
2222
private $base_url_object;
2323
private $url_in_text_processor;
2424
private $url_in_text_node_updated;
25-
private $inspected_url_attribute_idx = - 1;
25+
26+
/**
27+
* The list of names of URL-related HTML attributes that may be available on
28+
* the current token. They will be inspected by next_url_attribute().
29+
*
30+
* Possible values:
31+
*
32+
* - null: We haven't inspected any attribute yet.
33+
* - array: The first element is the currently inspected attribute
34+
* and the rest of the list are elements yet to be inspected on
35+
* the upcoming next_url_attribute() call.
36+
* - empty array: We've already inspected all the URL-related attributes.
37+
*
38+
* @var array<string>|null
39+
*/
40+
private $inspecting_html_attributes;
2641

2742
public function __construct( $html, ?string $base_url_string = null ) {
2843
parent::__construct( $html );
@@ -50,10 +65,10 @@ public function get_parsed_url() {
5065
public function next_token(): bool {
5166
$this->get_updated_html();
5267

53-
$this->raw_url = null;
54-
$this->parsed_url = null;
55-
$this->inspected_url_attribute_idx = - 1;
56-
$this->url_in_text_processor = null;
68+
$this->raw_url = null;
69+
$this->parsed_url = null;
70+
$this->inspecting_html_attributes = null;
71+
$this->url_in_text_processor = null;
5772
// Do not reset url_in_text_node_updated – it's reset in get_updated_html() which.
5873
// is called in parent::next_token().
5974

@@ -116,39 +131,51 @@ private function next_url_in_text_node() {
116131

117132
private function next_url_attribute() {
118133
$tag = $this->get_tag();
119-
if (
120-
! array_key_exists( $tag, self::URL_ATTRIBUTES ) &&
121-
'INPUT' !== $tag // type=image => src,.
122-
) {
134+
135+
if ( ! array_key_exists( $tag, self::URL_ATTRIBUTES ) ) {
123136
return false;
124137
}
125138

126-
while ( ++$this->inspected_url_attribute_idx < count( self::URL_ATTRIBUTES[ $tag ] ) ) {
127-
$attr = self::URL_ATTRIBUTES[ $tag ][ $this->inspected_url_attribute_idx ];
128-
if ( false === $attr ) {
129-
return false;
130-
}
139+
if ( null === $this->inspecting_html_attributes ) {
140+
/**
141+
* Initialize the list on the first call to next_url_attribute()
142+
* for the current token. The last element is the attribute we'll
143+
* inspect in the while() loop below.
144+
*/
145+
$this->inspecting_html_attributes = self::URL_ATTRIBUTES[ $tag ];
146+
} else {
147+
/**
148+
* Forget the attribute we've inspected on the previous call to
149+
* next_url_attribute().
150+
*/
151+
array_pop( $this->inspecting_html_attributes );
152+
}
131153

154+
while ( count( $this->inspecting_html_attributes ) > 0 ) {
155+
$attr = $this->inspecting_html_attributes[ count( $this->inspecting_html_attributes ) - 1 ];
132156
$url_maybe = $this->get_attribute( $attr );
157+
if ( ! is_string( $url_maybe ) ) {
158+
array_pop( $this->inspecting_html_attributes );
159+
continue;
160+
}
161+
133162
/*
134163
* Use base URL to resolve known URI attributes as we are certain we're
135164
* dealing with URI values.
136165
* With a base URL, the string "plugins.php" in <a href="plugins.php"> will
137166
* be correctly recognized as a URL.
138167
* Without a base URL, this Processor would incorrectly skip it.
139168
*/
169+
$parsed_url = WPURL::parse( $url_maybe, $this->base_url_string );
140170

141-
if ( is_string( $url_maybe ) ) {
142-
$parsed_url = WPURL::parse( $url_maybe, $this->base_url_string );
143-
144-
if ( false === $parsed_url ) {
145-
return false;
146-
}
147-
$this->raw_url = $url_maybe;
148-
$this->parsed_url = $parsed_url;
149-
150-
return true;
171+
if ( false === $parsed_url ) {
172+
array_pop( $this->inspecting_html_attributes );
173+
continue;
151174
}
175+
$this->raw_url = $url_maybe;
176+
$this->parsed_url = $parsed_url;
177+
178+
return true;
152179
}
153180

154181
return false;
@@ -324,19 +351,15 @@ public function get_inspected_attribute_name() {
324351
return false;
325352
}
326353

327-
$tag = $this->get_tag();
328-
if ( ! array_key_exists( $tag, self::URL_ATTRIBUTES ) ) {
354+
if ( null === $this->inspecting_html_attributes ) {
329355
return false;
330356
}
331357

332-
if (
333-
$this->inspected_url_attribute_idx < 0 ||
334-
$this->inspected_url_attribute_idx >= count( self::URL_ATTRIBUTES[ $tag ] )
335-
) {
358+
if ( empty( $this->inspecting_html_attributes ) ) {
336359
return false;
337360
}
338361

339-
return self::URL_ATTRIBUTES[ $tag ][ $this->inspected_url_attribute_idx ];
362+
return $this->inspecting_html_attributes[ count( $this->inspecting_html_attributes ) - 1 ];
340363
}
341364

342365

components/DataLiberation/Tests/BlockMarkupUrlProcessorTest.php

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ public static function provider_test_finds_next_url() {
7474
'<a href=""></a><a href="https://developer.w.org"></a>',
7575
null,
7676
),
77+
'Skips over a class name in the <a> tag' => array(
78+
'https://developer.w.org',
79+
'<a class="http://example.com" href="https://developer.w.org"></a>',
80+
null,
81+
),
7782
);
7883
}
7984

@@ -120,21 +125,21 @@ public function test_next_url_finds_urls_in_multiple_attributes() {
120125
$markup = '<img longdesc="https://first-url.org" src="https://mysite.com/wp-content/image.png">';
121126
$p = new BlockMarkupUrlProcessor( $markup );
122127
$this->assertTrue( $p->next_url(), 'Failed to find the URL in the markup.' );
123-
$this->assertEquals( 'https://first-url.org', $p->get_raw_url(), 'Found a URL in the markup, but it wasn\'t the expected one.' );
128+
$this->assertEquals( 'https://mysite.com/wp-content/image.png', $p->get_raw_url(), 'Found a URL in the markup, but it wasn\'t the expected one.' );
124129

125130
$this->assertTrue( $p->next_url(), 'Failed to find the URL in the markup.' );
126-
$this->assertEquals( 'https://mysite.com/wp-content/image.png', $p->get_raw_url(),
131+
$this->assertEquals( 'https://first-url.org', $p->get_raw_url(),
127132
'Found a URL in the markup, but it wasn\'t the expected one.' );
128133
}
129134

130135
public function test_next_url_finds_urls_in_multiple_tags() {
131136
$markup = '<img longdesc="https://first-url.org" src="https://mysite.com/wp-content/image.png"><a href="https://third-url.org">';
132137
$p = new BlockMarkupUrlProcessor( $markup );
133138
$this->assertTrue( $p->next_url(), 'Failed to find the URL in the markup.' );
134-
$this->assertEquals( 'https://first-url.org', $p->get_raw_url(), 'Found a URL in the markup, but it wasn\'t the expected one.' );
139+
$this->assertEquals( 'https://mysite.com/wp-content/image.png', $p->get_raw_url(), 'Found a URL in the markup, but it wasn\'t the expected one.' );
135140

136141
$this->assertTrue( $p->next_url(), 'Failed to find the URL in the markup.' );
137-
$this->assertEquals( 'https://mysite.com/wp-content/image.png', $p->get_raw_url(),
142+
$this->assertEquals( 'https://first-url.org', $p->get_raw_url(),
138143
'Found a URL in the markup, but it wasn\'t the expected one.' );
139144

140145
$this->assertTrue( $p->next_url(), 'Failed to find the URL in the markup.' );

0 commit comments

Comments
 (0)