diff --git a/.github/workflows/native-apis.yml b/.github/workflows/native-apis.yml new file mode 100644 index 000000000..c49c141f5 --- /dev/null +++ b/.github/workflows/native-apis.yml @@ -0,0 +1,52 @@ +name: Native APIs + +on: + push: + branches: + - trunk + pull_request: + paths: + - '.github/workflows/native-apis.yml' + - 'components/DataLiberation/URL/**' + - 'components/HTML/**' + - 'components/XML/**' + - 'extensions/native-apis/**' + +jobs: + build-and-verify: + name: Build and verify PHP extension + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup PHP + uses: shivammathur/setup-php@v2 + with: + php-version: '8.3' + extensions: mbstring, json + coverage: none + tools: composer:v2 + + - name: Setup Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install native build dependencies + run: | + sudo apt-get update + sudo apt-get install -y clang libclang-dev + php-config --version + + - name: Install Composer dependencies + run: composer install --prefer-dist --no-progress --no-suggest + + - name: Run Rust tests + working-directory: extensions/native-apis + run: cargo test + + - name: Build native extension + run: extensions/native-apis/build-extension.sh + + - name: Verify native extension + run: php -d extension=extensions/native-apis/target/release/libwp_native_apis.so extensions/native-apis/tests/verify-native-apis.php diff --git a/extensions/native-apis/build-extension.sh b/extensions/native-apis/build-extension.sh index 9dcfaf945..d14007a40 100755 --- a/extensions/native-apis/build-extension.sh +++ b/extensions/native-apis/build-extension.sh @@ -25,7 +25,8 @@ if ! command -v clang >/dev/null 2>&1 && [ -z "${LIBCLANG_PATH:-}" ]; then exit 1 fi -export PHP_CONFIG="${php_config}" +PHP_CONFIG="$(command -v "${php_config}")" +export PHP_CONFIG cargo build --release --features php-extension diff --git a/extensions/native-apis/src/html.rs b/extensions/native-apis/src/html.rs index 5e27173c3..63f79926e 100644 --- a/extensions/native-apis/src/html.rs +++ b/extensions/native-apis/src/html.rs @@ -8,7 +8,7 @@ use ext_php_rs::prelude::*; #[cfg(feature = "php-extension")] use ext_php_rs::{ boxed::ZBox, - types::{ZendCallable, ZendHashTable, Zval}, + types::{ZendHashTable, Zval}, zend::Function, }; @@ -82,6 +82,15 @@ enum HtmlAttributeValue { String(String), } +#[derive(Default)] +struct HtmlNextTagQuery { + tag_name: Option, + class_name: Option, + match_offset: i64, + visit_closers: bool, + breadcrumbs: Option>, +} + #[cfg(feature = "php-extension")] impl WpHtmlNativeTagProcessor { fn get_attribute_value(&self, name: &str) -> Option { @@ -169,8 +178,27 @@ impl WpHtmlNativeTagProcessor { } } - pub fn next_tag(&mut self) -> bool { - self.next_tag_any(false, 1) + pub fn supports_public_api() -> bool { + true + } + + #[php(optional = query)] + pub fn next_tag(&mut self, query: Option<&Zval>) -> bool { + let query = html_parse_tag_processor_next_tag_query(query); + let mut match_offset = query.match_offset.max(1); + + while self.advance_to_next_tag_token() { + if !self.current_html_tag_matches_query(&query, false) { + continue; + } + + match_offset -= 1; + if match_offset == 0 { + return true; + } + } + + false } pub fn next_tag_any(&mut self, visit_closers: bool, mut match_offset: i64) -> bool { @@ -2436,6 +2464,44 @@ impl WpHtmlNativeTagProcessor { fn current_tag(&self) -> Option<&HtmlTag> { self.current.as_ref() } + + fn current_html_tag_matches_query( + &self, + query: &HtmlNextTagQuery, + use_breadcrumbs: bool, + ) -> bool { + let Some(tag) = self.current_tag() else { + return false; + }; + + if tag.token_type != "#tag" { + return false; + } + + if tag.closing && (!query.visit_closers || use_breadcrumbs) { + return false; + } + + if let Some(tag_name) = query.tag_name.as_ref() { + if !tag.name.eq_ignore_ascii_case(tag_name) { + return false; + } + } + + if let Some(class_name) = query.class_name.as_ref() { + if self.has_class(class_name.clone()) != Some(true) { + return false; + } + } + + if use_breadcrumbs { + if let Some(breadcrumbs) = query.breadcrumbs.as_ref() { + return html_breadcrumbs_match(&tag.breadcrumbs, breadcrumbs); + } + } + + true + } } #[cfg(feature = "php-extension")] @@ -2449,12 +2515,27 @@ pub struct WpHtmlNativeProcessor { #[php_impl] #[php(change_method_case = "snake_case")] impl WpHtmlNativeProcessor { - pub fn create_fragment(html: String) -> Self { + pub fn supports_public_api() -> bool { + true + } + + #[php(optional = context)] + pub fn create_fragment( + html: String, + context: Option, + encoding: Option, + ) -> Option { + if context.as_deref().unwrap_or("") != "" + || encoding.as_deref().unwrap_or("UTF-8") != "UTF-8" + { + return None; + } + let mut inner = WpHtmlNativeTagProcessor::__construct(html); inner.synthesize_implied_closers = true; inner.ignore_html_body_starts = true; - Self { inner } + Some(Self { inner }) } #[php(optional = known_definite_encoding)] @@ -2473,7 +2554,7 @@ impl WpHtmlNativeProcessor { } pub fn normalize(html: String) -> Option { - html_normalize_via_php(&html) + html_serialize_native_fragment(&html) } pub fn serialize(&mut self) -> Option { @@ -2481,7 +2562,7 @@ impl WpHtmlNativeProcessor { return None; } - let serialized = html_normalize_via_php(&self.inner.html)?; + let serialized = html_serialize_native_fragment(&self.inner.html)?; self.inner.offset = self.inner.html.len(); Some(serialized) } @@ -2570,8 +2651,29 @@ impl WpHtmlNativeProcessor { rows } - pub fn next_tag(&mut self) -> bool { - self.inner.next_tag() + #[php(optional = query)] + pub fn next_tag(&mut self, query: Option<&Zval>) -> bool { + let Some(query) = html_parse_processor_next_tag_query(query) else { + return false; + }; + let use_breadcrumbs = query.breadcrumbs.is_some(); + let mut match_offset = query.match_offset.max(1); + + while self.next_token() { + if !self + .inner + .current_html_tag_matches_query(&query, use_breadcrumbs) + { + continue; + } + + match_offset -= 1; + if match_offset == 0 { + return true; + } + } + + false } pub fn next_tag_summary_batch( @@ -3020,6 +3122,243 @@ fn token_metadata(tag: &HtmlTag) -> String { metadata } +#[cfg(feature = "php-extension")] +fn html_parse_tag_processor_next_tag_query(query: Option<&Zval>) -> HtmlNextTagQuery { + let mut parsed = HtmlNextTagQuery { + match_offset: 1, + ..Default::default() + }; + + let Some(query) = query else { + return parsed; + }; + + if let Some(tag_name) = query.str() { + parsed.tag_name = Some(tag_name.to_string()); + return parsed; + } + + let Some(query) = query.array() else { + return parsed; + }; + + if let Some(tag_name) = html_array_string(query, "tag_name") { + parsed.tag_name = Some(tag_name); + } + + if let Some(class_name) = html_array_string(query, "class_name") { + parsed.class_name = Some(class_name); + } + + if let Some(match_offset) = html_array_positive_i64(query, "match_offset") { + parsed.match_offset = match_offset; + } + + parsed.visit_closers = html_array_string(query, "tag_closers").as_deref() == Some("visit"); + + parsed +} + +#[cfg(feature = "php-extension")] +fn html_parse_processor_next_tag_query(query: Option<&Zval>) -> Option { + let mut parsed = HtmlNextTagQuery { + match_offset: 1, + ..Default::default() + }; + + let Some(query) = query else { + return Some(parsed); + }; + + if let Some(tag_name) = query.str() { + parsed.breadcrumbs = Some(vec![tag_name.to_string()]); + return Some(parsed); + } + + let query = query.array()?; + + if let Some(tag_name) = html_array_string(query, "tag_name") { + parsed.tag_name = Some(tag_name); + } + + if let Some(class_name) = html_array_string(query, "class_name") { + parsed.class_name = Some(class_name); + } + + if let Some(match_offset) = html_array_positive_i64(query, "match_offset") { + parsed.match_offset = match_offset; + } + + parsed.visit_closers = html_array_string(query, "tag_closers").as_deref() == Some("visit"); + parsed.breadcrumbs = html_array_string_breadcrumbs(query, "breadcrumbs"); + + Some(parsed) +} + +#[cfg(feature = "php-extension")] +fn html_array_string(query: &ZendHashTable, key: &str) -> Option { + query + .get(key) + .and_then(|value| value.str()) + .map(str::to_string) +} + +#[cfg(feature = "php-extension")] +fn html_array_positive_i64(query: &ZendHashTable, key: &str) -> Option { + query + .get(key) + .and_then(|value| value.long()) + .filter(|value| *value > 0) + .map(|value| value as i64) +} + +#[cfg(feature = "php-extension")] +fn html_array_string_breadcrumbs(query: &ZendHashTable, key: &str) -> Option> { + let breadcrumbs = query.get(key)?.array()?; + let mut parsed = Vec::with_capacity(breadcrumbs.len()); + + for value in breadcrumbs.iter().map(|(_, value)| value) { + parsed.push(value.str()?.to_string()); + } + + Some(parsed) +} + +fn html_breadcrumbs_match(current: &[String], breadcrumbs: &[String]) -> bool { + if breadcrumbs.is_empty() { + return true; + } + + if breadcrumbs.len() > current.len() { + return false; + } + + let offset = current.len() - breadcrumbs.len(); + for (index, breadcrumb) in breadcrumbs.iter().enumerate() { + let crumb = breadcrumb.to_ascii_uppercase(); + let node = ¤t[offset + index]; + if crumb != "*" && node != &crumb { + return false; + } + } + + true +} + +#[cfg(feature = "php-extension")] +fn html_serialize_native_fragment(html: &str) -> Option { + let mut processor = WpHtmlNativeProcessor::create_fragment( + html.to_string(), + Some("".to_string()), + Some("UTF-8".to_string()), + )?; + let mut serialized = String::with_capacity(html.len()); + + while processor.next_token() { + let Some(tag) = processor.inner.current_tag() else { + continue; + }; + + serialized.push_str(&html_serialize_token(html, tag)); + } + + Some(serialized) +} + +fn html_serialize_token(html: &str, tag: &HtmlTag) -> String { + match tag.token_type.as_str() { + "#tag" if tag.closing => format!("", tag.name), + "#tag" => html_serialize_opening_tag(html, tag), + "#comment" => format!("", tag.text), + "#text" => html_escape_text(&tag.text), + _ => String::new(), + } +} + +fn html_serialize_opening_tag(html: &str, tag: &HtmlTag) -> String { + let mut output = String::new(); + output.push('<'); + output.push_str(&tag.name); + + for (attribute_name, value) in + html_source_attribute_items(html.as_bytes(), tag.source_start, tag.source_end) + { + output.push(' '); + output.push_str(&attribute_name); + + if let Some(value) = value { + output.push_str("=\""); + output.push_str(&html_escape_attribute_value(&value)); + output.push('"'); + } + } + + output.push('>'); + output +} + +fn html_source_attribute_items( + bytes: &[u8], + source_start: usize, + source_end: usize, +) -> Vec<(String, Option)> { + if source_start >= bytes.len() || source_end <= source_start { + return Vec::new(); + } + + let tag_end = source_end.saturating_sub(1).min(bytes.len()); + let mut cursor = source_start.saturating_add(1); + if cursor < tag_end && bytes[cursor] == b'/' { + return Vec::new(); + } + + cursor = skip_ascii_whitespace(bytes, cursor); + cursor = span_name(bytes, cursor); + + let mut items = Vec::new(); + let mut seen = Vec::new(); + while cursor < tag_end { + cursor = skip_ascii_whitespace(bytes, cursor); + if cursor >= tag_end || bytes[cursor] == b'>' { + break; + } + if bytes[cursor] == b'/' && cursor + 1 < bytes.len() && bytes[cursor + 1] == b'>' { + break; + } + + let attr_start = cursor; + cursor = span_html_attribute_name(bytes, cursor); + if cursor == attr_start { + cursor += 1; + continue; + } + + let attr_name = ascii_lower(&bytes[attr_start..cursor]); + cursor = skip_ascii_whitespace(bytes, cursor); + + let mut value = None; + if cursor < tag_end && bytes[cursor] == b'=' { + cursor += 1; + cursor = skip_ascii_whitespace(bytes, cursor); + let parsed = parse_attribute_value(bytes, cursor); + value = Some(parsed.0); + cursor = parsed.1; + } + + if seen + .iter() + .any(|seen_name: &String| seen_name == &attr_name) + { + continue; + } + + seen.push(attr_name.clone()); + items.push((attr_name, value)); + } + + items +} + #[cfg(feature = "php-extension")] fn html_tag_public_summary_row(tag: &HtmlTag) -> Vec<(String, Zval)> { vec![ @@ -3204,13 +3543,6 @@ fn html_doctype_info_zval(html: &str, tag: &HtmlTag) -> Option { } } -#[cfg(feature = "php-extension")] -fn html_normalize_via_php(html: &str) -> Option { - let callable = ZendCallable::try_from_name("WP_HTML_Processor::normalize").ok()?; - let value = callable.try_call(vec![&html]).ok()?; - value.string() -} - fn html_token_compact_summary(tag: &HtmlTag) -> String { let token_kind = match tag.token_type.as_str() { "#tag" => "t", @@ -5910,9 +6242,10 @@ mod tests { use super::{ apply_html_text_removals, find_html_attribute_names_with_prefix_count, find_html_attribute_names_with_prefix_string, find_html_attribute_removals, - find_html_attribute_removals_with_prefix, html_tag_has_self_closing_flag, - html_token_compact_summary, initial_html_breadcrumbs, parse_html_tags, - parse_next_html_token, parse_next_plain_html_tag_token, HtmlTextRemoval, + find_html_attribute_removals_with_prefix, html_serialize_opening_tag, + html_source_attribute_items, html_tag_has_self_closing_flag, html_token_compact_summary, + initial_html_breadcrumbs, parse_html_tags, parse_next_html_token, + parse_next_plain_html_tag_token, HtmlTextRemoval, }; fn collect_processor_tokens(html: &str) -> Vec<(String, bool, String, String)> { @@ -5990,6 +6323,25 @@ mod tests { ); } + #[test] + fn serializes_opening_tag_attributes_from_source() { + let html = "One"; + let tags = parse_html_tags(html); + + assert_eq!( + vec![ + ("href".to_string(), Some("#anchor".to_string())), + ("v".to_string(), Some("5".to_string())), + ("enabled".to_string(), None), + ], + html_source_attribute_items(html.as_bytes(), tags[0].source_start, tags[0].source_end) + ); + assert_eq!( + "", + html_serialize_opening_tag(html, &tags[0]) + ); + } + #[test] fn parses_plain_html_tags_without_decoding_discarded_tokens() { let html = diff --git a/extensions/native-apis/src/url_text.rs b/extensions/native-apis/src/url_text.rs index cc805d9c1..1ba04d223 100644 --- a/extensions/native-apis/src/url_text.rs +++ b/extensions/native-apis/src/url_text.rs @@ -1,14 +1,19 @@ #![cfg_attr(not(feature = "php-extension"), allow(dead_code))] #[cfg(feature = "php-extension")] -use ext_php_rs::prelude::*; +use ext_php_rs::{ + prelude::*, + types::{ZendCallable, Zval}, +}; #[derive(Clone, Debug, PartialEq, Eq)] pub struct UrlTextCandidate { pub raw_url: String, + pub preprocessed_url: String, pub starts_at: usize, pub length: usize, pub had_protocol: bool, + pub did_prepend_protocol: bool, } #[cfg(feature = "php-extension")] @@ -19,6 +24,9 @@ pub struct NativeUrlInTextProcessor { bytes_already_parsed: usize, current: Option, replacements: Vec, + validate_urls: bool, + base_url: Option, + base_protocol: Option, } #[derive(Clone, Debug)] @@ -32,26 +40,53 @@ struct UrlTextReplacement { #[php_impl] #[php(change_method_case = "snake_case")] impl NativeUrlInTextProcessor { - pub fn __construct(text: String) -> Self { + #[php(optional = base_url)] + pub fn __construct(text: String, base_url: Option) -> Self { + let base_protocol = base_url.as_deref().and_then(parse_url_scheme); + Self { text, bytes_already_parsed: 0, current: None, replacements: Vec::new(), + validate_urls: true, + base_url, + base_protocol, } } + pub fn supports_public_api() -> bool { + true + } + + pub fn use_url_validation(&mut self) { + self.validate_urls = true; + } + + pub fn set_base_url(&mut self, base_url: String) { + self.base_protocol = parse_url_scheme(&base_url); + self.base_url = Some(base_url); + } + pub fn next_url(&mut self) -> bool { self.current = None; - let Some(candidate) = find_next_url_text_candidate(&self.text, self.bytes_already_parsed) - else { - return false; - }; + while let Some(mut candidate) = + find_next_url_text_candidate(&self.text, self.bytes_already_parsed) + { + self.bytes_already_parsed = candidate.starts_at + candidate.length; - self.bytes_already_parsed = candidate.starts_at + candidate.length; - self.current = Some(candidate); - true + if self.validate_urls + && !validate_url_text_candidate(&mut candidate, self.base_protocol.as_deref()) + { + continue; + } + + self.current = Some(candidate); + return true; + } + + false } pub fn get_raw_url(&self) -> Option { @@ -60,6 +95,34 @@ impl NativeUrlInTextProcessor { .map(|candidate| candidate.raw_url.clone()) } + pub fn get_preprocessed_url(&self) -> Option { + self.current + .as_ref() + .map(|candidate| candidate.preprocessed_url.clone()) + } + + pub fn get_parsed_url(&self) -> Zval { + let Some(candidate) = self.current.as_ref() else { + return url_zval_bool(false); + }; + + let Ok(callable) = + ZendCallable::try_from_name("WordPress\\DataLiberation\\URL\\WPURL::parse") + else { + return url_zval_bool(false); + }; + + let result = match self.base_url.as_ref() { + Some(base_url) => callable.try_call(vec![&candidate.preprocessed_url, base_url]), + None => callable.try_call(vec![&candidate.preprocessed_url]), + }; + + match result { + Ok(value) if !value.is_false() && !value.is_null() => value, + _ => url_zval_bool(false), + } + } + pub fn get_url_starts_at(&self) -> Option { self.current .as_ref() @@ -78,16 +141,31 @@ impl NativeUrlInTextProcessor { .map(|candidate| candidate.had_protocol) } + pub fn did_prepend_protocol(&self) -> Option { + self.current + .as_ref() + .map(|candidate| candidate.did_prepend_protocol) + } + pub fn set_raw_url(&mut self, new_url: String) -> bool { let Some(candidate) = self.current.as_mut() else { return false; }; - self.replacements.push(UrlTextReplacement { - start: candidate.starts_at, - length: candidate.length, - text: new_url.clone(), - }); + if let Some(replacement) = self + .replacements + .iter_mut() + .find(|replacement| replacement.start == candidate.starts_at) + { + replacement.length = candidate.length; + replacement.text = new_url.clone(); + } else { + self.replacements.push(UrlTextReplacement { + start: candidate.starts_at, + length: candidate.length, + text: new_url.clone(), + }); + } candidate.raw_url = new_url; true } @@ -208,12 +286,174 @@ fn parse_url_text_candidate_at(text: &str, start: usize) -> Option, +) -> bool { + let mut preprocessed_url = candidate.raw_url.clone(); + if !candidate.had_protocol { + let Some(protocol) = base_protocol else { + return false; + }; + + if !is_http_or_https_scheme(protocol) { + return false; + } + + preprocessed_url = format!("{protocol}://{}", candidate.raw_url); + candidate.did_prepend_protocol = true; + } else if preprocessed_url.starts_with("//") { + let Some(protocol) = base_protocol else { + return false; + }; + + if !is_http_or_https_scheme(protocol) { + return false; + } + } else if !starts_with_http_or_https_scheme(&preprocessed_url) { + return false; + } + + if has_authority_auth_details(&preprocessed_url) { + return false; + } + + if has_invalid_authority_port(&preprocessed_url) { + return false; + } + + if !candidate.had_protocol { + let Some(hostname) = candidate_hostname(&candidate.raw_url) else { + return false; + }; + + let Some(last_dot) = hostname.rfind('.') else { + return false; + }; + + if !is_known_public_domain(&hostname[last_dot + 1..]) { + return false; + } + } + + candidate.preprocessed_url = preprocessed_url; + true +} + +#[cfg(feature = "php-extension")] +fn url_zval_bool(value: bool) -> Zval { + let mut zval = Zval::new(); + zval.set_bool(value); + zval +} + +fn parse_url_scheme(url: &str) -> Option { + let colon = url.find(':')?; + let first_delimiter = url + .find(|character| matches!(character, '/' | '?' | '#')) + .unwrap_or(url.len()); + if colon > first_delimiter { + return None; + } + + Some(url[..colon].to_ascii_lowercase()) +} + +fn is_http_or_https_scheme(scheme: &str) -> bool { + scheme.eq_ignore_ascii_case("http") || scheme.eq_ignore_ascii_case("https") +} + +fn starts_with_http_or_https_scheme(url: &str) -> bool { + ascii_starts_with(url.as_bytes(), 0, b"http:") + || ascii_starts_with(url.as_bytes(), 0, b"https:") +} + +fn authority_range(url: &str) -> Option<(usize, usize)> { + let bytes = url.as_bytes(); + let authority_start = if bytes.starts_with(b"//") { + 2 + } else if ascii_starts_with(bytes, 0, b"http://") { + 7 + } else if ascii_starts_with(bytes, 0, b"https://") { + 8 + } else { + return None; + }; + + let authority_end = bytes[authority_start..] + .iter() + .position(|byte| matches!(*byte, b'/' | b'?' | b'#')) + .map(|offset| authority_start + offset) + .unwrap_or(bytes.len()); + + Some((authority_start, authority_end)) +} + +fn has_authority_auth_details(url: &str) -> bool { + let Some((start, end)) = authority_range(url) else { + return false; + }; + + url.as_bytes()[start..end].contains(&b'@') +} + +fn has_invalid_authority_port(url: &str) -> bool { + let Some((start, end)) = authority_range(url) else { + return false; + }; + + let authority = &url[start..end]; + if authority.starts_with('[') { + return authority.find(']').is_none(); + } + + let Some(colon) = authority.rfind(':') else { + return false; + }; + + let port = &authority[colon + 1..]; + !port.is_empty() + && port.bytes().all(|byte| byte.is_ascii_digit()) + && port.parse::().is_err() +} + +fn candidate_hostname(raw_url: &str) -> Option<&str> { + let bytes = raw_url.as_bytes(); + let mut start = 0; + if bytes.starts_with(b"//") { + start = 2; + } else if ascii_starts_with(bytes, 0, b"http:") { + start = 5; + while start < bytes.len() && bytes[start] == b'/' { + start += 1; + } + } else if ascii_starts_with(bytes, 0, b"https:") { + start = 6; + while start < bytes.len() && bytes[start] == b'/' { + start += 1; + } + } + + let end = bytes[start..] + .iter() + .position(|byte| !is_hostish_byte(*byte)) + .map(|offset| start + offset) + .unwrap_or(bytes.len()); + if end <= start { + return None; + } + + Some(&raw_url[start..end]) +} + fn find_candidate_end(bytes: &[u8], mut cursor: usize) -> usize { while cursor < bytes.len() { let byte = bytes[cursor]; @@ -288,10 +528,29 @@ fn candidate_host_has_url_shape(host: &str) -> bool { let tld = &host[last_dot + 1..]; tld.len() >= 2 && tld.len() <= 63 - && tld.bytes().all(|byte| byte.is_ascii_alphanumeric()) + && tld + .bytes() + .all(|byte| byte.is_ascii_alphanumeric() || byte == b'-') && host.split('.').all(is_valid_hostname_label) } +fn is_known_public_domain(tld: &str) -> bool { + if tld.eq_ignore_ascii_case("internal") { + return true; + } + + if tld.is_empty() + || !tld + .bytes() + .all(|byte| byte.is_ascii_alphanumeric() || byte == b'-') + { + return false; + } + + let needle = format!("'{}'", tld.to_ascii_lowercase()); + include_str!("../../../components/DataLiberation/URL/public-suffix-list.php").contains(&needle) +} + fn is_valid_hostname_label(label: &str) -> bool { let bytes = label.as_bytes(); !bytes.is_empty() @@ -319,7 +578,7 @@ fn ascii_starts_with(bytes: &[u8], offset: usize, needle: &[u8]) -> bool { #[cfg(test)] mod tests { - use super::find_next_url_text_candidate; + use super::{find_next_url_text_candidate, validate_url_text_candidate, UrlTextCandidate}; #[test] fn finds_http_https_and_bare_domain_candidates() { @@ -352,4 +611,44 @@ mod tests { fn ignores_embedded_protocol_fragments() { assert!(find_next_url_text_candidate("ahttp://example.com", 0).is_none()); } + + #[test] + fn accepts_punycode_tlds() { + let text = "Visit http://xn--fsqu00a.xn--0zwm56d"; + let candidate = find_next_url_text_candidate(text, 0).expect("URL"); + assert_eq!("http://xn--fsqu00a.xn--0zwm56d", candidate.raw_url); + } + + #[test] + fn validates_public_url_candidates_with_base_protocol() { + let mut candidate = find_next_url_text_candidate("Visit example.com/docs", 0).expect("URL"); + assert!(validate_url_text_candidate(&mut candidate, Some("https"))); + assert_eq!("https://example.com/docs", candidate.preprocessed_url); + assert!(candidate.did_prepend_protocol); + } + + #[test] + fn rejects_filename_like_bare_domains_with_unknown_tlds() { + let mut candidate = find_next_url_text_candidate("Edit plugins.php", 0).expect("candidate"); + assert!(!validate_url_text_candidate(&mut candidate, Some("https"))); + } + + #[test] + fn rejects_authority_credentials() { + let mut candidate = UrlTextCandidate { + raw_url: "https://user@example.com/path".to_string(), + preprocessed_url: "https://user@example.com/path".to_string(), + starts_at: 6, + length: 29, + had_protocol: true, + did_prepend_protocol: false, + }; + assert!(!validate_url_text_candidate(&mut candidate, Some("https"))); + } + + #[test] + fn rejects_bare_domains_without_base_protocol() { + let mut candidate = find_next_url_text_candidate("Visit example.com", 0).expect("URL"); + assert!(!validate_url_text_candidate(&mut candidate, None)); + } } diff --git a/extensions/native-apis/src/xml.rs b/extensions/native-apis/src/xml.rs index 29d11a16f..eae7bca7a 100644 --- a/extensions/native-apis/src/xml.rs +++ b/extensions/native-apis/src/xml.rs @@ -7,7 +7,7 @@ use std::rc::Rc; #[cfg(feature = "php-extension")] use ext_php_rs::prelude::*; #[cfg(feature = "php-extension")] -use ext_php_rs::types::Zval; +use ext_php_rs::types::{ZendHashTable, Zval}; #[derive(Clone, Debug, PartialEq, Eq)] pub struct XmlToken { @@ -81,6 +81,12 @@ struct XmlBookmark { pending_stream_error: Option, } +#[derive(Default)] +struct XmlNextTagQuery { + breadcrumbs: Option>, + match_offset: i64, +} + #[cfg(feature = "php-extension")] fn encode_xml_native_cursor(state: &XmlStreamState) -> String { format!( @@ -289,7 +295,7 @@ pub struct NativeXmlProcessor { #[php_impl] #[php(change_method_case = "snake_case")] impl NativeXmlProcessor { - pub fn create_from_string(xml: String) -> Self { + fn new_from_string(xml: String) -> Self { Self { source: xml, document: None, @@ -307,6 +313,27 @@ impl NativeXmlProcessor { } } + pub fn supports_public_api() -> bool { + true + } + + #[php(optional = cursor)] + pub fn create_from_string( + xml: String, + cursor: Option, + known_definite_encoding: Option, + document_namespaces: Option<&Zval>, + ) -> Option { + if cursor.is_some() + || known_definite_encoding.as_deref().unwrap_or("UTF-8") != "UTF-8" + || document_namespaces.is_some() + { + return None; + } + + Some(Self::new_from_string(xml)) + } + pub fn create_for_streaming( xml: String, cursor: Option, @@ -329,7 +356,7 @@ impl NativeXmlProcessor { None }; - let mut processor = Self::create_from_string(xml); + let mut processor = Self::new_from_string(xml); processor.stream_reentrancy_base_state = Some(stream.clone().unwrap_or_else(XmlStreamState::new)); processor.stream = stream; @@ -1473,9 +1500,35 @@ impl NativeXmlProcessor { xml_token_stream_summary(&summary) } - pub fn next_tag(&mut self) -> bool { + #[php(optional = query_or_ns)] + pub fn next_tag( + &mut self, + query_or_ns: Option<&Zval>, + null_or_local_name: Option, + ) -> bool { + let Some(query) = xml_parse_next_tag_query(query_or_ns, null_or_local_name) else { + return false; + }; + + let mut match_offset = query.match_offset.max(1); + while self.next_token() { - if self.get_token_type().as_deref() == Some("#tag") && !self.is_tag_closer() { + if self.get_token_type().as_deref() != Some("#tag") || self.is_tag_closer() { + continue; + } + + if let Some(breadcrumbs) = query.breadcrumbs.as_ref() { + let Some(current_breadcrumbs) = self.current_token_breadcrumbs() else { + continue; + }; + + if !xml_namespaced_breadcrumbs_match(¤t_breadcrumbs, breadcrumbs) { + continue; + } + } + + match_offset -= 1; + if match_offset == 0 { return true; } } @@ -3333,6 +3386,114 @@ impl NativeXmlProcessor { } } +#[cfg(feature = "php-extension")] +fn xml_parse_next_tag_query( + query_or_ns: Option<&Zval>, + null_or_local_name: Option, +) -> Option { + let mut parsed = XmlNextTagQuery { + match_offset: 1, + ..Default::default() + }; + + let Some(query_or_ns) = query_or_ns else { + return Some(parsed); + }; + + if let Some(namespace_or_local_name) = query_or_ns.str() { + parsed.breadcrumbs = Some(vec![match null_or_local_name { + Some(local_name) => (namespace_or_local_name.to_string(), local_name), + None => ("".to_string(), namespace_or_local_name.to_string()), + }]); + return Some(parsed); + } + + let query = query_or_ns.array()?; + + if query.get_index(0).and_then(|value| value.str()).is_some() + && query.get_index(1).and_then(|value| value.str()).is_some() + { + parsed.breadcrumbs = Some(vec![( + query.get_index(0)?.str()?.to_string(), + query.get_index(1)?.str()?.to_string(), + )]); + return Some(parsed); + } + + if let Some(match_offset) = xml_array_positive_i64(query, "match_offset") { + parsed.match_offset = match_offset; + } + + parsed.breadcrumbs = xml_array_namespaced_breadcrumbs(query, "breadcrumbs"); + + Some(parsed) +} + +#[cfg(feature = "php-extension")] +fn xml_array_positive_i64(query: &ZendHashTable, key: &str) -> Option { + query + .get(key) + .and_then(|value| value.long()) + .filter(|value| *value > 0) + .map(|value| value as i64) +} + +#[cfg(feature = "php-extension")] +fn xml_array_namespaced_breadcrumbs( + query: &ZendHashTable, + key: &str, +) -> Option> { + let breadcrumbs = query.get(key)?.array()?; + let mut parsed = Vec::with_capacity(breadcrumbs.len()); + + for value in breadcrumbs.iter().map(|(_, value)| value) { + if let Some(local_name) = value.str() { + if local_name == "*" { + parsed.push(("*".to_string(), "*".to_string())); + } else { + parsed.push(("*".to_string(), local_name.to_string())); + } + continue; + } + + let pair = value.array()?; + parsed.push(( + pair.get_index(0)?.str()?.to_string(), + pair.get_index(1)?.str()?.to_string(), + )); + } + + Some(parsed) +} + +fn xml_namespaced_breadcrumbs_match( + current: &[(String, String)], + breadcrumbs: &[(String, String)], +) -> bool { + if breadcrumbs.is_empty() { + return true; + } + + if breadcrumbs.len() > current.len() { + return false; + } + + let offset = current.len() - breadcrumbs.len(); + for (index, (namespace, local_name)) in breadcrumbs.iter().enumerate() { + let (current_namespace, current_local_name) = ¤t[offset + index]; + + if local_name != "*" && local_name != current_local_name { + return false; + } + + if namespace != "*" && namespace != current_namespace { + return false; + } + } + + true +} + #[cfg(feature = "php-extension")] fn is_incomplete_xml_stream_error(error: &str) -> bool { matches!( diff --git a/extensions/native-apis/tests/verify-native-apis.php b/extensions/native-apis/tests/verify-native-apis.php index 4e022de03..6290b17d9 100644 --- a/extensions/native-apis/tests/verify-native-apis.php +++ b/extensions/native-apis/tests/verify-native-apis.php @@ -60,6 +60,12 @@ class_exists( 'WP_HTML_Doctype_Info' ); assert_same( false, $tag_processor->next_tag(), 'Expected HTML next_tag() to skip closing tags.' ); assert_false( $tag_processor->paused_at_incomplete_token(), 'Expected native HTML tag processor not to pause at an incomplete token after complete input.' ); +$tag_query_processor = new WP_HTML_Native_Tag_Processor( '

' ); +assert_true( $tag_query_processor->next_tag( array( 'tag_name' => 'p', 'class_name' => 'target' ) ), 'Expected native HTML tag processor to honor tag and class next_tag() queries.' ); +assert_same( 'P', $tag_query_processor->get_tag(), 'Expected native HTML tag query to stop on the matching paragraph.' ); +assert_true( $tag_query_processor->next_tag( array( 'tag_name' => 'p', 'tag_closers' => 'visit' ) ), 'Expected native HTML tag processor to honor tag_closers query mode.' ); +assert_true( $tag_query_processor->is_tag_closer(), 'Expected native HTML tag processor query to visit matching closers.' ); + $comment_qualified_name_processor = new WP_HTML_Native_Tag_Processor( '' ); assert_true( $comment_qualified_name_processor->next_token(), 'Expected comment token before qualified name checks.' ); assert_same( null, $comment_qualified_name_processor->get_qualified_tag_name(), 'Expected native HTML qualified tag name to return null on comments.' ); @@ -453,6 +459,10 @@ class_exists( 'WP_HTML_Doctype_Info' ); assert_same( 'HTML', $html_full_parser->get_tag(), 'Expected native HTML full parser to start at the document HTML tag.' ); assert_same( null, WP_HTML_Native_Processor::create_full_parser( '', 'ISO-8859-1' ), 'Expected native HTML full parser factory to reject unsupported encodings.' ); +$html_processor_query = WP_HTML_Native_Processor::create_fragment( '
' ); +assert_true( $html_processor_query->next_tag( array( 'breadcrumbs' => array( 'ARTICLE', 'IMG' ), 'class_name' => 'hero' ) ), 'Expected native HTML processor to honor breadcrumb and class next_tag() queries.' ); +assert_same( 'IMG', $html_processor_query->get_tag(), 'Expected native HTML processor query to stop on the matching image.' ); + $html_step_processor = WP_HTML_Native_Processor::create_fragment( '

Text

' ); assert_true( $html_step_processor->step(), 'Expected native HTML processor step() to advance by default.' ); assert_same( 'SECTION', $html_step_processor->get_tag(), 'Expected native HTML processor default step() to reach the first tag.' ); @@ -1121,6 +1131,12 @@ class_exists( 'WP_HTML_Doctype_Info' ); assert_same( 'root', $xml_streaming->get_token_name(), 'Expected native XML streaming factory root tag name.' ); assert_same( null, $xml_class::create_for_streaming( '', null, 'ISO-8859-1', array() ), 'Expected native XML streaming factory to reject unsupported encodings.' ); +$xml_query = $xml_class::create_from_string( '' ); +assert_true( $xml_query->next_tag( array( 'breadcrumbs' => array( array( 'https://wordpress.org', 'item' ) ) ) ), 'Expected native XML processor to honor namespaced breadcrumb next_tag() queries.' ); +assert_same( 'https://wordpress.org', $xml_query->get_tag_namespace(), 'Expected native XML namespaced query to stop on the namespaced item.' ); +assert_true( $xml_query->next_tag( 'item' ), 'Expected native XML processor to honor local-name next_tag() queries.' ); +assert_same( '', $xml_query->get_tag_namespace(), 'Expected native XML local-name query to stop on the unnamespaced item.' ); + $xml_streaming_incomplete = $xml_class::create_for_streaming( 'next_token(), 'Expected incomplete native XML streaming input to pause token scanning.' ); assert_same( null, $xml_streaming_incomplete->get_last_error(), 'Expected incomplete native XML streaming input not to report a syntax error before input is finished.' ); @@ -1872,11 +1888,15 @@ class_exists( 'WP_HTML_Doctype_Info' ); assert_true( null !== $xml_processing_instruction->get_last_error(), 'Expected XML processing instruction parse error.' ); $url_text_class = 'WordPress\\DataLiberation\\URL\\NativeURLInTextProcessor'; -$url_text_processor = new $url_text_class( 'Visit https://WordPress.org/plugins, then example.com/docs.' ); +$url_text_processor = new $url_text_class( 'Visit https://WordPress.org/plugins, then example.com/docs.', 'https://wordpress.org' ); assert_true( $url_text_processor->next_url(), 'Expected native URL-in-text processor to find the first URL.' ); assert_same( 'https://WordPress.org/plugins', $url_text_processor->get_raw_url(), 'Expected native URL-in-text processor to trim trailing punctuation.' ); assert_same( 6, $url_text_processor->get_url_starts_at(), 'Expected native URL-in-text processor to expose byte offset.' ); assert_true( $url_text_processor->had_protocol(), 'Expected native URL-in-text processor to mark explicit protocols.' ); +$url_text_parsed = $url_text_processor->get_parsed_url(); +assert_true( is_object( $url_text_parsed ), 'Expected native URL-in-text get_parsed_url() to expose the parsed URL object.' ); +assert_same( 'https:', $url_text_parsed->protocol, 'Expected native URL-in-text parsed URL protocol.' ); +assert_same( 'wordpress.org', $url_text_parsed->hostname, 'Expected native URL-in-text parsed URL hostname.' ); assert_true( $url_text_processor->next_url(), 'Expected native URL-in-text processor to find the second URL.' ); assert_same( 'example.com/docs', $url_text_processor->get_raw_url(), 'Expected native URL-in-text processor to find bare-domain URLs.' ); assert_false( $url_text_processor->had_protocol(), 'Expected native URL-in-text processor to mark bare domains.' );