diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index 56533f655..fe7f23f48 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -157,7 +157,7 @@ class LinkDetailsExtractor end def title - html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip + html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || head.at_xpath('title')&.content)&.strip end def description @@ -205,11 +205,11 @@ class LinkDetailsExtractor end def language - valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang')) + valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.root.attr('lang')) end def icon - valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon')) + valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('icon')) end private @@ -237,18 +237,20 @@ class LinkDetailsExtractor end def link_tag(name) - document.xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler).pick('href') + head.at_xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler)&.attr('href') end def opengraph_tag(name) - document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").pick('content') + head.at_xpath("//meta[nokogiri:casecmp(@property, '#{name}') or nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content') end def meta_tag(name) - document.xpath("//meta[@name=\"#{name}\"]").pick('content') + head.at_xpath("//meta[nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content') end def structured_data + return @structured_data if defined?(@structured_data) + # Some publications have more than one JSON-LD definition on the page, # and some of those definitions aren't valid JSON either, so we have # to loop through here until we find something that is the right type @@ -273,6 +275,10 @@ class LinkDetailsExtractor @document ||= detect_encoding_and_parse_document end + def head + @head ||= document.at_xpath('/html/head') + end + def detect_encoding_and_parse_document html = nil encoding = nil diff --git a/app/lib/nokogiri_handler.rb b/app/lib/nokogiri_handler.rb index 804bcb9c0..26cf45795 100644 --- a/app/lib/nokogiri_handler.rb +++ b/app/lib/nokogiri_handler.rb @@ -8,5 +8,9 @@ class NokogiriHandler def link_rel_include(token_list, token) token_list.to_s.downcase.split(WHITE_SPACE).include?(token.downcase) end + + def casecmp(str1, str2) + str1.to_s.casecmp?(str2.to_s) + end end end diff --git a/spec/lib/link_details_extractor_spec.rb b/spec/lib/link_details_extractor_spec.rb index d8d9db0ad..36d6f22b0 100644 --- a/spec/lib/link_details_extractor_spec.rb +++ b/spec/lib/link_details_extractor_spec.rb @@ -49,7 +49,8 @@ RSpec.describe LinkDetailsExtractor do