LinkDetailsExtractor adjustments (#31357)
This commit is contained in:
		
							parent
							
								
									68c7782940
								
							
						
					
					
						commit
						0518613dd7
					
				| 
						 | 
				
			
			@ -157,7 +157,7 @@ class LinkDetailsExtractor
 | 
			
		|||
  end
 | 
			
		||||
 | 
			
		||||
  def title
 | 
			
		||||
    html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip
 | 
			
		||||
    html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || head.at_xpath('title')&.content)&.strip
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def description
 | 
			
		||||
| 
						 | 
				
			
			@ -205,11 +205,11 @@ class LinkDetailsExtractor
 | 
			
		|||
  end
 | 
			
		||||
 | 
			
		||||
  def language
 | 
			
		||||
    valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang'))
 | 
			
		||||
    valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.root.attr('lang'))
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def icon
 | 
			
		||||
    valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon'))
 | 
			
		||||
    valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('icon'))
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  private
 | 
			
		||||
| 
						 | 
				
			
			@ -237,18 +237,20 @@ class LinkDetailsExtractor
 | 
			
		|||
  end
 | 
			
		||||
 | 
			
		||||
  def link_tag(name)
 | 
			
		||||
    document.xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler).pick('href')
 | 
			
		||||
    head.at_xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler)&.attr('href')
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def opengraph_tag(name)
 | 
			
		||||
    document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").pick('content')
 | 
			
		||||
    head.at_xpath("//meta[nokogiri:casecmp(@property, '#{name}') or nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content')
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def meta_tag(name)
 | 
			
		||||
    document.xpath("//meta[@name=\"#{name}\"]").pick('content')
 | 
			
		||||
    head.at_xpath("//meta[nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content')
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def structured_data
 | 
			
		||||
    return @structured_data if defined?(@structured_data)
 | 
			
		||||
 | 
			
		||||
    # Some publications have more than one JSON-LD definition on the page,
 | 
			
		||||
    # and some of those definitions aren't valid JSON either, so we have
 | 
			
		||||
    # to loop through here until we find something that is the right type
 | 
			
		||||
| 
						 | 
				
			
			@ -273,6 +275,10 @@ class LinkDetailsExtractor
 | 
			
		|||
    @document ||= detect_encoding_and_parse_document
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def head
 | 
			
		||||
    @head ||= document.at_xpath('/html/head')
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def detect_encoding_and_parse_document
 | 
			
		||||
    html = nil
 | 
			
		||||
    encoding = nil
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,5 +8,9 @@ class NokogiriHandler
 | 
			
		|||
    def link_rel_include(token_list, token)
 | 
			
		||||
      token_list.to_s.downcase.split(WHITE_SPACE).include?(token.downcase)
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def casecmp(str1, str2)
 | 
			
		||||
      str1.to_s.casecmp?(str2.to_s)
 | 
			
		||||
    end
 | 
			
		||||
  end
 | 
			
		||||
end
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -49,7 +49,8 @@ RSpec.describe LinkDetailsExtractor do
 | 
			
		|||
      <html lang="en">
 | 
			
		||||
      <head>
 | 
			
		||||
        <title>Man bites dog</title>
 | 
			
		||||
        <meta name="description" content="A dog's tale">
 | 
			
		||||
        <meta name="descripTION" content="A dog's tale">
 | 
			
		||||
        <link rel="pretty IcoN" href="/favicon.ico">
 | 
			
		||||
      </head>
 | 
			
		||||
      </html>
 | 
			
		||||
    HTML
 | 
			
		||||
| 
						 | 
				
			
			@ -59,7 +60,8 @@ RSpec.describe LinkDetailsExtractor do
 | 
			
		|||
        .to have_attributes(
 | 
			
		||||
          title: eq('Man bites dog'),
 | 
			
		||||
          description: eq("A dog's tale"),
 | 
			
		||||
          language: eq('en')
 | 
			
		||||
          language: eq('en'),
 | 
			
		||||
          icon: eq('https://example.com/favicon.ico')
 | 
			
		||||
        )
 | 
			
		||||
    end
 | 
			
		||||
  end
 | 
			
		||||
| 
						 | 
				
			
			@ -256,7 +258,7 @@ RSpec.describe LinkDetailsExtractor do
 | 
			
		|||
      <head>
 | 
			
		||||
        <meta property="og:url" content="https://example.com/dog.html">
 | 
			
		||||
        <meta property="og:title" content="Man bites dog">
 | 
			
		||||
        <meta property="og:description" content="A dog's tale">
 | 
			
		||||
        <meta property="OG:description" content="A dog's tale">
 | 
			
		||||
        <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
 | 
			
		||||
        <meta property="og:author" content="Charlie Brown">
 | 
			
		||||
        <meta property="og:locale" content="en">
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue