LinkDetailsExtractor adjustments (#31357)
This commit is contained in:
		
							parent
							
								
									68c7782940
								
							
						
					
					
						commit
						0518613dd7
					
				| 
						 | 
					@ -157,7 +157,7 @@ class LinkDetailsExtractor
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def title
 | 
					  def title
 | 
				
			||||||
    html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip
 | 
					    html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || head.at_xpath('title')&.content)&.strip
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def description
 | 
					  def description
 | 
				
			||||||
| 
						 | 
					@ -205,11 +205,11 @@ class LinkDetailsExtractor
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def language
 | 
					  def language
 | 
				
			||||||
    valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang'))
 | 
					    valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.root.attr('lang'))
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def icon
 | 
					  def icon
 | 
				
			||||||
    valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon'))
 | 
					    valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('icon'))
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  private
 | 
					  private
 | 
				
			||||||
| 
						 | 
					@ -237,18 +237,20 @@ class LinkDetailsExtractor
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def link_tag(name)
 | 
					  def link_tag(name)
 | 
				
			||||||
    document.xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler).pick('href')
 | 
					    head.at_xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler)&.attr('href')
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def opengraph_tag(name)
 | 
					  def opengraph_tag(name)
 | 
				
			||||||
    document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").pick('content')
 | 
					    head.at_xpath("//meta[nokogiri:casecmp(@property, '#{name}') or nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content')
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def meta_tag(name)
 | 
					  def meta_tag(name)
 | 
				
			||||||
    document.xpath("//meta[@name=\"#{name}\"]").pick('content')
 | 
					    head.at_xpath("//meta[nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content')
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def structured_data
 | 
					  def structured_data
 | 
				
			||||||
 | 
					    return @structured_data if defined?(@structured_data)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Some publications have more than one JSON-LD definition on the page,
 | 
					    # Some publications have more than one JSON-LD definition on the page,
 | 
				
			||||||
    # and some of those definitions aren't valid JSON either, so we have
 | 
					    # and some of those definitions aren't valid JSON either, so we have
 | 
				
			||||||
    # to loop through here until we find something that is the right type
 | 
					    # to loop through here until we find something that is the right type
 | 
				
			||||||
| 
						 | 
					@ -273,6 +275,10 @@ class LinkDetailsExtractor
 | 
				
			||||||
    @document ||= detect_encoding_and_parse_document
 | 
					    @document ||= detect_encoding_and_parse_document
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  def head
 | 
				
			||||||
 | 
					    @head ||= document.at_xpath('/html/head')
 | 
				
			||||||
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def detect_encoding_and_parse_document
 | 
					  def detect_encoding_and_parse_document
 | 
				
			||||||
    html = nil
 | 
					    html = nil
 | 
				
			||||||
    encoding = nil
 | 
					    encoding = nil
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -8,5 +8,9 @@ class NokogiriHandler
 | 
				
			||||||
    def link_rel_include(token_list, token)
 | 
					    def link_rel_include(token_list, token)
 | 
				
			||||||
      token_list.to_s.downcase.split(WHITE_SPACE).include?(token.downcase)
 | 
					      token_list.to_s.downcase.split(WHITE_SPACE).include?(token.downcase)
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def casecmp(str1, str2)
 | 
				
			||||||
 | 
					      str1.to_s.casecmp?(str2.to_s)
 | 
				
			||||||
 | 
					    end
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
end
 | 
					end
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -49,7 +49,8 @@ RSpec.describe LinkDetailsExtractor do
 | 
				
			||||||
      <html lang="en">
 | 
					      <html lang="en">
 | 
				
			||||||
      <head>
 | 
					      <head>
 | 
				
			||||||
        <title>Man bites dog</title>
 | 
					        <title>Man bites dog</title>
 | 
				
			||||||
        <meta name="description" content="A dog's tale">
 | 
					        <meta name="descripTION" content="A dog's tale">
 | 
				
			||||||
 | 
					        <link rel="pretty IcoN" href="/favicon.ico">
 | 
				
			||||||
      </head>
 | 
					      </head>
 | 
				
			||||||
      </html>
 | 
					      </html>
 | 
				
			||||||
    HTML
 | 
					    HTML
 | 
				
			||||||
| 
						 | 
					@ -59,7 +60,8 @@ RSpec.describe LinkDetailsExtractor do
 | 
				
			||||||
        .to have_attributes(
 | 
					        .to have_attributes(
 | 
				
			||||||
          title: eq('Man bites dog'),
 | 
					          title: eq('Man bites dog'),
 | 
				
			||||||
          description: eq("A dog's tale"),
 | 
					          description: eq("A dog's tale"),
 | 
				
			||||||
          language: eq('en')
 | 
					          language: eq('en'),
 | 
				
			||||||
 | 
					          icon: eq('https://example.com/favicon.ico')
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
| 
						 | 
					@ -256,7 +258,7 @@ RSpec.describe LinkDetailsExtractor do
 | 
				
			||||||
      <head>
 | 
					      <head>
 | 
				
			||||||
        <meta property="og:url" content="https://example.com/dog.html">
 | 
					        <meta property="og:url" content="https://example.com/dog.html">
 | 
				
			||||||
        <meta property="og:title" content="Man bites dog">
 | 
					        <meta property="og:title" content="Man bites dog">
 | 
				
			||||||
        <meta property="og:description" content="A dog's tale">
 | 
					        <meta property="OG:description" content="A dog's tale">
 | 
				
			||||||
        <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
 | 
					        <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
 | 
				
			||||||
        <meta property="og:author" content="Charlie Brown">
 | 
					        <meta property="og:author" content="Charlie Brown">
 | 
				
			||||||
        <meta property="og:locale" content="en">
 | 
					        <meta property="og:locale" content="en">
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue