Change spam check to apply to local accounts and add a threshold (#11806)
Instead of flagging a status as spam on the first duplicate message, require a threshold of 5 such messages before flagging, to reduce false positives.
This commit is contained in:
		
							parent
							
								
									577706987d
								
							
						
					
					
						commit
						4f6af87906
					
				| 
						 | 
					@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def check_for_spam
 | 
					  def check_for_spam
 | 
				
			||||||
    spam_check = SpamCheck.new(@status)
 | 
					    SpamCheck.perform(@status)
 | 
				
			||||||
 | 
					 | 
				
			||||||
    return if spam_check.skip?
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if spam_check.spam?
 | 
					 | 
				
			||||||
      spam_check.flag!
 | 
					 | 
				
			||||||
    else
 | 
					 | 
				
			||||||
      spam_check.remember!
 | 
					 | 
				
			||||||
    end
 | 
					 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def forward_for_reply
 | 
					  def forward_for_reply
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,9 +4,25 @@ class SpamCheck
 | 
				
			||||||
  include Redisable
 | 
					  include Redisable
 | 
				
			||||||
  include ActionView::Helpers::TextHelper
 | 
					  include ActionView::Helpers::TextHelper
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  # Threshold over which two Nilsimsa values are considered
 | 
				
			||||||
 | 
					  # to refer to the same text
 | 
				
			||||||
  NILSIMSA_COMPARE_THRESHOLD = 95
 | 
					  NILSIMSA_COMPARE_THRESHOLD = 95
 | 
				
			||||||
  NILSIMSA_MIN_SIZE          = 10
 | 
					
 | 
				
			||||||
  EXPIRE_SET_AFTER           = 1.week.seconds
 | 
					  # Nilsimsa doesn't work well on small inputs, so below
 | 
				
			||||||
 | 
					  # this size, we check only for exact matches with MD5
 | 
				
			||||||
 | 
					  NILSIMSA_MIN_SIZE = 10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  # How long to keep the trail of digests between updates,
 | 
				
			||||||
 | 
					  # since there is no reason to store it forever
 | 
				
			||||||
 | 
					  EXPIRE_SET_AFTER = 1.week.seconds
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  # How many digests to keep in an account's trail. If it's
 | 
				
			||||||
 | 
					  # too small, spam could rotate around different message templates
 | 
				
			||||||
 | 
					  MAX_TRAIL_SIZE = 10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  # How many detected duplicates to allow through before
 | 
				
			||||||
 | 
					  # considering the message as spam
 | 
				
			||||||
 | 
					  THRESHOLD = 5
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def initialize(status)
 | 
					  def initialize(status)
 | 
				
			||||||
    @account = status.account
 | 
					    @account = status.account
 | 
				
			||||||
| 
						 | 
					@ -21,9 +37,9 @@ class SpamCheck
 | 
				
			||||||
    if insufficient_data?
 | 
					    if insufficient_data?
 | 
				
			||||||
      false
 | 
					      false
 | 
				
			||||||
    elsif nilsimsa?
 | 
					    elsif nilsimsa?
 | 
				
			||||||
      any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
 | 
					      digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
 | 
				
			||||||
    else
 | 
					    else
 | 
				
			||||||
      any_other_digest?('md5') { |_, other_digest| other_digest == digest }
 | 
					      digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -38,7 +54,7 @@ class SpamCheck
 | 
				
			||||||
    # get the correct status ID back, we have to save it in the string value
 | 
					    # get the correct status ID back, we have to save it in the string value
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    redis.zadd(redis_key, @status.id, digest_with_algorithm)
 | 
					    redis.zadd(redis_key, @status.id, digest_with_algorithm)
 | 
				
			||||||
    redis.zremrangebyrank(redis_key, '0', '-10')
 | 
					    redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
 | 
				
			||||||
    redis.expire(redis_key, EXPIRE_SET_AFTER)
 | 
					    redis.expire(redis_key, EXPIRE_SET_AFTER)
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -78,6 +94,20 @@ class SpamCheck
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  class << self
 | 
				
			||||||
 | 
					    def perform(status)
 | 
				
			||||||
 | 
					      spam_check = new(status)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      return if spam_check.skip?
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      if spam_check.spam?
 | 
				
			||||||
 | 
					        spam_check.flag!
 | 
				
			||||||
 | 
					      else
 | 
				
			||||||
 | 
					        spam_check.remember!
 | 
				
			||||||
 | 
					      end
 | 
				
			||||||
 | 
					    end
 | 
				
			||||||
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  private
 | 
					  private
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def disabled?
 | 
					  def disabled?
 | 
				
			||||||
| 
						 | 
					@ -149,14 +179,14 @@ class SpamCheck
 | 
				
			||||||
    redis.zrange(redis_key, 0, -1)
 | 
					    redis.zrange(redis_key, 0, -1)
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def any_other_digest?(filter_algorithm)
 | 
					  def digests_over_threshold?(filter_algorithm)
 | 
				
			||||||
    other_digests.any? do |record|
 | 
					    other_digests.select do |record|
 | 
				
			||||||
      algorithm, other_digest, status_id = record.split(':')
 | 
					      algorithm, other_digest, status_id = record.split(':')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      next unless algorithm == filter_algorithm
 | 
					      next unless algorithm == filter_algorithm
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      yield algorithm, other_digest, status_id
 | 
					      yield algorithm, other_digest, status_id
 | 
				
			||||||
    end
 | 
					    end.size >= THRESHOLD
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def matching_status_ids
 | 
					  def matching_status_ids
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -33,6 +33,7 @@ class ProcessMentionsService < BaseService
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    status.save!
 | 
					    status.save!
 | 
				
			||||||
 | 
					    check_for_spam(status)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    mentions.each { |mention| create_notification(mention) }
 | 
					    mentions.each { |mention| create_notification(mention) }
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
| 
						 | 
					@ -61,4 +62,8 @@ class ProcessMentionsService < BaseService
 | 
				
			||||||
  def resolve_account_service
 | 
					  def resolve_account_service
 | 
				
			||||||
    ResolveAccountService.new
 | 
					    ResolveAccountService.new
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  def check_for_spam(status)
 | 
				
			||||||
 | 
					    SpamCheck.perform(status)
 | 
				
			||||||
 | 
					  end
 | 
				
			||||||
end
 | 
					end
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -86,23 +86,33 @@ RSpec.describe SpamCheck do
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    it 'returns true for duplicate statuses to the same recipient' do
 | 
					    it 'returns true for duplicate statuses to the same recipient' do
 | 
				
			||||||
      status1 = status_with_html('@alice Hello')
 | 
					      described_class::THRESHOLD.times do
 | 
				
			||||||
      described_class.new(status1).remember!
 | 
					        status1 = status_with_html('@alice Hello')
 | 
				
			||||||
 | 
					        described_class.new(status1).remember!
 | 
				
			||||||
 | 
					      end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      status2 = status_with_html('@alice Hello')
 | 
					      status2 = status_with_html('@alice Hello')
 | 
				
			||||||
      expect(described_class.new(status2).spam?).to be true
 | 
					      expect(described_class.new(status2).spam?).to be true
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    it 'returns true for duplicate statuses to different recipients' do
 | 
					    it 'returns true for duplicate statuses to different recipients' do
 | 
				
			||||||
      status1 = status_with_html('@alice Hello')
 | 
					      described_class::THRESHOLD.times do
 | 
				
			||||||
      described_class.new(status1).remember!
 | 
					        status1 = status_with_html('@alice Hello')
 | 
				
			||||||
 | 
					        described_class.new(status1).remember!
 | 
				
			||||||
 | 
					      end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      status2 = status_with_html('@bob Hello')
 | 
					      status2 = status_with_html('@bob Hello')
 | 
				
			||||||
      expect(described_class.new(status2).spam?).to be true
 | 
					      expect(described_class.new(status2).spam?).to be true
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    it 'returns true for nearly identical statuses with random numbers' do
 | 
					    it 'returns true for nearly identical statuses with random numbers' do
 | 
				
			||||||
      source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
 | 
					      source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'
 | 
				
			||||||
      status1 = status_with_html('@alice ' + source_text + ' 1234')
 | 
					
 | 
				
			||||||
      described_class.new(status1).remember!
 | 
					      described_class::THRESHOLD.times do
 | 
				
			||||||
 | 
					        status1 = status_with_html('@alice ' + source_text + ' 1234')
 | 
				
			||||||
 | 
					        described_class.new(status1).remember!
 | 
				
			||||||
 | 
					      end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
      status2 = status_with_html('@bob ' + source_text + ' 9568')
 | 
					      status2 = status_with_html('@bob ' + source_text + ' 9568')
 | 
				
			||||||
      expect(described_class.new(status2).spam?).to be true
 | 
					      expect(described_class.new(status2).spam?).to be true
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
| 
						 | 
					@ -140,9 +150,9 @@ RSpec.describe SpamCheck do
 | 
				
			||||||
    let(:redis_key) { spam_check.send(:redis_key) }
 | 
					    let(:redis_key) { spam_check.send(:redis_key) }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    it 'remembers' do
 | 
					    it 'remembers' do
 | 
				
			||||||
      expect do
 | 
					      expect(Redis.current.exists(redis_key)).to be true
 | 
				
			||||||
        spam_check.remember!
 | 
					      spam_check.remember!
 | 
				
			||||||
      end.to change { Redis.current.exists(redis_key) }.from(false).to(true)
 | 
					      expect(Redis.current.exists(redis_key)).to be true
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -156,9 +166,9 @@ RSpec.describe SpamCheck do
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    it 'resets' do
 | 
					    it 'resets' do
 | 
				
			||||||
      expect do
 | 
					      expect(Redis.current.exists(redis_key)).to be true
 | 
				
			||||||
        spam_check.reset!
 | 
					      spam_check.reset!
 | 
				
			||||||
      end.to change { Redis.current.exists(redis_key) }.from(true).to(false)
 | 
					      expect(Redis.current.exists(redis_key)).to be false
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue