Browse Source

Store HTML cache only on changes

master
aeris 3 years ago
parent
commit
cbfdbc2f66
3 changed files with 79 additions and 91 deletions
  1. +0
    -41
      app/lib/http.rb
  2. +76
    -29
      app/models/site.rb
  3. +3
    -21
      bin/cli.rb

+ 0
- 41
app/lib/http.rb View File

@@ -44,50 +44,9 @@ class Http
@html ||= Html.new @response.body
end

# strftime/strptime layout of the timestamp embedded in cache file names.
# Fields run from year down to second, so sorting the file names
# alphabetically also orders them chronologically.
DATE_FORMAT = '%Y%m%d_%H%M%S'.freeze

# Builds the file-name prefix under which snapshots of a URL are cached.
#
# @param url [String] address of the page
# @return [String] hexadecimal SHA-256 digest of +url+
def self.prefix(url)
  Digest::SHA256.hexdigest(url)
end

# Directory holding the compressed HTTP snapshots (tmp/cache/http under the
# Rails root). It is created when this file is loaded if it does not exist.
HTTP_CACHE_DIR = File.join(Rails.root, 'tmp', 'cache', 'http')
FileUtils.mkdir_p(HTTP_CACHE_DIR) unless Dir.exist?(HTTP_CACHE_DIR)

# Writes the body of +response+ to HTTP_CACHE_DIR as an xz-compressed
# snapshot named "<sha256 of @url>_<timestamp>.xz".
#
# Nothing is written when the DEBUG_HTTP environment variable is unset, or
# when the body hashes to the same SHA-256 as the most recent snapshot
# already stored for this URL.
#
# @param response [#body] the HTTP response to snapshot
# @return [Integer, nil] result of File.binwrite when a snapshot is written,
#   nil when the call is skipped
def cache(response)
  return unless ENV['DEBUG_HTTP']

  prefix = self.class.prefix @url
  body = response.body

  # Look for earlier snapshots in HTTP_CACHE_DIR, the same directory the new
  # snapshot is written to below. File names embed a sortable timestamp, so
  # the greatest name is the latest snapshot.
  last = Dir[File.join(HTTP_CACHE_DIR, "#{prefix}_*.xz")].max
  if last
    previous = self.class.cache last
    # Compare digests rather than the strings themselves, so only the raw
    # bytes matter.
    old = Digest::SHA256.hexdigest previous
    new = Digest::SHA256.hexdigest body
    return if old == new
  end

  time = Time.now.strftime DATE_FORMAT
  file = File.join HTTP_CACHE_DIR, "#{prefix}_#{time}.xz"
  File.binwrite file, XZ.compress(body, level: 9)
end

# Reads a snapshot file and returns its xz-decompressed content.
#
# @param file [String] path of the compressed snapshot
# @return the value returned by XZ.decompress for the file's bytes
def self.cache(file)
  XZ.decompress(File.binread(file))
end

# Lists the snapshot files stored for a URL.
#
# @param url [String] address of the page
# @return [Array<String>] paths in HTTP_CACHE_DIR matching
#   "<prefix of url>_*.xz", in the order the glob yields them
def self.caches(url)
  pattern = "#{HTTP_CACHE_DIR}/#{prefix(url)}_*.xz"
  Dir.glob(pattern)
end

# Fetches @url with HTTParty (timeout: 10.seconds), hands the response to
# #cache, and returns it.
#
# @return the HTTParty response
# @raise [RuntimeError] "Receive <code>" when the response is not a success
def grab
  HTTParty.get(@url, timeout: 10.seconds).tap do |response|
    raise "Receive #{response.code}" unless response.success?

    cache(response)
  end
end



+ 76
- 29
app/models/site.rb View File

@@ -41,33 +41,27 @@ class Site < ApplicationRecord
self.checked_at = date
state = :unchanged

begin
diffs = self.diff reference, content
unless diffs.empty?
self.reference = content
self.changed_at = self.checked_at
state = :changed

diffs = diffs.collect do |diff|
case diff
when Diffy::Diff
{ diff: diff.dump }
else
target, diff = diff
{
target: target.to_h,
diff: diff.dump
}
end
diffs = self.diff reference, content
unless diffs.empty?
self.reference = content
self.changed_at = self.checked_at
state = :changed

diffs = diffs.collect do |diff|
case diff
when Diffy::Diff
{ diff: diff.dump }
else
target, diff = diff
{
target: target.to_h,
diff: diff.dump
}
end
self.diffs.create! content: diffs, created_at: date
end
self.last_error = nil
rescue => e
$stderr.puts e
self.last_error = e
state = :error
self.diffs.create! content: diffs, created_at: date
end
self.last_error = nil

self.save!
state
@@ -78,11 +72,64 @@ class Site < ApplicationRecord
content = grab.body
self.update! name: grab.title unless self.name

unless self.reference
self.update! reference: content
:reference
else
self.diff! self.reference, content
status = unless self.reference
self.update! reference: content
:reference
else
self.diff! self.reference, content
end
self.store content unless status == :unchanged

status
rescue => e
$stderr.puts e
self.update! checked_at: Time.now, last_error: e
:error
end

# Loads every snapshot stored for this site, in file-name order.
#
# @return [Array<Array>] one [date, content] pair per snapshot file, where
#   date is parsed from the file name with DATE_FORMAT and content is the
#   decompressed file passed through Html.to_s
def caches
  pattern = "#{HTTP_CACHE_DIR}/#{prefix}_*.xz"
  Dir[pattern].sort.map do |path|
    # Everything after the first underscore is the timestamp part.
    # NOTE(review): this string still ends in ".xz"; presumably strptime
    # tolerates the trailing text - confirm.
    stamp = File.basename(path).split('_', 2).last
    [DateTime.strptime(stamp, DATE_FORMAT), Html.to_s(self.class.load(path))]
  end
end

protected

# File-name prefix of this site's snapshots.
#
# @return [String] hexadecimal SHA-256 digest of the site's url
def prefix
  Digest::SHA256.hexdigest(url)
end

# Reads a snapshot file and returns its xz-decompressed content.
#
# @param file [String] path of the compressed snapshot
# @return the value returned by XZ.decompress for the file's bytes
def self.load(file)
  XZ.decompress(File.binread(file))
end

# strftime/strptime layout of the timestamp embedded in snapshot file names.
DATE_FORMAT = '%Y%m%d_%H%M%S'.freeze
# Directory holding the compressed snapshots (tmp/cache/http under the Rails
# root). It is created when this file is loaded if it does not exist.
HTTP_CACHE_DIR = File.join(Rails.root, 'tmp', 'cache', 'http')
FileUtils.mkdir_p(HTTP_CACHE_DIR) unless Dir.exist?(HTTP_CACHE_DIR)

# Saves +content+ as an xz-compressed snapshot named
# "<prefix>_<timestamp>.xz" inside HTTP_CACHE_DIR.
#
# Does nothing unless the DEBUG_HTTP environment variable is set. The
# content is not compared with the previous snapshot here; a digest-based
# duplicate check used to sit in this method but is disabled.
#
# @param content [String] page content to archive
# @return [String, nil] path of the written file, or nil when disabled
def store(content)
  return unless ENV['DEBUG_HTTP']

  digest = prefix
  stamp = Time.now.strftime(DATE_FORMAT)
  path = File.join(HTTP_CACHE_DIR, "#{digest}_#{stamp}.xz")
  File.binwrite(path, XZ.compress(content, level: 9))
  path
end
end

+ 3
- 21
bin/cli.rb View File

@@ -70,18 +70,14 @@ class App < Thor

site.diffs.delete_all
reference = nil
Http.caches(site.url).sort.each do |file|
name = File.basename file
date = name.split('_', 2).last
date = DateTime.strptime date, Http::DATE_FORMAT
content = Html.to_s Http.cache file

site.caches.each do |date, content|
status = unless reference
site.update! reference: content
:reference
else
site.diff! reference, content, date: date
end
# FileUtils.rm_f file if status == :unchanged
results[status] += 1
color = COLORS[status]
puts " #{date}: #{status.to_s.colorize color}"
@@ -96,20 +92,6 @@ class App < Thor
end
end

# desc 'redo <url> <date1> <date2>', 'Redo check from cache'
#
# def redo(url, date1 = nil, date2 = nil)
# site = Site.where(url: url).first
# fp = Digest::SHA256.hexdigest url
# dir = File.join Rails.root, 'tmp/cache/http'
# reference = File.join dir, "#{fp}_#{date1}"
# reference = File.read reference
# content = File.join dir, "#{fp}_#{date2}"
# content = File.read content
#
# ap site.changed? reference, content, debug: true
# end

protected

def sites(url)
@@ -119,7 +101,7 @@ class App < Thor

def process(urls)
sites = self.sites urls
Parallel.each sites, in_threads: 1 do |site|
Parallel.each sites, in_threads: 16 do |site|
ActiveRecord::Base.transaction do
url = site.url.colorize :yellow
begin


Loading…
Cancel
Save