Store HTML cache only on changes

master
aeris 5 years ago
parent 14d983ec93
commit cbfdbc2f66
  1. 41
      app/lib/http.rb
  2. 105
      app/models/site.rb
  3. 24
      bin/cli.rb

@ -44,50 +44,9 @@ class Http
@html ||= Html.new @response.body
end
# Timestamp pattern embedded in cache file names: written with strftime when
# a snapshot is stored and parsed back with strptime when snapshots are listed.
DATE_FORMAT = '%Y%m%d_%H%M%S'.freeze
# Cache-file prefix for a URL.
#
# @param url [String] the URL whose snapshots are being addressed
# @return [String] hexadecimal SHA-256 digest of +url+
def self.prefix(url)
  Digest::SHA256.hexdigest(url)
end
# Directory holding the xz-compressed HTTP snapshots (tmp/cache/http under
# the Rails root).
HTTP_CACHE_DIR = File.join(Rails.root, 'tmp', 'cache', 'http')
# mkdir_p does nothing when the directory already exists, so no guard is needed.
FileUtils.mkdir_p(HTTP_CACHE_DIR)
# Stores the body of +response+ as an xz-compressed snapshot in
# HTTP_CACHE_DIR, named "<prefix>_<timestamp>.xz".
#
# Does nothing unless the DEBUG_HTTP environment variable is set, and skips
# the write when the newest snapshot for this URL already has the same body.
#
# @param response [#body] the HTTP response to snapshot
# @return [Integer, nil] number of bytes written, or nil when nothing was stored
def cache(response)
  return unless ENV['DEBUG_HTTP']

  prefix = self.class.prefix @url
  body = response.body

  # File names embed a fixed-width timestamp, so the lexicographic maximum
  # is the latest snapshot.
  latest = Dir[File.join(HTTP_CACHE_DIR, "#{prefix}_*.xz")].max
  if latest
    previous = self.class.cache latest
    # Compare digests so the check works on raw bytes regardless of the
    # string encodings of the two bodies.
    return if Digest::SHA256.hexdigest(previous) == Digest::SHA256.hexdigest(body)
  end

  time = Time.now.strftime DATE_FORMAT
  file = File.join HTTP_CACHE_DIR, "#{prefix}_#{time}.xz"
  File.binwrite file, XZ.compress(body, level: 9)
end
# Reads a stored snapshot back from disk.
#
# @param file [String] path of an xz-compressed snapshot
# @return [String] the decompressed content
def self.cache(file)
  XZ.decompress(File.binread(file))
end
# Lists every snapshot file stored for a URL.
#
# @param url [String] the URL whose snapshots are wanted
# @return [Array<String>] paths matching "<prefix>_*.xz" in HTTP_CACHE_DIR,
#   in the order Dir[] yields them (callers sort when order matters)
def self.caches(url)
  pattern = File.join(HTTP_CACHE_DIR, "#{prefix(url)}_*.xz")
  Dir[pattern]
end
# Fetches @url with a 10 second timeout, hands the response to #cache and
# returns it.
#
# @return [Object] the HTTParty response
# @raise [RuntimeError] "Receive <code>" when the response is not successful
def grab
  HTTParty.get(@url, timeout: 10.seconds).tap do |reply|
    raise "Receive #{reply.code}" unless reply.success?

    cache(reply)
  end
end

@ -41,33 +41,27 @@ class Site < ApplicationRecord
self.checked_at = date
state = :unchanged
begin
diffs = self.diff reference, content
unless diffs.empty?
self.reference = content
self.changed_at = self.checked_at
state = :changed
diffs = diffs.collect do |diff|
case diff
when Diffy::Diff
{ diff: diff.dump }
else
target, diff = diff
{
target: target.to_h,
diff: diff.dump
}
end
diffs = self.diff reference, content
unless diffs.empty?
self.reference = content
self.changed_at = self.checked_at
state = :changed
diffs = diffs.collect do |diff|
case diff
when Diffy::Diff
{ diff: diff.dump }
else
target, diff = diff
{
target: target.to_h,
diff: diff.dump
}
end
self.diffs.create! content: diffs, created_at: date
end
self.last_error = nil
rescue => e
$stderr.puts e
self.last_error = e
state = :error
self.diffs.create! content: diffs, created_at: date
end
self.last_error = nil
self.save!
state
@ -78,11 +72,64 @@ class Site < ApplicationRecord
content = grab.body
self.update! name: grab.title unless self.name
unless self.reference
self.update! reference: content
:reference
else
self.diff! self.reference, content
status = unless self.reference
self.update! reference: content
:reference
else
self.diff! self.reference, content
end
self.store content unless status == :unchanged
status
rescue => e
$stderr.puts e
self.update! checked_at: Time.now, last_error: e
:error
end
# Loads every stored snapshot of this site, ordered by file name (and thus
# by the timestamp embedded in it).
#
# @return [Array<Array>] one [date, content] pair per snapshot, where date is
#   parsed from the file name with DATE_FORMAT and content is the decompressed
#   snapshot passed through Html.to_s
def caches
  pattern = "#{HTTP_CACHE_DIR}/#{prefix}_*.xz"
  Dir[pattern].sort.map do |path|
    # Everything after the first underscore is the timestamp part.
    stamp = File.basename(path).split('_', 2).last
    taken_at = DateTime.strptime(stamp, DATE_FORMAT)
    [taken_at, Html.to_s(self.class.load(path))]
  end
end
protected
# Cache-file prefix of this site.
#
# @return [String] hexadecimal SHA-256 digest of the site's URL
def prefix
  Digest::SHA256.hexdigest(url)
end
# Reads a stored snapshot back from disk.
#
# @param file [String] path of an xz-compressed snapshot
# @return [String] the decompressed content
def self.load(file)
  XZ.decompress(File.binread(file))
end
# Timestamp pattern embedded in snapshot file names.
DATE_FORMAT = '%Y%m%d_%H%M%S'.freeze
# Directory holding the xz-compressed snapshots (tmp/cache/http under the
# Rails root).
HTTP_CACHE_DIR = File.join(Rails.root, 'tmp', 'cache', 'http')
# mkdir_p does nothing when the directory already exists, so no guard is needed.
FileUtils.mkdir_p(HTTP_CACHE_DIR)
# Writes +content+ as a new xz-compressed snapshot named
# "<prefix>_<timestamp>.xz" inside HTTP_CACHE_DIR.
#
# Does nothing unless the DEBUG_HTTP environment variable is set. No
# comparison with the previous snapshot is made here: every call that gets
# past the guard writes a file.
#
# @param content [String] the page content to snapshot
# @return [String, nil] path of the written file, or nil when disabled
def store(content)
  return unless ENV['DEBUG_HTTP']

  stamp = Time.now.strftime(DATE_FORMAT)
  path = File.join(HTTP_CACHE_DIR, "#{prefix}_#{stamp}.xz")
  File.binwrite(path, XZ.compress(content, level: 9))
  path
end
end

@ -70,18 +70,14 @@ class App < Thor
site.diffs.delete_all
reference = nil
Http.caches(site.url).sort.each do |file|
name = File.basename file
date = name.split('_', 2).last
date = DateTime.strptime date, Http::DATE_FORMAT
content = Html.to_s Http.cache file
site.caches.each do |date, content|
status = unless reference
site.update! reference: content
:reference
else
site.diff! reference, content, date: date
end
# FileUtils.rm_f file if status == :unchanged
results[status] += 1
color = COLORS[status]
puts " #{date}: #{status.to_s.colorize color}"
@ -96,20 +92,6 @@ class App < Thor
end
end
# desc 'redo <url> <date1> <date2>', 'Redo check from cache'
#
# def redo(url, date1 = nil, date2 = nil)
# site = Site.where(url: url).first
# fp = Digest::SHA256.hexdigest url
# dir = File.join Rails.root, 'tmp/cache/http'
# reference = File.join dir, "#{fp}_#{date1}"
# reference = File.read reference
# content = File.join dir, "#{fp}_#{date2}"
# content = File.read content
#
# ap site.changed? reference, content, debug: true
# end
protected
def sites(url)
@ -119,7 +101,7 @@ class App < Thor
def process(urls)
sites = self.sites urls
Parallel.each sites, in_threads: 1 do |site|
Parallel.each sites, in_threads: 16 do |site|
ActiveRecord::Base.transaction do
url = site.url.colorize :yellow
begin

Loading…
Cancel
Save