Handle HTML encoding
parent
39bef45c92
commit
99b3f29b41
1
Gemfile
1
Gemfile
|
@ -19,6 +19,7 @@ group :development, :test do
|
|||
gem 'pry-byebug'
|
||||
gem 'timecop'
|
||||
gem 'rspec-rails'
|
||||
gem 'webmock'
|
||||
end
|
||||
|
||||
group :development do
|
||||
|
|
14
Gemfile.lock
14
Gemfile.lock
|
@ -38,6 +38,8 @@ GEM
|
|||
i18n (>= 0.7, < 2)
|
||||
minitest (~> 5.1)
|
||||
tzinfo (~> 1.1)
|
||||
addressable (2.5.2)
|
||||
public_suffix (>= 2.0.2, < 4.0)
|
||||
arel (8.0.0)
|
||||
awesome_print (1.8.0)
|
||||
better_errors (2.4.0)
|
||||
|
@ -51,6 +53,8 @@ GEM
|
|||
coderay (1.1.2)
|
||||
colorize (0.8.1)
|
||||
concurrent-ruby (1.0.5)
|
||||
crack (0.4.3)
|
||||
safe_yaml (~> 1.0.0)
|
||||
crass (1.0.4)
|
||||
debug_inspector (0.0.3)
|
||||
diff-lcs (1.3)
|
||||
|
@ -59,6 +63,7 @@ GEM
|
|||
ffi (1.9.23)
|
||||
globalid (0.4.1)
|
||||
activesupport (>= 4.2.0)
|
||||
hashdiff (0.3.7)
|
||||
httparty (0.16.2)
|
||||
multi_xml (>= 0.5.2)
|
||||
i18n (1.0.1)
|
||||
|
@ -90,6 +95,7 @@ GEM
|
|||
pry (~> 0.10)
|
||||
pry-rails (0.3.6)
|
||||
pry (>= 0.10.4)
|
||||
public_suffix (3.0.3)
|
||||
puma (3.11.4)
|
||||
rack (2.0.5)
|
||||
rack-test (1.0.0)
|
||||
|
@ -139,6 +145,7 @@ GEM
|
|||
rspec-support (~> 3.7.0)
|
||||
rspec-support (3.7.1)
|
||||
ruby_dep (1.5.0)
|
||||
safe_yaml (1.0.4)
|
||||
spring (2.0.2)
|
||||
activesupport (>= 4.2)
|
||||
spring-watcher-listen (2.0.1)
|
||||
|
@ -157,6 +164,10 @@ GEM
|
|||
timecop (0.9.1)
|
||||
tzinfo (1.2.5)
|
||||
thread_safe (~> 0.1)
|
||||
webmock (3.4.2)
|
||||
addressable (>= 2.3.6)
|
||||
crack (>= 0.3.2)
|
||||
hashdiff
|
||||
websocket-driver (0.6.5)
|
||||
websocket-extensions (>= 0.1.0)
|
||||
websocket-extensions (0.1.3)
|
||||
|
@ -186,6 +197,7 @@ DEPENDENCIES
|
|||
thor
|
||||
timecop
|
||||
tzinfo-data
|
||||
webmock
|
||||
|
||||
BUNDLED WITH
|
||||
1.16.1
|
||||
1.16.2
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
# Files watched by the :rails guard: the dependency lockfile plus anything
# under config/ or lib/.
guard :rails do
  watch 'Gemfile.lock'
  watch %r{^(config|lib)/.*}
end

# Files watched by the :livereload guard: views, helpers, static public
# assets, locale files and asset-pipeline sources.
guard :livereload do
  watch %r{app/views/.+\.(erb|haml|slim)$}
  watch %r{app/helpers/.+\.rb}
  watch %r{public/.+\.(css|js|html)}
  watch %r{config/locales/.+\.yml}
  # Rails Assets Pipeline
  # The third capture group is the asset path below its type directory; it is
  # mapped to the matching /assets/ path.
  watch(%r{(app|vendor)(/assets/\w+/(.+\.(css|js|html|png|jpg|coffee|scss))).*}) do |match|
    "/assets/#{match[3]}"
  end
end
|
|
@ -0,0 +1,71 @@
|
|||
# Fetches a URL with HTTParty and exposes the response, with the body
# normalised to UTF-8.
class Http
  # Extracts the charset parameter from a `text/html; charset=...` value.
  # The capture stops at whitespace, `;` or a quote so that trailing
  # parameters or quoting do not end up in the charset name.
  CONTENT_TYPE_CHARSET = /text\/html;\s*charset=["']?([^\s;"']+)/i

  # Issues the request immediately.
  #
  # @param url [String] address to fetch
  # @raise [RuntimeError] when the response is not a success (see #grab)
  def initialize(url)
    @url = url
    @response = grab
  end

  # @return the status code reported by the response
  def code
    @response.code
  end

  # @return whether the response reports a success
  def success?
    @response.success?
  end

  # @return [Boolean] true when the response content type is exactly text/html
  def html?
    @response.content_type == 'text/html'
  end

  # @return [String, nil] text of the `head title` element, or nil when the
  #   response is not HTML or has no title
  def title
    parse&.at('head title')&.text
  end

  # Looks for a charset declared inside the HTML document itself, first in
  # `<meta charset>` and then in `<meta http-equiv="Content-Type">`.
  #
  # @return [String, nil] the declared charset name, or nil when the response
  #   is not HTML or declares none
  def charset
    html = parse
    return nil unless html

    # Content-Type charset seems already processed by HTTParty
    meta = html.at 'head meta[charset]'
    return meta['charset'] if meta

    meta = html.at 'head meta[http-equiv="Content-Type"]'
    return nil unless meta

    found = CONTENT_TYPE_CHARSET.match meta['content'].to_s
    found && found[1]
  end

  # Returns the response body as UTF-8.
  #
  # Works on a copy so the string held by the response is never relabelled
  # or transcoded in place.
  #
  # @return [String] the body, encoded as UTF-8
  def body
    text = @response.body.dup
    declared = encoding_for charset

    if declared
      text.force_encoding declared
    elsif text.encoding == Encoding::ASCII_8BIT
      # Raw bytes with no usable charset cannot be transcoded (any byte
      # >= 0x80 would raise), so they are assumed to be UTF-8 already.
      text.force_encoding Encoding::UTF_8
    end

    text.encoding == Encoding::UTF_8 ? text : text.encode(Encoding::UTF_8)
  end

  protected

  # Performs the GET request with a 10 second timeout.
  #
  # @raise [RuntimeError] when the response is not a success
  def grab
    response = HTTParty.get @url, timeout: 10.seconds
    raise "Receive #{response.code}" unless response.success?
    response
  end

  # @return the body parsed by Nokogiri, or nil when the response is not HTML
  def parse
    return nil unless html?
    Nokogiri::HTML.parse @response.body
  end

  # Resolves a charset name to an Encoding.
  #
  # @param name [String, nil] charset name as declared by the document
  # @return [Encoding, nil] nil when no name is given or Ruby does not know it,
  #   so that a bogus declaration falls back to the default handling in #body
  def encoding_for(name)
    return nil unless name
    Encoding.find name.to_s.strip
  rescue ArgumentError
    nil
  end
end
|
|
@ -1,13 +1,5 @@
|
|||
module Utils
|
||||
# Relabels +text+ as UTF-8 without transcoding its bytes.
#
# @param text [String, nil] string to relabel
# @return [String, nil] nil for nil input; +text+ itself when it is already
#   UTF-8 or can be relabelled in place; a relabelled copy when +text+ is frozen
def self.utf8!(text)
  return nil unless text
  return text if text.encoding == Encoding::UTF_8

  # force_encoding mutates its receiver and raises FrozenError on a frozen
  # string, so only in that case relabel a copy instead.
  text = text.dup if text.frozen?
  text.force_encoding Encoding::UTF_8
end
|
||||
|
||||
def self.diff(a, b, context: 3, limit: 30)
|
||||
a = self.utf8! a
|
||||
b = self.utf8! b
|
||||
diff = Diffy::Diff.new a, b, context: context
|
||||
diff = diff.to_s :color
|
||||
return '...(too much diff)...'.colorize :light_red if diff.lines.size > limit
|
||||
|
|
|
@ -13,8 +13,8 @@ class Check < ApplicationRecord
|
|||
state = :unchanged
|
||||
|
||||
begin
|
||||
target = self.target
|
||||
reference = Utils.utf8! self.content
|
||||
target = self.target
|
||||
reference = self.content
|
||||
content = target.extract content
|
||||
changed = reference != content
|
||||
if changed
|
||||
|
@ -25,7 +25,6 @@ class Check < ApplicationRecord
|
|||
end
|
||||
self.last_error = nil
|
||||
rescue => e
|
||||
raise
|
||||
$stderr.puts e
|
||||
state = :error
|
||||
self.last_error = e
|
||||
|
@ -35,6 +34,13 @@ class Check < ApplicationRecord
|
|||
state
|
||||
end
|
||||
|
||||
# Builds a Diffy diff between the stored reference and the current content
# after it has been passed through the target's extraction.
#
# @param context [Integer] forwarded to Diffy as the :context option
# @param kwargs [Hash] any further options forwarded to Diffy::Diff.new
# @return [Diffy::Diff]
def diff(context: 3, **kwargs)
  before = reference
  after = target.extract(content)
  Diffy::Diff.new(before, after, context: context, **kwargs)
end
|
||||
|
||||
def recalculate!(debug: false)
|
||||
state = :unchanged
|
||||
|
||||
|
|
|
@ -10,24 +10,8 @@ class Site < ApplicationRecord
|
|||
self.where(url: url).first
|
||||
end
|
||||
|
||||
def self.grab(url)
|
||||
response = HTTParty.get url, timeout: 10.seconds
|
||||
raise "Receive #{response.code}" unless response.success?
|
||||
response
|
||||
end
|
||||
|
||||
def self.html(url)
|
||||
response = self.grab url
|
||||
content_type = response.content_type
|
||||
raise "Expecting #{'text/html'.colorize :yellow}, got #{content_type.colorize :yellow}" unless content_type == 'text/html'
|
||||
content = response.body
|
||||
Nokogiri::HTML.parse content
|
||||
end
|
||||
|
||||
def self.title(url)
|
||||
html = self.html url
|
||||
tag = html.at 'head title'
|
||||
tag&.text
|
||||
def grab
|
||||
Http.new self.url
|
||||
end
|
||||
|
||||
def inherited_targets
|
||||
|
@ -58,12 +42,18 @@ class Site < ApplicationRecord
|
|||
current_index < state_index ? state : current
|
||||
end
|
||||
|
||||
# Builds a Diffy diff between the stored reference and the stored content.
#
# @param context [Integer] forwarded to Diffy as the :context option
# @param kwargs [Hash] any further options forwarded to Diffy::Diff.new
# @return [Diffy::Diff]
def diff(context: 3, **kwargs)
  Diffy::Diff.new(reference, content, context: context, **kwargs)
end
|
||||
|
||||
def diff!(content, debug: false)
|
||||
self.checked_at = Time.now
|
||||
state = :unchanged
|
||||
|
||||
begin
|
||||
reference = Utils.utf8! self.content
|
||||
reference = self.content
|
||||
checks = self.checks
|
||||
if checks.empty?
|
||||
if reference != content
|
||||
|
@ -94,12 +84,7 @@ class Site < ApplicationRecord
|
|||
|
||||
def check(debug: false)
|
||||
reference = self.reference
|
||||
response = self.class.grab self.url
|
||||
content = response.body
|
||||
# case response.content_type
|
||||
# when 'text/html'
|
||||
# content = content.force_encoding 'utf-8'
|
||||
# end
|
||||
content = self.grab.body
|
||||
unless reference
|
||||
self.reference! content
|
||||
return :reference
|
||||
|
|
|
@ -3,6 +3,7 @@ require 'ostruct'
|
|||
require 'optparse'
|
||||
|
||||
# Force resolution to avoid cycle in autoloading
|
||||
Http
|
||||
Check
|
||||
Target
|
||||
Site
|
||||
|
|
|
@ -4,8 +4,8 @@ class CreateSites < ActiveRecord::Migration[5.1]
|
|||
t.string :url, null: false
|
||||
t.string :name, index: true
|
||||
|
||||
t.binary :reference, null: false
|
||||
t.binary :content, null: false
|
||||
t.text :reference
|
||||
t.text :content
|
||||
|
||||
t.belongs_to :group, index: true, foreign_key: true
|
||||
t.belongs_to :template, index: true, foreign_key: true
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
class CreateChecks < ActiveRecord::Migration[5.1]
|
||||
def change
|
||||
create_table :checks do |t|
|
||||
t.binary :reference, null: false
|
||||
t.binary :content, null: false
|
||||
t.text :reference
|
||||
t.text :content
|
||||
|
||||
t.belongs_to :target, index: true, foreign_key: true
|
||||
t.belongs_to :site, index: true, foreign_key: true
|
||||
|
|
|
@ -16,8 +16,8 @@ ActiveRecord::Schema.define(version: 20180510000004) do
|
|||
enable_extension "plpgsql"
|
||||
|
||||
create_table "checks", force: :cascade do |t|
|
||||
t.binary "reference"
|
||||
t.binary "content"
|
||||
t.text "reference"
|
||||
t.text "content"
|
||||
t.bigint "target_id"
|
||||
t.bigint "site_id"
|
||||
t.string "last_error"
|
||||
|
@ -37,8 +37,8 @@ ActiveRecord::Schema.define(version: 20180510000004) do
|
|||
create_table "sites", force: :cascade do |t|
|
||||
t.string "url", null: false
|
||||
t.string "name"
|
||||
t.binary "reference"
|
||||
t.binary "content"
|
||||
t.text "reference"
|
||||
t.text "content"
|
||||
t.bigint "group_id"
|
||||
t.bigint "template_id"
|
||||
t.string "last_error"
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
# Exercises Http#body: charset detection (header, <meta charset>,
# <meta http-equiv>) and normalisation to UTF-8, against WebMock stubs.
RSpec.describe Http do
  let :site do
    Http.new 'http://localhost/'
  end

  # Stubs every request to localhost with the given body and Content-Type.
  def stub_body(body, content_type)
    stub_request(:any, 'localhost').to_return body: body, status: 200,
                                              headers: { 'Content-Type': content_type }
  end

  # Checks that the fetched body is UTF-8 and returns the text of its
  # <body> element.
  def body_text
    body = site.body
    expect(body.encoding).to eq Encoding::UTF_8
    Nokogiri::HTML.parse(body).at('body').content
  end

  it 'must encode to utf-8 if not HTML' do
    stub_body "\xC3\xA9", 'application/pdf'
    expect(site.body.encoding).to eq Encoding::UTF_8
    expect(site.body).to eq "\xC3\xA9"
  end

  it 'must encode to utf-8 if HTML and nothing specified' do
    body = <<-HEREDOC
<html>
<body>\xC3\xA9</body>
</html>
    HEREDOC
    stub_body body, 'text/html'
    expect(body_text).to eq "\xC3\xA9"
  end

  it 'must encode to given content-type charset if nothing specified' do
    body = <<-HEREDOC
<html>
<body>\xE9</body>
</html>
    HEREDOC
    stub_body body, 'text/html; charset=iso-8859-1'
    expect(body_text).to eq "\xC3\xA9"
  end

  it 'must encode to given meta charset' do
    body = <<-HEREDOC
<html>
<head>
<meta charset="ISO-8859-1"/>
</head>
<body>\xE9</body>
</html>
    HEREDOC
    stub_body body, 'text/html'
    expect(body_text).to eq "\xC3\xA9"
  end

  it 'must encode to given meta http-equiv' do
    body = <<-HEREDOC
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; CHARSET=iso-8859-1">
</head>
<body>\xE9</body>
</html>
    HEREDOC
    stub_body body, 'text/html'
    expect(body_text).to eq "\xC3\xA9"
  end
end
|
|
@ -19,7 +19,7 @@ RSpec.describe Site, type: :model do
|
|||
end
|
||||
|
||||
def stub_page(content)
|
||||
allow(Site).to receive(:grab) { OpenStruct.new body: content }
|
||||
allow(site).to receive(:grab) { OpenStruct.new body: content }
|
||||
end
|
||||
|
||||
def check!(content)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
require 'webmock/rspec'
|
||||
|
||||
# This file was generated by the `rails generate rspec:install` command. Conventionally, all
|
||||
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
||||
# The generated `.rspec` file contains `--require spec_helper` which will cause
|
||||
|
@ -14,38 +16,38 @@
|
|||
#
|
||||
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
||||
RSpec.configure do |config|
|
||||
# rspec-expectations config goes here. You can use an alternate
|
||||
# assertion/expectation library such as wrong or the stdlib/minitest
|
||||
# assertions if you prefer.
|
||||
config.expect_with :rspec do |expectations|
|
||||
# This option will default to `true` in RSpec 4. It makes the `description`
|
||||
# and `failure_message` of custom matchers include text for helper methods
|
||||
# defined using `chain`, e.g.:
|
||||
# be_bigger_than(2).and_smaller_than(4).description
|
||||
# # => "be bigger than 2 and smaller than 4"
|
||||
# ...rather than:
|
||||
# # => "be bigger than 2"
|
||||
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
||||
end
|
||||
# rspec-expectations config goes here. You can use an alternate
|
||||
# assertion/expectation library such as wrong or the stdlib/minitest
|
||||
# assertions if you prefer.
|
||||
config.expect_with :rspec do |expectations|
|
||||
# This option will default to `true` in RSpec 4. It makes the `description`
|
||||
# and `failure_message` of custom matchers include text for helper methods
|
||||
# defined using `chain`, e.g.:
|
||||
# be_bigger_than(2).and_smaller_than(4).description
|
||||
# # => "be bigger than 2 and smaller than 4"
|
||||
# ...rather than:
|
||||
# # => "be bigger than 2"
|
||||
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
|
||||
end
|
||||
|
||||
# rspec-mocks config goes here. You can use an alternate test double
|
||||
# library (such as bogus or mocha) by changing the `mock_with` option here.
|
||||
config.mock_with :rspec do |mocks|
|
||||
# Prevents you from mocking or stubbing a method that does not exist on
|
||||
# a real object. This is generally recommended, and will default to
|
||||
# `true` in RSpec 4.
|
||||
mocks.verify_partial_doubles = true
|
||||
end
|
||||
# rspec-mocks config goes here. You can use an alternate test double
|
||||
# library (such as bogus or mocha) by changing the `mock_with` option here.
|
||||
config.mock_with :rspec do |mocks|
|
||||
# Prevents you from mocking or stubbing a method that does not exist on
|
||||
# a real object. This is generally recommended, and will default to
|
||||
# `true` in RSpec 4.
|
||||
mocks.verify_partial_doubles = true
|
||||
end
|
||||
|
||||
# This option will default to `:apply_to_host_groups` in RSpec 4 (and will
|
||||
# have no way to turn it off -- the option exists only for backwards
|
||||
# compatibility in RSpec 3). It causes shared context metadata to be
|
||||
# inherited by the metadata hash of host groups and examples, rather than
|
||||
# triggering implicit auto-inclusion in groups with matching metadata.
|
||||
config.shared_context_metadata_behavior = :apply_to_host_groups
|
||||
# This option will default to `:apply_to_host_groups` in RSpec 4 (and will
|
||||
# have no way to turn it off -- the option exists only for backwards
|
||||
# compatibility in RSpec 3). It causes shared context metadata to be
|
||||
# inherited by the metadata hash of host groups and examples, rather than
|
||||
# triggering implicit auto-inclusion in groups with matching metadata.
|
||||
config.shared_context_metadata_behavior = :apply_to_host_groups
|
||||
|
||||
# The settings below are suggested to provide a good initial experience
|
||||
# with RSpec, but feel free to customize to your heart's content.
|
||||
# The settings below are suggested to provide a good initial experience
|
||||
# with RSpec, but feel free to customize to your heart's content.
|
||||
=begin
|
||||
# This allows you to limit a spec run to individual examples or groups
|
||||
# you care about by tagging them with `:focus` metadata. When nothing
|
||||
|
@ -94,3 +96,4 @@ RSpec.configure do |config|
|
|||
Kernel.srand config.seed
|
||||
=end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in New Issue