Browse Source

Handle HTML encoding

webui
aeris 4 years ago
parent
commit
99b3f29b41
  1. 1
      Gemfile
  2. 14
      Gemfile.lock
  3. 13
      Guardfile
  4. 71
      app/lib/http.rb
  5. 8
      app/lib/utils.rb
  6. 12
      app/models/check.rb
  7. 35
      app/models/site.rb
  8. 1
      bin/cli.rb
  9. 4
      db/migrate/20180510000002_create_sites.rb
  10. 4
      db/migrate/20180510000004_create_checks.rb
  11. 8
      db/schema.rb
  12. 78
      spec/http_spec.rb
  13. 2
      spec/models/site_spec.rb
  14. 61
      spec/spec_helper.rb

1
Gemfile

@ -19,6 +19,7 @@ group :development, :test do
gem 'pry-byebug'
gem 'timecop'
gem 'rspec-rails'
gem 'webmock'
end
group :development do

14
Gemfile.lock

@ -38,6 +38,8 @@ GEM
i18n (>= 0.7, < 2)
minitest (~> 5.1)
tzinfo (~> 1.1)
addressable (2.5.2)
public_suffix (>= 2.0.2, < 4.0)
arel (8.0.0)
awesome_print (1.8.0)
better_errors (2.4.0)
@ -51,6 +53,8 @@ GEM
coderay (1.1.2)
colorize (0.8.1)
concurrent-ruby (1.0.5)
crack (0.4.3)
safe_yaml (~> 1.0.0)
crass (1.0.4)
debug_inspector (0.0.3)
diff-lcs (1.3)
@ -59,6 +63,7 @@ GEM
ffi (1.9.23)
globalid (0.4.1)
activesupport (>= 4.2.0)
hashdiff (0.3.7)
httparty (0.16.2)
multi_xml (>= 0.5.2)
i18n (1.0.1)
@ -90,6 +95,7 @@ GEM
pry (~> 0.10)
pry-rails (0.3.6)
pry (>= 0.10.4)
public_suffix (3.0.3)
puma (3.11.4)
rack (2.0.5)
rack-test (1.0.0)
@ -139,6 +145,7 @@ GEM
rspec-support (~> 3.7.0)
rspec-support (3.7.1)
ruby_dep (1.5.0)
safe_yaml (1.0.4)
spring (2.0.2)
activesupport (>= 4.2)
spring-watcher-listen (2.0.1)
@ -157,6 +164,10 @@ GEM
timecop (0.9.1)
tzinfo (1.2.5)
thread_safe (~> 0.1)
webmock (3.4.2)
addressable (>= 2.3.6)
crack (>= 0.3.2)
hashdiff
websocket-driver (0.6.5)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.3)
@ -186,6 +197,7 @@ DEPENDENCIES
thor
timecop
tzinfo-data
webmock
BUNDLED WITH
1.16.1
1.16.2

13
Guardfile

@ -0,0 +1,13 @@
# Guard configuration: declares which file changes each plugin reacts to.
# rails plugin: watches the dependency lockfile and everything under config/ or lib/.
guard :rails do
watch('Gemfile.lock')
watch(%r{^(config|lib)/.*})
end
# livereload plugin: watches view templates, helpers, public static files and locale files.
guard :livereload do
watch(%r{app/views/.+\.(erb|haml|slim)$})
watch(%r{app/helpers/.+\.rb})
watch(%r{public/.+\.(css|js|html)})
watch(%r{config/locales/.+\.yml})
# Rails Assets Pipeline
# Maps a changed asset under app/ or vendor/ to "/assets/<path>", where m[3] is
# the path captured after the assets sub-directory (extension included).
watch(%r{(app|vendor)(/assets/\w+/(.+\.(css|js|html|png|jpg|coffee|scss))).*}) { |m| "/assets/#{m[3]}" }
end

71
app/lib/http.rb

@ -0,0 +1,71 @@
# Fetches a URL once at construction time and exposes the response status,
# the HTML title and the body re-encoded as UTF-8.
class Http
  # Extracts the charset from the content of a
  # `<meta http-equiv="Content-Type">` tag. The capture stops at whitespace,
  # quotes or `;`, so `text/html; charset="utf-8"; x=y` yields `utf-8`.
  META_CHARSET = %r{text/html;\s*charset\s*=\s*["']?([^\s"';]+)}i.freeze

  # @param url [String] address to fetch
  # @raise [RuntimeError] when the response is not successful
  def initialize(url)
    @url = url
    @response = grab
  end

  # @return the status code reported by the response
  def code
    @response.code
  end

  # @return whether the response reports success
  def success?
    @response.success?
  end

  # @return [Boolean] true when the response content type is exactly text/html
  def html?
    @response.content_type == 'text/html'
  end

  # @return [String, nil] text of the `head title` element, nil when the
  #   response is not HTML or has no title
  def title
    parse&.at('head title')&.text
  end

  # Charset declared inside the document itself.
  #
  # @return [String, nil] the declared charset label, nil when the response is
  #   not HTML or declares none
  def charset
    html = parse
    return nil unless html

    # Content-Type charset seems already processed by HTTParty, so only the
    # in-document declarations are inspected.
    meta = html.at 'head meta[charset]'
    return meta['charset'] if meta

    meta = html.at 'head meta[http-equiv="Content-Type"]'
    match = META_CHARSET.match(meta['content'].to_s) if meta
    match && match[1]
  end

  # Response body converted to UTF-8. The conversion happens once, on a copy,
  # so the response's own string is never modified and repeated calls return
  # the same text instead of re-encoding already converted bytes.
  #
  # @return [String, nil] UTF-8 body, nil when the response has no body
  def body
    @body ||= decode @response.body
  end

  protected

  # Performs the GET request; a non-successful response raises.
  def grab
    response = HTTParty.get @url, timeout: 10.seconds
    raise "Receive #{response.code}" unless response.success?
    response
  end

  # Parsed document for HTML responses (parsed once), nil otherwise.
  def parse
    return nil unless html?
    @document ||= Nokogiri::HTML.parse @response.body
  end

  # Returns a UTF-8 copy of +raw+, first tagging it with the encoding the
  # document declares, if any. Conversion errors from String#encode! are not
  # rescued here.
  def decode(raw)
    return nil unless raw
    text = raw.dup
    encoding = declared_encoding
    text.force_encoding encoding if encoding
    text.encode! 'utf-8' unless text.encoding == Encoding::UTF_8
    text
  end

  # Encoding matching the declared charset; nil when nothing is declared or
  # the label is not an encoding Ruby knows (the declaration is then ignored
  # rather than aborting the fetch).
  def declared_encoding
    name = charset
    name && Encoding.find(name)
  rescue ArgumentError
    nil
  end
end

8
app/lib/utils.rb

@ -1,13 +1,5 @@
module Utils
# Tags +text+ as UTF-8 in place (bytes are not converted) and returns it.
# Falsy input yields nil; a string already tagged UTF-8 is returned untouched.
def self.utf8!(text)
  return unless text

  text.encoding == Encoding::UTF_8 ? text : text.force_encoding('utf-8')
end
def self.diff(a, b, context: 3, limit: 30)
a = self.utf8! a
b = self.utf8! b
diff = Diffy::Diff.new a, b, context: context
diff = diff.to_s :color
return '...(too much diff)...'.colorize :light_red if diff.lines.size > limit

12
app/models/check.rb

@ -13,8 +13,8 @@ class Check < ApplicationRecord
state = :unchanged
begin
target = self.target
reference = Utils.utf8! self.content
target = self.target
reference = self.content
content = target.extract content
changed = reference != content
if changed
@ -25,7 +25,6 @@ class Check < ApplicationRecord
end
self.last_error = nil
rescue => e
raise
$stderr.puts e
state = :error
self.last_error = e
@ -35,6 +34,13 @@ class Check < ApplicationRecord
state
end
# Builds a Diffy diff between the stored reference and what the target
# extracts from the stored content. Extra keyword arguments are passed to
# Diffy::Diff unchanged.
def diff(context: 3, **kwargs)
  baseline = reference
  current = target.extract(content)
  Diffy::Diff.new(baseline, current, context: context, **kwargs)
end
def recalculate!(debug: false)
state = :unchanged

35
app/models/site.rb

@ -10,24 +10,8 @@ class Site < ApplicationRecord
self.where(url: url).first
end
# Performs a GET on +url+ with a 10 second timeout and returns the response;
# raises when the response is not successful.
def self.grab(url)
  HTTParty.get(url, timeout: 10.seconds).tap do |reply|
    raise "Receive #{reply.code}" unless reply.success?
  end
end
# Fetches +url+ and returns its body parsed by Nokogiri; raises unless the
# response content type is text/html.
def self.html(url)
  reply = grab(url)
  kind = reply.content_type
  unless kind == 'text/html'
    raise "Expecting #{'text/html'.colorize :yellow}, got #{kind.colorize :yellow}"
  end
  Nokogiri::HTML.parse(reply.body)
end
def self.title(url)
html = self.html url
tag = html.at 'head title'
tag&.text
# Builds an Http object for this record's url.
def grab
  Http.new(url)
end
def inherited_targets
@ -58,12 +42,18 @@ class Site < ApplicationRecord
current_index < state_index ? state : current
end
# Builds a Diffy diff between the stored reference and the stored content.
# Extra keyword arguments are passed to Diffy::Diff unchanged.
def diff(context: 3, **kwargs)
  Diffy::Diff.new(reference, content, context: context, **kwargs)
end
def diff!(content, debug: false)
self.checked_at = Time.now
state = :unchanged
begin
reference = Utils.utf8! self.content
reference = self.content
checks = self.checks
if checks.empty?
if reference != content
@ -94,12 +84,7 @@ class Site < ApplicationRecord
def check(debug: false)
reference = self.reference
response = self.class.grab self.url
content = response.body
# case response.content_type
# when 'text/html'
# content = content.force_encoding 'utf-8'
# end
content = self.grab.body
unless reference
self.reference! content
return :reference

1
bin/cli.rb

@ -3,6 +3,7 @@ require 'ostruct'
require 'optparse'
# Force resolution to avoid cycle in autoloading
Http
Check
Target
Site

4
db/migrate/20180510000002_create_sites.rb

@ -4,8 +4,8 @@ class CreateSites < ActiveRecord::Migration[5.1]
t.string :url, null: false
t.string :name, index: true
t.binary :reference, null: false
t.binary :content, null: false
t.text :reference
t.text :content
t.belongs_to :group, index: true, foreign_key: true
t.belongs_to :template, index: true, foreign_key: true

4
db/migrate/20180510000004_create_checks.rb

@ -1,8 +1,8 @@
class CreateChecks < ActiveRecord::Migration[5.1]
def change
create_table :checks do |t|
t.binary :reference, null: false
t.binary :content, null: false
t.text :reference
t.text :content
t.belongs_to :target, index: true, foreign_key: true
t.belongs_to :site, index: true, foreign_key: true

8
db/schema.rb

@ -16,8 +16,8 @@ ActiveRecord::Schema.define(version: 20180510000004) do
enable_extension "plpgsql"
create_table "checks", force: :cascade do |t|
t.binary "reference"
t.binary "content"
t.text "reference"
t.text "content"
t.bigint "target_id"
t.bigint "site_id"
t.string "last_error"
@ -37,8 +37,8 @@ ActiveRecord::Schema.define(version: 20180510000004) do
create_table "sites", force: :cascade do |t|
t.string "url", null: false
t.string "name"
t.binary "reference"
t.binary "content"
t.text "reference"
t.text "content"
t.bigint "group_id"
t.bigint "template_id"
t.string "last_error"

78
spec/http_spec.rb

@ -0,0 +1,78 @@
# Exercises Http#body: whichever way the page declares its charset, the
# returned body must be a UTF-8 string.
RSpec.describe Http do
  let :site do
    Http.new 'http://localhost/'
  end

  # Serves +body+ with a 200 status and the given Content-Type header for any
  # request to localhost.
  def stub_page(body, content_type)
    stub_request(:any, 'localhost').to_return body: body, status: 200,
                                              headers: { 'Content-Type': content_type }
  end

  # Checks that the fetched body is UTF-8 and returns the text of its <body>
  # element.
  def body_text
    body = site.body
    expect(body.encoding).to eq Encoding::UTF_8
    Nokogiri::HTML.parse(body).at('body').content
  end

  it 'must encode to utf-8 if not HTML' do
    stub_page "\xC3\xA9", 'application/pdf'
    expect(site.body.encoding).to eq Encoding::UTF_8
    expect(site.body).to eq "\xC3\xA9"
  end

  it 'must encode to utf-8 if HTML and nothing specified' do
    stub_page <<~HEREDOC, 'text/html'
      <html>
      <body>\xC3\xA9</body>
      </html>
    HEREDOC
    expect(body_text).to eq "\xC3\xA9"
  end

  it 'must encode to given content-type charset if nothing specified' do
    stub_page <<~HEREDOC, 'text/html; charset=iso-8859-1'
      <html>
      <body>\xE9</body>
      </html>
    HEREDOC
    expect(body_text).to eq "\xC3\xA9"
  end

  it 'must encode to given meta charset' do
    stub_page <<~HEREDOC, 'text/html'
      <html>
      <head>
      <meta charset="ISO-8859-1"/>
      </head>
      <body>\xE9</body>
      </html>
    HEREDOC
    expect(body_text).to eq "\xC3\xA9"
  end

  it 'must encode to given meta http-equiv' do
    stub_page <<~HEREDOC, 'text/html'
      <html>
      <head>
      <meta http-equiv="Content-Type" content="text/html; CHARSET=iso-8859-1">
      </head>
      <body>\xE9</body>
      </html>
    HEREDOC
    expect(body_text).to eq "\xC3\xA9"
  end
end

2
spec/models/site_spec.rb

@ -19,7 +19,7 @@ RSpec.describe Site, type: :model do
end
# Replaces +grab+ with a stub returning an object that only exposes +body+,
# set to +content+.
# NOTE(review): both a class-level (Site) and an instance-level (site) stub
# appear here; this looks like the before/after lines of a diff — confirm
# which one applies.
def stub_page(content)
allow(Site).to receive(:grab) { OpenStruct.new body: content }
allow(site).to receive(:grab) { OpenStruct.new body: content }
end
def check!(content)

61
spec/spec_helper.rb

@ -1,3 +1,5 @@
require 'webmock/rspec'
# This file was generated by the `rails generate rspec:install` command. Conventionally, all
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
# The generated `.rspec` file contains `--require spec_helper` which will cause
@ -14,38 +16,38 @@
#
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
RSpec.configure do |config|
# rspec-expectations config goes here. You can use an alternate
# assertion/expectation library such as wrong or the stdlib/minitest
# assertions if you prefer.
config.expect_with :rspec do |expectations|
# This option will default to `true` in RSpec 4. It makes the `description`
# and `failure_message` of custom matchers include text for helper methods
# defined using `chain`, e.g.:
# be_bigger_than(2).and_smaller_than(4).description
# # => "be bigger than 2 and smaller than 4"
# ...rather than:
# # => "be bigger than 2"
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
end
# rspec-expectations config goes here. You can use an alternate
# assertion/expectation library such as wrong or the stdlib/minitest
# assertions if you prefer.
config.expect_with :rspec do |expectations|
# This option will default to `true` in RSpec 4. It makes the `description`
# and `failure_message` of custom matchers include text for helper methods
# defined using `chain`, e.g.:
# be_bigger_than(2).and_smaller_than(4).description
# # => "be bigger than 2 and smaller than 4"
# ...rather than:
# # => "be bigger than 2"
expectations.include_chain_clauses_in_custom_matcher_descriptions = true
end
# rspec-mocks config goes here. You can use an alternate test double
# library (such as bogus or mocha) by changing the `mock_with` option here.
config.mock_with :rspec do |mocks|
# Prevents you from mocking or stubbing a method that does not exist on
# a real object. This is generally recommended, and will default to
# `true` in RSpec 4.
mocks.verify_partial_doubles = true
end
# rspec-mocks config goes here. You can use an alternate test double
# library (such as bogus or mocha) by changing the `mock_with` option here.
config.mock_with :rspec do |mocks|
# Prevents you from mocking or stubbing a method that does not exist on
# a real object. This is generally recommended, and will default to
# `true` in RSpec 4.
mocks.verify_partial_doubles = true
end
# This option will default to `:apply_to_host_groups` in RSpec 4 (and will
# have no way to turn it off -- the option exists only for backwards
# compatibility in RSpec 3). It causes shared context metadata to be
# inherited by the metadata hash of host groups and examples, rather than
# triggering implicit auto-inclusion in groups with matching metadata.
config.shared_context_metadata_behavior = :apply_to_host_groups
# This option will default to `:apply_to_host_groups` in RSpec 4 (and will
# have no way to turn it off -- the option exists only for backwards
# compatibility in RSpec 3). It causes shared context metadata to be
# inherited by the metadata hash of host groups and examples, rather than
# triggering implicit auto-inclusion in groups with matching metadata.
config.shared_context_metadata_behavior = :apply_to_host_groups
# The settings below are suggested to provide a good initial experience
# with RSpec, but feel free to customize to your heart's content.
# The settings below are suggested to provide a good initial experience
# with RSpec, but feel free to customize to your heart's content.
=begin
# This allows you to limit a spec run to individual examples or groups
# you care about by tagging them with `:focus` metadata. When nothing
@ -94,3 +96,4 @@ RSpec.configure do |config|
Kernel.srand config.seed
=end
end

Loading…
Cancel
Save