diff --git a/lib/wombat/processing/parser.rb b/lib/wombat/processing/parser.rb index 1fdcd4f..12cfbf5 100644 --- a/lib/wombat/processing/parser.rb +++ b/lib/wombat/processing/parser.rb @@ -17,8 +17,11 @@ module Processing module Parser attr_accessor :mechanize, :context, :response_code, :page - def initialize + def initialize(opts={}) @mechanize = Mechanize.new + if opts[:allowed_error_codes] + @mechanize.agent.allowed_error_codes = opts[:allowed_error_codes] + end @mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args end diff --git a/lib/wombat/property/locators/follow.rb b/lib/wombat/property/locators/follow.rb index 4cd6389..b5dfc0b 100644 --- a/lib/wombat/property/locators/follow.rb +++ b/lib/wombat/property/locators/follow.rb @@ -8,7 +8,12 @@ def locate(context, page = nil) super do locate_nodes(context).flat_map do |node| target_page = page.click node - context = target_page.parser + if target_page.respond_to? :parser + context = target_page.parser + else + # Mechanize returns different types depending on status code :/ + context = Nokogiri::HTML(target_page.body) + end filter_properties(context, page) end @@ -17,4 +22,4 @@ def locate(context, page = nil) end end end -end \ No newline at end of file +end diff --git a/spec/integration/integration_spec.rb b/spec/integration/integration_spec.rb index 4400271..23aef44 100644 --- a/spec/integration/integration_spec.rb +++ b/spec/integration/integration_spec.rb @@ -245,32 +245,68 @@ end end - it 'should follow links' do - VCR.use_cassette('follow_links') do - crawler = Class.new - crawler.send(:include, Wombat::Crawler) + context "when following links" do + it "should be successful when all links are valid" do + VCR.use_cassette('follow_links') do + crawler = Class.new + crawler.send(:include, Wombat::Crawler) - crawler.base_url "https://www.github.com" - crawler.path "/" + crawler.base_url "https://www.github.com" + crawler.path "/" - crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do - 
heading 'css=h1' + crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do + heading 'css=h1' + end + + crawler_instance = crawler.new + results = crawler_instance.crawl + + results.should == { + "github" => [ + { "heading"=>"GitHub helps people build software together." }, + { "heading"=>nil }, + { "heading"=>"Features" }, + { "heading"=>"Contact GitHub" }, + { "heading"=>"GitHub Training — Git Training from the Experts" }, + { "heading"=>"GitHub on Your Servers" }, + { "heading"=>"Loading..." } + ] + } end + end - crawler_instance = crawler.new - results = crawler_instance.crawl + it "should be successful when respecting allowed_error_codes" do + VCR.use_cassette('follow_links') do + crawler = Class.new + crawler.send(:include, Wombat::Crawler) + + crawler.base_url "https://www.github.com" + crawler.path "/" + + # This takes precedence over the VCR cassette + FakeWeb.register_uri(:get, "https://github.com/contact", + body: "
<html><body><h1>This is not the web page you are looking for.</h1></body></html>
", + status: ["404", "Not Found"]) - results.should == { - "github" => [ - { "heading"=>"GitHub helps people build software together." }, - { "heading"=>nil }, - { "heading"=>"Features" }, - { "heading"=>"Contact GitHub" }, - { "heading"=>"GitHub Training — Git Training from the Experts" }, - { "heading"=>"GitHub on Your Servers" }, - { "heading"=>"Loading..." } - ] - } + crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do + heading 'css=h1' + end + + crawler_instance = crawler.new(allowed_error_codes: ['404']) + results = crawler_instance.crawl + + results.should == { + "github" => [ + { "heading"=>"GitHub helps people build software together." }, + { "heading"=>nil }, + { "heading"=>"Features" }, + { "heading"=>"This is not the web page you are looking for." }, + { "heading"=>"GitHub Training — Git Training from the Experts" }, + { "heading"=>"GitHub on Your Servers" }, + { "heading"=>"Loading..." } + ] + } + end end end end