require 'fileutils'
begin
  # Not every Ruby ships this as a separate file; when it is missing,
  # the plain 'date' require below is what the script relies on.
  require 'date/format'
rescue LoadError
  nil
end
require 'date'
require 'time'
require 'net/http'
require 'uri'

# Builds an unofficial RSS 2.0 feed from dated pages under
# http://www.house.gov/paul/ and copies the finished feed to a second
# location.
#
# TODO: Maybe should just parse his main pages to find the URLs of items.

# strftime layout used for every pubDate / lastBuildDate in the feed.
# The time of day and zone are fixed text, not derived from the clock.
RSS_DATE_FORMAT = '%a, %d %b %Y 23:00:00 CST'

# Horizontal-rule tags that precede the article text on a page. Everything
# from the first marker found to the end of the document is kept.
# NOTE(review): the first marker was rebuilt from the matching gsub pattern
# after the original literal was damaged - confirm against a live page.
CONTENT_MARKERS = [
  '<hr width="98%" size="2" color="#336699" noshade>',
  '<HR width="98%" color=#336699 noShade SIZE=2>'
].freeze

# Links appended to the bottom of every item as [label, url] pairs.
# A nil url renders the label as plain text.
# NOTE(review): the hrefs for the first three entries could not be recovered;
# fill them in to get clickable links. The House URL is presumably the same
# address used as the channel link - confirm.
RELATED_LINKS = [
  ['Ron Paul 2008 Webpage', nil],
  ['Donate to the Ron Paul 2008 Campaign', nil],
  ['Ron Paul Personal Webpage', nil],
  ['Ron Paul House of Representatives Webpage', 'http://www.house.gov/paul/']
].freeze

# The three page families checked for each day:
# [item title prefix, site section, page-name prefix, cache-file prefix]
FEED_SOURCES = [
  ['Texas Straight Talk', 'tst', 'tst', 'ronpaultst_'],
  ['Speeches And Statements', 'congrec', 'cr', 'ronpaulcongrec_'],
  ['Press Releases', 'press', 'pr', 'ronpaulpress_']
].freeze

# Fetches a page, probing with a cheap HEAD request first.
#
# @param address [String] absolute URL of the page
# @return [String] the response body of the follow-up GET when the HEAD
#   request answered 200, otherwise an empty string
# @raise whatever Net::HTTP raises on connection problems (not rescued here)
def fetch_url(address)
  url = URI.parse(address)
  # request_uri keeps the query string and is "/" for a bare host, where
  # url.path would be empty and rejected by Net::HTTP.
  target = url.request_uri

  Net::HTTP.start(url.host, url.port, use_ssl: url.scheme == 'https') do |http|
    head = http.request(Net::HTTP::Head.new(target))
    next '' unless head.code == '200'

    puts head.code + " - " + address
    # The page exists, so go ahead and do a full GET on the same connection.
    http.request(Net::HTTP::Get.new(target)).body.to_s
  end
end

# Pulls the text of the page's <title> element (case-insensitive).
# Returns 'Untitled' when the page has no usable title.
def extract_title(html)
  raw = html[/<title>(.+?)<\/title>/im, 1]
  return 'Untitled' if raw.nil?

  # Same printable-ASCII rule as the article body, so no stray bytes
  # end up inside the feed.
  cleaned = raw.gsub(/[^\x20-\x7F]/, ' ').strip.force_encoding(Encoding::UTF_8)
  cleaned.empty? ? 'Untitled' : cleaned
end

# Returns the part of the page from the first content marker to the end,
# or the whole page when no marker is present.
def extract_article(html)
  primary, fallback = CONTENT_MARKERS.map do |marker|
    Regexp.new(Regexp.escape(marker) + '(.+)', Regexp::MULTILINE)
  end

  article = html[primary]
  return article if article

  puts "Body was nil"
  article = html[fallback]
  puts "body action"
  return article if article

  puts "No content marker found in page, using the whole document"
  html
end

# Strips markers, MS Office markup and inline styles, and reduces the text to
# printable ASCII. Line breaks are not printable ASCII, so the result is a
# single line.
def sanitize_article(text)
  cleaned = text.b # work on raw bytes so the page's encoding never matters
  CONTENT_MARKERS.each { |marker| cleaned = cleaned.gsub(marker, '') }
  # Replace EM (0x19) with an apostrophe (encoding issue, presumably a
  # truncated U+2019).
  cleaned = cleaned.gsub(/\x19/, "'")
  # Right single quote, as UTF-8 bytes or as the single byte 0x92
  # (assumed to be Windows-1252 - TODO confirm).
  cleaned = cleaned.gsub(/\xE2\x80\x99|\x92/n, "'")
  # Get rid of weird MS specific content.
  cleaned = cleaned.gsub(/<o:p>/i, '<p>')
  cleaned = cleaned.gsub(/<\/o:p>/i, '</p>')
  cleaned = cleaned.gsub(/style=".+?"/im, '')
  # Get rid of any remaining non-printable / non-ASCII bytes.
  cleaned = cleaned.gsub(/[^\x20-\x7F]/, ' ')
  cleaned.force_encoding(Encoding::UTF_8)
end

# Writes the article to its cache file and returns the address to publish.
# When a cache file already exists with different content, it is overwritten
# and the returned address gets a "#HHMMSS" fragment so that feed readers
# notice the change.
def store_article(path, article, address)
  if File.exist?(path)
    previous = File.read(path).chomp
    return address if article.eql?(previous)

    puts "File #{path} does not match content from web, overwriting"
    File.open(path, 'w') { |file| file.puts article }
    address + '#' + Time.now.strftime('%H%M%S')
  else
    File.open(path, 'w') { |file| file.puts article }
    address
  end
end

# Renders RELATED_LINKS as one line of HTML.
def related_links_html
  rendered = RELATED_LINKS.map do |label, url|
    url ? "<a href=\"#{url}\">#{label}</a>" : label
  end
  rendered.join(' &nbsp;|&nbsp; ')
end

# Assembles the <item> element for one article.
def build_item(title, article_title, address, article, d)
  # A literal "]]>" would end the CDATA section early; split it across two.
  cdata = article.gsub(']]>', ']]]]><![CDATA[>')
  [
    ' <item>',
    "  <title>#{title}: #{article_title}</title>",
    "  <link>#{address}</link>",
    '  <description><![CDATA[',
    "   #{cdata}<br /><br />",
    # Related links (home page, House of Reps page, campaign and donation
    # pages) go at the end of each item.
    "   #{related_links_html}",
    '  ]]></description>',
    '  <author>Ron Paul</author>',
    "  <pubDate>#{d.strftime(RSS_DATE_FORMAT)}</pubDate>",
    "  <guid>#{address}</guid>",
    ' </item>'
  ].join("\n")
end

# Turns one fetched page into an RSS <item> and refreshes its cache file.
#
# @param address [String] URL the page was fetched from
# @param body [String] raw HTML of the page
# @param regexp [Object] accepted for interface compatibility only; the
#   content markers are fixed in CONTENT_MARKERS and this value is not read
# @param existing_filename [String] path of the cache file for this page
# @param d [Date] day the page belongs to; used for pubDate
# @param title [String] section name placed before the page's own title
# @return [String] the rendered <item> element
def create_item(address, body, regexp, existing_filename, d, title)
  html = body.to_s.b
  article_title = extract_title(html)
  article = sanitize_article(extract_article(html))
  published_address = store_article(existing_filename, article, address)
  build_item(title, article_title, published_address, article, d)
end

# Writes index.rss into filepath, walking backwards one day at a time from
# today until at least x pages have been found, then copies it to copypath.
#
# @param filepath [String] directory prefix for index.rss and the cache
#   files; concatenated directly, so it must end with a path separator
# @param copypath [String] directory prefix the finished feed is copied to
# @param x [Integer] minimum number of items wanted; a day that yields
#   several pages can push the total past x
# @param max_empty_days [Integer] give up after this many consecutive days
#   without a single page, so an unreachable or empty site cannot keep the
#   loop running forever
def print_last_x_weekdays(filepath, copypath, x, max_empty_days = 90)
  filename = 'index.rss'
  day = Date.today
  build_stamp = day.strftime(RSS_DATE_FORMAT)
  found = 0
  empty_streak = 0

  # Block form closes the feed even when a fetch raises half-way through.
  File.open(filepath + filename, 'w') do |feed|
    feed.puts '<?xml version="1.0"?>'
    feed.puts '<rss version="2.0">'
    feed.puts ' <channel>'
    feed.puts ' <title>Unofficial Ron Paul RSS Feed</title>'
    feed.puts ' <link>http://www.house.gov/paul/</link>'
    feed.puts ' <description>Unofficial Ron Paul RSS feed containing his writings and site updates</description>'
    feed.puts ' <language>en-us</language>'
    feed.puts " <pubDate>#{build_stamp}</pubDate>"
    feed.puts " <lastBuildDate>#{build_stamp}</lastBuildDate>"

    # Go through until we have enough items or too many days in a row
    # turned up nothing.
    while found < x && empty_streak < max_empty_days
      hits = 0
      FEED_SOURCES.each do |section_title, section, page_prefix, cache_prefix|
        address = "http://www.house.gov/paul/#{section}/#{section}#{day.strftime('%Y')}/" \
                  "#{page_prefix}#{day.strftime('%m%d%y')}.htm"
        body = fetch_url(address)
        next if body.empty?

        hits += 1
        cache_file = filepath + cache_prefix + day.strftime('%m-%d-%y')
        feed.puts create_item(address, body, nil, cache_file, day, section_title)
      end

      found += hits
      empty_streak = hits.zero? ? empty_streak + 1 : 0
      day -= 1 # move back one day
    end

    if found < x
      puts "Stopped after #{empty_streak} consecutive days without content (#{found} of #{x} items found)"
    end

    feed.puts ' </channel>'
    feed.puts '</rss>'
  end

  # Copy our completed file to its final location.
  FileUtils.copy(filepath + filename, copypath + filename)
end

# Call our method with command line arguments
# ARGV[0]: file path to use for file construction
# ARGV[1]: file path to copy finished file to
# ARGV[2]: number of days back to fetch into file
if ARGV.length >= 3
  print_last_x_weekdays(ARGV[0], ARGV[1], ARGV[2].to_i)
else
  warn "usage: ruby #{File.basename($PROGRAM_NAME)} BUILD_DIR/ COPY_DIR/ ITEM_COUNT"
end
#puts fetch_url("http://www.house.gov/paul/tst/tst2007/tst041607.htm")
#print_last_x_weekdays("c:\\test\\", "c:\\test\\test2\\", 1)